#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Origin (per scraped page header, unverified): yangxiaohui / how
"""
从dimension_associations_analysis.json中提取分类之间的边关系
"""

import json
from pathlib import Path
from typing import Dict, List, Any
import argparse


def get_last_segment(path: str) -> str:
    """Return the part of ``path`` that follows its final "/" separator.

    A path containing no "/" is returned unchanged; a path that ends with
    "/" yields an empty string.
    """
    _head, _sep, tail = path.rpartition("/")
    return tail


def build_node_id(dimension: str, node_type: str, name: str) -> str:
    """
    Compose a node identifier from a node's level, type, and name.

    Args:
        dimension: Node level (灵感点, 目的点, or 关键点).
        node_type: Node type (分类 or 标签).
        name: Node name, given as a full "/"-separated path.

    Returns:
        The identifier, formatted as
        ``{dimension}_{node_type}_{last segment of name}``.
    """
    # Only the trailing path segment takes part in the identifier.
    tail = get_last_segment(name)
    return "{}_{}_{}".format(dimension, node_type, tail)


def extract_edges_from_single_dimension(data: Dict) -> List[Dict]:
    """
    Extract category-to-category edges from the single-dimension association analysis.

    The function walks ``data["单维度关联分析"]`` level by level:
    dimension key -> direction key (must contain "→") -> source category path
    -> association field named ``与<target dimension>的关联`` -> list of
    association records. One edge is emitted per association record that has a
    non-empty ``目标分类``.

    Any node that does not have the container type the traversal needs (for
    example a JSON ``null`` or a descriptive string where a mapping is
    expected) is skipped instead of raising, so one malformed entry does not
    abort the whole extraction.

    Args:
        data: Parsed analysis document; edges are read from its
            ``单维度关联分析`` entry.

    Returns:
        A list of edge dicts with the keys ``源节点ID``, ``目标节点ID``,
        ``边类型`` and ``边详情``. The list is empty when the
        ``单维度关联分析`` entry is missing or is not a mapping.
    """
    edges: List[Dict] = []

    if "单维度关联分析" not in data:
        return edges

    single_dim = data["单维度关联分析"]
    if not isinstance(single_dim, dict):
        return edges

    # Maps the dimension keys used in the input to the level names used in node IDs.
    dimension_map = {
        "灵感点维度": "灵感点",
        "目的点维度": "目的点",
        "关键点维度": "关键点",
    }

    for dim_key, dim_data in single_dim.items():
        source_dimension = dimension_map.get(dim_key)
        if source_dimension is None or not isinstance(dim_data, dict):
            continue

        # Walk every association direction recorded under this dimension.
        for direction_key, direction_data in dim_data.items():
            # "说明" holds a description; real directions look like "灵感点→目的点".
            if direction_key == "说明" or "→" not in direction_key:
                continue
            if not isinstance(direction_data, dict):
                continue

            for source_path, source_info in direction_data.items():
                if not isinstance(source_info, dict):
                    continue

                source_node_id = build_node_id(source_dimension, "分类", source_path)

                # The target dimension is inferred from the field name,
                # e.g. "与目的点的关联" -> "目的点".
                for field_name, associations in source_info.items():
                    if not (field_name.startswith("与") and field_name.endswith("的关联")):
                        continue
                    if not isinstance(associations, list):
                        continue

                    # Strip the leading "与" and the trailing "的关联".
                    target_dimension = field_name[1:-3]

                    for assoc in associations:
                        if not isinstance(assoc, dict):
                            continue

                        target_path = assoc.get("目标分类", "")
                        if not target_path:
                            continue

                        target_node_id = build_node_id(target_dimension, "分类", target_path)

                        edges.append({
                            "源节点ID": source_node_id,
                            "目标节点ID": target_node_id,
                            "边类型": f"{source_dimension}_分类-{target_dimension}_分类",
                            "边详情": {
                                "Jaccard相似度": assoc.get("Jaccard相似度", 0),
                                "重叠系数": assoc.get("重叠系数", 0),
                                "共同帖子数": assoc.get("共同帖子数", 0),
                                "共同帖子ID": assoc.get("共同帖子ID", []),
                            },
                        })

    return edges


def main():
    """Command-line entry point.

    Parses the required ``--input``/``-i`` and ``--output``/``-o`` options,
    loads the input JSON, extracts edges with
    ``extract_edges_from_single_dimension``, prints the total and a per-type
    count, and writes the result as UTF-8 JSON to the output path (creating
    the parent directory if needed).
    """
    cli = argparse.ArgumentParser(description="从dimension_associations_analysis.json中提取分类边关系")
    cli.add_argument("--input", "-i", type=str, required=True, help="输入文件路径")
    cli.add_argument("--output", "-o", type=str, required=True, help="输出文件路径")
    options = cli.parse_args()

    input_file, output_file = Path(options.input), Path(options.output)
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")

    # Load the analysis document.
    print(f"\n正在读取文件: {input_file}")
    with open(input_file, "r", encoding="utf-8") as src:
        data = json.load(src)

    # Pull the edges out of the loaded document.
    print("\n正在提取边关系...")
    edges = extract_edges_from_single_dimension(data)
    print(f"提取到 {len(edges)} 条边")

    # Tally how many edges there are of each type.
    tally = {}
    for kind in (item["边类型"] for item in edges):
        tally[kind] = 1 + tally.get(kind, 0)

    print("\n边类型统计:")
    for kind in sorted(tally):
        print(f"  {kind}: {tally[kind]} 条")

    # Assemble the output payload: a short description plus the edge list.
    description = {
        "描述": "分类之间的边关系",
        "数据来源": input_file.name
    }
    output = {"说明": description, "边列表": edges}

    # Make sure the destination directory exists before writing.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n正在保存结果到: {output_file}")
    with open(output_file, "w", encoding="utf-8") as dst:
        json.dump(output, dst, ensure_ascii=False, indent=2)

    print("完成!")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()