#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract edge relationships between categories from dimension_associations_analysis.json.
"""

import argparse
import json
from pathlib import Path
from typing import Dict, List


def get_last_segment(path: str) -> str:
    """Return the last segment of a slash-separated path."""
    return path.split("/")[-1]


def build_node_id(dimension: str, node_type: str, name: str) -> str:
    """
    Build a node ID.

    Args:
        dimension: node dimension (灵感点 / 目的点 / 关键点)
        node_type: node type (分类 / 标签)
        name: node name (full path)

    Returns:
        Node ID in the format: {dimension}_{type}_{last segment of name}
    """
    last_segment = get_last_segment(name)
    return f"{dimension}_{node_type}_{last_segment}"


def extract_edges_from_single_dimension(data: Dict) -> List[Dict]:
    """
    Extract edges from the single-dimension association analysis.

    Args:
        data: single-dimension association analysis data

    Returns:
        List of edges
    """
    edges = []

    if "单维度关联分析" not in data:
        return edges

    single_dim = data["单维度关联分析"]

    # Map dimension section names to dimension labels used in node IDs
    dimension_map = {
        "灵感点维度": "灵感点",
        "目的点维度": "目的点",
        "关键点维度": "关键点"
    }

    for dim_key, dim_data in single_dim.items():
        if dim_key not in dimension_map:
            continue

        source_dimension = dimension_map[dim_key]

        # Iterate over all association directions under this dimension
        for direction_key, direction_data in dim_data.items():
            if direction_key == "说明":
                continue

            # Direction keys look like "灵感点→目的点"; skip anything else
            if "→" not in direction_key:
                continue

            # Iterate over each source category
            for source_path, source_info in direction_data.items():
                source_node_id = build_node_id(source_dimension, "分类", source_path)

                # Determine the target dimension from the association field name,
                # e.g. "与目的点的关联"
                for field_name, associations in source_info.items():
                    if not field_name.startswith("与") or not field_name.endswith("的关联"):
                        continue

                    # Strip the leading "与" and trailing "的关联" to get the target dimension
                    target_dimension = field_name[1:-3]

                    if not isinstance(associations, list):
                        continue

                    for assoc in associations:
                        target_path = assoc.get("目标分类", "")
                        if not target_path:
                            continue

                        target_node_id = build_node_id(target_dimension, "分类", target_path)

                        edge = {
                            "源节点ID": source_node_id,
                            "目标节点ID": target_node_id,
                            "边类型": f"{source_dimension}_分类-{target_dimension}_分类",
                            "边详情": {
                                "Jaccard相似度": assoc.get("Jaccard相似度", 0),
                                "重叠系数": assoc.get("重叠系数", 0),
                                "共同帖子数": assoc.get("共同帖子数", 0),
                                "共同帖子ID": assoc.get("共同帖子ID", [])
                            }
                        }
                        edges.append(edge)

    return edges


def main():
    parser = argparse.ArgumentParser(
        description="Extract category edge relationships from dimension_associations_analysis.json"
    )
    parser.add_argument("--input", "-i", type=str, required=True, help="Path to the input file")
    parser.add_argument("--output", "-o", type=str, required=True, help="Path to the output file")
    args = parser.parse_args()

    input_file = Path(args.input)
    output_file = Path(args.output)

    print(f"Input file: {input_file}")
    print(f"Output file: {output_file}")

    # Read the input file
    print(f"\nReading file: {input_file}")
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract edges
    print("\nExtracting edge relationships...")
    edges = extract_edges_from_single_dimension(data)
    print(f"Extracted {len(edges)} edges")

    # Count edges by type
    edge_type_count = {}
    for edge in edges:
        edge_type = edge["边类型"]
        edge_type_count[edge_type] = edge_type_count.get(edge_type, 0) + 1

    print("\nEdge type counts:")
    for edge_type, count in sorted(edge_type_count.items()):
        print(f"  {edge_type}: {count} edges")

    # Build the output structure
    output = {
        "说明": {
            "描述": "分类之间的边关系",
            "数据来源": input_file.name
        },
        "边列表": edges
    }

    # Make sure the output directory exists
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Save the result
    print(f"\nSaving result to: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

    print("Done!")


if __name__ == "__main__":
    main()
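
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the script's logic): a minimal example of
# the input structure this script assumes, inferred from the keys it reads
# above. The concrete category paths, metric values, and post IDs below are
# hypothetical placeholders, as is the file name in the example invocation.
#
# {
#   "单维度关联分析": {
#     "灵感点维度": {
#       "灵感点→目的点": {
#         "产品/设计": {
#           "与目的点的关联": [
#             {
#               "目标分类": "营销/推广",
#               "Jaccard相似度": 0.25,
#               "重叠系数": 0.5,
#               "共同帖子数": 3,
#               "共同帖子ID": ["post_001", "post_002", "post_003"]
#             }
#           ]
#         }
#       }
#     }
#   }
# }
#
# Example invocation (assuming this file is saved as extract_category_edges.py):
#   python extract_category_edges.py \
#       --input dimension_associations_analysis.json \
#       --output category_edges.json
# ---------------------------------------------------------------------------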