| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 从dimension_associations_analysis.json中提取分类之间的边关系
- """
- import json
- from pathlib import Path
- from typing import Dict, List, Any
- import argparse
def get_last_segment(path: str) -> str:
    """Return the final "/"-separated component of ``path``.

    A path containing no "/" is returned unchanged, and a path that ends
    with "/" yields an empty string.
    """
    _head, _sep, tail = path.rpartition("/")
    return tail
def build_node_id(dimension: str, node_type: str, name: str) -> str:
    """Compose a node ID from a dimension, a node type and a node name.

    Args:
        dimension: Node level placed first in the ID (e.g. "灵感点", "目的点", "关键点").
        node_type: Node type placed second in the ID (e.g. "分类", "标签").
        name: Node name given as a full "/"-separated path; only its last
            segment ends up in the ID.

    Returns:
        The ID in the form ``{dimension}_{node_type}_{last segment of name}``.
    """
    return f"{dimension}_{node_type}_{get_last_segment(name)}"
def _collect_source_edges(source_dimension: str, source_path: str, source_info: Dict) -> List[Dict]:
    """Build every edge that starts at one source category.

    Args:
        source_dimension: Dimension name of the source category.
        source_path: Full path of the source category.
        source_info: Mapping of field names to values for that category; only
            fields named "与<dimension>的关联" that hold a list are used.

    Returns:
        Edge dicts in the order the associations appear.
    """
    source_node_id = build_node_id(source_dimension, "分类", source_path)
    edges: List[Dict] = []
    for field_name, associations in source_info.items():
        # Association fields look like "与目的点的关联"; every other field is skipped.
        if not (field_name.startswith("与") and field_name.endswith("的关联")):
            continue
        if not isinstance(associations, list):
            continue
        # Drop the leading "与" (1 char) and the trailing "的关联" (3 chars).
        target_dimension = field_name[1:-3]
        for assoc in associations:
            # Skip malformed entries instead of failing on .get().
            if not isinstance(assoc, dict):
                continue
            target_path = assoc.get("目标分类", "")
            if not target_path:
                continue
            edges.append({
                "源节点ID": source_node_id,
                "目标节点ID": build_node_id(target_dimension, "分类", target_path),
                "边类型": f"{source_dimension}_分类-{target_dimension}_分类",
                "边详情": {
                    "Jaccard相似度": assoc.get("Jaccard相似度", 0),
                    "重叠系数": assoc.get("重叠系数", 0),
                    "共同帖子数": assoc.get("共同帖子数", 0),
                    "共同帖子ID": assoc.get("共同帖子ID", []),
                },
            })
    return edges


def extract_edges_from_single_dimension(data: Dict) -> List[Dict]:
    """Extract category-to-category edges from the single-dimension association analysis.

    The function walks ``data["单维度关联分析"]`` through four levels: dimension
    key, direction key (containing "→"), source category path, and
    "与<dimension>的关联" association lists. Any level whose value does not have
    the expected container type is skipped, so descriptive entries such as
    "说明" or null values never raise.

    Args:
        data: Parsed analysis JSON.

    Returns:
        A list of edge dicts with the keys "源节点ID", "目标节点ID", "边类型"
        and "边详情"; empty when the section is missing or malformed.
    """
    edges: List[Dict] = []

    single_dim = data.get("单维度关联分析") if isinstance(data, dict) else None
    if not isinstance(single_dim, dict):
        return edges

    # Maps the dimension keys used in the input to the dimension names used in node IDs.
    dimension_map = {
        "灵感点维度": "灵感点",
        "目的点维度": "目的点",
        "关键点维度": "关键点",
    }

    for dim_key, dim_data in single_dim.items():
        source_dimension = dimension_map.get(dim_key)
        if source_dimension is None or not isinstance(dim_data, dict):
            continue

        for direction_key, direction_data in dim_data.items():
            # Direction keys look like "灵感点→目的点"; "说明" and other keys are not directions.
            if direction_key == "说明" or "→" not in direction_key:
                continue
            if not isinstance(direction_data, dict):
                continue

            for source_path, source_info in direction_data.items():
                if not isinstance(source_info, dict):
                    continue
                edges.extend(
                    _collect_source_edges(source_dimension, source_path, source_info)
                )

    return edges
def main():
    """Command-line entry point: extract category edges from an analysis JSON file.

    Reads the JSON file given by ``--input``/``-i``, extracts edges with
    ``extract_edges_from_single_dimension``, prints a count per edge type, and
    writes the edges together with a short description header to the file
    given by ``--output``/``-o``. The output directory is created if missing.
    """
    # Imported here so the function does not depend on a new module-level import.
    from collections import Counter

    parser = argparse.ArgumentParser(description="从dimension_associations_analysis.json中提取分类边关系")
    parser.add_argument("--input", "-i", type=str, required=True, help="输入文件路径")
    parser.add_argument("--output", "-o", type=str, required=True, help="输出文件路径")
    args = parser.parse_args()

    input_file = Path(args.input)
    output_file = Path(args.output)
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")

    # Load the analysis data.
    print(f"\n正在读取文件: {input_file}")
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract the edges.
    print("\n正在提取边关系...")
    edges = extract_edges_from_single_dimension(data)
    print(f"提取到 {len(edges)} 条边")

    # Count edges per edge type and report them in sorted order.
    edge_type_count = Counter(edge["边类型"] for edge in edges)
    print("\n边类型统计:")
    for edge_type, count in sorted(edge_type_count.items()):
        print(f" {edge_type}: {count} 条")

    # Assemble the output document.
    output = {
        "说明": {
            "描述": "分类之间的边关系",
            "数据来源": input_file.name,
        },
        "边列表": edges,
    }

    # Make sure the destination directory exists before writing.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n正在保存结果到: {output_file}")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print("完成!")
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
|