extract_category_edges.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 从dimension_associations_analysis.json中提取分类之间的边关系
  5. """
  6. import json
  7. from pathlib import Path
  8. from typing import Dict, List, Any
  9. import argparse
  10. def get_last_segment(path: str) -> str:
  11. """获取路径的最后一段"""
  12. return path.split("/")[-1]
  13. def build_node_id(dimension: str, node_type: str, name: str) -> str:
  14. """
  15. 构建节点ID
  16. Args:
  17. dimension: 节点层级(灵感点、目的点、关键点)
  18. node_type: 节点类型(分类、标签)
  19. name: 节点名称(完整路径)
  20. Returns:
  21. 节点ID,格式: {层级}_{类型}_{名称最后一段}
  22. """
  23. last_segment = get_last_segment(name)
  24. return f"{dimension}_{node_type}_{last_segment}"
  25. def extract_edges_from_single_dimension(data: Dict) -> List[Dict]:
  26. """
  27. 从单维度关联分析中提取边
  28. Args:
  29. data: 单维度关联分析数据
  30. Returns:
  31. 边列表
  32. """
  33. edges = []
  34. if "单维度关联分析" not in data:
  35. return edges
  36. single_dim = data["单维度关联分析"]
  37. # 维度映射
  38. dimension_map = {
  39. "灵感点维度": "灵感点",
  40. "目的点维度": "目的点",
  41. "关键点维度": "关键点"
  42. }
  43. for dim_key, dim_data in single_dim.items():
  44. if dim_key not in dimension_map:
  45. continue
  46. source_dimension = dimension_map[dim_key]
  47. # 遍历该维度下的所有关联方向
  48. for direction_key, direction_data in dim_data.items():
  49. if direction_key == "说明":
  50. continue
  51. # 解析方向,如 "灵感点→目的点"
  52. if "→" not in direction_key:
  53. continue
  54. # 遍历每个源分类
  55. for source_path, source_info in direction_data.items():
  56. source_node_id = build_node_id(source_dimension, "分类", source_path)
  57. # 确定目标维度
  58. # 从关联字段名推断,如 "与目的点的关联"
  59. for field_name, associations in source_info.items():
  60. if not field_name.startswith("与") or not field_name.endswith("的关联"):
  61. continue
  62. # 提取目标维度名称
  63. target_dimension = field_name[1:-3] # 去掉"与"和"的关联"
  64. if not isinstance(associations, list):
  65. continue
  66. for assoc in associations:
  67. target_path = assoc.get("目标分类", "")
  68. if not target_path:
  69. continue
  70. target_node_id = build_node_id(target_dimension, "分类", target_path)
  71. edge = {
  72. "源节点ID": source_node_id,
  73. "目标节点ID": target_node_id,
  74. "边类型": f"{source_dimension}_分类-{target_dimension}_分类",
  75. "边详情": {
  76. "Jaccard相似度": assoc.get("Jaccard相似度", 0),
  77. "重叠系数": assoc.get("重叠系数", 0),
  78. "共同帖子数": assoc.get("共同帖子数", 0),
  79. "共同帖子ID": assoc.get("共同帖子ID", [])
  80. }
  81. }
  82. edges.append(edge)
  83. return edges
  84. def main():
  85. parser = argparse.ArgumentParser(description="从dimension_associations_analysis.json中提取分类边关系")
  86. parser.add_argument("--input", "-i", type=str, required=True, help="输入文件路径")
  87. parser.add_argument("--output", "-o", type=str, required=True, help="输出文件路径")
  88. args = parser.parse_args()
  89. input_file = Path(args.input)
  90. output_file = Path(args.output)
  91. print(f"输入文件: {input_file}")
  92. print(f"输出文件: {output_file}")
  93. # 读取输入文件
  94. print(f"\n正在读取文件: {input_file}")
  95. with open(input_file, "r", encoding="utf-8") as f:
  96. data = json.load(f)
  97. # 提取边
  98. print("\n正在提取边关系...")
  99. edges = extract_edges_from_single_dimension(data)
  100. print(f"提取到 {len(edges)} 条边")
  101. # 统计边类型
  102. edge_type_count = {}
  103. for edge in edges:
  104. edge_type = edge["边类型"]
  105. edge_type_count[edge_type] = edge_type_count.get(edge_type, 0) + 1
  106. print("\n边类型统计:")
  107. for edge_type, count in sorted(edge_type_count.items()):
  108. print(f" {edge_type}: {count} 条")
  109. # 构建输出
  110. output = {
  111. "说明": {
  112. "描述": "分类之间的边关系",
  113. "数据来源": input_file.name
  114. },
  115. "边列表": edges
  116. }
  117. # 确保输出目录存在
  118. output_file.parent.mkdir(parents=True, exist_ok=True)
  119. # 保存结果
  120. print(f"\n正在保存结果到: {output_file}")
  121. with open(output_file, "w", encoding="utf-8") as f:
  122. json.dump(output, f, ensure_ascii=False, indent=2)
  123. print("完成!")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()