build_persona_tree.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 构建人设树的中间数据
  5. 输入:节点列表.json, 边关系.json
  6. 输出:persona_tree.json(包含分类和标签的层级树结构)
  7. """
  8. import json
  9. from pathlib import Path
  10. import sys
  11. # 添加项目根目录到路径
  12. project_root = Path(__file__).parent.parent.parent
  13. sys.path.insert(0, str(project_root))
  14. from script.data_processing.path_config import PathConfig
  def _load_json(path):
      """Read and parse the UTF-8 encoded JSON file at ``path``."""
      with open(path, "r", encoding="utf-8") as f:
          return json.load(f)


  def _tree_node(node, node_type):
      """Project a raw node record onto the fields kept in the tree output."""
      return {
          "节点ID": node["节点ID"],
          "节点名称": node["节点名称"],
          "节点类型": node_type,
          "节点层级": node.get("节点层级", ""),
          "所属分类": node.get("所属分类", []),
          "帖子数": node.get("帖子数", 0),
      }


  def _count_edge_types(edges):
      """Count edges per "边类型"; a missing type is counted under "未知"."""
      counts = {}
      for edge in edges:
          edge_type = edge.get("边类型", "未知")
          counts[edge_type] = counts.get(edge_type, 0) + 1
      return counts


  def _print_edge_type_counts(counts):
      """Print one line per edge type, most frequent first."""
      for edge_type, count in sorted(counts.items(), key=lambda item: -item[1]):
          print(f" {edge_type}: {count}")


  def _add_missing_parent_edges(nodes, category_name_to_id, tree_edges, edge_keys):
      """Append a "属于" edge from each node to its direct parent category.

      The parent is the last entry of the node's "所属分类" list, resolved among
      categories that share the node's "节点层级". Nodes without a resolvable
      parent, or whose edge is already present in ``edge_keys``, are skipped.
      ``tree_edges`` and ``edge_keys`` are updated in place.
      """
      for node in nodes:
          parent_names = node.get("所属分类", [])
          if not parent_names:
              continue
          # The last listed name is taken as the direct parent category.
          lookup_key = (node.get("节点层级", ""), parent_names[-1])
          parent_id = category_name_to_id.get(lookup_key)
          if not parent_id:
              continue
          edge_key = (node["节点ID"], parent_id, "属于")
          if edge_key in edge_keys:
              continue
          edge_keys.add(edge_key)
          tree_edges.append({
              "源节点ID": node["节点ID"],
              "目标节点ID": parent_id,
              "边类型": "属于",
              # Always present so every emitted edge has the same shape.
              "边详情": {},
          })


  def _add_reverse_contain_edges(category_ids, tree_edges, edge_keys):
      """Append a parent->child "包含" edge for each category "属于" edge.

      Only "属于" edges whose two endpoints are both categories are mirrored,
      and each reverse edge is added at most once. ``tree_edges`` and
      ``edge_keys`` are updated in place.
      """
      reverse_edges = []
      for edge in tree_edges:
          if edge["边类型"] != "属于":
              continue
          child_id = edge["源节点ID"]
          parent_id = edge["目标节点ID"]
          if child_id not in category_ids or parent_id not in category_ids:
              continue
          edge_key = (parent_id, child_id, "包含")
          if edge_key in edge_keys:
              continue
          # Record the key immediately so a duplicated "属于" edge does not
          # yield a second identical "包含" edge.
          edge_keys.add(edge_key)
          reverse_edges.append({
              "源节点ID": parent_id,
              "目标节点ID": child_id,
              "边类型": "包含",
              "边详情": {"说明": "分类层级关系(属于的反向)"},
          })
      # Extend after the loop so the list is not mutated while being iterated.
      tree_edges.extend(reverse_edges)


  def _build_tree_edges(category_nodes, tag_nodes, all_edges):
      """Return the edge list of the tree for the given category and tag nodes."""
      category_ids = {n["节点ID"] for n in category_nodes}
      node_ids = category_ids | {n["节点ID"] for n in tag_nodes}

      # Category names are resolved per level: (level, name) -> node ID.
      category_name_to_id = {
          (n.get("节点层级", ""), n.get("节点名称", "")): n["节点ID"]
          for n in category_nodes
      }

      tree_edges = []
      # (source, target, type) keys of every edge already in tree_edges, so the
      # "does this edge exist?" checks are set lookups instead of list scans.
      edge_keys = set()

      # Keep every original edge whose two endpoints are both tree nodes.
      for edge in all_edges:
          src_id = edge["源节点ID"]
          tgt_id = edge["目标节点ID"]
          edge_type = edge["边类型"]
          if src_id in node_ids and tgt_id in node_ids:
              tree_edges.append({
                  "源节点ID": src_id,
                  "目标节点ID": tgt_id,
                  "边类型": edge_type,
                  "边详情": edge.get("边详情", {}),
              })
              edge_keys.add((src_id, tgt_id, edge_type))

      # Fill in hierarchy edges that the edge file did not provide.
      _add_missing_parent_edges(category_nodes, category_name_to_id, tree_edges, edge_keys)
      _add_missing_parent_edges(tag_nodes, category_name_to_id, tree_edges, edge_keys)
      _add_reverse_contain_edges(category_ids, tree_edges, edge_keys)
      return tree_edges


  def build_persona_tree(config=None):
      """Build the persona tree intermediate file and return its path.

      Reads "节点列表.json" and "边关系.json" from ``config.intermediate_dir``,
      keeps the nodes whose "节点类型" is "分类" (category) or "标签" (tag),
      keeps every edge whose two endpoints are both kept nodes, and then fills
      in hierarchy edges:

      * a "属于" edge from each category/tag to the category named by the last
        entry of its "所属分类" list, looked up among categories with the same
        "节点层级";
      * a reverse "包含" edge for every category-to-category "属于" edge.

      The result is written to "persona_tree.json" in the same directory, and
      progress information is printed to stdout.

      Args:
          config: Object exposing ``account_name``, ``output_version`` and
              ``intermediate_dir`` (a path supporting the ``/`` operator).
              Defaults to a fresh ``PathConfig()`` when omitted.

      Returns:
          The path of the written "persona_tree.json" file.

      Raises:
          KeyError: If a kept node lacks "节点ID" or "节点名称", or an edge
              lacks "源节点ID", "目标节点ID" or "边类型".
      """
      if config is None:
          config = PathConfig()

      print(f"账号: {config.account_name}")
      print(f"输出版本: {config.output_version}")
      print()

      intermediate_dir = config.intermediate_dir
      node_list_file = intermediate_dir / "节点列表.json"
      edge_list_file = intermediate_dir / "边关系.json"
      output_file = intermediate_dir / "persona_tree.json"

      # Load nodes and split them into categories and tags.
      print(f"读取节点列表: {node_list_file.name}")
      all_nodes = _load_json(node_list_file).get("节点列表", [])
      category_nodes = [n for n in all_nodes if n.get("节点类型") == "分类"]
      tag_nodes = [n for n in all_nodes if n.get("节点类型") == "标签"]
      print(f" 分类节点: {len(category_nodes)}")
      print(f" 标签节点: {len(tag_nodes)}")

      # Load all edges and report how many there are of each type.
      print(f"读取边关系: {edge_list_file.name}")
      all_edges = _load_json(edge_list_file).get("边列表", [])
      _print_edge_type_counts(_count_edge_types(all_edges))

      # Categories come first in the node list, followed by tags.
      tree_nodes = [_tree_node(n, "分类") for n in category_nodes]
      tree_nodes.extend(_tree_node(n, "标签") for n in tag_nodes)
      tree_edges = _build_tree_edges(category_nodes, tag_nodes, all_edges)
      tree_edge_counts = _count_edge_types(tree_edges)

      print()
      print("构建人设树:")
      print(f" 总节点数: {len(tree_nodes)}")
      print(f" 总边数: {len(tree_edges)}")
      _print_edge_type_counts(tree_edge_counts)

      output_data = {
          "说明": {
              "描述": "人设树结构数据(包含分类、标签和所有边类型)",
              "分类节点数": len(category_nodes),
              "标签节点数": len(tag_nodes),
              "总边数": len(tree_edges),
              "边类型统计": tree_edge_counts,
          },
          "nodes": tree_nodes,
          "edges": tree_edges,
      }
      with open(output_file, "w", encoding="utf-8") as f:
          json.dump(output_data, f, ensure_ascii=False, indent=2)

      print()
      print(f"输出文件: {output_file}")
      return output_file
  # Run the build only when this file is executed as a script, not on import.
  if __name__ == "__main__":
      build_persona_tree()