před 5 dny · 40a7b792e3
--- a/script/data_processing/build_persona_tree.py
+++ b/script/data_processing/build_persona_tree.py
@@ -0,0 +1,187 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+"""
			
 
				+构建人设树的中间数据
			
 
				+
			
 
				+输入：节点列表.json, 边关系.json
			
 
				+输出：persona_tree.json（包含分类和标签的层级树结构）
			
 
				+"""
			
 
				+
			
 
				+import json
			
 
				+from pathlib import Path
			
 
				+import sys
			
 
				+
			
 
				+# 添加项目根目录到路径
			
 
				+project_root = Path(__file__).parent.parent.parent
			
 
				+sys.path.insert(0, str(project_root))
			
 
				+
			
 
				+from script.data_processing.path_config import PathConfig
			
 
				+
			
 
				+
			
 
				+def build_persona_tree():
			
 
				+    """构建人设树数据"""
			
 
				+    config = PathConfig()
			
 
				+
			
 
				+    print(f"账号: {config.account_name}")
			
 
				+    print(f"输出版本: {config.output_version}")
			
 
				+    print()
			
 
				+
			
 
				+    node_list_file = config.intermediate_dir / "节点列表.json"
			
 
				+    edge_list_file = config.intermediate_dir / "边关系.json"
			
 
				+    output_file = config.intermediate_dir / "persona_tree.json"
			
 
				+
			
 
				+    # 读取节点
			
 
				+    print(f"读取节点列表: {node_list_file.name}")
			
 
				+    with open(node_list_file, "r", encoding="utf-8") as f:
			
 
				+        node_data = json.load(f)
			
 
				+
			
 
				+    all_nodes = node_data.get("节点列表", [])
			
 
				+
			
 
				+    # 分离分类和标签
			
 
				+    category_nodes = [n for n in all_nodes if n.get("节点类型") == "分类"]
			
 
				+    tag_nodes = [n for n in all_nodes if n.get("节点类型") == "标签"]
			
 
				+
			
 
				+    print(f"  分类节点: {len(category_nodes)}")
			
 
				+    print(f"  标签节点: {len(tag_nodes)}")
			
 
				+
			
 
				+    # 读取边关系（获取所有边）
			
 
				+    print(f"读取边关系: {edge_list_file.name}")
			
 
				+    with open(edge_list_file, "r", encoding="utf-8") as f:
			
 
				+        edge_data = json.load(f)
			
 
				+
			
 
				+    all_edges = edge_data.get("边列表", [])
			
 
				+
			
 
				+    # 统计各类型边
			
 
				+    edge_type_counts = {}
			
 
				+    for e in all_edges:
			
 
				+        t = e.get("边类型", "未知")
			
 
				+        edge_type_counts[t] = edge_type_counts.get(t, 0) + 1
			
 
				+
			
 
				+    for t, count in sorted(edge_type_counts.items(), key=lambda x: -x[1]):
			
 
				+        print(f"  {t}: {count}")
			
 
				+
			
 
				+    # 构建树结构
			
 
				+    tree_nodes = []
			
 
				+    tree_edges = []
			
 
				+
			
 
				+    # 添加分类节点
			
 
				+    for n in category_nodes:
			
 
				+        tree_nodes.append({
			
 
				+            "节点ID": n["节点ID"],
			
 
				+            "节点名称": n["节点名称"],
			
 
				+            "节点类型": "分类",
			
 
				+            "节点层级": n.get("节点层级", ""),
			
 
				+            "所属分类": n.get("所属分类", []),
			
 
				+            "帖子数": n.get("帖子数", 0)
			
 
				+        })
			
 
				+
			
 
				+    # 添加标签节点
			
 
				+    for n in tag_nodes:
			
 
				+        tree_nodes.append({
			
 
				+            "节点ID": n["节点ID"],
			
 
				+            "节点名称": n["节点名称"],
			
 
				+            "节点类型": "标签",
			
 
				+            "节点层级": n.get("节点层级", ""),
			
 
				+            "所属分类": n.get("所属分类", []),
			
 
				+            "帖子数": n.get("帖子数", 0)
			
 
				+        })
			
 
				+
			
 
				+    # 构建节点ID集合和名称映射
			
 
				+    node_ids = set(n["节点ID"] for n in tree_nodes)
			
 
				+
			
 
				+    # 按层级构建分类名称到ID的映射
			
 
				+    category_name_to_id = {}
			
 
				+    for n in category_nodes:
			
 
				+        level = n.get("节点层级", "")
			
 
				+        name = n.get("节点名称", "")
			
 
				+        category_name_to_id[(level, name)] = n["节点ID"]
			
 
				+
			
 
				+    # 从分类的"所属分类"字段构建分类之间的层级边（统一用"属于"）
			
 
				+    for n in category_nodes:
			
 
				+        level = n.get("节点层级", "")
			
 
				+        parent_names = n.get("所属分类", [])
			
 
				+        if parent_names:
			
 
				+            parent_name = parent_names[-1]  # 取最后一个作为直接父分类
			
 
				+            parent_id = category_name_to_id.get((level, parent_name))
			
 
				+            if parent_id:
			
 
				+                tree_edges.append({
			
 
				+                    "源节点ID": n["节点ID"],
			
 
				+                    "目标节点ID": parent_id,
			
 
				+                    "边类型": "属于"
			
 
				+                })
			
 
				+
			
 
				+    # 添加所有原始边（两端节点都在树中的，排除"包含"边因为与"属于"重复）
			
 
				+    for e in all_edges:
			
 
				+        src_id = e["源节点ID"]
			
 
				+        tgt_id = e["目标节点ID"]
			
 
				+        edge_type = e["边类型"]
			
 
				+        # 跳过"包含"边（与"属于"是反向关系，保留"属于"即可）
			
 
				+        if edge_type == "包含":
			
 
				+            continue
			
 
				+        if src_id in node_ids and tgt_id in node_ids:
			
 
				+            tree_edges.append({
			
 
				+                "源节点ID": src_id,
			
 
				+                "目标节点ID": tgt_id,
			
 
				+                "边类型": edge_type,
			
 
				+                "边详情": e.get("边详情", {})
			
 
				+            })
			
 
				+
			
 
				+    # 从标签的"所属分类"字段补充标签->分类的边（如果不存在）
			
 
				+    for n in tag_nodes:
			
 
				+        level = n.get("节点层级", "")
			
 
				+        parent_names = n.get("所属分类", [])
			
 
				+        if parent_names:
			
 
				+            parent_name = parent_names[-1]
			
 
				+            parent_id = category_name_to_id.get((level, parent_name))
			
 
				+            if parent_id:
			
 
				+                # 检查是否已存在属于边
			
 
				+                edge_exists = any(
			
 
				+                    e["源节点ID"] == n["节点ID"] and e["目标节点ID"] == parent_id
			
 
				+                    and e["边类型"] == "属于"
			
 
				+                    for e in tree_edges
			
 
				+                )
			
 
				+                if not edge_exists:
			
 
				+                    tree_edges.append({
			
 
				+                        "源节点ID": n["节点ID"],
			
 
				+                        "目标节点ID": parent_id,
			
 
				+                        "边类型": "属于",
			
 
				+                        "边详情": {}
			
 
				+                    })
			
 
				+
			
 
				+    # 统计各类型边
			
 
				+    tree_edge_counts = {}
			
 
				+    for e in tree_edges:
			
 
				+        t = e["边类型"]
			
 
				+        tree_edge_counts[t] = tree_edge_counts.get(t, 0) + 1
			
 
				+
			
 
				+    print()
			
 
				+    print(f"构建人设树:")
			
 
				+    print(f"  总节点数: {len(tree_nodes)}")
			
 
				+    print(f"  总边数: {len(tree_edges)}")
			
 
				+    for t, count in sorted(tree_edge_counts.items(), key=lambda x: -x[1]):
			
 
				+        print(f"    {t}: {count}")
			
 
				+
			
 
				+    # 输出
			
 
				+    output_data = {
			
 
				+        "说明": {
			
 
				+            "描述": "人设树结构数据（包含分类、标签和所有边类型）",
			
 
				+            "分类节点数": len(category_nodes),
			
 
				+            "标签节点数": len(tag_nodes),
			
 
				+            "总边数": len(tree_edges),
			
 
				+            "边类型统计": tree_edge_counts
			
 
				+        },
			
 
				+        "nodes": tree_nodes,
			
 
				+        "edges": tree_edges
			
 
				+    }
			
 
				+
			
 
				+    with open(output_file, "w", encoding="utf-8") as f:
			
 
				+        json.dump(output_data, f, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    print()
			
 
				+    print(f"输出文件: {output_file}")
			
 
				+
			
 
				+    return output_file
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    build_persona_tree()
			
--- a/script/data_processing/visualize_match_graph.py
+++ b/script/data_processing/visualize_match_graph.py