|
@@ -0,0 +1,187 @@
|
|
|
|
|
+#!/usr/bin/env python3
|
|
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
|
|
+"""
|
|
|
|
|
+构建人设树的中间数据
|
|
|
|
|
+
|
|
|
|
|
+输入:节点列表.json, 边关系.json
|
|
|
|
|
+输出:persona_tree.json(包含分类和标签的层级树结构)
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+import json
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+import sys
|
|
|
|
|
+
|
|
|
|
|
+# 添加项目根目录到路径
|
|
|
|
|
+project_root = Path(__file__).parent.parent.parent
|
|
|
|
|
+sys.path.insert(0, str(project_root))
|
|
|
|
|
+
|
|
|
|
|
+from script.data_processing.path_config import PathConfig
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def build_persona_tree():
    """Build the persona-tree intermediate data and write it to disk.

    Reads ``节点列表.json`` (key ``节点列表``) and ``边关系.json`` (key
    ``边列表``) from ``config.intermediate_dir``, keeps the nodes whose
    ``节点类型`` is ``分类`` (category) or ``标签`` (tag), assembles the edges
    between them and writes everything to ``persona_tree.json`` in the same
    directory. Progress and statistics are printed to stdout.

    Returns:
        The path object of the written ``persona_tree.json`` file.

    Raises:
        OSError: If an input file cannot be read or the output file cannot
            be written.
        json.JSONDecodeError: If an input file is not valid JSON.
        KeyError: If a category/tag node lacks ``节点ID`` or ``节点名称``.
    """
    # Local import: the file's top-level import block is left untouched.
    from collections import Counter

    config = PathConfig()

    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    node_list_file = config.intermediate_dir / "节点列表.json"
    edge_list_file = config.intermediate_dir / "边关系.json"
    output_file = config.intermediate_dir / "persona_tree.json"

    # Load nodes and split them into categories and tags.
    print(f"读取节点列表: {node_list_file.name}")
    all_nodes = _read_json(node_list_file).get("节点列表", [])
    category_nodes = [n for n in all_nodes if n.get("节点类型") == "分类"]
    tag_nodes = [n for n in all_nodes if n.get("节点类型") == "标签"]

    print(f" 分类节点: {len(category_nodes)}")
    print(f" 标签节点: {len(tag_nodes)}")

    # Load every edge and report how many there are of each type.
    print(f"读取边关系: {edge_list_file.name}")
    all_edges = _read_json(edge_list_file).get("边列表", [])
    _print_edge_type_counts(
        Counter(e.get("边类型", "未知") for e in all_edges)
    )

    # Categories first, then tags -- the same order as the output file.
    tree_nodes = [_tree_node(n, "分类") for n in category_nodes]
    tree_nodes.extend(_tree_node(n, "标签") for n in tag_nodes)
    node_ids = {n["节点ID"] for n in tree_nodes}

    tree_edges = _build_tree_edges(
        category_nodes, tag_nodes, all_edges, node_ids
    )
    tree_edge_counts = Counter(e["边类型"] for e in tree_edges)

    print()
    print("构建人设树:")
    print(f" 总节点数: {len(tree_nodes)}")
    print(f" 总边数: {len(tree_edges)}")
    _print_edge_type_counts(tree_edge_counts)

    output_data = {
        "说明": {
            "描述": "人设树结构数据（包含分类、标签和所有边类型）",
            "分类节点数": len(category_nodes),
            "标签节点数": len(tag_nodes),
            "总边数": len(tree_edges),
            # Plain dict so the serialized form does not depend on Counter.
            "边类型统计": dict(tree_edge_counts),
        },
        "nodes": tree_nodes,
        "edges": tree_edges,
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print()
    print(f"输出文件: {output_file}")

    return output_file


def _read_json(path):
    """Load and return the UTF-8 JSON document stored at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _print_edge_type_counts(counts):
    """Print one ``type: count`` line per edge type, most frequent first."""
    # most_common() is a stable descending sort, so ties keep first-seen order.
    for edge_type, count in counts.most_common():
        print(f" {edge_type}: {count}")


def _tree_node(node, node_type):
    """Return the trimmed output record for *node*, tagged as *node_type*."""
    return {
        "节点ID": node["节点ID"],
        "节点名称": node["节点名称"],
        "节点类型": node_type,
        "节点层级": node.get("节点层级", ""),
        "所属分类": node.get("所属分类", []),
        "帖子数": node.get("帖子数", 0),
    }


def _direct_parent_id(node, category_name_to_id):
    """Return the ID of *node*'s direct parent category, or ``None``.

    The direct parent is the last name in the node's ``所属分类`` list,
    looked up among the categories on the node's own ``节点层级``.
    """
    parent_names = node.get("所属分类", [])
    if not parent_names:
        return None
    level = node.get("节点层级", "")
    return category_name_to_id.get((level, parent_names[-1]))


def _build_tree_edges(category_nodes, tag_nodes, all_edges, node_ids):
    """Assemble the edge list of the persona tree.

    Edges are emitted in three passes, in this order:

    1. category -> parent category ``属于`` edges derived from ``所属分类``;
    2. every original edge whose two endpoints are in *node_ids*, except
       ``包含`` edges (the reverse of ``属于``);
    3. tag -> category ``属于`` edges derived from ``所属分类`` that are not
       already present.
    """
    # (level, name) -> category ID; a later duplicate overwrites an earlier one.
    category_name_to_id = {
        (n.get("节点层级", ""), n.get("节点名称", "")): n["节点ID"]
        for n in category_nodes
    }

    tree_edges = []

    # Pass 1: hierarchy between categories.
    # NOTE(review): these edges carry no "边详情" key and are not
    # de-duplicated against original "属于" edges added in pass 2 --
    # kept as-is to preserve the existing output; confirm with consumers.
    for n in category_nodes:
        parent_id = _direct_parent_id(n, category_name_to_id)
        # "is not None" rather than truthiness so an ID such as 0 is kept.
        if parent_id is not None:
            tree_edges.append({
                "源节点ID": n["节点ID"],
                "目标节点ID": parent_id,
                "边类型": "属于",
            })

    # Pass 2: original edges. Fields are read with .get(), matching the
    # statistics pass, so an incomplete edge is skipped or typed "未知"
    # instead of aborting the whole build with KeyError.
    for e in all_edges:
        edge_type = e.get("边类型", "未知")
        if edge_type == "包含":
            continue
        src_id = e.get("源节点ID")
        tgt_id = e.get("目标节点ID")
        if src_id in node_ids and tgt_id in node_ids:
            tree_edges.append({
                "源节点ID": src_id,
                "目标节点ID": tgt_id,
                "边类型": edge_type,
                "边详情": e.get("边详情", {}),
            })

    # Pass 3: fill in missing tag -> category edges. A set of known "属于"
    # pairs replaces a linear scan of tree_edges for every tag.
    belongs_pairs = {
        (e["源节点ID"], e["目标节点ID"])
        for e in tree_edges
        if e["边类型"] == "属于"
    }
    for n in tag_nodes:
        parent_id = _direct_parent_id(n, category_name_to_id)
        if parent_id is None:
            continue
        pair = (n["节点ID"], parent_id)
        if pair in belongs_pairs:
            continue
        belongs_pairs.add(pair)
        tree_edges.append({
            "源节点ID": n["节点ID"],
            "目标节点ID": parent_id,
            "边类型": "属于",
            "边详情": {},
        })

    return tree_edges
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the tree; the returned output path is not
    # needed when running from the command line.
    _ = build_persona_tree()
|