#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build the intermediate data for the persona tree.

Input:  节点列表.json, 边关系.json
Output: persona_tree.json (hierarchical tree structure containing
        categories and tags)
"""

from collections import Counter
import json
from pathlib import Path
import sys

# Add the project root to sys.path so the project-local import below resolves.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig


def _tree_node(node, node_type):
    """Return the trimmed-down tree record for *node*, tagged as *node_type*."""
    return {
        "节点ID": node["节点ID"],
        "节点名称": node["节点名称"],
        "节点类型": node_type,
        "节点层级": node.get("节点层级", ""),
        "所属分类": node.get("所属分类", []),
        "帖子数": node.get("帖子数", 0),
    }


def _count_edge_types(edges):
    """Count edges per "边类型"; edges without that key are counted as "未知"."""
    return dict(Counter(e.get("边类型", "未知") for e in edges))


def _print_edge_type_counts(counts):
    """Print one line per edge type, most frequent first."""
    for edge_type, count in sorted(counts.items(), key=lambda item: -item[1]):
        print(f" {edge_type}: {count}")


def _build_tree(category_nodes, tag_nodes, all_edges):
    """Assemble the tree's node and edge lists from the raw nodes and edges.

    Args:
        category_nodes: Raw node dicts whose "节点类型" is "分类".
        tag_nodes: Raw node dicts whose "节点类型" is "标签".
        all_edges: Raw edge dicts; each must carry "源节点ID", "目标节点ID"
            and "边类型".

    Returns:
        A ``(tree_nodes, tree_edges)`` tuple of lists of dicts.

    Raises:
        KeyError: If a node lacks "节点ID"/"节点名称" or an edge lacks one of
            its three required keys.
    """
    # Categories first, then tags.
    tree_nodes = [_tree_node(n, "分类") for n in category_nodes]
    tree_nodes.extend(_tree_node(n, "标签") for n in tag_nodes)

    node_ids = {n["节点ID"] for n in tree_nodes}
    category_ids = {n["节点ID"] for n in category_nodes}

    # Category lookup keyed by (level, name) -> node ID.
    category_name_to_id = {
        (n.get("节点层级", ""), n.get("节点名称", "")): n["节点ID"]
        for n in category_nodes
    }

    tree_edges = []
    # (source, target, type) of every edge in tree_edges, so that duplicate
    # checks are a set lookup instead of a scan over the whole edge list.
    edge_keys = set()

    # Keep every raw edge whose two endpoints are both in the tree.
    for e in all_edges:
        src_id = e["源节点ID"]
        tgt_id = e["目标节点ID"]
        edge_type = e["边类型"]
        if src_id in node_ids and tgt_id in node_ids:
            tree_edges.append({
                "源节点ID": src_id,
                "目标节点ID": tgt_id,
                "边类型": edge_type,
                "边详情": e.get("边详情", {}),
            })
            edge_keys.add((src_id, tgt_id, edge_type))

    # Supplement missing "属于" edges from each node's "所属分类" field:
    # category -> parent category first, then tag -> category.
    for n in [*category_nodes, *tag_nodes]:
        parent_names = n.get("所属分类", [])
        if not parent_names:
            continue
        # The last entry is taken as the direct parent; it is looked up at
        # the node's own level.
        parent_key = (n.get("节点层级", ""), parent_names[-1])
        parent_id = category_name_to_id.get(parent_key)
        # `is None` rather than truthiness, so a falsy ID such as 0 still
        # counts as a valid parent.
        if parent_id is None:
            continue
        key = (n["节点ID"], parent_id, "属于")
        if key in edge_keys:
            continue
        tree_edges.append({
            "源节点ID": n["节点ID"],
            "目标节点ID": parent_id,
            "边类型": "属于",
            # Always present so every edge in the output has the same keys.
            "边详情": {},
        })
        edge_keys.add(key)

    # For every category -> category "属于" edge, add the reverse "包含" edge
    # so that a parent category also has an edge to each child category.
    # Only edges that were in the tree before this pass are consulted by the
    # existence check (edge_keys is intentionally not updated in this loop).
    contain_edges_to_add = []
    for e in tree_edges:
        if e["边类型"] != "属于":
            continue
        child_id = e["源节点ID"]
        parent_id = e["目标节点ID"]
        if child_id not in category_ids or parent_id not in category_ids:
            continue
        if (parent_id, child_id, "包含") in edge_keys:
            continue
        contain_edges_to_add.append({
            "源节点ID": parent_id,
            "目标节点ID": child_id,
            "边类型": "包含",
            "边详情": {"说明": "分类层级关系(属于的反向)"},
        })
    tree_edges.extend(contain_edges_to_add)

    return tree_nodes, tree_edges


def build_persona_tree():
    """Build the persona tree data and write it to persona_tree.json.

    Reads "节点列表.json" and "边关系.json" from ``config.intermediate_dir``
    of a ``PathConfig`` instance, keeps the category ("分类") and tag ("标签")
    nodes together with the edges connecting them, supplements missing
    hierarchy edges, and writes the result to "persona_tree.json" in the same
    directory. Progress and statistics are printed to stdout.

    Returns:
        The path of the written output file.
    """
    config = PathConfig()

    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    node_list_file = config.intermediate_dir / "节点列表.json"
    edge_list_file = config.intermediate_dir / "边关系.json"
    output_file = config.intermediate_dir / "persona_tree.json"

    # Load nodes and split them into categories and tags.
    print(f"读取节点列表: {node_list_file.name}")
    with open(node_list_file, "r", encoding="utf-8") as f:
        node_data = json.load(f)
    all_nodes = node_data.get("节点列表", [])

    category_nodes = [n for n in all_nodes if n.get("节点类型") == "分类"]
    tag_nodes = [n for n in all_nodes if n.get("节点类型") == "标签"]
    print(f" 分类节点: {len(category_nodes)}")
    print(f" 标签节点: {len(tag_nodes)}")

    # Load all edges and report how many there are of each type.
    print(f"读取边关系: {edge_list_file.name}")
    with open(edge_list_file, "r", encoding="utf-8") as f:
        edge_data = json.load(f)
    all_edges = edge_data.get("边列表", [])
    _print_edge_type_counts(_count_edge_types(all_edges))

    tree_nodes, tree_edges = _build_tree(category_nodes, tag_nodes, all_edges)
    tree_edge_counts = _count_edge_types(tree_edges)

    print()
    print("构建人设树:")
    print(f" 总节点数: {len(tree_nodes)}")
    print(f" 总边数: {len(tree_edges)}")
    _print_edge_type_counts(tree_edge_counts)

    output_data = {
        "说明": {
            "描述": "人设树结构数据(包含分类、标签和所有边类型)",
            "分类节点数": len(category_nodes),
            "标签节点数": len(tag_nodes),
            "总边数": len(tree_edges),
            "边类型统计": tree_edge_counts,
        },
        "nodes": tree_nodes,
        "edges": tree_edges,
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print()
    print(f"输出文件: {output_file}")

    return output_file


if __name__ == "__main__":
    build_persona_tree()