#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build the intermediate data for post trees.

Input:  match_graph/*.json, results/*.json
Output: match_graph/post_trees.json (tree structure for every post)
"""

import json
from pathlib import Path
import sys

# Make the project root importable so the ``script`` package resolves.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig

# Dimensions in the order they appear under the root node.
_DIMENSIONS = ("灵感点", "目的点", "关键点")

# Display colour for each dimension.
_DIM_COLORS = {
    "灵感点": "#f39c12",
    "目的点": "#3498db",
    "关键点": "#9b59b6",
}

# Fallback colours for nodes whose level is not one of the dimensions.
_DEFAULT_POST_NODE_COLOR = "#888"
_DEFAULT_PERSONA_COLOR = "#2ecc71"

# Edge types starting with this prefix link a post tag to a persona node.
_MATCH_PREFIX = "匹配_"

# Markers embedded in persona node IDs, paired with the original node type
# they denote. The tag marker is checked first.
_PERSONA_ID_MARKERS = (("_标签_", "标签"), ("_分类_", "分类"))


def _load_post_detail(results_dir, post_id, post_title):
    """Return the post detail dict, preferring ``<post_id>_how.json``.

    Falls back to a minimal ``{"title", "post_id"}`` dict when the file is
    missing or has no ``帖子详情`` entry.
    """
    post_detail = {"title": post_title, "post_id": post_id}
    how_file = results_dir / f"{post_id}_how.json"
    if how_file.exists():
        with open(how_file, "r", encoding="utf-8") as f:
            how_data = json.load(f)
        if "帖子详情" in how_data:
            post_detail = how_data["帖子详情"]
            post_detail["post_id"] = post_id
        print(f" 读取帖子详情: {how_file.name}")
    return post_detail


def _base_post_node(raw, node_type):
    """Return the fields shared by point nodes and tag nodes."""
    level = raw.get("节点层级", "")
    return {
        "id": raw["节点ID"],
        "name": raw["节点名称"],
        "nodeType": node_type,
        "level": level,
        "dimColor": _DIM_COLORS.get(level, _DEFAULT_POST_NODE_COLOR),
    }


def _build_point_map(post_points):
    """Map point node ID -> tree node for every post point."""
    return {
        raw["节点ID"]: {
            **_base_post_node(raw, "点"),
            "description": raw.get("描述", ""),
            "children": [],
        }
        for raw in post_points
    }


def _build_tag_map(post_tags):
    """Map tag node ID -> tree node for every post tag."""
    return {
        raw["节点ID"]: {
            **_base_post_node(raw, "标签"),
            "weight": raw.get("权重", 0),
            "children": [],
        }
        for raw in post_tags
    }


def _split_persona_id(persona_id):
    """Split a persona node ID into ``(level, name, original_type)``.

    IDs look like ``<level>_标签_<name>`` or ``<level>_分类_<name>``. Only the
    first marker is treated as the separator, so a name that itself contains
    the marker text is kept whole. IDs without a marker yield an empty level,
    the full ID as the name, and the type ``标签``.
    """
    for marker, original_type in _PERSONA_ID_MARKERS:
        level, found, name = persona_id.partition(marker)
        if found:
            return level, name, original_type
    return "", persona_id, "标签"


def _build_persona_matches(match_edges):
    """Turn match edges (post tag -> persona) into first-level persona nodes.

    Returns:
        A pair ``(tag_to_persona_matches, direct_persona_ids)``: a dict from
        post-tag ID to its list of persona nodes (in edge order), and the set
        of persona IDs that were matched directly.
    """
    tag_to_persona_matches = {}
    direct_persona_ids = set()
    for edge in match_edges:
        tag_id = edge["源节点ID"]
        persona_id = edge["目标节点ID"]
        edge_type = edge["边类型"]  # e.g. 匹配_相同 or 匹配_相似
        similarity = edge.get("边详情", {}).get("相似度", 0)

        direct_persona_ids.add(persona_id)
        level, name, original_type = _split_persona_id(persona_id)
        tag_to_persona_matches.setdefault(tag_id, []).append({
            "id": f"persona_{persona_id}",
            "name": name,
            "nodeType": "人设",
            "originalType": original_type,
            "personaId": persona_id,
            "level": level,
            "dimColor": _DIM_COLORS.get(level, _DEFAULT_PERSONA_COLOR),
            "matchType": edge_type.replace(_MATCH_PREFIX, ""),
            "similarity": similarity,
            "children": [],
        })
    return tag_to_persona_matches, direct_persona_ids


def _build_persona_expansions(persona_edges, direct_persona_ids,
                              expanded_nodes_map):
    """Find the second-level (expanded) nodes of each directly matched persona.

    An edge contributes when one endpoint is a directly matched persona and
    the other is an expanded node, in either direction. Each expanded node
    appears at most once per persona; the first edge that links them wins.

    Returns:
        A dict from persona ID to its list of expanded tree nodes.
    """
    # persona ID -> {expanded ID -> node}; dicts keep first-seen order and
    # give constant-time duplicate checks.
    by_persona = {}

    def link(direct_id, expanded_id, edge_type):
        if direct_id not in direct_persona_ids:
            return
        if expanded_id not in expanded_nodes_map:
            return
        siblings = by_persona.setdefault(direct_id, {})
        if expanded_id in siblings:
            return
        raw = expanded_nodes_map[expanded_id]
        level = raw.get("节点层级", "")
        siblings[expanded_id] = {
            "id": f"expanded_{expanded_id}",
            "name": raw.get("节点名称", expanded_id),
            "nodeType": "人设扩展",
            "originalType": raw.get("节点类型", "标签"),
            "personaId": expanded_id,
            "level": level,
            "dimColor": _DIM_COLORS.get(level, _DEFAULT_PERSONA_COLOR),
            "edgeType": edge_type,
            "children": [],
        }

    for edge in persona_edges:
        src_id = edge["源节点ID"]
        tgt_id = edge["目标节点ID"]
        edge_type = edge["边类型"]
        link(src_id, tgt_id, edge_type)  # direct -> expanded
        link(tgt_id, src_id, edge_type)  # expanded -> direct

    return {
        persona_id: list(nodes.values())
        for persona_id, nodes in by_persona.items()
    }


def _count_nodes(node):
    """Return the number of nodes in the subtree rooted at ``node``."""
    return 1 + sum(_count_nodes(child) for child in node["children"])


def _build_post_tree(match_graph_data, results_dir):
    """Build the tree entry for one post from its match-graph data.

    The tree has the shape: post -> dimension -> point -> tag ->
    persona (level 1) -> expanded persona (level 2).
    """
    post_id = match_graph_data["说明"]["帖子ID"]
    post_title = match_graph_data["说明"].get("帖子标题", "")
    post_detail = _load_post_detail(results_dir, post_id, post_title)

    post_points = match_graph_data.get("帖子点节点列表", [])
    post_tags = match_graph_data.get("帖子标签节点列表", [])
    belong_edges = match_graph_data.get("帖子属于边列表", [])

    # Match edges go from a post tag to a persona node; every other edge is
    # a relation between persona nodes.
    all_edges = match_graph_data.get("边列表", [])
    match_edges = [
        e for e in all_edges if e["边类型"].startswith(_MATCH_PREFIX)
    ]
    persona_edges = [
        e for e in all_edges if not e["边类型"].startswith(_MATCH_PREFIX)
    ]
    print(f" 帖子点: {len(post_points)}, 帖子标签: {len(post_tags)}, 属于边: {len(belong_edges)}, 匹配边: {len(match_edges)}")

    point_map = _build_point_map(post_points)
    tag_map = _build_tag_map(post_tags)

    # Nodes flagged as expanded, keyed by ID.
    expanded_nodes_map = {
        n["节点ID"]: n
        for n in match_graph_data.get("节点列表", [])
        if n.get("是否扩展")
    }

    tag_to_persona_matches, direct_persona_ids = _build_persona_matches(
        match_edges
    )
    persona_to_expanded = _build_persona_expansions(
        persona_edges, direct_persona_ids, expanded_nodes_map
    )

    # Attach expanded nodes to persona nodes, then persona nodes to tags.
    # The expanded count covers every persona node; the persona count covers
    # only tags that exist in tag_map.
    persona_count = 0
    expanded_count = 0
    for tag_id, persona_nodes in tag_to_persona_matches.items():
        for persona_node in persona_nodes:
            persona_id = persona_node["personaId"]
            if persona_id in persona_to_expanded:
                persona_node["children"] = persona_to_expanded[persona_id]
                expanded_count += len(persona_to_expanded[persona_id])
        if tag_id in tag_map:
            tag_map[tag_id]["children"] = persona_nodes
            persona_count += len(persona_nodes)
    print(f" 人设匹配节点(1层): {persona_count}, 扩展节点(2层): {expanded_count}")

    # Hang each tag under its point according to the belong edges.
    for edge in belong_edges:
        tag_node = tag_map.get(edge["源节点ID"])
        point_node = point_map.get(edge["目标节点ID"])
        if tag_node and point_node:
            point_node["children"].append(tag_node)

    # Group point nodes by dimension; dimensions with no points are omitted.
    dimension_children = []
    for dim in _DIMENSIONS:
        dim_points = [
            point_map[n["节点ID"]]
            for n in post_points
            if n.get("节点层级") == dim
        ]
        if dim_points:
            dimension_children.append({
                "id": f"dim_{dim}",
                "name": dim,
                "nodeType": "维度",
                "isDimension": True,
                "dimColor": _DIM_COLORS[dim],
                "children": dim_points,
            })

    # Root node (the post). Long titles are shortened for display.
    if len(post_title) > 20:
        display_name = post_title[:20] + "..."
    else:
        display_name = post_title
    root_node = {
        "id": f"post_{post_id}",
        "name": display_name,
        "nodeType": "帖子",
        "isRoot": True,
        "postDetail": post_detail,
        "children": dimension_children,
    }

    total_nodes = _count_nodes(root_node)
    print(f" 构建完成: {total_nodes} 个节点(人设1层: {persona_count}, 扩展2层: {expanded_count})")

    return {
        "postId": post_id,
        "postTitle": post_title,
        "postDetail": post_detail,
        "root": root_node,
        "stats": {
            "totalNodes": total_nodes,
            "pointCount": len(post_points),
            "tagCount": len(post_tags),
            "personaCount": persona_count,
        },
    }


def build_post_trees() -> Path:
    """Build the tree data for every post and write it to one JSON file.

    Reads every ``*_match_graph.json`` under the ``match_graph`` directory
    given by ``PathConfig``, builds one tree per post, and writes them all to
    ``match_graph/post_trees.json``.

    Returns:
        The path of the written output file.
    """
    config = PathConfig()

    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    match_graph_dir = config.intermediate_dir / "match_graph"
    results_dir = config.intermediate_dir.parent / "results"
    output_file = match_graph_dir / "post_trees.json"

    # Sorted so the output order is stable across runs.
    graph_files = sorted(match_graph_dir.glob("*_match_graph.json"))
    print(f"找到 {len(graph_files)} 个匹配图谱文件")

    all_post_trees = []
    for i, graph_file in enumerate(graph_files, 1):
        print(f"\n[{i}/{len(graph_files)}] 处理: {graph_file.name}")
        with open(graph_file, "r", encoding="utf-8") as f:
            match_graph_data = json.load(f)
        all_post_trees.append(_build_post_tree(match_graph_data, results_dir))

    output_data = {
        "说明": {
            "描述": "帖子树结构数据(每个帖子一棵树)",
            "帖子数": len(all_post_trees),
        },
        "postTrees": all_post_trees,
    }

    # Create the output directory first so a fresh workspace does not fail
    # on open().
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print()
    print("=" * 60)
    print("构建完成!")
    print(f" 帖子数: {len(all_post_trees)}")
    print(f" 输出文件: {output_file}")

    return output_file


if __name__ == "__main__":
    build_post_trees()