#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build the intermediate data for post trees.

Input:  match_graph/*.json, results/*.json
Output: match_graph/post_trees.json (contains the tree structure of every post)
"""

import json
from pathlib import Path
import sys

# Add the project root directory to the import path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig

# Dimension name -> display colour. Insertion order is also the order in
# which dimension nodes are emitted under the root of each tree.
_DIM_COLORS = {
    "灵感点": "#f39c12",
    "目的点": "#3498db",
    "关键点": "#9b59b6",
}

# Colour used for nodes whose level is not one of the known dimensions.
_DEFAULT_DIM_COLOR = "#888"

# Root node names longer than this are truncated and suffixed with "...".
_TITLE_MAX_LEN = 20


def _load_post_detail(results_dir: Path, post_id, post_title: str) -> dict:
    """Return the detail dict for one post.

    Starts from a minimal ``{"title", "post_id"}`` fallback.  If
    ``<results_dir>/<post_id>_how.json`` exists and has a "帖子详情" entry,
    that entry is used instead, with its "post_id" set to ``post_id``.

    Args:
        results_dir: Directory searched for ``<post_id>_how.json``.
        post_id: Post identifier taken from the match graph file.
        post_title: Post title used for the fallback detail.

    Returns:
        The detail dict to attach to the post tree.
    """
    post_detail = {"title": post_title, "post_id": post_id}

    how_file = results_dir / f"{post_id}_how.json"
    if how_file.exists():
        with open(how_file, "r", encoding="utf-8") as f:
            how_data = json.load(f)
        if "帖子详情" in how_data:
            post_detail = how_data["帖子详情"]
            post_detail["post_id"] = post_id
            print(f" 读取帖子详情: {how_file.name}")

    return post_detail


def _build_post_tree(
    post_id,
    post_title: str,
    post_detail: dict,
    post_points: list,
    post_tags: list,
    belong_edges: list,
) -> dict:
    """Assemble the tree (root -> dimension -> point -> tag) for one post.

    Args:
        post_id: Post identifier.
        post_title: Full post title; the root node name is a truncated copy.
        post_detail: Detail dict stored on both the root node and the tree.
        post_points: Point node records from the match graph.
        post_tags: Tag node records from the match graph.
        belong_edges: Edges whose source is a tag ID and target a point ID.

    Returns:
        A dict with keys "postId", "postTitle", "postDetail", "root" and
        "stats".
    """
    # Index point and tag nodes by ID so the edges can be resolved.
    point_map = {
        n["节点ID"]: {
            "id": n["节点ID"],
            "name": n["节点名称"],
            "nodeType": "点",
            "level": n.get("节点层级", ""),
            "dimColor": _DIM_COLORS.get(n.get("节点层级", ""), _DEFAULT_DIM_COLOR),
            "description": n.get("描述", ""),
            "children": [],
        }
        for n in post_points
    }
    tag_map = {
        n["节点ID"]: {
            "id": n["节点ID"],
            "name": n["节点名称"],
            "nodeType": "标签",
            "level": n.get("节点层级", ""),
            "dimColor": _DIM_COLORS.get(n.get("节点层级", ""), _DEFAULT_DIM_COLOR),
            "weight": n.get("权重", 0),
            "children": [],
        }
        for n in post_tags
    }

    # Hang each tag under its point; edges with an unknown endpoint are skipped.
    for edge in belong_edges:
        tag_node = tag_map.get(edge["源节点ID"])
        point_node = point_map.get(edge["目标节点ID"])
        if tag_node and point_node:
            point_node["children"].append(tag_node)

    # Group point nodes by dimension; dimensions without points are omitted.
    dimension_children = []
    for dim, color in _DIM_COLORS.items():
        dim_points = [
            point_map[n["节点ID"]]
            for n in post_points
            if n.get("节点层级") == dim
        ]
        if dim_points:
            dimension_children.append({
                "id": f"dim_{dim}",
                "name": dim,
                "nodeType": "维度",
                "isDimension": True,
                "dimColor": color,
                "children": dim_points,
            })

    # Root node (the post itself)
    if len(post_title) > _TITLE_MAX_LEN:
        root_name = post_title[:_TITLE_MAX_LEN] + "..."
    else:
        root_name = post_title
    root_node = {
        "id": f"post_{post_id}",
        "name": root_name,
        "nodeType": "帖子",
        "isRoot": True,
        "postDetail": post_detail,
        "children": dimension_children,
    }

    # Count only nodes that are actually in the tree:
    # root + dimension nodes + point nodes + tag nodes.
    total_nodes = 1 + len(dimension_children)
    for dim_node in dimension_children:
        total_nodes += len(dim_node["children"])
        total_nodes += sum(len(p["children"]) for p in dim_node["children"])

    return {
        "postId": post_id,
        "postTitle": post_title,
        "postDetail": post_detail,
        "root": root_node,
        "stats": {
            "totalNodes": total_nodes,
            "pointCount": len(post_points),
            "tagCount": len(post_tags),
        },
    }


def build_post_trees():
    """Build the tree data for all posts and write it to post_trees.json.

    Reads every ``*_match_graph.json`` under ``<intermediate_dir>/match_graph``,
    looks up the matching ``<post_id>_how.json`` in the sibling ``results``
    directory, builds one tree per post and dumps them all to
    ``<intermediate_dir>/match_graph/post_trees.json``.

    Returns:
        The path of the written output file.

    Raises:
        KeyError: If a match graph file lacks "说明" / "帖子ID", or a node or
            edge record lacks one of its required ID / name keys.
    """
    config = PathConfig()
    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    match_graph_dir = config.intermediate_dir / "match_graph"
    results_dir = config.intermediate_dir.parent / "results"
    output_file = match_graph_dir / "post_trees.json"

    # Collect all match graph files (sorted for a deterministic output order)
    graph_files = sorted(match_graph_dir.glob("*_match_graph.json"))
    print(f"找到 {len(graph_files)} 个匹配图谱文件")

    all_post_trees = []

    for i, graph_file in enumerate(graph_files, 1):
        print(f"\n[{i}/{len(graph_files)}] 处理: {graph_file.name}")

        with open(graph_file, "r", encoding="utf-8") as f:
            match_graph_data = json.load(f)

        post_id = match_graph_data["说明"]["帖子ID"]
        post_title = match_graph_data["说明"].get("帖子标题", "")

        # Load the full post detail (falls back to title + ID only)
        post_detail = _load_post_detail(results_dir, post_id, post_title)

        # Fetch the post points, post tags and the "belongs to" edges
        post_points = match_graph_data.get("帖子点节点列表", [])
        post_tags = match_graph_data.get("帖子标签节点列表", [])
        belong_edges = match_graph_data.get("帖子属于边列表", [])
        print(f" 帖子点: {len(post_points)}, 帖子标签: {len(post_tags)}, 属于边: {len(belong_edges)}")

        post_tree = _build_post_tree(
            post_id, post_title, post_detail, post_points, post_tags, belong_edges
        )
        all_post_trees.append(post_tree)
        print(f" 构建完成: {post_tree['stats']['totalNodes']} 个节点")

    # Write the output
    output_data = {
        "说明": {
            "描述": "帖子树结构数据(每个帖子一棵树)",
            "帖子数": len(all_post_trees),
        },
        "postTrees": all_post_trees,
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print()
    print("=" * 60)
    print("构建完成!")
    print(f" 帖子数: {len(all_post_trees)}")
    print(f" 输出文件: {output_file}")

    return output_file


if __name__ == "__main__":
    build_post_trees()