|
|
@@ -0,0 +1,185 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+"""
|
|
|
+Build the intermediate data for post trees.
|
|
|
+
|
|
|
+Input: match_graph/*_match_graph.json, results/<post_id>_how.json
|
|
|
+Output: match_graph/post_trees.json (tree structures for all posts)
|
|
|
+"""
|
|
|
+
|
|
|
+import json
|
|
|
+from pathlib import Path
|
|
|
+import sys
|
|
|
+
|
|
|
+# 添加项目根目录到路径
|
|
|
+project_root = Path(__file__).parent.parent.parent
|
|
|
+sys.path.insert(0, str(project_root))
|
|
|
+
|
|
|
+from script.data_processing.path_config import PathConfig
|
|
|
+
|
|
|
+
|
|
|
# Display colour per dimension. Insertion order is also the order in which
# dimension nodes are attached under the post root.
_DIM_COLORS = {
    "灵感点": "#f39c12",
    "目的点": "#3498db",
    "关键点": "#9b59b6",
}
# Colour used for nodes whose level is not one of the known dimensions.
_FALLBACK_DIM_COLOR = "#888"
# Root-node labels longer than this are cut and suffixed with "...".
_ROOT_NAME_MAX_LEN = 20


def _make_leveled_node(raw, node_type):
    """Return the fields shared by point and tag nodes for one raw graph node."""
    level = raw.get("节点层级", "")
    return {
        "id": raw["节点ID"],
        "name": raw["节点名称"],
        "nodeType": node_type,
        "level": level,
        "dimColor": _DIM_COLORS.get(level, _FALLBACK_DIM_COLOR),
    }


def _load_post_detail(results_dir, post_id, post_title):
    """Return the post detail dict, preferring ``<post_id>_how.json`` when usable."""
    how_file = results_dir / f"{post_id}_how.json"
    if how_file.exists():
        with open(how_file, "r", encoding="utf-8") as f:
            how_data = json.load(f)
        detail = how_data.get("帖子详情") if isinstance(how_data, dict) else None
        # A null / non-dict detail cannot carry the post_id, so it is treated
        # like a missing file instead of raising a TypeError.
        if isinstance(detail, dict):
            detail["post_id"] = post_id
            print(f" 读取帖子详情: {how_file.name}")
            return detail
    return {"title": post_title, "post_id": post_id}


def _build_post_tree(post_id, post_title, post_detail,
                     post_points, post_tags, belong_edges):
    """Assemble the tree and stats record for a single post (performs no I/O)."""
    point_map = {}
    for raw in post_points:
        node = _make_leveled_node(raw, "点")
        node["description"] = raw.get("描述", "")
        node["children"] = []
        point_map[raw["节点ID"]] = node

    tag_map = {}
    for raw in post_tags:
        node = _make_leveled_node(raw, "标签")
        node["weight"] = raw.get("权重", 0)
        node["children"] = []
        tag_map[raw["节点ID"]] = node

    # "Belongs-to" edges run tag -> point; edges that reference an unknown
    # node on either side are ignored.
    for edge in belong_edges:
        tag_node = tag_map.get(edge["源节点ID"])
        point_node = point_map.get(edge["目标节点ID"])
        if tag_node is not None and point_node is not None:
            point_node["children"].append(tag_node)

    # Group points by dimension in one pass; points whose level is not a
    # known dimension are left out of the tree.
    points_by_dim = {dim: [] for dim in _DIM_COLORS}
    for raw in post_points:
        level = raw.get("节点层级")
        if level in points_by_dim:
            points_by_dim[level].append(point_map[raw["节点ID"]])

    # Dimensions without any point get no node.
    dimension_children = [
        {
            "id": f"dim_{dim}",
            "name": dim,
            "nodeType": "维度",
            "isDimension": True,
            "dimColor": color,
            "children": points_by_dim[dim],
        }
        for dim, color in _DIM_COLORS.items()
        if points_by_dim[dim]
    ]

    if len(post_title) > _ROOT_NAME_MAX_LEN:
        root_name = post_title[:_ROOT_NAME_MAX_LEN] + "..."
    else:
        root_name = post_title

    root_node = {
        "id": f"post_{post_id}",
        "name": root_name,
        "nodeType": "帖子",
        "isRoot": True,
        "postDetail": post_detail,
        "children": dimension_children,
    }

    # totalNodes counts only nodes that are actually in the tree:
    # root + dimension nodes + grouped points + tags attached to them.
    total_nodes = 1 + len(dimension_children)
    for dim_node in dimension_children:
        total_nodes += len(dim_node["children"])
        total_nodes += sum(len(point["children"]) for point in dim_node["children"])

    return {
        "postId": post_id,
        "postTitle": post_title,
        "postDetail": post_detail,
        "root": root_node,
        "stats": {
            "totalNodes": total_nodes,
            # pointCount / tagCount are the raw list sizes from the match
            # graph, including nodes that did not end up in the tree.
            "pointCount": len(post_points),
            "tagCount": len(post_tags),
        },
    }


def build_post_trees(config=None):
    """Build one tree per post and write them all to ``post_trees.json``.

    Every ``*_match_graph.json`` file in ``<intermediate_dir>/match_graph`` is
    turned into a tree of the form post -> dimension -> point -> tag. When
    ``<intermediate_dir>/../results/<post_id>_how.json`` exists and holds a
    dict under ``帖子详情``, that dict (with ``post_id`` added) is used as the
    post detail; otherwise a minimal ``{"title", "post_id"}`` dict is used.

    Args:
        config: Optional object exposing ``account_name``, ``output_version``
            and ``intermediate_dir`` (a ``pathlib.Path``). Defaults to a new
            ``PathConfig()``.

    Returns:
        The path of the written ``post_trees.json`` file.

    Raises:
        KeyError: If a match graph file lacks ``说明`` / ``帖子ID``, a node
            lacks ``节点ID`` / ``节点名称``, or an edge lacks its endpoint IDs.
    """
    if config is None:
        config = PathConfig()

    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    match_graph_dir = config.intermediate_dir / "match_graph"
    results_dir = config.intermediate_dir.parent / "results"
    output_file = match_graph_dir / "post_trees.json"

    # Sorted so the output order is stable across runs.
    graph_files = sorted(match_graph_dir.glob("*_match_graph.json"))
    total_files = len(graph_files)
    print(f"找到 {total_files} 个匹配图谱文件")

    all_post_trees = []
    for i, graph_file in enumerate(graph_files, 1):
        print(f"\n[{i}/{total_files}] 处理: {graph_file.name}")

        with open(graph_file, "r", encoding="utf-8") as f:
            match_graph_data = json.load(f)

        meta = match_graph_data["说明"]
        post_id = meta["帖子ID"]
        # A null title is treated as empty so the truncation below cannot fail.
        post_title = meta.get("帖子标题") or ""

        post_detail = _load_post_detail(results_dir, post_id, post_title)

        post_points = match_graph_data.get("帖子点节点列表", [])
        post_tags = match_graph_data.get("帖子标签节点列表", [])
        belong_edges = match_graph_data.get("帖子属于边列表", [])
        print(f" 帖子点: {len(post_points)}, 帖子标签: {len(post_tags)}, 属于边: {len(belong_edges)}")

        post_tree = _build_post_tree(
            post_id, post_title, post_detail,
            post_points, post_tags, belong_edges,
        )
        all_post_trees.append(post_tree)

        total_nodes = post_tree["stats"]["totalNodes"]
        print(f" 构建完成: {total_nodes} 个节点")

    output_data = {
        "说明": {
            "描述": "帖子树结构数据(每个帖子一棵树)",
            "帖子数": len(all_post_trees),
        },
        "postTrees": all_post_trees,
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print()
    print("=" * 60)
    print("构建完成!")
    print(f" 帖子数: {len(all_post_trees)}")
    print(f" 输出文件: {output_file}")

    return output_file


if __name__ == "__main__":
    build_post_trees()
|