#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
构建帖子树的中间数据

输入：match_graph/*.json, results/*.json
输出：match_graph/post_trees.json（包含所有帖子的树结构）
"""

import json
from pathlib import Path
import sys

# Put the directory three levels above this file at the front of sys.path
# before importing from the `script` package below (presumably that directory
# is the project root that contains `script/` — confirm against the repo layout).
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig


# Dimension names in display order, and the color assigned to each of them.
_DIMENSIONS = ("灵感点", "目的点", "关键点")
_DIM_COLORS = {
    "灵感点": "#f39c12",
    "目的点": "#3498db",
    "关键点": "#9b59b6",
}

# Edge types starting with this prefix link a post tag to a persona node.
_MATCH_PREFIX = "匹配_"

# Markers separating "<dimension>" from "<name>" inside a persona node ID,
# tried in this order, paired with the original node type they indicate.
_PERSONA_ID_MARKERS = (("_标签_", "标签"), ("_分类_", "分类"))


def _edge_type(edge):
    """Return the edge's type string, or "" when it is missing or null."""
    return edge.get("边类型") or ""


def _load_post_detail(results_dir, post_id, post_title):
    """Return the post detail dict, preferring `<post_id>_how.json` if usable."""
    post_detail = {"title": post_title, "post_id": post_id}
    how_file = results_dir / f"{post_id}_how.json"
    if not how_file.exists():
        return post_detail

    with open(how_file, "r", encoding="utf-8") as f:
        how_data = json.load(f)

    # Only a dict-valued "帖子详情" can carry the post_id; anything else
    # (missing key, null, unexpected type) falls back to the minimal detail.
    detail = how_data.get("帖子详情") if isinstance(how_data, dict) else None
    if isinstance(detail, dict):
        post_detail = detail
        post_detail["post_id"] = post_id
    print(f"  读取帖子详情: {how_file.name}")
    return post_detail


def _parse_persona_id(persona_id):
    """Split a persona node ID into (level, name, original_type).

    Only the FIRST marker occurrence is used as the separator, so a name that
    itself contains the marker text is kept intact.
    """
    for marker, original_type in _PERSONA_ID_MARKERS:
        level, sep, name = persona_id.partition(marker)
        if sep:
            return level, name, original_type
    # No marker: the whole ID is the name and the node is treated as a tag.
    return "", persona_id, "标签"


def _make_persona_node(edge):
    """Build the tree node for the persona targeted by one match edge."""
    persona_id = edge["目标节点ID"]
    level, name, original_type = _parse_persona_id(persona_id)
    detail = edge.get("边详情") or {}
    return {
        "id": f"persona_{persona_id}",
        "name": name,
        "nodeType": "人设",
        "originalType": original_type,
        "personaId": persona_id,
        "level": level,
        "dimColor": _DIM_COLORS.get(level, "#2ecc71"),
        "matchType": _edge_type(edge).replace(_MATCH_PREFIX, ""),
        "similarity": detail.get("相似度", 0),
        "children": [],
    }


def _make_expanded_node(node_id, node, edge_type):
    """Build the tree node for an expanded persona node reached via an edge."""
    level = node.get("节点层级", "")
    return {
        "id": f"expanded_{node_id}",
        "name": node.get("节点名称", node_id),
        "nodeType": "人设扩展",
        "originalType": node.get("节点类型", "标签"),
        "personaId": node_id,
        "level": level,
        "dimColor": _DIM_COLORS.get(level, "#2ecc71"),
        "edgeType": edge_type,
        "children": [],
    }


def _collect_expanded(persona_edges, direct_persona_ids, expanded_nodes_map):
    """Map each directly matched persona ID to its distinct expanded neighbours.

    Every edge is examined in both directions; for a given (persona, expanded)
    pair only the first edge encountered is kept.
    """
    persona_to_expanded = {}
    seen_pairs = set()
    for edge in persona_edges:
        src_id = edge["源节点ID"]
        tgt_id = edge["目标节点ID"]
        for persona_id, other_id in ((src_id, tgt_id), (tgt_id, src_id)):
            if persona_id not in direct_persona_ids:
                continue
            if other_id not in expanded_nodes_map:
                continue
            siblings = persona_to_expanded.setdefault(persona_id, [])
            if (persona_id, other_id) in seen_pairs:
                continue
            seen_pairs.add((persona_id, other_id))
            siblings.append(
                _make_expanded_node(
                    other_id, expanded_nodes_map[other_id], _edge_type(edge)
                )
            )
    return persona_to_expanded


def _count_nodes(node):
    """Count a tree node together with all of its descendants."""
    return 1 + sum(_count_nodes(child) for child in node["children"])


def _build_post_tree(match_graph_data, post_id, post_title, post_detail):
    """Build the tree record (root node plus stats) for one match graph."""
    post_points = match_graph_data.get("帖子点节点列表", [])
    post_tags = match_graph_data.get("帖子标签节点列表", [])
    belong_edges = match_graph_data.get("帖子属于边列表", [])

    # Match edges go post tag -> persona; all remaining edges connect persona
    # nodes to each other and are used to discover expanded nodes.
    all_edges = match_graph_data.get("边列表", [])
    match_edges = []
    persona_edges = []
    for edge in all_edges:
        if _edge_type(edge).startswith(_MATCH_PREFIX):
            match_edges.append(edge)
        else:
            persona_edges.append(edge)

    print(f"  帖子点: {len(post_points)}, 帖子标签: {len(post_tags)}, 属于边: {len(belong_edges)}, 匹配边: {len(match_edges)}")

    point_map = {}
    for n in post_points:
        level = n.get("节点层级", "")
        point_map[n["节点ID"]] = {
            "id": n["节点ID"],
            "name": n["节点名称"],
            "nodeType": "点",
            "level": level,
            "dimColor": _DIM_COLORS.get(level, "#888"),
            "description": n.get("描述", ""),
            "children": [],
        }

    tag_map = {}
    for n in post_tags:
        level = n.get("节点层级", "")
        tag_map[n["节点ID"]] = {
            "id": n["节点ID"],
            "name": n["节点名称"],
            "nodeType": "标签",
            "level": level,
            "dimColor": _DIM_COLORS.get(level, "#888"),
            "weight": n.get("权重", 0),
            "children": [],
        }

    expanded_nodes_map = {
        n["节点ID"]: n
        for n in match_graph_data.get("节点列表", [])
        if n.get("是否扩展")
    }

    # Layer 1: persona nodes directly matched by each post tag.
    tag_to_persona_matches = {}
    direct_persona_ids = set()
    for edge in match_edges:
        direct_persona_ids.add(edge["目标节点ID"])
        tag_to_persona_matches.setdefault(edge["源节点ID"], []).append(
            _make_persona_node(edge)
        )

    # Layer 2: expanded nodes adjacent to the directly matched persona nodes.
    persona_to_expanded = _collect_expanded(
        persona_edges, direct_persona_ids, expanded_nodes_map
    )

    # Attach personas (with their expanded children) to known tags only, and
    # count exactly what gets attached so both counters share the same basis.
    persona_count = 0
    expanded_count = 0
    for tag_id, persona_nodes in tag_to_persona_matches.items():
        tag_node = tag_map.get(tag_id)
        if tag_node is None:
            continue
        for persona_node in persona_nodes:
            expanded = persona_to_expanded.get(persona_node["personaId"], [])
            # Give every persona node its own copies so that two tags matching
            # the same persona do not share mutable child objects.
            persona_node["children"] = [
                {**node, "children": []} for node in expanded
            ]
            expanded_count += len(expanded)
        tag_node["children"] = persona_nodes
        persona_count += len(persona_nodes)

    print(f"  人设匹配节点(1层): {persona_count}, 扩展节点(2层): {expanded_count}")

    # Hang each tag under the point it belongs to.
    for edge in belong_edges:
        tag_node = tag_map.get(edge["源节点ID"])
        point_node = point_map.get(edge["目标节点ID"])
        if tag_node is not None and point_node is not None:
            point_node["children"].append(tag_node)

    # Group points by dimension; points whose level is not one of the known
    # dimensions are not placed in the tree. Empty dimensions are omitted.
    dimension_children = []
    for dim in _DIMENSIONS:
        dim_points = [p for p in point_map.values() if p["level"] == dim]
        if dim_points:
            dimension_children.append({
                "id": f"dim_{dim}",
                "name": dim,
                "nodeType": "维度",
                "isDimension": True,
                "dimColor": _DIM_COLORS[dim],
                "children": dim_points,
            })

    short_title = post_title[:20] + "..." if len(post_title) > 20 else post_title
    root_node = {
        "id": f"post_{post_id}",
        "name": short_title,
        "nodeType": "帖子",
        "isRoot": True,
        "postDetail": post_detail,
        "children": dimension_children,
    }

    total_nodes = _count_nodes(root_node)
    print(f"  构建完成: {total_nodes} 个节点（人设1层: {persona_count}, 扩展2层: {expanded_count}）")

    return {
        "postId": post_id,
        "postTitle": post_title,
        "postDetail": post_detail,
        "root": root_node,
        "stats": {
            "totalNodes": total_nodes,
            "pointCount": len(post_points),
            "tagCount": len(post_tags),
            "personaCount": persona_count,
        },
    }


def build_post_trees(config=None):
    """Build the tree data of every post and write it to `post_trees.json`.

    For each `*_match_graph.json` file in `<intermediate_dir>/match_graph`, a
    tree of the shape post -> dimension -> point -> tag -> persona -> expanded
    persona is built; all trees are written to
    `<intermediate_dir>/match_graph/post_trees.json`. Post details are taken
    from `<intermediate_dir>/../results/<post_id>_how.json` when available.

    Args:
        config: Optional object exposing `account_name`, `output_version` and
            `intermediate_dir` (a `Path`). Defaults to a fresh `PathConfig()`.

    Returns:
        The path of the written output file.

    Raises:
        KeyError: If a match graph lacks `说明`/`帖子ID` or a node/edge lacks
            a required ID or name field.
    """
    if config is None:
        config = PathConfig()

    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    match_graph_dir = config.intermediate_dir / "match_graph"
    results_dir = config.intermediate_dir.parent / "results"
    output_file = match_graph_dir / "post_trees.json"

    graph_files = sorted(match_graph_dir.glob("*_match_graph.json"))
    print(f"找到 {len(graph_files)} 个匹配图谱文件")

    all_post_trees = []
    for i, graph_file in enumerate(graph_files, 1):
        print(f"\n[{i}/{len(graph_files)}] 处理: {graph_file.name}")

        with open(graph_file, "r", encoding="utf-8") as f:
            match_graph_data = json.load(f)

        info = match_graph_data["说明"]
        post_id = info["帖子ID"]
        # A null title must not break the length check used for the root name.
        post_title = info.get("帖子标题") or ""

        post_detail = _load_post_detail(results_dir, post_id, post_title)
        all_post_trees.append(
            _build_post_tree(match_graph_data, post_id, post_title, post_detail)
        )

    output_data = {
        "说明": {
            "描述": "帖子树结构数据（每个帖子一棵树）",
            "帖子数": len(all_post_trees),
        },
        "postTrees": all_post_trees,
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    print()
    print("=" * 60)
    print("构建完成!")
    print(f"  帖子数: {len(all_post_trees)}")
    print(f"  输出文件: {output_file}")

    return output_file


if __name__ == "__main__":
    build_post_trees()