#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
构建帖子图谱

================================================================================
输入文件:
================================================================================
filtered_results/*_filtered.json - 帖子解构结果（过滤后的how解构）

================================================================================
输出文件: post_graph/{post_id}_帖子图谱.json（每个帖子一个文件）
================================================================================
{
    "meta": {                    # 元信息
        "postId": "帖子ID",
        "postTitle": "帖子标题",
        "postDetail": {...},
        "createdAt": "时间戳",
        "stats": { ... }
    },
    "nodes": {                   # 节点字典 (nodeId -> nodeData)
        "{domain}:{dimension}:{type}:{name}": {
            "name": "显示名称",
            "type": "帖子|灵感点|目的点|关键点|标签",
            "domain": "帖子",
            "dimension": "帖子|灵感点|目的点|关键点",
            "detail": { ... }
        }
    },
    "edges": {                   # 边字典 (edgeId -> edgeData)
        "{source}|{type}|{target}": {
            "source": "源节点ID",
            "target": "目标节点ID",
            "type": "属于|包含|匹配|支撑|关联",
            "score": 1.0,
            "detail": { ... }
        }
    },
    "index": {                   # 游走索引
        "outEdges": { nodeId: { edgeType: [{ target, score }] } },
        "inEdges": { nodeId: { edgeType: [{ source, score }] } }
    },
    "tree": { ... }              # 嵌套树结构
}

================================================================================
核心逻辑:
================================================================================
1. 从 filtered_results 读取帖子解构结果
2. 提取标签节点（新结构中"点"直接作为标签节点）以及匹配/支撑/关联边
3. 添加根节点（帖子）和维度节点（灵感点/目的点/关键点）
4. 构建属于/包含边
5. 构建索引和嵌套树

================================================================================
层级对应（人设 vs 帖子）:
================================================================================
| 人设   | 帖子   |
|--------|--------|
| 人设   | 帖子   |
| 维度   | 维度   |
| 分类   | 点     |
| 标签   | 标签   |

================================================================================
节点ID格式: {domain}:{dimension}:{type}:{name}
================================================================================
- 根节点:   帖子:帖子:帖子:{post_id}
- 维度节点: 帖子:灵感点:灵感点:灵感点
- 点节点:   （新结构不再单独创建；解构结果中的"点"直接生成为标签节点）
- 标签节点: 帖子:灵感点:标签:{tag_name}

================================================================================
边类型:
================================================================================
- 属于: 子节点 -> 父节点（层级关系）
- 包含: 父节点 -> 子节点（层级关系）
- 匹配: 帖子标签 <-> 人设标签（双向，score为相似度）
- 支撑: 帖子标签 -> 被支撑的帖子标签（单向，score固定为1.0）
- 关联: 帖子标签 -> 关联的帖子标签（按声明方向只生成一条边，score固定为1.0）

================================================================================
匹配边说明:
================================================================================
帖子图谱包含与人设图谱的匹配边，通过节点ID关联：
- 帖子标签ID: 帖子:灵感点:标签:{tag_name}
- 人设标签ID: 人设:灵感点:标签:{persona_tag_name}

使用方式：从帖子标签出发，沿"匹配"边游走到人设标签ID，
再从人设图谱.json中查找该ID的详细信息。

================================================================================
"""

import json
from pathlib import Path
from typing import Dict, List, Set
from datetime import datetime
import sys

# Put the project root (two directories above this file's own directory) at
# the front of sys.path so that the `script.*` package import below resolves
# when this file is run directly as a script.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig


# ==================== 节点和边构建工具 ====================

def build_node_id(domain: str, dimension: str, node_type: str, name: str) -> str:
    """
    Compose a node ID of the form ``{domain}:{dimension}:{type}:{name}``.

    Args:
        domain: First ID segment (e.g. "帖子" or "人设").
        dimension: Second ID segment.
        node_type: Third ID segment.
        name: Fourth ID segment.

    Returns:
        The four segments joined with colons, in the order given.
    """
    segments = (domain, dimension, node_type, name)
    return "{}:{}:{}:{}".format(*segments)


def build_edge_id(source: str, edge_type: str, target: str) -> str:
    """
    Compose an edge ID of the form ``{source}|{type}|{target}``.

    Args:
        source: ID of the node the edge starts from.
        edge_type: Edge type label.
        target: ID of the node the edge points to.

    Returns:
        The three parts joined with ``|``, in that order.
    """
    return "{}|{}|{}".format(source, edge_type, target)


def create_node(
    domain: str,
    dimension: str,
    node_type: str,
    name: str,
    detail: Dict = None
) -> Dict:
    """
    Build the dict that represents one graph node.

    Args:
        domain: Stored under "domain".
        dimension: Stored under "dimension".
        node_type: Stored under "type".
        name: Stored under "name".
        detail: Extra payload stored under "detail"; any falsy value
            (``None`` or an empty dict) is replaced by a new empty dict.

    Returns:
        A dict with the keys name, type, dimension, domain, detail
        (inserted in that order).
    """
    payload = detail if detail else {}
    node = {"name": name, "type": node_type}
    node["dimension"] = dimension
    node["domain"] = domain
    node["detail"] = payload
    return node


def create_edge(
    source: str,
    target: str,
    edge_type: str,
    score: float = None,
    detail: Dict = None
) -> Dict:
    """
    Build the dict that represents one directed graph edge.

    Args:
        source: Stored under "source".
        target: Stored under "target".
        edge_type: Stored under "type".
        score: Stored under "score" unchanged (``None`` when omitted).
        detail: Extra payload stored under "detail"; any falsy value
            (``None`` or an empty dict) is replaced by a new empty dict.

    Returns:
        A dict with the keys source, target, type, score, detail
        (inserted in that order).
    """
    payload = detail if detail else {}
    edge = {"source": source, "target": target}
    edge["type"] = edge_type
    edge["score"] = score
    edge["detail"] = payload
    return edge


# ==================== 从帖子解构结果提取节点和匹配边 ====================

def extract_tags_and_matches(filtered_data: Dict) -> tuple:
    """
    Extract tag nodes and match / support / relation edges from one post's
    deconstruction result.

    Every named entry of "灵感点列表" / "目的点列表" / "关键点列表" under
    "解构结果" becomes a "标签" node in the "帖子" domain, attached to the
    corresponding dimension. A missing key or an explicit JSON ``null`` for
    the result object or for any list ("匹配人设结果", "支撑的ID",
    "关联的ID" included) is treated as empty instead of raising.

    Edges produced:
        * 匹配: one edge in each direction between a post tag and every entry
          of its "匹配人设结果" that has both a name and a level; the score
          is the entry's "相似度" (0 when the key is absent).
        * 支撑: post tag -> each tag referenced by "支撑的ID", score 1.0.
        * 关联: post tag -> each tag referenced by "关联的ID", score 1.0.
          Only the declared direction is stored, so a relation declared by
          both endpoints yields two edges.
      Support/relation targets whose ID matches no named point are skipped.

    Args:
        filtered_data: Parsed content of one filtered-result JSON file.

    Returns:
        ``(tag_nodes, match_edges, support_edges, relation_edges)``; the
        first maps node ID -> node data, the others edge ID -> edge data.
    """
    tag_nodes = {}        # nodeId -> nodeData
    match_edges = {}      # edgeId -> edgeData
    support_edges = {}    # edgeId -> edgeData
    relation_edges = {}   # edgeId -> edgeData

    # Point "ID" -> node ID, needed to resolve support/relation references.
    id_to_node_id = {}

    # `or {}` (rather than a .get default) also covers an explicit null.
    result = filtered_data.get("解构结果") or {}

    dimension_mapping = {
        "灵感点列表": "灵感点",
        "目的点列表": "目的点",
        "关键点列表": "关键点"
    }

    # Flatten the three lists once; points without a name are ignored by
    # both passes below.
    named_points = []
    for list_key, dimension in dimension_mapping.items():
        for point in result.get(list_key) or []:
            if point.get("名称", ""):
                named_points.append((dimension, point))

    # Pass 1: create the nodes, the ID mapping and the match edges.
    for dimension, point in named_points:
        tag_name = point.get("名称", "")
        point_id = point.get("ID", "")

        tag_id = build_node_id("帖子", dimension, "标签", tag_name)
        tag_nodes[tag_id] = create_node(
            domain="帖子",
            dimension=dimension,
            node_type="标签",
            name=tag_name,
            detail={
                "description": point.get("描述", ""),
                "pointId": point_id
            }
        )

        if point_id:
            id_to_node_id[point_id] = tag_id

        for match in point.get("匹配人设结果") or []:
            persona_name = match.get("人设特征名称", "")
            persona_dimension = match.get("人设特征层级", "")
            if not persona_name or not persona_dimension:
                continue

            # The source data may label the persona feature "点"; it is
            # mapped to "标签" when building the persona node ID.
            persona_type = match.get("特征类型", "标签")
            if persona_type == "点":
                persona_type = "标签"
            similarity = match.get("相似度", 0)

            persona_id = build_node_id("人设", persona_dimension, persona_type, persona_name)

            # Match edges are stored in both directions: post -> persona
            # first, then persona -> post.
            for source, target in ((tag_id, persona_id), (persona_id, tag_id)):
                match_edges[build_edge_id(source, "匹配", target)] = create_edge(
                    source=source,
                    target=target,
                    edge_type="匹配",
                    score=similarity,
                    detail={}
                )

    # Pass 2: support and relation edges. This needs the complete ID mapping,
    # because a point may reference a point from a later list.
    for _dimension, point in named_points:
        point_id = point.get("ID", "")
        if not point_id:
            continue

        tag_id = id_to_node_id.get(point_id)
        if not tag_id:
            continue

        for target_point_id in point.get("支撑的ID") or []:
            target_node_id = id_to_node_id.get(target_point_id)
            if target_node_id:
                edge_id = build_edge_id(tag_id, "支撑", target_node_id)
                support_edges[edge_id] = create_edge(
                    source=tag_id,
                    target=target_node_id,
                    edge_type="支撑",
                    score=1.0,
                    detail={}
                )

        for target_point_id in point.get("关联的ID") or []:
            target_node_id = id_to_node_id.get(target_point_id)
            if target_node_id:
                edge_id = build_edge_id(tag_id, "关联", target_node_id)
                # Keep the first occurrence of an edge listed more than once.
                if edge_id not in relation_edges:
                    relation_edges[edge_id] = create_edge(
                        source=tag_id,
                        target=target_node_id,
                        edge_type="关联",
                        score=1.0,
                        detail={}
                    )

    return tag_nodes, match_edges, support_edges, relation_edges


# ==================== 构建边 ====================

def build_belong_contain_edges(
    tag_nodes: Dict[str, Dict],
    dimension_node_ids: Dict[str, str]
) -> Dict[str, Dict]:
    """
    Link every tag node to its dimension node with a 属于/包含 edge pair.

    Args:
        tag_nodes: Node ID -> node data; each node's "dimension" value is
            looked up in ``dimension_node_ids``.
        dimension_node_ids: Dimension name -> dimension node ID.

    Returns:
        Edge ID -> edge data. For each tag there is a "属于" edge
        (tag -> dimension) followed by a "包含" edge (dimension -> tag),
        both with score 1.0.

    Raises:
        KeyError: If a tag's dimension has no entry in ``dimension_node_ids``.
    """
    hierarchy_edges = {}

    for child_id, child in tag_nodes.items():
        parent_id = dimension_node_ids[child["dimension"]]

        # Upward link first, then the mirrored downward link.
        links = (
            (child_id, "属于", parent_id),
            (parent_id, "包含", child_id),
        )
        for src, kind, dst in links:
            hierarchy_edges[build_edge_id(src, kind, dst)] = create_edge(
                source=src,
                target=dst,
                edge_type=kind,
                score=1.0
            )

    return hierarchy_edges


# ==================== 构建索引 ====================

def build_index(edges: Dict[str, Dict]) -> Dict:
    """
    Build the adjacency index used for walking the graph.

    Args:
        edges: Edge ID -> edge data; each edge must carry "source",
            "target", "type" and "score".

    Returns:
        {
            "outEdges": { nodeId: { edgeType: [{ target, score }] } },
            "inEdges": { nodeId: { edgeType: [{ source, score }] } }
        }
        Entries appear in the iteration order of ``edges``.
    """
    outgoing = {}
    incoming = {}

    for edge in edges.values():
        src, dst = edge["source"], edge["target"]
        kind, weight = edge["type"], edge["score"]

        # Forward direction: source -> type -> [targets]
        outgoing.setdefault(src, {}).setdefault(kind, []).append(
            {"target": dst, "score": weight}
        )
        # Reverse direction: target -> type -> [sources]
        incoming.setdefault(dst, {}).setdefault(kind, []).append(
            {"source": src, "score": weight}
        )

    return {"outEdges": outgoing, "inEdges": incoming}


# ==================== 构建嵌套树 ====================

def build_nested_tree(nodes: Dict[str, Dict], edges: Dict[str, Dict], root_id: str) -> Dict:
    """
    Build a nested tree by following "包含" edges downward from the root.

    Args:
        nodes: Node ID -> node data (name, type, domain, dimension and an
            optional detail).
        edges: Edge ID -> edge data; only edges whose type is "包含" are used.
        root_id: ID of the node to start from; must be present in ``nodes``.

    Returns:
        A dict with id, name, type, domain, dimension, detail and children,
        where children holds the same structure for each contained node.
        Children that are not present in ``nodes`` are left out.
    """
    # parent ID -> ordered list of child IDs, taken from the "包含" edges
    children_of = {}
    for edge in edges.values():
        if edge["type"] != "包含":
            continue
        children_of.setdefault(edge["source"], []).append(edge["target"])

    def expand(node_id: str) -> Dict:
        """Return the subtree rooted at ``node_id``."""
        info = nodes[node_id]
        return {
            "id": node_id,
            "name": info["name"],
            "type": info["type"],
            "domain": info["domain"],
            "dimension": info["dimension"],
            "detail": info.get("detail", {}),
            "children": [
                expand(kid)
                for kid in children_of.get(node_id, ())
                if kid in nodes
            ],
        }

    return expand(root_id)


# ==================== 图游走工具 ====================

def walk_graph(
    index: Dict,
    start_node: str,
    edge_types: List[str],
    direction: str = "out",
    min_score: float = None
) -> Set[str]:
    """
    Walk from a start node, following one edge type per step.

    Args:
        index: Walk index {"outEdges": {...}, "inEdges": {...}}.
        start_node: ID of the node to start from.
        edge_types: Edge type to follow at each step, e.g. ["属于", "包含"];
            the walk takes ``len(edge_types)`` steps.
        direction: "out" follows outgoing edges; any other value follows
            incoming edges.
        min_score: When given, edges scoring below it are not followed.
            An edge whose score is missing or ``None`` counts as 0.

    Returns:
        The set of node IDs reached after the last step. This is empty as
        soon as any step reaches nothing, and ``{start_node}`` when
        ``edge_types`` is empty.
    """
    edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
    target_key = "target" if direction == "out" else "source"

    current_nodes = {start_node}

    for edge_type in edge_types:
        next_nodes = set()
        for node in current_nodes:
            for neighbor in edge_index.get(node, {}).get(edge_type, []):
                # `or 0` also covers a stored None score, which would
                # otherwise raise TypeError when compared with a number.
                if min_score is not None and (neighbor.get("score") or 0) < min_score:
                    continue
                next_nodes.add(neighbor[target_key])
        current_nodes = next_nodes

        # Nothing reachable: later steps cannot add anything.
        if not current_nodes:
            break

    return current_nodes


def get_neighbors(
    index: Dict,
    node_id: str,
    edge_type: str = None,
    direction: str = "out",
    min_score: float = None
) -> List[Dict]:
    """
    Return the index entries adjacent to a node.

    Args:
        index: Walk index {"outEdges": {...}, "inEdges": {...}}.
        node_id: ID of the node to look up.
        edge_type: Restrict to this edge type; when omitted (or empty) the
            entries of all edge types are concatenated.
        direction: "out" reads outgoing edges; any other value reads
            incoming edges.
        min_score: When given, only entries scoring at least this much are
            kept. An entry whose score is missing or ``None`` counts as 0.

    Returns:
        A new list of entries: [{"target": "...", "score": 0.5}, ...] for
        "out", [{"source": "...", "score": 0.5}, ...] otherwise. The list is
        always a copy, so mutating it does not alter the index; the entry
        dicts themselves are shared with the index.
    """
    edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
    node_edges = edge_index.get(node_id, {})

    if edge_type:
        # Copy, so the caller never receives the index's own list.
        neighbors = list(node_edges.get(edge_type, []))
    else:
        neighbors = [entry for group in node_edges.values() for entry in group]

    if min_score is not None:
        # `or 0` also covers a stored None score, which would otherwise
        # raise TypeError when compared with a number.
        neighbors = [n for n in neighbors if (n.get("score") or 0) >= min_score]

    return neighbors


# ==================== 处理单个帖子 ====================

def process_single_post(filtered_file: Path, output_dir: Path) -> Dict:
    """
    Build the graph for a single post and write it to disk.

    Reads one filtered-result JSON file, assembles nodes (post root, the three
    dimension nodes, tag nodes) and edges (属于/包含 hierarchy, 匹配, 支撑,
    关联), derives the walk index and the nested tree, then writes everything
    to ``{output_dir}/{post_id}_帖子图谱.json``.

    Args:
        filtered_file: Path of the ``*_filtered.json`` input file.
        output_dir: Directory for the output file; created if missing.

    Returns:
        Summary statistics: postId, postTitle, nodeCount, edgeCount,
        tagCount, matchCount, supportCount, relationCount, outputFile.
    """
    with open(filtered_file, "r", encoding="utf-8") as f:
        filtered_data = json.load(f)

    post_id = filtered_data.get("帖子id", "")
    # `or {}` also covers an explicit JSON null, which a .get default would
    # pass through and crash on the `.get("title")` below.
    post_detail = filtered_data.get("帖子详情") or {}
    post_title = post_detail.get("title", "")

    all_nodes = {}
    all_edges = {}

    # 1. Tag nodes plus match / support / relation edges.
    tag_nodes, match_edges, support_edges, relation_edges = extract_tags_and_matches(filtered_data)

    # 2. Root node (the post itself).
    root_id = build_node_id("帖子", "帖子", "帖子", post_id)
    all_nodes[root_id] = create_node(
        domain="帖子",
        dimension="帖子",
        node_type="帖子",
        name=post_id,
        detail={
            "postTitle": post_title,
            "postDetail": post_detail
        }
    )

    # 3. Dimension nodes, each linked to the root in both directions.
    dimensions = ["灵感点", "目的点", "关键点"]
    dimension_node_ids = {}

    for dim in dimensions:
        dim_id = build_node_id("帖子", dim, dim, dim)
        dimension_node_ids[dim] = dim_id

        all_nodes[dim_id] = create_node(
            domain="帖子",
            dimension=dim,
            node_type=dim,
            name=dim,
            detail={}
        )

        # dimension -> root
        edge_id = build_edge_id(dim_id, "属于", root_id)
        all_edges[edge_id] = create_edge(
            source=dim_id,
            target=root_id,
            edge_type="属于",
            score=1.0
        )

        # root -> dimension
        edge_id_contain = build_edge_id(root_id, "包含", dim_id)
        all_edges[edge_id_contain] = create_edge(
            source=root_id,
            target=dim_id,
            edge_type="包含",
            score=1.0
        )

    # 4. Tag nodes.
    all_nodes.update(tag_nodes)

    # 5. Hierarchy edges between tags and their dimension.
    all_edges.update(build_belong_contain_edges(tag_nodes, dimension_node_ids))

    # 6. Match edges, then 7. support and relation edges.
    all_edges.update(match_edges)
    all_edges.update(support_edges)
    all_edges.update(relation_edges)

    # 8. Walk index and 9. nested tree.
    index = build_index(all_edges)
    tree = build_nested_tree(all_nodes, all_edges, root_id)

    # Statistics. Match edges are stored in both directions, so halving the
    # edge count gives the number of matches.
    tag_count = len(tag_nodes)
    match_count = len(match_edges) // 2
    support_count = len(support_edges)
    relation_count = len(relation_edges)

    dimension_stats = {
        dim: {
            "tagCount": sum(1 for n in tag_nodes.values() if n["dimension"] == dim)
        }
        for dim in dimensions
    }

    output_data = {
        "meta": {
            "postId": post_id,
            "postTitle": post_title,
            "postDetail": post_detail,
            "createdAt": datetime.now().isoformat(),
            "stats": {
                "nodeCount": len(all_nodes),
                "edgeCount": len(all_edges),
                "tagCount": tag_count,
                "matchCount": match_count,
                "supportCount": support_count,
                "relationCount": relation_count,
                "dimensions": dimension_stats
            }
        },
        "nodes": all_nodes,
        "edges": all_edges,
        "index": index,
        "tree": tree
    }

    # Create the target directory so the function also works when called
    # directly, without a caller having prepared it.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{post_id}_帖子图谱.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    return {
        "postId": post_id,
        "postTitle": post_title,
        "nodeCount": len(all_nodes),
        "edgeCount": len(all_edges),
        "tagCount": tag_count,
        "matchCount": match_count,
        "supportCount": support_count,
        "relationCount": relation_count,
        "outputFile": str(output_file)
    }


# ==================== 主函数 ====================

def main():
    """
    Build a post graph for every ``*_filtered.json`` file and print a summary.

    Input is read from ``<intermediate_dir>/filtered_results`` and output is
    written to ``<intermediate_dir>/post_graph``, where ``intermediate_dir``
    comes from ``PathConfig``. Files are processed in sorted path order so
    that runs and their logs are reproducible.
    """
    config = PathConfig()
    config.ensure_dirs()

    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    # Input directory
    filtered_results_dir = config.intermediate_dir / "filtered_results"

    # Output directory
    output_dir = config.intermediate_dir / "post_graph"
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"输入目录: {filtered_results_dir}")
    print(f"输出目录: {output_dir}")
    print()

    # Path.glob yields entries in no guaranteed order; sort for determinism.
    filtered_files = sorted(filtered_results_dir.glob("*_filtered.json"))
    print(f"找到 {len(filtered_files)} 个帖子文件")
    print()

    # Process each post in turn.
    results = []
    for i, filtered_file in enumerate(filtered_files, 1):
        print(f"[{i}/{len(filtered_files)}] 处理: {filtered_file.name}")
        result = process_single_post(filtered_file, output_dir)
        results.append(result)
        print(f"  节点: {result['nodeCount']}, 边: {result['edgeCount']}")
        print(f"  标签: {result['tagCount']}, 匹配: {result['matchCount']}, 支撑: {result['supportCount']}, 关联: {result['relationCount']}")
        print(f"  → {Path(result['outputFile']).name}")
        print()

    # Totals across all processed posts.
    print("=" * 60)
    print("处理完成!")
    print(f"  帖子数: {len(results)}")
    print(f"  总节点数: {sum(r['nodeCount'] for r in results)}")
    print(f"  总边数: {sum(r['edgeCount'] for r in results)}")
    print(f"  总标签数: {sum(r['tagCount'] for r in results)}")
    print(f"  总匹配数: {sum(r['matchCount'] for r in results)}")
    print(f"  总支撑边: {sum(r['supportCount'] for r in results)}")
    print(f"  总关联边: {sum(r['relationCount'] for r in results)}")
    print(f"\n输出目录: {output_dir}")


# Run the batch build only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()