#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build one graph per post.

Input
-----
filtered_results/*_filtered.json
    Post deconstruction results (the filtered "how" deconstruction).

Output
------
post_graph/{post_id}_帖子图谱.json (one file per post)::

    {
      "meta": {                       # metadata
        "postId": "...",
        "postTitle": "...",
        "postDetail": {...},
        "createdAt": "ISO timestamp",
        "stats": { ... }
      },
      "nodes": {                      # nodeId -> nodeData
        "{domain}:{dimension}:{type}:{name}": {
          "name": "display name",
          "type": "帖子|灵感点|目的点|关键点|点|标签",
          "domain": "帖子",
          "dimension": "帖子|灵感点|目的点|关键点",
          "detail": { ... }
        }
      },
      "edges": {                      # edgeId -> edgeData
        "{source}|{type}|{target}": {
          "source": "source node id",
          "target": "target node id",
          "type": "属于|包含|匹配",
          "score": 1.0,
          "detail": { ... }
        }
      },
      "index": {                      # walk index
        "outEdges": { nodeId: { edgeType: [{ target, score }] } },
        "inEdges":  { nodeId: { edgeType: [{ source, score }] } }
      },
      "tree": { ... }                 # nested tree
    }

Pipeline
--------
1. Read the post deconstruction result from filtered_results.
2. Extract point nodes and tag nodes.
3. Add the root node (post) and the dimension nodes (灵感点/目的点/关键点).
4. Build the 属于 (belongs-to) / 包含 (contains) edges.
5. Build the walk index and the nested tree.

Level correspondence (persona graph vs. post graph)
---------------------------------------------------
| persona (人设) | post (帖子) |
|----------------|-------------|
| persona        | post        |
| dimension      | dimension   |
| category       | point (点)  |
| tag            | tag (标签)  |

Node id format: {domain}:{dimension}:{type}:{name}
--------------------------------------------------
- root node:      帖子:帖子:帖子:{post_id}
- dimension node: 帖子:灵感点:灵感点:灵感点
- point node:     帖子:灵感点:点:{point_name}
- tag node:       帖子:灵感点:标签:{tag_name}

Edge types
----------
- 属于: child -> parent (hierarchy)
- 包含: parent -> child (hierarchy)
- 匹配: post tag <-> persona tag (both directions, score is the similarity)

Match edges
-----------
The post graph contains match edges that point into the persona graph; the
two graphs are linked purely by node id:
- post tag id:    帖子:灵感点:标签:{tag_name}
- persona tag id: 人设:灵感点:标签:{persona_tag_name}

Usage: start from a post tag, follow a "匹配" edge to a persona tag id, then
look that id up in the persona graph JSON for its details.
"""

import json
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple
from datetime import datetime
import sys

# Make the project root importable so main() can load the project's PathConfig.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))


# Key of each point list in the "how" result -> dimension name used in the graph.
_DIMENSION_LIST_KEYS = {
    "灵感点列表": "灵感点",
    "目的点列表": "目的点",
    "关键点列表": "关键点",
}


# ==================== Node and edge construction helpers ====================

def build_node_id(domain: str, dimension: str, node_type: str, name: str) -> str:
    """Return the node id ``{domain}:{dimension}:{node_type}:{name}``."""
    return f"{domain}:{dimension}:{node_type}:{name}"


def build_edge_id(source: str, edge_type: str, target: str) -> str:
    """Return the edge id ``{source}|{edge_type}|{target}``."""
    return f"{source}|{edge_type}|{target}"


def create_node(
    domain: str,
    dimension: str,
    node_type: str,
    name: str,
    detail: Optional[Dict] = None
) -> Dict:
    """Return a node record; a missing/empty ``detail`` becomes ``{}``."""
    return {
        "name": name,
        "type": node_type,
        "dimension": dimension,
        "domain": domain,
        "detail": detail or {}
    }


def create_edge(
    source: str,
    target: str,
    edge_type: str,
    score: Optional[float] = None,
    detail: Optional[Dict] = None
) -> Dict:
    """Return an edge record; a missing/empty ``detail`` becomes ``{}``.

    ``score`` is stored as given, so it is ``None`` when the caller omits it.
    """
    return {
        "source": source,
        "target": target,
        "type": edge_type,
        "score": score,
        "detail": detail or {}
    }


def _add_hierarchy_pair(edges: Dict[str, Dict], child_id: str, parent_id: str) -> None:
    """Insert the 属于 (child -> parent) and 包含 (parent -> child) edges, score 1.0."""
    edges[build_edge_id(child_id, "属于", parent_id)] = create_edge(
        source=child_id,
        target=parent_id,
        edge_type="属于",
        score=1.0
    )
    edges[build_edge_id(parent_id, "包含", child_id)] = create_edge(
        source=parent_id,
        target=child_id,
        edge_type="包含",
        score=1.0
    )


# ==================== Extract nodes and match edges ====================

def extract_points_tags_and_matches(filtered_data: Dict) -> Tuple[
    Dict[str, Dict], Dict[str, Dict], Dict[str, List[str]], Dict[str, Dict]
]:
    """
    Extract point nodes, tag nodes and match edges from one deconstruction result.

    Points without a name, features without a name, and matches lacking the
    persona name or persona level are skipped. Fields that are absent or
    explicitly ``null`` in the JSON are treated as empty.

    Args:
        filtered_data: parsed content of a ``*_filtered.json`` file.

    Returns:
        (point_nodes, tag_nodes, tag_to_point, match_edges) where
        ``point_nodes`` / ``tag_nodes`` map nodeId -> nodeData,
        ``tag_to_point`` maps tagId -> [pointId, ...] and
        ``match_edges`` maps edgeId -> edgeData (two edges per match, one
        in each direction).
    """
    point_nodes: Dict[str, Dict] = {}
    tag_nodes: Dict[str, Dict] = {}
    tag_to_point: Dict[str, List[str]] = {}
    match_edges: Dict[str, Dict] = {}

    how_result = filtered_data.get("how解构结果") or {}

    for list_key, dimension in _DIMENSION_LIST_KEYS.items():
        for point in how_result.get(list_key) or []:
            point_name = point.get("名称", "")
            if not point_name:
                continue

            point_id = build_node_id("帖子", dimension, "点", point_name)
            point_nodes[point_id] = create_node(
                domain="帖子",
                dimension=dimension,
                node_type="点",
                name=point_name,
                detail={
                    "description": point.get("描述", "")
                }
            )

            # Tags and matches live in the features of each "how" step.
            for step in point.get("how步骤列表") or []:
                step_name = step.get("步骤名称", "")

                for feature in step.get("特征列表") or []:
                    tag_name = feature.get("特征名称", "")
                    if not tag_name:
                        continue

                    tag_id = build_node_id("帖子", dimension, "标签", tag_name)
                    existing_tag = tag_nodes.get(tag_id)
                    if existing_tag is None:
                        # Weight and step name come from the first occurrence of the tag.
                        tag_nodes[tag_id] = create_node(
                            domain="帖子",
                            dimension=dimension,
                            node_type="标签",
                            name=tag_name,
                            detail={
                                "weight": feature.get("权重", 1.0),
                                "stepName": step_name,
                                "pointNames": [point_name]
                            }
                        )
                    elif point_name not in existing_tag["detail"]["pointNames"]:
                        # The same tag may belong to several points.
                        existing_tag["detail"]["pointNames"].append(point_name)

                    owner_points = tag_to_point.setdefault(tag_id, [])
                    if point_id not in owner_points:
                        owner_points.append(point_id)

                    for match in feature.get("匹配结果") or []:
                        persona_name = match.get("人设特征名称", "")
                        persona_dimension = match.get("人设特征层级", "")
                        if not persona_name or not persona_dimension:
                            continue

                        persona_type = match.get("特征类型", "标签")
                        similarity = (match.get("匹配结果") or {}).get("相似度", 0)
                        persona_id = build_node_id(
                            "人设", persona_dimension, persona_type, persona_name
                        )

                        # Match edges are stored in both directions:
                        # post tag -> persona tag, then persona tag -> post tag.
                        for source, target in ((tag_id, persona_id), (persona_id, tag_id)):
                            match_edges[build_edge_id(source, "匹配", target)] = create_edge(
                                source=source,
                                target=target,
                                edge_type="匹配",
                                score=similarity,
                                detail={}
                            )

    return point_nodes, tag_nodes, tag_to_point, match_edges


# ==================== Build hierarchy edges ====================

def build_belong_contain_edges(
    point_nodes: Dict[str, Dict],
    tag_nodes: Dict[str, Dict],
    tag_to_point: Dict[str, List[str]],
    dimension_node_ids: Dict[str, str]
) -> Dict[str, Dict]:
    """
    Build the 属于 / 包含 edges between points, dimensions and tags.

    Args:
        point_nodes: nodeId -> nodeData of the point nodes.
        tag_nodes: nodeId -> nodeData of the tag nodes (not read here; the
            tag/point relation comes from ``tag_to_point``).
        tag_to_point: tagId -> list of pointIds the tag belongs to.
        dimension_node_ids: dimension name -> dimension nodeId.

    Returns:
        Edge dict { edgeId: edgeData }.

    Raises:
        KeyError: if a point's dimension is missing from ``dimension_node_ids``.
    """
    edges: Dict[str, Dict] = {}

    # 1. point <-> dimension
    for point_id, point_data in point_nodes.items():
        _add_hierarchy_pair(edges, point_id, dimension_node_ids[point_data["dimension"]])

    # 2. tag <-> point
    for tag_id, point_ids in tag_to_point.items():
        for point_id in point_ids:
            _add_hierarchy_pair(edges, tag_id, point_id)

    return edges


# ==================== Build walk index ====================

def build_index(edges: Dict[str, Dict]) -> Dict:
    """
    Build the adjacency index used for graph walks.

    Returns:
        {
            "outEdges": { nodeId: { edgeType: [{ target, score }] } },
            "inEdges":  { nodeId: { edgeType: [{ source, score }] } }
        }
    """
    out_edges: Dict[str, Dict[str, List[Dict]]] = {}
    in_edges: Dict[str, Dict[str, List[Dict]]] = {}

    for edge_data in edges.values():
        source = edge_data["source"]
        target = edge_data["target"]
        edge_type = edge_data["type"]
        score = edge_data["score"]

        out_edges.setdefault(source, {}).setdefault(edge_type, []).append({
            "target": target,
            "score": score
        })
        in_edges.setdefault(target, {}).setdefault(edge_type, []).append({
            "source": source,
            "score": score
        })

    return {
        "outEdges": out_edges,
        "inEdges": in_edges
    }


# ==================== Build nested tree ====================

def build_nested_tree(nodes: Dict[str, Dict], edges: Dict[str, Dict], root_id: str) -> Dict:
    """
    Build a nested tree by following "包含" edges recursively from the root.

    Children whose id is not in ``nodes`` are left out. A node reachable from
    several parents appears once under each of them.

    Returns:
        Nested dict with keys id, name, type, domain, dimension, detail, children.

    Raises:
        KeyError: if ``root_id`` is not in ``nodes``.
    """
    # parent id -> [child id, ...], taken from the "包含" edges only.
    parent_to_children: Dict[str, List[str]] = {}
    for edge_data in edges.values():
        if edge_data["type"] == "包含":
            parent_to_children.setdefault(edge_data["source"], []).append(edge_data["target"])

    def build_subtree(node_id: str) -> Dict:
        node_data = nodes[node_id]
        return {
            "id": node_id,
            "name": node_data["name"],
            "type": node_data["type"],
            "domain": node_data["domain"],
            "dimension": node_data["dimension"],
            "detail": node_data.get("detail", {}),
            "children": [
                build_subtree(child_id)
                for child_id in parent_to_children.get(node_id, [])
                if child_id in nodes
            ]
        }

    return build_subtree(root_id)


# ==================== Graph walk helpers ====================

def _meets_min_score(entry: Dict, min_score: Optional[float]) -> bool:
    """Return True when an index entry passes the optional minimum-score filter."""
    if min_score is None:
        return True
    score = entry.get("score")
    # A null score is treated like a missing one (0) so the comparison cannot raise.
    return (0 if score is None else score) >= min_score


def walk_graph(
    index: Dict,
    start_node: str,
    edge_types: List[str],
    direction: str = "out",
    min_score: Optional[float] = None
) -> Set[str]:
    """
    Walk from a start node, taking one step per entry of ``edge_types``.

    Args:
        index: walk index {"outEdges": {...}, "inEdges": {...}}
        start_node: id of the start node
        edge_types: edge type for each step, e.g. ["属于", "包含"]
        direction: "out" follows outgoing edges; any other value follows
            incoming edges
        min_score: when given, edges scoring below it (missing or null
            scores count as 0) are not followed

    Returns:
        Set of node ids reached after the last step; empty as soon as a step
        reaches nothing.
    """
    edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
    target_key = "target" if direction == "out" else "source"

    current_nodes = {start_node}
    for edge_type in edge_types:
        current_nodes = {
            neighbor[target_key]
            for node in current_nodes
            for neighbor in edge_index.get(node, {}).get(edge_type, [])
            if _meets_min_score(neighbor, min_score)
        }
        if not current_nodes:
            break

    return current_nodes


def get_neighbors(
    index: Dict,
    node_id: str,
    edge_type: Optional[str] = None,
    direction: str = "out",
    min_score: Optional[float] = None
) -> List[Dict]:
    """
    Return the neighbor entries of a node.

    Args:
        index: walk index
        node_id: node id
        edge_type: edge type; when omitted, entries of every type are returned
        direction: "out" reads outgoing edges; any other value reads incoming edges
        min_score: when given, entries scoring below it (missing or null
            scores count as 0) are dropped

    Returns:
        A new list of entries, [{"target": "...", "score": 0.5}, ...] for
        outgoing edges or [{"source": "...", "score": 0.5}, ...] for incoming
        ones. The list itself is never the index's own list, so callers may
        modify it freely.
    """
    edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
    node_edges = edge_index.get(node_id, {})

    if edge_type:
        candidates = node_edges.get(edge_type, [])
    else:
        candidates = [entry for entries in node_edges.values() for entry in entries]

    return [entry for entry in candidates if _meets_min_score(entry, min_score)]


# ==================== Process a single post ====================

def process_single_post(filtered_file: Path, output_dir: Path) -> Dict:
    """
    Build the graph of one post and write it to ``output_dir``.

    Args:
        filtered_file: path of the ``*_filtered.json`` input file.
        output_dir: existing directory that receives ``{post_id}_帖子图谱.json``.

    Returns:
        Summary dict with postId, postTitle, nodeCount, edgeCount, pointCount,
        tagCount, matchCount and outputFile.
    """
    with open(filtered_file, "r", encoding="utf-8") as f:
        filtered_data = json.load(f)

    post_id = filtered_data.get("帖子id", "")
    post_detail = filtered_data.get("帖子详情") or {}
    post_title = post_detail.get("title", "")

    all_nodes: Dict[str, Dict] = {}
    all_edges: Dict[str, Dict] = {}

    # 1. Extract point nodes, tag nodes and match edges.
    point_nodes, tag_nodes, tag_to_point, match_edges = extract_points_tags_and_matches(filtered_data)

    # 2. Root node.
    root_id = build_node_id("帖子", "帖子", "帖子", post_id)
    all_nodes[root_id] = create_node(
        domain="帖子",
        dimension="帖子",
        node_type="帖子",
        name=post_id,
        detail={
            "postTitle": post_title,
            "postDetail": post_detail
        }
    )

    # 3. Dimension nodes, each linked to the root in both directions.
    dimensions = ["灵感点", "目的点", "关键点"]
    dimension_node_ids = {}
    for dim in dimensions:
        dim_id = build_node_id("帖子", dim, dim, dim)
        dimension_node_ids[dim] = dim_id
        all_nodes[dim_id] = create_node(
            domain="帖子",
            dimension=dim,
            node_type=dim,
            name=dim,
            detail={}
        )
        _add_hierarchy_pair(all_edges, dim_id, root_id)

    # 4. Point and tag nodes.
    all_nodes.update(point_nodes)
    all_nodes.update(tag_nodes)

    # 5. Hierarchy edges.
    all_edges.update(
        build_belong_contain_edges(point_nodes, tag_nodes, tag_to_point, dimension_node_ids)
    )

    # 6. Match edges.
    all_edges.update(match_edges)

    # 7. Walk index.
    index = build_index(all_edges)

    # 8. Nested tree.
    tree = build_nested_tree(all_nodes, all_edges, root_id)

    # Statistics.
    point_count = len(point_nodes)
    tag_count = len(tag_nodes)
    match_count = len(match_edges) // 2  # edges are stored in both directions

    dimension_stats = {
        dim: {
            "pointCount": sum(1 for n in point_nodes.values() if n["dimension"] == dim),
            "tagCount": sum(1 for n in tag_nodes.values() if n["dimension"] == dim)
        }
        for dim in dimensions
    }

    output_data = {
        "meta": {
            "postId": post_id,
            "postTitle": post_title,
            "postDetail": post_detail,
            "createdAt": datetime.now().isoformat(),
            "stats": {
                "nodeCount": len(all_nodes),
                "edgeCount": len(all_edges),
                "pointCount": point_count,
                "tagCount": tag_count,
                "matchCount": match_count,
                "dimensions": dimension_stats
            }
        },
        "nodes": all_nodes,
        "edges": all_edges,
        "index": index,
        "tree": tree
    }

    output_file = output_dir / f"{post_id}_帖子图谱.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    return {
        "postId": post_id,
        "postTitle": post_title,
        "nodeCount": len(all_nodes),
        "edgeCount": len(all_edges),
        "pointCount": point_count,
        "tagCount": tag_count,
        "matchCount": match_count,
        "outputFile": str(output_file)
    }


# ==================== Entry point ====================

def main() -> None:
    """Build a graph for every ``*_filtered.json`` file and print a summary."""
    # Imported here, by its only user, so that the graph helpers above can be
    # imported without the project package; sys.path is prepared at module level.
    from script.data_processing.path_config import PathConfig

    config = PathConfig()
    config.ensure_dirs()

    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    filtered_results_dir = config.intermediate_dir / "filtered_results"

    output_dir = config.intermediate_dir / "post_graph"
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"输入目录: {filtered_results_dir}")
    print(f"输出目录: {output_dir}")
    print()

    # Sorted so that processing order does not depend on the filesystem.
    filtered_files = sorted(filtered_results_dir.glob("*_filtered.json"))
    print(f"找到 {len(filtered_files)} 个帖子文件")
    print()

    results = []
    for i, filtered_file in enumerate(filtered_files, 1):
        print(f"[{i}/{len(filtered_files)}] 处理: {filtered_file.name}")
        result = process_single_post(filtered_file, output_dir)
        results.append(result)
        print(f" 节点: {result['nodeCount']}, 边: {result['edgeCount']}")
        print(f" 点: {result['pointCount']}, 标签: {result['tagCount']}, 匹配: {result['matchCount']}")
        print(f" → {Path(result['outputFile']).name}")
        print()

    print("=" * 60)
    print("处理完成!")
    print(f" 帖子数: {len(results)}")
    print(f" 总节点数: {sum(r['nodeCount'] for r in results)}")
    print(f" 总边数: {sum(r['edgeCount'] for r in results)}")
    print(f" 总点数: {sum(r['pointCount'] for r in results)}")
    print(f" 总标签数: {sum(r['tagCount'] for r in results)}")
    print(f" 总匹配数: {sum(r['matchCount'] for r in results)}")
    print(f"\n输出目录: {output_dir}")


if __name__ == "__main__":
    main()