| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 构建帖子图谱
- ================================================================================
- 输入文件:
- ================================================================================
- filtered_results/*_filtered.json - 帖子解构结果(过滤后的how解构)
- ================================================================================
- 输出文件: post_graph/{post_id}_帖子图谱.json(每个帖子一个文件)
- ================================================================================
- {
- "meta": { # 元信息
- "postId": "帖子ID",
- "postTitle": "帖子标题",
- "postDetail": {...},
- "createdAt": "时间戳",
- "stats": { ... }
- },
- "nodes": { # 节点字典 (nodeId -> nodeData)
- "{domain}:{dimension}:{type}:{name}": {
- "name": "显示名称",
- "type": "帖子|灵感点|目的点|关键点|点|标签",
- "domain": "帖子",
- "dimension": "帖子|灵感点|目的点|关键点",
- "detail": { ... }
- }
- },
- "edges": { # 边字典 (edgeId -> edgeData)
- "{source}|{type}|{target}": {
- "source": "源节点ID",
- "target": "目标节点ID",
- "type": "属于|包含|匹配",
- "score": 1.0,
- "detail": { ... }
- }
- },
- "index": { # 游走索引
- "outEdges": { nodeId: { edgeType: [{ target, score }] } },
- "inEdges": { nodeId: { edgeType: [{ source, score }] } }
- },
- "tree": { ... } # 嵌套树结构
- }
- ================================================================================
- 核心逻辑:
- ================================================================================
- 1. 从 filtered_results 读取帖子解构结果
- 2. 提取点节点和标签节点
- 3. 添加根节点(帖子)和维度节点(灵感点/目的点/关键点)
- 4. 构建属于/包含边
- 5. 构建索引和嵌套树
- ================================================================================
- 层级对应(人设 vs 帖子):
- ================================================================================
- | 人设 | 帖子 |
- |--------|--------|
- | 人设 | 帖子 |
- | 维度 | 维度 |
- | 分类 | 点 |
- | 标签 | 标签 |
- ================================================================================
- 节点ID格式: {domain}:{dimension}:{type}:{name}
- ================================================================================
- - 根节点: 帖子:帖子:帖子:{post_id}
- - 维度节点: 帖子:灵感点:灵感点:灵感点
- - 点节点: 帖子:灵感点:点:{point_name}
- - 标签节点: 帖子:灵感点:标签:{tag_name}
- ================================================================================
- 边类型:
- ================================================================================
- - 属于: 子节点 -> 父节点(层级关系)
- - 包含: 父节点 -> 子节点(层级关系)
- - 匹配: 帖子标签 <-> 人设标签(双向,score为相似度)
- ================================================================================
- 匹配边说明:
- ================================================================================
- 帖子图谱包含与人设图谱的匹配边,通过节点ID关联:
- - 帖子标签ID: 帖子:灵感点:标签:{tag_name}
- - 人设标签ID: 人设:灵感点:标签:{persona_tag_name}
- 使用方式:从帖子标签出发,沿"匹配"边游走到人设标签ID,
- 再从人设图谱.json中查找该ID的详细信息。
- ================================================================================
- """
- import json
- from pathlib import Path
- from typing import Dict, List, Set
- from datetime import datetime
- import sys
- # 添加项目根目录到路径
- project_root = Path(__file__).parent.parent.parent
- sys.path.insert(0, str(project_root))
- from script.data_processing.path_config import PathConfig
- # ==================== 节点和边构建工具 ====================
def build_node_id(domain: str, dimension: str, node_type: str, name: str) -> str:
    """Return the node ID in the form ``{domain}:{dimension}:{node_type}:{name}``.

    The four parts are joined with ``:`` exactly as given; no escaping or
    validation is performed.
    """
    return "{}:{}:{}:{}".format(domain, dimension, node_type, name)
def build_edge_id(source: str, edge_type: str, target: str) -> str:
    """Return the edge ID in the form ``{source}|{edge_type}|{target}``.

    The parts are joined with ``|`` exactly as given; no escaping is applied.
    """
    return "{}|{}|{}".format(source, edge_type, target)
def create_node(
    domain: str,
    dimension: str,
    node_type: str,
    name: str,
    detail: Dict = None
) -> Dict:
    """Build the dictionary that represents one graph node.

    Args:
        domain: Stored under ``"domain"``.
        dimension: Stored under ``"dimension"``.
        node_type: Stored under ``"type"``.
        name: Stored under ``"name"``.
        detail: Extra payload stored under ``"detail"``. Any falsy value
            (``None`` or an empty dict) is replaced by a fresh empty dict.

    Returns:
        A dict with the keys ``name``, ``type``, ``dimension``, ``domain``
        and ``detail``, in that order.
    """
    node = {
        "name": name,
        "type": node_type,
        "dimension": dimension,
        "domain": domain,
    }
    # Attached last so the key order of the serialized node stays the same.
    node["detail"] = detail if detail else {}
    return node
def create_edge(
    source: str,
    target: str,
    edge_type: str,
    score: float = None,
    detail: Dict = None
) -> Dict:
    """Build the dictionary that represents one directed graph edge.

    Args:
        source: ID of the node the edge starts from.
        target: ID of the node the edge points to.
        edge_type: Stored under ``"type"``.
        score: Stored under ``"score"`` unchanged (``None`` when omitted).
        detail: Extra payload stored under ``"detail"``. Any falsy value
            (``None`` or an empty dict) is replaced by a fresh empty dict.

    Returns:
        A dict with the keys ``source``, ``target``, ``type``, ``score`` and
        ``detail``, in that order.
    """
    edge = {
        "source": source,
        "target": target,
        "type": edge_type,
        "score": score,
    }
    # Attached last so the key order of the serialized edge stays the same.
    edge["detail"] = detail if detail else {}
    return edge
- # ==================== 从帖子解构结果提取节点和匹配边 ====================
def extract_points_tags_and_matches(filtered_data: Dict) -> tuple:
    """Extract point nodes, tag nodes and match edges from one post's result.

    The function reads ``filtered_data["how解构结果"]`` and, for each of the
    three lists ``灵感点列表`` / ``目的点列表`` / ``关键点列表``, walks
    point -> ``how步骤列表`` -> ``特征列表`` -> ``匹配结果``.

    Missing keys and keys whose value is ``None`` (JSON ``null``) are both
    treated as empty, so a partially filled result does not raise.

    Notes:
        * Points and features with an empty name are skipped.
        * A tag node is created the first time its ID is seen; its
          ``weight`` and ``stepName`` therefore come from that first
          occurrence. Later occurrences only extend ``pointNames``.
        * Every match produces two edges (tag -> persona and persona -> tag)
          carrying the same similarity score. Edges are keyed by ID, so a
          repeated tag/persona pair overwrites the earlier edge.
        * Matches without a persona name or persona dimension are skipped.

    Args:
        filtered_data: Parsed content of one filtered-result JSON file.

    Returns:
        A 4-tuple ``(point_nodes, tag_nodes, tag_to_point, match_edges)``:
        ``point_nodes`` and ``tag_nodes`` map node ID -> node data,
        ``tag_to_point`` maps tag ID -> list of owning point IDs, and
        ``match_edges`` maps edge ID -> edge data.
    """
    point_nodes = {}   # nodeId -> nodeData
    tag_nodes = {}     # nodeId -> nodeData
    tag_to_point = {}  # tagId -> [pointId, ...]
    match_edges = {}   # edgeId -> edgeData

    # ``or {}`` / ``or []`` also cover keys that exist but hold JSON null,
    # which ``.get(key, default)`` alone would pass through as None.
    how_result = filtered_data.get("how解构结果") or {}
    dimension_mapping = {
        "灵感点列表": "灵感点",
        "目的点列表": "目的点",
        "关键点列表": "关键点"
    }

    for list_key, dimension in dimension_mapping.items():
        for point in how_result.get(list_key) or []:
            point_name = point.get("名称", "")
            if not point_name:
                continue

            # Point node
            point_id = build_node_id("帖子", dimension, "点", point_name)
            point_nodes[point_id] = create_node(
                domain="帖子",
                dimension=dimension,
                node_type="点",
                name=point_name,
                detail={
                    "description": point.get("描述", "")
                }
            )

            # Walk the how-steps to collect tags and their matches
            for step in point.get("how步骤列表") or []:
                step_name = step.get("步骤名称", "")
                for feature in step.get("特征列表") or []:
                    tag_name = feature.get("特征名称", "")
                    if not tag_name:
                        continue

                    # Tag node: create once, afterwards only record new owners
                    tag_id = build_node_id("帖子", dimension, "标签", tag_name)
                    existing_tag = tag_nodes.get(tag_id)
                    if existing_tag is None:
                        tag_nodes[tag_id] = create_node(
                            domain="帖子",
                            dimension=dimension,
                            node_type="标签",
                            name=tag_name,
                            detail={
                                "weight": feature.get("权重", 1.0),
                                "stepName": step_name,
                                "pointNames": [point_name]
                            }
                        )
                    elif point_name not in existing_tag["detail"]["pointNames"]:
                        # The same tag can belong to several points
                        existing_tag["detail"]["pointNames"].append(point_name)

                    # Tag -> owning points mapping (no duplicates)
                    owners = tag_to_point.setdefault(tag_id, [])
                    if point_id not in owners:
                        owners.append(point_id)

                    # Match edges towards the persona graph
                    for match in feature.get("匹配结果") or []:
                        persona_name = match.get("人设特征名称", "")
                        persona_dimension = match.get("人设特征层级", "")
                        if not persona_name or not persona_dimension:
                            continue

                        persona_type = match.get("特征类型", "标签")
                        similarity = (match.get("匹配结果") or {}).get("相似度", 0)
                        persona_id = build_node_id(
                            "人设", persona_dimension, persona_type, persona_name
                        )

                        # Bidirectional: post tag -> persona, then persona -> post tag
                        for src, dst in ((tag_id, persona_id), (persona_id, tag_id)):
                            match_edges[build_edge_id(src, "匹配", dst)] = create_edge(
                                source=src,
                                target=dst,
                                edge_type="匹配",
                                score=similarity,
                                detail={}
                            )

    return point_nodes, tag_nodes, tag_to_point, match_edges
- # ==================== 构建边 ====================
def build_belong_contain_edges(
    point_nodes: Dict[str, Dict],
    tag_nodes: Dict[str, Dict],
    tag_to_point: Dict[str, List[str]],
    dimension_node_ids: Dict[str, str]
) -> Dict[str, Dict]:
    """Build the hierarchical "属于" / "包含" edges.

    Two levels are linked, each with a child -> parent "属于" edge and the
    mirrored parent -> child "包含" edge (both with score 1.0):

    1. every point to the dimension node named by its ``"dimension"`` field;
    2. every tag to each point listed for it in ``tag_to_point``.

    Args:
        point_nodes: Point node ID -> node data.
        tag_nodes: Tag node ID -> node data (accepted but not read here).
        tag_to_point: Tag node ID -> IDs of the points that own the tag.
        dimension_node_ids: Dimension name -> dimension node ID.

    Returns:
        Edge ID -> edge data.

    Raises:
        KeyError: If a point's dimension is missing from
            ``dimension_node_ids``.
    """
    hierarchy = {}

    def link(child_id: str, parent_id: str) -> None:
        # Emit the "属于" edge first and its "包含" mirror second.
        for src, kind, dst in (
            (child_id, "属于", parent_id),
            (parent_id, "包含", child_id),
        ):
            hierarchy[build_edge_id(src, kind, dst)] = create_edge(
                source=src,
                target=dst,
                edge_type=kind,
                score=1.0
            )

    # 1. point <-> dimension
    for point_id, point in point_nodes.items():
        link(point_id, dimension_node_ids[point["dimension"]])

    # 2. tag <-> point
    for tag_id, owner_ids in tag_to_point.items():
        for owner_id in owner_ids:
            link(tag_id, owner_id)

    return hierarchy
- # ==================== 构建索引 ====================
def build_index(edges: Dict[str, Dict]) -> Dict:
    """Build the adjacency index used for graph walks.

    Args:
        edges: Edge ID -> edge data; each edge must provide ``source``,
            ``target``, ``type`` and ``score``.

    Returns:
        ``{"outEdges": {nodeId: {edgeType: [{"target", "score"}]}},
        "inEdges": {nodeId: {edgeType: [{"source", "score"}]}}}``.
        Entries appear in the iteration order of ``edges``.
    """
    outgoing = {}
    incoming = {}

    for edge in edges.values():
        src, dst = edge["source"], edge["target"]
        kind, weight = edge["type"], edge["score"]

        # setdefault creates the per-node and per-type containers on demand.
        outgoing.setdefault(src, {}).setdefault(kind, []).append(
            {"target": dst, "score": weight}
        )
        incoming.setdefault(dst, {}).setdefault(kind, []).append(
            {"source": src, "score": weight}
        )

    return {"outEdges": outgoing, "inEdges": incoming}
- # ==================== 构建嵌套树 ====================
def build_nested_tree(nodes: Dict[str, Dict], edges: Dict[str, Dict], root_id: str) -> Dict:
    """Build a nested tree by following "包含" edges down from ``root_id``.

    Only edges whose ``type`` is "包含" are used. Children whose ID is not in
    ``nodes`` are left out. A node reachable through several parents appears
    once under each of them.

    Args:
        nodes: Node ID -> node data.
        edges: Edge ID -> edge data.
        root_id: ID of the node to start from; must be present in ``nodes``.

    Returns:
        A dict with ``id``, ``name``, ``type``, ``domain``, ``dimension``,
        ``detail`` and ``children`` (a list of subtrees of the same shape).
    """
    # parent ID -> child IDs, in edge iteration order
    children_of = {}
    for edge in edges.values():
        if edge["type"] != "包含":
            continue
        children_of.setdefault(edge["source"], []).append(edge["target"])

    def expand(node_id: str) -> Dict:
        record = nodes[node_id]
        return {
            "id": node_id,
            "name": record["name"],
            "type": record["type"],
            "domain": record["domain"],
            "dimension": record["dimension"],
            "detail": record.get("detail", {}),
            "children": [
                expand(child_id)
                for child_id in children_of.get(node_id, [])
                if child_id in nodes
            ],
        }

    return expand(root_id)
- # ==================== 图游走工具 ====================
def walk_graph(
    index: Dict,
    start_node: str,
    edge_types: List[str],
    direction: str = "out",
    min_score: float = None
) -> Set[str]:
    """Walk from ``start_node`` along a sequence of edge types, one per hop.

    Args:
        index: Walk index of the form ``{"outEdges": {...}, "inEdges": {...}}``.
        start_node: ID of the node to start from.
        edge_types: Edge type to follow at each hop, e.g. ``["属于", "包含"]``.
        direction: ``"out"`` follows outgoing edges; any other value follows
            incoming edges.
        min_score: When given, edges scoring below it are ignored (an edge
            without a ``score`` key counts as 0).

    Returns:
        The set of node IDs reached after the last hop. It is
        ``{start_node}`` when ``edge_types`` is empty, and empty as soon as
        one hop reaches nothing.
    """
    outward = direction == "out"
    adjacency = index["outEdges"] if outward else index["inEdges"]
    endpoint = "target" if outward else "source"

    frontier = {start_node}
    for hop_type in edge_types:
        frontier = {
            hop[endpoint]
            for node in frontier
            for hop in adjacency.get(node, {}).get(hop_type, [])
            if min_score is None or not hop.get("score", 0) < min_score
        }
        if not frontier:
            # Nothing reachable: later hops cannot add anything.
            break

    return frontier
def get_neighbors(
    index: Dict,
    node_id: str,
    edge_type: str = None,
    direction: str = "out",
    min_score: float = None
) -> List[Dict]:
    """Return the neighbor entries of a node.

    Args:
        index: Walk index of the form ``{"outEdges": {...}, "inEdges": {...}}``.
        node_id: ID of the node to look up.
        edge_type: Restrict the result to this edge type. When omitted (or
            empty), entries of every edge type are returned.
        direction: ``"out"`` reads outgoing edges; any other value reads
            incoming edges.
        min_score: When given, keep only entries whose score is at least this
            value (an entry without a ``score`` key counts as 0).

    Returns:
        A new list of entries, ``{"target": ..., "score": ...}`` for outgoing
        edges or ``{"source": ..., "score": ...}`` for incoming ones. The
        list is always a fresh object, so adding or removing items never
        changes the index; the entry dicts themselves are shared with it.
    """
    edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
    node_edges = edge_index.get(node_id, {})

    if edge_type:
        # Copy: handing out the list stored in the index would let a caller's
        # append/remove silently corrupt the index.
        neighbors = list(node_edges.get(edge_type, []))
    else:
        neighbors = [entry for group in node_edges.values() for entry in group]

    if min_score is not None:
        neighbors = [n for n in neighbors if n.get("score", 0) >= min_score]

    return neighbors
- # ==================== 处理单个帖子 ====================
def process_single_post(filtered_file: Path, output_dir: Path) -> Dict:
    """Build the graph for one post and write it to ``output_dir``.

    Steps: load the filtered-result JSON, extract point/tag nodes and match
    edges, add the root node and the three dimension nodes, add the
    "属于"/"包含" and "匹配" edges, build the walk index and the nested
    tree, then dump everything to ``{post_id}_帖子图谱.json``.

    Args:
        filtered_file: Path of the ``*_filtered.json`` input file.
        output_dir: Directory the graph file is written to (not created here).

    Returns:
        Summary dict with ``postId``, ``postTitle``, ``nodeCount``,
        ``edgeCount``, ``pointCount``, ``tagCount``, ``matchCount`` and
        ``outputFile``.
    """
    with open(filtered_file, "r", encoding="utf-8") as src:
        filtered_data = json.load(src)

    post_id = filtered_data.get("帖子id", "")
    post_detail = filtered_data.get("帖子详情", {})
    post_title = post_detail.get("title", "")

    # 1. Point nodes, tag nodes and match edges
    point_nodes, tag_nodes, tag_to_point, match_edges = extract_points_tags_and_matches(filtered_data)

    # 2. Root node (the post itself)
    root_id = build_node_id("帖子", "帖子", "帖子", post_id)
    all_nodes = {
        root_id: create_node(
            domain="帖子",
            dimension="帖子",
            node_type="帖子",
            name=post_id,
            detail={
                "postTitle": post_title,
                "postDetail": post_detail
            }
        )
    }
    all_edges = {}

    # 3. Dimension nodes, each linked to the root in both directions
    dimensions = ["灵感点", "目的点", "关键点"]
    dimension_node_ids = {dim: build_node_id("帖子", dim, dim, dim) for dim in dimensions}
    for dim, dim_id in dimension_node_ids.items():
        all_nodes[dim_id] = create_node(
            domain="帖子",
            dimension=dim,
            node_type=dim,
            name=dim,
            detail={}
        )
        all_edges[build_edge_id(dim_id, "属于", root_id)] = create_edge(
            source=dim_id,
            target=root_id,
            edge_type="属于",
            score=1.0
        )
        all_edges[build_edge_id(root_id, "包含", dim_id)] = create_edge(
            source=root_id,
            target=dim_id,
            edge_type="包含",
            score=1.0
        )

    # 4. Point and tag nodes
    all_nodes.update(point_nodes)
    all_nodes.update(tag_nodes)

    # 5. Hierarchy edges, 6. match edges
    all_edges.update(
        build_belong_contain_edges(point_nodes, tag_nodes, tag_to_point, dimension_node_ids)
    )
    all_edges.update(match_edges)

    # 7. Walk index, 8. nested tree
    index = build_index(all_edges)
    tree = build_nested_tree(all_nodes, all_edges, root_id)

    # Statistics; match edges are stored in both directions, hence the // 2
    counts = {
        "nodeCount": len(all_nodes),
        "edgeCount": len(all_edges),
        "pointCount": len(point_nodes),
        "tagCount": len(tag_nodes),
        "matchCount": len(match_edges) // 2,
    }
    dimension_stats = {
        dim: {
            "pointCount": sum(1 for n in point_nodes.values() if n["dimension"] == dim),
            "tagCount": sum(1 for n in tag_nodes.values() if n["dimension"] == dim)
        }
        for dim in dimensions
    }

    output_data = {
        "meta": {
            "postId": post_id,
            "postTitle": post_title,
            "postDetail": post_detail,
            "createdAt": datetime.now().isoformat(),
            "stats": {**counts, "dimensions": dimension_stats}
        },
        "nodes": all_nodes,
        "edges": all_edges,
        "index": index,
        "tree": tree
    }

    output_file = output_dir / f"{post_id}_帖子图谱.json"
    with open(output_file, "w", encoding="utf-8") as dst:
        json.dump(output_data, dst, ensure_ascii=False, indent=2)

    return {
        "postId": post_id,
        "postTitle": post_title,
        **counts,
        "outputFile": str(output_file)
    }
- # ==================== 主函数 ====================
def main():
    """Build a post graph for every ``*_filtered.json`` file of the account.

    Input files are read from ``<intermediate_dir>/filtered_results`` and one
    graph file per post is written to ``<intermediate_dir>/post_graph``.
    Progress and overall totals are printed to stdout.
    """
    config = PathConfig()
    config.ensure_dirs()
    print(f"账号: {config.account_name}")
    print(f"输出版本: {config.output_version}")
    print()

    # Input and output directories
    filtered_results_dir = config.intermediate_dir / "filtered_results"
    output_dir = config.intermediate_dir / "post_graph"
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"输入目录: {filtered_results_dir}")
    print(f"输出目录: {output_dir}")
    print()

    # Collect the post files
    filtered_files = list(filtered_results_dir.glob("*_filtered.json"))
    file_count = len(filtered_files)
    print(f"找到 {file_count} 个帖子文件")
    print()

    # Process the posts one by one
    results = []
    for position, post_file in enumerate(filtered_files, start=1):
        print(f"[{position}/{file_count}] 处理: {post_file.name}")
        summary = process_single_post(post_file, output_dir)
        results.append(summary)
        print(f" 节点: {summary['nodeCount']}, 边: {summary['edgeCount']}")
        print(f" 点: {summary['pointCount']}, 标签: {summary['tagCount']}, 匹配: {summary['matchCount']}")
        print(f" → {Path(summary['outputFile']).name}")
        print()

    def total_of(key):
        # Sum one counter over all processed posts.
        return sum(item[key] for item in results)

    # Overall totals
    print("=" * 60)
    print("处理完成!")
    print(f" 帖子数: {len(results)}")
    print(f" 总节点数: {total_of('nodeCount')}")
    print(f" 总边数: {total_of('edgeCount')}")
    print(f" 总点数: {total_of('pointCount')}")
    print(f" 总标签数: {total_of('tagCount')}")
    print(f" 总匹配数: {total_of('matchCount')}")
    print(f"\n输出目录: {output_dir}")


if __name__ == "__main__":
    main()
|