#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build the post/persona node-edge relation graph from match results.

Inputs:
    1. Match result files under the filtered_results directory
    2. 节点列表.json (node list)
    3. 边关系.json (edge relations)
Outputs:
    1. Node-edge relation files under the match_graph directory
"""
import json
from pathlib import Path
from typing import Dict, List, Set
import sys

# Add the project root to sys.path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig


def build_post_node_id(dimension: str, node_type: str, name: str) -> str:
    """Build a post-side node ID.

    Args:
        dimension: Dimension (灵感点/关键点/目的点)
        node_type: Node type (点/标签)
        name: Node name
    """
    return f"帖子_{dimension}_{node_type}_{name}"


def build_persona_node_id(dimension: str, node_type: str, name: str) -> str:
    """Build a persona-side node ID."""
    return f"{dimension}_{node_type}_{name}"


def extract_matched_nodes_and_edges(filtered_data: Dict) -> tuple:
    """
    Extract post nodes (points + tags), persona node IDs and edges from a match result.

    Args:
        filtered_data: Match result data

    Returns:
        (post node list, persona node ID set, edge list)
        Post nodes include point nodes (灵感点/关键点/目的点) and tag nodes.
        Edges include tag→point "属于" (belongs-to) edges and tag→persona match edges.
    """
    post_nodes = []
    persona_node_ids = set()
    edges = []  # Holds both belongs-to edges and match edges

    how_result = filtered_data.get("how解构结果", {})

    # Dimension mapping: list key in the source data -> dimension name
    dimension_mapping = {
        "灵感点列表": "灵感点",
        "目的点列表": "目的点",
        "关键点列表": "关键点"
    }

    for list_key, dimension in dimension_mapping.items():
        points = how_result.get(list_key, [])
        for point in points:
            point_name = point.get("名称", "")
            point_desc = point.get("描述", "")
            if not point_name:
                continue

            # Create the post point node
            point_node_id = build_post_node_id(dimension, "点", point_name)
            point_node = {
                "节点ID": point_node_id,
                "节点名称": point_name,
                "节点类型": "点",
                "节点层级": dimension,
                "描述": point_desc,
                "source": "帖子"
            }
            # Avoid adding duplicate point nodes
            if not any(n["节点ID"] == point_node_id for n in post_nodes):
                post_nodes.append(point_node)

            # Walk the how-step list and extract tag nodes
            how_steps = point.get("how步骤列表", [])
            for step in how_steps:
                features = step.get("特征列表", [])
                for feature in features:
                    feature_name = feature.get("特征名称", "")
                    weight = feature.get("权重", 0)
                    match_results = feature.get("匹配结果", [])

                    if not feature_name:
                        continue

                    # Create the post tag node (whether or not it has matches)
                    tag_node_id = build_post_node_id(dimension, "标签", feature_name)
                    tag_node = {
                        "节点ID": tag_node_id,
                        "节点名称": feature_name,
                        "节点类型": "标签",
                        "节点层级": dimension,
                        "权重": weight,
                        "source": "帖子",
                        "已匹配": len(match_results) > 0  # Whether this tag has any match
                    }
                    # Avoid adding duplicate tag nodes
                    if not any(n["节点ID"] == tag_node_id for n in post_nodes):
                        post_nodes.append(tag_node)

                    # Create the tag→point belongs-to edge
                    belong_edge = {
                        "源节点ID": tag_node_id,
                        "目标节点ID": point_node_id,
                        "边类型": "属于",
                        "边详情": {
                            "说明": f"标签「{feature_name}」属于点「{point_name}」"
                        }
                    }
                    # Avoid adding duplicate belongs-to edges
                    edge_key = (tag_node_id, point_node_id, "属于")
                    if not any((e["源节点ID"], e["目标节点ID"], e["边类型"]) == edge_key for e in edges):
                        edges.append(belong_edge)

                    # If there are match results, create match edges
                    if match_results:
                        for match in match_results:
                            persona_name = match.get("人设特征名称", "")
                            persona_dimension = match.get("人设特征层级", "")
                            persona_type = match.get("特征类型", "标签")
                            match_detail = match.get("匹配结果", {})

                            if not persona_name or not persona_dimension:
                                continue

                            # Build the persona node ID
                            persona_node_id = build_persona_node_id(
                                persona_dimension, persona_type, persona_name
                            )
                            persona_node_ids.add(persona_node_id)

                            # Create the match edge (type depends on similarity)
                            similarity = match_detail.get("相似度", 0)
                            if similarity >= 0.8:
                                edge_type = "匹配_相同"
                            else:
                                edge_type = "匹配_相似"

                            match_edge = {
                                "源节点ID": tag_node_id,
                                "目标节点ID": persona_node_id,
                                "边类型": edge_type,
                                "边详情": {
                                    "相似度": similarity,
                                    "说明": match_detail.get("说明", "")
                                }
                            }
                            edges.append(match_edge)

    return post_nodes, persona_node_ids, edges
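
# Illustrative sketch (not real data) of the match-result shape this module expects,
# inferred from the keys accessed above; actual files may carry additional fields:
#
#   {
#     "帖子id": "...",
#     "帖子详情": {"title": "..."},
#     "how解构结果": {
#       "灵感点列表": [
#         {
#           "名称": "<point name>",
#           "描述": "<point description>",
#           "how步骤列表": [
#             {"特征列表": [
#               {
#                 "特征名称": "<tag name>",
#                 "权重": 0.7,
#                 "匹配结果": [
#                   {
#                     "人设特征名称": "<persona feature>",
#                     "人设特征层级": "<persona dimension>",
#                     "特征类型": "标签",
#                     "匹配结果": {"相似度": 0.85, "说明": "..."}
#                   }
#                 ]
#               }
#             ]}
#           ]
#         }
#       ],
#       "目的点列表": [...],
#       "关键点列表": [...]
#     }
#   }
#
# For example (hypothetical name), a tag "露营" under the 灵感点 dimension becomes the
# post node ID "帖子_灵感点_标签_露营", while a persona feature with the same name and
# dimension maps to "灵感点_标签_露营" (see build_post_node_id / build_persona_node_id).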

def get_persona_nodes_details(
    persona_node_ids: Set[str],
    nodes_data: Dict
) -> List[Dict]:
    """
    Look up the full details of persona nodes from the node list.

    Args:
        persona_node_ids: Set of persona node IDs
        nodes_data: Node list data

    Returns:
        List of persona node detail dicts
    """
    persona_nodes = []
    all_nodes = nodes_data.get("节点列表", [])

    for node in all_nodes:
        if node["节点ID"] in persona_node_ids:
            persona_nodes.append(node)

    return persona_nodes


def get_edges_between_nodes(
    node_ids: Set[str],
    edges_data: Dict
) -> List[Dict]:
    """
    Collect the edges that run between the given nodes.

    Args:
        node_ids: Set of node IDs
        edges_data: Edge relation data

    Returns:
        List of edges whose endpoints are both in node_ids
    """
    edges_between = []
    all_edges = edges_data.get("边列表", [])

    for edge in all_edges:
        source_id = edge["源节点ID"]
        target_id = edge["目标节点ID"]
        # Keep the edge only if both endpoints are in the set
        if source_id in node_ids and target_id in node_ids:
            edges_between.append(edge)

    return edges_between


def create_mirrored_post_edges(
    match_edges: List[Dict],
    persona_edges: List[Dict]
) -> List[Dict]:
    """
    Project persona-to-persona edges onto the post side as mirrored edges.

    Logic: if persona nodes A and B are connected by an edge, post node X matches A,
    and post node Y matches B, then create a mirrored edge between X and Y.

    Args:
        match_edges: Match edges (post node -> persona node)
        persona_edges: Edges between persona nodes

    Returns:
        List of mirrored edges between post nodes
    """
    # Build the reverse mapping from persona node to post nodes:
    # persona_id -> [post_id1, post_id2, ...]
    persona_to_posts = {}
    for edge in match_edges:
        post_id = edge["源节点ID"]
        persona_id = edge["目标节点ID"]
        if persona_id not in persona_to_posts:
            persona_to_posts[persona_id] = []
        if post_id not in persona_to_posts[persona_id]:
            persona_to_posts[persona_id].append(post_id)

    # Create post mirror edges from the persona edges
    post_edges = []
    seen_edges = set()

    for persona_edge in persona_edges:
        source_persona = persona_edge["源节点ID"]
        target_persona = persona_edge["目标节点ID"]
        edge_type = persona_edge["边类型"]

        # Post nodes matched to the two persona endpoints
        source_posts = persona_to_posts.get(source_persona, [])
        target_posts = persona_to_posts.get(target_persona, [])

        # Create a mirrored edge for every pair of post nodes
        for src_post in source_posts:
            for tgt_post in target_posts:
                if src_post == tgt_post:
                    continue

                # Use a sorted key so A-B and B-A count as the same edge
                edge_key = tuple(sorted([src_post, tgt_post])) + (edge_type,)
                if edge_key in seen_edges:
                    continue
                seen_edges.add(edge_key)

                post_edge = {
                    "源节点ID": src_post,
                    "目标节点ID": tgt_post,
                    "边类型": f"镜像_{edge_type}",  # Mark as a mirrored edge
                    "边详情": {
                        "原始边类型": edge_type,
                        "源人设节点": source_persona,
                        "目标人设节点": target_persona
                    }
                }
                post_edges.append(post_edge)

    return post_edges
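
# Worked micro-example (hypothetical node IDs and edge type) of the mirroring logic
# implemented by create_mirrored_post_edges above:
#
#   match edges:   帖子_灵感点_标签_A --匹配_相同--> 灵感点_标签_A
#                  帖子_关键点_标签_B --匹配_相似--> 关键点_标签_B
#   persona edge:  灵感点_标签_A --关联--> 关键点_标签_B
#
#   projected post edge (deduplicated on the sorted node pair plus edge type):
#                  帖子_灵感点_标签_A --镜像_关联--> 帖子_关键点_标签_B
#
# The persona edge type "关联" here is only a placeholder; whatever edge types exist
# in 边关系.json are mirrored verbatim with the "镜像_" prefix.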

def expand_one_layer(
    node_ids: Set[str],
    edges_data: Dict,
    nodes_data: Dict,
    edge_types: List[str] = None,
    direction: str = "both"
) -> tuple:
    """
    Expand one hop from the given nodes, collecting neighbour nodes and connecting edges.

    Args:
        node_ids: Starting node ID set
        edges_data: Edge relation data
        nodes_data: Node list data
        edge_types: Edge types to expand along; None means all types
        direction: Expansion direction
            - "outgoing": only along outgoing edges (source in the set, expand to the target)
            - "incoming": only along incoming edges (target in the set, expand to the source)
            - "both": expand in both directions

    Returns:
        (expanded node list, expanded edge list, expanded node ID set)
    """
    expanded_node_ids = set()
    expanded_edges = []
    all_edges = edges_data.get("边列表", [])

    # Find every edge and node adjacent to the starting nodes
    for edge in all_edges:
        # Filter by edge type
        if edge_types and edge["边类型"] not in edge_types:
            continue

        source_id = edge["源节点ID"]
        target_id = edge["目标节点ID"]

        # Outgoing: source is in the set, expand to the target
        if direction in ["outgoing", "both"]:
            if source_id in node_ids and target_id not in node_ids:
                expanded_node_ids.add(target_id)
                expanded_edges.append(edge)

        # Incoming: target is in the set, expand to the source
        if direction in ["incoming", "both"]:
            if target_id in node_ids and source_id not in node_ids:
                expanded_node_ids.add(source_id)
                expanded_edges.append(edge)

    # Fetch the details of the expanded nodes
    expanded_nodes = []
    all_nodes = nodes_data.get("节点列表", [])
    for node in all_nodes:
        if node["节点ID"] in expanded_node_ids:
            # Mark as an expanded node
            node_copy = node.copy()
            node_copy["是否扩展"] = True
            node_copy["source"] = "人设"
            expanded_nodes.append(node_copy)

    return expanded_nodes, expanded_edges, expanded_node_ids


def expand_and_filter_useful_nodes(
    matched_persona_ids: Set[str],
    match_edges: List[Dict],
    edges_data: Dict,
    nodes_data: Dict,
    exclude_edge_types: List[str] = None
) -> tuple:
    """
    Expand the persona nodes one hop and keep only the expanded nodes that can create
    new post-to-post links.

    Logic: if an expanded node E is connected to two or more matched persona nodes,
    then new links between posts can be produced through E, so E is kept.

    Args:
        matched_persona_ids: Set of matched persona node IDs
        match_edges: Match edge list
        edges_data: Edge relation data
        nodes_data: Node list data
        exclude_edge_types: Edge types to exclude

    Returns:
        (useful expanded node list, expanded edge list, post mirror edges created via expanded nodes)
    """
    if exclude_edge_types is None:
        exclude_edge_types = []

    all_edges = edges_data.get("边列表", [])

    # Build the persona-node -> post-node mapping
    persona_to_posts = {}
    for edge in match_edges:
        post_id = edge["源节点ID"]
        persona_id = edge["目标节点ID"]
        if persona_id not in persona_to_posts:
            persona_to_posts[persona_id] = []
        if post_id not in persona_to_posts[persona_id]:
            persona_to_posts[persona_id].append(post_id)

    # Find all expansion candidates and the matched persona nodes they connect to:
    # expanded_node_id -> [(matched_persona_id, edge), ...]
    expanded_connections = {}

    for edge in all_edges:
        # Skip excluded edge types
        if edge["边类型"] in exclude_edge_types:
            continue

        source_id = edge["源节点ID"]
        target_id = edge["目标节点ID"]

        # Source is matched, target is an expansion candidate
        if source_id in matched_persona_ids and target_id not in matched_persona_ids:
            if target_id not in expanded_connections:
                expanded_connections[target_id] = []
            expanded_connections[target_id].append((source_id, edge))

        # Target is matched, source is an expansion candidate
        if target_id in matched_persona_ids and source_id not in matched_persona_ids:
            if source_id not in expanded_connections:
                expanded_connections[source_id] = []
            expanded_connections[source_id].append((target_id, edge))

    # Filter: keep only expanded nodes connected to two or more matched persona nodes
    useful_expanded_ids = set()
    useful_edges = []
    post_mirror_edges = []
    seen_mirror_edges = set()

    for expanded_id, connections in expanded_connections.items():
        connected_personas = list(set([c[0] for c in connections]))

        if len(connected_personas) >= 2:
            useful_expanded_ids.add(expanded_id)

            # Collect the connecting edges
            for persona_id, edge in connections:
                useful_edges.append(edge)

            # For every pair of persona nodes linked through this expanded node,
            # create post mirror edges
            for i, p1 in enumerate(connected_personas):
                for p2 in connected_personas[i+1:]:
                    posts1 = persona_to_posts.get(p1, [])
                    posts2 = persona_to_posts.get(p2, [])

                    # Edge types connecting p1 and p2 to this expanded node
                    edge_types_p1 = [c[1]["边类型"] for c in connections if c[0] == p1]
                    edge_types_p2 = [c[1]["边类型"] for c in connections if c[0] == p2]
                    # Use the first edge type as the representative
                    edge_type = edge_types_p1[0] if edge_types_p1 else (edge_types_p2[0] if edge_types_p2 else "扩展")

                    for post1 in posts1:
                        for post2 in posts2:
                            if post1 == post2:
                                continue

                            # Avoid duplicates
                            edge_key = tuple(sorted([post1, post2])) + (f"二阶_{edge_type}",)
                            if edge_key in seen_mirror_edges:
                                continue
                            seen_mirror_edges.add(edge_key)

                            post_mirror_edges.append({
                                "源节点ID": post1,
                                "目标节点ID": post2,
                                "边类型": f"二阶_{edge_type}",
                                "边详情": {
                                    "原始边类型": edge_type,
                                    "扩展节点": expanded_id,
                                    "源人设节点": p1,
                                    "目标人设节点": p2
                                }
                            })

    # Fetch the details of the useful expanded nodes
    useful_expanded_nodes = []
    all_nodes = nodes_data.get("节点列表", [])
    for node in all_nodes:
        if node["节点ID"] in useful_expanded_ids:
            node_copy = node.copy()
            node_copy["是否扩展"] = True
            useful_expanded_nodes.append(node_copy)

    # Deduplicate the collected edges
    seen_edges = set()
    unique_edges = []
    for edge in useful_edges:
        edge_key = (edge["源节点ID"], edge["目标节点ID"], edge["边类型"])
        if edge_key not in seen_edges:
            seen_edges.add(edge_key)
            unique_edges.append(edge)

    return useful_expanded_nodes, unique_edges, post_mirror_edges
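
# Illustrative trace (hypothetical node IDs) for expand_and_filter_useful_nodes:
#
#   matched personas:  P1 = 灵感点_标签_A, P2 = 关键点_标签_B
#   candidate edges:   P1 --属于--> 灵感点_分类_C
#                      P2 --属于--> 灵感点_分类_C
#                      P1 --属于--> 灵感点_分类_D          (touches only one matched persona)
#
#   灵感点_分类_C connects two matched personas, so it is kept and every pair of posts
#   matched to P1 and P2 gets a "二阶_属于" post edge; 灵感点_分类_D connects only one
#   matched persona and is dropped.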

def process_filtered_result(
    filtered_file: Path,
    nodes_data: Dict,
    edges_data: Dict,
    output_dir: Path
) -> Dict:
    """
    Process a single match result file.

    Args:
        filtered_file: Path to the match result file
        nodes_data: Node list data
        edges_data: Edge relation data
        output_dir: Output directory

    Returns:
        Statistics about the processed file
    """
    # Read the match result
    with open(filtered_file, "r", encoding="utf-8") as f:
        filtered_data = json.load(f)

    post_id = filtered_data.get("帖子id", "")
    post_detail = filtered_data.get("帖子详情", {})
    post_title = post_detail.get("title", "")

    # Extract nodes and edges (post point nodes, tag nodes, belongs-to edges and match edges)
    post_nodes, persona_node_ids, post_edges_raw = extract_matched_nodes_and_edges(filtered_data)

    # Split the post-side edges into belongs-to edges (tag→point) and match edges (tag→persona)
    post_belong_edges = [e for e in post_edges_raw if e["边类型"] == "属于"]
    match_edges = [e for e in post_edges_raw if e["边类型"].startswith("匹配_")]

    # Separate post point nodes and tag nodes for the statistics
    post_point_nodes = [n for n in post_nodes if n["节点类型"] == "点"]
    post_tag_nodes = [n for n in post_nodes if n["节点类型"] == "标签"]

    # Fetch persona node details (directly matched, marked as non-expanded)
    persona_nodes = get_persona_nodes_details(persona_node_ids, nodes_data)
    for node in persona_nodes:
        node["是否扩展"] = False
        node["source"] = "人设"

    # Edges between the persona nodes themselves
    persona_edges = get_edges_between_nodes(persona_node_ids, edges_data)

    # Mirrored post-to-post edges (projection of the direct persona edges)
    post_edges = create_mirrored_post_edges(match_edges, persona_edges)

    # Expand the persona nodes one hop: only tag-type nodes expand to categories via "属于"
    # edges (only tags can "belong to" a category)
    tag_persona_ids = {pid for pid in persona_node_ids if "_标签_" in pid}
    expanded_nodes, expanded_edges, _ = expand_one_layer(
        tag_persona_ids,
        edges_data,
        nodes_data,
        edge_types=["属于"],
        direction="outgoing"  # expand outwards only: tag -> category
    )

    # Build post mirror edges that go through expanded nodes.
    # Logic: post -> tag -> category; if two categories are connected by an edge,
    # the corresponding posts get a second-order edge.

    # 1. Build the persona-tag -> post-node mapping
    tag_to_posts = {}
    for edge in match_edges:
        post_node_id = edge["源节点ID"]
        tag_id = edge["目标节点ID"]
        if tag_id not in tag_to_posts:
            tag_to_posts[tag_id] = []
        if post_node_id not in tag_to_posts[tag_id]:
            tag_to_posts[tag_id].append(post_node_id)

    # 2. Build the category -> tag mapping (via the belongs-to edges)
    expanded_node_ids = set(n["节点ID"] for n in expanded_nodes)
    category_to_tags = {}  # category -> [connected tags]
    for edge in expanded_edges:
        src, tgt = edge["源节点ID"], edge["目标节点ID"]
        # Belongs-to edge: tag -> category
        if tgt in expanded_node_ids and src in persona_node_ids:
            if tgt not in category_to_tags:
                category_to_tags[tgt] = []
            if src not in category_to_tags[tgt]:
                category_to_tags[tgt].append(src)

    # 3. Collect the edges between expanded nodes (categories)
    category_edges = []
    for edge in edges_data.get("边列表", []):
        src, tgt = edge["源节点ID"], edge["目标节点ID"]
        # Both endpoints are expanded nodes (categories)
        if src in expanded_node_ids and tgt in expanded_node_ids:
            category_edges.append(edge)

    # 4. Generate second-order mirror edges between posts from the category edges
    post_edges_via_expanded = []
    seen_mirror = set()
    for cat_edge in category_edges:
        cat1, cat2 = cat_edge["源节点ID"], cat_edge["目标节点ID"]
        edge_type = cat_edge["边类型"]

        # Tags attached to each of the two categories
        tags1 = category_to_tags.get(cat1, [])
        tags2 = category_to_tags.get(cat2, [])

        # Walk the tags back to their posts and emit second-order edges
        for tag1 in tags1:
            for tag2 in tags2:
                posts1 = tag_to_posts.get(tag1, [])
                posts2 = tag_to_posts.get(tag2, [])
                for post1 in posts1:
                    for post2 in posts2:
                        if post1 == post2:
                            continue
                        edge_key = tuple(sorted([post1, post2])) + (f"二阶_{edge_type}",)
                        if edge_key in seen_mirror:
                            continue
                        seen_mirror.add(edge_key)
                        post_edges_via_expanded.append({
                            "源节点ID": post1,
                            "目标节点ID": post2,
                            "边类型": f"二阶_{edge_type}",
                            "边详情": {
                                "原始边类型": edge_type,
                                "分类节点1": cat1,
                                "分类节点2": cat2,
                                "标签节点1": tag1,
                                "标签节点2": tag2
                            }
                        })
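
    # Illustrative chain (hypothetical node IDs; the category IDs and the "相关" edge
    # type stand in for whatever exists in 边关系.json) for the edges built above:
    #
    #   帖子_灵感点_标签_A --匹配_相同--> 灵感点_标签_A --属于--> 分类X
    #   帖子_关键点_标签_B --匹配_相似--> 关键点_标签_B --属于--> 分类Y
    #   分类X --相关--> 分类Y
    #
    #   =>  帖子_灵感点_标签_A --二阶_相关--> 帖子_关键点_标签_B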

    # Keep only the expanded nodes and edges that actually help connect posts.
    # 1. Expanded nodes (categories) that produced second-order post edges
    useful_expanded_ids = set()
    for edge in post_edges_via_expanded:
        cat1 = edge.get("边详情", {}).get("分类节点1")
        cat2 = edge.get("边详情", {}).get("分类节点2")
        if cat1:
            useful_expanded_ids.add(cat1)
        if cat2:
            useful_expanded_ids.add(cat2)

    # 2. Keep only the useful expanded nodes
    useful_expanded_nodes = [n for n in expanded_nodes if n["节点ID"] in useful_expanded_ids]

    # 3. Keep only the belongs-to edges that touch a useful expanded node
    useful_expanded_edges = [e for e in expanded_edges
                             if e["目标节点ID"] in useful_expanded_ids or e["源节点ID"] in useful_expanded_ids]

    # 4. Keep only the category-to-category edges that produced second-order post edges
    useful_category_edges = [e for e in category_edges
                             if e["源节点ID"] in useful_expanded_ids and e["目标节点ID"] in useful_expanded_ids]

    # Merge the node lists
    all_nodes = post_nodes + persona_nodes + useful_expanded_nodes

    # Merge the edge lists (including the post-internal belongs-to edges)
    all_edges = (post_belong_edges + match_edges + persona_edges + post_edges
                 + useful_expanded_edges + useful_category_edges + post_edges_via_expanded)

    # Deduplicate edges
    seen_edges = set()
    unique_edges = []
    for edge in all_edges:
        edge_key = (edge["源节点ID"], edge["目标节点ID"], edge["边类型"])
        if edge_key not in seen_edges:
            seen_edges.add(edge_key)
            unique_edges.append(edge)
    all_edges = unique_edges

    # Build the per-node edge index
    edges_by_node = {}
    for edge in all_edges:
        source_id = edge["源节点ID"]
        target_id = edge["目标节点ID"]
        edge_type = edge["边类型"]
        if source_id not in edges_by_node:
            edges_by_node[source_id] = {}
        if edge_type not in edges_by_node[source_id]:
            edges_by_node[source_id][edge_type] = {}
        edges_by_node[source_id][edge_type][target_id] = edge

    # Assemble the output data
    output_data = {
        "说明": {
            "帖子ID": post_id,
            "帖子标题": post_title,
            "描述": "帖子与人设的节点匹配关系",
            "统计": {
                "帖子点节点数": len(post_point_nodes),
                "帖子标签节点数": len(post_tag_nodes),
                "帖子节点总数": len(post_nodes),
                "人设节点数(直接匹配)": len(persona_nodes),
                "扩展节点数(有效)": len(useful_expanded_nodes),
                "帖子属于边数": len(post_belong_edges),
                "匹配边数": len(match_edges),
                "人设节点间边数": len(persona_edges),
                "扩展边数(有效)": len(useful_expanded_edges),
                "帖子镜像边数(直接)": len(post_edges),
                "帖子镜像边数(二阶)": len(post_edges_via_expanded),
                "总节点数": len(all_nodes),
                "总边数": len(all_edges)
            }
        },
        "帖子点节点列表": post_point_nodes,
        "帖子标签节点列表": post_tag_nodes,
        "帖子节点列表": post_nodes,
        "人设节点列表": persona_nodes,
        "扩展节点列表": useful_expanded_nodes,
        "帖子属于边列表": post_belong_edges,
        "匹配边列表": match_edges,
        "人设节点间边列表": persona_edges,
        "扩展边列表": useful_expanded_edges,
        "帖子镜像边列表(直接)": post_edges,
        "帖子镜像边列表(二阶)": post_edges_via_expanded,
        "节点列表": all_nodes,
        "边列表": all_edges,
        "节点边索引": edges_by_node
    }

    # Save the output file
    output_file = output_dir / f"{post_id}_match_graph.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)

    return {
        "帖子ID": post_id,
        "帖子点节点数": len(post_point_nodes),
        "帖子标签节点数": len(post_tag_nodes),
        "帖子节点数": len(post_nodes),
        "人设节点数": len(persona_nodes),
        "扩展节点数": len(useful_expanded_nodes),
        "帖子属于边数": len(post_belong_edges),
        "匹配边数": len(match_edges),
        "人设边数": len(persona_edges),
        "扩展边数": len(useful_expanded_edges),
        "帖子边数(直接)": len(post_edges),
        "帖子边数(二阶)": len(post_edges_via_expanded),
        "总节点数": len(all_nodes),
        "总边数": len(all_edges),
        "输出文件": str(output_file)
    }
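
# Shape of the per-post output file written above (<帖子id>_match_graph.json), listed
# here only as a reader aid; the key names mirror the output_data dict exactly:
#
#   说明                                  -- post ID/title plus the 统计 counters
#   帖子点节点列表 / 帖子标签节点列表 / 帖子节点列表
#   人设节点列表 / 扩展节点列表
#   帖子属于边列表 / 匹配边列表 / 人设节点间边列表 / 扩展边列表
#   帖子镜像边列表(直接) / 帖子镜像边列表(二阶)
#   节点列表 / 边列表                     -- merged, deduplicated views of the above
#   节点边索引                            -- 源节点ID -> 边类型 -> 目标节点ID -> edge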

def main():
    # Path configuration
    config = PathConfig()
    config.ensure_dirs()

    print(f"Account: {config.account_name}")
    print(f"Output version: {config.output_version}")
    print()

    # Input files/directories
    filtered_results_dir = config.intermediate_dir / "filtered_results"
    nodes_file = config.intermediate_dir / "节点列表.json"
    edges_file = config.intermediate_dir / "边关系.json"

    # Output directory
    output_dir = config.intermediate_dir / "match_graph"
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Inputs:")
    print(f"  Match results directory: {filtered_results_dir}")
    print(f"  Node list: {nodes_file}")
    print(f"  Edge relations: {edges_file}")
    print(f"\nOutput directory: {output_dir}")
    print()

    # Read the node and edge data
    print("Reading node list...")
    with open(nodes_file, "r", encoding="utf-8") as f:
        nodes_data = json.load(f)
    print(f"  {len(nodes_data.get('节点列表', []))} nodes in total")

    print("Reading edge relations...")
    with open(edges_file, "r", encoding="utf-8") as f:
        edges_data = json.load(f)
    print(f"  {len(edges_data.get('边列表', []))} edges in total")

    # Process every match result file
    print("\n" + "=" * 60)
    print("Processing match result files...")

    filtered_files = list(filtered_results_dir.glob("*_filtered.json"))
    print(f"Found {len(filtered_files)} match result files")

    results = []
    for i, filtered_file in enumerate(filtered_files, 1):
        print(f"\n[{i}/{len(filtered_files)}] Processing: {filtered_file.name}")
        result = process_filtered_result(filtered_file, nodes_data, edges_data, output_dir)
        results.append(result)
        print(f"  Post nodes: {result['帖子节点数']}, persona nodes: {result['人设节点数']}, expanded nodes: {result['扩展节点数']}")
        print(f"  Match edges: {result['匹配边数']}, persona edges: {result['人设边数']}, expanded edges: {result['扩展边数']}")
        print(f"  Post edges (direct): {result['帖子边数(直接)']}, post edges (second-order): {result['帖子边数(二阶)']}")

    # Summary statistics
    print("\n" + "=" * 60)
    print("Done!")
    print("\nSummary:")
    print(f"  Files processed: {len(results)}")

    total_post = sum(r['帖子节点数'] for r in results)
    total_persona = sum(r['人设节点数'] for r in results)
    total_expanded = sum(r['扩展节点数'] for r in results)
    total_match = sum(r['匹配边数'] for r in results)
    total_persona_edges = sum(r['人设边数'] for r in results)
    total_expanded_edges = sum(r['扩展边数'] for r in results)
    total_post_edges_direct = sum(r['帖子边数(直接)'] for r in results)
    total_post_edges_2hop = sum(r['帖子边数(二阶)'] for r in results)

    print(f"  Total post nodes: {total_post}")
    print(f"  Total persona nodes: {total_persona}")
    print(f"  Total expanded nodes: {total_expanded}")
    print(f"  Total match edges: {total_match}")
    print(f"  Total persona edges: {total_persona_edges}")
    print(f"  Total expanded edges: {total_expanded_edges}")
    print(f"  Total post edges (direct): {total_post_edges_direct}")
    print(f"  Total post edges (second-order): {total_post_edges_2hop}")

    print(f"\nOutput directory: {output_dir}")


if __name__ == "__main__":
    main()
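
# How this script is typically run (illustrative; the actual file name and the values
# resolved by PathConfig -- account_name, output_version, intermediate_dir -- depend on
# the repository configuration):
#
#   python script/data_processing/<this_script>.py
#
# It expects <intermediate_dir>/filtered_results/*_filtered.json, 节点列表.json and
# 边关系.json to already exist, and writes one
# <intermediate_dir>/match_graph/<帖子id>_match_graph.json file per input.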