#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build the node/edge relationship graph between posts and personas from match results.

Inputs:
1. Match result files under the filtered_results directory
2. 节点列表.json (node list)
3. 边关系.json (edge relations)

Outputs:
1. Node/edge relationship files under the match_graph directory
"""
import json
import sys
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Any, Optional

# Add the project root directory to the import path.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig
  20. def build_post_node_id(dimension: str, node_type: str, name: str) -> str:
  21. """构建帖子节点ID
  22. Args:
  23. dimension: 维度(灵感点/关键点/目的点)
  24. node_type: 节点类型(点/标签)
  25. name: 节点名称
  26. """
  27. return f"帖子_{dimension}_{node_type}_{name}"
  28. def build_persona_node_id(dimension: str, node_type: str, name: str) -> str:
  29. """构建人设节点ID"""
  30. return f"{dimension}_{node_type}_{name}"
  31. def extract_matched_nodes_and_edges(filtered_data: Dict) -> tuple:
  32. """
  33. 从匹配结果中提取帖子节点(点+标签)、人设节点和边
  34. Args:
  35. filtered_data: 匹配结果数据
  36. Returns:
  37. (帖子节点列表, 人设节点ID集合, 边列表)
  38. 帖子节点包括:点节点(灵感点/关键点/目的点)和标签节点
  39. 边包括:点→标签的属于边 + 标签→人设的匹配边
  40. """
  41. post_nodes = []
  42. persona_node_ids = set()
  43. edges = [] # 包含属于边和匹配边
  44. how_result = filtered_data.get("how解构结果", {})
  45. # 维度映射
  46. dimension_mapping = {
  47. "灵感点列表": "灵感点",
  48. "目的点列表": "目的点",
  49. "关键点列表": "关键点"
  50. }
  51. for list_key, dimension in dimension_mapping.items():
  52. points = how_result.get(list_key, [])
  53. for point in points:
  54. point_name = point.get("名称", "")
  55. point_desc = point.get("描述", "")
  56. if not point_name:
  57. continue
  58. # 创建帖子点节点
  59. point_node_id = build_post_node_id(dimension, "点", point_name)
  60. point_node = {
  61. "节点ID": point_node_id,
  62. "节点名称": point_name,
  63. "节点类型": "点",
  64. "节点层级": dimension,
  65. "描述": point_desc,
  66. "source": "帖子"
  67. }
  68. # 避免重复添加点节点
  69. if not any(n["节点ID"] == point_node_id for n in post_nodes):
  70. post_nodes.append(point_node)
  71. # 遍历how步骤列表,提取标签节点
  72. how_steps = point.get("how步骤列表", [])
  73. for step in how_steps:
  74. features = step.get("特征列表", [])
  75. for feature in features:
  76. feature_name = feature.get("特征名称", "")
  77. weight = feature.get("权重", 0)
  78. match_results = feature.get("匹配结果", [])
  79. if not feature_name:
  80. continue
  81. # 创建帖子标签节点(无论是否有匹配结果)
  82. tag_node_id = build_post_node_id(dimension, "标签", feature_name)
  83. tag_node = {
  84. "节点ID": tag_node_id,
  85. "节点名称": feature_name,
  86. "节点类型": "标签",
  87. "节点层级": dimension,
  88. "权重": weight,
  89. "source": "帖子",
  90. "已匹配": len(match_results) > 0 # 标记是否有匹配
  91. }
  92. # 避免重复添加标签节点
  93. if not any(n["节点ID"] == tag_node_id for n in post_nodes):
  94. post_nodes.append(tag_node)
  95. # 创建标签→点的属于边
  96. belong_edge = {
  97. "源节点ID": tag_node_id,
  98. "目标节点ID": point_node_id,
  99. "边类型": "属于",
  100. "边详情": {
  101. "说明": f"标签「{feature_name}」属于点「{point_name}」"
  102. }
  103. }
  104. # 避免重复添加属于边
  105. edge_key = (tag_node_id, point_node_id, "属于")
  106. if not any((e["源节点ID"], e["目标节点ID"], e["边类型"]) == edge_key for e in edges):
  107. edges.append(belong_edge)
  108. # 如果有匹配结果,创建匹配边
  109. if match_results:
  110. for match in match_results:
  111. persona_name = match.get("人设特征名称", "")
  112. persona_dimension = match.get("人设特征层级", "")
  113. persona_type = match.get("特征类型", "标签")
  114. match_detail = match.get("匹配结果", {})
  115. if not persona_name or not persona_dimension:
  116. continue
  117. # 构建人设节点ID
  118. persona_node_id = build_persona_node_id(
  119. persona_dimension, persona_type, persona_name
  120. )
  121. persona_node_ids.add(persona_node_id)
  122. # 创建匹配边(根据相似度区分类型)
  123. similarity = match_detail.get("相似度", 0)
  124. if similarity >= 0.8:
  125. edge_type = "匹配_相同"
  126. else:
  127. edge_type = "匹配_相似"
  128. match_edge = {
  129. "源节点ID": tag_node_id,
  130. "目标节点ID": persona_node_id,
  131. "边类型": edge_type,
  132. "边详情": {
  133. "相似度": similarity,
  134. "说明": match_detail.get("说明", "")
  135. }
  136. }
  137. edges.append(match_edge)
  138. return post_nodes, persona_node_ids, edges
  139. def get_persona_nodes_details(
  140. persona_node_ids: Set[str],
  141. nodes_data: Dict
  142. ) -> List[Dict]:
  143. """
  144. 从节点列表中获取人设节点的详细信息
  145. Args:
  146. persona_node_ids: 人设节点ID集合
  147. nodes_data: 节点列表数据
  148. Returns:
  149. 人设节点详情列表
  150. """
  151. persona_nodes = []
  152. all_nodes = nodes_data.get("节点列表", [])
  153. for node in all_nodes:
  154. if node["节点ID"] in persona_node_ids:
  155. persona_nodes.append(node)
  156. return persona_nodes
  157. def get_edges_between_nodes(
  158. node_ids: Set[str],
  159. edges_data: Dict
  160. ) -> List[Dict]:
  161. """
  162. 获取指定节点之间的边关系
  163. Args:
  164. node_ids: 节点ID集合
  165. edges_data: 边关系数据
  166. Returns:
  167. 节点之间的边列表
  168. """
  169. edges_between = []
  170. all_edges = edges_data.get("边列表", [])
  171. for edge in all_edges:
  172. source_id = edge["源节点ID"]
  173. target_id = edge["目标节点ID"]
  174. # 两个节点都在集合中
  175. if source_id in node_ids and target_id in node_ids:
  176. edges_between.append(edge)
  177. return edges_between
  178. def create_mirrored_post_edges(
  179. match_edges: List[Dict],
  180. persona_edges: List[Dict]
  181. ) -> List[Dict]:
  182. """
  183. 根据人设节点之间的边,创建帖子节点之间的镜像边
  184. 逻辑:如果人设节点A和B之间有边,且帖子节点X匹配A,帖子节点Y匹配B,
  185. 则创建帖子节点X和Y之间的镜像边
  186. Args:
  187. match_edges: 匹配边列表(帖子节点 -> 人设节点)
  188. persona_edges: 人设节点之间的边列表
  189. Returns:
  190. 帖子节点之间的镜像边列表
  191. """
  192. # 构建人设节点到帖子节点的反向映射
  193. # persona_id -> [post_id1, post_id2, ...]
  194. persona_to_posts = {}
  195. for edge in match_edges:
  196. post_id = edge["源节点ID"]
  197. persona_id = edge["目标节点ID"]
  198. if persona_id not in persona_to_posts:
  199. persona_to_posts[persona_id] = []
  200. if post_id not in persona_to_posts[persona_id]:
  201. persona_to_posts[persona_id].append(post_id)
  202. # 根据人设边创建帖子镜像边
  203. post_edges = []
  204. seen_edges = set()
  205. for persona_edge in persona_edges:
  206. source_persona = persona_edge["源节点ID"]
  207. target_persona = persona_edge["目标节点ID"]
  208. edge_type = persona_edge["边类型"]
  209. # 获取匹配到这两个人设节点的帖子节点
  210. source_posts = persona_to_posts.get(source_persona, [])
  211. target_posts = persona_to_posts.get(target_persona, [])
  212. # 为每对帖子节点创建镜像边
  213. for src_post in source_posts:
  214. for tgt_post in target_posts:
  215. if src_post == tgt_post:
  216. continue
  217. # 使用排序后的key避免重复(A-B 和 B-A 视为同一条边)
  218. edge_key = tuple(sorted([src_post, tgt_post])) + (edge_type,)
  219. if edge_key in seen_edges:
  220. continue
  221. seen_edges.add(edge_key)
  222. post_edge = {
  223. "源节点ID": src_post,
  224. "目标节点ID": tgt_post,
  225. "边类型": f"镜像_{edge_type}", # 标记为镜像边
  226. "边详情": {
  227. "原始边类型": edge_type,
  228. "源人设节点": source_persona,
  229. "目标人设节点": target_persona,
  230. # 完整路径节点(用于前端高亮)
  231. "路径节点": [src_post, source_persona, target_persona, tgt_post]
  232. }
  233. }
  234. post_edges.append(post_edge)
  235. return post_edges
  236. def expand_one_layer(
  237. node_ids: Set[str],
  238. edges_data: Dict,
  239. nodes_data: Dict,
  240. edge_types: List[str] = None,
  241. direction: str = "both"
  242. ) -> tuple:
  243. """
  244. 从指定节点扩展一层,获取相邻节点和连接边
  245. Args:
  246. node_ids: 起始节点ID集合
  247. edges_data: 边关系数据
  248. nodes_data: 节点列表数据
  249. edge_types: 要扩展的边类型列表,None表示所有类型
  250. direction: 扩展方向
  251. - "outgoing": 只沿出边扩展(源节点在集合中,扩展到目标节点)
  252. - "incoming": 只沿入边扩展(目标节点在集合中,扩展到源节点)
  253. - "both": 双向扩展
  254. Returns:
  255. (扩展的节点列表, 扩展的边列表, 扩展的节点ID集合)
  256. """
  257. expanded_node_ids = set()
  258. expanded_edges = []
  259. all_edges = edges_data.get("边列表", [])
  260. # 找出所有与起始节点相连的边和节点
  261. for edge in all_edges:
  262. # 过滤边类型
  263. if edge_types and edge["边类型"] not in edge_types:
  264. continue
  265. source_id = edge["源节点ID"]
  266. target_id = edge["目标节点ID"]
  267. # 沿出边扩展:源节点在集合中,扩展到目标节点
  268. if direction in ["outgoing", "both"]:
  269. if source_id in node_ids and target_id not in node_ids:
  270. expanded_node_ids.add(target_id)
  271. expanded_edges.append(edge)
  272. # 沿入边扩展:目标节点在集合中,扩展到源节点
  273. if direction in ["incoming", "both"]:
  274. if target_id in node_ids and source_id not in node_ids:
  275. expanded_node_ids.add(source_id)
  276. expanded_edges.append(edge)
  277. # 获取扩展节点的详情
  278. expanded_nodes = []
  279. all_nodes = nodes_data.get("节点列表", [])
  280. for node in all_nodes:
  281. if node["节点ID"] in expanded_node_ids:
  282. # 标记为扩展节点
  283. node_copy = node.copy()
  284. node_copy["是否扩展"] = True
  285. node_copy["source"] = "人设"
  286. expanded_nodes.append(node_copy)
  287. return expanded_nodes, expanded_edges, expanded_node_ids
  288. def expand_and_filter_useful_nodes(
  289. matched_persona_ids: Set[str],
  290. match_edges: List[Dict],
  291. edges_data: Dict,
  292. nodes_data: Dict,
  293. exclude_edge_types: List[str] = None
  294. ) -> tuple:
  295. """
  296. 扩展人设节点一层,只保留能产生新帖子连线的扩展节点
  297. 逻辑:如果扩展节点E连接了2个以上的已匹配人设节点,
  298. 那么通过E可以产生新的帖子间连线,保留E
  299. Args:
  300. matched_persona_ids: 已匹配的人设节点ID集合
  301. match_edges: 匹配边列表
  302. edges_data: 边关系数据
  303. nodes_data: 节点列表数据
  304. exclude_edge_types: 要排除的边类型列表
  305. Returns:
  306. (有效扩展节点列表, 扩展边列表, 通过扩展节点的帖子镜像边列表)
  307. """
  308. if exclude_edge_types is None:
  309. exclude_edge_types = []
  310. all_edges = edges_data.get("边列表", [])
  311. # 构建人设节点到帖子节点的映射
  312. persona_to_posts = {}
  313. for edge in match_edges:
  314. post_id = edge["源节点ID"]
  315. persona_id = edge["目标节点ID"]
  316. if persona_id not in persona_to_posts:
  317. persona_to_posts[persona_id] = []
  318. if post_id not in persona_to_posts[persona_id]:
  319. persona_to_posts[persona_id].append(post_id)
  320. # 找出所有扩展节点及其连接的已匹配人设节点
  321. # expanded_node_id -> [(matched_persona_id, edge), ...]
  322. expanded_connections = {}
  323. for edge in all_edges:
  324. # 跳过排除的边类型
  325. if edge["边类型"] in exclude_edge_types:
  326. continue
  327. source_id = edge["源节点ID"]
  328. target_id = edge["目标节点ID"]
  329. # 源节点是已匹配的,目标节点是扩展候选
  330. if source_id in matched_persona_ids and target_id not in matched_persona_ids:
  331. if target_id not in expanded_connections:
  332. expanded_connections[target_id] = []
  333. expanded_connections[target_id].append((source_id, edge))
  334. # 目标节点是已匹配的,源节点是扩展候选
  335. if target_id in matched_persona_ids and source_id not in matched_persona_ids:
  336. if source_id not in expanded_connections:
  337. expanded_connections[source_id] = []
  338. expanded_connections[source_id].append((target_id, edge))
  339. # 过滤:只保留连接2个以上已匹配人设节点的扩展节点
  340. useful_expanded_ids = set()
  341. useful_edges = []
  342. post_mirror_edges = []
  343. seen_mirror_edges = set()
  344. for expanded_id, connections in expanded_connections.items():
  345. connected_personas = list(set([c[0] for c in connections]))
  346. if len(connected_personas) >= 2:
  347. useful_expanded_ids.add(expanded_id)
  348. # 收集边
  349. for persona_id, edge in connections:
  350. useful_edges.append(edge)
  351. # 为通过此扩展节点连接的每对人设节点,创建帖子镜像边
  352. for i, p1 in enumerate(connected_personas):
  353. for p2 in connected_personas[i+1:]:
  354. posts1 = persona_to_posts.get(p1, [])
  355. posts2 = persona_to_posts.get(p2, [])
  356. # 找出连接p1和p2的边类型
  357. edge_types_p1 = [c[1]["边类型"] for c in connections if c[0] == p1]
  358. edge_types_p2 = [c[1]["边类型"] for c in connections if c[0] == p2]
  359. # 用第一个边类型作为代表
  360. edge_type = edge_types_p1[0] if edge_types_p1 else (edge_types_p2[0] if edge_types_p2 else "扩展")
  361. for post1 in posts1:
  362. for post2 in posts2:
  363. if post1 == post2:
  364. continue
  365. # 避免重复
  366. edge_key = tuple(sorted([post1, post2])) + (f"二阶_{edge_type}",)
  367. if edge_key in seen_mirror_edges:
  368. continue
  369. seen_mirror_edges.add(edge_key)
  370. post_mirror_edges.append({
  371. "源节点ID": post1,
  372. "目标节点ID": post2,
  373. "边类型": f"二阶_{edge_type}",
  374. "边详情": {
  375. "原始边类型": edge_type,
  376. "扩展节点": expanded_id,
  377. "源人设节点": p1,
  378. "目标人设节点": p2
  379. }
  380. })
  381. # 获取扩展节点详情
  382. useful_expanded_nodes = []
  383. all_nodes = nodes_data.get("节点列表", [])
  384. for node in all_nodes:
  385. if node["节点ID"] in useful_expanded_ids:
  386. node_copy = node.copy()
  387. node_copy["是否扩展"] = True
  388. useful_expanded_nodes.append(node_copy)
  389. # 边去重
  390. seen_edges = set()
  391. unique_edges = []
  392. for edge in useful_edges:
  393. edge_key = (edge["源节点ID"], edge["目标节点ID"], edge["边类型"])
  394. if edge_key not in seen_edges:
  395. seen_edges.add(edge_key)
  396. unique_edges.append(edge)
  397. return useful_expanded_nodes, unique_edges, post_mirror_edges
  398. def process_filtered_result(
  399. filtered_file: Path,
  400. nodes_data: Dict,
  401. edges_data: Dict,
  402. output_dir: Path
  403. ) -> Dict:
  404. """
  405. 处理单个匹配结果文件
  406. Args:
  407. filtered_file: 匹配结果文件路径
  408. nodes_data: 节点列表数据
  409. edges_data: 边关系数据
  410. output_dir: 输出目录
  411. Returns:
  412. 处理结果统计
  413. """
  414. # 读取匹配结果
  415. with open(filtered_file, "r", encoding="utf-8") as f:
  416. filtered_data = json.load(f)
  417. post_id = filtered_data.get("帖子id", "")
  418. post_detail = filtered_data.get("帖子详情", {})
  419. post_title = post_detail.get("title", "")
  420. # 提取节点和边(包括帖子点节点、标签节点、属于边和匹配边)
  421. post_nodes, persona_node_ids, post_edges_raw = extract_matched_nodes_and_edges(filtered_data)
  422. # 分离帖子侧的边:属于边(标签→点)和匹配边(标签→人设)
  423. post_belong_edges = [e for e in post_edges_raw if e["边类型"] == "属于"]
  424. match_edges = [e for e in post_edges_raw if e["边类型"].startswith("匹配_")]
  425. # 统计帖子点节点和标签节点
  426. post_point_nodes = [n for n in post_nodes if n["节点类型"] == "点"]
  427. post_tag_nodes = [n for n in post_nodes if n["节点类型"] == "标签"]
  428. # 获取人设节点详情(直接匹配的,标记为非扩展)
  429. persona_nodes = get_persona_nodes_details(persona_node_ids, nodes_data)
  430. for node in persona_nodes:
  431. node["是否扩展"] = False
  432. node["source"] = "人设"
  433. # 获取人设节点之间的边
  434. persona_edges = get_edges_between_nodes(persona_node_ids, edges_data)
  435. # 创建帖子节点之间的镜像边(基于直接人设边的投影)
  436. post_edges = create_mirrored_post_edges(match_edges, persona_edges)
  437. # 扩展人设节点一层,只对标签类型的节点通过"属于"边扩展到分类
  438. # 过滤出标签类型的人设节点(只有标签才能"属于"分类)
  439. tag_persona_ids = {pid for pid in persona_node_ids if "_标签_" in pid}
  440. expanded_nodes_raw, expanded_edges_raw, _ = expand_one_layer(
  441. tag_persona_ids, edges_data, nodes_data,
  442. edge_types=["属于"],
  443. direction="outgoing" # 只向外扩展:标签->分类
  444. )
  445. # 排除已经在第3层(直接匹配)中的节点,避免同一节点出现在两层
  446. expanded_nodes = [n for n in expanded_nodes_raw if n["节点ID"] not in persona_node_ids]
  447. expanded_edges = [e for e in expanded_edges_raw
  448. if e["目标节点ID"] not in persona_node_ids or e["源节点ID"] not in persona_node_ids]
  449. # 创建通过扩展节点的帖子镜像边(正确逻辑)
  450. # 逻辑:帖子->标签->分类,分类之间有边,则对应帖子产生二阶边
  451. # 1. 构建 标签 -> 帖子列表 的映射
  452. tag_to_posts = {}
  453. for edge in match_edges:
  454. post_node_id = edge["源节点ID"]
  455. tag_id = edge["目标节点ID"]
  456. if tag_id not in tag_to_posts:
  457. tag_to_posts[tag_id] = []
  458. if post_node_id not in tag_to_posts[tag_id]:
  459. tag_to_posts[tag_id].append(post_node_id)
  460. # 2. 构建 分类 -> 标签列表 的映射(通过属于边)
  461. expanded_node_ids = set(n["节点ID"] for n in expanded_nodes)
  462. category_to_tags = {} # 分类 -> [连接的标签]
  463. for edge in expanded_edges:
  464. src, tgt = edge["源节点ID"], edge["目标节点ID"]
  465. # 属于边:标签 -> 分类
  466. if tgt in expanded_node_ids and src in persona_node_ids:
  467. if tgt not in category_to_tags:
  468. category_to_tags[tgt] = []
  469. if src not in category_to_tags[tgt]:
  470. category_to_tags[tgt].append(src)
  471. # 3. 获取扩展节点(分类)之间的边
  472. category_edges = []
  473. for edge in edges_data.get("边列表", []):
  474. src, tgt = edge["源节点ID"], edge["目标节点ID"]
  475. # 两端都是扩展节点(分类)
  476. if src in expanded_node_ids and tgt in expanded_node_ids:
  477. category_edges.append(edge)
  478. # 4. 基于分类之间的边,生成帖子之间的二阶镜像边
  479. post_edges_via_expanded = []
  480. seen_mirror = set()
  481. for cat_edge in category_edges:
  482. cat1, cat2 = cat_edge["源节点ID"], cat_edge["目标节点ID"]
  483. edge_type = cat_edge["边类型"]
  484. # 获取连接到这两个分类的标签
  485. tags1 = category_to_tags.get(cat1, [])
  486. tags2 = category_to_tags.get(cat2, [])
  487. # 通过标签找到对应的帖子,产生二阶边
  488. for tag1 in tags1:
  489. for tag2 in tags2:
  490. posts1 = tag_to_posts.get(tag1, [])
  491. posts2 = tag_to_posts.get(tag2, [])
  492. for post1 in posts1:
  493. for post2 in posts2:
  494. if post1 == post2:
  495. continue
  496. edge_key = tuple(sorted([post1, post2])) + (f"二阶_{edge_type}",)
  497. if edge_key in seen_mirror:
  498. continue
  499. seen_mirror.add(edge_key)
  500. post_edges_via_expanded.append({
  501. "源节点ID": post1,
  502. "目标节点ID": post2,
  503. "边类型": f"二阶_{edge_type}",
  504. "边详情": {
  505. "原始边类型": edge_type,
  506. "源人设节点": cat1, # 统一字段:指向产生关系的人设节点(分类)
  507. "目标人设节点": cat2,
  508. # 完整路径节点(用于前端高亮)
  509. "路径节点": [post1, tag1, cat1, cat2, tag2, post2]
  510. }
  511. })
  512. # 只保留对帖子连接有帮助的扩展节点和边
  513. # 1. 找出产生了二阶帖子边的扩展节点(分类)
  514. useful_expanded_ids = set()
  515. for edge in post_edges_via_expanded:
  516. cat1 = edge.get("边详情", {}).get("源人设节点")
  517. cat2 = edge.get("边详情", {}).get("目标人设节点")
  518. if cat1:
  519. useful_expanded_ids.add(cat1)
  520. if cat2:
  521. useful_expanded_ids.add(cat2)
  522. # 2. 只保留有用的扩展节点
  523. useful_expanded_nodes = [n for n in expanded_nodes if n["节点ID"] in useful_expanded_ids]
  524. # 3. 只保留连接到有用扩展节点的属于边
  525. useful_expanded_edges = [e for e in expanded_edges
  526. if e["目标节点ID"] in useful_expanded_ids or e["源节点ID"] in useful_expanded_ids]
  527. # 4. 只保留有用的分类之间的边(产生了二阶帖子边的)
  528. useful_category_edges = [e for e in category_edges
  529. if e["源节点ID"] in useful_expanded_ids and e["目标节点ID"] in useful_expanded_ids]
  530. # 5. 获取直接匹配层(第2层)和扩展层(第3层)之间的所有跨层边
  531. # 这些边连接了直接匹配的人设节点和扩展的分类节点
  532. cross_layer_edges = []
  533. for edge in edges_data.get("边列表", []):
  534. src, tgt = edge["源节点ID"], edge["目标节点ID"]
  535. # 一端在直接匹配层,另一端在扩展层
  536. src_in_direct = src in persona_node_ids
  537. src_in_expanded = src in useful_expanded_ids
  538. tgt_in_direct = tgt in persona_node_ids
  539. tgt_in_expanded = tgt in useful_expanded_ids
  540. if (src_in_direct and tgt_in_expanded) or (src_in_expanded and tgt_in_direct):
  541. cross_layer_edges.append(edge)
  542. # 合并节点列表
  543. all_nodes = post_nodes + persona_nodes + useful_expanded_nodes
  544. # 合并边列表(加入帖子内的属于边)
  545. all_edges = (post_belong_edges + match_edges + persona_edges + post_edges +
  546. useful_expanded_edges + useful_category_edges + cross_layer_edges +
  547. post_edges_via_expanded)
  548. # 去重边
  549. seen_edges = set()
  550. unique_edges = []
  551. for edge in all_edges:
  552. edge_key = (edge["源节点ID"], edge["目标节点ID"], edge["边类型"])
  553. if edge_key not in seen_edges:
  554. seen_edges.add(edge_key)
  555. unique_edges.append(edge)
  556. all_edges = unique_edges
  557. # 构建人设边到镜像边的反向映射
  558. # key: "源人设节点ID|目标人设节点ID" (排序后的)
  559. # value: [{镜像边信息}, ...]
  560. persona_edge_to_mirror_edges = {}
  561. all_mirror_edges = post_edges + post_edges_via_expanded
  562. for mirror_edge in all_mirror_edges:
  563. detail = mirror_edge.get("边详情", {})
  564. src_persona = detail.get("源人设节点")
  565. tgt_persona = detail.get("目标人设节点")
  566. if src_persona and tgt_persona:
  567. # 使用排序后的key,确保 A|B 和 B|A 映射到同一个key
  568. edge_key = "|".join(sorted([src_persona, tgt_persona]))
  569. if edge_key not in persona_edge_to_mirror_edges:
  570. persona_edge_to_mirror_edges[edge_key] = []
  571. persona_edge_to_mirror_edges[edge_key].append({
  572. "源节点ID": mirror_edge["源节点ID"],
  573. "目标节点ID": mirror_edge["目标节点ID"],
  574. "边类型": mirror_edge["边类型"]
  575. })
  576. # 构建节点边索引
  577. edges_by_node = {}
  578. for edge in all_edges:
  579. source_id = edge["源节点ID"]
  580. target_id = edge["目标节点ID"]
  581. edge_type = edge["边类型"]
  582. if source_id not in edges_by_node:
  583. edges_by_node[source_id] = {}
  584. if edge_type not in edges_by_node[source_id]:
  585. edges_by_node[source_id][edge_type] = {}
  586. edges_by_node[source_id][edge_type][target_id] = edge
  587. # 构建输出数据
  588. output_data = {
  589. "说明": {
  590. "帖子ID": post_id,
  591. "帖子标题": post_title,
  592. "描述": "帖子与人设的节点匹配关系",
  593. "统计": {
  594. "帖子点节点数": len(post_point_nodes),
  595. "帖子标签节点数": len(post_tag_nodes),
  596. "帖子节点总数": len(post_nodes),
  597. "人设节点数(直接匹配)": len(persona_nodes),
  598. "扩展节点数(有效)": len(useful_expanded_nodes),
  599. "帖子属于边数": len(post_belong_edges),
  600. "匹配边数": len(match_edges),
  601. "人设节点间边数": len(persona_edges),
  602. "扩展边数(有效)": len(useful_expanded_edges),
  603. "跨层边数": len(cross_layer_edges),
  604. "帖子镜像边数(直接)": len(post_edges),
  605. "帖子镜像边数(二阶)": len(post_edges_via_expanded),
  606. "总节点数": len(all_nodes),
  607. "总边数": len(all_edges)
  608. }
  609. },
  610. "帖子点节点列表": post_point_nodes,
  611. "帖子标签节点列表": post_tag_nodes,
  612. "帖子节点列表": post_nodes,
  613. "人设节点列表": persona_nodes,
  614. "扩展节点列表": useful_expanded_nodes,
  615. "帖子属于边列表": post_belong_edges,
  616. "匹配边列表": match_edges,
  617. "人设节点间边列表": persona_edges,
  618. "扩展边列表": useful_expanded_edges,
  619. "跨层边列表": cross_layer_edges,
  620. "帖子镜像边列表(直接)": post_edges,
  621. "帖子镜像边列表(二阶)": post_edges_via_expanded,
  622. "节点列表": all_nodes,
  623. "边列表": all_edges,
  624. "节点边索引": edges_by_node,
  625. "人设边到镜像边映射": persona_edge_to_mirror_edges
  626. }
  627. # 保存输出文件
  628. output_file = output_dir / f"{post_id}_match_graph.json"
  629. with open(output_file, "w", encoding="utf-8") as f:
  630. json.dump(output_data, f, ensure_ascii=False, indent=2)
  631. return {
  632. "帖子ID": post_id,
  633. "帖子点节点数": len(post_point_nodes),
  634. "帖子标签节点数": len(post_tag_nodes),
  635. "帖子节点数": len(post_nodes),
  636. "人设节点数": len(persona_nodes),
  637. "扩展节点数": len(useful_expanded_nodes),
  638. "帖子属于边数": len(post_belong_edges),
  639. "匹配边数": len(match_edges),
  640. "人设边数": len(persona_edges),
  641. "扩展边数": len(useful_expanded_edges),
  642. "跨层边数": len(cross_layer_edges),
  643. "帖子边数(直接)": len(post_edges),
  644. "帖子边数(二阶)": len(post_edges_via_expanded),
  645. "总节点数": len(all_nodes),
  646. "总边数": len(all_edges),
  647. "输出文件": str(output_file)
  648. }
  649. def main():
  650. # 使用路径配置
  651. config = PathConfig()
  652. config.ensure_dirs()
  653. print(f"账号: {config.account_name}")
  654. print(f"输出版本: {config.output_version}")
  655. print()
  656. # 输入文件/目录
  657. filtered_results_dir = config.intermediate_dir / "filtered_results"
  658. nodes_file = config.intermediate_dir / "节点列表.json"
  659. edges_file = config.intermediate_dir / "边关系.json"
  660. # 输出目录
  661. output_dir = config.intermediate_dir / "match_graph"
  662. output_dir.mkdir(parents=True, exist_ok=True)
  663. print(f"输入:")
  664. print(f" 匹配结果目录: {filtered_results_dir}")
  665. print(f" 节点列表: {nodes_file}")
  666. print(f" 边关系: {edges_file}")
  667. print(f"\n输出目录: {output_dir}")
  668. print()
  669. # 读取节点和边数据
  670. print("正在读取节点列表...")
  671. with open(nodes_file, "r", encoding="utf-8") as f:
  672. nodes_data = json.load(f)
  673. print(f" 共 {len(nodes_data.get('节点列表', []))} 个节点")
  674. print("正在读取边关系...")
  675. with open(edges_file, "r", encoding="utf-8") as f:
  676. edges_data = json.load(f)
  677. print(f" 共 {len(edges_data.get('边列表', []))} 条边")
  678. # 处理所有匹配结果文件
  679. print("\n" + "="*60)
  680. print("处理匹配结果文件...")
  681. filtered_files = list(filtered_results_dir.glob("*_filtered.json"))
  682. print(f"找到 {len(filtered_files)} 个匹配结果文件")
  683. results = []
  684. for i, filtered_file in enumerate(filtered_files, 1):
  685. print(f"\n[{i}/{len(filtered_files)}] 处理: {filtered_file.name}")
  686. result = process_filtered_result(filtered_file, nodes_data, edges_data, output_dir)
  687. results.append(result)
  688. print(f" 帖子节点: {result['帖子节点数']}, 人设节点: {result['人设节点数']}, 扩展节点: {result['扩展节点数']}")
  689. print(f" 匹配边: {result['匹配边数']}, 人设边: {result['人设边数']}, 扩展边: {result['扩展边数']}, 跨层边: {result['跨层边数']}")
  690. print(f" 帖子边(直接): {result['帖子边数(直接)']}, 帖子边(二阶): {result['帖子边数(二阶)']}")
  691. # 汇总统计
  692. print("\n" + "="*60)
  693. print("处理完成!")
  694. print(f"\n汇总:")
  695. print(f" 处理文件数: {len(results)}")
  696. total_post = sum(r['帖子节点数'] for r in results)
  697. total_persona = sum(r['人设节点数'] for r in results)
  698. total_expanded = sum(r['扩展节点数'] for r in results)
  699. total_match = sum(r['匹配边数'] for r in results)
  700. total_persona_edges = sum(r['人设边数'] for r in results)
  701. total_expanded_edges = sum(r['扩展边数'] for r in results)
  702. total_cross_layer_edges = sum(r['跨层边数'] for r in results)
  703. total_post_edges_direct = sum(r['帖子边数(直接)'] for r in results)
  704. total_post_edges_2hop = sum(r['帖子边数(二阶)'] for r in results)
  705. print(f" 总帖子节点: {total_post}")
  706. print(f" 总人设节点: {total_persona}")
  707. print(f" 总扩展节点: {total_expanded}")
  708. print(f" 总匹配边: {total_match}")
  709. print(f" 总人设边: {total_persona_edges}")
  710. print(f" 总扩展边: {total_expanded_edges}")
  711. print(f" 总跨层边: {total_cross_layer_edges}")
  712. print(f" 总帖子边(直接): {total_post_edges_direct}")
  713. print(f" 总帖子边(二阶): {total_post_edges_2hop}")
  714. print(f"\n输出目录: {output_dir}")
  715. if __name__ == "__main__":
  716. main()