build_post_graph.py 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 构建帖子图谱
  5. ================================================================================
  6. 输入文件:
  7. ================================================================================
  8. filtered_results/*_filtered.json - 帖子解构结果(过滤后的how解构)
  9. ================================================================================
  10. 输出文件: post_graph/{post_id}_帖子图谱.json(每个帖子一个文件)
  11. ================================================================================
  12. {
  13. "meta": { # 元信息
  14. "postId": "帖子ID",
  15. "postTitle": "帖子标题",
  16. "postDetail": {...},
  17. "createdAt": "时间戳",
  18. "stats": { ... }
  19. },
  20. "nodes": { # 节点字典 (nodeId -> nodeData)
  21. "{domain}:{dimension}:{type}:{name}": {
  22. "name": "显示名称",
  23. "type": "帖子|灵感点|目的点|关键点|点|标签",
  24. "domain": "帖子",
  25. "dimension": "帖子|灵感点|目的点|关键点",
  26. "detail": { ... }
  27. }
  28. },
  29. "edges": { # 边字典 (edgeId -> edgeData)
  30. "{source}|{type}|{target}": {
  31. "source": "源节点ID",
  32. "target": "目标节点ID",
  33. "type": "属于|包含|匹配|支撑|关联",
  34. "score": 1.0,
  35. "detail": { ... }
  36. }
  37. },
  38. "index": { # 游走索引
  39. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  40. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  41. },
  42. "tree": { ... } # 嵌套树结构
  43. }
  44. ================================================================================
  45. 核心逻辑:
  46. ================================================================================
  47. 1. 从 filtered_results 读取帖子解构结果
  48. 2. 提取点节点和标签节点
  49. 3. 添加根节点(帖子)和维度节点(灵感点/目的点/关键点)
  50. 4. 构建属于/包含边
  51. 5. 构建索引和嵌套树
  52. ================================================================================
  53. 层级对应(人设 vs 帖子):
  54. ================================================================================
  55. | 人设 | 帖子 |
  56. |--------|--------|
  57. | 人设 | 帖子 |
  58. | 维度 | 维度 |
  59. | 分类 | 点 |
  60. | 标签 | 标签 |
  61. ================================================================================
  62. 节点ID格式: {domain}:{dimension}:{type}:{name}
  63. ================================================================================
  64. - 根节点: 帖子:帖子:帖子:{post_id}
  65. - 维度节点: 帖子:灵感点:灵感点:灵感点
  66. - 点节点: 帖子:灵感点:点:{point_name}
  67. - 标签节点: 帖子:灵感点:标签:{tag_name}
  68. ================================================================================
  69. 边类型:
  70. ================================================================================
  71. - 属于: 子节点 -> 父节点(层级关系)
  72. - 包含: 父节点 -> 子节点(层级关系)
  73. - 匹配: 帖子标签 <-> 人设标签(双向,score为相似度)
  74. ================================================================================
  75. 匹配边说明:
  76. ================================================================================
  77. 帖子图谱包含与人设图谱的匹配边,通过节点ID关联:
  78. - 帖子标签ID: 帖子:灵感点:标签:{tag_name}
  79. - 人设标签ID: 人设:灵感点:标签:{persona_tag_name}
  80. 使用方式:从帖子标签出发,沿"匹配"边游走到人设标签ID,
  81. 再从人设图谱.json中查找该ID的详细信息。
  82. ================================================================================
  83. """
  84. import json
  85. from pathlib import Path
  86. from typing import Dict, List, Set
  87. from datetime import datetime
  88. import sys
  89. # 添加项目根目录到路径
  90. project_root = Path(__file__).parent.parent.parent
  91. sys.path.insert(0, str(project_root))
  92. from script.data_processing.path_config import PathConfig
  93. # ==================== 节点和边构建工具 ====================
  94. def build_node_id(domain: str, dimension: str, node_type: str, name: str) -> str:
  95. """构建节点ID"""
  96. return f"{domain}:{dimension}:{node_type}:{name}"
  97. def build_edge_id(source: str, edge_type: str, target: str) -> str:
  98. """构建边ID"""
  99. return f"{source}|{edge_type}|{target}"
  100. def create_node(
  101. domain: str,
  102. dimension: str,
  103. node_type: str,
  104. name: str,
  105. detail: Dict = None
  106. ) -> Dict:
  107. """创建节点"""
  108. return {
  109. "name": name,
  110. "type": node_type,
  111. "dimension": dimension,
  112. "domain": domain,
  113. "detail": detail or {}
  114. }
  115. def create_edge(
  116. source: str,
  117. target: str,
  118. edge_type: str,
  119. score: float = None,
  120. detail: Dict = None
  121. ) -> Dict:
  122. """创建边"""
  123. return {
  124. "source": source,
  125. "target": target,
  126. "type": edge_type,
  127. "score": score,
  128. "detail": detail or {}
  129. }
  130. # ==================== 从帖子解构结果提取节点和匹配边 ====================
  131. def extract_tags_and_matches(filtered_data: Dict) -> tuple:
  132. """
  133. 从帖子解构结果中提取标签节点和匹配边(适配新结构)
  134. 新结构:解构结果 → 点列表 → 点 → 匹配人设结果
  135. 新结构的"点"对应旧结构的"标签"节点,直接挂在维度下
  136. Returns:
  137. (标签节点字典, 匹配边字典, 支撑边字典, 关联边字典)
  138. """
  139. tag_nodes = {} # nodeId -> nodeData
  140. match_edges = {} # edgeId -> edgeData
  141. support_edges = {} # 支撑边
  142. relation_edges = {} # 关联边
  143. # ID 到节点ID的映射(用于构建支撑边和关联边)
  144. id_to_node_id = {}
  145. # 新结构使用 "解构结果"
  146. result = filtered_data.get("解构结果", {})
  147. dimension_mapping = {
  148. "灵感点列表": "灵感点",
  149. "目的点列表": "目的点",
  150. "关键点列表": "关键点"
  151. }
  152. # 第一遍:创建节点并建立 ID 映射
  153. for list_key, dimension in dimension_mapping.items():
  154. points = result.get(list_key, [])
  155. for point in points:
  156. tag_name = point.get("名称", "")
  157. tag_desc = point.get("描述", "")
  158. point_id = point.get("ID", "")
  159. if not tag_name:
  160. continue
  161. # 新结构的"点"直接创建为"标签"节点
  162. tag_id = build_node_id("帖子", dimension, "标签", tag_name)
  163. tag_nodes[tag_id] = create_node(
  164. domain="帖子",
  165. dimension=dimension,
  166. node_type="标签",
  167. name=tag_name,
  168. detail={
  169. "description": tag_desc,
  170. "pointId": point_id
  171. }
  172. )
  173. # 建立 ID 映射
  174. if point_id:
  175. id_to_node_id[point_id] = tag_id
  176. # 直接从点的 匹配人设结果 提取匹配边
  177. matches = point.get("匹配人设结果", [])
  178. for match in matches:
  179. persona_name = match.get("人设特征名称", "")
  180. persona_dimension = match.get("人设特征层级", "")
  181. # 映射:源数据中 "点" → "标签"
  182. persona_type = match.get("特征类型", "标签")
  183. if persona_type == "点":
  184. persona_type = "标签"
  185. similarity = match.get("相似度", 0)
  186. if not persona_name or not persona_dimension:
  187. continue
  188. # 构建人设节点ID
  189. persona_id = build_node_id("人设", persona_dimension, persona_type, persona_name)
  190. # 创建双向匹配边
  191. # 帖子标签 -> 人设标签
  192. edge_id_1 = build_edge_id(tag_id, "匹配", persona_id)
  193. match_edges[edge_id_1] = create_edge(
  194. source=tag_id,
  195. target=persona_id,
  196. edge_type="匹配",
  197. score=similarity,
  198. detail={}
  199. )
  200. # 人设标签 -> 帖子标签
  201. edge_id_2 = build_edge_id(persona_id, "匹配", tag_id)
  202. match_edges[edge_id_2] = create_edge(
  203. source=persona_id,
  204. target=tag_id,
  205. edge_type="匹配",
  206. score=similarity,
  207. detail={}
  208. )
  209. # 第二遍:构建支撑边和关联边
  210. for list_key, dimension in dimension_mapping.items():
  211. points = result.get(list_key, [])
  212. for point in points:
  213. tag_name = point.get("名称", "")
  214. point_id = point.get("ID", "")
  215. if not tag_name or not point_id:
  216. continue
  217. tag_id = id_to_node_id.get(point_id)
  218. if not tag_id:
  219. continue
  220. # 支撑边:当前点 -> 被支撑的点
  221. support_ids = point.get("支撑的ID", [])
  222. for target_point_id in support_ids:
  223. target_node_id = id_to_node_id.get(target_point_id)
  224. if target_node_id:
  225. edge_id = build_edge_id(tag_id, "支撑", target_node_id)
  226. support_edges[edge_id] = create_edge(
  227. source=tag_id,
  228. target=target_node_id,
  229. edge_type="支撑",
  230. score=1.0,
  231. detail={}
  232. )
  233. # 关联边:当前点 <-> 关联的点(双向)
  234. relation_ids = point.get("关联的ID", [])
  235. for target_point_id in relation_ids:
  236. target_node_id = id_to_node_id.get(target_point_id)
  237. if target_node_id:
  238. # 只创建一个方向的边(避免重复)
  239. edge_id = build_edge_id(tag_id, "关联", target_node_id)
  240. if edge_id not in relation_edges:
  241. relation_edges[edge_id] = create_edge(
  242. source=tag_id,
  243. target=target_node_id,
  244. edge_type="关联",
  245. score=1.0,
  246. detail={}
  247. )
  248. return tag_nodes, match_edges, support_edges, relation_edges
  249. # ==================== 构建边 ====================
  250. def build_belong_contain_edges(
  251. tag_nodes: Dict[str, Dict],
  252. dimension_node_ids: Dict[str, str]
  253. ) -> Dict[str, Dict]:
  254. """
  255. 构建属于/包含边(新结构:标签直接挂维度下)
  256. Returns:
  257. 边字典 { edgeId: edgeData }
  258. """
  259. edges = {}
  260. # 标签 -> 维度(属于/包含)
  261. for tag_id, tag_data in tag_nodes.items():
  262. dimension = tag_data["dimension"]
  263. dim_node_id = dimension_node_ids[dimension]
  264. # 属于边:标签 -> 维度
  265. edge_id = build_edge_id(tag_id, "属于", dim_node_id)
  266. edges[edge_id] = create_edge(
  267. source=tag_id,
  268. target=dim_node_id,
  269. edge_type="属于",
  270. score=1.0
  271. )
  272. # 包含边:维度 -> 标签
  273. edge_id_contain = build_edge_id(dim_node_id, "包含", tag_id)
  274. edges[edge_id_contain] = create_edge(
  275. source=dim_node_id,
  276. target=tag_id,
  277. edge_type="包含",
  278. score=1.0
  279. )
  280. return edges
  281. # ==================== 构建索引 ====================
  282. def build_index(edges: Dict[str, Dict]) -> Dict:
  283. """
  284. 构建游走索引
  285. Returns:
  286. {
  287. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  288. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  289. }
  290. """
  291. out_edges = {}
  292. in_edges = {}
  293. for edge_data in edges.values():
  294. source = edge_data["source"]
  295. target = edge_data["target"]
  296. edge_type = edge_data["type"]
  297. score = edge_data["score"]
  298. # outEdges
  299. if source not in out_edges:
  300. out_edges[source] = {}
  301. if edge_type not in out_edges[source]:
  302. out_edges[source][edge_type] = []
  303. out_edges[source][edge_type].append({
  304. "target": target,
  305. "score": score
  306. })
  307. # inEdges
  308. if target not in in_edges:
  309. in_edges[target] = {}
  310. if edge_type not in in_edges[target]:
  311. in_edges[target][edge_type] = []
  312. in_edges[target][edge_type].append({
  313. "source": source,
  314. "score": score
  315. })
  316. return {
  317. "outEdges": out_edges,
  318. "inEdges": in_edges
  319. }
  320. # ==================== 构建嵌套树 ====================
  321. def build_nested_tree(nodes: Dict[str, Dict], edges: Dict[str, Dict], root_id: str) -> Dict:
  322. """
  323. 从根节点开始,沿"包含"边递归构建嵌套树结构
  324. Returns:
  325. 嵌套的树结构
  326. """
  327. # 从"包含"边构建 父节点 -> [子节点] 的映射
  328. parent_to_children = {}
  329. for edge_data in edges.values():
  330. if edge_data["type"] == "包含":
  331. parent_id = edge_data["source"]
  332. child_id = edge_data["target"]
  333. if parent_id not in parent_to_children:
  334. parent_to_children[parent_id] = []
  335. parent_to_children[parent_id].append(child_id)
  336. # 递归构建子树
  337. def build_subtree(node_id: str) -> Dict:
  338. node_data = nodes[node_id]
  339. subtree = {
  340. "id": node_id,
  341. "name": node_data["name"],
  342. "type": node_data["type"],
  343. "domain": node_data["domain"],
  344. "dimension": node_data["dimension"],
  345. "detail": node_data.get("detail", {}),
  346. "children": []
  347. }
  348. # 获取子节点
  349. child_ids = parent_to_children.get(node_id, [])
  350. for child_id in child_ids:
  351. if child_id in nodes:
  352. subtree["children"].append(build_subtree(child_id))
  353. return subtree
  354. return build_subtree(root_id)
  355. # ==================== 图游走工具 ====================
  356. def walk_graph(
  357. index: Dict,
  358. start_node: str,
  359. edge_types: List[str],
  360. direction: str = "out",
  361. min_score: float = None
  362. ) -> Set[str]:
  363. """
  364. 从起始节点出发,按指定边类型序列游走N步
  365. Args:
  366. index: 游走索引 {"outEdges": {...}, "inEdges": {...}}
  367. start_node: 起始节点ID
  368. edge_types: 边类型序列,如 ["属于", "包含"]
  369. direction: 游走方向 "out"(沿出边) / "in"(沿入边)
  370. min_score: 最小分数过滤
  371. Returns:
  372. 到达的节点ID集合
  373. """
  374. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  375. target_key = "target" if direction == "out" else "source"
  376. current_nodes = {start_node}
  377. for edge_type in edge_types:
  378. next_nodes = set()
  379. for node in current_nodes:
  380. neighbors = edge_index.get(node, {}).get(edge_type, [])
  381. for neighbor in neighbors:
  382. if min_score is not None and neighbor.get("score", 0) < min_score:
  383. continue
  384. next_nodes.add(neighbor[target_key])
  385. current_nodes = next_nodes
  386. if not current_nodes:
  387. break
  388. return current_nodes
  389. def get_neighbors(
  390. index: Dict,
  391. node_id: str,
  392. edge_type: str = None,
  393. direction: str = "out",
  394. min_score: float = None
  395. ) -> List[Dict]:
  396. """
  397. 获取节点的邻居
  398. Args:
  399. index: 游走索引
  400. node_id: 节点ID
  401. edge_type: 边类型(可选,不指定则返回所有类型)
  402. direction: 方向 "out" / "in"
  403. min_score: 最小分数过滤
  404. Returns:
  405. 邻居列表 [{"target": "...", "score": 0.5}, ...]
  406. """
  407. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  408. node_edges = edge_index.get(node_id, {})
  409. if edge_type:
  410. neighbors = node_edges.get(edge_type, [])
  411. else:
  412. neighbors = []
  413. for edges in node_edges.values():
  414. neighbors.extend(edges)
  415. if min_score is not None:
  416. neighbors = [n for n in neighbors if n.get("score", 0) >= min_score]
  417. return neighbors
  418. # ==================== 处理单个帖子 ====================
  419. def process_single_post(filtered_file: Path, output_dir: Path) -> Dict:
  420. """
  421. 处理单个帖子,生成帖子图谱
  422. Returns:
  423. 处理结果统计
  424. """
  425. # 读取数据
  426. with open(filtered_file, "r", encoding="utf-8") as f:
  427. filtered_data = json.load(f)
  428. post_id = filtered_data.get("帖子id", "")
  429. post_detail = filtered_data.get("帖子详情", {})
  430. post_title = post_detail.get("title", "")
  431. # 初始化节点和边
  432. all_nodes = {}
  433. all_edges = {}
  434. # 1. 提取标签节点和匹配边(新结构:没有点层)
  435. tag_nodes, match_edges, support_edges, relation_edges = extract_tags_and_matches(filtered_data)
  436. # 2. 添加根节点
  437. root_id = build_node_id("帖子", "帖子", "帖子", post_id)
  438. all_nodes[root_id] = create_node(
  439. domain="帖子",
  440. dimension="帖子",
  441. node_type="帖子",
  442. name=post_id,
  443. detail={
  444. "postTitle": post_title,
  445. "postDetail": post_detail
  446. }
  447. )
  448. # 3. 添加维度节点
  449. dimensions = ["灵感点", "目的点", "关键点"]
  450. dimension_node_ids = {}
  451. for dim in dimensions:
  452. dim_id = build_node_id("帖子", dim, dim, dim)
  453. dimension_node_ids[dim] = dim_id
  454. all_nodes[dim_id] = create_node(
  455. domain="帖子",
  456. dimension=dim,
  457. node_type=dim,
  458. name=dim,
  459. detail={}
  460. )
  461. # 维度 -> 根 的属于边
  462. edge_id = build_edge_id(dim_id, "属于", root_id)
  463. all_edges[edge_id] = create_edge(
  464. source=dim_id,
  465. target=root_id,
  466. edge_type="属于",
  467. score=1.0
  468. )
  469. # 根 -> 维度 的包含边
  470. edge_id_contain = build_edge_id(root_id, "包含", dim_id)
  471. all_edges[edge_id_contain] = create_edge(
  472. source=root_id,
  473. target=dim_id,
  474. edge_type="包含",
  475. score=1.0
  476. )
  477. # 4. 添加标签节点
  478. all_nodes.update(tag_nodes)
  479. # 5. 构建属于/包含边(标签直接挂维度下)
  480. belong_contain_edges = build_belong_contain_edges(tag_nodes, dimension_node_ids)
  481. all_edges.update(belong_contain_edges)
  482. # 6. 添加匹配边
  483. all_edges.update(match_edges)
  484. # 7. 添加支撑边和关联边
  485. all_edges.update(support_edges)
  486. all_edges.update(relation_edges)
  487. # 8. 构建索引
  488. index = build_index(all_edges)
  489. # 9. 构建嵌套树
  490. tree = build_nested_tree(all_nodes, all_edges, root_id)
  491. # 统计
  492. tag_count = len(tag_nodes)
  493. match_count = len(match_edges) // 2 # 双向边,除以2得到实际匹配数
  494. support_count = len(support_edges)
  495. relation_count = len(relation_edges)
  496. dimension_stats = {}
  497. for dim in dimensions:
  498. dim_tags = sum(1 for n in tag_nodes.values() if n["dimension"] == dim)
  499. dimension_stats[dim] = {
  500. "tagCount": dim_tags
  501. }
  502. # 构建输出
  503. output_data = {
  504. "meta": {
  505. "postId": post_id,
  506. "postTitle": post_title,
  507. "postDetail": post_detail,
  508. "createdAt": datetime.now().isoformat(),
  509. "stats": {
  510. "nodeCount": len(all_nodes),
  511. "edgeCount": len(all_edges),
  512. "tagCount": tag_count,
  513. "matchCount": match_count,
  514. "supportCount": support_count,
  515. "relationCount": relation_count,
  516. "dimensions": dimension_stats
  517. }
  518. },
  519. "nodes": all_nodes,
  520. "edges": all_edges,
  521. "index": index,
  522. "tree": tree
  523. }
  524. # 保存
  525. output_file = output_dir / f"{post_id}_帖子图谱.json"
  526. with open(output_file, "w", encoding="utf-8") as f:
  527. json.dump(output_data, f, ensure_ascii=False, indent=2)
  528. return {
  529. "postId": post_id,
  530. "postTitle": post_title,
  531. "nodeCount": len(all_nodes),
  532. "edgeCount": len(all_edges),
  533. "tagCount": tag_count,
  534. "matchCount": match_count,
  535. "supportCount": support_count,
  536. "relationCount": relation_count,
  537. "outputFile": str(output_file)
  538. }
  539. # ==================== 主函数 ====================
  540. def main():
  541. config = PathConfig()
  542. config.ensure_dirs()
  543. print(f"账号: {config.account_name}")
  544. print(f"输出版本: {config.output_version}")
  545. print()
  546. # 输入目录
  547. filtered_results_dir = config.intermediate_dir / "filtered_results"
  548. # 输出目录
  549. output_dir = config.intermediate_dir / "post_graph"
  550. output_dir.mkdir(parents=True, exist_ok=True)
  551. print(f"输入目录: {filtered_results_dir}")
  552. print(f"输出目录: {output_dir}")
  553. print()
  554. # 获取所有帖子文件
  555. filtered_files = list(filtered_results_dir.glob("*_filtered.json"))
  556. print(f"找到 {len(filtered_files)} 个帖子文件")
  557. print()
  558. # 处理每个帖子
  559. results = []
  560. for i, filtered_file in enumerate(filtered_files, 1):
  561. print(f"[{i}/{len(filtered_files)}] 处理: {filtered_file.name}")
  562. result = process_single_post(filtered_file, output_dir)
  563. results.append(result)
  564. print(f" 节点: {result['nodeCount']}, 边: {result['edgeCount']}")
  565. print(f" 标签: {result['tagCount']}, 匹配: {result['matchCount']}, 支撑: {result['supportCount']}, 关联: {result['relationCount']}")
  566. print(f" → {Path(result['outputFile']).name}")
  567. print()
  568. # 汇总统计
  569. print("=" * 60)
  570. print("处理完成!")
  571. print(f" 帖子数: {len(results)}")
  572. print(f" 总节点数: {sum(r['nodeCount'] for r in results)}")
  573. print(f" 总边数: {sum(r['edgeCount'] for r in results)}")
  574. print(f" 总标签数: {sum(r['tagCount'] for r in results)}")
  575. print(f" 总匹配数: {sum(r['matchCount'] for r in results)}")
  576. print(f" 总支撑边: {sum(r['supportCount'] for r in results)}")
  577. print(f" 总关联边: {sum(r['relationCount'] for r in results)}")
  578. print(f"\n输出目录: {output_dir}")
  579. if __name__ == "__main__":
  580. main()