#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
构建帖子图谱
================================================================================
输入文件:
================================================================================
filtered_results/*_filtered.json - 帖子解构结果(过滤后的how解构)
================================================================================
输出文件: post_graph/{post_id}_帖子图谱.json(每个帖子一个文件)
================================================================================
{
    "meta": {                      # 元信息
        "postId": "帖子ID",
        "postTitle": "帖子标题",
        "postDetail": {...},
        "createdAt": "时间戳",
        "stats": { ... }
    },
    "nodes": {                     # 节点字典 (nodeId -> nodeData)
        "{domain}:{dimension}:{type}:{name}": {
            "name": "显示名称",
            "type": "帖子|灵感点|目的点|关键点|点|标签",
            "domain": "帖子",
            "dimension": "帖子|灵感点|目的点|关键点",
            "detail": { ... }
        }
    },
    "edges": {                     # 边字典 (edgeId -> edgeData)
        "{source}|{type}|{target}": {
            "source": "源节点ID",
            "target": "目标节点ID",
            "type": "属于|包含|匹配|支撑|关联",
            "score": 1.0,
            "detail": { ... }
        }
    },
    "index": {                     # 游走索引
        "outEdges": { nodeId: { edgeType: [{ target, score }] } },
        "inEdges": { nodeId: { edgeType: [{ source, score }] } }
    },
    "tree": { ... }                # 嵌套树结构
}
================================================================================
核心逻辑:
================================================================================
1. 从 filtered_results 读取帖子解构结果
2. 提取点节点和标签节点(当前结构中"点"直接作为"标签"节点挂在维度下)
3. 添加根节点(帖子)和维度节点(灵感点/目的点/关键点)
4. 构建属于/包含边,并加入匹配/支撑/关联边
5. 构建索引和嵌套树
================================================================================
层级对应(人设 vs 帖子):
================================================================================
| 人设   | 帖子   |
|--------|--------|
| 人设   | 帖子   |
| 维度   | 维度   |
| 分类   | 点     |
| 标签   | 标签   |
================================================================================
节点ID格式: {domain}:{dimension}:{type}:{name}
================================================================================
- 根节点:   帖子:帖子:帖子:{post_id}
- 维度节点: 帖子:灵感点:灵感点:灵感点
- 点节点:   帖子:灵感点:点:{point_name}(旧结构;当前代码不再生成点节点)
- 标签节点: 帖子:灵感点:标签:{tag_name}
================================================================================
边类型:
================================================================================
- 属于: 子节点 -> 父节点(层级关系)
- 包含: 父节点 -> 子节点(层级关系)
- 匹配: 帖子标签 <-> 人设标签(双向,score为相似度)
- 支撑: 当前点 -> 被支撑的点(单向,score为1.0)
- 关联: 当前点 -> 关联的点(按源数据"关联的ID"声明的方向建边,score为1.0)
================================================================================
匹配边说明:
================================================================================
帖子图谱包含与人设图谱的匹配边,通过节点ID关联:
- 帖子标签ID: 帖子:灵感点:标签:{tag_name}
- 人设标签ID: 人设:灵感点:标签:{persona_tag_name}
使用方式:从帖子标签出发,沿"匹配"边游走到人设标签ID,
再从人设图谱.json中查找该ID的详细信息。
================================================================================
"""
import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Set

# Add the project root to sys.path so the project-local import below resolves.
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig
  93. # ==================== 节点和边构建工具 ====================
  94. def build_node_id(domain: str, dimension: str, node_type: str, name: str) -> str:
  95. """构建节点ID"""
  96. return f"{domain}:{dimension}:{node_type}:{name}"
  97. def build_edge_id(source: str, edge_type: str, target: str) -> str:
  98. """构建边ID"""
  99. return f"{source}|{edge_type}|{target}"
  100. def create_node(
  101. domain: str,
  102. dimension: str,
  103. node_type: str,
  104. name: str,
  105. detail: Dict = None,
  106. category: str = None
  107. ) -> Dict:
  108. """创建节点"""
  109. node = {
  110. "name": name,
  111. "type": node_type,
  112. "dimension": dimension,
  113. "domain": domain,
  114. "detail": detail or {}
  115. }
  116. if category:
  117. node["category"] = category
  118. return node
  119. def create_edge(
  120. source: str,
  121. target: str,
  122. edge_type: str,
  123. score: float = None,
  124. detail: Dict = None
  125. ) -> Dict:
  126. """创建边"""
  127. return {
  128. "source": source,
  129. "target": target,
  130. "type": edge_type,
  131. "score": score,
  132. "detail": detail or {}
  133. }
  134. # ==================== 从帖子解构结果提取节点和匹配边 ====================
  135. def extract_tags_and_matches(filtered_data: Dict) -> tuple:
  136. """
  137. 从帖子解构结果中提取标签节点和匹配边(适配新结构)
  138. 新结构:解构结果 → 点列表 → 点 → 匹配人设结果
  139. 新结构的"点"对应旧结构的"标签"节点,直接挂在维度下
  140. Returns:
  141. (标签节点字典, 匹配边字典, 支撑边字典, 关联边字典)
  142. """
  143. tag_nodes = {} # nodeId -> nodeData
  144. match_edges = {} # edgeId -> edgeData
  145. support_edges = {} # 支撑边
  146. relation_edges = {} # 关联边
  147. # ID 到节点ID的映射(用于构建支撑边和关联边)
  148. id_to_node_id = {}
  149. # 新结构使用 "解构结果"
  150. result = filtered_data.get("解构结果", {})
  151. dimension_mapping = {
  152. "灵感点列表": "灵感点",
  153. "目的点列表": "目的点",
  154. "关键点列表": "关键点"
  155. }
  156. # 第一遍:创建节点并建立 ID 映射
  157. for list_key, dimension in dimension_mapping.items():
  158. points = result.get(list_key, [])
  159. for point in points:
  160. tag_name = point.get("名称", "")
  161. tag_desc = point.get("描述", "")
  162. point_id = point.get("ID", "")
  163. point_category = point.get("类型", "") # 根分类:意图/实质/形式
  164. if not tag_name:
  165. continue
  166. # 新结构的"点"直接创建为"标签"节点
  167. tag_id = build_node_id("帖子", dimension, "标签", tag_name)
  168. tag_nodes[tag_id] = create_node(
  169. domain="帖子",
  170. dimension=dimension,
  171. node_type="标签",
  172. name=tag_name,
  173. detail={
  174. "description": tag_desc,
  175. "pointId": point_id
  176. },
  177. category=point_category
  178. )
  179. # 建立 ID 映射
  180. if point_id:
  181. id_to_node_id[point_id] = tag_id
  182. # 直接从点的 匹配人设结果 提取匹配边
  183. matches = point.get("匹配人设结果", [])
  184. for match in matches:
  185. persona_name = match.get("人设特征名称", "")
  186. persona_dimension = match.get("人设特征层级", "")
  187. # 映射:源数据中 "点" → "标签"
  188. persona_type = match.get("特征类型", "标签")
  189. if persona_type == "点":
  190. persona_type = "标签"
  191. similarity = match.get("相似度", 0)
  192. if not persona_name or not persona_dimension:
  193. continue
  194. # 构建人设节点ID
  195. persona_id = build_node_id("人设", persona_dimension, persona_type, persona_name)
  196. # 创建双向匹配边
  197. # 帖子标签 -> 人设标签
  198. edge_id_1 = build_edge_id(tag_id, "匹配", persona_id)
  199. match_edges[edge_id_1] = create_edge(
  200. source=tag_id,
  201. target=persona_id,
  202. edge_type="匹配",
  203. score=similarity,
  204. detail={}
  205. )
  206. # 人设标签 -> 帖子标签
  207. edge_id_2 = build_edge_id(persona_id, "匹配", tag_id)
  208. match_edges[edge_id_2] = create_edge(
  209. source=persona_id,
  210. target=tag_id,
  211. edge_type="匹配",
  212. score=similarity,
  213. detail={}
  214. )
  215. # 第二遍:构建支撑边和关联边
  216. for list_key, dimension in dimension_mapping.items():
  217. points = result.get(list_key, [])
  218. for point in points:
  219. tag_name = point.get("名称", "")
  220. point_id = point.get("ID", "")
  221. if not tag_name or not point_id:
  222. continue
  223. tag_id = id_to_node_id.get(point_id)
  224. if not tag_id:
  225. continue
  226. # 支撑边:当前点 -> 被支撑的点
  227. support_ids = point.get("支撑的ID", [])
  228. for target_point_id in support_ids:
  229. target_node_id = id_to_node_id.get(target_point_id)
  230. if target_node_id:
  231. edge_id = build_edge_id(tag_id, "支撑", target_node_id)
  232. support_edges[edge_id] = create_edge(
  233. source=tag_id,
  234. target=target_node_id,
  235. edge_type="支撑",
  236. score=1.0,
  237. detail={}
  238. )
  239. # 关联边:当前点 <-> 关联的点(双向)
  240. relation_ids = point.get("关联的ID", [])
  241. for target_point_id in relation_ids:
  242. target_node_id = id_to_node_id.get(target_point_id)
  243. if target_node_id:
  244. # 只创建一个方向的边(避免重复)
  245. edge_id = build_edge_id(tag_id, "关联", target_node_id)
  246. if edge_id not in relation_edges:
  247. relation_edges[edge_id] = create_edge(
  248. source=tag_id,
  249. target=target_node_id,
  250. edge_type="关联",
  251. score=1.0,
  252. detail={}
  253. )
  254. return tag_nodes, match_edges, support_edges, relation_edges
  255. # ==================== 构建边 ====================
  256. def build_belong_contain_edges(
  257. tag_nodes: Dict[str, Dict],
  258. dimension_node_ids: Dict[str, str]
  259. ) -> Dict[str, Dict]:
  260. """
  261. 构建属于/包含边(新结构:标签直接挂维度下)
  262. Returns:
  263. 边字典 { edgeId: edgeData }
  264. """
  265. edges = {}
  266. # 标签 -> 维度(属于/包含)
  267. for tag_id, tag_data in tag_nodes.items():
  268. dimension = tag_data["dimension"]
  269. dim_node_id = dimension_node_ids[dimension]
  270. # 属于边:标签 -> 维度
  271. edge_id = build_edge_id(tag_id, "属于", dim_node_id)
  272. edges[edge_id] = create_edge(
  273. source=tag_id,
  274. target=dim_node_id,
  275. edge_type="属于",
  276. score=1.0
  277. )
  278. # 包含边:维度 -> 标签
  279. edge_id_contain = build_edge_id(dim_node_id, "包含", tag_id)
  280. edges[edge_id_contain] = create_edge(
  281. source=dim_node_id,
  282. target=tag_id,
  283. edge_type="包含",
  284. score=1.0
  285. )
  286. return edges
  287. # ==================== 构建索引 ====================
  288. def build_index(edges: Dict[str, Dict]) -> Dict:
  289. """
  290. 构建游走索引
  291. Returns:
  292. {
  293. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  294. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  295. }
  296. """
  297. out_edges = {}
  298. in_edges = {}
  299. for edge_data in edges.values():
  300. source = edge_data["source"]
  301. target = edge_data["target"]
  302. edge_type = edge_data["type"]
  303. score = edge_data["score"]
  304. # outEdges
  305. if source not in out_edges:
  306. out_edges[source] = {}
  307. if edge_type not in out_edges[source]:
  308. out_edges[source][edge_type] = []
  309. out_edges[source][edge_type].append({
  310. "target": target,
  311. "score": score
  312. })
  313. # inEdges
  314. if target not in in_edges:
  315. in_edges[target] = {}
  316. if edge_type not in in_edges[target]:
  317. in_edges[target][edge_type] = []
  318. in_edges[target][edge_type].append({
  319. "source": source,
  320. "score": score
  321. })
  322. return {
  323. "outEdges": out_edges,
  324. "inEdges": in_edges
  325. }
  326. # ==================== 构建嵌套树 ====================
  327. def build_nested_tree(nodes: Dict[str, Dict], edges: Dict[str, Dict], root_id: str) -> Dict:
  328. """
  329. 从根节点开始,沿"包含"边递归构建嵌套树结构
  330. Returns:
  331. 嵌套的树结构
  332. """
  333. # 从"包含"边构建 父节点 -> [子节点] 的映射
  334. parent_to_children = {}
  335. for edge_data in edges.values():
  336. if edge_data["type"] == "包含":
  337. parent_id = edge_data["source"]
  338. child_id = edge_data["target"]
  339. if parent_id not in parent_to_children:
  340. parent_to_children[parent_id] = []
  341. parent_to_children[parent_id].append(child_id)
  342. # 递归构建子树
  343. def build_subtree(node_id: str) -> Dict:
  344. node_data = nodes[node_id]
  345. subtree = {
  346. "id": node_id,
  347. "name": node_data["name"],
  348. "type": node_data["type"],
  349. "domain": node_data["domain"],
  350. "dimension": node_data["dimension"],
  351. "detail": node_data.get("detail", {}),
  352. "children": []
  353. }
  354. # 获取子节点
  355. child_ids = parent_to_children.get(node_id, [])
  356. for child_id in child_ids:
  357. if child_id in nodes:
  358. subtree["children"].append(build_subtree(child_id))
  359. return subtree
  360. return build_subtree(root_id)
  361. # ==================== 图游走工具 ====================
  362. def walk_graph(
  363. index: Dict,
  364. start_node: str,
  365. edge_types: List[str],
  366. direction: str = "out",
  367. min_score: float = None
  368. ) -> Set[str]:
  369. """
  370. 从起始节点出发,按指定边类型序列游走N步
  371. Args:
  372. index: 游走索引 {"outEdges": {...}, "inEdges": {...}}
  373. start_node: 起始节点ID
  374. edge_types: 边类型序列,如 ["属于", "包含"]
  375. direction: 游走方向 "out"(沿出边) / "in"(沿入边)
  376. min_score: 最小分数过滤
  377. Returns:
  378. 到达的节点ID集合
  379. """
  380. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  381. target_key = "target" if direction == "out" else "source"
  382. current_nodes = {start_node}
  383. for edge_type in edge_types:
  384. next_nodes = set()
  385. for node in current_nodes:
  386. neighbors = edge_index.get(node, {}).get(edge_type, [])
  387. for neighbor in neighbors:
  388. if min_score is not None and neighbor.get("score", 0) < min_score:
  389. continue
  390. next_nodes.add(neighbor[target_key])
  391. current_nodes = next_nodes
  392. if not current_nodes:
  393. break
  394. return current_nodes
  395. def get_neighbors(
  396. index: Dict,
  397. node_id: str,
  398. edge_type: str = None,
  399. direction: str = "out",
  400. min_score: float = None
  401. ) -> List[Dict]:
  402. """
  403. 获取节点的邻居
  404. Args:
  405. index: 游走索引
  406. node_id: 节点ID
  407. edge_type: 边类型(可选,不指定则返回所有类型)
  408. direction: 方向 "out" / "in"
  409. min_score: 最小分数过滤
  410. Returns:
  411. 邻居列表 [{"target": "...", "score": 0.5}, ...]
  412. """
  413. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  414. node_edges = edge_index.get(node_id, {})
  415. if edge_type:
  416. neighbors = node_edges.get(edge_type, [])
  417. else:
  418. neighbors = []
  419. for edges in node_edges.values():
  420. neighbors.extend(edges)
  421. if min_score is not None:
  422. neighbors = [n for n in neighbors if n.get("score", 0) >= min_score]
  423. return neighbors
  424. # ==================== 处理单个帖子 ====================
  425. def process_single_post(filtered_file: Path, output_dir: Path) -> Dict:
  426. """
  427. 处理单个帖子,生成帖子图谱
  428. Returns:
  429. 处理结果统计
  430. """
  431. # 读取数据
  432. with open(filtered_file, "r", encoding="utf-8") as f:
  433. filtered_data = json.load(f)
  434. post_id = filtered_data.get("帖子id", "")
  435. post_detail = filtered_data.get("帖子详情", {})
  436. post_title = post_detail.get("title", "")
  437. # 初始化节点和边
  438. all_nodes = {}
  439. all_edges = {}
  440. # 1. 提取标签节点和匹配边(新结构:没有点层)
  441. tag_nodes, match_edges, support_edges, relation_edges = extract_tags_and_matches(filtered_data)
  442. # 2. 添加根节点
  443. root_id = build_node_id("帖子", "帖子", "帖子", post_id)
  444. all_nodes[root_id] = create_node(
  445. domain="帖子",
  446. dimension="帖子",
  447. node_type="帖子",
  448. name=post_id,
  449. detail={
  450. "postTitle": post_title,
  451. "postDetail": post_detail
  452. }
  453. )
  454. # 3. 添加维度节点
  455. dimensions = ["灵感点", "目的点", "关键点"]
  456. dimension_node_ids = {}
  457. for dim in dimensions:
  458. dim_id = build_node_id("帖子", dim, dim, dim)
  459. dimension_node_ids[dim] = dim_id
  460. all_nodes[dim_id] = create_node(
  461. domain="帖子",
  462. dimension=dim,
  463. node_type=dim,
  464. name=dim,
  465. detail={}
  466. )
  467. # 维度 -> 根 的属于边
  468. edge_id = build_edge_id(dim_id, "属于", root_id)
  469. all_edges[edge_id] = create_edge(
  470. source=dim_id,
  471. target=root_id,
  472. edge_type="属于",
  473. score=1.0
  474. )
  475. # 根 -> 维度 的包含边
  476. edge_id_contain = build_edge_id(root_id, "包含", dim_id)
  477. all_edges[edge_id_contain] = create_edge(
  478. source=root_id,
  479. target=dim_id,
  480. edge_type="包含",
  481. score=1.0
  482. )
  483. # 4. 添加标签节点
  484. all_nodes.update(tag_nodes)
  485. # 5. 构建属于/包含边(标签直接挂维度下)
  486. belong_contain_edges = build_belong_contain_edges(tag_nodes, dimension_node_ids)
  487. all_edges.update(belong_contain_edges)
  488. # 6. 添加匹配边
  489. all_edges.update(match_edges)
  490. # 7. 添加支撑边和关联边
  491. all_edges.update(support_edges)
  492. all_edges.update(relation_edges)
  493. # 8. 构建索引
  494. index = build_index(all_edges)
  495. # 9. 构建嵌套树
  496. tree = build_nested_tree(all_nodes, all_edges, root_id)
  497. # 统计
  498. tag_count = len(tag_nodes)
  499. match_count = len(match_edges) // 2 # 双向边,除以2得到实际匹配数
  500. support_count = len(support_edges)
  501. relation_count = len(relation_edges)
  502. dimension_stats = {}
  503. for dim in dimensions:
  504. dim_tags = sum(1 for n in tag_nodes.values() if n["dimension"] == dim)
  505. dimension_stats[dim] = {
  506. "tagCount": dim_tags
  507. }
  508. # 构建输出
  509. output_data = {
  510. "meta": {
  511. "postId": post_id,
  512. "postTitle": post_title,
  513. "postDetail": post_detail,
  514. "createdAt": datetime.now().isoformat(),
  515. "stats": {
  516. "nodeCount": len(all_nodes),
  517. "edgeCount": len(all_edges),
  518. "tagCount": tag_count,
  519. "matchCount": match_count,
  520. "supportCount": support_count,
  521. "relationCount": relation_count,
  522. "dimensions": dimension_stats
  523. }
  524. },
  525. "nodes": all_nodes,
  526. "edges": all_edges,
  527. "index": index,
  528. "tree": tree
  529. }
  530. # 保存
  531. output_file = output_dir / f"{post_id}_帖子图谱.json"
  532. with open(output_file, "w", encoding="utf-8") as f:
  533. json.dump(output_data, f, ensure_ascii=False, indent=2)
  534. return {
  535. "postId": post_id,
  536. "postTitle": post_title,
  537. "nodeCount": len(all_nodes),
  538. "edgeCount": len(all_edges),
  539. "tagCount": tag_count,
  540. "matchCount": match_count,
  541. "supportCount": support_count,
  542. "relationCount": relation_count,
  543. "outputFile": str(output_file)
  544. }
  545. # ==================== 主函数 ====================
  546. def main():
  547. config = PathConfig()
  548. config.ensure_dirs()
  549. print(f"账号: {config.account_name}")
  550. print(f"输出版本: {config.output_version}")
  551. print()
  552. # 输入目录
  553. filtered_results_dir = config.intermediate_dir / "filtered_results"
  554. # 输出目录
  555. output_dir = config.intermediate_dir / "post_graph"
  556. output_dir.mkdir(parents=True, exist_ok=True)
  557. print(f"输入目录: {filtered_results_dir}")
  558. print(f"输出目录: {output_dir}")
  559. print()
  560. # 获取所有帖子文件
  561. filtered_files = list(filtered_results_dir.glob("*_filtered.json"))
  562. print(f"找到 {len(filtered_files)} 个帖子文件")
  563. print()
  564. # 处理每个帖子
  565. results = []
  566. for i, filtered_file in enumerate(filtered_files, 1):
  567. print(f"[{i}/{len(filtered_files)}] 处理: {filtered_file.name}")
  568. result = process_single_post(filtered_file, output_dir)
  569. results.append(result)
  570. print(f" 节点: {result['nodeCount']}, 边: {result['edgeCount']}")
  571. print(f" 标签: {result['tagCount']}, 匹配: {result['matchCount']}, 支撑: {result['supportCount']}, 关联: {result['relationCount']}")
  572. print(f" → {Path(result['outputFile']).name}")
  573. print()
  574. # 汇总统计
  575. print("=" * 60)
  576. print("处理完成!")
  577. print(f" 帖子数: {len(results)}")
  578. print(f" 总节点数: {sum(r['nodeCount'] for r in results)}")
  579. print(f" 总边数: {sum(r['edgeCount'] for r in results)}")
  580. print(f" 总标签数: {sum(r['tagCount'] for r in results)}")
  581. print(f" 总匹配数: {sum(r['matchCount'] for r in results)}")
  582. print(f" 总支撑边: {sum(r['supportCount'] for r in results)}")
  583. print(f" 总关联边: {sum(r['relationCount'] for r in results)}")
  584. print(f"\n输出目录: {output_dir}")
  585. if __name__ == "__main__":
  586. main()