build_post_graph.py 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 构建帖子图谱
  5. ================================================================================
  6. 输入文件:
  7. ================================================================================
  8. filtered_results/*_filtered.json - 帖子解构结果(过滤后的how解构)
  9. ================================================================================
  10. 输出文件: post_graph/{post_id}_帖子图谱.json(每个帖子一个文件)
  11. ================================================================================
  12. {
  13. "meta": { # 元信息
  14. "postId": "帖子ID",
  15. "postTitle": "帖子标题",
  16. "postDetail": {...},
  17. "createdAt": "时间戳",
  18. "stats": { ... }
  19. },
  20. "nodes": { # 节点字典 (nodeId -> nodeData)
  21. "{domain}:{dimension}:{type}:{name}": {
  22. "name": "显示名称",
  23. "type": "帖子|灵感点|目的点|关键点|点|标签",
  24. "domain": "帖子",
  25. "dimension": "帖子|灵感点|目的点|关键点",
  26. "detail": { ... }
  27. }
  28. },
  29. "edges": { # 边字典 (edgeId -> edgeData)
  30. "{source}|{type}|{target}": {
  31. "source": "源节点ID",
  32. "target": "目标节点ID",
  33. "type": "属于|包含|匹配",
  34. "score": 1.0,
  35. "detail": { ... }
  36. }
  37. },
  38. "index": { # 游走索引
  39. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  40. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  41. },
  42. "tree": { ... } # 嵌套树结构
  43. }
  44. ================================================================================
  45. 核心逻辑:
  46. ================================================================================
  47. 1. 从 filtered_results 读取帖子解构结果
  48. 2. 提取点节点和标签节点
  49. 3. 添加根节点(帖子)和维度节点(灵感点/目的点/关键点)
  50. 4. 构建属于/包含边
  51. 5. 构建索引和嵌套树
  52. ================================================================================
  53. 层级对应(人设 vs 帖子):
  54. ================================================================================
  55. | 人设 | 帖子 |
  56. |--------|--------|
  57. | 人设 | 帖子 |
  58. | 维度 | 维度 |
  59. | 分类 | 点 |
  60. | 标签 | 标签 |
  61. ================================================================================
  62. 节点ID格式: {domain}:{dimension}:{type}:{name}
  63. ================================================================================
  64. - 根节点: 帖子:帖子:帖子:{post_id}
  65. - 维度节点: 帖子:灵感点:灵感点:灵感点
  66. - 点节点: 帖子:灵感点:点:{point_name}
  67. - 标签节点: 帖子:灵感点:标签:{tag_name}
  68. ================================================================================
  69. 边类型:
  70. ================================================================================
  71. - 属于: 子节点 -> 父节点(层级关系)
  72. - 包含: 父节点 -> 子节点(层级关系)
  73. - 匹配: 帖子标签 <-> 人设标签(双向,score为相似度)
  74. ================================================================================
  75. 匹配边说明:
  76. ================================================================================
  77. 帖子图谱包含与人设图谱的匹配边,通过节点ID关联:
  78. - 帖子标签ID: 帖子:灵感点:标签:{tag_name}
  79. - 人设标签ID: 人设:灵感点:标签:{persona_tag_name}
  80. 使用方式:从帖子标签出发,沿"匹配"边游走到人设标签ID,
  81. 再从人设图谱.json中查找该ID的详细信息。
  82. ================================================================================
  83. """
  84. import json
  85. from pathlib import Path
  86. from typing import Dict, List, Set
  87. from datetime import datetime
  88. import sys
  89. # 添加项目根目录到路径
  90. project_root = Path(__file__).parent.parent.parent
  91. sys.path.insert(0, str(project_root))
  92. from script.data_processing.path_config import PathConfig
  93. # ==================== 节点和边构建工具 ====================
  94. def build_node_id(domain: str, dimension: str, node_type: str, name: str) -> str:
  95. """构建节点ID"""
  96. return f"{domain}:{dimension}:{node_type}:{name}"
  97. def build_edge_id(source: str, edge_type: str, target: str) -> str:
  98. """构建边ID"""
  99. return f"{source}|{edge_type}|{target}"
  100. def create_node(
  101. domain: str,
  102. dimension: str,
  103. node_type: str,
  104. name: str,
  105. detail: Dict = None
  106. ) -> Dict:
  107. """创建节点"""
  108. return {
  109. "name": name,
  110. "type": node_type,
  111. "dimension": dimension,
  112. "domain": domain,
  113. "detail": detail or {}
  114. }
  115. def create_edge(
  116. source: str,
  117. target: str,
  118. edge_type: str,
  119. score: float = None,
  120. detail: Dict = None
  121. ) -> Dict:
  122. """创建边"""
  123. return {
  124. "source": source,
  125. "target": target,
  126. "type": edge_type,
  127. "score": score,
  128. "detail": detail or {}
  129. }
  130. # ==================== 从帖子解构结果提取节点和匹配边 ====================
  131. def extract_points_tags_and_matches(filtered_data: Dict) -> tuple:
  132. """
  133. 从帖子解构结果中提取点节点、标签节点和匹配边
  134. Returns:
  135. (点节点字典, 标签节点字典, 标签到点的映射, 匹配边字典)
  136. """
  137. point_nodes = {} # nodeId -> nodeData
  138. tag_nodes = {} # nodeId -> nodeData
  139. tag_to_point = {} # tagId -> [pointId, ...]
  140. match_edges = {} # edgeId -> edgeData
  141. how_result = filtered_data.get("how解构结果", {})
  142. dimension_mapping = {
  143. "灵感点列表": "灵感点",
  144. "目的点列表": "目的点",
  145. "关键点列表": "关键点"
  146. }
  147. for list_key, dimension in dimension_mapping.items():
  148. points = how_result.get(list_key, [])
  149. for point in points:
  150. point_name = point.get("名称", "")
  151. point_desc = point.get("描述", "")
  152. if not point_name:
  153. continue
  154. # 创建点节点
  155. point_id = build_node_id("帖子", dimension, "点", point_name)
  156. point_nodes[point_id] = create_node(
  157. domain="帖子",
  158. dimension=dimension,
  159. node_type="点",
  160. name=point_name,
  161. detail={
  162. "description": point_desc
  163. }
  164. )
  165. # 遍历how步骤列表,提取标签和匹配
  166. how_steps = point.get("how步骤列表", [])
  167. for step in how_steps:
  168. step_name = step.get("步骤名称", "")
  169. features = step.get("特征列表", [])
  170. for feature in features:
  171. tag_name = feature.get("特征名称", "")
  172. weight = feature.get("权重", 1.0)
  173. if not tag_name:
  174. continue
  175. # 创建标签节点
  176. tag_id = build_node_id("帖子", dimension, "标签", tag_name)
  177. if tag_id not in tag_nodes:
  178. tag_nodes[tag_id] = create_node(
  179. domain="帖子",
  180. dimension=dimension,
  181. node_type="标签",
  182. name=tag_name,
  183. detail={
  184. "weight": weight,
  185. "stepName": step_name,
  186. "pointNames": [point_name]
  187. }
  188. )
  189. else:
  190. # 同一标签可能属于多个点
  191. if point_name not in tag_nodes[tag_id]["detail"]["pointNames"]:
  192. tag_nodes[tag_id]["detail"]["pointNames"].append(point_name)
  193. # 记录标签到点的映射
  194. if tag_id not in tag_to_point:
  195. tag_to_point[tag_id] = []
  196. if point_id not in tag_to_point[tag_id]:
  197. tag_to_point[tag_id].append(point_id)
  198. # 提取匹配边
  199. matches = feature.get("匹配结果", [])
  200. for match in matches:
  201. persona_name = match.get("人设特征名称", "")
  202. persona_dimension = match.get("人设特征层级", "")
  203. persona_type = match.get("特征类型", "标签")
  204. match_detail = match.get("匹配结果", {})
  205. similarity = match_detail.get("相似度", 0)
  206. if not persona_name or not persona_dimension:
  207. continue
  208. # 构建人设节点ID
  209. persona_id = build_node_id("人设", persona_dimension, persona_type, persona_name)
  210. # 创建双向匹配边
  211. # 帖子标签 -> 人设标签
  212. edge_id_1 = build_edge_id(tag_id, "匹配", persona_id)
  213. match_edges[edge_id_1] = create_edge(
  214. source=tag_id,
  215. target=persona_id,
  216. edge_type="匹配",
  217. score=similarity,
  218. detail={}
  219. )
  220. # 人设标签 -> 帖子标签
  221. edge_id_2 = build_edge_id(persona_id, "匹配", tag_id)
  222. match_edges[edge_id_2] = create_edge(
  223. source=persona_id,
  224. target=tag_id,
  225. edge_type="匹配",
  226. score=similarity,
  227. detail={}
  228. )
  229. return point_nodes, tag_nodes, tag_to_point, match_edges
  230. # ==================== 构建边 ====================
  231. def build_belong_contain_edges(
  232. point_nodes: Dict[str, Dict],
  233. tag_nodes: Dict[str, Dict],
  234. tag_to_point: Dict[str, List[str]],
  235. dimension_node_ids: Dict[str, str]
  236. ) -> Dict[str, Dict]:
  237. """
  238. 构建属于/包含边
  239. Returns:
  240. 边字典 { edgeId: edgeData }
  241. """
  242. edges = {}
  243. # 1. 点 -> 维度(属于/包含)
  244. for point_id, point_data in point_nodes.items():
  245. dimension = point_data["dimension"]
  246. dim_node_id = dimension_node_ids[dimension]
  247. # 属于边:点 -> 维度
  248. edge_id = build_edge_id(point_id, "属于", dim_node_id)
  249. edges[edge_id] = create_edge(
  250. source=point_id,
  251. target=dim_node_id,
  252. edge_type="属于",
  253. score=1.0
  254. )
  255. # 包含边:维度 -> 点
  256. edge_id_contain = build_edge_id(dim_node_id, "包含", point_id)
  257. edges[edge_id_contain] = create_edge(
  258. source=dim_node_id,
  259. target=point_id,
  260. edge_type="包含",
  261. score=1.0
  262. )
  263. # 2. 标签 -> 点(属于/包含)
  264. for tag_id, point_ids in tag_to_point.items():
  265. for point_id in point_ids:
  266. # 属于边:标签 -> 点
  267. edge_id = build_edge_id(tag_id, "属于", point_id)
  268. edges[edge_id] = create_edge(
  269. source=tag_id,
  270. target=point_id,
  271. edge_type="属于",
  272. score=1.0
  273. )
  274. # 包含边:点 -> 标签
  275. edge_id_contain = build_edge_id(point_id, "包含", tag_id)
  276. edges[edge_id_contain] = create_edge(
  277. source=point_id,
  278. target=tag_id,
  279. edge_type="包含",
  280. score=1.0
  281. )
  282. return edges
  283. # ==================== 构建索引 ====================
  284. def build_index(edges: Dict[str, Dict]) -> Dict:
  285. """
  286. 构建游走索引
  287. Returns:
  288. {
  289. "outEdges": { nodeId: { edgeType: [{ target, score }] } },
  290. "inEdges": { nodeId: { edgeType: [{ source, score }] } }
  291. }
  292. """
  293. out_edges = {}
  294. in_edges = {}
  295. for edge_data in edges.values():
  296. source = edge_data["source"]
  297. target = edge_data["target"]
  298. edge_type = edge_data["type"]
  299. score = edge_data["score"]
  300. # outEdges
  301. if source not in out_edges:
  302. out_edges[source] = {}
  303. if edge_type not in out_edges[source]:
  304. out_edges[source][edge_type] = []
  305. out_edges[source][edge_type].append({
  306. "target": target,
  307. "score": score
  308. })
  309. # inEdges
  310. if target not in in_edges:
  311. in_edges[target] = {}
  312. if edge_type not in in_edges[target]:
  313. in_edges[target][edge_type] = []
  314. in_edges[target][edge_type].append({
  315. "source": source,
  316. "score": score
  317. })
  318. return {
  319. "outEdges": out_edges,
  320. "inEdges": in_edges
  321. }
  322. # ==================== 构建嵌套树 ====================
  323. def build_nested_tree(nodes: Dict[str, Dict], edges: Dict[str, Dict], root_id: str) -> Dict:
  324. """
  325. 从根节点开始,沿"包含"边递归构建嵌套树结构
  326. Returns:
  327. 嵌套的树结构
  328. """
  329. # 从"包含"边构建 父节点 -> [子节点] 的映射
  330. parent_to_children = {}
  331. for edge_data in edges.values():
  332. if edge_data["type"] == "包含":
  333. parent_id = edge_data["source"]
  334. child_id = edge_data["target"]
  335. if parent_id not in parent_to_children:
  336. parent_to_children[parent_id] = []
  337. parent_to_children[parent_id].append(child_id)
  338. # 递归构建子树
  339. def build_subtree(node_id: str) -> Dict:
  340. node_data = nodes[node_id]
  341. subtree = {
  342. "id": node_id,
  343. "name": node_data["name"],
  344. "type": node_data["type"],
  345. "domain": node_data["domain"],
  346. "dimension": node_data["dimension"],
  347. "detail": node_data.get("detail", {}),
  348. "children": []
  349. }
  350. # 获取子节点
  351. child_ids = parent_to_children.get(node_id, [])
  352. for child_id in child_ids:
  353. if child_id in nodes:
  354. subtree["children"].append(build_subtree(child_id))
  355. return subtree
  356. return build_subtree(root_id)
  357. # ==================== 图游走工具 ====================
  358. def walk_graph(
  359. index: Dict,
  360. start_node: str,
  361. edge_types: List[str],
  362. direction: str = "out",
  363. min_score: float = None
  364. ) -> Set[str]:
  365. """
  366. 从起始节点出发,按指定边类型序列游走N步
  367. Args:
  368. index: 游走索引 {"outEdges": {...}, "inEdges": {...}}
  369. start_node: 起始节点ID
  370. edge_types: 边类型序列,如 ["属于", "包含"]
  371. direction: 游走方向 "out"(沿出边) / "in"(沿入边)
  372. min_score: 最小分数过滤
  373. Returns:
  374. 到达的节点ID集合
  375. """
  376. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  377. target_key = "target" if direction == "out" else "source"
  378. current_nodes = {start_node}
  379. for edge_type in edge_types:
  380. next_nodes = set()
  381. for node in current_nodes:
  382. neighbors = edge_index.get(node, {}).get(edge_type, [])
  383. for neighbor in neighbors:
  384. if min_score is not None and neighbor.get("score", 0) < min_score:
  385. continue
  386. next_nodes.add(neighbor[target_key])
  387. current_nodes = next_nodes
  388. if not current_nodes:
  389. break
  390. return current_nodes
  391. def get_neighbors(
  392. index: Dict,
  393. node_id: str,
  394. edge_type: str = None,
  395. direction: str = "out",
  396. min_score: float = None
  397. ) -> List[Dict]:
  398. """
  399. 获取节点的邻居
  400. Args:
  401. index: 游走索引
  402. node_id: 节点ID
  403. edge_type: 边类型(可选,不指定则返回所有类型)
  404. direction: 方向 "out" / "in"
  405. min_score: 最小分数过滤
  406. Returns:
  407. 邻居列表 [{"target": "...", "score": 0.5}, ...]
  408. """
  409. edge_index = index["outEdges"] if direction == "out" else index["inEdges"]
  410. node_edges = edge_index.get(node_id, {})
  411. if edge_type:
  412. neighbors = node_edges.get(edge_type, [])
  413. else:
  414. neighbors = []
  415. for edges in node_edges.values():
  416. neighbors.extend(edges)
  417. if min_score is not None:
  418. neighbors = [n for n in neighbors if n.get("score", 0) >= min_score]
  419. return neighbors
  420. # ==================== 处理单个帖子 ====================
  421. def process_single_post(filtered_file: Path, output_dir: Path) -> Dict:
  422. """
  423. 处理单个帖子,生成帖子图谱
  424. Returns:
  425. 处理结果统计
  426. """
  427. # 读取数据
  428. with open(filtered_file, "r", encoding="utf-8") as f:
  429. filtered_data = json.load(f)
  430. post_id = filtered_data.get("帖子id", "")
  431. post_detail = filtered_data.get("帖子详情", {})
  432. post_title = post_detail.get("title", "")
  433. # 初始化节点和边
  434. all_nodes = {}
  435. all_edges = {}
  436. # 1. 提取点节点、标签节点和匹配边
  437. point_nodes, tag_nodes, tag_to_point, match_edges = extract_points_tags_and_matches(filtered_data)
  438. # 2. 添加根节点
  439. root_id = build_node_id("帖子", "帖子", "帖子", post_id)
  440. all_nodes[root_id] = create_node(
  441. domain="帖子",
  442. dimension="帖子",
  443. node_type="帖子",
  444. name=post_id,
  445. detail={
  446. "postTitle": post_title,
  447. "postDetail": post_detail
  448. }
  449. )
  450. # 3. 添加维度节点
  451. dimensions = ["灵感点", "目的点", "关键点"]
  452. dimension_node_ids = {}
  453. for dim in dimensions:
  454. dim_id = build_node_id("帖子", dim, dim, dim)
  455. dimension_node_ids[dim] = dim_id
  456. all_nodes[dim_id] = create_node(
  457. domain="帖子",
  458. dimension=dim,
  459. node_type=dim,
  460. name=dim,
  461. detail={}
  462. )
  463. # 维度 -> 根 的属于边
  464. edge_id = build_edge_id(dim_id, "属于", root_id)
  465. all_edges[edge_id] = create_edge(
  466. source=dim_id,
  467. target=root_id,
  468. edge_type="属于",
  469. score=1.0
  470. )
  471. # 根 -> 维度 的包含边
  472. edge_id_contain = build_edge_id(root_id, "包含", dim_id)
  473. all_edges[edge_id_contain] = create_edge(
  474. source=root_id,
  475. target=dim_id,
  476. edge_type="包含",
  477. score=1.0
  478. )
  479. # 4. 添加点节点和标签节点
  480. all_nodes.update(point_nodes)
  481. all_nodes.update(tag_nodes)
  482. # 5. 构建属于/包含边
  483. belong_contain_edges = build_belong_contain_edges(
  484. point_nodes, tag_nodes, tag_to_point, dimension_node_ids
  485. )
  486. all_edges.update(belong_contain_edges)
  487. # 6. 添加匹配边
  488. all_edges.update(match_edges)
  489. # 7. 构建索引
  490. index = build_index(all_edges)
  491. # 8. 构建嵌套树
  492. tree = build_nested_tree(all_nodes, all_edges, root_id)
  493. # 统计
  494. point_count = len(point_nodes)
  495. tag_count = len(tag_nodes)
  496. match_count = len(match_edges) // 2 # 双向边,除以2得到实际匹配数
  497. dimension_stats = {}
  498. for dim in dimensions:
  499. dim_points = sum(1 for n in point_nodes.values() if n["dimension"] == dim)
  500. dim_tags = sum(1 for n in tag_nodes.values() if n["dimension"] == dim)
  501. dimension_stats[dim] = {
  502. "pointCount": dim_points,
  503. "tagCount": dim_tags
  504. }
  505. # 构建输出
  506. output_data = {
  507. "meta": {
  508. "postId": post_id,
  509. "postTitle": post_title,
  510. "postDetail": post_detail,
  511. "createdAt": datetime.now().isoformat(),
  512. "stats": {
  513. "nodeCount": len(all_nodes),
  514. "edgeCount": len(all_edges),
  515. "pointCount": point_count,
  516. "tagCount": tag_count,
  517. "matchCount": match_count,
  518. "dimensions": dimension_stats
  519. }
  520. },
  521. "nodes": all_nodes,
  522. "edges": all_edges,
  523. "index": index,
  524. "tree": tree
  525. }
  526. # 保存
  527. output_file = output_dir / f"{post_id}_帖子图谱.json"
  528. with open(output_file, "w", encoding="utf-8") as f:
  529. json.dump(output_data, f, ensure_ascii=False, indent=2)
  530. return {
  531. "postId": post_id,
  532. "postTitle": post_title,
  533. "nodeCount": len(all_nodes),
  534. "edgeCount": len(all_edges),
  535. "pointCount": point_count,
  536. "tagCount": tag_count,
  537. "matchCount": match_count,
  538. "outputFile": str(output_file)
  539. }
  540. # ==================== 主函数 ====================
  541. def main():
  542. config = PathConfig()
  543. config.ensure_dirs()
  544. print(f"账号: {config.account_name}")
  545. print(f"输出版本: {config.output_version}")
  546. print()
  547. # 输入目录
  548. filtered_results_dir = config.intermediate_dir / "filtered_results"
  549. # 输出目录
  550. output_dir = config.intermediate_dir / "post_graph"
  551. output_dir.mkdir(parents=True, exist_ok=True)
  552. print(f"输入目录: {filtered_results_dir}")
  553. print(f"输出目录: {output_dir}")
  554. print()
  555. # 获取所有帖子文件
  556. filtered_files = list(filtered_results_dir.glob("*_filtered.json"))
  557. print(f"找到 {len(filtered_files)} 个帖子文件")
  558. print()
  559. # 处理每个帖子
  560. results = []
  561. for i, filtered_file in enumerate(filtered_files, 1):
  562. print(f"[{i}/{len(filtered_files)}] 处理: {filtered_file.name}")
  563. result = process_single_post(filtered_file, output_dir)
  564. results.append(result)
  565. print(f" 节点: {result['nodeCount']}, 边: {result['edgeCount']}")
  566. print(f" 点: {result['pointCount']}, 标签: {result['tagCount']}, 匹配: {result['matchCount']}")
  567. print(f" → {Path(result['outputFile']).name}")
  568. print()
  569. # 汇总统计
  570. print("=" * 60)
  571. print("处理完成!")
  572. print(f" 帖子数: {len(results)}")
  573. print(f" 总节点数: {sum(r['nodeCount'] for r in results)}")
  574. print(f" 总边数: {sum(r['edgeCount'] for r in results)}")
  575. print(f" 总点数: {sum(r['pointCount'] for r in results)}")
  576. print(f" 总标签数: {sum(r['tagCount'] for r in results)}")
  577. print(f" 总匹配数: {sum(r['matchCount'] for r in results)}")
  578. print(f"\n输出目录: {output_dir}")
  579. if __name__ == "__main__":
  580. main()