extract_nodes_and_edges.py 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 从源数据文件中提取节点列表和边关系
  5. 输入:
  6. 1. 过去帖子_pattern聚合结果.json - 分类节点、标签-分类边
  7. 2. 过去帖子_what解构结果目录 - 标签节点来源
  8. 3. dimension_associations_analysis.json - 分类-分类边(共现)
  9. 输出:
  10. 1. 节点列表.json
  11. 2. 边关系.json
  12. """
  13. import json
  14. from pathlib import Path
  15. from typing import Dict, List, Any, Set, Optional
  16. import sys
  17. import re
  18. # 添加项目根目录到路径
  19. project_root = Path(__file__).parent.parent.parent
  20. sys.path.insert(0, str(project_root))
  21. from script.data_processing.path_config import PathConfig
  22. from script.detail import get_xiaohongshu_detail
  23. def get_post_detail(post_id: str) -> Optional[Dict]:
  24. """获取帖子详情"""
  25. try:
  26. detail = get_xiaohongshu_detail(post_id)
  27. return detail
  28. except Exception as e:
  29. print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
  30. return None
  31. def get_last_segment(path: str) -> str:
  32. """获取路径的最后一段"""
  33. return path.split("/")[-1]
  34. def build_node_id(dimension: str, node_type: str, name: str) -> str:
  35. """
  36. 构建节点ID
  37. Args:
  38. dimension: 节点层级(灵感点、目的点、关键点)
  39. node_type: 节点类型(分类、标签)
  40. name: 节点名称
  41. Returns:
  42. 节点ID,格式: {层级}_{类型}_{名称}
  43. """
  44. return f"{dimension}_{node_type}_{name}"
  45. def extract_post_id_from_filename(filename: str) -> str:
  46. """从文件名中提取帖子ID
  47. 格式: 68a6b96f000000001d006058.json
  48. """
  49. return filename.replace('.json', '')
  50. def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
  51. """
  52. 获取当前帖子目录中的所有帖子ID
  53. Args:
  54. current_posts_dir: 当前帖子目录路径
  55. Returns:
  56. 当前帖子ID集合
  57. """
  58. if not current_posts_dir.exists():
  59. print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
  60. return set()
  61. json_files = list(current_posts_dir.glob("*.json"))
  62. if not json_files:
  63. print(f"警告: 当前帖子目录为空: {current_posts_dir}")
  64. return set()
  65. print(f"找到 {len(json_files)} 个当前帖子")
  66. post_ids = set()
  67. for file_path in json_files:
  68. post_id = extract_post_id_from_filename(file_path.name)
  69. if post_id:
  70. post_ids.add(post_id)
  71. print(f"提取到 {len(post_ids)} 个帖子ID")
  72. return post_ids
  73. def collect_all_post_ids_from_nodes(nodes: List[Dict]) -> Set[str]:
  74. """从节点列表中收集所有帖子ID"""
  75. post_ids = set()
  76. for node in nodes:
  77. for source in node.get("节点来源", []):
  78. post_id = source.get("帖子ID", "")
  79. if post_id:
  80. post_ids.add(post_id)
  81. return post_ids
  82. def collect_all_post_ids_from_edges(edges: List[Dict]) -> Set[str]:
  83. """从边列表中收集所有帖子ID"""
  84. post_ids = set()
  85. for edge in edges:
  86. edge_type = edge.get("边类型", "")
  87. edge_details = edge.get("边详情", {})
  88. if edge_type in ("分类共现(跨点)", "标签共现"):
  89. common_post_ids = edge_details.get("共同帖子ID", [])
  90. post_ids.update(common_post_ids)
  91. elif edge_type in ("支撑", "关联意图"):
  92. # 新边类型使用帖子ID列表
  93. post_id_list = edge_details.get("帖子ID列表", [])
  94. post_ids.update(post_id_list)
  95. # 点内共现边、属于边、包含边不包含帖子ID
  96. return post_ids
  97. def fetch_post_details(post_ids: Set[str]) -> Dict[str, Dict]:
  98. """
  99. 批量获取帖子详情
  100. Args:
  101. post_ids: 帖子ID集合
  102. Returns:
  103. 帖子ID -> 帖子详情 的映射
  104. """
  105. print(f"\n正在获取 {len(post_ids)} 个帖子的详情...")
  106. post_details = {}
  107. for i, post_id in enumerate(sorted(post_ids), 1):
  108. print(f" [{i}/{len(post_ids)}] 获取帖子 {post_id} 的详情...")
  109. detail = get_post_detail(post_id)
  110. if detail:
  111. post_details[post_id] = detail
  112. print(f"成功获取 {len(post_details)} 个帖子详情")
  113. return post_details
  114. def filter_nodes_by_post_ids(nodes: List[Dict], exclude_post_ids: Set[str]) -> List[Dict]:
  115. """
  116. 过滤节点,排除指定帖子ID的来源
  117. Args:
  118. nodes: 节点列表
  119. exclude_post_ids: 要排除的帖子ID集合
  120. Returns:
  121. 过滤后的节点列表
  122. """
  123. filtered_nodes = []
  124. for node in nodes:
  125. # 过滤节点来源
  126. filtered_sources = [
  127. source for source in node.get("节点来源", [])
  128. if source.get("帖子ID", "") not in exclude_post_ids
  129. ]
  130. # 只保留有来源的节点
  131. if filtered_sources:
  132. node_copy = node.copy()
  133. node_copy["节点来源"] = filtered_sources
  134. # 重新计算帖子数
  135. unique_post_ids = set(s.get("帖子ID", "") for s in filtered_sources if s.get("帖子ID"))
  136. node_copy["帖子数"] = len(unique_post_ids)
  137. filtered_nodes.append(node_copy)
  138. return filtered_nodes
  139. def filter_edges_by_post_ids(edges: List[Dict], exclude_post_ids: Set[str]) -> List[Dict]:
  140. """
  141. 过滤边,排除指定帖子ID的共现边
  142. Args:
  143. edges: 边列表
  144. exclude_post_ids: 要排除的帖子ID集合
  145. Returns:
  146. 过滤后的边列表
  147. """
  148. filtered_edges = []
  149. for edge in edges:
  150. edge_type = edge["边类型"]
  151. if edge_type in ("分类共现(跨点)", "标签共现"):
  152. # 过滤共同帖子ID
  153. edge_details = edge.get("边详情", {})
  154. common_post_ids = edge_details.get("共同帖子ID", [])
  155. filtered_post_ids = [pid for pid in common_post_ids if pid not in exclude_post_ids]
  156. if filtered_post_ids:
  157. edge_copy = edge.copy()
  158. edge_copy["边详情"] = edge_details.copy()
  159. edge_copy["边详情"]["共同帖子ID"] = filtered_post_ids
  160. edge_copy["边详情"]["共同帖子数"] = len(filtered_post_ids)
  161. filtered_edges.append(edge_copy)
  162. elif edge_type == "分类共现(点内)":
  163. # 点内共现边不涉及帖子ID,直接保留
  164. filtered_edges.append(edge)
  165. else:
  166. # 属于/包含边不需要过滤
  167. filtered_edges.append(edge)
  168. return filtered_edges
  169. # ========== 分类节点提取 ==========
def extract_category_nodes_from_pattern(
    pattern_data: Dict,
    dimension_key: str,
    dimension_name: str
) -> List[Dict]:
    """
    Extract category nodes from a pattern aggregation result.

    Args:
        pattern_data: pattern aggregation data (nested dict of categories)
        dimension_key: dimension key inside pattern_data
            (灵感点列表 / 目的点 / 关键点列表)
        dimension_name: dimension display name (灵感点 / 目的点 / 关键点)

    Returns:
        List of category-node dicts (节点ID, 节点名称, 节点类型, 节点层级,
        所属分类, 帖子数, 节点来源).
    """
    nodes = []
    if dimension_key not in pattern_data:
        return nodes
    def collect_sources_recursively(node: Dict) -> List[Dict]:
        """Recursively collect feature sources of a node and all of its children."""
        sources = []
        # Features attached directly to the current node
        if "特征列表" in node:
            for feature in node["特征列表"]:
                source = {
                    "点的名称": feature.get("所属点", ""),
                    "点的描述": feature.get("点描述", ""),
                    "帖子ID": feature.get("帖子id", "")
                }
                sources.append(source)
        # Recurse into child categories; these keys are metadata, not categories
        for key, value in node.items():
            if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
                continue
            if isinstance(value, dict):
                sources.extend(collect_sources_recursively(value))
        return sources
    def traverse_node(node: Dict, parent_categories: List[str]):
        """Recursively walk the category tree, emitting one node per category."""
        for key, value in node.items():
            if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
                continue
            if isinstance(value, dict):
                # Any dict-valued, non-metadata key is a category
                current_path = parent_categories + [key]
                # Explicit post list recorded on the category (may be absent)
                post_ids = value.get("帖子列表", [])
                # Node sources: direct 特征列表 if present, otherwise collected
                # recursively from descendant categories
                node_sources = []
                if "特征列表" in value:
                    for feature in value["特征列表"]:
                        source = {
                            "点的名称": feature.get("所属点", ""),
                            "点的描述": feature.get("点描述", ""),
                            "帖子ID": feature.get("帖子id", "")
                        }
                        node_sources.append(source)
                else:
                    # No direct features: gather them from sub-categories
                    node_sources = collect_sources_recursively(value)
                node_info = {
                    "节点ID": build_node_id(dimension_name, "分类", key),
                    "节点名称": key,
                    "节点类型": "分类",
                    "节点层级": dimension_name,
                    "所属分类": parent_categories.copy(),
                    # Prefer the explicit post list; otherwise count distinct
                    # non-empty post IDs from the collected sources
                    "帖子数": len(post_ids) if post_ids else len(set(s.get("帖子ID", "") for s in node_sources if s.get("帖子ID"))),
                    "节点来源": node_sources
                }
                nodes.append(node_info)
                # Descend into this category's children
                traverse_node(value, current_path)
    traverse_node(pattern_data[dimension_key], [])
    return nodes
  243. # ========== 标签节点提取 ==========
def extract_tag_nodes_from_pattern(
    pattern_data: Dict,
    dimension_key: str,
    dimension_name: str
) -> List[Dict]:
    """
    Extract tag nodes from a pattern aggregation result.

    Tags with the same name within a dimension are merged into one node; every
    occurrence contributes an entry to 节点来源.

    Args:
        pattern_data: pattern aggregation data
        dimension_key: dimension key inside pattern_data
        dimension_name: dimension display name

    Returns:
        List of tag-node dicts.
    """
    nodes = []
    tag_map = {}  # tag node ID -> node dict; merges same-named tags
    if dimension_key not in pattern_data:
        return nodes
    def traverse_node(node: Dict, parent_categories: List[str]):
        """Recursively walk the tree, collecting tags from each 特征列表."""
        # Features of the current category are the tags
        if "特征列表" in node:
            for feature in node["特征列表"]:
                tag_name = feature.get("特征名称", "")
                if not tag_name:
                    continue
                source = {
                    "点的名称": feature.get("所属点", ""),
                    "点的描述": feature.get("点描述", ""),
                    "帖子ID": feature.get("帖子id", "")
                }
                tag_id = build_node_id(dimension_name, "标签", tag_name)
                if tag_id not in tag_map:
                    # First sighting: 所属分类 is the path where the tag first appears
                    tag_map[tag_id] = {
                        "节点ID": tag_id,
                        "节点名称": tag_name,
                        "节点类型": "标签",
                        "节点层级": dimension_name,
                        "所属分类": parent_categories.copy(),
                        "帖子数": 0,
                        "节点来源": [],
                        "_post_ids": set()  # temporary accumulator, removed below
                    }
                tag_map[tag_id]["节点来源"].append(source)
                if source["帖子ID"]:
                    tag_map[tag_id]["_post_ids"].add(source["帖子ID"])
        # Recurse into child categories; skip metadata keys
        for key, value in node.items():
            if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
                continue
            if isinstance(value, dict):
                current_path = parent_categories + [key]
                traverse_node(value, current_path)
    traverse_node(pattern_data[dimension_key], [])
    # Finalize: compute 帖子数 and drop the private accumulator
    for tag_id, tag_info in tag_map.items():
        tag_info["帖子数"] = len(tag_info["_post_ids"])
        del tag_info["_post_ids"]
        nodes.append(tag_info)
    return nodes
  304. # ========== 标签-分类边提取 ==========
  305. def extract_tag_category_edges_from_pattern(
  306. pattern_data: Dict,
  307. dimension_key: str,
  308. dimension_name: str
  309. ) -> List[Dict]:
  310. """
  311. 从pattern聚合结果中提取标签-分类边(属于/包含)
  312. Args:
  313. pattern_data: pattern聚合数据
  314. dimension_key: 维度键名
  315. dimension_name: 维度名称
  316. Returns:
  317. 边列表
  318. """
  319. edges = []
  320. seen_edges = set() # 避免重复边
  321. if dimension_key not in pattern_data:
  322. return edges
  323. def traverse_node(node: Dict, parent_categories: List[str]):
  324. """递归遍历节点"""
  325. current_category = parent_categories[-1] if parent_categories else None
  326. # 处理特征列表(标签)
  327. if "特征列表" in node and current_category:
  328. for feature in node["特征列表"]:
  329. tag_name = feature.get("特征名称", "")
  330. if not tag_name:
  331. continue
  332. tag_id = build_node_id(dimension_name, "标签", tag_name)
  333. category_id = build_node_id(dimension_name, "分类", current_category)
  334. # 属于边:标签 -> 分类
  335. edge_key_belong = (tag_id, category_id, "属于")
  336. if edge_key_belong not in seen_edges:
  337. seen_edges.add(edge_key_belong)
  338. edges.append({
  339. "源节点ID": tag_id,
  340. "目标节点ID": category_id,
  341. "边类型": "属于",
  342. "边详情": {}
  343. })
  344. # 包含边:分类 -> 标签
  345. edge_key_contain = (category_id, tag_id, "包含")
  346. if edge_key_contain not in seen_edges:
  347. seen_edges.add(edge_key_contain)
  348. edges.append({
  349. "源节点ID": category_id,
  350. "目标节点ID": tag_id,
  351. "边类型": "包含",
  352. "边详情": {}
  353. })
  354. # 递归处理子节点
  355. for key, value in node.items():
  356. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  357. continue
  358. if isinstance(value, dict):
  359. current_path = parent_categories + [key]
  360. traverse_node(value, current_path)
  361. traverse_node(pattern_data[dimension_key], [])
  362. return edges
  363. # ========== 标签-标签共现边提取 ==========
  364. def extract_tags_from_post(post_data: Dict) -> Dict[str, List[str]]:
  365. """
  366. 从单个帖子的解构结果中提取所有标签(特征名称)
  367. Args:
  368. post_data: 帖子解构数据
  369. Returns:
  370. 按维度分组的标签字典 {"灵感点": [...], "目的点": [...], "关键点": [...]}
  371. """
  372. tags_by_dimension = {
  373. "灵感点": [],
  374. "目的点": [],
  375. "关键点": []
  376. }
  377. # 提取灵感点
  378. if "inspiration_final_result" in post_data:
  379. inspiration_data = post_data["inspiration_final_result"]
  380. for item in inspiration_data.get("最终灵感点列表", []):
  381. tag_name = item.get("灵感点", "")
  382. if tag_name:
  383. tags_by_dimension["灵感点"].append(tag_name)
  384. # 提取目的点(意图+实质)
  385. if "purpose_final_result" in post_data:
  386. purpose_data = post_data["purpose_final_result"]
  387. for item in purpose_data.get("最终意图列表", []):
  388. tag_name = item.get("目的点", "")
  389. if tag_name:
  390. tags_by_dimension["目的点"].append(tag_name)
  391. for item in purpose_data.get("最终实质列表", []):
  392. tag_name = item.get("目的点", "")
  393. if tag_name:
  394. tags_by_dimension["目的点"].append(tag_name)
  395. # 提取关键点
  396. if "keypoint_final" in post_data:
  397. keypoint_data = post_data["keypoint_final"]
  398. for item in keypoint_data.get("最终关键点列表", []):
  399. tag_name = item.get("关键点", "")
  400. if tag_name:
  401. tags_by_dimension["关键点"].append(tag_name)
  402. return tags_by_dimension
  403. def extract_tag_cooccurrence_edges(historical_posts_dir: Path, exclude_post_ids: Set[str] = None) -> List[Dict]:
  404. """
  405. 从历史帖子解构结果中提取标签-标签共现边
  406. Args:
  407. historical_posts_dir: 历史帖子解构结果目录
  408. exclude_post_ids: 要排除的帖子ID集合
  409. Returns:
  410. 标签共现边列表
  411. """
  412. if exclude_post_ids is None:
  413. exclude_post_ids = set()
  414. # 存储每对标签的共现信息
  415. # key: (tag1_id, tag2_id), value: {"共同帖子ID": set()}
  416. cooccurrence_map = {}
  417. if not historical_posts_dir.exists():
  418. print(f"警告: 历史帖子目录不存在: {historical_posts_dir}")
  419. return []
  420. json_files = list(historical_posts_dir.glob("*.json"))
  421. print(f"找到 {len(json_files)} 个历史帖子文件")
  422. for file_path in json_files:
  423. # 提取帖子ID
  424. post_id = extract_post_id_from_filename(file_path.name)
  425. if not post_id:
  426. continue
  427. # 跳过排除的帖子
  428. if post_id in exclude_post_ids:
  429. continue
  430. try:
  431. with open(file_path, "r", encoding="utf-8") as f:
  432. post_data = json.load(f)
  433. # 提取该帖子的所有标签
  434. tags_by_dimension = extract_tags_from_post(post_data)
  435. # 对每个维度内的标签两两组合,构建共现关系
  436. for dimension, tags in tags_by_dimension.items():
  437. unique_tags = list(set(tags)) # 去重
  438. for i in range(len(unique_tags)):
  439. for j in range(i + 1, len(unique_tags)):
  440. tag1 = unique_tags[i]
  441. tag2 = unique_tags[j]
  442. # 构建节点ID
  443. tag1_id = build_node_id(dimension, "标签", tag1)
  444. tag2_id = build_node_id(dimension, "标签", tag2)
  445. # 确保顺序一致(按字典序)
  446. if tag1_id > tag2_id:
  447. tag1_id, tag2_id = tag2_id, tag1_id
  448. key = (tag1_id, tag2_id, dimension)
  449. if key not in cooccurrence_map:
  450. cooccurrence_map[key] = {"共同帖子ID": set()}
  451. cooccurrence_map[key]["共同帖子ID"].add(post_id)
  452. except Exception as e:
  453. print(f" 警告: 处理文件 {file_path.name} 时出错: {e}")
  454. # 转换为边列表
  455. edges = []
  456. for (tag1_id, tag2_id, dimension), info in cooccurrence_map.items():
  457. common_post_ids = list(info["共同帖子ID"])
  458. edge = {
  459. "源节点ID": tag1_id,
  460. "目标节点ID": tag2_id,
  461. "边类型": "标签共现",
  462. "边详情": {
  463. "共同帖子数": len(common_post_ids),
  464. "共同帖子ID": common_post_ids
  465. }
  466. }
  467. edges.append(edge)
  468. return edges
  469. # ========== 支撑边和关联意图边提取(新版数据结构)==========
  470. def extract_support_and_intent_edges(historical_posts_dir: Path, exclude_post_ids: Set[str] = None) -> tuple[List[Dict], List[Dict]]:
  471. """
  472. 从历史帖子解构结果中提取支撑边和关联意图边(仅新版数据结构)
  473. 支撑边:关键点 -> 灵感点/意图/实质
  474. 关联意图边:实质 -> 意图
  475. Args:
  476. historical_posts_dir: 历史帖子解构结果目录
  477. exclude_post_ids: 要排除的帖子ID集合
  478. Returns:
  479. (支撑边列表, 关联意图边列表)
  480. """
  481. if exclude_post_ids is None:
  482. exclude_post_ids = set()
  483. support_edges = [] # 支撑边
  484. intent_edges = [] # 关联意图边
  485. seen_support_edges = set()
  486. seen_intent_edges = set()
  487. if not historical_posts_dir.exists():
  488. print(f"警告: 历史帖子目录不存在: {historical_posts_dir}")
  489. return [], []
  490. json_files = list(historical_posts_dir.glob("*.json"))
  491. print(f"找到 {len(json_files)} 个历史帖子文件")
  492. for file_path in json_files:
  493. # 提取帖子ID
  494. post_id = extract_post_id_from_filename(file_path.name)
  495. if not post_id:
  496. post_id = file_path.stem
  497. # 跳过排除的帖子
  498. if post_id in exclude_post_ids:
  499. continue
  500. try:
  501. with open(file_path, "r", encoding="utf-8") as f:
  502. post_data = json.load(f)
  503. # 只处理新版数据结构
  504. if "keypoint_final" not in post_data and "purpose_final_result" not in post_data:
  505. continue
  506. # 构建帖子内的ID到名称映射
  507. id_to_name = {}
  508. id_to_type = {} # 记录ID对应的类型(灵感点/意图/实质)
  509. # 收集灵感点ID
  510. if "inspiration_final_result" in post_data:
  511. for item in post_data["inspiration_final_result"].get("最终灵感点列表", []):
  512. item_id = item.get("id", "")
  513. item_name = item.get("灵感点", "")
  514. if item_id and item_name:
  515. id_to_name[item_id] = item_name
  516. id_to_type[item_id] = "灵感点"
  517. # 收集意图和实质ID
  518. if "purpose_final_result" in post_data:
  519. purpose_data = post_data["purpose_final_result"]
  520. # 意图
  521. for item in purpose_data.get("最终意图列表", []):
  522. item_id = item.get("意图ID", "")
  523. item_name = item.get("目的点", "")
  524. if item_id and item_name:
  525. id_to_name[item_id] = item_name
  526. id_to_type[item_id] = "意图"
  527. # 实质
  528. for item in purpose_data.get("最终实质列表", []):
  529. item_id = item.get("实质ID", "")
  530. item_name = item.get("目的点", "")
  531. related_intent_id = item.get("关联意图ID", "")
  532. if item_id and item_name:
  533. id_to_name[item_id] = item_name
  534. id_to_type[item_id] = "实质"
  535. # 提取关联意图边:实质 -> 意图
  536. if item_id and related_intent_id and related_intent_id in id_to_name:
  537. substance_name = item_name
  538. intent_name = id_to_name[related_intent_id]
  539. # 构建节点ID(实质和意图都属于目的点维度)
  540. substance_node_id = build_node_id("目的点", "标签", substance_name)
  541. intent_node_id = build_node_id("目的点", "标签", intent_name)
  542. edge_key = (substance_node_id, intent_node_id)
  543. if edge_key not in seen_intent_edges:
  544. seen_intent_edges.add(edge_key)
  545. intent_edges.append({
  546. "源节点ID": substance_node_id,
  547. "目标节点ID": intent_node_id,
  548. "边类型": "关联意图",
  549. "边详情": {
  550. "源类型": "实质",
  551. "目标类型": "意图",
  552. "帖子ID列表": [post_id]
  553. }
  554. })
  555. else:
  556. # 已存在的边,添加帖子ID
  557. for edge in intent_edges:
  558. if edge["源节点ID"] == substance_node_id and edge["目标节点ID"] == intent_node_id:
  559. if post_id not in edge["边详情"]["帖子ID列表"]:
  560. edge["边详情"]["帖子ID列表"].append(post_id)
  561. break
  562. # 收集关键点ID并提取支撑边
  563. if "keypoint_final" in post_data:
  564. for item in post_data["keypoint_final"].get("最终关键点列表", []):
  565. kp_id = item.get("关键点ID", "")
  566. kp_name = item.get("关键点", "")
  567. support_ids = item.get("支撑的ID", [])
  568. if not kp_name or not support_ids:
  569. continue
  570. # 关键点节点ID
  571. kp_node_id = build_node_id("关键点", "标签", kp_name)
  572. # 遍历支撑的ID
  573. for support_id in support_ids:
  574. if support_id not in id_to_name:
  575. continue
  576. target_name = id_to_name[support_id]
  577. target_type = id_to_type[support_id]
  578. # 确定目标节点的维度
  579. if target_type == "灵感点":
  580. target_dimension = "灵感点"
  581. else: # 意图或实质
  582. target_dimension = "目的点"
  583. target_node_id = build_node_id(target_dimension, "标签", target_name)
  584. edge_key = (kp_node_id, target_node_id)
  585. if edge_key not in seen_support_edges:
  586. seen_support_edges.add(edge_key)
  587. support_edges.append({
  588. "源节点ID": kp_node_id,
  589. "目标节点ID": target_node_id,
  590. "边类型": "支撑",
  591. "边详情": {
  592. "源类型": "关键点",
  593. "目标类型": target_type,
  594. "帖子ID列表": [post_id]
  595. }
  596. })
  597. else:
  598. # 已存在的边,添加帖子ID
  599. for edge in support_edges:
  600. if edge["源节点ID"] == kp_node_id and edge["目标节点ID"] == target_node_id:
  601. if post_id not in edge["边详情"]["帖子ID列表"]:
  602. edge["边详情"]["帖子ID列表"].append(post_id)
  603. break
  604. except Exception as e:
  605. print(f" 警告: 处理文件 {file_path.name} 时出错: {e}")
  606. return support_edges, intent_edges
  607. # ========== 分类-分类边提取 ==========
  608. def extract_category_edges_from_associations(associations_data: Dict) -> List[Dict]:
  609. """
  610. 从dimension_associations_analysis.json中提取分类-分类边(共现)
  611. Args:
  612. associations_data: 关联分析数据
  613. Returns:
  614. 边列表
  615. """
  616. edges = []
  617. if "单维度关联分析" not in associations_data:
  618. return edges
  619. single_dim = associations_data["单维度关联分析"]
  620. # 维度映射
  621. dimension_map = {
  622. "灵感点维度": "灵感点",
  623. "目的点维度": "目的点",
  624. "关键点维度": "关键点"
  625. }
  626. for dim_key, dim_data in single_dim.items():
  627. if dim_key not in dimension_map:
  628. continue
  629. source_dimension = dimension_map[dim_key]
  630. # 遍历该维度下的所有关联方向
  631. for direction_key, direction_data in dim_data.items():
  632. if direction_key == "说明":
  633. continue
  634. if "→" not in direction_key:
  635. continue
  636. # 遍历每个源分类
  637. for source_path, source_info in direction_data.items():
  638. source_name = get_last_segment(source_path)
  639. source_node_id = build_node_id(source_dimension, "分类", source_name)
  640. # 确定目标维度
  641. for field_name, associations in source_info.items():
  642. if not field_name.startswith("与") or not field_name.endswith("的关联"):
  643. continue
  644. target_dimension = field_name[1:-3]
  645. if not isinstance(associations, list):
  646. continue
  647. for assoc in associations:
  648. target_path = assoc.get("目标分类", "")
  649. if not target_path:
  650. continue
  651. target_name = get_last_segment(target_path)
  652. target_node_id = build_node_id(target_dimension, "分类", target_name)
  653. edge = {
  654. "源节点ID": source_node_id,
  655. "目标节点ID": target_node_id,
  656. "边类型": "分类共现(跨点)",
  657. "边详情": {
  658. "Jaccard相似度": assoc.get("Jaccard相似度", 0),
  659. "重叠系数": assoc.get("重叠系数", 0),
  660. "共同帖子数": assoc.get("共同帖子数", 0),
  661. "共同帖子ID": assoc.get("共同帖子ID", [])
  662. }
  663. }
  664. edges.append(edge)
  665. return edges
  666. # ========== 点内分类共现边提取 ==========
  667. def extract_intra_category_edges(intra_associations_data: Dict) -> List[Dict]:
  668. """
  669. 从intra_dimension_associations_analysis.json中提取点内分类共现边
  670. Args:
  671. intra_associations_data: 点内关联分析数据
  672. Returns:
  673. 边列表
  674. """
  675. edges = []
  676. seen_edges = set() # 避免重复边
  677. if "叶子分类组合聚类" not in intra_associations_data:
  678. return edges
  679. clusters_by_dim = intra_associations_data["叶子分类组合聚类"]
  680. for dimension, clusters in clusters_by_dim.items():
  681. if dimension not in ("灵感点", "目的点", "关键点"):
  682. continue
  683. for cluster_key, cluster_data in clusters.items():
  684. leaf_categories = cluster_data.get("叶子分类组合", [])
  685. point_count = cluster_data.get("点数", 0)
  686. point_details = cluster_data.get("点详情列表", [])
  687. # 提取点名称列表
  688. point_names = [p.get("点名称", "") for p in point_details if p.get("点名称")]
  689. # 两两组合生成共现边
  690. for i in range(len(leaf_categories)):
  691. for j in range(i + 1, len(leaf_categories)):
  692. cat1 = leaf_categories[i]
  693. cat2 = leaf_categories[j]
  694. # 构建节点ID
  695. cat1_id = build_node_id(dimension, "分类", cat1)
  696. cat2_id = build_node_id(dimension, "分类", cat2)
  697. # 确保顺序一致(按字典序)
  698. if cat1_id > cat2_id:
  699. cat1_id, cat2_id = cat2_id, cat1_id
  700. edge_key = (cat1_id, cat2_id, dimension)
  701. if edge_key in seen_edges:
  702. # 已存在的边,累加点数和点名称
  703. for edge in edges:
  704. if (edge["源节点ID"] == cat1_id and
  705. edge["目标节点ID"] == cat2_id and
  706. edge["边类型"] == "分类共现(点内)"):
  707. edge["边详情"]["点数"] += point_count
  708. edge["边详情"]["关联点名称"].extend(point_names)
  709. break
  710. else:
  711. seen_edges.add(edge_key)
  712. edge = {
  713. "源节点ID": cat1_id,
  714. "目标节点ID": cat2_id,
  715. "边类型": "分类共现(点内)",
  716. "边详情": {
  717. "点数": point_count,
  718. "关联点名称": point_names.copy()
  719. }
  720. }
  721. edges.append(edge)
  722. return edges
  723. # ========== 主函数 ==========
  724. def main():
  725. # 使用路径配置
  726. config = PathConfig()
  727. config.ensure_dirs()
  728. print(f"账号: {config.account_name}")
  729. print(f"输出版本: {config.output_version}")
  730. print(f"过滤模式: {config.filter_mode}")
  731. print()
  732. # 输入文件路径
  733. pattern_file = config.pattern_cluster_file
  734. # 尝试新路径,如果不存在则使用旧路径
  735. associations_file_new = config.account_dir / "pattern相关文件/detail/dimension_associations_analysis.json"
  736. associations_file_old = config.account_dir / "pattern相关文件/optimization/dimension_associations_analysis.json"
  737. associations_file = associations_file_new if associations_file_new.exists() else associations_file_old
  738. intra_associations_file_new = config.account_dir / "pattern相关文件/detail/intra_dimension_associations_analysis.json"
  739. intra_associations_file_old = config.account_dir / "pattern相关文件/optimization/intra_dimension_associations_analysis.json"
  740. intra_associations_file = intra_associations_file_new if intra_associations_file_new.exists() else intra_associations_file_old
  741. current_posts_dir = config.current_posts_dir
  742. # 输出文件路径
  743. nodes_output_file = config.intermediate_dir / "节点列表.json"
  744. edges_output_file = config.intermediate_dir / "边关系.json"
  745. print(f"输入文件:")
  746. print(f" pattern聚合文件: {pattern_file}")
  747. print(f" 跨点关联分析文件: {associations_file}")
  748. print(f" 点内关联分析文件: {intra_associations_file}")
  749. print(f" 当前帖子目录: {current_posts_dir}")
  750. print(f"\n输出文件:")
  751. print(f" 节点列表: {nodes_output_file}")
  752. print(f" 边关系: {edges_output_file}")
  753. print()
  754. # 读取pattern聚合结果
  755. print("正在读取pattern聚合结果...")
  756. with open(pattern_file, "r", encoding="utf-8") as f:
  757. pattern_data = json.load(f)
  758. # 读取跨点关联分析结果
  759. print("正在读取跨点关联分析结果...")
  760. with open(associations_file, "r", encoding="utf-8") as f:
  761. associations_data = json.load(f)
  762. # 读取点内关联分析结果
  763. print("正在读取点内关联分析结果...")
  764. with open(intra_associations_file, "r", encoding="utf-8") as f:
  765. intra_associations_data = json.load(f)
  766. # ===== 提取节点 =====
  767. print("\n" + "="*60)
  768. print("正在提取节点...")
  769. all_nodes = []
  770. # 维度映射
  771. dimension_mapping = {
  772. "灵感点列表": "灵感点",
  773. "目的点": "目的点",
  774. "关键点列表": "关键点"
  775. }
  776. # 提取分类节点
  777. print("\n提取分类节点:")
  778. for dim_key, dim_name in dimension_mapping.items():
  779. category_nodes = extract_category_nodes_from_pattern(pattern_data, dim_key, dim_name)
  780. all_nodes.extend(category_nodes)
  781. print(f" {dim_name}: {len(category_nodes)} 个分类节点")
  782. # 提取标签节点
  783. print("\n提取标签节点:")
  784. for dim_key, dim_name in dimension_mapping.items():
  785. tag_nodes = extract_tag_nodes_from_pattern(pattern_data, dim_key, dim_name)
  786. all_nodes.extend(tag_nodes)
  787. print(f" {dim_name}: {len(tag_nodes)} 个标签节点")
  788. print(f"\n总计: {len(all_nodes)} 个节点")
  789. # 统计节点类型
  790. category_count = sum(1 for n in all_nodes if n["节点类型"] == "分类")
  791. tag_count = sum(1 for n in all_nodes if n["节点类型"] == "标签")
  792. print(f" 分类节点: {category_count}")
  793. print(f" 标签节点: {tag_count}")
  794. # ===== 提取边 =====
  795. print("\n" + "="*60)
  796. print("正在提取边...")
  797. all_edges = []
  798. # 提取分类-分类边(跨点共现)
  799. print("\n提取分类-分类边(跨点共现):")
  800. category_edges = extract_category_edges_from_associations(associations_data)
  801. all_edges.extend(category_edges)
  802. print(f" 分类共现(跨点)边: {len(category_edges)} 条")
  803. # 提取分类-分类边(点内共现)
  804. print("\n提取分类-分类边(点内共现):")
  805. intra_category_edges = extract_intra_category_edges(intra_associations_data)
  806. all_edges.extend(intra_category_edges)
  807. print(f" 分类共现(点内)边: {len(intra_category_edges)} 条")
  808. # 提取标签-分类边(属于/包含)
  809. print("\n提取标签-分类边(属于/包含):")
  810. belong_count = 0
  811. contain_count = 0
  812. for dim_key, dim_name in dimension_mapping.items():
  813. tag_category_edges = extract_tag_category_edges_from_pattern(pattern_data, dim_key, dim_name)
  814. all_edges.extend(tag_category_edges)
  815. dim_belong = sum(1 for e in tag_category_edges if e["边类型"] == "属于")
  816. dim_contain = sum(1 for e in tag_category_edges if e["边类型"] == "包含")
  817. belong_count += dim_belong
  818. contain_count += dim_contain
  819. print(f" {dim_name}: {dim_belong} 条属于边, {dim_contain} 条包含边")
  820. # 提取标签-标签边(共现)- 需要在过滤之前先记录排除的帖子ID
  821. # 这里先占位,过滤后再处理
  822. tag_cooccurrence_edges_placeholder = True
  823. print(f"\n边统计(标签共现待提取):")
  824. print(f" 分类共现(跨点)边: {len(category_edges)}")
  825. print(f" 分类共现(点内)边: {len(intra_category_edges)}")
  826. print(f" 属于边: {belong_count}")
  827. print(f" 包含边: {contain_count}")
  828. # ===== 应用过滤 =====
  829. exclude_post_ids = set()
  830. filter_mode = config.filter_mode
  831. if filter_mode == "exclude_current_posts":
  832. print("\n" + "="*60)
  833. print("应用过滤规则: 排除当前帖子ID")
  834. exclude_post_ids = get_current_post_ids(current_posts_dir)
  835. if exclude_post_ids:
  836. # 过滤节点
  837. nodes_before = len(all_nodes)
  838. all_nodes = filter_nodes_by_post_ids(all_nodes, exclude_post_ids)
  839. nodes_after = len(all_nodes)
  840. print(f"\n节点过滤: {nodes_before} -> {nodes_after} (移除 {nodes_before - nodes_after} 个)")
  841. # 过滤边
  842. edges_before = len(all_edges)
  843. all_edges = filter_edges_by_post_ids(all_edges, exclude_post_ids)
  844. edges_after = len(all_edges)
  845. print(f"边过滤: {edges_before} -> {edges_after} (移除 {edges_before - edges_after} 条)")
  846. elif filter_mode == "none":
  847. print("\n过滤模式: none,不应用任何过滤")
  848. else:
  849. print(f"\n警告: 未知的过滤模式 '{filter_mode}',不应用过滤")
  850. # ===== 提取标签-标签共现边 =====
  851. print("\n" + "="*60)
  852. print("提取标签-标签共现边...")
  853. historical_posts_dir = config.historical_posts_dir
  854. print(f"历史帖子目录: {historical_posts_dir}")
  855. tag_cooccurrence_edges = extract_tag_cooccurrence_edges(historical_posts_dir, exclude_post_ids)
  856. all_edges.extend(tag_cooccurrence_edges)
  857. print(f" 标签-标签共现边: {len(tag_cooccurrence_edges)} 条")
  858. # ===== 提取支撑边和关联意图边(新版数据结构)=====
  859. print("\n" + "="*60)
  860. print("提取支撑边和关联意图边(新版数据结构)...")
  861. support_edges, intent_edges = extract_support_and_intent_edges(historical_posts_dir, exclude_post_ids)
  862. all_edges.extend(support_edges)
  863. all_edges.extend(intent_edges)
  864. print(f" 支撑边: {len(support_edges)} 条")
  865. print(f" 关联意图边: {len(intent_edges)} 条")
  866. # 更新总计
  867. print(f"\n总计: {len(all_edges)} 条边")
  868. print(f" 分类共现(跨点)边: {len(category_edges)}")
  869. print(f" 分类共现(点内)边: {len(intra_category_edges)}")
  870. print(f" 标签共现边: {len(tag_cooccurrence_edges)}")
  871. print(f" 支撑边: {len(support_edges)}")
  872. print(f" 关联意图边: {len(intent_edges)}")
  873. print(f" 属于边: {belong_count}")
  874. print(f" 包含边: {contain_count}")
  875. # ===== 获取帖子详情 =====
  876. print("\n" + "="*60)
  877. print("获取帖子详情...")
  878. # 收集所有需要获取详情的帖子ID(从节点和边)
  879. post_ids_from_nodes = collect_all_post_ids_from_nodes(all_nodes)
  880. post_ids_from_edges = collect_all_post_ids_from_edges(all_edges)
  881. all_post_ids = post_ids_from_nodes | post_ids_from_edges
  882. print(f"节点中的帖子: {len(post_ids_from_nodes)} 个")
  883. print(f"边中的帖子: {len(post_ids_from_edges)} 个")
  884. print(f"合计(去重): {len(all_post_ids)} 个")
  885. # 批量获取帖子详情
  886. post_details = fetch_post_details(all_post_ids)
  887. # ===== 保存结果 =====
  888. print("\n" + "="*60)
  889. # 输出文件路径
  890. post_details_output_file = config.intermediate_dir / "帖子详情映射.json"
  891. # 保存节点列表
  892. nodes_output = {
  893. "说明": {
  894. "描述": "分类和标签节点列表",
  895. "数据来源": ["过去帖子_pattern聚合结果.json"],
  896. "过滤模式": filter_mode,
  897. "过滤帖子数": len(exclude_post_ids) if exclude_post_ids else 0
  898. },
  899. "节点列表": all_nodes
  900. }
  901. print(f"正在保存节点列表到: {nodes_output_file}")
  902. with open(nodes_output_file, "w", encoding="utf-8") as f:
  903. json.dump(nodes_output, f, ensure_ascii=False, indent=2)
  904. # 构建节点ID索引的边关系: 节点 -> 边类型 -> {目标节点: 完整边信息}
  905. edges_by_node = {} # key: 节点ID, value: {边类型: {目标节点ID: 完整边信息}}
  906. for edge in all_edges:
  907. source_id = edge["源节点ID"]
  908. target_id = edge["目标节点ID"]
  909. edge_type = edge["边类型"]
  910. # 源节点 -> 目标节点
  911. if source_id not in edges_by_node:
  912. edges_by_node[source_id] = {}
  913. if edge_type not in edges_by_node[source_id]:
  914. edges_by_node[source_id][edge_type] = {}
  915. edges_by_node[source_id][edge_type][target_id] = edge
  916. # 保存边关系
  917. edges_output = {
  918. "说明": {
  919. "描述": "分类和标签之间的边关系",
  920. "数据来源": ["过去帖子_pattern聚合结果.json", "dimension_associations_analysis.json", "过去帖子_what解构结果目录"],
  921. "过滤模式": filter_mode,
  922. "过滤帖子数": len(exclude_post_ids) if exclude_post_ids else 0
  923. },
  924. "边列表": all_edges,
  925. "节点边索引": edges_by_node
  926. }
  927. print(f"正在保存边关系到: {edges_output_file}")
  928. with open(edges_output_file, "w", encoding="utf-8") as f:
  929. json.dump(edges_output, f, ensure_ascii=False, indent=2)
  930. # 保存帖子详情映射
  931. post_details_output = {
  932. "说明": {
  933. "描述": "帖子ID到帖子详情的映射",
  934. "帖子数": len(post_details)
  935. },
  936. "帖子详情": post_details
  937. }
  938. print(f"正在保存帖子详情映射到: {post_details_output_file}")
  939. with open(post_details_output_file, "w", encoding="utf-8") as f:
  940. json.dump(post_details_output, f, ensure_ascii=False, indent=2)
  941. print("\n完成!")
  942. print(f"\n输出文件:")
  943. print(f" 节点列表: {len(all_nodes)} 个节点")
  944. print(f" 边关系: {len(all_edges)} 条边")
  945. print(f" 帖子详情映射: {len(post_details)} 个帖子")
# Script entry point: run the full node/edge extraction pipeline only when
# executed directly, not when this module is imported.
if __name__ == "__main__":
    main()