extract_nodes_and_edges.py 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
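"""
Extract the node list and the edge relations from the source data files.

Inputs:
1. 过去帖子_pattern聚合结果.json - category nodes, tag nodes and tag-category edges
2. 过去帖子_what解构结果 directory - per-post tags, used for tag-tag co-occurrence edges
3. dimension_associations_analysis.json - category-category edges (cross-dimension co-occurrence)
4. intra_dimension_associations_analysis.json - category-category edges (co-occurrence within a dimension)

Outputs:
1. 节点列表.json
2. 边关系.json
3. 帖子详情映射.json
"""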
import json
import re
import sys
from itertools import combinations
from pathlib import Path
from typing import Any, Dict, List, Optional, Set

# Add the project root directory to the import path
project_root = Path(__file__).parent.parent.parent
sys.path.insert(0, str(project_root))

from script.data_processing.path_config import PathConfig
from script.detail import get_xiaohongshu_detail
  23. def get_post_detail(post_id: str) -> Optional[Dict]:
  24. """获取帖子详情"""
  25. try:
  26. detail = get_xiaohongshu_detail(post_id)
  27. return detail
  28. except Exception as e:
  29. print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
  30. return None
  31. def get_last_segment(path: str) -> str:
  32. """获取路径的最后一段"""
  33. return path.split("/")[-1]
  34. def build_node_id(dimension: str, node_type: str, name: str) -> str:
  35. """
  36. 构建节点ID
  37. Args:
  38. dimension: 节点层级(灵感点、目的点、关键点)
  39. node_type: 节点类型(分类、标签)
  40. name: 节点名称
  41. Returns:
  42. 节点ID,格式: {层级}_{类型}_{名称}
  43. """
  44. return f"{dimension}_{node_type}_{name}"
  45. def extract_post_id_from_filename(filename: str) -> str:
  46. """从文件名中提取帖子ID"""
  47. match = re.match(r'^([^_]+)_', filename)
  48. if match:
  49. return match.group(1)
  50. return ""
  51. def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
  52. """
  53. 获取当前帖子目录中的所有帖子ID
  54. Args:
  55. current_posts_dir: 当前帖子目录路径
  56. Returns:
  57. 当前帖子ID集合
  58. """
  59. if not current_posts_dir.exists():
  60. print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
  61. return set()
  62. json_files = list(current_posts_dir.glob("*.json"))
  63. if not json_files:
  64. print(f"警告: 当前帖子目录为空: {current_posts_dir}")
  65. return set()
  66. print(f"找到 {len(json_files)} 个当前帖子")
  67. post_ids = set()
  68. for file_path in json_files:
  69. post_id = extract_post_id_from_filename(file_path.name)
  70. if post_id:
  71. post_ids.add(post_id)
  72. print(f"提取到 {len(post_ids)} 个帖子ID")
  73. return post_ids
  74. def collect_all_post_ids_from_nodes(nodes: List[Dict]) -> Set[str]:
  75. """从节点列表中收集所有帖子ID"""
  76. post_ids = set()
  77. for node in nodes:
  78. for source in node.get("节点来源", []):
  79. post_id = source.get("帖子ID", "")
  80. if post_id:
  81. post_ids.add(post_id)
  82. return post_ids
  83. def collect_all_post_ids_from_edges(edges: List[Dict]) -> Set[str]:
  84. """从边列表中收集所有帖子ID"""
  85. post_ids = set()
  86. for edge in edges:
  87. if edge.get("边类型") in ("分类共现(跨点)", "标签共现"):
  88. edge_details = edge.get("边详情", {})
  89. common_post_ids = edge_details.get("共同帖子ID", [])
  90. post_ids.update(common_post_ids)
  91. # 点内共现边不包含帖子ID
  92. return post_ids
  93. def fetch_post_details(post_ids: Set[str]) -> Dict[str, Dict]:
  94. """
  95. 批量获取帖子详情
  96. Args:
  97. post_ids: 帖子ID集合
  98. Returns:
  99. 帖子ID -> 帖子详情 的映射
  100. """
  101. print(f"\n正在获取 {len(post_ids)} 个帖子的详情...")
  102. post_details = {}
  103. for i, post_id in enumerate(sorted(post_ids), 1):
  104. print(f" [{i}/{len(post_ids)}] 获取帖子 {post_id} 的详情...")
  105. detail = get_post_detail(post_id)
  106. if detail:
  107. post_details[post_id] = detail
  108. print(f"成功获取 {len(post_details)} 个帖子详情")
  109. return post_details
  110. def filter_nodes_by_post_ids(nodes: List[Dict], exclude_post_ids: Set[str]) -> List[Dict]:
  111. """
  112. 过滤节点,排除指定帖子ID的来源
  113. Args:
  114. nodes: 节点列表
  115. exclude_post_ids: 要排除的帖子ID集合
  116. Returns:
  117. 过滤后的节点列表
  118. """
  119. filtered_nodes = []
  120. for node in nodes:
  121. # 过滤节点来源
  122. filtered_sources = [
  123. source for source in node.get("节点来源", [])
  124. if source.get("帖子ID", "") not in exclude_post_ids
  125. ]
  126. # 只保留有来源的节点
  127. if filtered_sources:
  128. node_copy = node.copy()
  129. node_copy["节点来源"] = filtered_sources
  130. # 重新计算帖子数
  131. unique_post_ids = set(s.get("帖子ID", "") for s in filtered_sources if s.get("帖子ID"))
  132. node_copy["帖子数"] = len(unique_post_ids)
  133. filtered_nodes.append(node_copy)
  134. return filtered_nodes
  135. def filter_edges_by_post_ids(edges: List[Dict], exclude_post_ids: Set[str]) -> List[Dict]:
  136. """
  137. 过滤边,排除指定帖子ID的共现边
  138. Args:
  139. edges: 边列表
  140. exclude_post_ids: 要排除的帖子ID集合
  141. Returns:
  142. 过滤后的边列表
  143. """
  144. filtered_edges = []
  145. for edge in edges:
  146. edge_type = edge["边类型"]
  147. if edge_type in ("分类共现(跨点)", "标签共现"):
  148. # 过滤共同帖子ID
  149. edge_details = edge.get("边详情", {})
  150. common_post_ids = edge_details.get("共同帖子ID", [])
  151. filtered_post_ids = [pid for pid in common_post_ids if pid not in exclude_post_ids]
  152. if filtered_post_ids:
  153. edge_copy = edge.copy()
  154. edge_copy["边详情"] = edge_details.copy()
  155. edge_copy["边详情"]["共同帖子ID"] = filtered_post_ids
  156. edge_copy["边详情"]["共同帖子数"] = len(filtered_post_ids)
  157. filtered_edges.append(edge_copy)
  158. elif edge_type == "分类共现(点内)":
  159. # 点内共现边不涉及帖子ID,直接保留
  160. filtered_edges.append(edge)
  161. else:
  162. # 属于/包含边不需要过滤
  163. filtered_edges.append(edge)
  164. return filtered_edges
  165. # ========== 分类节点提取 ==========
  166. def extract_category_nodes_from_pattern(
  167. pattern_data: Dict,
  168. dimension_key: str,
  169. dimension_name: str
  170. ) -> List[Dict]:
  171. """
  172. 从pattern聚合结果中提取分类节点
  173. Args:
  174. pattern_data: pattern聚合数据
  175. dimension_key: 维度键名(灵感点列表、目的点、关键点列表)
  176. dimension_name: 维度名称(灵感点、目的点、关键点)
  177. Returns:
  178. 分类节点列表
  179. """
  180. nodes = []
  181. if dimension_key not in pattern_data:
  182. return nodes
  183. def collect_sources_recursively(node: Dict) -> List[Dict]:
  184. """递归收集节点及其所有子节点的特征来源"""
  185. sources = []
  186. # 收集当前节点的特征
  187. if "特征列表" in node:
  188. for feature in node["特征列表"]:
  189. source = {
  190. "点的名称": feature.get("所属点", ""),
  191. "点的描述": feature.get("点描述", ""),
  192. "帖子ID": feature.get("帖子id", "")
  193. }
  194. sources.append(source)
  195. # 递归收集子节点的特征
  196. for key, value in node.items():
  197. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  198. continue
  199. if isinstance(value, dict):
  200. sources.extend(collect_sources_recursively(value))
  201. return sources
  202. def traverse_node(node: Dict, parent_categories: List[str]):
  203. """递归遍历节点"""
  204. for key, value in node.items():
  205. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  206. continue
  207. if isinstance(value, dict):
  208. # 当前节点是一个分类
  209. current_path = parent_categories + [key]
  210. # 获取帖子列表
  211. post_ids = value.get("帖子列表", [])
  212. # 构建节点来源(从特征列表中获取,如果没有则递归收集子分类的)
  213. node_sources = []
  214. if "特征列表" in value:
  215. for feature in value["特征列表"]:
  216. source = {
  217. "点的名称": feature.get("所属点", ""),
  218. "点的描述": feature.get("点描述", ""),
  219. "帖子ID": feature.get("帖子id", "")
  220. }
  221. node_sources.append(source)
  222. else:
  223. # 没有直接特征,递归收集子分类的特征来源
  224. node_sources = collect_sources_recursively(value)
  225. node_info = {
  226. "节点ID": build_node_id(dimension_name, "分类", key),
  227. "节点名称": key,
  228. "节点类型": "分类",
  229. "节点层级": dimension_name,
  230. "所属分类": parent_categories.copy(),
  231. "帖子数": len(post_ids) if post_ids else len(set(s.get("帖子ID", "") for s in node_sources if s.get("帖子ID"))),
  232. "节点来源": node_sources
  233. }
  234. nodes.append(node_info)
  235. # 递归处理子节点
  236. traverse_node(value, current_path)
  237. traverse_node(pattern_data[dimension_key], [])
  238. return nodes
  239. # ========== 标签节点提取 ==========
  240. def extract_tag_nodes_from_pattern(
  241. pattern_data: Dict,
  242. dimension_key: str,
  243. dimension_name: str
  244. ) -> List[Dict]:
  245. """
  246. 从pattern聚合结果中提取标签节点
  247. Args:
  248. pattern_data: pattern聚合数据
  249. dimension_key: 维度键名
  250. dimension_name: 维度名称
  251. Returns:
  252. 标签节点列表
  253. """
  254. nodes = []
  255. tag_map = {} # 用于合并同名标签
  256. if dimension_key not in pattern_data:
  257. return nodes
  258. def traverse_node(node: Dict, parent_categories: List[str]):
  259. """递归遍历节点"""
  260. # 处理特征列表(标签)
  261. if "特征列表" in node:
  262. for feature in node["特征列表"]:
  263. tag_name = feature.get("特征名称", "")
  264. if not tag_name:
  265. continue
  266. source = {
  267. "点的名称": feature.get("所属点", ""),
  268. "点的描述": feature.get("点描述", ""),
  269. "帖子ID": feature.get("帖子id", "")
  270. }
  271. tag_id = build_node_id(dimension_name, "标签", tag_name)
  272. if tag_id not in tag_map:
  273. tag_map[tag_id] = {
  274. "节点ID": tag_id,
  275. "节点名称": tag_name,
  276. "节点类型": "标签",
  277. "节点层级": dimension_name,
  278. "所属分类": parent_categories.copy(),
  279. "帖子数": 0,
  280. "节点来源": [],
  281. "_post_ids": set()
  282. }
  283. tag_map[tag_id]["节点来源"].append(source)
  284. if source["帖子ID"]:
  285. tag_map[tag_id]["_post_ids"].add(source["帖子ID"])
  286. # 递归处理子节点
  287. for key, value in node.items():
  288. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  289. continue
  290. if isinstance(value, dict):
  291. current_path = parent_categories + [key]
  292. traverse_node(value, current_path)
  293. traverse_node(pattern_data[dimension_key], [])
  294. # 转换为列表,计算帖子数
  295. for tag_id, tag_info in tag_map.items():
  296. tag_info["帖子数"] = len(tag_info["_post_ids"])
  297. del tag_info["_post_ids"]
  298. nodes.append(tag_info)
  299. return nodes
  300. # ========== 标签-分类边提取 ==========
  301. def extract_tag_category_edges_from_pattern(
  302. pattern_data: Dict,
  303. dimension_key: str,
  304. dimension_name: str
  305. ) -> List[Dict]:
  306. """
  307. 从pattern聚合结果中提取标签-分类边(属于/包含)
  308. Args:
  309. pattern_data: pattern聚合数据
  310. dimension_key: 维度键名
  311. dimension_name: 维度名称
  312. Returns:
  313. 边列表
  314. """
  315. edges = []
  316. seen_edges = set() # 避免重复边
  317. if dimension_key not in pattern_data:
  318. return edges
  319. def traverse_node(node: Dict, parent_categories: List[str]):
  320. """递归遍历节点"""
  321. current_category = parent_categories[-1] if parent_categories else None
  322. # 处理特征列表(标签)
  323. if "特征列表" in node and current_category:
  324. for feature in node["特征列表"]:
  325. tag_name = feature.get("特征名称", "")
  326. if not tag_name:
  327. continue
  328. tag_id = build_node_id(dimension_name, "标签", tag_name)
  329. category_id = build_node_id(dimension_name, "分类", current_category)
  330. # 属于边:标签 -> 分类
  331. edge_key_belong = (tag_id, category_id, "属于")
  332. if edge_key_belong not in seen_edges:
  333. seen_edges.add(edge_key_belong)
  334. edges.append({
  335. "源节点ID": tag_id,
  336. "目标节点ID": category_id,
  337. "边类型": "属于",
  338. "边详情": {}
  339. })
  340. # 包含边:分类 -> 标签
  341. edge_key_contain = (category_id, tag_id, "包含")
  342. if edge_key_contain not in seen_edges:
  343. seen_edges.add(edge_key_contain)
  344. edges.append({
  345. "源节点ID": category_id,
  346. "目标节点ID": tag_id,
  347. "边类型": "包含",
  348. "边详情": {}
  349. })
  350. # 递归处理子节点
  351. for key, value in node.items():
  352. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  353. continue
  354. if isinstance(value, dict):
  355. current_path = parent_categories + [key]
  356. traverse_node(value, current_path)
  357. traverse_node(pattern_data[dimension_key], [])
  358. return edges
  359. # ========== 标签-标签共现边提取 ==========
  360. def extract_tags_from_post(post_data: Dict) -> Dict[str, List[str]]:
  361. """
  362. 从单个帖子的解构结果中提取所有标签(特征名称)
  363. Args:
  364. post_data: 帖子解构数据
  365. Returns:
  366. 按维度分组的标签字典 {"灵感点": [...], "目的点": [...], "关键点": [...]}
  367. """
  368. tags_by_dimension = {
  369. "灵感点": [],
  370. "目的点": [],
  371. "关键点": []
  372. }
  373. if "三点解构" not in post_data:
  374. return tags_by_dimension
  375. three_points = post_data["三点解构"]
  376. # 提取灵感点的特征
  377. if "灵感点" in three_points:
  378. inspiration = three_points["灵感点"]
  379. for section in ["全新内容", "共性差异", "共性内容"]:
  380. if section in inspiration and isinstance(inspiration[section], list):
  381. for item in inspiration[section]:
  382. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  383. for feature in item["提取的特征"]:
  384. tag_name = feature.get("特征名称", "")
  385. if tag_name:
  386. tags_by_dimension["灵感点"].append(tag_name)
  387. # 提取目的点的特征
  388. if "目的点" in three_points:
  389. purpose = three_points["目的点"]
  390. if "purposes" in purpose and isinstance(purpose["purposes"], list):
  391. for item in purpose["purposes"]:
  392. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  393. for feature in item["提取的特征"]:
  394. tag_name = feature.get("特征名称", "")
  395. if tag_name:
  396. tags_by_dimension["目的点"].append(tag_name)
  397. # 提取关键点的特征
  398. if "关键点" in three_points:
  399. key_points = three_points["关键点"]
  400. if "key_points" in key_points and isinstance(key_points["key_points"], list):
  401. for item in key_points["key_points"]:
  402. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  403. for feature in item["提取的特征"]:
  404. tag_name = feature.get("特征名称", "")
  405. if tag_name:
  406. tags_by_dimension["关键点"].append(tag_name)
  407. return tags_by_dimension
  408. def extract_tag_cooccurrence_edges(historical_posts_dir: Path, exclude_post_ids: Set[str] = None) -> List[Dict]:
  409. """
  410. 从历史帖子解构结果中提取标签-标签共现边
  411. Args:
  412. historical_posts_dir: 历史帖子解构结果目录
  413. exclude_post_ids: 要排除的帖子ID集合
  414. Returns:
  415. 标签共现边列表
  416. """
  417. if exclude_post_ids is None:
  418. exclude_post_ids = set()
  419. # 存储每对标签的共现信息
  420. # key: (tag1_id, tag2_id), value: {"共同帖子ID": set()}
  421. cooccurrence_map = {}
  422. if not historical_posts_dir.exists():
  423. print(f"警告: 历史帖子目录不存在: {historical_posts_dir}")
  424. return []
  425. json_files = list(historical_posts_dir.glob("*.json"))
  426. print(f"找到 {len(json_files)} 个历史帖子文件")
  427. for file_path in json_files:
  428. # 提取帖子ID
  429. post_id = extract_post_id_from_filename(file_path.name)
  430. if not post_id:
  431. continue
  432. # 跳过排除的帖子
  433. if post_id in exclude_post_ids:
  434. continue
  435. try:
  436. with open(file_path, "r", encoding="utf-8") as f:
  437. post_data = json.load(f)
  438. # 提取该帖子的所有标签
  439. tags_by_dimension = extract_tags_from_post(post_data)
  440. # 对每个维度内的标签两两组合,构建共现关系
  441. for dimension, tags in tags_by_dimension.items():
  442. unique_tags = list(set(tags)) # 去重
  443. for i in range(len(unique_tags)):
  444. for j in range(i + 1, len(unique_tags)):
  445. tag1 = unique_tags[i]
  446. tag2 = unique_tags[j]
  447. # 构建节点ID
  448. tag1_id = build_node_id(dimension, "标签", tag1)
  449. tag2_id = build_node_id(dimension, "标签", tag2)
  450. # 确保顺序一致(按字典序)
  451. if tag1_id > tag2_id:
  452. tag1_id, tag2_id = tag2_id, tag1_id
  453. key = (tag1_id, tag2_id, dimension)
  454. if key not in cooccurrence_map:
  455. cooccurrence_map[key] = {"共同帖子ID": set()}
  456. cooccurrence_map[key]["共同帖子ID"].add(post_id)
  457. except Exception as e:
  458. print(f" 警告: 处理文件 {file_path.name} 时出错: {e}")
  459. # 转换为边列表
  460. edges = []
  461. for (tag1_id, tag2_id, dimension), info in cooccurrence_map.items():
  462. common_post_ids = list(info["共同帖子ID"])
  463. edge = {
  464. "源节点ID": tag1_id,
  465. "目标节点ID": tag2_id,
  466. "边类型": "标签共现",
  467. "边详情": {
  468. "共同帖子数": len(common_post_ids),
  469. "共同帖子ID": common_post_ids
  470. }
  471. }
  472. edges.append(edge)
  473. return edges
  474. # ========== 分类-分类边提取 ==========
  475. def extract_category_edges_from_associations(associations_data: Dict) -> List[Dict]:
  476. """
  477. 从dimension_associations_analysis.json中提取分类-分类边(共现)
  478. Args:
  479. associations_data: 关联分析数据
  480. Returns:
  481. 边列表
  482. """
  483. edges = []
  484. if "单维度关联分析" not in associations_data:
  485. return edges
  486. single_dim = associations_data["单维度关联分析"]
  487. # 维度映射
  488. dimension_map = {
  489. "灵感点维度": "灵感点",
  490. "目的点维度": "目的点",
  491. "关键点维度": "关键点"
  492. }
  493. for dim_key, dim_data in single_dim.items():
  494. if dim_key not in dimension_map:
  495. continue
  496. source_dimension = dimension_map[dim_key]
  497. # 遍历该维度下的所有关联方向
  498. for direction_key, direction_data in dim_data.items():
  499. if direction_key == "说明":
  500. continue
  501. if "→" not in direction_key:
  502. continue
  503. # 遍历每个源分类
  504. for source_path, source_info in direction_data.items():
  505. source_name = get_last_segment(source_path)
  506. source_node_id = build_node_id(source_dimension, "分类", source_name)
  507. # 确定目标维度
  508. for field_name, associations in source_info.items():
  509. if not field_name.startswith("与") or not field_name.endswith("的关联"):
  510. continue
  511. target_dimension = field_name[1:-3]
  512. if not isinstance(associations, list):
  513. continue
  514. for assoc in associations:
  515. target_path = assoc.get("目标分类", "")
  516. if not target_path:
  517. continue
  518. target_name = get_last_segment(target_path)
  519. target_node_id = build_node_id(target_dimension, "分类", target_name)
  520. edge = {
  521. "源节点ID": source_node_id,
  522. "目标节点ID": target_node_id,
  523. "边类型": "分类共现(跨点)",
  524. "边详情": {
  525. "Jaccard相似度": assoc.get("Jaccard相似度", 0),
  526. "重叠系数": assoc.get("重叠系数", 0),
  527. "共同帖子数": assoc.get("共同帖子数", 0),
  528. "共同帖子ID": assoc.get("共同帖子ID", [])
  529. }
  530. }
  531. edges.append(edge)
  532. return edges
  533. # ========== 点内分类共现边提取 ==========
  534. def extract_intra_category_edges(intra_associations_data: Dict) -> List[Dict]:
  535. """
  536. 从intra_dimension_associations_analysis.json中提取点内分类共现边
  537. Args:
  538. intra_associations_data: 点内关联分析数据
  539. Returns:
  540. 边列表
  541. """
  542. edges = []
  543. seen_edges = set() # 避免重复边
  544. if "叶子分类组合聚类" not in intra_associations_data:
  545. return edges
  546. clusters_by_dim = intra_associations_data["叶子分类组合聚类"]
  547. for dimension, clusters in clusters_by_dim.items():
  548. if dimension not in ("灵感点", "目的点", "关键点"):
  549. continue
  550. for cluster_key, cluster_data in clusters.items():
  551. leaf_categories = cluster_data.get("叶子分类组合", [])
  552. point_count = cluster_data.get("点数", 0)
  553. point_details = cluster_data.get("点详情列表", [])
  554. # 提取点名称列表
  555. point_names = [p.get("点名称", "") for p in point_details if p.get("点名称")]
  556. # 两两组合生成共现边
  557. for i in range(len(leaf_categories)):
  558. for j in range(i + 1, len(leaf_categories)):
  559. cat1 = leaf_categories[i]
  560. cat2 = leaf_categories[j]
  561. # 构建节点ID
  562. cat1_id = build_node_id(dimension, "分类", cat1)
  563. cat2_id = build_node_id(dimension, "分类", cat2)
  564. # 确保顺序一致(按字典序)
  565. if cat1_id > cat2_id:
  566. cat1_id, cat2_id = cat2_id, cat1_id
  567. edge_key = (cat1_id, cat2_id, dimension)
  568. if edge_key in seen_edges:
  569. # 已存在的边,累加点数和点名称
  570. for edge in edges:
  571. if (edge["源节点ID"] == cat1_id and
  572. edge["目标节点ID"] == cat2_id and
  573. edge["边类型"] == "分类共现(点内)"):
  574. edge["边详情"]["点数"] += point_count
  575. edge["边详情"]["关联点名称"].extend(point_names)
  576. break
  577. else:
  578. seen_edges.add(edge_key)
  579. edge = {
  580. "源节点ID": cat1_id,
  581. "目标节点ID": cat2_id,
  582. "边类型": "分类共现(点内)",
  583. "边详情": {
  584. "点数": point_count,
  585. "关联点名称": point_names.copy()
  586. }
  587. }
  588. edges.append(edge)
  589. return edges
  590. # ========== 主函数 ==========
  591. def main():
  592. # 使用路径配置
  593. config = PathConfig()
  594. config.ensure_dirs()
  595. print(f"账号: {config.account_name}")
  596. print(f"输出版本: {config.output_version}")
  597. print(f"过滤模式: {config.filter_mode}")
  598. print()
  599. # 输入文件路径
  600. pattern_file = config.pattern_cluster_file
  601. associations_file = config.account_dir / "pattern相关文件/optimization/dimension_associations_analysis.json"
  602. intra_associations_file = config.account_dir / "pattern相关文件/optimization/intra_dimension_associations_analysis.json"
  603. current_posts_dir = config.current_posts_dir
  604. # 输出文件路径
  605. nodes_output_file = config.intermediate_dir / "节点列表.json"
  606. edges_output_file = config.intermediate_dir / "边关系.json"
  607. print(f"输入文件:")
  608. print(f" pattern聚合文件: {pattern_file}")
  609. print(f" 跨点关联分析文件: {associations_file}")
  610. print(f" 点内关联分析文件: {intra_associations_file}")
  611. print(f" 当前帖子目录: {current_posts_dir}")
  612. print(f"\n输出文件:")
  613. print(f" 节点列表: {nodes_output_file}")
  614. print(f" 边关系: {edges_output_file}")
  615. print()
  616. # 读取pattern聚合结果
  617. print("正在读取pattern聚合结果...")
  618. with open(pattern_file, "r", encoding="utf-8") as f:
  619. pattern_data = json.load(f)
  620. # 读取跨点关联分析结果
  621. print("正在读取跨点关联分析结果...")
  622. with open(associations_file, "r", encoding="utf-8") as f:
  623. associations_data = json.load(f)
  624. # 读取点内关联分析结果
  625. print("正在读取点内关联分析结果...")
  626. with open(intra_associations_file, "r", encoding="utf-8") as f:
  627. intra_associations_data = json.load(f)
  628. # ===== 提取节点 =====
  629. print("\n" + "="*60)
  630. print("正在提取节点...")
  631. all_nodes = []
  632. # 维度映射
  633. dimension_mapping = {
  634. "灵感点列表": "灵感点",
  635. "目的点": "目的点",
  636. "关键点列表": "关键点"
  637. }
  638. # 提取分类节点
  639. print("\n提取分类节点:")
  640. for dim_key, dim_name in dimension_mapping.items():
  641. category_nodes = extract_category_nodes_from_pattern(pattern_data, dim_key, dim_name)
  642. all_nodes.extend(category_nodes)
  643. print(f" {dim_name}: {len(category_nodes)} 个分类节点")
  644. # 提取标签节点
  645. print("\n提取标签节点:")
  646. for dim_key, dim_name in dimension_mapping.items():
  647. tag_nodes = extract_tag_nodes_from_pattern(pattern_data, dim_key, dim_name)
  648. all_nodes.extend(tag_nodes)
  649. print(f" {dim_name}: {len(tag_nodes)} 个标签节点")
  650. print(f"\n总计: {len(all_nodes)} 个节点")
  651. # 统计节点类型
  652. category_count = sum(1 for n in all_nodes if n["节点类型"] == "分类")
  653. tag_count = sum(1 for n in all_nodes if n["节点类型"] == "标签")
  654. print(f" 分类节点: {category_count}")
  655. print(f" 标签节点: {tag_count}")
  656. # ===== 提取边 =====
  657. print("\n" + "="*60)
  658. print("正在提取边...")
  659. all_edges = []
  660. # 提取分类-分类边(跨点共现)
  661. print("\n提取分类-分类边(跨点共现):")
  662. category_edges = extract_category_edges_from_associations(associations_data)
  663. all_edges.extend(category_edges)
  664. print(f" 分类共现(跨点)边: {len(category_edges)} 条")
  665. # 提取分类-分类边(点内共现)
  666. print("\n提取分类-分类边(点内共现):")
  667. intra_category_edges = extract_intra_category_edges(intra_associations_data)
  668. all_edges.extend(intra_category_edges)
  669. print(f" 分类共现(点内)边: {len(intra_category_edges)} 条")
  670. # 提取标签-分类边(属于/包含)
  671. print("\n提取标签-分类边(属于/包含):")
  672. belong_count = 0
  673. contain_count = 0
  674. for dim_key, dim_name in dimension_mapping.items():
  675. tag_category_edges = extract_tag_category_edges_from_pattern(pattern_data, dim_key, dim_name)
  676. all_edges.extend(tag_category_edges)
  677. dim_belong = sum(1 for e in tag_category_edges if e["边类型"] == "属于")
  678. dim_contain = sum(1 for e in tag_category_edges if e["边类型"] == "包含")
  679. belong_count += dim_belong
  680. contain_count += dim_contain
  681. print(f" {dim_name}: {dim_belong} 条属于边, {dim_contain} 条包含边")
  682. # 提取标签-标签边(共现)- 需要在过滤之前先记录排除的帖子ID
  683. # 这里先占位,过滤后再处理
  684. tag_cooccurrence_edges_placeholder = True
  685. print(f"\n边统计(标签共现待提取):")
  686. print(f" 分类共现(跨点)边: {len(category_edges)}")
  687. print(f" 分类共现(点内)边: {len(intra_category_edges)}")
  688. print(f" 属于边: {belong_count}")
  689. print(f" 包含边: {contain_count}")
  690. # ===== 应用过滤 =====
  691. exclude_post_ids = set()
  692. filter_mode = config.filter_mode
  693. if filter_mode == "exclude_current_posts":
  694. print("\n" + "="*60)
  695. print("应用过滤规则: 排除当前帖子ID")
  696. exclude_post_ids = get_current_post_ids(current_posts_dir)
  697. if exclude_post_ids:
  698. # 过滤节点
  699. nodes_before = len(all_nodes)
  700. all_nodes = filter_nodes_by_post_ids(all_nodes, exclude_post_ids)
  701. nodes_after = len(all_nodes)
  702. print(f"\n节点过滤: {nodes_before} -> {nodes_after} (移除 {nodes_before - nodes_after} 个)")
  703. # 过滤边
  704. edges_before = len(all_edges)
  705. all_edges = filter_edges_by_post_ids(all_edges, exclude_post_ids)
  706. edges_after = len(all_edges)
  707. print(f"边过滤: {edges_before} -> {edges_after} (移除 {edges_before - edges_after} 条)")
  708. elif filter_mode == "none":
  709. print("\n过滤模式: none,不应用任何过滤")
  710. else:
  711. print(f"\n警告: 未知的过滤模式 '{filter_mode}',不应用过滤")
  712. # ===== 提取标签-标签共现边 =====
  713. print("\n" + "="*60)
  714. print("提取标签-标签共现边...")
  715. historical_posts_dir = config.historical_posts_dir
  716. print(f"历史帖子目录: {historical_posts_dir}")
  717. tag_cooccurrence_edges = extract_tag_cooccurrence_edges(historical_posts_dir, exclude_post_ids)
  718. all_edges.extend(tag_cooccurrence_edges)
  719. print(f" 标签-标签共现边: {len(tag_cooccurrence_edges)} 条")
  720. # 更新总计
  721. print(f"\n总计: {len(all_edges)} 条边")
  722. print(f" 分类共现(跨点)边: {len(category_edges)}")
  723. print(f" 分类共现(点内)边: {len(intra_category_edges)}")
  724. print(f" 标签共现边: {len(tag_cooccurrence_edges)}")
  725. print(f" 属于边: {belong_count}")
  726. print(f" 包含边: {contain_count}")
  727. # ===== 获取帖子详情 =====
  728. print("\n" + "="*60)
  729. print("获取帖子详情...")
  730. # 收集所有需要获取详情的帖子ID(从节点和边)
  731. post_ids_from_nodes = collect_all_post_ids_from_nodes(all_nodes)
  732. post_ids_from_edges = collect_all_post_ids_from_edges(all_edges)
  733. all_post_ids = post_ids_from_nodes | post_ids_from_edges
  734. print(f"节点中的帖子: {len(post_ids_from_nodes)} 个")
  735. print(f"边中的帖子: {len(post_ids_from_edges)} 个")
  736. print(f"合计(去重): {len(all_post_ids)} 个")
  737. # 批量获取帖子详情
  738. post_details = fetch_post_details(all_post_ids)
  739. # ===== 保存结果 =====
  740. print("\n" + "="*60)
  741. # 输出文件路径
  742. post_details_output_file = config.intermediate_dir / "帖子详情映射.json"
  743. # 保存节点列表
  744. nodes_output = {
  745. "说明": {
  746. "描述": "分类和标签节点列表",
  747. "数据来源": ["过去帖子_pattern聚合结果.json"],
  748. "过滤模式": filter_mode,
  749. "过滤帖子数": len(exclude_post_ids) if exclude_post_ids else 0
  750. },
  751. "节点列表": all_nodes
  752. }
  753. print(f"正在保存节点列表到: {nodes_output_file}")
  754. with open(nodes_output_file, "w", encoding="utf-8") as f:
  755. json.dump(nodes_output, f, ensure_ascii=False, indent=2)
  756. # 构建节点ID索引的边关系: 节点 -> 边类型 -> {目标节点: 完整边信息}
  757. edges_by_node = {} # key: 节点ID, value: {边类型: {目标节点ID: 完整边信息}}
  758. for edge in all_edges:
  759. source_id = edge["源节点ID"]
  760. target_id = edge["目标节点ID"]
  761. edge_type = edge["边类型"]
  762. # 源节点 -> 目标节点
  763. if source_id not in edges_by_node:
  764. edges_by_node[source_id] = {}
  765. if edge_type not in edges_by_node[source_id]:
  766. edges_by_node[source_id][edge_type] = {}
  767. edges_by_node[source_id][edge_type][target_id] = edge
  768. # 保存边关系
  769. edges_output = {
  770. "说明": {
  771. "描述": "分类和标签之间的边关系",
  772. "数据来源": ["过去帖子_pattern聚合结果.json", "dimension_associations_analysis.json", "过去帖子_what解构结果目录"],
  773. "过滤模式": filter_mode,
  774. "过滤帖子数": len(exclude_post_ids) if exclude_post_ids else 0
  775. },
  776. "边列表": all_edges,
  777. "节点边索引": edges_by_node
  778. }
  779. print(f"正在保存边关系到: {edges_output_file}")
  780. with open(edges_output_file, "w", encoding="utf-8") as f:
  781. json.dump(edges_output, f, ensure_ascii=False, indent=2)
  782. # 保存帖子详情映射
  783. post_details_output = {
  784. "说明": {
  785. "描述": "帖子ID到帖子详情的映射",
  786. "帖子数": len(post_details)
  787. },
  788. "帖子详情": post_details
  789. }
  790. print(f"正在保存帖子详情映射到: {post_details_output_file}")
  791. with open(post_details_output_file, "w", encoding="utf-8") as f:
  792. json.dump(post_details_output, f, ensure_ascii=False, indent=2)
  793. print("\n完成!")
  794. print(f"\n输出文件:")
  795. print(f" 节点列表: {len(all_nodes)} 个节点")
  796. print(f" 边关系: {len(all_edges)} 条边")
  797. print(f" 帖子详情映射: {len(post_details)} 个帖子")
  798. if __name__ == "__main__":
  799. main()