  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 从源数据文件中提取节点列表和边关系
  5. 输入:
  6. 1. 过去帖子_pattern聚合结果.json - 分类节点、标签-分类边
  7. 2. 过去帖子_what解构结果目录 - 标签节点来源
  8. 3. dimension_associations_analysis.json - 分类-分类边(共现)
  9. 输出:
  10. 1. 节点列表.json
  11. 2. 边关系.json
  12. """
  13. import json
  14. from pathlib import Path
  15. from typing import Dict, List, Any, Set, Optional
  16. import sys
  17. import re
  18. # 添加项目根目录到路径
  19. project_root = Path(__file__).parent.parent.parent
  20. sys.path.insert(0, str(project_root))
  21. from script.data_processing.path_config import PathConfig
  22. from script.detail import get_xiaohongshu_detail
  23. def get_post_detail(post_id: str) -> Optional[Dict]:
  24. """获取帖子详情"""
  25. try:
  26. detail = get_xiaohongshu_detail(post_id)
  27. return detail
  28. except Exception as e:
  29. print(f" 警告: 获取帖子 {post_id} 详情失败: {e}")
  30. return None
  31. def get_last_segment(path: str) -> str:
  32. """获取路径的最后一段"""
  33. return path.split("/")[-1]
  34. def build_node_id(dimension: str, node_type: str, name: str) -> str:
  35. """
  36. 构建节点ID
  37. Args:
  38. dimension: 节点层级(灵感点、目的点、关键点)
  39. node_type: 节点类型(分类、标签)
  40. name: 节点名称
  41. Returns:
  42. 节点ID,格式: {层级}_{类型}_{名称}
  43. """
  44. return f"{dimension}_{node_type}_{name}"
  45. def extract_post_id_from_filename(filename: str) -> str:
  46. """从文件名中提取帖子ID"""
  47. match = re.match(r'^([^_]+)_', filename)
  48. if match:
  49. return match.group(1)
  50. return ""
  51. def get_current_post_ids(current_posts_dir: Path) -> Set[str]:
  52. """
  53. 获取当前帖子目录中的所有帖子ID
  54. Args:
  55. current_posts_dir: 当前帖子目录路径
  56. Returns:
  57. 当前帖子ID集合
  58. """
  59. if not current_posts_dir.exists():
  60. print(f"警告: 当前帖子目录不存在: {current_posts_dir}")
  61. return set()
  62. json_files = list(current_posts_dir.glob("*.json"))
  63. if not json_files:
  64. print(f"警告: 当前帖子目录为空: {current_posts_dir}")
  65. return set()
  66. print(f"找到 {len(json_files)} 个当前帖子")
  67. post_ids = set()
  68. for file_path in json_files:
  69. post_id = extract_post_id_from_filename(file_path.name)
  70. if post_id:
  71. post_ids.add(post_id)
  72. print(f"提取到 {len(post_ids)} 个帖子ID")
  73. return post_ids
  74. def collect_all_post_ids_from_nodes(nodes: List[Dict]) -> Set[str]:
  75. """从节点列表中收集所有帖子ID"""
  76. post_ids = set()
  77. for node in nodes:
  78. for source in node.get("节点来源", []):
  79. post_id = source.get("帖子ID", "")
  80. if post_id:
  81. post_ids.add(post_id)
  82. return post_ids
  83. def collect_all_post_ids_from_edges(edges: List[Dict]) -> Set[str]:
  84. """从边列表中收集所有帖子ID"""
  85. post_ids = set()
  86. for edge in edges:
  87. if edge.get("边类型") in ("分类共现(跨点)", "标签共现"):
  88. edge_details = edge.get("边详情", {})
  89. common_post_ids = edge_details.get("共同帖子ID", [])
  90. post_ids.update(common_post_ids)
  91. # 点内共现边不包含帖子ID
  92. return post_ids
  93. def fetch_post_details(post_ids: Set[str]) -> Dict[str, Dict]:
  94. """
  95. 批量获取帖子详情
  96. Args:
  97. post_ids: 帖子ID集合
  98. Returns:
  99. 帖子ID -> 帖子详情 的映射
  100. """
  101. print(f"\n正在获取 {len(post_ids)} 个帖子的详情...")
  102. post_details = {}
  103. for i, post_id in enumerate(sorted(post_ids), 1):
  104. print(f" [{i}/{len(post_ids)}] 获取帖子 {post_id} 的详情...")
  105. detail = get_post_detail(post_id)
  106. if detail:
  107. post_details[post_id] = detail
  108. print(f"成功获取 {len(post_details)} 个帖子详情")
  109. return post_details
  110. def filter_nodes_by_post_ids(nodes: List[Dict], exclude_post_ids: Set[str]) -> List[Dict]:
  111. """
  112. 过滤节点,排除指定帖子ID的来源
  113. Args:
  114. nodes: 节点列表
  115. exclude_post_ids: 要排除的帖子ID集合
  116. Returns:
  117. 过滤后的节点列表
  118. """
  119. filtered_nodes = []
  120. for node in nodes:
  121. # 过滤节点来源
  122. filtered_sources = [
  123. source for source in node.get("节点来源", [])
  124. if source.get("帖子ID", "") not in exclude_post_ids
  125. ]
  126. # 只保留有来源的节点
  127. if filtered_sources:
  128. node_copy = node.copy()
  129. node_copy["节点来源"] = filtered_sources
  130. # 重新计算帖子数
  131. unique_post_ids = set(s.get("帖子ID", "") for s in filtered_sources if s.get("帖子ID"))
  132. node_copy["帖子数"] = len(unique_post_ids)
  133. filtered_nodes.append(node_copy)
  134. return filtered_nodes
  135. def filter_edges_by_post_ids(edges: List[Dict], exclude_post_ids: Set[str]) -> List[Dict]:
  136. """
  137. 过滤边,排除指定帖子ID的共现边
  138. Args:
  139. edges: 边列表
  140. exclude_post_ids: 要排除的帖子ID集合
  141. Returns:
  142. 过滤后的边列表
  143. """
  144. filtered_edges = []
  145. for edge in edges:
  146. edge_type = edge["边类型"]
  147. if edge_type in ("分类共现(跨点)", "标签共现"):
  148. # 过滤共同帖子ID
  149. edge_details = edge.get("边详情", {})
  150. common_post_ids = edge_details.get("共同帖子ID", [])
  151. filtered_post_ids = [pid for pid in common_post_ids if pid not in exclude_post_ids]
  152. if filtered_post_ids:
  153. edge_copy = edge.copy()
  154. edge_copy["边详情"] = edge_details.copy()
  155. edge_copy["边详情"]["共同帖子ID"] = filtered_post_ids
  156. edge_copy["边详情"]["共同帖子数"] = len(filtered_post_ids)
  157. filtered_edges.append(edge_copy)
  158. elif edge_type == "分类共现(点内)":
  159. # 点内共现边不涉及帖子ID,直接保留
  160. filtered_edges.append(edge)
  161. else:
  162. # 属于/包含边不需要过滤
  163. filtered_edges.append(edge)
  164. return filtered_edges
  165. # ========== 分类节点提取 ==========
  166. def extract_category_nodes_from_pattern(
  167. pattern_data: Dict,
  168. dimension_key: str,
  169. dimension_name: str
  170. ) -> List[Dict]:
  171. """
  172. 从pattern聚合结果中提取分类节点
  173. Args:
  174. pattern_data: pattern聚合数据
  175. dimension_key: 维度键名(灵感点列表、目的点、关键点列表)
  176. dimension_name: 维度名称(灵感点、目的点、关键点)
  177. Returns:
  178. 分类节点列表
  179. """
  180. nodes = []
  181. if dimension_key not in pattern_data:
  182. return nodes
  183. def traverse_node(node: Dict, parent_categories: List[str]):
  184. """递归遍历节点"""
  185. for key, value in node.items():
  186. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  187. continue
  188. if isinstance(value, dict):
  189. # 当前节点是一个分类
  190. current_path = parent_categories + [key]
  191. # 获取帖子列表
  192. post_ids = value.get("帖子列表", [])
  193. # 构建节点来源(从特征列表中获取)
  194. node_sources = []
  195. if "特征列表" in value:
  196. for feature in value["特征列表"]:
  197. source = {
  198. "点的名称": feature.get("所属点", ""),
  199. "点的描述": feature.get("点描述", ""),
  200. "帖子ID": feature.get("帖子id", "")
  201. }
  202. node_sources.append(source)
  203. node_info = {
  204. "节点ID": build_node_id(dimension_name, "分类", key),
  205. "节点名称": key,
  206. "节点类型": "分类",
  207. "节点层级": dimension_name,
  208. "所属分类": parent_categories.copy(),
  209. "帖子数": len(post_ids),
  210. "节点来源": node_sources
  211. }
  212. nodes.append(node_info)
  213. # 递归处理子节点
  214. traverse_node(value, current_path)
  215. traverse_node(pattern_data[dimension_key], [])
  216. return nodes
  217. # ========== 标签节点提取 ==========
  218. def extract_tag_nodes_from_pattern(
  219. pattern_data: Dict,
  220. dimension_key: str,
  221. dimension_name: str
  222. ) -> List[Dict]:
  223. """
  224. 从pattern聚合结果中提取标签节点
  225. Args:
  226. pattern_data: pattern聚合数据
  227. dimension_key: 维度键名
  228. dimension_name: 维度名称
  229. Returns:
  230. 标签节点列表
  231. """
  232. nodes = []
  233. tag_map = {} # 用于合并同名标签
  234. if dimension_key not in pattern_data:
  235. return nodes
  236. def traverse_node(node: Dict, parent_categories: List[str]):
  237. """递归遍历节点"""
  238. # 处理特征列表(标签)
  239. if "特征列表" in node:
  240. for feature in node["特征列表"]:
  241. tag_name = feature.get("特征名称", "")
  242. if not tag_name:
  243. continue
  244. source = {
  245. "点的名称": feature.get("所属点", ""),
  246. "点的描述": feature.get("点描述", ""),
  247. "帖子ID": feature.get("帖子id", "")
  248. }
  249. tag_id = build_node_id(dimension_name, "标签", tag_name)
  250. if tag_id not in tag_map:
  251. tag_map[tag_id] = {
  252. "节点ID": tag_id,
  253. "节点名称": tag_name,
  254. "节点类型": "标签",
  255. "节点层级": dimension_name,
  256. "所属分类": parent_categories.copy(),
  257. "帖子数": 0,
  258. "节点来源": [],
  259. "_post_ids": set()
  260. }
  261. tag_map[tag_id]["节点来源"].append(source)
  262. if source["帖子ID"]:
  263. tag_map[tag_id]["_post_ids"].add(source["帖子ID"])
  264. # 递归处理子节点
  265. for key, value in node.items():
  266. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  267. continue
  268. if isinstance(value, dict):
  269. current_path = parent_categories + [key]
  270. traverse_node(value, current_path)
  271. traverse_node(pattern_data[dimension_key], [])
  272. # 转换为列表,计算帖子数
  273. for tag_id, tag_info in tag_map.items():
  274. tag_info["帖子数"] = len(tag_info["_post_ids"])
  275. del tag_info["_post_ids"]
  276. nodes.append(tag_info)
  277. return nodes
  278. # ========== 标签-分类边提取 ==========
  279. def extract_tag_category_edges_from_pattern(
  280. pattern_data: Dict,
  281. dimension_key: str,
  282. dimension_name: str
  283. ) -> List[Dict]:
  284. """
  285. 从pattern聚合结果中提取标签-分类边(属于/包含)
  286. Args:
  287. pattern_data: pattern聚合数据
  288. dimension_key: 维度键名
  289. dimension_name: 维度名称
  290. Returns:
  291. 边列表
  292. """
  293. edges = []
  294. seen_edges = set() # 避免重复边
  295. if dimension_key not in pattern_data:
  296. return edges
  297. def traverse_node(node: Dict, parent_categories: List[str]):
  298. """递归遍历节点"""
  299. current_category = parent_categories[-1] if parent_categories else None
  300. # 处理特征列表(标签)
  301. if "特征列表" in node and current_category:
  302. for feature in node["特征列表"]:
  303. tag_name = feature.get("特征名称", "")
  304. if not tag_name:
  305. continue
  306. tag_id = build_node_id(dimension_name, "标签", tag_name)
  307. category_id = build_node_id(dimension_name, "分类", current_category)
  308. # 属于边:标签 -> 分类
  309. edge_key_belong = (tag_id, category_id, "属于")
  310. if edge_key_belong not in seen_edges:
  311. seen_edges.add(edge_key_belong)
  312. edges.append({
  313. "源节点ID": tag_id,
  314. "目标节点ID": category_id,
  315. "边类型": "属于",
  316. "边详情": {}
  317. })
  318. # 包含边:分类 -> 标签
  319. edge_key_contain = (category_id, tag_id, "包含")
  320. if edge_key_contain not in seen_edges:
  321. seen_edges.add(edge_key_contain)
  322. edges.append({
  323. "源节点ID": category_id,
  324. "目标节点ID": tag_id,
  325. "边类型": "包含",
  326. "边详情": {}
  327. })
  328. # 递归处理子节点
  329. for key, value in node.items():
  330. if key in ["特征列表", "_meta", "帖子数", "特征数", "帖子列表"]:
  331. continue
  332. if isinstance(value, dict):
  333. current_path = parent_categories + [key]
  334. traverse_node(value, current_path)
  335. traverse_node(pattern_data[dimension_key], [])
  336. return edges
  337. # ========== 标签-标签共现边提取 ==========
  338. def extract_tags_from_post(post_data: Dict) -> Dict[str, List[str]]:
  339. """
  340. 从单个帖子的解构结果中提取所有标签(特征名称)
  341. Args:
  342. post_data: 帖子解构数据
  343. Returns:
  344. 按维度分组的标签字典 {"灵感点": [...], "目的点": [...], "关键点": [...]}
  345. """
  346. tags_by_dimension = {
  347. "灵感点": [],
  348. "目的点": [],
  349. "关键点": []
  350. }
  351. if "三点解构" not in post_data:
  352. return tags_by_dimension
  353. three_points = post_data["三点解构"]
  354. # 提取灵感点的特征
  355. if "灵感点" in three_points:
  356. inspiration = three_points["灵感点"]
  357. for section in ["全新内容", "共性差异", "共性内容"]:
  358. if section in inspiration and isinstance(inspiration[section], list):
  359. for item in inspiration[section]:
  360. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  361. for feature in item["提取的特征"]:
  362. tag_name = feature.get("特征名称", "")
  363. if tag_name:
  364. tags_by_dimension["灵感点"].append(tag_name)
  365. # 提取目的点的特征
  366. if "目的点" in three_points:
  367. purpose = three_points["目的点"]
  368. if "purposes" in purpose and isinstance(purpose["purposes"], list):
  369. for item in purpose["purposes"]:
  370. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  371. for feature in item["提取的特征"]:
  372. tag_name = feature.get("特征名称", "")
  373. if tag_name:
  374. tags_by_dimension["目的点"].append(tag_name)
  375. # 提取关键点的特征
  376. if "关键点" in three_points:
  377. key_points = three_points["关键点"]
  378. if "key_points" in key_points and isinstance(key_points["key_points"], list):
  379. for item in key_points["key_points"]:
  380. if "提取的特征" in item and isinstance(item["提取的特征"], list):
  381. for feature in item["提取的特征"]:
  382. tag_name = feature.get("特征名称", "")
  383. if tag_name:
  384. tags_by_dimension["关键点"].append(tag_name)
  385. return tags_by_dimension
  386. def extract_tag_cooccurrence_edges(historical_posts_dir: Path, exclude_post_ids: Set[str] = None) -> List[Dict]:
  387. """
  388. 从历史帖子解构结果中提取标签-标签共现边
  389. Args:
  390. historical_posts_dir: 历史帖子解构结果目录
  391. exclude_post_ids: 要排除的帖子ID集合
  392. Returns:
  393. 标签共现边列表
  394. """
  395. if exclude_post_ids is None:
  396. exclude_post_ids = set()
  397. # 存储每对标签的共现信息
  398. # key: (tag1_id, tag2_id), value: {"共同帖子ID": set()}
  399. cooccurrence_map = {}
  400. if not historical_posts_dir.exists():
  401. print(f"警告: 历史帖子目录不存在: {historical_posts_dir}")
  402. return []
  403. json_files = list(historical_posts_dir.glob("*.json"))
  404. print(f"找到 {len(json_files)} 个历史帖子文件")
  405. for file_path in json_files:
  406. # 提取帖子ID
  407. post_id = extract_post_id_from_filename(file_path.name)
  408. if not post_id:
  409. continue
  410. # 跳过排除的帖子
  411. if post_id in exclude_post_ids:
  412. continue
  413. try:
  414. with open(file_path, "r", encoding="utf-8") as f:
  415. post_data = json.load(f)
  416. # 提取该帖子的所有标签
  417. tags_by_dimension = extract_tags_from_post(post_data)
  418. # 对每个维度内的标签两两组合,构建共现关系
  419. for dimension, tags in tags_by_dimension.items():
  420. unique_tags = list(set(tags)) # 去重
  421. for i in range(len(unique_tags)):
  422. for j in range(i + 1, len(unique_tags)):
  423. tag1 = unique_tags[i]
  424. tag2 = unique_tags[j]
  425. # 构建节点ID
  426. tag1_id = build_node_id(dimension, "标签", tag1)
  427. tag2_id = build_node_id(dimension, "标签", tag2)
  428. # 确保顺序一致(按字典序)
  429. if tag1_id > tag2_id:
  430. tag1_id, tag2_id = tag2_id, tag1_id
  431. key = (tag1_id, tag2_id, dimension)
  432. if key not in cooccurrence_map:
  433. cooccurrence_map[key] = {"共同帖子ID": set()}
  434. cooccurrence_map[key]["共同帖子ID"].add(post_id)
  435. except Exception as e:
  436. print(f" 警告: 处理文件 {file_path.name} 时出错: {e}")
  437. # 转换为边列表
  438. edges = []
  439. for (tag1_id, tag2_id, dimension), info in cooccurrence_map.items():
  440. common_post_ids = list(info["共同帖子ID"])
  441. edge = {
  442. "源节点ID": tag1_id,
  443. "目标节点ID": tag2_id,
  444. "边类型": "标签共现",
  445. "边详情": {
  446. "共同帖子数": len(common_post_ids),
  447. "共同帖子ID": common_post_ids
  448. }
  449. }
  450. edges.append(edge)
  451. return edges
  452. # ========== 分类-分类边提取 ==========
  453. def extract_category_edges_from_associations(associations_data: Dict) -> List[Dict]:
  454. """
  455. 从dimension_associations_analysis.json中提取分类-分类边(共现)
  456. Args:
  457. associations_data: 关联分析数据
  458. Returns:
  459. 边列表
  460. """
  461. edges = []
  462. if "单维度关联分析" not in associations_data:
  463. return edges
  464. single_dim = associations_data["单维度关联分析"]
  465. # 维度映射
  466. dimension_map = {
  467. "灵感点维度": "灵感点",
  468. "目的点维度": "目的点",
  469. "关键点维度": "关键点"
  470. }
  471. for dim_key, dim_data in single_dim.items():
  472. if dim_key not in dimension_map:
  473. continue
  474. source_dimension = dimension_map[dim_key]
  475. # 遍历该维度下的所有关联方向
  476. for direction_key, direction_data in dim_data.items():
  477. if direction_key == "说明":
  478. continue
  479. if "→" not in direction_key:
  480. continue
  481. # 遍历每个源分类
  482. for source_path, source_info in direction_data.items():
  483. source_name = get_last_segment(source_path)
  484. source_node_id = build_node_id(source_dimension, "分类", source_name)
  485. # 确定目标维度
  486. for field_name, associations in source_info.items():
  487. if not field_name.startswith("与") or not field_name.endswith("的关联"):
  488. continue
  489. target_dimension = field_name[1:-3]
  490. if not isinstance(associations, list):
  491. continue
  492. for assoc in associations:
  493. target_path = assoc.get("目标分类", "")
  494. if not target_path:
  495. continue
  496. target_name = get_last_segment(target_path)
  497. target_node_id = build_node_id(target_dimension, "分类", target_name)
  498. edge = {
  499. "源节点ID": source_node_id,
  500. "目标节点ID": target_node_id,
  501. "边类型": "分类共现(跨点)",
  502. "边详情": {
  503. "Jaccard相似度": assoc.get("Jaccard相似度", 0),
  504. "重叠系数": assoc.get("重叠系数", 0),
  505. "共同帖子数": assoc.get("共同帖子数", 0),
  506. "共同帖子ID": assoc.get("共同帖子ID", [])
  507. }
  508. }
  509. edges.append(edge)
  510. return edges
  511. # ========== 点内分类共现边提取 ==========
  512. def extract_intra_category_edges(intra_associations_data: Dict) -> List[Dict]:
  513. """
  514. 从intra_dimension_associations_analysis.json中提取点内分类共现边
  515. Args:
  516. intra_associations_data: 点内关联分析数据
  517. Returns:
  518. 边列表
  519. """
  520. edges = []
  521. seen_edges = set() # 避免重复边
  522. if "叶子分类组合聚类" not in intra_associations_data:
  523. return edges
  524. clusters_by_dim = intra_associations_data["叶子分类组合聚类"]
  525. for dimension, clusters in clusters_by_dim.items():
  526. if dimension not in ("灵感点", "目的点", "关键点"):
  527. continue
  528. for cluster_key, cluster_data in clusters.items():
  529. leaf_categories = cluster_data.get("叶子分类组合", [])
  530. point_count = cluster_data.get("点数", 0)
  531. point_details = cluster_data.get("点详情列表", [])
  532. # 提取点名称列表
  533. point_names = [p.get("点名称", "") for p in point_details if p.get("点名称")]
  534. # 两两组合生成共现边
  535. for i in range(len(leaf_categories)):
  536. for j in range(i + 1, len(leaf_categories)):
  537. cat1 = leaf_categories[i]
  538. cat2 = leaf_categories[j]
  539. # 构建节点ID
  540. cat1_id = build_node_id(dimension, "分类", cat1)
  541. cat2_id = build_node_id(dimension, "分类", cat2)
  542. # 确保顺序一致(按字典序)
  543. if cat1_id > cat2_id:
  544. cat1_id, cat2_id = cat2_id, cat1_id
  545. edge_key = (cat1_id, cat2_id, dimension)
  546. if edge_key in seen_edges:
  547. # 已存在的边,累加点数和点名称
  548. for edge in edges:
  549. if (edge["源节点ID"] == cat1_id and
  550. edge["目标节点ID"] == cat2_id and
  551. edge["边类型"] == "分类共现(点内)"):
  552. edge["边详情"]["点数"] += point_count
  553. edge["边详情"]["关联点名称"].extend(point_names)
  554. break
  555. else:
  556. seen_edges.add(edge_key)
  557. edge = {
  558. "源节点ID": cat1_id,
  559. "目标节点ID": cat2_id,
  560. "边类型": "分类共现(点内)",
  561. "边详情": {
  562. "点数": point_count,
  563. "关联点名称": point_names.copy()
  564. }
  565. }
  566. edges.append(edge)
  567. return edges
  568. # ========== 主函数 ==========
  569. def main():
  570. # 使用路径配置
  571. config = PathConfig()
  572. config.ensure_dirs()
  573. print(f"账号: {config.account_name}")
  574. print(f"输出版本: {config.output_version}")
  575. print(f"过滤模式: {config.filter_mode}")
  576. print()
  577. # 输入文件路径
  578. pattern_file = config.pattern_cluster_file
  579. associations_file = config.account_dir / "pattern相关文件/optimization/dimension_associations_analysis.json"
  580. intra_associations_file = config.account_dir / "pattern相关文件/optimization/intra_dimension_associations_analysis.json"
  581. current_posts_dir = config.current_posts_dir
  582. # 输出文件路径
  583. nodes_output_file = config.intermediate_dir / "节点列表.json"
  584. edges_output_file = config.intermediate_dir / "边关系.json"
  585. print(f"输入文件:")
  586. print(f" pattern聚合文件: {pattern_file}")
  587. print(f" 跨点关联分析文件: {associations_file}")
  588. print(f" 点内关联分析文件: {intra_associations_file}")
  589. print(f" 当前帖子目录: {current_posts_dir}")
  590. print(f"\n输出文件:")
  591. print(f" 节点列表: {nodes_output_file}")
  592. print(f" 边关系: {edges_output_file}")
  593. print()
  594. # 读取pattern聚合结果
  595. print("正在读取pattern聚合结果...")
  596. with open(pattern_file, "r", encoding="utf-8") as f:
  597. pattern_data = json.load(f)
  598. # 读取跨点关联分析结果
  599. print("正在读取跨点关联分析结果...")
  600. with open(associations_file, "r", encoding="utf-8") as f:
  601. associations_data = json.load(f)
  602. # 读取点内关联分析结果
  603. print("正在读取点内关联分析结果...")
  604. with open(intra_associations_file, "r", encoding="utf-8") as f:
  605. intra_associations_data = json.load(f)
  606. # ===== 提取节点 =====
  607. print("\n" + "="*60)
  608. print("正在提取节点...")
  609. all_nodes = []
  610. # 维度映射
  611. dimension_mapping = {
  612. "灵感点列表": "灵感点",
  613. "目的点": "目的点",
  614. "关键点列表": "关键点"
  615. }
  616. # 提取分类节点
  617. print("\n提取分类节点:")
  618. for dim_key, dim_name in dimension_mapping.items():
  619. category_nodes = extract_category_nodes_from_pattern(pattern_data, dim_key, dim_name)
  620. all_nodes.extend(category_nodes)
  621. print(f" {dim_name}: {len(category_nodes)} 个分类节点")
  622. # 提取标签节点
  623. print("\n提取标签节点:")
  624. for dim_key, dim_name in dimension_mapping.items():
  625. tag_nodes = extract_tag_nodes_from_pattern(pattern_data, dim_key, dim_name)
  626. all_nodes.extend(tag_nodes)
  627. print(f" {dim_name}: {len(tag_nodes)} 个标签节点")
  628. print(f"\n总计: {len(all_nodes)} 个节点")
  629. # 统计节点类型
  630. category_count = sum(1 for n in all_nodes if n["节点类型"] == "分类")
  631. tag_count = sum(1 for n in all_nodes if n["节点类型"] == "标签")
  632. print(f" 分类节点: {category_count}")
  633. print(f" 标签节点: {tag_count}")
  634. # ===== 提取边 =====
  635. print("\n" + "="*60)
  636. print("正在提取边...")
  637. all_edges = []
  638. # 提取分类-分类边(跨点共现)
  639. print("\n提取分类-分类边(跨点共现):")
  640. category_edges = extract_category_edges_from_associations(associations_data)
  641. all_edges.extend(category_edges)
  642. print(f" 分类共现(跨点)边: {len(category_edges)} 条")
  643. # 提取分类-分类边(点内共现)
  644. print("\n提取分类-分类边(点内共现):")
  645. intra_category_edges = extract_intra_category_edges(intra_associations_data)
  646. all_edges.extend(intra_category_edges)
  647. print(f" 分类共现(点内)边: {len(intra_category_edges)} 条")
  648. # 提取标签-分类边(属于/包含)
  649. print("\n提取标签-分类边(属于/包含):")
  650. belong_count = 0
  651. contain_count = 0
  652. for dim_key, dim_name in dimension_mapping.items():
  653. tag_category_edges = extract_tag_category_edges_from_pattern(pattern_data, dim_key, dim_name)
  654. all_edges.extend(tag_category_edges)
  655. dim_belong = sum(1 for e in tag_category_edges if e["边类型"] == "属于")
  656. dim_contain = sum(1 for e in tag_category_edges if e["边类型"] == "包含")
  657. belong_count += dim_belong
  658. contain_count += dim_contain
  659. print(f" {dim_name}: {dim_belong} 条属于边, {dim_contain} 条包含边")
  660. # 提取标签-标签边(共现)- 需要在过滤之前先记录排除的帖子ID
  661. # 这里先占位,过滤后再处理
  662. tag_cooccurrence_edges_placeholder = True
  663. print(f"\n边统计(标签共现待提取):")
  664. print(f" 分类共现(跨点)边: {len(category_edges)}")
  665. print(f" 分类共现(点内)边: {len(intra_category_edges)}")
  666. print(f" 属于边: {belong_count}")
  667. print(f" 包含边: {contain_count}")
  668. # ===== 应用过滤 =====
  669. exclude_post_ids = set()
  670. filter_mode = config.filter_mode
  671. if filter_mode == "exclude_current_posts":
  672. print("\n" + "="*60)
  673. print("应用过滤规则: 排除当前帖子ID")
  674. exclude_post_ids = get_current_post_ids(current_posts_dir)
  675. if exclude_post_ids:
  676. # 过滤节点
  677. nodes_before = len(all_nodes)
  678. all_nodes = filter_nodes_by_post_ids(all_nodes, exclude_post_ids)
  679. nodes_after = len(all_nodes)
  680. print(f"\n节点过滤: {nodes_before} -> {nodes_after} (移除 {nodes_before - nodes_after} 个)")
  681. # 过滤边
  682. edges_before = len(all_edges)
  683. all_edges = filter_edges_by_post_ids(all_edges, exclude_post_ids)
  684. edges_after = len(all_edges)
  685. print(f"边过滤: {edges_before} -> {edges_after} (移除 {edges_before - edges_after} 条)")
  686. elif filter_mode == "none":
  687. print("\n过滤模式: none,不应用任何过滤")
  688. else:
  689. print(f"\n警告: 未知的过滤模式 '{filter_mode}',不应用过滤")
  690. # ===== 提取标签-标签共现边 =====
  691. print("\n" + "="*60)
  692. print("提取标签-标签共现边...")
  693. historical_posts_dir = config.historical_posts_dir
  694. print(f"历史帖子目录: {historical_posts_dir}")
  695. tag_cooccurrence_edges = extract_tag_cooccurrence_edges(historical_posts_dir, exclude_post_ids)
  696. all_edges.extend(tag_cooccurrence_edges)
  697. print(f" 标签-标签共现边: {len(tag_cooccurrence_edges)} 条")
  698. # 更新总计
  699. print(f"\n总计: {len(all_edges)} 条边")
  700. print(f" 分类共现(跨点)边: {len(category_edges)}")
  701. print(f" 分类共现(点内)边: {len(intra_category_edges)}")
  702. print(f" 标签共现边: {len(tag_cooccurrence_edges)}")
  703. print(f" 属于边: {belong_count}")
  704. print(f" 包含边: {contain_count}")
  705. # ===== 获取帖子详情 =====
  706. print("\n" + "="*60)
  707. print("获取帖子详情...")
  708. # 收集所有需要获取详情的帖子ID(从节点和边)
  709. post_ids_from_nodes = collect_all_post_ids_from_nodes(all_nodes)
  710. post_ids_from_edges = collect_all_post_ids_from_edges(all_edges)
  711. all_post_ids = post_ids_from_nodes | post_ids_from_edges
  712. print(f"节点中的帖子: {len(post_ids_from_nodes)} 个")
  713. print(f"边中的帖子: {len(post_ids_from_edges)} 个")
  714. print(f"合计(去重): {len(all_post_ids)} 个")
  715. # 批量获取帖子详情
  716. post_details = fetch_post_details(all_post_ids)
  717. # ===== 保存结果 =====
  718. print("\n" + "="*60)
  719. # 输出文件路径
  720. post_details_output_file = config.intermediate_dir / "帖子详情映射.json"
  721. # 保存节点列表
  722. nodes_output = {
  723. "说明": {
  724. "描述": "分类和标签节点列表",
  725. "数据来源": ["过去帖子_pattern聚合结果.json"],
  726. "过滤模式": filter_mode,
  727. "过滤帖子数": len(exclude_post_ids) if exclude_post_ids else 0
  728. },
  729. "节点列表": all_nodes
  730. }
  731. print(f"正在保存节点列表到: {nodes_output_file}")
  732. with open(nodes_output_file, "w", encoding="utf-8") as f:
  733. json.dump(nodes_output, f, ensure_ascii=False, indent=2)
  734. # 构建节点ID索引的边关系: 节点 -> 边类型 -> {目标节点: 完整边信息}
  735. edges_by_node = {} # key: 节点ID, value: {边类型: {目标节点ID: 完整边信息}}
  736. for edge in all_edges:
  737. source_id = edge["源节点ID"]
  738. target_id = edge["目标节点ID"]
  739. edge_type = edge["边类型"]
  740. # 源节点 -> 目标节点
  741. if source_id not in edges_by_node:
  742. edges_by_node[source_id] = {}
  743. if edge_type not in edges_by_node[source_id]:
  744. edges_by_node[source_id][edge_type] = {}
  745. edges_by_node[source_id][edge_type][target_id] = edge
  746. # 保存边关系
  747. edges_output = {
  748. "说明": {
  749. "描述": "分类和标签之间的边关系",
  750. "数据来源": ["过去帖子_pattern聚合结果.json", "dimension_associations_analysis.json", "过去帖子_what解构结果目录"],
  751. "过滤模式": filter_mode,
  752. "过滤帖子数": len(exclude_post_ids) if exclude_post_ids else 0
  753. },
  754. "边列表": all_edges,
  755. "节点边索引": edges_by_node
  756. }
  757. print(f"正在保存边关系到: {edges_output_file}")
  758. with open(edges_output_file, "w", encoding="utf-8") as f:
  759. json.dump(edges_output, f, ensure_ascii=False, indent=2)
  760. # 保存帖子详情映射
  761. post_details_output = {
  762. "说明": {
  763. "描述": "帖子ID到帖子详情的映射",
  764. "帖子数": len(post_details)
  765. },
  766. "帖子详情": post_details
  767. }
  768. print(f"正在保存帖子详情映射到: {post_details_output_file}")
  769. with open(post_details_output_file, "w", encoding="utf-8") as f:
  770. json.dump(post_details_output, f, ensure_ascii=False, indent=2)
  771. print("\n完成!")
  772. print(f"\n输出文件:")
  773. print(f" 节点列表: {len(all_nodes)} 个节点")
  774. print(f" 边关系: {len(all_edges)} 条边")
  775. print(f" 帖子详情映射: {len(post_details)} 个帖子")
  776. if __name__ == "__main__":
  777. main()