# build_item_graph.py
  1. """
  2. 构建 Item 共现关系图
  3. 以每个 item 为节点,基于原始帖子中的共现关系建图:
  4. - co_in_post: 共同出现在同一帖子中(两个分类出现在同一帖子即为一次共现)
  5. - hierarchy: 分类路径上的层级关系(祖先/后代)
  6. """
  7. import json
  8. import os
  9. from collections import defaultdict
  10. from itertools import combinations
  11. from typing import List, Dict, Optional
  12. from .apriori_analysis_post_level import (
  13. build_transactions_at_depth,
  14. )
  15. # ──────────────────────────────────────────────
  16. # 1. 解析 item meta 信息
  17. # ──────────────────────────────────────────────
  18. def parse_item_meta(item: str, dimension_mode: str) -> Dict:
  19. """解析 item 字符串,提取 meta 信息"""
  20. if dimension_mode == 'full':
  21. # 格式: 点类型_维度_路径 或 点类型_维度_路径||名称
  22. parts = item.split('_', 2)
  23. if len(parts) < 3:
  24. return {'raw': item}
  25. point_type, dimension, path_with_name = parts
  26. elif dimension_mode == 'substance_form_only':
  27. # 格式: 维度_路径
  28. parts = item.split('_', 1)
  29. if len(parts) < 2:
  30. return {'raw': item}
  31. point_type, dimension, path_with_name = None, parts[0], parts[1]
  32. elif dimension_mode == 'point_type_only':
  33. # 格式: 点类型_路径
  34. parts = item.split('_', 1)
  35. if len(parts) < 2:
  36. return {'raw': item}
  37. point_type, dimension, path_with_name = parts[0], None, parts[1]
  38. else:
  39. return {'raw': item}
  40. if '||' in path_with_name:
  41. path, name = path_with_name.split('||', 1)
  42. layer = 'name'
  43. else:
  44. path, name = path_with_name, None
  45. layer = 'path'
  46. return {
  47. 'point_type': point_type,
  48. 'dimension': dimension,
  49. 'path': path,
  50. 'name': name,
  51. 'layer': layer,
  52. }
  53. # ──────────────────────────────────────────────
  54. # 2. 检测层级关系
  55. # ──────────────────────────────────────────────
  56. def detect_hierarchy(item_a: str, meta_a: Dict, item_b: str, meta_b: Dict) -> Optional[Dict]:
  57. """检测两个 item 之间是否存在层级关系
  58. Returns:
  59. 从 item_a 视角看的层级关系 dict,或 None
  60. """
  61. # 必须同 point_type 和 dimension
  62. if meta_a.get('point_type') != meta_b.get('point_type'):
  63. return None
  64. if meta_a.get('dimension') != meta_b.get('dimension'):
  65. return None
  66. # 构建 full_path(含 name)
  67. def full_path(meta):
  68. p = meta.get('path', '')
  69. n = meta.get('name')
  70. return f"{p}>{n}" if n else p
  71. fp_a = full_path(meta_a)
  72. fp_b = full_path(meta_b)
  73. if not fp_a or not fp_b or fp_a == fp_b:
  74. return None
  75. if fp_b.startswith(fp_a + '>'):
  76. depth_diff = fp_b.count('>') - fp_a.count('>')
  77. return {'relation': 'ancestor', 'depth_diff': depth_diff}
  78. if fp_a.startswith(fp_b + '>'):
  79. depth_diff = fp_a.count('>') - fp_b.count('>')
  80. return {'relation': 'descendant', 'depth_diff': depth_diff}
  81. return None
  82. # ──────────────────────────────────────────────
  83. # 3. 核心:构建 item graph
  84. # ──────────────────────────────────────────────
  85. def collect_elements_for_items(original_data: Dict, dimension_mode: str) -> Dict[str, Dict[str, Dict]]:
  86. """从原始数据中收集每个分类路径下的具体元素名称(计数 + 来源帖子ID)
  87. Args:
  88. original_data: point_classification_results.json 的完整数据
  89. dimension_mode: 维度模式
  90. Returns:
  91. {item_key: {element_name: {count: int, post_ids: [str, ...]}, ...}, ...}
  92. """
  93. elements_map = defaultdict(lambda: defaultdict(lambda: {'count': 0, 'post_ids': set()}))
  94. for post_id, post_data in original_data.items():
  95. for point_type in ['灵感点', '目的点', '关键点']:
  96. points = post_data.get(point_type, [])
  97. for point in points:
  98. # 按维度处理
  99. dim_configs = [
  100. ('实质', point.get('实质', [])),
  101. ('形式', point.get('形式', [])),
  102. ('意图', point.get('意图', [])),
  103. ]
  104. for dim_name, items in dim_configs:
  105. for item in items:
  106. path = item.get('分类路径', [])
  107. name = item.get('名称', '')
  108. if not path or not name:
  109. continue
  110. path_label = '>'.join(path)
  111. # 构建 item_key(与 build_transactions_at_depth 中 depth='max' 一致)
  112. if dimension_mode == 'full':
  113. item_key = f"{point_type}_{dim_name}_{path_label}"
  114. elif dimension_mode == 'point_type_only':
  115. item_key = f"{point_type}_{path_label}"
  116. elif dimension_mode == 'substance_form_only':
  117. item_key = f"{dim_name}_{path_label}"
  118. else:
  119. continue
  120. elements_map[item_key][name]['count'] += 1
  121. elements_map[item_key][name]['post_ids'].add(post_id)
  122. # 将 set 转为 sorted list
  123. return {
  124. k: {
  125. name: {'count': v['count'], 'post_ids': sorted(v['post_ids'])}
  126. for name, v in elements.items()
  127. }
  128. for k, elements in elements_map.items()
  129. }
  130. def build_item_graph(
  131. transactions: List[List[str]],
  132. post_ids: List[str],
  133. dimension_mode: str,
  134. post_account_map: Dict[str, str] = None,
  135. elements_map: Dict[str, Dict[str, int]] = None,
  136. ) -> Dict:
  137. """基于原始帖子的共现关系构建 item 关系图
  138. Args:
  139. transactions: transaction 列表(每个帖子的 item 列表)
  140. post_ids: 帖子 ID 列表
  141. dimension_mode: 维度模式
  142. post_account_map: post_id → account_name 映射(可选)
  143. elements_map: item_key → {element_name: count} 映射(可选)
  144. Returns:
  145. graph dict: {item_str: {meta: {...}, edges: {other_item: {edge_type: {...}}}}}
  146. """
  147. # ── 3.1 收集所有 item,构建 meta ──
  148. all_items = set()
  149. for txn in transactions:
  150. all_items.update(txn)
  151. item_metas = {}
  152. for item in all_items:
  153. item_metas[item] = parse_item_meta(item, dimension_mode)
  154. # ── 3.2 统计每个 item 出现在多少个帖子中,收集 item → accounts ──
  155. freq_in_posts = defaultdict(int)
  156. item_accounts = defaultdict(set)
  157. for i, txn in enumerate(transactions):
  158. pid = post_ids[i]
  159. account = post_account_map.get(pid) if post_account_map else None
  160. for item in set(txn):
  161. freq_in_posts[item] += 1
  162. if account:
  163. item_accounts[item].add(account)
  164. # ── 3.3 构建 co_in_post 边 ──
  165. co_post_edges = defaultdict(lambda: {
  166. 'co_post_count': 0,
  167. 'post_ids': [],
  168. })
  169. for i, txn in enumerate(transactions):
  170. items = sorted(set(txn))
  171. for a, b in combinations(items, 2):
  172. co_post_edges[(a, b)]['co_post_count'] += 1
  173. co_post_edges[(a, b)]['post_ids'].append(post_ids[i])
  174. # ── 3.4 组装 graph ──
  175. graph = {}
  176. for item in all_items:
  177. meta = item_metas.get(item, {'raw': item})
  178. meta['frequency_in_posts'] = freq_in_posts[item]
  179. if item_accounts[item]:
  180. meta['accounts'] = sorted(item_accounts[item])
  181. # 附加元素信息(深拷贝,避免 build_post_ids_index 就地修改影响原始 elements_map)
  182. if elements_map and item in elements_map:
  183. # 按出现次数降序排列
  184. sorted_elements = {
  185. name: {'count': v['count'], 'post_ids': list(v['post_ids'])}
  186. for name, v in sorted(elements_map[item].items(), key=lambda x: -x[1]['count'])
  187. }
  188. meta['elements'] = sorted_elements
  189. graph[item] = {
  190. 'meta': meta,
  191. 'edges': {},
  192. }
  193. # 填充 co_in_post 边(双向)
  194. for (a, b), data in co_post_edges.items():
  195. edge_payload = {
  196. 'co_post_count': data['co_post_count'],
  197. 'post_ids': data['post_ids'],
  198. }
  199. # A → B
  200. _ensure_edge(graph, a, b)
  201. graph[a]['edges'][b]['co_in_post'] = {
  202. **edge_payload,
  203. 'confidence': round(data['co_post_count'] / freq_in_posts[a], 4),
  204. }
  205. # B → A
  206. _ensure_edge(graph, b, a)
  207. graph[b]['edges'][a]['co_in_post'] = {
  208. **edge_payload,
  209. 'confidence': round(data['co_post_count'] / freq_in_posts[b], 4),
  210. }
  211. # 填充 hierarchy 边(双向)
  212. items_list = sorted(all_items)
  213. for i, a in enumerate(items_list):
  214. for b in items_list[i + 1:]:
  215. h = detect_hierarchy(a, item_metas[a], b, item_metas[b])
  216. if h:
  217. _ensure_edge(graph, a, b)
  218. graph[a]['edges'][b]['hierarchy'] = h
  219. reverse_relation = 'descendant' if h['relation'] == 'ancestor' else 'ancestor'
  220. _ensure_edge(graph, b, a)
  221. graph[b]['edges'][a]['hierarchy'] = {
  222. 'relation': reverse_relation,
  223. 'depth_diff': h['depth_diff'],
  224. }
  225. return graph
  226. def _ensure_edge(graph: Dict, source: str, target: str):
  227. """确保 graph[source]['edges'][target] 存在"""
  228. if source not in graph:
  229. graph[source] = {'meta': {}, 'edges': {}}
  230. if target not in graph[source]['edges']:
  231. graph[source]['edges'][target] = {}
  232. # ──────────────────────────────────────────────
  233. # 4. post_ids 全局索引(压缩 JSON 体积)
  234. # ──────────────────────────────────────────────
  235. def build_post_ids_index(graph: Dict) -> List[str]:
  236. """收集所有 post_ids,构建全局索引,将 graph 中的 post_ids 就地替换为索引数组
  237. Returns:
  238. post_ids 索引列表
  239. """
  240. all_pids = set()
  241. # 从 edges 收集
  242. for item_data in graph.values():
  243. for target, edge_types in item_data.get('edges', {}).items():
  244. cp = edge_types.get('co_in_post')
  245. if cp and cp.get('post_ids'):
  246. all_pids.update(cp['post_ids'])
  247. # 从 elements 收集
  248. for item_data in graph.values():
  249. elements = item_data.get('meta', {}).get('elements', {})
  250. for el_data in elements.values():
  251. if isinstance(el_data, dict) and el_data.get('post_ids'):
  252. all_pids.update(el_data['post_ids'])
  253. index = sorted(all_pids)
  254. pid_to_idx = {pid: i for i, pid in enumerate(index)}
  255. # 替换 edges 中的 post_ids
  256. for item_data in graph.values():
  257. for target, edge_types in item_data.get('edges', {}).items():
  258. cp = edge_types.get('co_in_post')
  259. if cp and cp.get('post_ids'):
  260. cp['post_ids'] = [pid_to_idx[pid] for pid in cp['post_ids']]
  261. # 替换 elements 中的 post_ids
  262. for item_data in graph.values():
  263. elements = item_data.get('meta', {}).get('elements', {})
  264. for el_data in elements.values():
  265. if isinstance(el_data, dict) and el_data.get('post_ids'):
  266. el_data['post_ids'] = [pid_to_idx[pid] for pid in el_data['post_ids']]
  267. return index
  268. # ──────────────────────────────────────────────
  269. # 5. 主函数
  270. # ──────────────────────────────────────────────
# Supported depth modes, mapped to the description shown in log output.
DEPTH_MODES = {
    'max': '叶子节点完整路径',
    'all_levels': '展开所有层级',
}
# Supported dimension modes.
DIMENSION_MODES = ['full', 'point_type_only', 'substance_form_only']
  278. def main(account_name: str,
  279. depth_modes: List[str] = None,
  280. dimension_modes: List[str] = None):
  281. """构建 item 共现关系图
  282. Args:
  283. account_name: 账号名称
  284. depth_modes: 要构建的 depth 模式列表,默认 ['max', 'all_levels']
  285. dimension_modes: 要构建的 dimension 模式列表,默认 ['full']
  286. """
  287. if depth_modes is None:
  288. depth_modes = ['max', 'all_levels']
  289. if dimension_modes is None:
  290. dimension_modes = ['full']
  291. base_dir = f"result/{account_name}"
  292. results_file = os.path.join(base_dir, "topic_point_data/point_classification_results.json")
  293. post_account_map_file = os.path.join(base_dir, "topic_point_data/post_account_map.json")
  294. output_dir = os.path.join(base_dir, "item_graph")
  295. os.makedirs(output_dir, exist_ok=True)
  296. # 加载原始数据(只加载一次)
  297. with open(results_file, 'r', encoding='utf-8') as f:
  298. original_data = json.load(f)
  299. print(f"已加载原始数据: {len(original_data)} 个帖子")
  300. # 加载 post → account 映射
  301. post_account_map = None
  302. if os.path.exists(post_account_map_file):
  303. with open(post_account_map_file, 'r', encoding='utf-8') as f:
  304. post_account_map = json.load(f)
  305. print(f"已加载 post_account_map: {len(post_account_map)} 条")
  306. print(f"{'=' * 60}")
  307. print(f"构建 Item 共现关系图: {account_name}")
  308. print(f"{'=' * 60}")
  309. for dimension_mode in dimension_modes:
  310. # 每种 dimension_mode 只计算一次 elements_map(与 depth 无关)
  311. elements_map = collect_elements_for_items(original_data, dimension_mode)
  312. print(f"\n维度模式: {dimension_mode} | 含元素的分类数: {len(elements_map)}")
  313. for depth in depth_modes:
  314. print(f"\n{'─' * 60}")
  315. print(f"维度模式: {dimension_mode} | 深度: {depth} ({DEPTH_MODES.get(depth, depth)})")
  316. # 复用已加载的 original_data
  317. transactions, post_ids, _ = build_transactions_at_depth(
  318. results_file, depth, dimension_mode=dimension_mode, data=original_data
  319. )
  320. print(f"帖子数: {len(post_ids)}")
  321. # 统计唯一 items
  322. unique_items = set()
  323. for txn in transactions:
  324. unique_items.update(txn)
  325. print(f"唯一 item 数: {len(unique_items)}")
  326. # 构建基础图(co_in_post + hierarchy)
  327. graph = build_item_graph(transactions, post_ids, dimension_mode, post_account_map, elements_map)
  328. # 统计
  329. total_edges = sum(len(node['edges']) for node in graph.values())
  330. edge_type_counts = defaultdict(int)
  331. for node in graph.values():
  332. for target_node, edge_types in node['edges'].items():
  333. for et in edge_types:
  334. edge_type_counts[et] += 1
  335. print(f"\n 图节点数: {len(graph)}")
  336. print(f" 图边数(有向): {total_edges}")
  337. for et, cnt in sorted(edge_type_counts.items()):
  338. print(f" {et}: {cnt}")
  339. # 构建 post_ids 全局索引并压缩
  340. post_ids_index = build_post_ids_index(graph)
  341. # 保存 item graph(带索引)
  342. output = {'_post_ids_index': post_ids_index, **graph}
  343. output_file = os.path.join(output_dir, f"item_graph_{dimension_mode}_{depth}.json")
  344. with open(output_file, 'w', encoding='utf-8') as f:
  345. json.dump(output, f, ensure_ascii=False, indent=2)
  346. print(f" 已保存图: {output_file} (post_ids索引: {len(post_ids_index)})")
  347. print(f"\n{'=' * 60}")
  348. print("完成")
if __name__ == "__main__":
    # Script entry point: build the graphs for a hard-coded account across
    # both depth modes and all three dimension modes.
    account_name = "小红书"
    main(account_name, depth_modes=['max', 'all_levels'],
         dimension_modes=['full', 'point_type_only', 'substance_form_only'])
    # Imported here, inside the guard, so it is only loaded when this file is
    # run as a script; generate_html is then called with the account name and
    # its result directory.
    import visualization_item_graph.generate as generate
    generate.generate_html(account_name, f"result/{account_name}")