# apriori_analysis_post_level.py
  1. import json
  2. import os
  3. from typing import List, Dict, Tuple
  4. from fim import fpgrowth as pyfim_fpgrowth
  5. import pandas as pd
  6. def build_classification_path_label(classification_path: List[str]) -> str:
  7. """构建完整分类路径标签
  8. Args:
  9. classification_path: 分类路径数组,如 ['食品', '水果', '猕猴桃']
  10. Returns:
  11. 完整路径标签,如 '食品>水果>猕猴桃'
  12. """
  13. if classification_path and len(classification_path) > 0:
  14. return '>'.join(classification_path)
  15. return ""
  16. def get_path_at_depth(classification_path: List[str], target_depth: int) -> str:
  17. """获取指定深度的分类路径
  18. Args:
  19. classification_path: 分类路径数组,如 ['食品', '水果', '猕猴桃']
  20. target_depth: 目标深度(1-based),如果路径不够深,返回最大深度
  21. Returns:
  22. 指定深度的路径标签
  23. """
  24. if not classification_path or len(classification_path) == 0:
  25. return ""
  26. # 如果请求深度超过实际深度,使用实际深度(取全部)
  27. actual_depth = min(target_depth, len(classification_path))
  28. return '>'.join(classification_path[:actual_depth])
  29. def build_transactions_at_depth(results_file: str, target_depth, dimension_mode: str = 'full',
  30. data: Dict = None) -> Tuple[
  31. List[List[str]], List[str], Dict]:
  32. """构建指定深度的transactions
  33. Args:
  34. results_file: 数据文件路径
  35. target_depth: 目标深度
  36. - 1, 2, 3...: 具体深度层级
  37. - 'max': 最具体(叶子节点)
  38. - 'max_with_name': 最具体+名称
  39. - 'mixed': 混合深度(同时包含 max 路径层 + max_with_name 名称层)
  40. - 'all_levels': 展开所有层级(A, A>B, A>B>C 同时放入transaction)
  41. - 'max-N': 从叶子端缩减N层(如 'max-1' 去掉最后1层,'max-2' 去掉最后2层)
  42. dimension_mode: 维度模式
  43. - 'full': 点类型_维度_路径(默认,维度包括:实质/形式/意图)
  44. - 'point_type_only': 点类型_路径
  45. - 'substance_form_only': 维度_路径(维度包括:实质/形式/意图)
  46. Returns:
  47. (transactions, post_ids, original_data)
  48. Item 格式(根据 dimension_mode):
  49. - full 模式:
  50. - 路径层:点类型_维度_路径(例:灵感点_实质_路径,灵感点_形式_路径,灵感点_意图_路径)
  51. - 名称层:点类型_维度_路径||名称
  52. - point_type_only 模式:
  53. - 路径层:点类型_路径
  54. - 名称层:点类型_路径||名称
  55. - substance_form_only 模式:
  56. - 路径层:维度_路径(例:实质_路径,形式_路径,意图_路径)
  57. - 名称层:维度_路径||名称
  58. """
  59. if data is None:
  60. with open(results_file, 'r', encoding='utf-8') as f:
  61. data = json.load(f)
  62. # 解析 max-N 模式
  63. trim_from_leaf = 0
  64. if isinstance(target_depth, str) and target_depth.startswith('max-'):
  65. trim_from_leaf = int(target_depth.split('-')[1])
  66. transactions = []
  67. post_ids = []
  68. for post_id, post_data in data.items():
  69. post_transaction_set = set()
  70. for point_type in ['灵感点', '目的点', '关键点']:
  71. points = post_data.get(point_type, [])
  72. for point in points:
  73. # 根据 dimension_mode 构建前缀
  74. if dimension_mode == 'full':
  75. prefix_substance = f"{point_type}_实质_"
  76. prefix_form = f"{point_type}_形式_"
  77. prefix_intent = f"{point_type}_意图_"
  78. elif dimension_mode == 'point_type_only':
  79. prefix_substance = f"{point_type}_"
  80. prefix_form = f"{point_type}_"
  81. prefix_intent = f"{point_type}_"
  82. elif dimension_mode == 'substance_form_only':
  83. prefix_substance = "实质_"
  84. prefix_form = "形式_"
  85. prefix_intent = "意图_"
  86. else:
  87. raise ValueError(f"Unknown dimension_mode: {dimension_mode}")
  88. # 辅助函数:展开路径的所有层级前缀
  89. def expand_all_levels(path_list, prefix, start_depth=2):
  90. """将路径展开为多层级前缀,start_depth=1从根节点开始,2跳过根节点"""
  91. items = set()
  92. for depth in range(start_depth, len(path_list) + 1):
  93. label = '>'.join(path_list[:depth])
  94. items.add(f"{prefix}{label}")
  95. return items
  96. # 处理实质
  97. for item in point.get('实质', []):
  98. path = item.get('分类路径', [])
  99. name = item.get('名称', '')
  100. if path:
  101. if target_depth == 'all_levels':
  102. # 展开所有层级前缀
  103. post_transaction_set.update(expand_all_levels(path, prefix_substance))
  104. elif target_depth == 'mixed':
  105. # 混合深度:同时添加路径层和名称层
  106. # 路径层代表分类,名称层代表具体的点
  107. # 它们是不同的语义层级,应该同时存在以发现跨层级关系
  108. path_label = '>'.join(path)
  109. # 添加路径层(分类)
  110. post_transaction_set.add(f"{prefix_substance}{path_label}")
  111. # 添加名称层(具体点)
  112. if name:
  113. post_transaction_set.add(f"{prefix_substance}{path_label}||{name}")
  114. elif target_depth == 'max':
  115. # 使用完整路径(叶子节点)
  116. path_label = '>'.join(path)
  117. post_transaction_set.add(f"{prefix_substance}{path_label}")
  118. elif target_depth == 'max_with_name':
  119. # 使用完整路径||名称
  120. path_label = '>'.join(path)
  121. if name:
  122. post_transaction_set.add(f"{prefix_substance}{path_label}||{name}")
  123. else:
  124. # 如果没有名称,退化为路径
  125. post_transaction_set.add(f"{prefix_substance}{path_label}")
  126. elif trim_from_leaf > 0:
  127. # max-N:从叶子端缩减N层
  128. if len(path) > trim_from_leaf:
  129. trimmed = path[:-trim_from_leaf]
  130. else:
  131. trimmed = path[:1]
  132. path_label = '>'.join(trimmed)
  133. post_transaction_set.add(f"{prefix_substance}{path_label}")
  134. else:
  135. # 使用指定深度
  136. path_label = get_path_at_depth(path, target_depth)
  137. if path_label:
  138. post_transaction_set.add(f"{prefix_substance}{path_label}")
  139. # 处理形式
  140. for item in point.get('形式', []):
  141. path = item.get('分类路径', [])
  142. name = item.get('名称', '')
  143. if path:
  144. if target_depth == 'all_levels':
  145. post_transaction_set.update(expand_all_levels(path, prefix_form))
  146. elif target_depth == 'mixed':
  147. # 混合深度:同时添加路径层和名称层
  148. # 路径层代表分类,名称层代表具体的点
  149. # 它们是不同的语义层级,应该同时存在以发现跨层级关系
  150. path_label = '>'.join(path)
  151. # 添加路径层(分类)
  152. post_transaction_set.add(f"{prefix_form}{path_label}")
  153. # 添加名称层(具体点)
  154. if name:
  155. post_transaction_set.add(f"{prefix_form}{path_label}||{name}")
  156. elif target_depth == 'max':
  157. path_label = '>'.join(path)
  158. post_transaction_set.add(f"{prefix_form}{path_label}")
  159. elif target_depth == 'max_with_name':
  160. path_label = '>'.join(path)
  161. if name:
  162. post_transaction_set.add(f"{prefix_form}{path_label}||{name}")
  163. else:
  164. post_transaction_set.add(f"{prefix_form}{path_label}")
  165. elif trim_from_leaf > 0:
  166. if len(path) > trim_from_leaf:
  167. trimmed = path[:-trim_from_leaf]
  168. else:
  169. trimmed = path[:1]
  170. path_label = '>'.join(trimmed)
  171. post_transaction_set.add(f"{prefix_form}{path_label}")
  172. else:
  173. path_label = get_path_at_depth(path, target_depth)
  174. if path_label:
  175. post_transaction_set.add(f"{prefix_form}{path_label}")
  176. # 处理意图
  177. for item in point.get('意图', []):
  178. path = item.get('分类路径', [])
  179. name = item.get('名称', '')
  180. if path:
  181. if target_depth == 'all_levels':
  182. post_transaction_set.update(expand_all_levels(path, prefix_intent, start_depth=1))
  183. elif target_depth == 'mixed':
  184. # 混合深度:同时添加路径层和名称层
  185. # 路径层代表分类,名称层代表具体的点
  186. # 它们是不同的语义层级,应该同时存在以发现跨层级关系
  187. path_label = '>'.join(path)
  188. # 添加路径层(分类)
  189. post_transaction_set.add(f"{prefix_intent}{path_label}")
  190. # 添加名称层(具体点)
  191. if name:
  192. post_transaction_set.add(f"{prefix_intent}{path_label}||{name}")
  193. elif target_depth == 'max':
  194. path_label = '>'.join(path)
  195. post_transaction_set.add(f"{prefix_intent}{path_label}")
  196. elif target_depth == 'max_with_name':
  197. path_label = '>'.join(path)
  198. if name:
  199. post_transaction_set.add(f"{prefix_intent}{path_label}||{name}")
  200. else:
  201. post_transaction_set.add(f"{prefix_intent}{path_label}")
  202. elif trim_from_leaf > 0:
  203. if len(path) > trim_from_leaf:
  204. trimmed = path[:-trim_from_leaf]
  205. else:
  206. trimmed = path[:1]
  207. path_label = '>'.join(trimmed)
  208. post_transaction_set.add(f"{prefix_intent}{path_label}")
  209. else:
  210. path_label = get_path_at_depth(path, target_depth)
  211. if path_label:
  212. post_transaction_set.add(f"{prefix_intent}{path_label}")
  213. post_transaction = list(post_transaction_set)
  214. if post_transaction:
  215. transactions.append(post_transaction)
  216. post_ids.append(post_id)
  217. return transactions, post_ids, data
  218. def run_fpgrowth_with_absolute_support(transactions: List[List[str]],
  219. min_absolute_support: int = 2,
  220. max_len: int = None) -> pd.DataFrame:
  221. """使用 pyfim FP-Growth 算法挖掘闭频繁项集
  222. Args:
  223. transactions: transaction列表
  224. min_absolute_support: 最小绝对支持度(至少出现在N个transactions中)
  225. max_len: 频繁项集的最大长度(None表示不限制)
  226. Returns:
  227. 包含闭频繁项集的DataFrame
  228. """
  229. if not transactions or len(transactions) == 0:
  230. return pd.DataFrame()
  231. total_transactions = len(transactions)
  232. print(f" 使用算法: pyfim FP-Growth (闭频繁项集)")
  233. print(f" 总 transactions 数: {total_transactions}")
  234. print(f" 最小绝对支持度: {min_absolute_support}")
  235. import time
  236. start_time = time.time()
  237. # pyfim fpgrowth: supp 为负数表示绝对支持度, target='c' 为闭频繁项集, report='a' 返回绝对支持度计数
  238. pyfim_kwargs = {
  239. 'supp': -min_absolute_support,
  240. 'target': 'c',
  241. 'report': 'a',
  242. }
  243. if max_len is not None:
  244. pyfim_kwargs['zmax'] = max_len
  245. print(f" 最大项集长度限制: {max_len}")
  246. result = pyfim_fpgrowth(transactions, **pyfim_kwargs)
  247. elapsed_time = time.time() - start_time
  248. print(f" 算法运行时间: {elapsed_time:.2f} 秒")
  249. if not result:
  250. return pd.DataFrame()
  251. # 转换为 DataFrame
  252. rows = []
  253. for itemset_tuple, abs_support in result:
  254. itemset = frozenset(itemset_tuple)
  255. support = abs_support / total_transactions
  256. rows.append({
  257. 'itemsets': itemset,
  258. 'support': support,
  259. 'absolute_support': int(abs_support),
  260. 'length': len(itemset),
  261. })
  262. frequent_itemsets = pd.DataFrame(rows)
  263. print(f" 找到 {len(frequent_itemsets)} 个闭频繁项集")
  264. # 排序
  265. if not frequent_itemsets.empty:
  266. frequent_itemsets = frequent_itemsets.sort_values(['length', 'support'], ascending=[False, False])
  267. return frequent_itemsets
  268. def find_post_ids_for_itemset(itemset: frozenset, transactions: List[List[str]],
  269. post_ids: List[str]) -> List[str]:
  270. """找出包含指定频繁项集的所有帖子ID
  271. Args:
  272. itemset: 频繁项集
  273. transactions: transaction列表
  274. post_ids: 每个transaction对应的帖子ID列表
  275. Returns:
  276. 包含该频繁项集的帖子ID列表
  277. """
  278. itemset_set = set(itemset)
  279. matched_post_ids = []
  280. for idx, transaction in enumerate(transactions):
  281. transaction_set = set(transaction)
  282. # 如果 transaction 包含 itemset 的所有元素
  283. if itemset_set.issubset(transaction_set):
  284. matched_post_ids.append(post_ids[idx])
  285. return matched_post_ids
  286. def batch_find_post_ids(all_itemsets: List[frozenset],
  287. transactions: List[List[str]],
  288. post_ids: List[str]) -> Dict[frozenset, List[str]]:
  289. """批量查找所有频繁项集匹配的帖子ID(一次遍历 transactions)
  290. 相比对每个 itemset 单独调用 find_post_ids_for_itemset,
  291. 这里只遍历 transactions 一次,同时匹配所有 itemsets。
  292. Args:
  293. all_itemsets: 所有待匹配的频繁项集列表
  294. transactions: transaction列表
  295. post_ids: 每个transaction对应的帖子ID列表
  296. Returns:
  297. {itemset: [matched_post_id, ...], ...}
  298. """
  299. result = {itemset: [] for itemset in all_itemsets}
  300. for idx, transaction in enumerate(transactions):
  301. txn_set = set(transaction)
  302. pid = post_ids[idx]
  303. for itemset in all_itemsets:
  304. if itemset.issubset(txn_set):
  305. result[itemset].append(pid)
  306. return result
  307. def classify_itemset_by_point_type(itemset: frozenset, dimension_mode: str = 'full') -> Dict:
  308. """分类频繁项集涉及的点类型和维度
  309. Args:
  310. itemset: 频繁项集
  311. dimension_mode: 维度模式
  312. - 'full': 点类型_维度_路径(默认,维度包括:实质/形式/意图)
  313. - 'point_type_only': 点类型_路径
  314. - 'substance_form_only': 维度_路径(维度包括:实质/形式/意图)
  315. Returns:
  316. {
  317. 'point_types': 涉及的点类型集合,
  318. 'dimensions': 涉及的维度集合,
  319. 'is_single_point_type': 是否只涉及单一点类型,
  320. 'is_cross_point': 是否跨点类型,
  321. 'combination_type': 组合类型描述
  322. }
  323. """
  324. point_types = set()
  325. dimensions = set()
  326. for item in itemset:
  327. if dimension_mode == 'full':
  328. # 格式: 点类型_维度_路径 或 点类型_维度_路径||名称
  329. parts = item.split('_', 2) # 分割为最多3部分
  330. if len(parts) >= 3:
  331. point_type = parts[0]
  332. dimension = parts[1]
  333. point_types.add(point_type)
  334. dimensions.add(dimension)
  335. elif dimension_mode == 'point_type_only':
  336. # 格式: 点类型_路径 或 点类型_路径||名称
  337. # 只有点类型,没有维度信息
  338. parts = item.split('_', 1)
  339. if len(parts) >= 1:
  340. point_type = parts[0]
  341. point_types.add(point_type)
  342. elif dimension_mode == 'substance_form_only':
  343. # 格式: 维度_路径 或 维度_路径||名称
  344. # 只有维度,没有点类型信息
  345. parts = item.split('_', 1)
  346. if len(parts) >= 2:
  347. dimension = parts[0]
  348. dimensions.add(dimension)
  349. is_single_point_type = len(point_types) <= 1
  350. is_cross_point = len(point_types) > 1
  351. # 生成组合类型描述
  352. if dimension_mode == 'full':
  353. # full 模式:同时考虑点类型和维度
  354. if is_single_point_type:
  355. point_type = list(point_types)[0] if point_types else '未知'
  356. if len(dimensions) == 1:
  357. dimension = list(dimensions)[0]
  358. combination_type = f"{point_type}_{dimension}"
  359. else:
  360. # 多个维度:使用实际的维度组合
  361. dimensions_sorted = sorted(list(dimensions))
  362. combination_type = f"{point_type}_{'+'.join(dimensions_sorted)}"
  363. else:
  364. # 跨点组合
  365. point_types_list = sorted(list(point_types))
  366. if len(dimensions) == 1:
  367. dimension = list(dimensions)[0]
  368. combination_type = f"{'×'.join(point_types_list)}_{dimension}"
  369. else:
  370. combination_type = f"{'×'.join(point_types_list)}_混合"
  371. elif dimension_mode == 'point_type_only':
  372. # point_type_only 模式:只考虑点类型
  373. if is_single_point_type:
  374. point_type = list(point_types)[0] if point_types else '未知'
  375. combination_type = f"{point_type}"
  376. else:
  377. # 跨点组合
  378. point_types_list = sorted(list(point_types))
  379. combination_type = f"{'×'.join(point_types_list)}"
  380. elif dimension_mode == 'substance_form_only':
  381. # substance_form_only 模式:只考虑维度
  382. if len(dimensions) == 1:
  383. dimension = list(dimensions)[0]
  384. combination_type = f"{dimension}"
  385. else:
  386. # 多个维度
  387. dimensions_sorted = sorted(list(dimensions))
  388. combination_type = f"{'+'.join(dimensions_sorted)}"
  389. return {
  390. 'point_types': list(point_types),
  391. 'dimensions': list(dimensions),
  392. 'is_single_point_type': is_single_point_type,
  393. 'is_cross_point': is_cross_point,
  394. 'combination_type': combination_type
  395. }
  396. def categorize_frequent_itemsets(frequent_itemsets: pd.DataFrame, dimension_mode: str = 'full') -> Dict[
  397. str, pd.DataFrame]:
  398. """将频繁项集按组合类型分类
  399. Args:
  400. frequent_itemsets: Apriori结果DataFrame
  401. dimension_mode: 维度模式
  402. Returns:
  403. 按组合类型分类的字典
  404. """
  405. if frequent_itemsets.empty:
  406. return {}
  407. categorized = {}
  408. for _, row in frequent_itemsets.iterrows():
  409. itemset = row['itemsets']
  410. classification = classify_itemset_by_point_type(itemset, dimension_mode)
  411. combo_type = classification['combination_type']
  412. if combo_type not in categorized:
  413. categorized[combo_type] = []
  414. categorized[combo_type].append(row)
  415. # 转换为DataFrame
  416. result = {}
  417. for combo_type, rows in categorized.items():
  418. result[combo_type] = pd.DataFrame(rows)
  419. return result
  420. def parse_mixed_item(item: str) -> Dict:
  421. """解析 item(统一格式)
  422. Args:
  423. item: 格式为 "点类型_维度_路径" 或 "点类型_维度_路径||名称"
  424. Returns:
  425. {
  426. 'layer': 'path' 或 'name', # 通过是否包含||判断
  427. 'point_type': 点类型,
  428. 'dimension': 维度,
  429. 'path': 路径,
  430. 'name': 名称(如果有),
  431. 'full_path': 完整路径(包含名称)
  432. }
  433. """
  434. # 分离点类型、维度和路径
  435. parts = item.split('_', 2)
  436. if len(parts) < 3:
  437. raise ValueError(f"Invalid item format: {item}")
  438. point_type = parts[0]
  439. dimension = parts[1]
  440. path_with_name = parts[2]
  441. # 分离路径和名称
  442. if '||' in path_with_name:
  443. path, name = path_with_name.split('||', 1)
  444. layer = 'name'
  445. full_path = f"{path}>{name}"
  446. else:
  447. path = path_with_name
  448. name = None
  449. layer = 'path'
  450. full_path = path
  451. return {
  452. 'layer': layer,
  453. 'point_type': point_type,
  454. 'dimension': dimension,
  455. 'path': path,
  456. 'name': name,
  457. 'full_path': full_path
  458. }
  459. def is_ancestor_descendant_pair(item1: str, item2: str) -> bool:
  460. """检查两个 item 是否为父子/祖孙关系
  461. Args:
  462. item1, item2: 混合深度格式的 item
  463. Returns:
  464. True 如果存在祖先-后代关系
  465. """
  466. try:
  467. parsed1 = parse_mixed_item(item1)
  468. parsed2 = parse_mixed_item(item2)
  469. except ValueError:
  470. return False
  471. # 必须是相同的点类型和维度
  472. if (parsed1['point_type'] != parsed2['point_type'] or
  473. parsed1['dimension'] != parsed2['dimension']):
  474. return False
  475. # 检查 full_path 的包含关系
  476. path1 = parsed1['full_path']
  477. path2 = parsed2['full_path']
  478. # item1 是 item2 的祖先
  479. if path2.startswith(path1 + '>'):
  480. return True
  481. # item2 是 item1 的祖先
  482. if path1.startswith(path2 + '>'):
  483. return True
  484. return False
  485. def is_cross_level_combination(itemset) -> bool:
  486. """检查是否为跨层级组合(有意义的组合)
  487. 有意义的组合:
  488. - 包含至少一个 PATH 和一个 NAME
  489. - 或者全是 NAME 但来自不同的路径分支
  490. 无意义的组合(需要过滤):
  491. - 全是 PATH
  492. - 全是 NAME 且来自同一路径分支(兄弟节点)
  493. - 存在父子/祖孙关系
  494. Args:
  495. itemset: frozenset 或 list of items
  496. Returns:
  497. True 如果是有意义的跨层级组合
  498. """
  499. items = list(itemset)
  500. # 规则1:检查是否存在父子/祖孙关系
  501. for i, item1 in enumerate(items):
  502. for j, item2 in enumerate(items):
  503. if i < j and is_ancestor_descendant_pair(item1, item2):
  504. return False # 存在父子关系,过滤
  505. # 解析所有 items
  506. parsed_items = []
  507. for item in items:
  508. try:
  509. parsed_items.append(parse_mixed_item(item))
  510. except ValueError:
  511. return False
  512. # 统计层级分布
  513. layers = [p['layer'] for p in parsed_items]
  514. layer_counts = {'path': layers.count('path'), 'name': layers.count('name')}
  515. # 规则2:过滤全是 path 的组合
  516. if layer_counts['name'] == 0:
  517. return False
  518. # 规则3:过滤全是 name 的组合(不是跨层级)
  519. if layer_counts['path'] == 0:
  520. return False
  521. # 只有同时包含 path 和 name 才是有意义的跨层级组合
  522. return True
  523. def filter_mixed_depth_itemsets(frequent_itemsets: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]:
  524. """过滤混合深度的频繁项集,只保留有意义的跨层级组合
  525. Args:
  526. frequent_itemsets: Apriori 结果
  527. Returns:
  528. (filtered_itemsets, filter_stats)
  529. """
  530. if frequent_itemsets.empty:
  531. return frequent_itemsets, {}
  532. filtered_rows = []
  533. filter_stats = {
  534. 'total': len(frequent_itemsets),
  535. 'filtered_all_path': 0,
  536. 'filtered_same_path_names': 0,
  537. 'filtered_ancestor_descendant': 0,
  538. 'kept': 0
  539. }
  540. for _, row in frequent_itemsets.iterrows():
  541. itemset = row['itemsets']
  542. # 最小长度检查
  543. if len(itemset) < 2:
  544. continue
  545. # 跨层级检查
  546. if not is_cross_level_combination(itemset):
  547. # 统计过滤原因(简化版)
  548. layers = []
  549. for item in itemset:
  550. try:
  551. parsed = parse_mixed_item(item)
  552. layers.append(parsed['layer'])
  553. except ValueError:
  554. pass
  555. if layers.count('name') == 0:
  556. filter_stats['filtered_all_path'] += 1
  557. else:
  558. filter_stats['filtered_same_path_names'] += 1
  559. continue
  560. filtered_rows.append(row)
  561. filter_stats['kept'] += 1
  562. if not filtered_rows:
  563. return pd.DataFrame(), filter_stats
  564. result = pd.DataFrame(filtered_rows)
  565. return result.sort_values(['length', 'support'], ascending=[False, False]), filter_stats
  566. def format_results_for_json(frequent_itemsets: pd.DataFrame,
  567. total_transactions: int,
  568. transactions: List[List[str]],
  569. post_ids: List[str],
  570. depth=None,
  571. dimension_mode: str = 'full') -> Dict:
  572. """格式化结果为JSON格式
  573. Args:
  574. frequent_itemsets: Apriori结果DataFrame
  575. total_transactions: transaction总数
  576. transactions: transaction列表
  577. post_ids: 每个transaction对应的帖子ID
  578. depth: 深度标记(可选)
  579. dimension_mode: 维度模式
  580. Returns:
  581. 格式化的结果字典
  582. """
  583. if frequent_itemsets.empty:
  584. return {
  585. 'depth': depth,
  586. 'total_transactions': total_transactions,
  587. 'frequent_itemsets_by_type': {}
  588. }
  589. # 按组合类型分类(传递 dimension_mode)
  590. categorized = categorize_frequent_itemsets(frequent_itemsets, dimension_mode)
  591. # 批量查找所有 itemset 匹配的帖子(一次遍历 transactions)
  592. all_itemsets = list(frequent_itemsets['itemsets'])
  593. import time
  594. t0 = time.time()
  595. itemset_post_map = batch_find_post_ids(all_itemsets, transactions, post_ids)
  596. print(f" 批量匹配帖子耗时: {time.time() - t0:.2f} 秒 ({len(all_itemsets)} 个项集 × {len(transactions)} 个帖子)")
  597. results_by_type = {}
  598. for combo_type, itemsets_df in categorized.items():
  599. itemsets_list = []
  600. for _, row in itemsets_df.iterrows():
  601. itemset = row['itemsets']
  602. matched_posts = itemset_post_map[itemset]
  603. # 分类信息(传递 dimension_mode)
  604. classification = classify_itemset_by_point_type(itemset, dimension_mode)
  605. itemset_dict = {
  606. 'itemset': list(itemset),
  607. 'support': float(row['support']),
  608. 'absolute_support': int(row['absolute_support']),
  609. 'length': int(row['length']),
  610. 'post_count': len(matched_posts),
  611. 'matched_posts': matched_posts,
  612. 'point_types': classification['point_types'],
  613. 'dimensions': classification['dimensions'],
  614. 'is_cross_point': classification['is_cross_point']
  615. }
  616. itemsets_list.append(itemset_dict)
  617. results_by_type[combo_type] = {
  618. 'count': len(itemsets_list),
  619. 'itemsets': itemsets_list
  620. }
  621. return {
  622. 'depth': depth,
  623. 'total_transactions': total_transactions,
  624. 'min_support_used': float(frequent_itemsets['support'].min()) if not frequent_itemsets.empty else 0,
  625. 'frequent_itemsets_by_type': results_by_type
  626. }