# pattern_dimension_analyze.py

  1. """
  2. Pattern 维度分析 Tool
  3. 功能概述:
  4. 1. 读取某次整体推导日志目录下各轮评估结果,累计 matched_post_point / derivation_output_point 等字段。
  5. 2. 每轮通过 derivation_output_point 在人设树中找到 cluster_level 层祖先节点(已推导维度节点集合)。
  6. 3. 从 deduped_patterns 中筛选包含已推导维度节点的 pattern,并对各元素标记是否已推导。
  7. 输入参数:
  8. - account_name: 账号名称
  9. - post_id: 帖子 ID
  10. - log_id: 推导日志目录名(形如 20260313210921)
  11. - cluster_level: 在人设树中查找祖先节点的目标深度(root 为 0 层)
  12. """
import json
import sys
from collections import deque
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

# Make utils / tools resolvable whether this file is run directly or loaded as
# a package (also lets IDEs jump to definitions).
_root = Path(__file__).resolve().parent.parent
if str(_root) not in sys.path:
    sys.path.insert(0, str(_root))

from tools.find_tree_node import _load_trees  # loads the three persona trees

_BASE_INPUT = Path(__file__).resolve().parent.parent / "input"
_BASE_OUTPUT = Path(__file__).resolve().parent.parent / "output"

# Pattern library keys (kept consistent with find_pattern)
TOP_KEYS = [
    "depth_4",
]
SUB_KEYS = ["two_x", "one_x", "zero_x"]


# ---------------------------------------------------------------------------
# 1. Read the derivation log: accumulate matched_post_point per round
# ---------------------------------------------------------------------------
def _round_eval_dir(account_name: str, post_id: str, log_id: str) -> Path:
    """
    Derivation-log directory:
    ../output/{account_name}/推导日志/{post_id}/{log_id}/
    """
    return _BASE_OUTPUT / account_name / "推导日志" / post_id / log_id


def _load_round_matched_points(
    account_name: str,
    post_id: str,
    log_id: str,
) -> List[Dict[str, Any]]:
    """
    Read every {round}_评估.json file under the given log directory, sort by
    round number, and produce:
    [
        {
            "round": 1,
            "round_points": [
                {
                    "matched_post_point": "叙事结构",
                    "derivation_output_point": "叙事编排",
                    "matched_score": 0.9151,
                    "is_fully_derived": true,
                },
                ...
            ],
            "cumulative_points": [
                ... deduplicated list accumulated up to this round
                    (keyed by derivation_output_point) ...
            ],
        },
        ...
    ]
    """
    base_dir = _round_eval_dir(account_name, post_id, log_id)
    if not base_dir.is_dir():
        return []
    eval_files: List[Tuple[int, Path]] = []
    for p in base_dir.glob("*.json"):
        name = p.name
        # Only handle *_评估.json files
        if not name.endswith("评估.json"):
            continue
        try:
            round_str = name.split("_", 1)[0]
            r = int(round_str)
        except Exception:
            continue
        eval_files.append((r, p))
    eval_files.sort(key=lambda x: x[0])
    results: List[Dict[str, Any]] = []
    cumulative: List[Dict[str, Any]] = []
    cumulative_set: Set[str] = set()  # deduplicated by derivation_output_point
    for r, path in eval_files:
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception:
            continue
        eval_results = data.get("eval_results") or []
        round_points: List[Dict[str, Any]] = []
        seen_in_round: Set[str] = set()
        for item in eval_results:
            if not isinstance(item, dict):
                continue
            if not item.get("is_matched"):
                continue
            dop = item.get("derivation_output_point")
            if dop is None:
                continue
            dop = str(dop).strip()
            if not dop:
                continue
            # Deduplicate within the round by derivation_output_point
            if dop in seen_in_round:
                continue
            seen_in_round.add(dop)
            mpp = item.get("matched_post_point")
            entry: Dict[str, Any] = {
                "matched_post_point": str(mpp).strip() if mpp is not None else None,
                "derivation_output_point": dop,
                "matched_score": item.get("matched_score"),
                "is_fully_derived": item.get("is_fully_derived"),
            }
            round_points.append(entry)
        # Fold into the cumulative list (deduplicated by derivation_output_point)
        for entry in round_points:
            dop = entry["derivation_output_point"]
            if dop not in cumulative_set:
                cumulative_set.add(dop)
                cumulative.append(entry)
        results.append(
            {
                "round": r,
                "round_points": round_points,
                "cumulative_points": list(cumulative),
            }
        )
    return results
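

# Input sketch for one {round}_评估.json file (field names follow the accesses
# above; the sample values mirror the docstring and are illustrative only):
#
#   {
#     "eval_results": [
#       {
#         "is_matched": true,
#         "matched_post_point": "叙事结构",
#         "derivation_output_point": "叙事编排",
#         "matched_score": 0.9151,
#         "is_fully_derived": true
#       }
#     ]
#   }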


# ---------------------------------------------------------------------------
# 2. Load the pattern library and score by matched_post_point
# ---------------------------------------------------------------------------
def _pattern_file(account_name: str) -> Path:
    """Pattern library file: ../input/{account_name}/原始数据/pattern/processed_edge_data.json"""
    return _BASE_INPUT / account_name / "原始数据" / "pattern" / "processed_edge_data.json"


def _load_raw_patterns(account_name: str) -> List[Dict[str, Any]]:
    """
    Read every raw pattern from the pattern library (keeping the items
    structure, no merging). Each element of the returned list mirrors a
    pattern in the raw JSON (the items' point / dimension fields are
    irrelevant here).
    """
    path = _pattern_file(account_name)
    if not path.is_file():
        return []
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    patterns: List[Dict[str, Any]] = []
    for top in TOP_KEYS:
        block = data.get(top)
        if not isinstance(block, dict):
            continue
        for sub in SUB_KEYS:
            items = block.get(sub) or []
            if isinstance(items, list):
                for p in items:
                    if isinstance(p, dict):
                        patterns.append(p)
    return patterns
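

# Pattern-library sketch (shape assumed from TOP_KEYS / SUB_KEYS and the field
# accesses in this file; ids, supports, and names are illustrative only):
#
#   {
#     "depth_4": {
#       "two_x": [
#         {"id": 17, "support": 0.42, "items": [{"name": "叙事编排"}, ...]},
#         ...
#       ],
#       "one_x": [...],
#       "zero_x": [...]
#     }
#   }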


def _slim_pattern_for_dedupe(p: Dict[str, Any]) -> Tuple[float, List[str]]:
    """
    Extract a pattern's support and its deduplicated item-name list (merged by
    name, order-insensitive), aligned with the dedupe logic in find_pattern.py.
    """
    items = p.get("items") or []
    names = [str(it.get("name") or "").strip() for it in items if isinstance(it, dict)]
    seen: Set[str] = set()
    unique: List[str] = []
    for n in names:
        if n and n not in seen:
            seen.add(n)
            unique.append(n)
    try:
        support = float(p.get("support", 0.0))
    except (TypeError, ValueError):
        support = 0.0
    return support, unique


def _dedupe_patterns(raw_patterns: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Deduplicate patterns by their set of item names (order-insensitive), in the
    same spirit as find_pattern.py:
    - the key is sorted(unique item names)
    - for each key keep only the pattern with the highest support (its raw
      items structure is preserved for later scoring)
    """
    key_to_best: Dict[Tuple[str, ...], Dict[str, Any]] = {}
    key_to_support: Dict[Tuple[str, ...], float] = {}
    for p in raw_patterns:
        support, unique = _slim_pattern_for_dedupe(p)
        if not unique:
            continue
        key = tuple(sorted(unique))
        best_support = key_to_support.get(key)
        if best_support is None or support > best_support:
            key_to_support[key] = support
            key_to_best[key] = p
    return list(key_to_best.values())
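

# Dedupe sketch (toy data, illustrative only): two patterns whose item sets
# match after name-deduplication and sorting collapse into one entry, keeping
# the one with the higher support:
#
#   _dedupe_patterns([
#       {"support": 0.3, "items": [{"name": "A"}, {"name": "B"}]},
#       {"support": 0.5, "items": [{"name": "B"}, {"name": "A"}]},
#   ])
#   # -> [{"support": 0.5, "items": [{"name": "B"}, {"name": "A"}]}]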


# ---------------------------------------------------------------------------
# 3. Persona-tree node info & cluster-node search
# ---------------------------------------------------------------------------
class TreeIndex:
    """
    Persona tree index:
    - node_info: node -> {"parent": parent node name, "children": [child node
      names...], "depth": depth, "dimension": dimension name}
    - roots: dimension name -> root node name (the dimension name itself)
    - merged_tree: the 实质 / 形式 / 意图 trees merged into a single JSON
      whose top-level keys are 实质 / 形式 / 意图
    """

    def __init__(self, account_name: str) -> None:
        self.account_name = account_name
        self.node_info: Dict[str, Dict[str, Any]] = {}
        self.roots: Dict[str, str] = {}
        # The three trees merged into one JSON: {"实质": {...}, "形式": {...}, "意图": {...}}
        self.merged_tree: Dict[str, Dict[str, Any]] = {}
        self._build()

    def _build(self) -> None:
        trees = _load_trees(self.account_name)
        # 1) Merge the three trees into one JSON: {"实质": {...}, "形式": {...}, "意图": {...}}
        merged: Dict[str, Dict[str, Any]] = {}
        for dim_name, root in trees:
            if isinstance(root, dict):
                merged[dim_name] = root
        self.merged_tree = merged
        # 2) Build the parent/children structure from the merged JSON
        for dim_name, root in merged.items():
            root_name = dim_name
            self.roots[dim_name] = root_name
            if root_name not in self.node_info:
                self.node_info[root_name] = {
                    "parent": None,
                    "children": [],
                    "dimension": dim_name,
                    "depth": 0,
                }

            def walk(parent_name: str, node_dict: Dict[str, Any]):
                children = node_dict.get("children") or {}
                for name, child in children.items():
                    if not isinstance(child, dict):
                        continue
                    if name not in self.node_info:
                        self.node_info[name] = {
                            "parent": parent_name,
                            "children": [],
                            "dimension": dim_name,
                            "depth": None,  # computed later in one pass
                        }
                    else:
                        # Only update parent when it would not create a
                        # self-reference (the trees may contain parent/child
                        # nodes sharing the same name).
                        if name != parent_name:
                            self.node_info[name]["parent"] = parent_name
                        self.node_info[name]["dimension"] = dim_name
                    # Maintain the parent's children list
                    if parent_name not in self.node_info:
                        self.node_info[parent_name] = {
                            "parent": None,
                            "children": [],
                            "dimension": dim_name,
                            "depth": 0,
                        }
                    if name not in self.node_info[parent_name]["children"]:
                        self.node_info[parent_name]["children"].append(name)
                    walk(name, child)

            walk(root_name, root)
        # Compute every node's depth in one pass (BFS from the roots)
        q = deque()
        for dim_name, root_name in self.roots.items():
            if root_name not in self.node_info:
                continue
            self.node_info[root_name]["depth"] = 0
            q.append(root_name)
        while q:
            cur = q.popleft()
            cur_depth = self.node_info[cur].get("depth", 0) or 0
            for child in self.node_info[cur].get("children", []):
                self.node_info.setdefault(child, {})
                if self.node_info[child].get("depth") is None:
                    self.node_info[child]["depth"] = cur_depth + 1
                    # Enqueue only on first visit so shared or cyclic
                    # references cannot loop the BFS
                    q.append(child)
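
    # node_info sketch after _build (hypothetical node names, illustrative
    # only): every node maps to a record like
    #   {"parent": "生活方式", "children": ["晨间记录", ...],
    #    "dimension": "实质", "depth": 2}
    # with each dimension root at depth 0 and deeper depths filled in by the
    # BFS above.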

    def find_ancestor_at_level(self, node_name: str, level: int) -> Optional[str]:
        """
        Find the ancestor of node_name at depth == level in the persona tree.
        - If node_name itself has depth == level, return node_name.
        - If node_name is shallower than the target (depth < level), return node_name.
        - Otherwise walk up the parent chain and return the first ancestor
          whose depth == level.
        """
        info = self.node_info.get(node_name)
        if not info:
            return None
        depth = info.get("depth")
        if depth is None:
            return None
        if depth <= level:
            return node_name
        cur = node_name
        visited: Set[str] = set()
        while cur and cur not in visited:
            visited.add(cur)
            cur_info = self.node_info.get(cur) or {}
            cur_depth = cur_info.get("depth") or 0
            if cur_depth == level:
                return cur
            if cur_depth < level:
                return cur
            parent = cur_info.get("parent")
            if parent is None:
                return cur
            cur = parent
        return None
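
    # Ancestor-lookup sketch (hypothetical chain, illustrative only): given
    # 实质 (depth 0) -> 生活方式 (1) -> 日常记录 (2) -> 晨间记录 (3),
    # find_ancestor_at_level("晨间记录", 2) walks up and returns "日常记录",
    # while find_ancestor_at_level("生活方式", 2) returns "生活方式" itself,
    # since its depth is already <= the target level.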

    # Cluster search (dimensions are no longer distinguished)
    def find_clusters(
        self,
        elements: List[str],
        cluster_level: int,
    ) -> List[Dict[str, Any]]:
        """
        Across all persona trees, find cluster nodes for the given elements
        (a consistent dimension is no longer required).
        Rules (fixed clustering level cluster_level):
        - Clustering is only evaluated at nodes with depth == cluster_level:
          * if a node's subtree contains >= 2 of the elements and no cluster
            node has been chosen higher up (at a smaller depth) on this path,
            the node becomes a cluster node.
        - For elements that cannot be clustered upward, look up their ancestor
          at depth == cluster_level; if it exists, it becomes the element's
          "single-element cluster" node.
        - Returns:
            [
                {
                    "cluster_node": "node name",
                    "from_elements": ["element A", "element B", ...]
                },
                ...
            ]
        """
        # Keep only the elements that actually exist in the persona trees
        elem_set: Set[str] = set()
        for e in elements:
            e = str(e).strip()
            if not e:
                continue
            info = self.node_info.get(e)
            if not info:
                continue
            elem_set.add(e)
        if not elem_set:
            return []
        # First, count the elements contained in each node's subtree (across
        # every dimension root). The tree data may contain unexpected cycles
        # or duplicate references, so a visited set guards the recursion.
        subtree_count: Dict[str, int] = {}

        def dfs_count(node: str, visited: Set[str]) -> int:
            if node in visited:
                # Cycle detected; return 0 to avoid infinite recursion
                return 0
            visited.add(node)
            cnt = 1 if node in elem_set else 0
            for ch in self.node_info.get(node, {}).get("children", []):
                cnt += dfs_count(ch, visited)
            subtree_count[node] = cnt
            return cnt

        for root_name in self.roots.values():
            dfs_count(root_name, set())
        # Then, top-down, prefer "higher" cluster nodes (but only at the
        # cluster_level layer):
        # - once a node is chosen as a cluster node, none of its descendants
        #   can be chosen (so clustering happens as high as possible);
        # a visited set again keeps unexpected cycles from blowing the recursion.
        clusters: Set[str] = set()

        def dfs_select(node: str, ancestor_selected: bool, visited: Set[str]) -> None:
            if node in visited:
                return
            visited.add(node)
            info = self.node_info.get(node) or {}
            depth = info.get("depth", 0) or 0
            cnt = subtree_count.get(node, 0)
            selected_here = False
            # Choose the node as a cluster node only when no ancestor was
            # chosen, the node sits at cluster_level, and it covers enough elements
            if (not ancestor_selected) and depth == cluster_level and cnt >= 2:
                clusters.add(node)
                selected_here = True
            # If an ancestor or the node itself was chosen, descendants are skipped
            for ch in info.get("children", []):
                dfs_select(ch, ancestor_selected or selected_here, visited)

        for root_name in self.roots.values():
            dfs_select(root_name, False, set())
        if not clusters:
            return []
        # Collect the elements actually covered by each cluster node
        cluster_to_elements: Dict[str, Set[str]] = {c: set() for c in clusters}
        for e in elem_set:
            cur = e
            visited: Set[str] = set()
            while cur and cur not in visited:
                visited.add(cur)
                if cur in clusters:
                    cluster_to_elements[cur].add(e)
                parent = self.node_info.get(cur, {}).get("parent")
                if parent is None:
                    break
                cur = parent
        out: List[Dict[str, Any]] = []
        # 1) Multi-element clusters: only count elements covered by cluster
        #    nodes that are actually emitted, so nodes covering fewer than 2
        #    elements do not mark elements as covered and silently drop them.
        covered_elems: Set[str] = set()
        for node in clusters:
            elems = sorted(cluster_to_elements.get(node) or [])
            if len(elems) < 2:
                # The main clustering logic only considers nodes covering >= 2 elements
                continue
            out.append(
                {
                    "cluster_node": node,
                    "from_elements": elems,
                }
            )
            for e in elems:
                covered_elems.add(e)
        # 2) Elements that could not be clustered upward get a "single-element cluster"
        uncovered = elem_set - covered_elems
        # Group the uncovered elements by their ancestor at cluster_level so
        # that several elements under the same ancestor merge into one cluster
        # instead of several single-element clusters.
        single_clusters: Dict[str, Set[str]] = {}
        for e in uncovered:
            # For a single-element cluster, cluster_node should be the
            # ancestor, not the element itself. The ancestor at
            # depth == cluster_level is chosen here.
            info_e = self.node_info.get(e) or {}
            parent = info_e.get("parent")
            cur = parent
            best_ancestor: Optional[str] = None
            visited_chain: Set[str] = set()
            while cur and cur not in visited_chain:
                visited_chain.add(cur)
                info = self.node_info.get(cur) or {}
                depth = info.get("depth", 0) or 0
                if depth == cluster_level:
                    best_ancestor = cur
                    break
                parent = info.get("parent")
                if parent is None:
                    break
                cur = parent
            if best_ancestor:
                single_clusters.setdefault(best_ancestor, set()).add(e)
        for anc, elems in single_clusters.items():
            out.append(
                {
                    "cluster_node": anc,
                    "from_elements": sorted(elems),
                }
            )
        # For stable output, sort by element count (descending), then by node name
        out.sort(key=lambda x: (-len(x["from_elements"]), x["cluster_node"]))
        return out
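

# Clustering sketch (hypothetical node names, illustrative only): with
# cluster_level=2, two elements sharing a depth-2 ancestor merge into one
# cluster, while a lone element falls back to a single-element cluster under
# its own depth-2 ancestor:
#
#   idx = TreeIndex("家有大志")
#   idx.find_clusters(["晨间记录", "夜间记录", "美食探店"], cluster_level=2)
#   # -> [{"cluster_node": "日常记录", "from_elements": ["夜间记录", "晨间记录"]},
#   #     {"cluster_node": "美食", "from_elements": ["美食探店"]}]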


# ---------------------------------------------------------------------------
# 4. Run the pattern & cluster analysis for a single round
# ---------------------------------------------------------------------------
def _analyze_single_round(
    patterns: List[Dict[str, Any]],
    tree_index: TreeIndex,
    cumulative_points: List[Dict[str, Any]],
    cluster_level: int,
) -> Dict[str, Any]:
    """
    Run the dimension analysis for one round (given its cumulative point list):
    1. Take each derivation_output_point in cumulative_points and find its
       ancestor at depth cluster_level in the persona trees
       → derived_ancestor_set (the set of derived dimension nodes).
    2. Filter the deduped patterns: keep those with at least 5 items in which
       nodes from derived_ancestor_set make up at least 50% of the items.
    3. Mark each element of the filtered patterns:
       - element in derived_ancestor_set → is_derived=True (derived dimension)
       - otherwise → is_derived=False (underived dimension)
    4. Aggregate the derived_dims / underived_dims lists.
    Returns:
    {
        "cumulative_points": [...],       # raw cumulative point objects
        "derived_ancestor_nodes": [...],  # cluster_level ancestors of all derivation_output_points (derived dimension node set)
        "patterns": [...],                # filtered patterns with is_derived tags
        "derived_dims": [...],            # derived dimension nodes (deduplicated, appearing in filtered patterns)
        "underived_dims": [...],          # underived dimension nodes (deduplicated, excluding derived nodes)
        "patterns_count": int,
        "derived_dim_count": int,
        "underived_dim_count": int,
    }
    """
    # 1. Build derived_ancestor_set, recording the matched_post_point sources
    #    behind each ancestor node
    derived_ancestor_set: Set[str] = set()
    ancestor_to_mpps: Dict[str, List[str]] = {}  # ancestor node -> [matched_post_point, ...]
    for entry in cumulative_points:
        dop = entry.get("derivation_output_point")
        if not dop:
            continue
        ancestor = tree_index.find_ancestor_at_level(str(dop).strip(), cluster_level)
        if not ancestor:
            continue
        derived_ancestor_set.add(ancestor)
        mpp = entry.get("matched_post_point") or ""
        if mpp and mpp not in ancestor_to_mpps.get(ancestor, []):
            ancestor_to_mpps.setdefault(ancestor, []).append(mpp)
    # 2. Filter patterns: derived dimension nodes must account for >= 50% of all items
    filtered_patterns: List[Dict[str, Any]] = []
    for p in patterns:
        items = p.get("items") or []
        item_names = [
            str(it.get("name") or "").strip()
            for it in items
            if isinstance(it, dict)
        ]
        if not item_names:
            continue
        if len(item_names) < 5:
            continue
        derived_count = sum(1 for name in item_names if name in derived_ancestor_set)
        if derived_count / len(item_names) >= 0.5:
            filtered_patterns.append(p)
    print(
        f"filtered_patterns: {len(filtered_patterns)}, "
        f"derived_ancestor_set: {len(derived_ancestor_set)}"
    )

    def _node_label(name: str, is_derived: bool) -> str:
        """
        Format a node label:
        - derived node:   'node_name->dimension(mpp1,mpp2,...)'
        - underived node: 'node_name->dimension'
        """
        dim = (tree_index.node_info.get(name) or {}).get("dimension") or ""
        base = f"{name}->{dim}" if dim else name
        if is_derived:
            mpps = ancestor_to_mpps.get(name) or []
            if mpps:
                return f"{base}({','.join(mpps)})"
        return base

    # 3. Classify the filtered patterns' elements and aggregate the dimension lists
    derived_dims: List[str] = []
    underived_dims: List[str] = []
    derived_dims_seen: Set[str] = set()
    underived_dims_seen: Set[str] = set()
    scored_patterns: List[Dict[str, Any]] = []
    for p in filtered_patterns:
        items = p.get("items") or []
        tagged_items: List[Dict[str, Any]] = []
        for it in items:
            if not isinstance(it, dict):
                continue
            name = str(it.get("name") or "").strip()
            is_derived = name in derived_ancestor_set
            tagged_items.append(
                {
                    "name": name,
                    "is_derived": is_derived,
                }
            )
            if is_derived:
                if name and name not in derived_dims_seen:
                    derived_dims_seen.add(name)
                    derived_dims.append(_node_label(name, is_derived=True))
            else:
                if name and name not in underived_dims_seen:
                    underived_dims_seen.add(name)
                    underived_dims.append(_node_label(name, is_derived=False))
        scored_patterns.append(
            {
                "id": p.get("id"),
                "support": p.get("support"),
                "items": tagged_items,
            }
        )
    # Drop nodes from underived_dims that overlap with derived_dims
    underived_dims = [d for d in underived_dims if d.split("->")[0] not in derived_dims_seen]
    # Sort by the count of is_derived=True elements (descending), then by total
    # element count (descending)
    scored_patterns.sort(
        key=lambda x: (
            sum(1 for it in x.get("items", []) if it.get("is_derived")),
            len(x.get("items", [])),
        ),
        reverse=True,
    )
    return {
        "cumulative_points": list(cumulative_points),
        "derived_ancestor_nodes": sorted(derived_ancestor_set),
        "patterns": scored_patterns,
        "derived_dims": derived_dims,
        "underived_dims": underived_dims,
        "patterns_count": len(scored_patterns),
        "derived_dim_count": len(derived_dims),
        "underived_dim_count": len(underived_dims),
    }
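

# Threshold arithmetic sketch (illustrative): a 6-item pattern with 3 items in
# derived_ancestor_set passes the filter (3/6 = 0.5 >= 0.5); a 5-item pattern
# with 2 derived items fails (2/5 = 0.4 < 0.5); patterns with fewer than 5
# items are skipped before the ratio is computed.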


def pattern_dimension_analyze(
    account_name: str,
    post_id: str,
    log_id: str,
    cluster_level: int = 2,
) -> Dict[str, Any]:
    """
    Main entry point for the pattern dimension analysis.

    Parameters
    ----------
    account_name : account name (locates the data directories under input / output)
    post_id : post ID (locates the derivation log)
    log_id : derivation-log directory name (../output/{account_name}/推导日志/{post_id}/{log_id}/)
    cluster_level : target depth for the ancestor lookup in the persona trees
        (root is depth 0), default 2

    Overview
    --------
    For each round:
    1. Map every derivation_output_point to its ancestor at depth cluster_level
       in the persona trees → the set of derived dimension nodes.
    2. Filter the patterns containing derived dimension nodes.
    3. Mark each pattern element as derived or not and aggregate
       derived_dims / underived_dims.
    """
    eval_dir = _round_eval_dir(account_name, post_id, log_id)
    if not eval_dir.is_dir():
        raise FileNotFoundError(f"Derivation log directory does not exist: {eval_dir}")
    round_infos = _load_round_matched_points(account_name, post_id, log_id)
    if not round_infos:
        return {
            "account_name": account_name,
            "post_id": post_id,
            "log_id": log_id,
            "cluster_level": cluster_level,
            "rounds": [],
            "message": "No evaluation result files (*_评估.json) found in the given log directory",
        }
    tree_index = TreeIndex(account_name)
    # The pattern library is read & deduplicated once for the whole analysis,
    # avoiding repeated IO and parsing per round
    raw_patterns = _load_raw_patterns(account_name)
    deduped_patterns = _dedupe_patterns(raw_patterns)
    print(f"deduped_patterns len: {len(deduped_patterns)}")
    rounds_output: List[Dict[str, Any]] = []
    for info in round_infos:
        r = info["round"]
        cumulative_points = info["cumulative_points"]
        analyzed = _analyze_single_round(
            patterns=deduped_patterns,
            tree_index=tree_index,
            cumulative_points=cumulative_points,
            cluster_level=cluster_level,
        )
        analyzed["round"] = r
        rounds_output.append(analyzed)
    return {
        "account_name": account_name,
        "post_id": post_id,
        "log_id": log_id,
        "cluster_level": cluster_level,
        "rounds": rounds_output,
    }


def main(account_name, post_id, log_id) -> None:
    """Simple local test: analyze one derivation log and write the result to the output directory."""
    result = pattern_dimension_analyze(
        account_name=account_name,
        post_id=post_id,
        log_id=log_id,
        cluster_level=3,
    )
    # Print the first 4000 characters for a quick look
    # print(json.dumps(result, ensure_ascii=False, indent=2)[:4000] + "...")
    # Write the output file:
    # ../output/{account_name}/推导日志/{post_id}/{log_id}/{post_id}_pattern_dimension_analyze.json
    out_dir = _round_eval_dir(account_name, post_id, log_id)
    out_dir.mkdir(parents=True, exist_ok=True)
    output_file_name = f"{post_id}_pattern_dimension_analyze.json"
    out_path = out_dir / output_file_name
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\nAnalysis result written to: {out_path}")


if __name__ == "__main__":
    account_name = "家有大志"
    items = [
        {"post_id": "68fb6a5c000000000302e5de", "log_id": "20260317214307"},
        {"post_id": "69185d49000000000d00f94e", "log_id": "20260317214841"},
        {"post_id": "6921937a000000001b0278d1", "log_id": "20260317215616"},
    ]
    for item in items:
        post_id = item["post_id"]
        log_id = item["log_id"]
        main(account_name, post_id, log_id)