1 mēnesi atpakaļ · f274c437fc
--- a/examples_how/overall_derivation/tools/find_tree_node.py
+++ b/examples_how/overall_derivation/tools/find_tree_node.py
@@ -15,7 +15,10 @@ from typing import Any, Optional
 
				 _root = Path(__file__).resolve().parent.parent
			
 
				 if str(_root) not in sys.path:
			
 
				     sys.path.insert(0, str(_root))
			
 
				-from utils.conditional_ratio_calc import calc_node_conditional_ratio  # noqa: E402
			
 
				+from utils.conditional_ratio_calc import (  # noqa: E402
			
 
				+    build_node_post_index,
			
 
				+    calc_node_conditional_ratio,
			
 
				+)
			
 
				 from tools.point_match import match_derivation_to_post_points  # noqa: E402
			
 
				 
			
 
				 try:
			
@@ -216,11 +219,17 @@ def get_nodes_by_conditional_ratio(
 
				             if ratio >= threshold:
			
 
				                 scored.append((node_name, ratio, parent_name, dim_for(node_name)))
			
 
				     else:
			
 
				+        node_post_index = build_node_post_index(account_name, base_dir)
			
 
				         for node_name, parent_name in node_to_parent.items():
			
 
				             if allowed_node_names is not None and node_name not in allowed_node_names:
			
 
				                 continue
			
 
				             ratio = calc_node_conditional_ratio(
			
 
				-                account_name, derived_list, node_name, base_dir=base_dir
			
 
				+                account_name,
			
 
				+                derived_list,
			
 
				+                node_name,
			
 
				+                base_dir=base_dir,
			
 
				+                node_post_index=node_post_index,
			
 
				+                target_ratio=threshold,
			
 
				             )
			
 
				             if ratio >= threshold:
			
 
				                 scored.append((node_name, ratio, parent_name, dim_for(node_name)))
			
--- a/examples_how/overall_derivation/utils/conditional_ratio_calc.py
+++ b/examples_how/overall_derivation/utils/conditional_ratio_calc.py
@@ -11,6 +11,9 @@ import json
 
				 from pathlib import Path
			
 
				 from typing import Any
			
 
				 
			
 
				+# 节点名 -> (该节点 post_ids, 父节点 post_ids)，用 frozenset 便于批量计算时复用、避免重复转换
			
 
				+NodePostIndex = dict[str, tuple[frozenset[str], frozenset[str]]]
			
 
				+
			
 
				 # 已推导列表：每项为 (已推导的选题点, 推导来源人设树节点)，如 ("分享","分享")、("柴犬","动物角色")
			
 
				 # 推导来源人设树节点的 post_ids 在计算条件概率时从人设树中读取
			
 
				 DerivedItem = tuple[str, str]
			
@@ -86,11 +89,36 @@ def _derived_post_ids_from_sources(
 
				     return common if common is not None else set()
			
 
				 
			
 
				 
			
 
				+def _derived_post_ids_from_frozen_index(
			
 
				+    derived_list: list[DerivedItem],
			
 
				+    index: NodePostIndex,
			
 
				+) -> frozenset[str]:
			
 
				+    """与 _derived_post_ids_from_sources 相同语义，索引为 frozenset 版（批量场景复用）。"""
			
 
				+    common: frozenset[str] | None = None
			
 
				+    for _topic_point, source_node in derived_list:
			
 
				+        if source_node not in index:
			
 
				+            continue
			
 
				+        pids = index[source_node][0]
			
 
				+        common = pids if common is None else common & pids
			
 
				+    return common if common is not None else frozenset()
			
 
				+
			
 
				+
			
 
				+def build_node_post_index(account_name: str, base_dir: Path | None = None) -> NodePostIndex:
			
 
				+    """
			
 
				+    构建账号人设树的节点索引（每个节点只建一次，供批量 calc_node_conditional_ratio 复用）。
			
 
				+    值为 (节点 post_ids, 父节点 post_ids) 的 frozenset，减少重复 list->set 与拷贝。
			
 
				+    """
			
 
				+    raw = _build_node_index(account_name, base_dir)
			
 
				+    return {k: (frozenset(a), frozenset(b)) for k, (a, b) in raw.items()}
			
 
				+
			
 
				+
			
 
				 def calc_node_conditional_ratio(
			
 
				     account_name: str,
			
 
				     derived_list: list[DerivedItem],
			
 
				     tree_node_name: str,
			
 
				     base_dir: Path | None = None,
			
 
				+    node_post_index: NodePostIndex | None = None,
			
 
				+    target_ratio: float | None = None,
			
 
				 ) -> float:
			
 
				     """
			
 
				     计算人设树节点 N 在父节点 P 下的条件概率。
			
@@ -100,6 +128,8 @@ def calc_node_conditional_ratio(
 
				         derived_list: 已推导列表，每项 (已推导的选题点, 推导来源人设树节点)
			
 
				         tree_node_name: 人设树节点 N 的名称（字符串匹配）
			
 
				         base_dir: 可选，input 根目录；不传则使用相对本文件的 ../input
			
 
				+        node_post_index: 可选，由 build_node_post_index 预构建；批量对多节点计算时传入可避免重复读盘与遍历整棵树
			
 
				+        target_ratio: 可选，目标条件概率。若某个组合的条件概率已达到该值，则直接返回（用于缩小组合搜索）
			
 
				 
			
 
				     计算规则:
			
 
				         已推导的帖子集合：从 derived_list 中先取「最多选题点」的交集，再逐步减少到 1 个选题点，
			
@@ -108,24 +138,47 @@ def calc_node_conditional_ratio(
 
				         分子 = |已推导的帖子集合 ∩ N 的 post_ids|，分母 = |已推导的帖子集合 ∩ P 的 post_ids|；
			
 
				         条件概率 = 分子/分母，且 ≤1；分母为 0 时该情况跳过。
			
 
				     """
			
 
				-    index = _build_node_index(account_name, base_dir)
			
 
				+    index = node_post_index if node_post_index is not None else build_node_post_index(account_name, base_dir)
			
 
				     if tree_node_name not in index:
			
 
				         return 0.0
			
 
				-    n_pids, p_pids = index[tree_node_name]
			
 
				-    set_n = set(n_pids)
			
 
				-    set_p = set(p_pids)
			
 
				+    set_n, set_p = index[tree_node_name]
			
 
				+
			
 
				+    # 关键优化（不改变搜索空间/结果）：
			
 
				+    # - derived_list 里重复的 source_node 对“交集”没有任何影响，但会把 L 变大导致 2^L 爆炸
			
 
				+    # - 不在 index 里的 source_node 原本也会被跳过，提前过滤可减少组合规模
			
 
				+    # - 组合内交集直接对 frozenset 逐步 &，避免 list(combo)/函数调用开销
			
 
				+    seen_sources: set[str] = set()
			
 
				+    source_sets: list[frozenset[str]] = []
			
 
				+    for _topic, source_node in derived_list:
			
 
				+        if source_node in seen_sources:
			
 
				+            continue
			
 
				+        seen_sources.add(source_node)
			
 
				+        tup = index.get(source_node)
			
 
				+        if tup is None:
			
 
				+            continue
			
 
				+        source_sets.append(tup[0])
			
 
				+
			
 
				+    if not source_sets:
			
 
				+        return 0.0
			
 
				+
			
 
				+    # 将更小的集合放前面：交集会更快“变小”，每次 & 的成本更低（仍然枚举全部子集）
			
 
				+    source_sets.sort(key=len)
			
 
				 
			
 
				     max_ratio = 0.0
			
 
				-    # 从「最多选题点」到 1 个选题点：对每种子集大小，取所有组合，分别算条件概率后取最大
			
 
				-    for k in range(len(derived_list), 0, -1):
			
 
				-        for combo in itertools.combinations(derived_list, k):
			
 
				-            derived_post_ids = _derived_post_ids_from_sources(list(combo), index)
			
 
				+    # 从 1 个选题点到「最多选题点」：对每种子集大小，取所有组合，分别算条件概率后取最大
			
 
				+    for k in range(1, len(source_sets) + 1):
			
 
				+        for combo_sets in itertools.combinations(source_sets, k):
			
 
				+            derived_post_ids = combo_sets[0]
			
 
				+            for s in combo_sets[1:]:
			
 
				+                derived_post_ids = derived_post_ids & s
			
 
				             den = len(derived_post_ids & set_p)
			
 
				             if den == 0:
			
 
				                 continue
			
 
				             num = len(derived_post_ids & set_n)
			
 
				             ratio = min(1.0, num / den)
			
 
				             max_ratio = max(max_ratio, ratio)
			
 
				+            if target_ratio is not None and max_ratio >= target_ratio:
			
 
				+                return round(max_ratio, 4)
			
 
				     return round(max_ratio, 4)