@@ -0,0 +1,358 @@
+"""
+Similarity utility: compute the M×N similarity matrix between two groups of phrases.
+Combined similarity: embedding 50% + LLM 50%.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import json
+import logging
+import os
+import re
+import time
+from typing import List, Tuple, TypedDict
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# Cache directory: ../.cache/similarity relative to this file; scores are stored
+# atomically per (phrase_a, phrase_b) pair
+_CACHE_DIR = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".cache", "similarity"))
+
+
+def _atomic_pair_key(phrase_a: str, phrase_b: str) -> str:
+    """Cache key for a single phrase pair (atomic granularity)."""
+    raw = json.dumps([phrase_a, phrase_b], ensure_ascii=False, sort_keys=False)
+    return hashlib.sha256(raw.encode("utf-8")).hexdigest()
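+# Note: the key is order-sensitive -- json.dumps serializes the list
+# [phrase_a, phrase_b] in order, so (a, b) and (b, a) map to distinct cache files.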
+
+
+def _ensure_cache_dir() -> None:
+    os.makedirs(_CACHE_DIR, exist_ok=True)
+
+
+def _read_atomic_score(cache_type: str, phrase_a: str, phrase_b: str) -> float | None:
+    """Read the cached score for a single phrase pair; return None if missing or unreadable."""
+    key = _atomic_pair_key(phrase_a, phrase_b)
+    path = os.path.join(_CACHE_DIR, f"{cache_type}_{key}.json")
+    if not os.path.isfile(path):
+        return None
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        # Verify phrase_a / phrase_b match, so a hash collision is never misused
+        if data.get("phrase_a") != phrase_a or data.get("phrase_b") != phrase_b:
+            return None
+        return float(data["score"])
+    except Exception as e:
+        logger.debug("[similarity_cache] failed to read %s: %s", path, e)
+        return None
+
+
+def _write_atomic_score(cache_type: str, phrase_a: str, phrase_b: str, score: float) -> None:
+    """Write the cached score for a single phrase pair (atomic record: phrase_a, phrase_b, score)."""
+    _ensure_cache_dir()
+    key = _atomic_pair_key(phrase_a, phrase_b)
+    path = os.path.join(_CACHE_DIR, f"{cache_type}_{key}.json")
+    try:
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump({"phrase_a": phrase_a, "phrase_b": phrase_b, "score": score}, f, ensure_ascii=False)
+    except Exception as e:
+        logger.warning("[similarity_cache] failed to write %s: %s", path, e)
+
+
+class SimilarityItem(TypedDict):
+    """A single similarity result."""
+    phrase_a: str
+    phrase_b: str
+    embedding_score: float
+    llm_score: float
+    combined_score: float
+
+# Batch prompt template for LLM scoring (kept in Chinese, matching the phrase data
+# it scores). Placeholders: {count} and {pairs_list}; literal braces inside the
+# JSON example are escaped as {{ }}
+DEFAULT_BATCH_PROMPT_TEMPLATE = """
+请从语意角度判断以下{count}对短语的相似度,每对从0-1打分,输出格式如下(必须是一个JSON数组):
+```json
+[
+    {{
+        "text_1": "",
+        "text_2": "",
+        "score": 0.0,
+        "reason": "简明扼要说明理由"
+    }},
+    {{
+        "text_1": "",
+        "text_2": "",
+        "score": 0.0,
+        "reason": "简明扼要说明理由"
+    }}
+]
+```
+
+短语对列表:
+{pairs_list}
+""".strip()
+
+# Embedding similarity API
+EMBEDDING_SIMILARITY_URL = "http://61.48.133.26:8187/cartesian_similarity"
+# LLM model
+LLM_MODEL = "openai/gpt-4.1-mini"
+
+
+def _phrase_pairs(phrases_a: List[str], phrases_b: List[str]) -> List[Tuple[str, str]]:
+    """Expand M×N into a list of phrase pairs, ordered (a0,b0),(a0,b1),...,(a0,b_{N-1}),(a1,b0),..."""
+    return [(a, b) for a in phrases_a for b in phrases_b]
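+# For example: _phrase_pairs(["a0", "a1"], ["b0"]) == [("a0", "b0"), ("a1", "b0")].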
+
+
+async def _embedding_similarity(phrases_a: List[str], phrases_b: List[str]) -> List[List[float]]:
+    """
+    Call the cartesian_similarity API and return an M×N matrix. The atomic cache
+    (phrase_a, phrase_b) -> score is consulted first; the API is only called for
+    pairs that miss.
+    """
+    if not phrases_a or not phrases_b:
+        return []
+
+    M, N = len(phrases_a), len(phrases_b)
+    matrix = [[0.0] * N for _ in range(M)]
+    missing_indices: List[Tuple[int, int]] = []
+    for i in range(M):
+        for j in range(N):
+            score = _read_atomic_score("embedding", phrases_a[i], phrases_b[j])
+            if score is not None:
+                matrix[i][j] = score
+            else:
+                missing_indices.append((i, j))
+
+    total = M * N
+    hit_count = total - len(missing_indices)
+    if hit_count > 0:
+        logger.info("[similarity_matrix] embedding atomic cache hit %d/%d", hit_count, total)
+    if not missing_indices:
+        return matrix
+
+    # Call the API only for the missing pairs: build the missing phrases_a / phrases_b
+    # (deduplicated, order preserved)
+    a_set: List[str] = list(dict.fromkeys(phrases_a[i] for i, _ in missing_indices))
+    b_set: List[str] = list(dict.fromkeys(phrases_b[j] for _, j in missing_indices))
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        resp = await client.post(
+            EMBEDDING_SIMILARITY_URL,
+            json={"texts1": a_set, "texts2": b_set},
+            headers={"Content-Type": "application/json"},
+        )
+        resp.raise_for_status()
+        data = resp.json()
+
+    # "results" is assumed to be the a_set × b_set cartesian product flattened
+    # row-major, i.e. the score for (a_set[i_m], b_set[j_m]) sits at i_m * len(b_set) + j_m
+    results = data.get("results", [])
+    len_b = len(b_set)
+    for i, j in missing_indices:
+        a, b = phrases_a[i], phrases_b[j]
+        i_m, j_m = a_set.index(a), b_set.index(b)
+        idx_flat = i_m * len_b + j_m
+        if idx_flat < len(results):
+            score = float(results[idx_flat]["score"])
+            matrix[i][j] = score
+            _write_atomic_score("embedding", a, b, score)
+    return matrix
+
+
+def _extract_json_array(content: str) -> List[dict]:
+    """Parse a JSON array from an LLM reply (optionally wrapped in ```json ... ```)."""
+    content = content.strip()
+    # Try to match the content inside ```json ... ``` fences
+    m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
+    if m:
+        content = m.group(1).strip()
+    return json.loads(content)
+
+
+async def _llm_similarity(phrases_a: List[str], phrases_b: List[str]) -> List[List[float]]:
+    """
+    Score phrase pairs with an LLM and return an M×N matrix. The atomic cache is
+    consulted first; the LLM is only called for pairs that miss.
+    """
+    if not phrases_a or not phrases_b:
+        return []
+
+    M, N = len(phrases_a), len(phrases_b)
+    matrix = [[0.0] * N for _ in range(M)]
+    missing_indices: List[Tuple[int, int]] = []
+    for i in range(M):
+        for j in range(N):
+            score = _read_atomic_score("llm", phrases_a[i], phrases_b[j])
+            if score is not None:
+                matrix[i][j] = score
+            else:
+                missing_indices.append((i, j))
+
+    total = M * N
+    hit_count = total - len(missing_indices)
+    if hit_count > 0:
+        logger.info("[similarity_matrix] LLM atomic cache hit %d/%d", hit_count, total)
+    if not missing_indices:
+        return matrix
+
+    # Call the LLM only for the missing pairs: pairs_list follows the miss order,
+    # and the LLM is expected to return items in that same order
+    missing_pairs = [(phrases_a[i], phrases_b[j]) for (i, j) in missing_indices]
+    pairs_list = "\n".join(
+        f'{idx + 1}. "{a}" 和 "{b}"'
+        for idx, (a, b) in enumerate(missing_pairs)
+    )
+    prompt = DEFAULT_BATCH_PROMPT_TEMPLATE.format(count=len(missing_pairs), pairs_list=pairs_list)
+
+    from agent.llm.openrouter import openrouter_llm_call
+
+    messages = [{"role": "user", "content": prompt}]
+    result = await openrouter_llm_call(messages, model=LLM_MODEL)
+    content = result.get("content", "")
+    if not content:
+        raise ValueError("LLM returned no content")
+
+    items = _extract_json_array(content)
+    for idx, (i, j) in enumerate(missing_indices):
+        if idx >= len(items):
+            break
+        score = float(items[idx].get("score", 0.0))
+        score = max(0.0, min(1.0, score))  # clamp to [0, 1]
+        matrix[i][j] = score
+        a, b = phrases_a[i], phrases_b[j]
+        _write_atomic_score("llm", a, b, score)
+    return matrix
+
+
+async def similarity_matrix(
+    phrases_a: List[str],
+    phrases_b: List[str],
+    *,
+    embedding_weight: float = 0.5,
+    llm_weight: float = 0.5,
+) -> List[SimilarityItem]:
+    """
+    Compute the similarity between two groups of phrases. Returns a list of objects,
+    each with phrase_a, phrase_b, embedding_score, llm_score and combined_score.
+
+    combined_score = embedding_weight * embedding_score + llm_weight * llm_score.
+    The default is 50% each.
+
+    Args:
+        phrases_a: first group of phrases (M items)
+        phrases_b: second group of phrases (N items)
+        embedding_weight: embedding weight, 0.5 by default
+        llm_weight: LLM weight, 0.5 by default
+
+    Returns:
+        A list of M×N items, in the same order as the phrase pairs
+        (a0,b0),(a0,b1),...,(a_{M-1},b_{N-1}).
+    """
+    if not phrases_a or not phrases_b:
+        return []
+
+    M, N = len(phrases_a), len(phrases_b)
+    total_pairs = M * N
+    logger.info("[similarity_matrix] start: phrases_a=%d, phrases_b=%d, pairs=%d", M, N, total_pairs)
+    t_total = time.perf_counter()
+
+    async def _run_embedding() -> List[List[float]]:
+        t0 = time.perf_counter()
+        out = await _embedding_similarity(phrases_a, phrases_b)
+        logger.info("[similarity_matrix] embedding took %.3fs", time.perf_counter() - t0)
+        return out
+
+    async def _run_llm() -> List[List[float]]:
+        t0 = time.perf_counter()
+        out = await _llm_similarity(phrases_a, phrases_b)
+        logger.info("[similarity_matrix] LLM took %.3fs", time.perf_counter() - t0)
+        return out
+
+    # Run both scorers concurrently
+    emb_matrix, llm_matrix = await asyncio.gather(_run_embedding(), _run_llm())
+    elapsed = time.perf_counter() - t_total
+    logger.info("[similarity_matrix] total: %.3fs", elapsed)
+
+    pairs = _phrase_pairs(phrases_a, phrases_b)
+    result: List[SimilarityItem] = []
+    for idx, (a, b) in enumerate(pairs):
+        i, j = idx // N, idx % N
+        emb_s = emb_matrix[i][j]
+        llm_s = llm_matrix[i][j]
+        combined_s = embedding_weight * emb_s + llm_weight * llm_s
+        result.append({
+            "phrase_a": a,
+            "phrase_b": b,
+            "embedding_score": emb_s,
+            "llm_score": llm_s,
+            "combined_score": combined_s,
+        })
+    return result
+
+
+def similarity_matrix_sync(
+    phrases_a: List[str],
+    phrases_b: List[str],
+    **kwargs,
+) -> List[SimilarityItem]:
+    """Synchronous wrapper: runs similarity_matrix via asyncio.run for use from sync code. Returns the same structure as similarity_matrix."""
+    return asyncio.run(similarity_matrix(phrases_a, phrases_b, **kwargs))
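+
+# Illustrative usage (a sketch, not executed here; assumes the embedding API is
+# reachable and OPEN_ROUTER_API_KEY is configured):
+#
+#     items = similarity_matrix_sync(["犬"], ["狗", "手机"])
+#     best = max(items, key=lambda r: r["combined_score"])
+#     print(best["phrase_a"], best["phrase_b"], best["combined_score"])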
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+def test_phrase_pairs() -> None:
+    """Test the order of the M×N expansion into phrase pairs."""
+    a = ["犬", "猫咪"]
+    b = ["狗", "手机"]
+    pairs = _phrase_pairs(a, b)
+    assert len(pairs) == 4
+    assert pairs[0] == ("犬", "狗")
+    assert pairs[1] == ("犬", "手机")
+    assert pairs[2] == ("猫咪", "狗")
+    assert pairs[3] == ("猫咪", "手机")
+    print("test_phrase_pairs: ok")
+
+
+def test_extract_json_array() -> None:
+    """Test parsing a JSON array out of an LLM reply."""
+    # Wrapped in ```json fences
+    content = '''Some explanation
+```json
+[
+    {"text_1": "犬", "text_2": "狗", "score": 0.85, "reason": "synonym"}
+]
+```
+'''
+    arr = _extract_json_array(content)
+    assert len(arr) == 1
+    assert arr[0]["score"] == 0.85
+    # A bare JSON array
+    arr2 = _extract_json_array('[{"score": 0.5}]')
+    assert len(arr2) == 1 and arr2[0]["score"] == 0.5
+    print("test_extract_json_array: ok")
+
+
+async def test_similarity_matrix() -> None:
+    """Integration test: call embedding + LLM and check the similarity item list."""
+    phrases_a = ["犬", "猫咪"]
+    phrases_b = ["狗", "手机"]
+    items = await similarity_matrix(phrases_a, phrases_b)
+    assert len(items) == 4
+    for row in items:
+        assert "phrase_a" in row and "phrase_b" in row
+        assert "embedding_score" in row and "llm_score" in row and "combined_score" in row
+        assert 0 <= row["combined_score"] <= 1, f"combined_score should be in [0,1], got {row['combined_score']}"
+    # Semantically, "犬"-"狗" should score higher than "犬"-"手机"
+    dog_dog = next(r for r in items if r["phrase_a"] == "犬" and r["phrase_b"] == "狗")
+    dog_phone = next(r for r in items if r["phrase_a"] == "犬" and r["phrase_b"] == "手机")
+    assert dog_dog["combined_score"] > dog_phone["combined_score"], "犬-狗 should rank above 犬-手机"
+    print("test_similarity_matrix: ok")
+    for r in items:
+        print(f"  {r['phrase_a']}-{r['phrase_b']}: emb={r['embedding_score']:.4f} llm={r['llm_score']:.4f} combined={r['combined_score']:.4f}")
+
+
+if __name__ == "__main__":
+    test_phrase_pairs()
+    test_extract_json_array()
+    print("Running the integration test (requires the embedding API, OPEN_ROUTER_API_KEY and the agent dependencies)...")
+    try:
+        asyncio.run(test_similarity_matrix())
+        print("All tests passed.")
+    except Exception as e:
+        print(f"Skipping the integration test: {e}")
+        print("Unit tests passed. For the integration test, make sure: 1) the embedding service is reachable 2) OPEN_ROUTER_API_KEY is set 3) run from the project root: python -m examples_how.overall_derivation.utils.similarity_calc")