Просмотр исходного кода

how agent 一些封装的通用方法、tools

liuzhiheng 6 дней назад
Родитель
Commit
93788b146e

+ 256 - 0
examples_how/overall_derivation/tools/find_pattern.py

@@ -0,0 +1,256 @@
+"""
+查找 Pattern Tool - 从 pattern 库中获取符合条件概率阈值的 pattern
+
+功能:读取账号的 pattern 库,合并去重后按条件概率筛选,返回 topN 条 pattern(含 pattern 名称、条件概率)。
+"""
+
+import importlib.util
+import json
+from pathlib import Path
+from typing import Any, Optional
+
+try:
+    from agent.tools import tool, ToolResult, ToolContext
+except ImportError:
+    def tool(*args, **kwargs):
+        return lambda f: f
+    ToolResult = None  # 仅用 main() 测核心逻辑时可无 agent
+    ToolContext = None
+
+# 与 pattern_data_process 一致的 key 定义
# Key layout of the pattern library; must stay in sync with pattern_data_process.
TOP_KEYS = [
    "depth_max_with_name",
    "depth_mixed",
    "depth_max_concrete",
    "depth2_medium",
    "depth1_abstract",
]
SUB_KEYS = ["two_x", "one_x", "zero_x"]

# Load conditional_ratio_calc by file path (same approach as find_tree_node),
# so this tool works without a package structure.
_utils_dir = Path(__file__).resolve().parent.parent / "utils"
_cond_spec = importlib.util.spec_from_file_location(
    "conditional_ratio_calc", _utils_dir / "conditional_ratio_calc.py"
)
_cond_mod = importlib.util.module_from_spec(_cond_spec)
_cond_spec.loader.exec_module(_cond_mod)
calc_pattern_conditional_ratio = _cond_mod.calc_pattern_conditional_ratio

# Root of per-account input data: ../input relative to this file.
_BASE_INPUT = Path(__file__).resolve().parent.parent / "input"
+
+
def _pattern_file(account_name: str) -> Path:
    """Pattern-library file: ../input/{account}/原始数据/pattern/processed_edge_data.json."""
    return _BASE_INPUT.joinpath(
        account_name, "原始数据", "pattern", "processed_edge_data.json"
    )
+
+
+def _slim_pattern(p: dict) -> tuple[float, int, list[str], int]:
+    """提取 name 列表(去重保序)、support、length、post_count。"""
+    names = [item["name"] for item in (p.get("items") or [])]
+    seen = set()
+    unique = []
+    for n in names:
+        if n not in seen:
+            seen.add(n)
+            unique.append(n)
+    support = round(float(p.get("support", 0)), 4)
+    length = int(p.get("length", 0))
+    post_count = int(p.get("post_count", 0))
+    return support, length, unique, post_count
+
+
def _merge_and_dedupe(patterns: list[dict], min_support: float = 0.1) -> list[dict]:
    """Dedupe patterns by their (order-insensitive) item-name set.

    For duplicate name sets the entry with the highest support wins. Entries
    whose support is below ``min_support`` (default 0.1, the previously
    hard-coded cutoff) are dropped. Output items use the compact keys
    s (support), l (length), i ("nameA+nameB+..."), post_count — the shape
    the conditional-probability calculator expects — sorted by s*l descending.
    """
    key_to_best: dict[tuple, tuple[float, int, int]] = {}
    for p in patterns:
        support, length, unique, post_count = _slim_pattern(p)
        if not unique:
            continue
        key = tuple(sorted(unique))
        if key not in key_to_best or support > key_to_best[key][0]:
            key_to_best[key] = (support, length, post_count)
    out = [
        {"s": s, "l": l, "i": "+".join(k), "post_count": pc}
        for k, (s, l, pc) in key_to_best.items()
        if s >= min_support
    ]
    out.sort(key=lambda x: x["s"] * x["l"], reverse=True)
    return out
+
+
def _load_and_merge_patterns(account_name: str) -> list[dict]:
    """Read the account's pattern-library JSON and return the merged, deduped list.

    Patterns are collected from every TOP_KEYS block's SUB_KEYS lists;
    a missing file yields an empty list.
    """
    path = _pattern_file(account_name)
    if not path.is_file():
        return []
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    collected: list[dict] = []
    for top in TOP_KEYS:
        block = data.get(top)
        if block is None:
            continue
        for sub in SUB_KEYS:
            collected.extend(block.get(sub) or [])
    return _merge_and_dedupe(collected)
+
+
+def _parse_derived_list(derived_items: list[dict[str, str]]) -> list[tuple[str, str]]:
+    """将 agent 传入的 [{"topic": "x", "source_node": "y"}, ...] 转为 DerivedItem 列表。"""
+    out = []
+    for item in derived_items:
+        if isinstance(item, dict):
+            topic = item.get("topic") or item.get("已推导的选题点")
+            source = item.get("source_node") or item.get("推导来源人设树节点")
+            if topic is not None and source is not None:
+                out.append((str(topic).strip(), str(source).strip()))
+        elif isinstance(item, (list, tuple)) and len(item) >= 2:
+            out.append((str(item[0]).strip(), str(item[1]).strip()))
+    return out
+
+
def get_patterns_by_conditional_ratio(
    account_name: str,
    derived_list: list[tuple[str, str]],
    conditional_ratio_threshold: float,
    top_n: int,
) -> list[dict[str, Any]]:
    """Return up to top_n patterns whose conditional probability meets the threshold.

    Patterns are loaded/merged from the account's library, scored with
    calc_pattern_conditional_ratio, filtered by the threshold and sorted by
    probability descending (ties broken by pattern length descending).
    Each result carries the pattern name ("nameA+nameB+...") and probability.
    """
    merged = _load_and_merge_patterns(account_name)
    if not merged:
        return []
    # NOTE(review): calc_pattern_conditional_ratio re-reads the tree files on
    # every call, so this loop is I/O-heavy for large libraries — a caching
    # candidate inside conditional_ratio_calc.
    scored: list[tuple[float, dict]] = []
    for pattern in merged:
        # The calculator requires "i" and "post_count" keys on the pattern.
        ratio = calc_pattern_conditional_ratio(
            account_name, derived_list, pattern, base_dir=_BASE_INPUT
        )
        if ratio >= conditional_ratio_threshold:
            scored.append((ratio, pattern))
    scored.sort(key=lambda rp: (-rp[0], -rp[1]["l"]))
    return [
        {"pattern名称": pattern["i"], "条件概率": round(ratio, 6)}
        for ratio, pattern in scored[:top_n]
    ]
+
+
@tool(
    description="从 pattern 库中获取符合条件概率阈值的 pattern。"
    "输入:账号名、已推导选题点列表(DerivedItem)、条件概率阈值、topN。"
    "返回:pattern 名称(nameA+nameB+nameC)及条件概率,按条件概率从高到低最多 topN 条。"
)
async def find_pattern(
    account_name: str,
    derived_items: list[dict[str, str]],
    conditional_ratio_threshold: float,
    top_n: int = 20,
    context: Optional[ToolContext] = None,
) -> ToolResult:
    """Agent tool: fetch patterns whose conditional probability meets the threshold.

    derived_items entries look like {"topic": "...", "source_node": "..."}.
    Pipeline: load library -> merge & dedupe -> score conditional probability
    -> keep >= threshold -> sort descending (ties by length descending) ->
    return at most top_n rows of pattern name + probability.
    """
    pattern_path = _pattern_file(account_name)
    if not pattern_path.is_file():
        return ToolResult(
            title="Pattern 库不存在",
            output=f"pattern 文件不存在: {pattern_path}",
            error="Pattern file not found",
        )
    try:
        derived_list = _parse_derived_list(derived_items)
        if not derived_list:
            return ToolResult(
                title="参数无效",
                output="derived_items 不能为空,且每项需包含 topic 与 source_node(或 已推导的选题点 与 推导来源人设树节点)",
                error="Invalid derived_items",
            )
        items = get_patterns_by_conditional_ratio(
            account_name, derived_list, conditional_ratio_threshold, top_n
        )
        if items:
            output = "\n".join(
                f"- {x['pattern名称']}\t条件概率={x['条件概率']}" for x in items
            )
        else:
            output = f"未找到条件概率 >= {conditional_ratio_threshold} 的 pattern"
        return ToolResult(
            title=f"符合条件概率的 Pattern ({account_name}, 阈值={conditional_ratio_threshold})",
            output=output,
            metadata={
                "account_name": account_name,
                "conditional_ratio_threshold": conditional_ratio_threshold,
                "top_n": top_n,
                "count": len(items),
                "items": items,
            },
        )
    except Exception as e:  # surface any failure through the tool result
        return ToolResult(title="查找 Pattern 失败", output=str(e), error=str(e))
+
+
def main() -> None:
    """Local smoke test with the 家有大志 account (core function, then the tool)."""
    import asyncio

    account_name = "家有大志"
    # Each derived item: topic point + the persona-tree node it was derived from.
    derived_items = [
        {"topic": "分享", "source_node": "分享"},
        {"topic": "柴犬", "source_node": "动物角色"},
    ]
    conditional_ratio_threshold = 0.01
    top_n = 10

    # 1) Call the core function directly.
    items = get_patterns_by_conditional_ratio(
        account_name,
        _parse_derived_list(derived_items),
        conditional_ratio_threshold,
        top_n,
    )
    print(f"账号: {account_name}, 阈值: {conditional_ratio_threshold}, top_n: {top_n}")
    print(f"共 {len(items)} 条 pattern:\n")
    for x in items:
        print(f"  - {x['pattern名称']}\t条件概率={x['条件概率']}")

    # 2) When the agent package is importable, exercise the tool wrapper too.
    if ToolResult is not None:
        async def run_tool():
            result = await find_pattern(
                account_name=account_name,
                derived_items=derived_items,
                conditional_ratio_threshold=conditional_ratio_threshold,
                top_n=top_n,
            )
            print("\n--- Tool 返回 ---")
            print(result.output)

        asyncio.run(run_tool())


if __name__ == "__main__":
    main()

+ 303 - 0
examples_how/overall_derivation/tools/find_tree_node.py

@@ -0,0 +1,303 @@
+"""
+查找树节点 Tool - 人设树节点查询
+
+功能:
+1. 获取人设树的常量节点(全局常量、局部常量)
+2. 获取符合条件概率阈值的节点(按条件概率排序返回 topN)
+"""
+
+import importlib.util
+import json
+from pathlib import Path
+from typing import Any, Optional
+
+try:
+    from agent.tools import tool, ToolResult, ToolContext
+except ImportError:
+    def tool(*args, **kwargs):
+        return lambda f: f
+    ToolResult = None  # 仅用 main() 测核心逻辑时可无 agent
+    ToolContext = None
+
# Load conditional_ratio_calc directly by file path so no package layout is needed.
_utils_dir = Path(__file__).resolve().parent.parent / "utils"
_cond_spec = importlib.util.spec_from_file_location(
    "conditional_ratio_calc", _utils_dir / "conditional_ratio_calc.py"
)
_cond_mod = importlib.util.module_from_spec(_cond_spec)
_cond_spec.loader.exec_module(_cond_mod)
calc_node_conditional_ratio = _cond_mod.calc_node_conditional_ratio

# Per-account input root: tools -> overall_derivation, input lives underneath it.
_BASE_INPUT = Path(__file__).resolve().parent.parent / "input"
+
+
def _tree_dir(account_name: str) -> Path:
    """Persona-tree directory: ../input/{account_name}/原始数据/tree/."""
    return _BASE_INPUT.joinpath(account_name, "原始数据", "tree")
+
+
def _load_trees(account_name: str) -> list[tuple[str, dict]]:
    """Load every dimension tree for the account as (dimension, root) pairs.

    Each JSON file is expected to hold one top-level mapping such as
    {"实质": {...}}; only the first entry per file is used. Unreadable or
    malformed files are skipped.
    """
    tree_path = _tree_dir(account_name)
    if not tree_path.is_dir():
        return []
    trees: list[tuple[str, dict]] = []
    for file in tree_path.glob("*.json"):
        try:
            data = json.loads(file.read_text(encoding="utf-8"))
            for dim_name, root in data.items():
                if isinstance(root, dict):
                    trees.append((dim_name, root))
                break  # first dimension only, by design
        except Exception:
            continue
    return trees
+
+
def _iter_all_nodes(account_name: str):
    """Yield (node_name, parent_name, node_dict) for every persona-tree node."""

    def walk(parent_name: str, node_dict: dict):
        # Depth-first over the "children" mapping; non-dict children are skipped.
        for name, child in (node_dict.get("children") or {}).items():
            if isinstance(child, dict):
                yield (name, parent_name, child)
                yield from walk(name, child)

    for dim_name, root in _load_trees(account_name):
        yield from walk(dim_name, root)
+
+
+# ---------------------------------------------------------------------------
+# 1. 获取人设树常量节点
+# ---------------------------------------------------------------------------
+
def get_constant_nodes(account_name: str) -> list[dict[str, Any]]:
    """Collect the persona tree's constant nodes.

    A node is a 全局常量 when _is_constant is True, otherwise a 局部常量 when
    _is_local_constant is True. Returns rows of 节点名称 / 概率 (_ratio) /
    常量类型, sorted by probability descending with None-probability rows last.
    """
    result: list[dict[str, Any]] = []
    for node_name, _parent, node in _iter_all_nodes(account_name):
        if node.get("_is_constant") is True:
            const_type = "全局常量"
        elif node.get("_is_local_constant") is True:
            # _is_constant is already known non-True on this branch, so the
            # original's extra "and not is_const" check was redundant.
            const_type = "局部常量"
        else:
            continue
        result.append({
            "节点名称": node_name,
            "概率": node.get("_ratio"),
            "常量类型": const_type,
        })
    # None probabilities sort last; otherwise highest probability first.
    result.sort(key=lambda x: (x["概率"] is None, -(x["概率"] or 0)))
    return result
+
+
+# ---------------------------------------------------------------------------
+# 2. 获取符合条件概率阈值的节点
+# ---------------------------------------------------------------------------
+
def get_nodes_by_conditional_ratio(
    account_name: str,
    derived_list: list[tuple[str, str]],
    threshold: float,
    top_n: int,
) -> list[dict[str, Any]]:
    """Return up to top_n tree nodes whose conditional probability >= threshold.

    derived_list items are (derived topic point, source tree node). Output
    rows carry 节点名称 / 条件概率 / 父节点名称, probability descending.
    """
    # Map node -> parent; a name appearing on several branches keeps the last
    # occurrence seen (plain dict assignment overwrites).
    node_to_parent = {
        name: parent for name, parent, _node in _iter_all_nodes(account_name)
    }

    # NOTE(review): calc_node_conditional_ratio rebuilds its tree index on
    # every call, so this loop re-reads the tree files once per node — fine
    # for small trees, a caching candidate otherwise.
    scored = [
        (name, ratio, parent)
        for name, parent in node_to_parent.items()
        if (ratio := calc_node_conditional_ratio(
            account_name, derived_list, name, base_dir=_BASE_INPUT
        )) >= threshold
    ]
    scored.sort(key=lambda row: row[1], reverse=True)
    return [
        {"节点名称": name, "条件概率": ratio, "父节点名称": parent}
        for name, ratio, parent in scored[:top_n]
    ]
+
+
+def _parse_derived_list(derived_items: list[dict[str, str]]) -> list[tuple[str, str]]:
+    """将 agent 传入的 [{"topic": "x", "source_node": "y"}, ...] 转为 DerivedItem 列表。"""
+    out = []
+    for item in derived_items:
+        if isinstance(item, dict):
+            topic = item.get("topic") or item.get("已推导的选题点")
+            source = item.get("source_node") or item.get("推导来源人设树节点")
+            if topic is not None and source is not None:
+                out.append((str(topic).strip(), str(source).strip()))
+        elif isinstance(item, (list, tuple)) and len(item) >= 2:
+            out.append((str(item[0]).strip(), str(item[1]).strip()))
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Agent Tools(参考 glob_tool 封装)
+# ---------------------------------------------------------------------------
+
@tool(description="获取人设树的常量节点(全局常量、局部常量)。输入账号名,返回节点名称、概率、常量类型。")
async def find_tree_constant_nodes(
    account_name: str,
    context: Optional[ToolContext] = None,
) -> ToolResult:
    """Agent tool: list the persona tree's constant nodes.

    Reads input/{account_name}/原始数据/tree/, keeps nodes with
    _is_constant=true (global constant) or _is_local_constant=true with
    _is_constant=false (local constant), and reports
    节点名称 / 概率 (_ratio) / 常量类型.
    """
    tree_dir = _tree_dir(account_name)
    if not tree_dir.is_dir():
        return ToolResult(
            title="人设树目录不存在",
            output=f"目录不存在: {tree_dir}",
            error="Directory not found",
        )
    try:
        items = get_constant_nodes(account_name)
        if items:
            output = "\n".join(
                f"- {x['节点名称']}\t概率={x['概率']}\t{x['常量类型']}" for x in items
            )
        else:
            output = "未找到常量节点"
        return ToolResult(
            title=f"常量节点 ({account_name})",
            output=output,
            metadata={"account_name": account_name, "count": len(items), "items": items},
        )
    except Exception as e:  # report failures through the tool result
        return ToolResult(title="获取常量节点失败", output=str(e), error=str(e))
+
+
@tool(
    description="获取人设树中条件概率不低于阈值的节点,按条件概率从高到低返回 topN。"
    "输入:账号名、已推导选题点列表、条件概率阈值、topN。"
)
async def find_tree_nodes_by_conditional_ratio(
    account_name: str,
    derived_items: list[dict[str, str]],
    conditional_ratio_threshold: float,
    top_n: int = 20,
    context: Optional[ToolContext] = None,
) -> ToolResult:
    """Agent tool: tree nodes whose conditional probability meets the threshold.

    derived_items entries look like {"topic": "...", "source_node": "..."}.
    Returns 节点名称 / 条件概率 / 父节点名称 rows, probability descending,
    at most top_n of them.
    """
    tree_dir = _tree_dir(account_name)
    if not tree_dir.is_dir():
        return ToolResult(
            title="人设树目录不存在",
            output=f"目录不存在: {tree_dir}",
            error="Directory not found",
        )
    try:
        derived_list = _parse_derived_list(derived_items)
        if not derived_list:
            return ToolResult(
                title="参数无效",
                output="derived_items 不能为空,且每项需包含 topic 与 source_node(或 已推导的选题点 与 推导来源人设树节点)",
                error="Invalid derived_items",
            )
        items = get_nodes_by_conditional_ratio(
            account_name, derived_list, conditional_ratio_threshold, top_n
        )
        if items:
            output = "\n".join(
                f"- {x['节点名称']}\t条件概率={x['条件概率']}\t父节点={x['父节点名称']}"
                for x in items
            )
        else:
            output = f"未找到条件概率 >= {conditional_ratio_threshold} 的节点"
        return ToolResult(
            title=f"条件概率节点 ({account_name}, 阈值={conditional_ratio_threshold})",
            output=output,
            metadata={
                "account_name": account_name,
                "threshold": conditional_ratio_threshold,
                "top_n": top_n,
                "count": len(items),
                "items": items,
            },
        )
    except Exception as e:  # surface failures through the tool result
        return ToolResult(title="按条件概率查询节点失败", output=str(e), error=str(e))
+
+
def main() -> None:
    """Local smoke test: constant nodes + conditional-ratio nodes, then the tools."""
    import asyncio

    account_name = "家有大志"
    derived_items = [
        {"topic": "分享", "source_node": "分享"},
    ]
    conditional_ratio_threshold = 0.1
    top_n = 10

    # 1) Constant nodes.
    constant_nodes = get_constant_nodes(account_name)
    print(f"账号: {account_name} — 常量节点共 {len(constant_nodes)} 个(前 50 个):")
    for x in constant_nodes[:50]:
        print(f"  - {x['节点名称']}\t概率={x['概率']}\t{x['常量类型']}")
    print()

    # 2) Conditional-ratio nodes via the core function.
    ratio_nodes = get_nodes_by_conditional_ratio(
        account_name,
        _parse_derived_list(derived_items),
        conditional_ratio_threshold,
        top_n,
    )
    print(f"条件概率节点 阈值={conditional_ratio_threshold}, top_n={top_n}, 共 {len(ratio_nodes)} 个:")
    for x in ratio_nodes:
        print(f"  - {x['节点名称']}\t条件概率={x['条件概率']}\t父节点={x['父节点名称']}")
    print()

    # 3) Exercise the tool wrappers when the agent package is importable.
    if ToolResult is not None:
        async def run_tools():
            r1 = await find_tree_constant_nodes(account_name)
            print("--- find_tree_constant_nodes ---")
            print(r1.output[:200] + "..." if len(r1.output) > 200 else r1.output)
            r2 = await find_tree_nodes_by_conditional_ratio(
                account_name,
                derived_items=derived_items,
                conditional_ratio_threshold=conditional_ratio_threshold,
                top_n=top_n,
            )
            print("\n--- find_tree_nodes_by_conditional_ratio ---")
            print(r2.output)

        asyncio.run(run_tools())


if __name__ == "__main__":
    main()

+ 193 - 0
examples_how/overall_derivation/tools/point_match.py

@@ -0,0 +1,193 @@
+"""
+选题点匹配 Tool - 判断推导选题点是否与帖子中的选题点匹配
+
+功能:读取帖子选题点列表,与推导选题点做相似度计算,返回 combined_score >= 阈值的匹配对。
+"""
+
+import importlib.util
+import json
+from pathlib import Path
+from typing import Any, List, Optional
+
+try:
+    from agent.tools import tool, ToolResult, ToolContext
+except ImportError:
+    def tool(*args, **kwargs):
+        return lambda f: f
+    ToolResult = None
+    ToolContext = None
+
# Load similarity_calc directly by file path (no package layout required).
_utils_dir = Path(__file__).resolve().parent.parent / "utils"
_sim_spec = importlib.util.spec_from_file_location(
    "similarity_calc", _utils_dir / "similarity_calc.py"
)
_sim_mod = importlib.util.module_from_spec(_sim_spec)
_sim_spec.loader.exec_module(_sim_mod)
similarity_matrix = _sim_mod.similarity_matrix

# Per-account input root.
_BASE_INPUT = Path(__file__).resolve().parent.parent / "input"

# Combined-score cutoff used when the caller does not supply one.
DEFAULT_MATCH_THRESHOLD = 0.8
+
+
def _post_topic_file(account_name: str, post_id: str) -> Path:
    """Post topic-point file: ../input/{account_name}/post_topic/{post_id}.json."""
    return _BASE_INPUT.joinpath(account_name, "post_topic", f"{post_id}.json")
+
+
def _load_post_topic_points(account_name: str, post_id: str) -> List[str]:
    """Read the post's topic points (a JSON string array); [] when absent/invalid."""
    path = _post_topic_file(account_name, post_id)
    if not path.is_file():
        return []
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        return []
    points: List[str] = []
    for raw in data:
        if not raw:
            continue
        text = str(raw).strip()
        if text:
            points.append(text)
    return points
+
+
+def _normalize_derivation_points(derivation_output_points: List[Any]) -> List[str]:
+    """将 agent 传入的推导选题点转为字符串列表(支持字符串或含 topic/选题点 的 dict)。"""
+    out: List[str] = []
+    for item in derivation_output_points:
+        if isinstance(item, str) and item.strip():
+            out.append(item.strip())
+        elif isinstance(item, dict):
+            topic = item.get("topic") or item.get("选题点") or item.get("已推导的选题点")
+            if topic is not None and str(topic).strip():
+                out.append(str(topic).strip())
+    return out
+
+
async def match_derivation_to_post_points(
    derivation_output_points: List[Any],
    account_name: str,
    post_id: str,
    match_threshold: float = DEFAULT_MATCH_THRESHOLD,
) -> List[dict[str, Any]]:
    """Match derived topic points against the post's topic points.

    Runs similarity_matrix over the two lists and keeps pairs with
    combined_score >= match_threshold.

    Returns:
        Rows of {"推导选题点": str, "帖子选题点": str, "匹配分数": float}.
    """
    post_points = _load_post_topic_points(account_name, post_id)
    derivation_points = _normalize_derivation_points(derivation_output_points)
    if not derivation_points or not post_points:
        return []

    rows = await similarity_matrix(derivation_points, post_points)
    return [
        {
            "推导选题点": row["phrase_a"],
            "帖子选题点": row["phrase_b"],
            "匹配分数": round(row["combined_score"], 6),
        }
        for row in rows
        if row["combined_score"] >= match_threshold
    ]
+
+
@tool(
    description="判断推导选题点是否与帖子中的选题点匹配。"
    "输入:推导选题点列表、账号名、帖子ID。"
    "从 input/{account_name}/post_topic/{post_id}.json 读取帖子选题点,用相似度计算匹配,combined_score>=0.8 视为匹配成功。"
    "返回:匹配成功的列表,每项含 推导选题点、帖子选题点、匹配分数。"
)
async def point_match(
    derivation_output_points: List[Any],
    account_name: str,
    post_id: str,
    match_threshold: float = DEFAULT_MATCH_THRESHOLD,
    context: Optional[ToolContext] = None,
) -> ToolResult:
    """Agent tool: match derived topic points against one post's topic points.

    Steps: read ../input/{account_name}/post_topic/{post_id}.json, score the
    two lists with similarity_matrix, keep combined_score >= match_threshold,
    and report 推导选题点 / 帖子选题点 / 匹配分数 per successful match.
    """
    topic_path = _post_topic_file(account_name, post_id)
    if not topic_path.is_file():
        return ToolResult(
            title="帖子选题点文件不存在",
            output=f"帖子选题点文件不存在: {topic_path}",
            error="Post topic file not found",
        )
    try:
        if not _normalize_derivation_points(derivation_output_points):
            return ToolResult(
                title="参数无效",
                output="derivation_output_points 不能为空,且需为字符串列表或含 topic/选题点 的字典列表",
                error="Invalid derivation_output_points",
            )
        matched = await match_derivation_to_post_points(
            derivation_output_points, account_name, post_id, match_threshold
        )
        if matched:
            output = "\n".join(
                f"- 推导: {x['推导选题点']}\t帖子: {x['帖子选题点']}\t分数={x['匹配分数']}"
                for x in matched
            )
        else:
            output = f"未找到 combined_score >= {match_threshold} 的匹配"
        return ToolResult(
            title=f"选题点匹配结果 ({account_name}, post_id={post_id})",
            output=output,
            metadata={
                "account_name": account_name,
                "post_id": post_id,
                "match_threshold": match_threshold,
                "count": len(matched),
                "items": matched,
            },
        )
    except Exception as e:  # surface failures through the tool result
        return ToolResult(title="选题点匹配失败", output=str(e), error=str(e))
+
+
def main() -> None:
    """Local smoke test: match derived topic points against one post."""
    import asyncio

    account_name = "家有大志"
    post_id = "68fb6a5c000000000302e5de"
    derivation_output_points = ["分享", "创意改造", "柴犬", "不存在的点"]

    async def run():
        # Core coroutine first, then (if available) the tool wrapper.
        matched = await match_derivation_to_post_points(
            derivation_output_points, account_name, post_id
        )
        print(f"账号: {account_name}, post_id: {post_id}")
        print(f"推导选题点: {derivation_output_points}")
        print(f"匹配成功 {len(matched)} 条:\n")
        for x in matched:
            print(f"  - 推导: {x['推导选题点']}\t帖子: {x['帖子选题点']}\t分数={x['匹配分数']}")
        if ToolResult is not None:
            result = await point_match(
                derivation_output_points=derivation_output_points,
                account_name=account_name,
                post_id=post_id,
            )
            print("\n--- Tool 返回 ---")
            print(result.output)

    asyncio.run(run())


if __name__ == "__main__":
    main()

+ 210 - 0
examples_how/overall_derivation/utils/conditional_ratio_calc.py

@@ -0,0 +1,210 @@
+"""
+条件概率计算工具:
+1)计算某个人设树节点在父节点下的条件概率;
+2)计算某个 pattern 的条件概率。
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+# 已推导列表:每项为 (已推导的选题点, 推导来源人设树节点),如 ("分享","分享")、("柴犬","动物角色")
+# 推导来源人设树节点的 post_ids 在计算条件概率时从人设树中读取
+DerivedItem = tuple[str, str]
+
+
+def _tree_dir(account_name: str, base_dir: Path | None = None) -> Path:
+    """人设树目录:../input/{account_name}/原始数据/tree/(相对本文件所在目录)。"""
+    if base_dir is not None:
+        return base_dir / account_name / "原始数据" / "tree"
+    return Path(__file__).resolve().parent.parent / "input" / account_name / "原始数据" / "tree"
+
+
def _load_trees(account_name: str, base_dir: Path | None = None) -> list[tuple[str, dict]]:
    """Load every dimension tree for the account as (dimension, root) pairs.

    Each JSON file holds one top-level mapping such as {"实质": {...}} or
    {"形式": {...}}; only its first entry is used, and unreadable or
    malformed files are skipped.
    """
    tree_path = _tree_dir(account_name, base_dir)
    if not tree_path.is_dir():
        return []
    trees: list[tuple[str, dict]] = []
    for file in tree_path.glob("*.json"):
        try:
            data = json.loads(file.read_text(encoding="utf-8"))
            for dim_name, root in data.items():
                if isinstance(root, dict):
                    trees.append((dim_name, root))
                break  # first dimension only, by design
        except Exception:
            continue
    return trees
+
+
+def _post_ids_of(node: dict) -> list[str]:
+    """从树节点中取出 _post_ids,无则返回空列表。"""
+    return list(node.get("_post_ids") or [])
+
+
def _build_node_index(account_name: str, base_dir: Path | None = None) -> dict[str, tuple[list[str], list[str]]]:
    """Index every tree node: name -> (own post_ids, parent's post_ids).

    When the same node name occurs on several branches, the first occurrence
    wins so the recorded parent/child pairing stays consistent.
    """
    index: dict[str, tuple[list[str], list[str]]] = {}

    def walk(parent_ids: list[str], node_dict: dict) -> None:
        for name, child in (node_dict.get("children") or {}).items():
            if not isinstance(child, dict):
                continue
            index.setdefault(name, (_post_ids_of(child), list(parent_ids)))
            walk(_post_ids_of(child), child)

    for _dim, root in _load_trees(account_name, base_dir):
        walk(_post_ids_of(root), root)
    return index
+
+
+def _derived_post_ids_from_sources(
+    derived_list: list[DerivedItem],
+    index: dict[str, tuple[list[str], list[str]]],
+) -> set[str]:
+    """根据 derived_list 中的「推导来源人设树节点」在人设树中的 post_ids 取交集,得到已推导的帖子集合。"""
+    common: set[str] | None = None
+    for _topic_point, source_node in derived_list:
+        if source_node not in index:
+            continue
+        pids = set(index[source_node][0])
+        if common is None:
+            common = pids
+        else:
+            common &= pids
+    return common if common is not None else set()
+
+
def calc_node_conditional_ratio(
    account_name: str,
    derived_list: list[DerivedItem],
    tree_node_name: str,
    base_dir: Path | None = None,
) -> float:
    """Conditional probability of tree node N given its parent P.

    Args:
        account_name: account whose trees are read.
        derived_list: (derived topic point, source tree node) pairs; the
            source nodes' post_ids are read from the tree itself.
        tree_node_name: name of node N (exact string match).
        base_dir: optional input root; defaults to ../input next to this file.

    Rule:
        derived set D = intersection of the source nodes' post_ids;
        ratio = |D ∩ N| / |D ∩ P|, capped at 1. Returns 1.0 when the node is
        unknown or the denominator is 0.
    """
    # NOTE(review): the index is rebuilt (tree files re-read) on every call;
    # callers looping over many nodes may want caching upstream.
    index = _build_node_index(account_name, base_dir)
    derived_post_ids = _derived_post_ids_from_sources(derived_list, index)
    entry = index.get(tree_node_name)
    if entry is None:
        return 1.0
    node_ids, parent_ids = entry
    denominator = len(derived_post_ids & set(parent_ids))
    if denominator == 0:
        return 1.0
    numerator = len(derived_post_ids & set(node_ids))
    return min(1.0, numerator / denominator)
+
+
+def _pattern_nodes_and_post_count(pattern: dict[str, Any]) -> tuple[list[str], int]:
+    """从 pattern 中解析出节点列表和 post_count。支持 nodes + post_count 或 i + post_count。"""
+    nodes = pattern.get("nodes")
+    if nodes is not None and isinstance(nodes, list):
+        nodes = [str(x).strip() for x in nodes if x]
+    else:
+        raw = pattern.get("i") or pattern.get("pattern_str") or ""
+        nodes = [x.strip() for x in str(raw).replace("+", " ").split() if x.strip()]
+    post_count = int(pattern.get("post_count", 0))
+    return nodes, post_count
+
+
def calc_pattern_conditional_ratio(
    account_name: str,
    derived_list: list[DerivedItem],
    pattern: dict[str, Any],
    base_dir: Path | None = None,
) -> float:
    """Conditional probability of a pattern given the already-derived nodes.

    Args:
        account_name: account whose trees are read.
        derived_list: (derived topic point, source tree node) pairs.
        pattern: must carry its node list ("nodes" list, or "i" joined with
            "+") and "post_count" (the numerator).
        base_dir: optional input root.

    Rule:
        take the pattern nodes that appear among the derived source nodes,
        intersect their tree post_ids to get the denominator; numerator is
        pattern.post_count; ratio = numerator/denominator capped at 1.
        Degenerate cases (no usable nodes, empty intersection,
        post_count <= 0) return 1.0.
    """
    pattern_nodes, post_count = _pattern_nodes_and_post_count(pattern)
    if not pattern_nodes or post_count <= 0:
        return 1.0

    derived_sources = {source for _topic, source in derived_list}
    # Pattern nodes that were already derived.
    derived_pattern_nodes = [n for n in pattern_nodes if n in derived_sources]
    if not derived_pattern_nodes:
        return 1.0

    index = _build_node_index(account_name, base_dir)
    # Only derived nodes that actually exist in the tree contribute post_ids.
    post_id_sets = [
        set(index[name][0]) for name in derived_pattern_nodes if name in index
    ]
    if not post_id_sets:
        return 1.0
    denominator = set.intersection(*post_id_sets)
    if not denominator:
        return 1.0
    return min(1.0, post_count / len(denominator))
+
+
def _test_with_user_example() -> None:
    """Manual check with sample data for the 家有大志 account.

    Derived items: (分享|分享) (optionally 柴犬|动物角色); evaluates one tree
    node and one pattern. Source post_ids are read from the persona tree.
    """
    account_name = "家有大志"
    # (derived topic point, source tree node)
    derived_list: list[DerivedItem] = [
        ("分享", "分享"),
        # ("柴犬", "动物角色"),
    ]

    # 1) Conditional probability of tree node 「恶作剧」.
    r_node = calc_node_conditional_ratio(account_name, derived_list, "恶作剧")
    print(f"1) 人设树节点「恶作剧」条件概率: {r_node}")

    # 2) Conditional probability of pattern 分享+动物角色+创意表达 (post_count=2).
    pattern = {"i": "分享+动物角色+创意表达", "post_count": 2}
    r_pattern = calc_pattern_conditional_ratio(account_name, derived_list, pattern)
    print(f"2) pattern 分享+动物角色+创意表达 (post_count=2) 条件概率: {r_pattern}")


if __name__ == "__main__":
    _test_with_user_example()

+ 358 - 0
examples_how/overall_derivation/utils/similarity_calc.py

@@ -0,0 +1,358 @@
+"""
+相似度计算工具:计算两组短语的 M×N 相似度矩阵。
+使用综合相似度:embedding 50% + LLM 50%。
+"""
+
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import json
+import logging
+import os
+import re
+import time
+from typing import List, Tuple, TypedDict
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+# 缓存目录:相对本文件所在目录的 ../.cache/similarity,按 (phrase_a, phrase_b) 原子化存储
+_CACHE_DIR = os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".cache", "similarity"))
+
+
+def _atomic_pair_key(phrase_a: str, phrase_b: str) -> str:
+    """单对短语的缓存键(原子粒度)。"""
+    raw = json.dumps([phrase_a, phrase_b], ensure_ascii=False, sort_keys=False)
+    return hashlib.sha256(raw.encode("utf-8")).hexdigest()
+
+
def _ensure_cache_dir() -> None:
    # Create the cache directory (and any missing parents); no-op if present.
    os.makedirs(_CACHE_DIR, exist_ok=True)
+
+
def _read_atomic_score(cache_type: str, phrase_a: str, phrase_b: str) -> float | None:
    """Return the cached score for one phrase pair; None on miss or read failure."""
    path = os.path.join(_CACHE_DIR, f"{cache_type}_{_atomic_pair_key(phrase_a, phrase_b)}.json")
    if not os.path.isfile(path):
        return None
    try:
        with open(path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
        # Guard against hash collisions / stale files: the stored phrases must
        # match exactly, otherwise treat the entry as a miss.
        if payload.get("phrase_a") == phrase_a and payload.get("phrase_b") == phrase_b:
            return float(payload["score"])
        return None
    except Exception as exc:
        logger.debug("[similarity_cache] 读取 %s 失败: %s", path, exc)
        return None
+
+
def _write_atomic_score(cache_type: str, phrase_a: str, phrase_b: str, score: float) -> None:
    """Persist the score for one phrase pair (atomic record: phrase_a, phrase_b, score).

    The file is written to a temp path and published with os.replace so that
    concurrent readers (embedding and LLM lookups run concurrently via
    asyncio.gather) never observe a partially written JSON file.
    Failures are logged and swallowed: the cache is best-effort.
    """
    _ensure_cache_dir()
    key = _atomic_pair_key(phrase_a, phrase_b)
    path = os.path.join(_CACHE_DIR, f"{cache_type}_{key}.json")
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump({"phrase_a": phrase_a, "phrase_b": phrase_b, "score": score}, f, ensure_ascii=False)
        os.replace(tmp_path, path)  # atomic rename on both POSIX and Windows
    except Exception as e:
        logger.warning("[similarity_cache] 写入 %s 失败: %s", path, e)
+
+
class SimilarityItem(TypedDict):
    """One similarity result row for a single phrase pair."""
    phrase_a: str
    phrase_b: str
    embedding_score: float
    llm_score: float
    combined_score: float
+
# Batch prompt template for LLM scoring. Placeholders: {count}, {pairs_list};
# literal braces inside the JSON example are escaped as {{ }}.
DEFAULT_BATCH_PROMPT_TEMPLATE = """
请从语意角度判断以下{count}对短语的相似度,每对从0-1打分,输出格式如下(必须是一个JSON数组):
```json
[
    {{
        "text_1": "",
        "text_2": "",
        "score": 0.0,
        "reason": "简明扼要说明理由"
    }},
    {{
        "text_1": "",
        "text_2": "",
        "score": 0.0,
        "reason": "简明扼要说明理由"
    }}
]
```

短语对列表:
{pairs_list}
""".strip()

# Embedding similarity API endpoint
EMBEDDING_SIMILARITY_URL = "http://61.48.133.26:8187/cartesian_similarity"
# LLM model used for pair scoring
LLM_MODEL = "openai/gpt-4.1-mini"
+
+
+def _phrase_pairs(phrases_a: List[str], phrases_b: List[str]) -> List[Tuple[str, str]]:
+    """将 M×N 展开为短语对列表,顺序为 (a0,b0),(a0,b1),...,(a0,b_{N-1}),(a1,b0),..."""
+    return [(a, b) for a in phrases_a for b in phrases_b]
+
+
async def _embedding_similarity(phrases_a: List[str], phrases_b: List[str]) -> List[List[float]]:
    """
    Call the cartesian_similarity API and return an M×N score matrix.

    The atomic cache (phrase_a, phrase_b) -> score is consulted first; the API
    is called only for pairs that missed the cache, and fresh scores are
    written back.

    Args:
        phrases_a: first phrase group (M items)
        phrases_b: second phrase group (N items)

    Returns:
        M×N matrix of floats; pairs missing from the API response keep 0.0.
    """
    if not phrases_a or not phrases_b:
        return []

    M, N = len(phrases_a), len(phrases_b)
    matrix = [[0.0] * N for _ in range(M)]
    missing_indices: List[Tuple[int, int]] = []
    for i in range(M):
        for j in range(N):
            score = _read_atomic_score("embedding", phrases_a[i], phrases_b[j])
            if score is not None:
                matrix[i][j] = score
            else:
                missing_indices.append((i, j))

    total = M * N
    hit_count = total - len(missing_indices)
    if hit_count > 0:
        logger.info("[similarity_matrix] embedding 原子缓存命中 %d/%d", hit_count, total)
    if not missing_indices:
        return matrix

    # Only call the API for cache misses: deduplicated, order-preserving
    # phrase lists for the request body.
    a_set: List[str] = list(dict.fromkeys(phrases_a[i] for i, _ in missing_indices))
    b_set: List[str] = list(dict.fromkeys(phrases_b[j] for _, j in missing_indices))
    # O(1) position lookup instead of repeated list.index() (O(n) per pair,
    # which made the fill loop accidentally quadratic).
    a_pos = {phrase: idx for idx, phrase in enumerate(a_set)}
    b_pos = {phrase: idx for idx, phrase in enumerate(b_set)}
    async with httpx.AsyncClient(timeout=60.0) as client:
        resp = await client.post(
            EMBEDDING_SIMILARITY_URL,
            json={"texts1": a_set, "texts2": b_set},
            headers={"Content-Type": "application/json"},
        )
        resp.raise_for_status()
        data = resp.json()

    # The API returns the cartesian product a_set×b_set flattened row-major.
    results = data.get("results", [])
    len_b = len(b_set)
    for i, j in missing_indices:
        a, b = phrases_a[i], phrases_b[j]
        idx_flat = a_pos[a] * len_b + b_pos[b]
        if idx_flat < len(results):
            score = float(results[idx_flat]["score"])
            matrix[i][j] = score
            _write_atomic_score("embedding", a, b, score)
    return matrix
+
+
+def _extract_json_array(content: str) -> List[dict]:
+    """从 LLM 回复中解析 JSON 数组(允许被 ```json ... ``` 包裹)。"""
+    content = content.strip()
+    # 尝试匹配 ```json ... ``` 中的内容
+    m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
+    if m:
+        content = m.group(1).strip()
+    return json.loads(content)
+
+
async def _llm_similarity(phrases_a: List[str], phrases_b: List[str]) -> List[List[float]]:
    """
    Score phrase pairs with an LLM; returns an M×N matrix.  The atomic cache
    is consulted first and only missed pairs are sent to the model.
    """
    if not phrases_a or not phrases_b:
        return []

    rows, cols = len(phrases_a), len(phrases_b)
    matrix = [[0.0] * cols for _ in range(rows)]
    missing: List[Tuple[int, int]] = []
    for r in range(rows):
        for c in range(cols):
            cached = _read_atomic_score("llm", phrases_a[r], phrases_b[c])
            if cached is None:
                missing.append((r, c))
            else:
                matrix[r][c] = cached

    total = rows * cols
    hits = total - len(missing)
    if hits > 0:
        logger.info("[similarity_matrix] LLM 原子缓存命中 %d/%d", hits, total)
    if not missing:
        return matrix

    # Only ask the LLM about cache misses: build the numbered pair list in
    # miss order; the model is instructed to answer in the same order.
    pair_lines = []
    for idx, (r, c) in enumerate(missing):
        a, b = phrases_a[r], phrases_b[c]
        pair_lines.append(f'{idx + 1}. "{a}" 和 "{b}"')
    prompt = DEFAULT_BATCH_PROMPT_TEMPLATE.format(count=len(missing), pairs_list="\n".join(pair_lines))

    from agent.llm.openrouter import openrouter_llm_call

    result = await openrouter_llm_call([{"role": "user", "content": prompt}], model=LLM_MODEL)
    content = result.get("content", "")
    if not content:
        raise ValueError("LLM 未返回内容")

    parsed = _extract_json_array(content)
    for idx, (r, c) in enumerate(missing):
        if idx >= len(parsed):
            break
        # Clamp to the documented [0, 1] score range.
        clamped = max(0.0, min(1.0, float(parsed[idx].get("score", 0.0))))
        matrix[r][c] = clamped
        _write_atomic_score("llm", phrases_a[r], phrases_b[c], clamped)
    return matrix
+
+
async def similarity_matrix(
    phrases_a: List[str],
    phrases_b: List[str],
    *,
    embedding_weight: float = 0.5,
    llm_weight: float = 0.5,
) -> List[SimilarityItem]:
    """
    Compute similarity for two phrase groups; returns a list of items, each
    holding phrase_a, phrase_b, embedding_score, llm_score and combined_score.

    combined_score = embedding_weight * embedding_score + llm_weight * llm_score
    (50/50 by default).

    Args:
        phrases_a: first phrase group (M items)
        phrases_b: second phrase group (N items)
        embedding_weight: weight of the embedding score, default 0.5
        llm_weight: weight of the LLM score, default 0.5

    Returns:
        List of length M×N, ordered like the pairs (a0,b0),(a0,b1),...,(aM-1,bN-1).
    """
    if not phrases_a or not phrases_b:
        return []

    num_a, num_b = len(phrases_a), len(phrases_b)
    logger.info("[similarity_matrix] 开始计算: phrases_a=%d, phrases_b=%d, 短语对=%d", num_a, num_b, num_a * num_b)
    started = time.perf_counter()

    async def _timed_embedding() -> List[List[float]]:
        t0 = time.perf_counter()
        scores = await _embedding_similarity(phrases_a, phrases_b)
        logger.info("[similarity_matrix] embedding 耗时: %.3fs", time.perf_counter() - t0)
        return scores

    async def _timed_llm() -> List[List[float]]:
        t0 = time.perf_counter()
        scores = await _llm_similarity(phrases_a, phrases_b)
        logger.info("[similarity_matrix] LLM 耗时: %.3fs", time.perf_counter() - t0)
        return scores

    # Both backends consult their caches and remote services independently,
    # so they can run concurrently.
    emb_matrix, llm_matrix = await asyncio.gather(_timed_embedding(), _timed_llm())
    logger.info("[similarity_matrix] 总耗时: %.3fs", time.perf_counter() - started)

    result: List[SimilarityItem] = []
    for i, a in enumerate(phrases_a):
        for j, b in enumerate(phrases_b):
            emb_s = emb_matrix[i][j]
            llm_s = llm_matrix[i][j]
            result.append({
                "phrase_a": a,
                "phrase_b": b,
                "embedding_score": emb_s,
                "llm_score": llm_s,
                "combined_score": embedding_weight * emb_s + llm_weight * llm_s,
            })
    return result
+
+
def similarity_matrix_sync(
    phrases_a: List[str],
    phrases_b: List[str],
    **kwargs,
) -> List[SimilarityItem]:
    """Synchronous wrapper: runs similarity_matrix via asyncio.run; same return structure.

    NOTE(review): must not be called from inside a running event loop —
    asyncio.run would raise RuntimeError there.
    """
    return asyncio.run(similarity_matrix(phrases_a, phrases_b, **kwargs))
+
+
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
+
def test_phrase_pairs() -> None:
    """Check the ordering of the M×N pair expansion."""
    pairs = _phrase_pairs(["犬", "猫咪"], ["狗", "手机"])
    expected = [("犬", "狗"), ("犬", "手机"), ("猫咪", "狗"), ("猫咪", "手机")]
    assert len(pairs) == 4
    assert pairs == expected
    print("test_phrase_pairs: ok")
+
+
def test_extract_json_array() -> None:
    """Check JSON-array extraction from an LLM reply."""
    # Fenced in ```json ... ```
    fenced = '''一些说明
```json
[
    {"text_1": "犬", "text_2": "狗", "score": 0.85, "reason": "同义"}
]
```
'''
    parsed = _extract_json_array(fenced)
    assert len(parsed) == 1
    assert parsed[0]["score"] == 0.85
    # Bare JSON array, no fence
    bare = _extract_json_array('[{"score": 0.5}]')
    assert len(bare) == 1 and bare[0]["score"] == 0.5
    print("test_extract_json_array: ok")
+
+
async def test_similarity_matrix() -> None:
    """Integration test: embedding + LLM combine into the similarity item list.

    Requires the embedding service and the OpenRouter key to be reachable;
    the __main__ block runs it best-effort.
    """
    phrases_a = ["犬", "猫咪"]
    phrases_b = ["狗", "手机"]
    items = await similarity_matrix(phrases_a, phrases_b)
    assert len(items) == 4
    for row in items:
        assert "phrase_a" in row and "phrase_b" in row
        assert "embedding_score" in row and "llm_score" in row and "combined_score" in row
        assert 0 <= row["combined_score"] <= 1, f"combined_score 应在 [0,1],得到 {row['combined_score']}"
    # Semantically "犬"-"狗" should outrank "犬"-"手机"
    dog_dog = next(r for r in items if r["phrase_a"] == "犬" and r["phrase_b"] == "狗")
    dog_phone = next(r for r in items if r["phrase_a"] == "犬" and r["phrase_b"] == "手机")
    assert dog_dog["combined_score"] > dog_phone["combined_score"], "犬-狗 应高于 犬-手机"
    print("test_similarity_matrix: ok")
    for r in items:
        print(f"  {r['phrase_a']}-{r['phrase_b']}: emb={r['embedding_score']:.4f} llm={r['llm_score']:.4f} combined={r['combined_score']:.4f}")
+
+
+if __name__ == "__main__":
+    test_phrase_pairs()
+    test_extract_json_array()
+    print("运行集成测试(需 embedding API、OPEN_ROUTER_API_KEY 及 agent 依赖)...")
+    try:
+        asyncio.run(test_similarity_matrix())
+        print("全部通过。")
+    except Exception as e:
+        print(f"跳过集成测试: {e}")
+        print("仅单元测试已通过。集成测试请确保:1) embedding 服务可访问 2) 设置 OPEN_ROUTER_API_KEY 3) 在项目根目录执行: python -m examples_how.overall_derivation.utils.similarity_calc")