# howard / Agent
"""
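Find Pattern Tool - fetch the patterns in the pattern libraries that meet a conditional-probability threshold.

Features:
- Account: reads input/{account}/处理后数据/pattern/pattern.json; the conditional probability is based on the account's persona tree.
- Platform library: reads input/xiaohongshu/pattern/processed_edge_data.json; the conditional probability is based on xiaohongshu/tree.
All patterns are sorted by (conditional probability * number of pattern elements) in descending order; the account gets 60% of the quota and the platform library gets 40%.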
"""

import json
import sys
from pathlib import Path
from typing import Any

# Make sure utils / tools can be resolved both when this file is run directly
# and when it is loaded as part of a package (this also lets IDEs jump to them).
# _root is the parent of the directory that contains this file.
_root = Path(__file__).resolve().parent.parent
if str(_root) not in sys.path:
    sys.path.insert(0, str(_root))
from utils.conditional_ratio_calc import (
    build_node_index_for_tree_dir,
    calc_pattern_conditional_ratio,
    calc_pattern_conditional_ratio_with_index,
)
from tools.point_match import (
    DEFAULT_MATCH_THRESHOLD,
)

try:
    from agent.tools import tool, ToolResult, ToolContext
except ImportError:
    # The agent package may be absent when only the core logic is exercised
    # through main(); fall back to inert placeholders in that case.
    ToolResult = None
    ToolContext = None

    def tool(*args, **kwargs):
        """Stand-in decorator factory: ignore all arguments and leave the function untouched."""
        def _identity(func):
            return func
        return _identity

# Key definitions, kept consistent with pattern_data_process.
# TOP_KEYS: top-level keys looked up in a pattern JSON file when loading it.
TOP_KEYS = [
    "depth_max_with_name",
    "depth_mixed",
    "depth_max_concrete",
    "depth2_medium",
    "depth1_abstract",
    "depth_max_minus_1",
    "depth_max_minus_2",
    "depth_3",
    "depth_4",
]
# SUB_KEYS: keys read from each top-level block; each one holds a list of patterns.
SUB_KEYS = ["two_x", "one_x", "zero_x"]

# Root of all input data: the "input" directory under the parent of this file's directory.
_BASE_INPUT = Path(__file__).resolve().parent.parent / "input"

# Platform library (xiaohongshu) locations: the tree directory and the pattern JSON file.
_PLATFORM_TREE_DIR = _BASE_INPUT / "xiaohongshu" / "tree"
_PLATFORM_PATTERN_FILE = _BASE_INPUT / "xiaohongshu" / "pattern" / "processed_edge_data.json"


def _pattern_file(account_name: str) -> Path:
    """Return the account pattern library path: <input>/{account_name}/处理后数据/pattern/pattern.json."""
    processed_dir = _BASE_INPUT / account_name / "处理后数据"
    pattern_dir = processed_dir / "pattern"
    return pattern_dir / "pattern.json"


def _platform_pattern_file() -> Path:
    """Return the platform library pattern path: <input>/xiaohongshu/pattern/processed_edge_data.json."""
    platform_path = _PLATFORM_PATTERN_FILE
    return platform_path


def _slim_pattern(p: dict) -> tuple[float, int, list[str], int]:
    """
    Reduce a raw pattern record to the fields needed for merging.

    Args:
        p: Raw pattern dict. Reads "items" (an iterable of mappings that each
            carry a "name"), "support", "length" and "post_count". A missing
            key or an explicit null is treated as empty / 0.

    Returns:
        A tuple (support, length, names, post_count): support is rounded to
        4 decimals; names holds the item names with duplicates removed and
        first-occurrence order preserved.

    Raises:
        KeyError: if an entry of "items" has no "name" key.
    """
    names = [item["name"] for item in (p.get("items") or [])]
    # dict.fromkeys drops duplicates while keeping first-occurrence order.
    unique = list(dict.fromkeys(names))
    # `or 0` also covers explicit JSON nulls: .get(key, 0) returns None for
    # them, which float()/int() would reject with a TypeError.
    support = round(float(p.get("support") or 0), 4)
    length = int(p.get("length") or 0)
    post_count = int(p.get("post_count") or 0)
    return support, length, unique, post_count


def _merge_and_dedupe(patterns: list[dict]) -> list[dict]:
    """
    Collapse patterns that share the same set of item names (order-insensitive).

    For each name set the entry with the largest support wins (the first one
    seen is kept on ties). Each output dict keeps "s" (support), "l" (length),
    "i" (the sorted names joined with "+") and "post_count", which the
    conditional-probability calculation consumes. The result is ordered by
    s * l, largest first.
    """
    best_by_names: dict[tuple, tuple[float, int, int]] = {}
    for raw in patterns:
        support, length, names, post_count = _slim_pattern(raw)
        if not names:
            continue
        name_key = tuple(sorted(names))
        current = best_by_names.get(name_key)
        if current is None or support > current[0]:
            best_by_names[name_key] = (support, length, post_count)

    merged = [
        {
            "s": support,
            "l": length,
            "i": "+".join(name_key),
            "post_count": post_count,
        }
        for name_key, (support, length, post_count) in best_by_names.items()
    ]
    return sorted(merged, key=lambda entry: entry["s"] * entry["l"], reverse=True)


def _load_patterns_from_file(path: Path) -> list[dict]:
    """
    Shared loader for the account and platform pattern libraries.

    Reads the JSON file at `path`, gathers the pattern lists found under every
    TOP_KEYS / SUB_KEYS combination, and returns them merged and de-duplicated
    by _merge_and_dedupe.

    Returns:
        The merged pattern list, or [] when the file does not exist or its
        top-level JSON value is not an object.

    Raises:
        OSError, ValueError: propagated from opening or decoding the file.
    """
    if not path.is_file():
        return []
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        return []
    all_patterns: list[dict] = []
    for top in TOP_KEYS:
        block = data.get(top)
        # A missing key or a non-object value (e.g. JSON null) has no sub-lists to read.
        if not isinstance(block, dict):
            continue
        for sub in SUB_KEYS:
            all_patterns.extend(block.get(sub) or [])
    return _merge_and_dedupe(all_patterns)


def _load_and_merge_patterns(account_name: str) -> list[dict]:
    """Load the account's pattern library JSON and return its patterns merged and de-duplicated."""
    return _load_patterns_from_file(_pattern_file(account_name))


def _load_and_merge_platform_patterns() -> list[dict]:
    """Load the platform pattern library JSON (same layout as the account library), merged and de-duplicated."""
    return _load_patterns_from_file(_platform_pattern_file())


def _load_match_lookup(file_path: Path) -> dict[tuple[str, str], float]:
    """
    Read a match_data file into a (post topic, persona tree node) -> best score map.

    Expected file layout:
    [{"name": post topic, "match_personas": [{"name": node name, "match_score": float}]}]

    Best effort: a missing file, an unreadable or undecodable file, or a
    top-level value that is not a list all produce an empty dict. Malformed
    entries are skipped. Topic and node names are stripped of surrounding
    whitespace; when a pair occurs more than once the highest score is kept.
    """
    best_scores: dict[tuple[str, str], float] = {}
    if not file_path.is_file():
        return best_scores
    try:
        with open(file_path, "r", encoding="utf-8") as fh:
            records = json.load(fh)
    except Exception:
        return best_scores
    if not isinstance(records, list):
        return best_scores

    def _scored_pairs():
        """Yield ((topic, node), score) for every well-formed entry, in file order."""
        for record in records:
            if not isinstance(record, dict):
                continue
            raw_topic = record.get("name")
            candidates = record.get("match_personas")
            if raw_topic is None or not isinstance(candidates, list):
                continue
            topic_name = str(raw_topic).strip()
            if not topic_name:
                continue
            for candidate in candidates:
                if not isinstance(candidate, dict):
                    continue
                raw_node = candidate.get("name")
                raw_score = candidate.get("match_score")
                if raw_node is None or raw_score is None:
                    continue
                try:
                    score_value = float(raw_score)
                except (TypeError, ValueError):
                    continue
                yield (topic_name, str(raw_node).strip()), score_value

    for pair, score_value in _scored_pairs():
        previous = best_scores.get(pair)
        if previous is None or score_value > previous:
            best_scores[pair] = score_value
    return best_scores


def _pattern_has_derived_match(
    pattern_name: str,
    derived_topics: set[str],
    match_lookup: dict[tuple[str, str], float],
    threshold: float,
) -> bool:
    """
    Tell whether any element of a pattern matches any derived topic.

    pattern_name is split on "+"; blank elements are ignored. A pair
    (topic, element) that is absent from match_lookup scores 0.0. Returns True
    as soon as one pair scores >= threshold.
    """
    elements = [part.strip() for part in pattern_name.split("+")]
    return any(
        match_lookup.get((topic, element), 0.0) >= threshold
        for element in elements
        if element
        for topic in derived_topics
    )


def _parse_derived_list(derived_items: list[dict[str, str]]) -> list[tuple[str, str]]:
    """
    Convert the agent-supplied items into (topic, source node) string pairs.

    Dict items are read through "topic" / "source_node", falling back to the
    keys "已推导的选题点" / "推导来源人设树节点"; a dict missing either value
    is dropped. List or tuple items with at least two elements contribute
    their first two elements. Anything else is ignored. Both values are
    converted with str() and stripped.
    """
    parsed: list[tuple[str, str]] = []
    for entry in derived_items:
        if isinstance(entry, dict):
            topic = entry.get("topic") or entry.get("已推导的选题点")
            source = entry.get("source_node") or entry.get("推导来源人设树节点")
            if topic is None or source is None:
                continue
        elif isinstance(entry, (list, tuple)) and len(entry) >= 2:
            topic, source = entry[0], entry[1]
        else:
            continue
        parsed.append((str(topic).strip(), str(source).strip()))
    return parsed


def get_patterns_by_conditional_ratio(
    account_name: str,
    derived_list: list[tuple[str, str]],
    conditional_ratio_threshold: float,
    top_n: int,
) -> list[dict[str, Any]]:
    """
    Return the account patterns whose conditional probability is >= the threshold.

    With an empty derived_list each pattern's own support ("s") stands in for
    the conditional probability; otherwise calc_pattern_conditional_ratio is
    used. Results are ordered by conditional probability * pattern length,
    largest first, and cut to top_n entries.

    Each returned item holds "pattern名称" (nameA+nameB+nameC) and "条件概率"
    (rounded to 6 decimals). An empty pattern library yields [].
    """
    candidates = _load_and_merge_patterns(account_name)
    if not candidates:
        return []

    if derived_list:
        def ratio_of(pattern: dict):
            return calc_pattern_conditional_ratio(
                account_name, derived_list, pattern, base_dir=_BASE_INPUT
            )
    else:
        def ratio_of(pattern: dict):
            return float(pattern.get("s", 0))

    qualified: list[tuple[dict, float]] = []
    for pattern in candidates:
        ratio = ratio_of(pattern)
        if ratio >= conditional_ratio_threshold:
            qualified.append((pattern, ratio))

    qualified.sort(key=lambda pair: pair[1] * pair[0]["l"], reverse=True)
    return [
        {"pattern名称": pattern["i"], "条件概率": round(ratio, 6)}
        for pattern, ratio in qualified[:top_n]
    ]


def get_platform_patterns_by_conditional_ratio(
    derived_list: list[tuple[str, str]],
    conditional_ratio_threshold: float,
    top_n: int,
) -> list[dict[str, Any]]:
    """
    Return the platform library patterns whose conditional probability is >= the threshold.

    Patterns come from the platform pattern file
    (xiaohongshu/pattern/processed_edge_data.json). The conditional probability
    is computed by calc_pattern_conditional_ratio_with_index against a node
    index built from the xiaohongshu/tree directory; with an empty derived_list
    each pattern's own support ("s") is used instead. Results are ordered by
    conditional probability * pattern length, largest first, and cut to top_n.
    """
    candidates = _load_and_merge_platform_patterns()
    if not candidates:
        return []
    # Built as soon as there are candidates, whether or not derived_list is empty.
    node_index = build_node_index_for_tree_dir(_PLATFORM_TREE_DIR)

    if derived_list:
        def ratio_of(pattern: dict):
            return calc_pattern_conditional_ratio_with_index(derived_list, pattern, node_index)
    else:
        def ratio_of(pattern: dict):
            return float(pattern.get("s", 0))

    qualified: list[tuple[dict, float]] = []
    for pattern in candidates:
        ratio = ratio_of(pattern)
        if ratio >= conditional_ratio_threshold:
            qualified.append((pattern, ratio))

    qualified.sort(key=lambda pair: pair[1] * pair[0]["l"], reverse=True)
    return [
        {"pattern名称": pattern["i"], "条件概率": round(ratio, 6)}
        for pattern, ratio in qualified[:top_n]
    ]


# ---------------------------------------------------------------------------
# Agent Tool
# ---------------------------------------------------------------------------

# English summary (the docstring below is deliberately left untouched):
# Select patterns whose conditional probability meets the threshold and return
# them as a two-section text report: account patterns first, then platform
# library patterns.
#   - The account quota is int(top_n * 0.6); the platform library gets the rest.
#   - The platform library is queried with conditional_ratio_threshold / 5.
#   - Platform entries whose name already appears in the account section are dropped.
#   - When derived_items yields topics and post_id is truthy, both sections keep
#     only patterns that pass _pattern_has_derived_match against the post's
#     match_data files.
#   - A missing account pattern file, or any exception raised while building
#     the report, is returned as an error ToolResult instead of being raised.
# NOTE(review): `tool` comes from agent.tools, which is not visible here; if it
# reads this docstring at runtime (e.g. as the tool description), editing the
# docstring would change behavior — confirm before translating it.
@tool()
async def find_pattern(
    account_name: str,
    post_id: str,
    derived_items: list[dict[str, str]],
    conditional_ratio_threshold: float,
    top_n: int = 100,
    match_score_threshold: float = DEFAULT_MATCH_THRESHOLD,
) -> ToolResult:
    """
    按条件概率阈值从 pattern 库筛选：第一节为账号 pattern（优先使用），第二节为平台库 pattern。
    所有 pattern 按 条件概率 * pattern元素长度 降序排列。

    Args:
    account_name : 账号名，用于定位该账号的 pattern 库。
    post_id : 帖子ID，用于加载 match_data 过滤（derived_items 非空时生效）。
    derived_items : 已推导选题点列表，可为空。
    conditional_ratio_threshold : 条件概率阈值。
    top_n : 最终返回总条数上限。
    match_score_threshold : pattern 元素与帖子选题点的匹配分阈值。

    Returns:
    ToolResult：output 分「账号 pattern」「平台库 pattern」两段；平台段已排除与账号段 pattern 名称完全相同的项。
    """
    # The account pattern file is mandatory: without it the tool reports an error right away.
    pattern_path = _pattern_file(account_name)
    if not pattern_path.is_file():
        return ToolResult(
            title="Pattern 库不存在",
            output=f"pattern 文件不存在: {pattern_path}",
            error="Pattern file not found",
        )
    try:
        derived_list = _parse_derived_list(derived_items or [])
        derived_topics = {topic for topic, _ in derived_list}
        thr = float(match_score_threshold)
        # Negative top_n is clamped to 0; the account share is truncated, the platform gets the remainder.
        total_top_n = max(0, int(top_n))
        account_top_n = int(total_top_n * 0.6)
        platform_top_n = total_top_n - account_top_n
        # When filtering applies, enlarge the candidate pool so that enough entries survive the filter
        candidate_mult = max(total_top_n * 5, 500) if derived_topics and post_id else 0

        # Preload the match lookups (only when derived_topics is non-empty and post_id is set)
        account_match_lookup: dict[tuple[str, str], float] = {}
        platform_match_lookup: dict[tuple[str, str], float] = {}
        if derived_topics and post_id:
            account_match_file = (
                _BASE_INPUT / account_name / "处理后数据" / "match_data"
                / f"{post_id}_匹配_all.json"
            )
            platform_match_file = (
                _BASE_INPUT / "xiaohongshu" / "match_data" / f"{post_id}_匹配_all.json"
            )
            account_match_lookup = _load_match_lookup(account_match_file)
            platform_match_lookup = _load_match_lookup(platform_match_file)

        def _filter_by_derived_match(
            items: list[dict],
            match_lookup: dict[tuple[str, str], float],
        ) -> list[dict]:
            """Keep only patterns with at least one element scoring >= thr against any derived topic; without derived topics or post_id, return items unchanged."""
            if not derived_topics or not post_id:
                return items
            return [
                x for x in items
                if _pattern_has_derived_match(
                    str(x.get("pattern名称", "")), derived_topics, match_lookup, thr
                )
            ]

        # ---------- account patterns ----------
        # Fetch the enlarged pool when filtering applies, otherwise exactly the account quota.
        account_candidate_n = candidate_mult if candidate_mult else account_top_n
        items_account_raw = get_patterns_by_conditional_ratio(
            account_name, derived_list, conditional_ratio_threshold, account_candidate_n
        )
        items_account = _filter_by_derived_match(items_account_raw, account_match_lookup)[:account_top_n]
        account_pattern_names = {str(x.get("pattern名称", "")).strip() for x in items_account}

        # ---------- platform library patterns ----------
        # Ask for extra candidates to make up for the ones removed as duplicates of account patterns.
        platform_candidate_n = (candidate_mult + len(account_pattern_names)) if candidate_mult else (platform_top_n + len(account_pattern_names))
        items_platform_raw = get_platform_patterns_by_conditional_ratio(
            derived_list,
            conditional_ratio_threshold / 5,
            platform_candidate_n,
        )
        # Drop names already present in the account section, then filter and cut to the platform quota.
        items_platform = _filter_by_derived_match(
            [x for x in items_platform_raw if str(x.get("pattern名称", "")).strip() not in account_pattern_names],
            platform_match_lookup,
        )[:platform_top_n]

        def _format_pattern_block(xs: list[dict[str, Any]]) -> list[str]:
            return [f"- {x['pattern名称']}\t条件概率={x['条件概率']}" for x in xs]

        # Assemble the text report: header, account section, platform section.
        lines_out: list[str] = []
        lines_out.append(
            "【优先使用】第一节为账号 pattern（优先使用）；第二节为平台库 pattern。"
        )
        lines_out.append("")
        lines_out.append("—— 账号 pattern ——")
        if not items_account:
            lines_out.append(
                f"（无：未找到条件概率 >= {conditional_ratio_threshold} 的 pattern）"
            )
        else:
            lines_out.extend(_format_pattern_block(items_account))
        lines_out.append("")
        lines_out.append("—— 平台库 pattern ——")
        if not items_platform:
            lines_out.append("（无：未找到达标 pattern）")
        else:
            lines_out.extend(_format_pattern_block(items_platform))

        output = "\n".join(lines_out)
        return ToolResult(
            title=f"符合条件概率的 Pattern ({account_name}, 阈值={conditional_ratio_threshold})",
            output=output,
            metadata={
                "account_name": account_name,
                "conditional_ratio_threshold": conditional_ratio_threshold,
                "top_n": top_n,
                "quota": {
                    "account_top_n": account_top_n,
                    "platform_top_n": platform_top_n,
                },
                "account_pattern_count": len(items_account),
                "platform_pattern_count": len(items_platform),
                "count": len(items_account) + len(items_platform),
            },
        )
    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising.
        return ToolResult(
            title="查找 Pattern 失败",
            output=str(e),
            error=str(e),
        )


def main() -> None:
    """
    Local smoke test: query patterns for a sample account through the tool interface.

    Does nothing when the agent package could not be imported (ToolResult is
    None). Otherwise it awaits find_pattern once and prints the returned output.
    """
    import asyncio

    account_name = "家有大志"
    post_id = "68fb6a5c000000000302e5de"
    conditional_ratio_threshold = 0.2
    top_n = 500

    # Flip to True to exercise the match_data filtering path with the sample
    # topics below. The default run passes no derived topics, in which case
    # each pattern's own support is used as its conditional probability.
    use_sample_topics = False
    sample_topics: list[dict[str, str]] = [
        {"topic": "分享", "source_node": "分享"},
        {"topic": "植入方式", "source_node": "植入方式"},
        {"topic": "叙事结构", "source_node": "叙事结构"},
    ]
    derived_items: list[dict[str, str]] = sample_topics if use_sample_topics else []

    # To check the ranking logic alone, call get_patterns_by_conditional_ratio
    # with _parse_derived_list(derived_items) directly instead of going through the tool.
    if ToolResult is None:
        return

    async def run_tool() -> None:
        result = await find_pattern(
            account_name=account_name,
            post_id=post_id,
            derived_items=derived_items,
            conditional_ratio_threshold=conditional_ratio_threshold,
            top_n=top_n,
        )
        print("\n--- Tool 返回 ---")
        print(result.output)

    asyncio.run(run_tool())


# Run the local smoke test only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()