# topic_summary.py
#!/usr/bin/env python3
"""
Account persona summary:
1. Read the persona-tree JSON files under input/{account_name}/处理后数据/tree and merge them.
2. Fill the merged JSON into the {topic_point_tree} placeholder of topic_summary_prompt.md.
3. Call an LLM to generate the account persona summary and write it to
   input/{account_name}/处理后数据/persona_data/persona_summary.json.
"""
import asyncio
import json
import logging
import re
import sys
from pathlib import Path
from typing import Any, Dict
  14. logger = logging.getLogger(__name__)
  15. # 确保可以导入 agent 内的 LLM 调用封装(本文件在 data_process 下,多一层目录)
  16. _project_root = Path(__file__).resolve().parent.parent.parent.parent
  17. if str(_project_root) not in sys.path:
  18. sys.path.insert(0, str(_project_root))
  19. try:
  20. from agent.llm.openrouter import openrouter_llm_call
  21. except ImportError: # pragma: no cover - 仅用于本地缺少依赖时的降级提示
  22. openrouter_llm_call = None # type: ignore[assignment]
  23. # 复用与 search_and_eval 相同的模型,保证行为一致
  24. EVAL_LLM_MODEL = "google/gemini-3.1-pro-preview"
  25. # 脚本与 topic_summary_prompt.md 在 data_process;数据在 overall_derivation/input
  26. BASE_DIR = Path(__file__).resolve().parent
  27. OVERALL_DERIVATION_DIR = BASE_DIR.parent
  28. INPUT_BASE = OVERALL_DERIVATION_DIR / "input"
  29. # 人设树中不送入 LLM 的字段(递归删除)
  30. _TREE_STRIP_KEYS = frozenset(
  31. {
  32. "_post_ids",
  33. "_child_categories_relation",
  34. "_child_categories_relation_detail",
  35. }
  36. )
  37. def _strip_tree_fields(obj: Any) -> Any:
  38. """递归从树结构中移除 _TREE_STRIP_KEYS 中的键。"""
  39. if isinstance(obj, dict):
  40. return {
  41. k: _strip_tree_fields(v)
  42. for k, v in obj.items()
  43. if k not in _TREE_STRIP_KEYS
  44. }
  45. if isinstance(obj, list):
  46. return [_strip_tree_fields(x) for x in obj]
  47. return obj
  48. def _extract_json_object(content: str) -> Dict[str, Any]:
  49. """
  50. 从 LLM 回复中解析第一个 JSON 对象(允许被 ```json ... ``` 包裹)。
  51. 逻辑参考 tools/search_and_eval.py 中的实现。
  52. """
  53. content = content.strip()
  54. # 处理 ```json ... ``` 包裹的情况
  55. import re
  56. m = re.search(r"```(?:json)?\s*([\s\S]*?)\s*```", content)
  57. if m:
  58. content = m.group(1).strip()
  59. # 截取最外层 { ... }
  60. start = content.find("{")
  61. end = content.rfind("}")
  62. if start != -1 and end != -1:
  63. content = content[start : end + 1]
  64. return json.loads(content)
  65. def _load_topic_point_tree(account_name: str) -> Dict[str, Any]:
  66. """
  67. 读取 input/{account_name}/处理后数据/tree 目录下的所有 JSON 文件,并合并成一个字典:
  68. {
  69. "<文件名去掉后缀>": <该文件对应的树 JSON>,
  70. ...
  71. }
  72. 每棵树加载后会去掉 _post_ids、_child_categories_relation、_child_categories_relation_detail。
  73. """
  74. tree_dir = INPUT_BASE / account_name / "处理后数据" / "tree"
  75. if not tree_dir.is_dir():
  76. raise FileNotFoundError(f"人设树目录不存在: {tree_dir}")
  77. merged: Dict[str, Any] = {}
  78. files = sorted(tree_dir.glob("*.json"))
  79. if not files:
  80. raise FileNotFoundError(f"人设树目录中未找到任何 JSON 文件: {tree_dir}")
  81. for path in files:
  82. with open(path, "r", encoding="utf-8") as f:
  83. try:
  84. data = json.load(f)
  85. except json.JSONDecodeError as e:
  86. raise ValueError(f"解析 JSON 文件失败: {path}") from e
  87. merged[path.stem] = _strip_tree_fields(data)
  88. logger.info("已加载人设树文件: %s", path.name)
  89. return merged
  90. def _load_prompt_template() -> str:
  91. """读取 topic_summary_prompt.md 模板。"""
  92. prompt_path = BASE_DIR / "topic_summary_prompt.md"
  93. if not prompt_path.is_file():
  94. raise FileNotFoundError(f"找不到 prompt 模板文件: {prompt_path}")
  95. with open(prompt_path, "r", encoding="utf-8") as f:
  96. return f.read()
  97. async def generate_topic_summary(account_name: str) -> Dict[str, Any]:
  98. """
  99. 生成账号人设总结,并返回解析后的 JSON 结果。
  100. 同时将结果写入 persona_summary.json 文件。
  101. """
  102. if openrouter_llm_call is None:
  103. raise RuntimeError("未找到 openrouter_llm_call,请检查 agent.llm 依赖是否可用。")
  104. # 1. 加载并合并人设树
  105. topic_tree = _load_topic_point_tree(account_name)
  106. topic_tree_str = json.dumps(topic_tree, ensure_ascii=False, indent=2)
  107. logger.info("已合并人设树,共包含 %d 个子树", len(topic_tree))
  108. # 2. 读取并填充 prompt 模板
  109. prompt_template = _load_prompt_template()
  110. system_prompt = prompt_template.replace("{topic_point_tree}", topic_tree_str)
  111. # 3. 调用 LLM 生成总结
  112. messages = [
  113. {"role": "system", "content": system_prompt},
  114. {
  115. "role": "user",
  116. "content": "请根据以上说明,严格按照 JSON 模板输出账号人设总结,仅输出 JSON,不要包含其他解释性文字。",
  117. },
  118. ]
  119. logger.info("开始调用 LLM 生成账号人设总结,account_name=%s", account_name)
  120. llm_result = await openrouter_llm_call(messages, model=EVAL_LLM_MODEL)
  121. content = llm_result.get("content", "") if isinstance(llm_result, dict) else ""
  122. if not content:
  123. raise RuntimeError("LLM 未返回任何内容")
  124. try:
  125. summary_data = _extract_json_object(content)
  126. except Exception as e: # noqa: BLE001
  127. logger.exception("解析 LLM 返回的 JSON 失败")
  128. raise RuntimeError(f"解析 LLM 返回内容失败: {e}") from e
  129. # 4. 写入 persona_summary.json
  130. persona_dir = INPUT_BASE / account_name / "处理后数据" / "persona_data"
  131. persona_dir.mkdir(parents=True, exist_ok=True)
  132. persona_file = persona_dir / "persona_summary.json"
  133. with open(persona_file, "w", encoding="utf-8") as f:
  134. json.dump(summary_data, f, ensure_ascii=False, indent=2)
  135. logger.info("已写入账号人设总结到文件: %s", persona_file)
  136. return summary_data
  137. def main(account_name) -> None:
  138. # parser = argparse.ArgumentParser(description="根据人设树生成账号人设总结")
  139. # parser.add_argument("account_name", help="账号名称(对应 input/{account_name} 目录)")
  140. # args = parser.parse_args(argv)
  141. logging.basicConfig(
  142. level=logging.INFO,
  143. format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
  144. datefmt="%H:%M:%S",
  145. )
  146. logger.info("生成账号人设总结,account_name=%s", account_name)
  147. async def _run() -> None:
  148. summary = await generate_topic_summary(account_name)
  149. print(json.dumps(summary, ensure_ascii=False, indent=2))
  150. asyncio.run(_run())
  151. if __name__ == "__main__":
  152. main(account_name="空间点阵设计研究室")