#!/usr/bin/env python3
"""
Account persona summary:

1. Read the persona-tree JSON files under input/{account_name}/处理后数据/tree
   and merge them.
2. Fill the merged JSON into the {topic_point_tree} placeholder of
   topic_summary_prompt.md.
3. Call the LLM to generate the account persona summary and write it to
   input/{account_name}/处理后数据/persona_data/persona_summary.json.
"""

import asyncio
import json
import logging
import re
import sys
from pathlib import Path
from typing import Any, Dict

logger = logging.getLogger(__name__)

# Make the LLM call wrapper inside `agent` importable (this file lives under
# data_process, i.e. one directory level deeper).
_project_root = Path(__file__).resolve().parent.parent.parent.parent
if str(_project_root) not in sys.path:
    sys.path.insert(0, str(_project_root))

try:
    from agent.llm.openrouter import openrouter_llm_call
except ImportError:  # pragma: no cover - degraded mode when the dependency is missing locally
    openrouter_llm_call = None  # type: ignore[assignment]

# Reuse the same model as search_and_eval to keep behavior consistent.
EVAL_LLM_MODEL = "google/gemini-3.1-pro-preview"

# This script and topic_summary_prompt.md live in data_process; the data lives
# in overall_derivation/input.
BASE_DIR = Path(__file__).resolve().parent
OVERALL_DERIVATION_DIR = BASE_DIR.parent
INPUT_BASE = OVERALL_DERIVATION_DIR / "input"

# Account used when the script is run directly without a CLI argument.
DEFAULT_ACCOUNT_NAME = "空间点阵设计研究室"

# Placeholder inside the prompt template that is replaced by the merged tree.
_TOPIC_TREE_PLACEHOLDER = "{topic_point_tree}"

# Persona-tree fields that are not sent to the LLM (removed recursively).
_TREE_STRIP_KEYS = frozenset(
    {
        "_post_ids",
        "_child_categories_relation",
        "_child_categories_relation_detail",
    }
)

# Matches the body of the first ``` / ```json fenced block in an LLM reply.
_FENCED_BLOCK_RE = re.compile(r"```(?:json)?\s*([\s\S]*?)\s*```")
_JSON_DECODER = json.JSONDecoder()


def _strip_tree_fields(obj: Any) -> Any:
    """Return a copy of *obj* with every key in _TREE_STRIP_KEYS removed.

    Dicts and lists are rebuilt recursively; any other value is returned as-is.
    """
    if isinstance(obj, dict):
        return {
            k: _strip_tree_fields(v)
            for k, v in obj.items()
            if k not in _TREE_STRIP_KEYS
        }
    if isinstance(obj, list):
        return [_strip_tree_fields(x) for x in obj]
    return obj


def _extract_json_object(content: str) -> Dict[str, Any]:
    """Parse the first JSON object found in an LLM reply.

    The reply may wrap the JSON in a ```json ... ``` fence, and may carry
    extra prose before or after the object.

    Args:
        content: Raw text returned by the LLM.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If the reply contains no "{" at all, or (as its subclass
            json.JSONDecodeError) if the text starting at the first "{" is
            not a valid JSON object.
    """
    content = content.strip()

    # Prefer the body of a fenced block when the reply contains one.
    fenced = _FENCED_BLOCK_RE.search(content)
    if fenced:
        content = fenced.group(1).strip()

    start = content.find("{")
    if start == -1:
        # Without this guard a bare list/number/string reply would be returned
        # and violate the declared dict return type.
        raise ValueError("LLM 回复中未找到 JSON 对象")

    # raw_decode stops at the end of the first complete object, so trailing
    # text (even text containing braces) no longer breaks parsing. Decoding
    # is anchored at the first "{" only, so truncated output still fails
    # loudly instead of yielding a nested fragment.
    parsed, _end = _JSON_DECODER.raw_decode(content, start)
    return parsed


def _load_topic_point_tree(account_name: str) -> Dict[str, Any]:
    """Load and merge every persona-tree JSON file of an account.

    Reads all ``*.json`` files under input/{account_name}/处理后数据/tree
    (in sorted order) and merges them into one dict::

        {"<file name without suffix>": <tree JSON of that file>, ...}

    Each tree has the keys listed in _TREE_STRIP_KEYS removed after loading.

    Args:
        account_name: Name of the account directory under the input base.

    Returns:
        Mapping from file stem to the stripped tree.

    Raises:
        FileNotFoundError: If the tree directory is missing or holds no JSON
            file.
        ValueError: If one of the files is not valid JSON.
    """
    tree_dir = INPUT_BASE / account_name / "处理后数据" / "tree"
    if not tree_dir.is_dir():
        raise FileNotFoundError(f"人设树目录不存在: {tree_dir}")

    files = sorted(tree_dir.glob("*.json"))
    if not files:
        raise FileNotFoundError(f"人设树目录中未找到任何 JSON 文件: {tree_dir}")

    merged: Dict[str, Any] = {}
    for path in files:
        with open(path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                raise ValueError(f"解析 JSON 文件失败: {path}") from e
        merged[path.stem] = _strip_tree_fields(data)
        logger.info("已加载人设树文件: %s", path.name)

    return merged


def _load_prompt_template() -> str:
    """Read the topic_summary_prompt.md template located next to this script.

    Raises:
        FileNotFoundError: If the template file does not exist.
    """
    prompt_path = BASE_DIR / "topic_summary_prompt.md"
    if not prompt_path.is_file():
        raise FileNotFoundError(f"找不到 prompt 模板文件: {prompt_path}")
    with open(prompt_path, "r", encoding="utf-8") as f:
        return f.read()


async def generate_topic_summary(account_name: str) -> Dict[str, Any]:
    """Generate the account persona summary and return the parsed JSON.

    The result is also written to
    input/{account_name}/处理后数据/persona_data/persona_summary.json.

    Args:
        account_name: Name of the account directory under the input base.

    Returns:
        The JSON object parsed from the LLM reply.

    Raises:
        RuntimeError: If the LLM wrapper could not be imported, the LLM
            returned no content, or its reply could not be parsed.
        FileNotFoundError: If the tree directory, its JSON files, or the
            prompt template are missing.
        ValueError: If a persona-tree file is not valid JSON.
    """
    if openrouter_llm_call is None:
        raise RuntimeError("未找到 openrouter_llm_call,请检查 agent.llm 依赖是否可用。")

    # 1. Load and merge the persona trees.
    topic_tree = _load_topic_point_tree(account_name)
    topic_tree_str = json.dumps(topic_tree, ensure_ascii=False, indent=2)
    logger.info("已合并人设树,共包含 %d 个子树", len(topic_tree))

    # 2. Read the prompt template and fill in the tree.
    prompt_template = _load_prompt_template()
    if _TOPIC_TREE_PLACEHOLDER not in prompt_template:
        # str.replace would silently do nothing and the LLM would be asked to
        # summarize a tree it never received; make that visible in the logs.
        logger.warning(
            "prompt 模板中未找到占位符 %s,人设树不会被送入 LLM",
            _TOPIC_TREE_PLACEHOLDER,
        )
    system_prompt = prompt_template.replace(_TOPIC_TREE_PLACEHOLDER, topic_tree_str)

    # 3. Call the LLM to generate the summary.
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": "请根据以上说明,严格按照 JSON 模板输出账号人设总结,仅输出 JSON,不要包含其他解释性文字。",
        },
    ]

    logger.info("开始调用 LLM 生成账号人设总结,account_name=%s", account_name)
    llm_result = await openrouter_llm_call(messages, model=EVAL_LLM_MODEL)

    content = llm_result.get("content", "") if isinstance(llm_result, dict) else ""
    if not content:
        raise RuntimeError("LLM 未返回任何内容")

    try:
        summary_data = _extract_json_object(content)
    except Exception as e:  # noqa: BLE001
        logger.exception("解析 LLM 返回的 JSON 失败")
        raise RuntimeError(f"解析 LLM 返回内容失败: {e}") from e

    # 4. Write persona_summary.json.
    persona_dir = INPUT_BASE / account_name / "处理后数据" / "persona_data"
    persona_dir.mkdir(parents=True, exist_ok=True)
    persona_file = persona_dir / "persona_summary.json"

    with open(persona_file, "w", encoding="utf-8") as f:
        json.dump(summary_data, f, ensure_ascii=False, indent=2)

    logger.info("已写入账号人设总结到文件: %s", persona_file)
    return summary_data


def main(account_name: str) -> None:
    """Configure logging, generate the summary for *account_name*, print it.

    Args:
        account_name: Account name (matches the input/{account_name}
            directory).
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    logger.info("生成账号人设总结,account_name=%s", account_name)

    summary = asyncio.run(generate_topic_summary(account_name))
    print(json.dumps(summary, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    # Optional first CLI argument selects the account; with no argument the
    # script keeps using the default account.
    main(account_name=sys.argv[1] if len(sys.argv) > 1 else DEFAULT_ACCOUNT_NAME)