| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273 |
- """
- Step2: 增量词在人设中的匹配分析
- 基于 Step1 的匹配结果(取 Top1),分析增量词在人设系统中的匹配情况
- """
- import os
- import sys
- import json
- import asyncio
- from pathlib import Path
- from agents import trace
- from agents.tracing.create import custom_span
- from lib.my_trace import set_trace_smith as set_trace
- from lib.match_analyzer import match_batch
- from lib.data_loader import load_persona_data, load_inspiration_data, select_inspiration
- # 模型配置
- MODEL_NAME = "google/gemini-2.5-pro"
def format_persona_system(persona_data: dict) -> str:
    """Render the complete persona system as one reference text block.

    Args:
        persona_data: persona data dict with up to three top-level sections
            (inspiration points, purpose points, key points).

    Returns:
        The formatted persona-system text.
    """
    # The three sections, in fixed display order: (data key, section heading).
    section_specs = (
        ("灵感点列表", "【灵感点】灵感的来源和性质"),
        ("目的点", "【目的点】创作的目的和价值导向"),
        ("关键点列表", "【关键点】内容的核心主体和表达方式"),
    )
    out = ["# 人设系统"]
    for data_key, heading in section_specs:
        perspectives = persona_data.get(data_key, [])
        if not perspectives:
            # Skip sections that are absent or empty.
            continue
        out.append(f"\n## {heading}\n")
        for view in perspectives:
            view_name = view.get("视角名称", "")
            out.append(f"\n### 视角:{view_name}")
            for level1 in view.get("模式列表", []):
                level1_name = level1.get("分类名称", "")
                level1_def = level1.get("核心定义", "")
                out.append(f"\n 【一级】{level1_name}")
                if level1_def:
                    out.append(f" 定义:{level1_def}")
                # Second-level subdivisions under this first-level pattern.
                for level2 in level1.get("二级细分", []):
                    level2_name = level2.get("分类名称", "")
                    level2_def = level2.get("分类定义", "")
                    out.append(f" 【二级】{level2_name}:{level2_def}")
    return "\n".join(out)
def find_step1_file(persona_dir: str, inspiration: str, model_name: str) -> str:
    """Locate the step1 output file for the given inspiration.

    Args:
        persona_dir: persona output directory.
        inspiration: inspiration-point name (used as a subdirectory).
        model_name: model name, e.g. "google/gemini-2.5-pro"; slashes are
            normalized the same way the step1 writer does.

    Returns:
        Path to the matching step1 file. When several files match, the
        lexicographically first is returned.

    Raises:
        SystemExit: when no matching file is found.
    """
    step1_dir = os.path.join(persona_dir, "how", "灵感点", inspiration)
    model_name_short = model_name.replace("google/", "").replace("/", "_")
    step1_file_pattern = f"*_step1_*_{model_name_short}.json"
    # Sort the glob result: Path.glob yields entries in filesystem order,
    # which is unspecified, so [0] would otherwise be nondeterministic
    # when multiple step1 files match the pattern.
    step1_files = sorted(Path(step1_dir).glob(step1_file_pattern))
    if not step1_files:
        print(f"❌ 找不到 step1 输出文件")
        print(f"查找路径: {step1_dir}/{step1_file_pattern}")
        sys.exit(1)
    return str(step1_files[0])
def _build_step2_output(
    step1_top1: dict,
    step1_inspiration: str,
    incremental_words: list,
    persona_system_text: str,
    b_context: str,
    match_results: list,
    current_time: str = None,
    log_url: str = None,
) -> dict:
    """Assemble the step2 output dict.

    Shared by the empty-input early return and the normal path so both
    results always carry exactly the same structure and metadata keys
    (previously the two paths duplicated this dict and disagreed on key
    order).
    """
    return {
        "元数据": {
            "current_time": current_time,
            "log_url": log_url,
            "model": MODEL_NAME,
            "步骤": "Step2: 增量词在人设中的匹配"
        },
        "灵感": step1_inspiration,
        "输入信息": {
            "B": incremental_words,
            "A": persona_system_text,
            "B_Context": b_context,
            "A_Context": ""
        },
        "匹配结果": match_results,
        "step1_结果": step1_top1,
    }


async def process_step2_incremental_match(
    step1_top1: dict,
    persona_data: dict,
    inspiration: str,
    current_time: str = None,
    log_url: str = None
) -> dict:
    """Run the incremental-word match analysis (core business logic).

    Extracts the incremental words from the step1 top1 match result, then
    matches each of them against the full persona system via `match_batch`.

    Args:
        step1_top1: top1 match result from step1.
        persona_data: persona data dict.
        inspiration: inspiration name (kept for interface compatibility; the
            inspiration recorded in the output is read from step1_top1 itself).
        current_time: timestamp recorded in the output metadata.
        log_url: trace URL recorded in the output metadata.

    Returns:
        The step2 result dict (metadata, inputs, match results, step1 echo).
    """
    # Pull the pieces we need out of the step1 result.
    business_info = step1_top1.get("业务信息", {})
    match_result = step1_top1.get("匹配结果", {})
    step1_inspiration = business_info.get("灵感", "")
    matched_element = business_info.get("匹配要素", "")
    incremental_words = list(match_result.get("增量部分", {}).keys())

    # Format the persona system once; it is the "A" side of every match.
    persona_system_text = format_persona_system(persona_data)

    # Build the supplementary context (B_Context) once, shared by both paths.
    b_context = f"""这些增量词来自灵感「{step1_inspiration}」,
在 step1 匹配中,与人设要素「{matched_element}」匹配时产生的增量部分。"""

    if not incremental_words:
        # Nothing to analyze: return a well-formed result with empty matches.
        print("⚠️ Top1 结果没有增量词,跳过分析")
        return _build_step2_output(
            step1_top1, step1_inspiration, [], persona_system_text,
            b_context, [], current_time, log_url,
        )

    print(f"\n开始增量词匹配分析: {step1_inspiration}")
    print(f"匹配要素: {matched_element}")
    print(f"增量词数量: {len(incremental_words)}, 模型: {MODEL_NAME}\n")

    # Mark the whole flow with a custom span for tracing.
    with custom_span(
        name=f"Step2: 增量词匹配 - {step1_inspiration}",
        data={
            "灵感": step1_inspiration,
            "匹配要素": matched_element,
            "增量词数量": len(incremental_words),
            "模型": MODEL_NAME,
            "步骤": "增量词在人设中的匹配分析"
        }
    ):
        # Delegate the actual matching to the shared batch matcher.
        match_results = await match_batch(
            b_items=incremental_words,
            a_content=persona_system_text,
            model_name=MODEL_NAME,
            b_context=b_context
        )

        # Sort matches by score, highest first.
        if isinstance(match_results, list):
            match_results.sort(key=lambda x: x.get('score', 0), reverse=True)

    return _build_step2_output(
        step1_top1, step1_inspiration, incremental_words, persona_system_text,
        b_context, match_results, current_time, log_url,
    )
async def main(current_time: str, log_url: str):
    """Script entry point: load inputs, run the step2 match, save the output.

    Args:
        current_time: run timestamp from the tracer (stored in the output metadata).
        log_url: trace URL (stored in the output metadata and printed at the end).
    """
    # Parse command-line arguments: argv[1] = persona dir, argv[2] = inspiration selector.
    persona_dir = sys.argv[1] if len(sys.argv) > 1 else "data/阿里多多酱/out/人设_1110"
    inspiration_arg = sys.argv[2] if len(sys.argv) > 2 else "0"
    print(f"{'=' * 80}")
    print(f"Step2: 增量词在人设中的匹配分析(Top1)")
    print(f"{'=' * 80}")
    print(f"人设目录: {persona_dir}")
    print(f"灵感参数: {inspiration_arg}")
    # Load persona and inspiration data, then resolve which inspiration to use.
    persona_data = load_persona_data(persona_dir)
    inspiration_data = load_inspiration_data(persona_dir)
    inspiration_list = [item["灵感点"] for item in inspiration_data]
    test_inspiration = select_inspiration(inspiration_arg, inspiration_list)
    # Find and load the step1 result file (exits if none is found).
    step1_file = find_step1_file(persona_dir, test_inspiration, MODEL_NAME)
    step1_filename = os.path.basename(step1_file)
    step1_basename = os.path.splitext(step1_filename)[0]
    print(f"Step1 输入文件: {step1_file}")
    with open(step1_file, 'r', encoding='utf-8') as f:
        step1_data = json.load(f)
    actual_inspiration = step1_data.get("灵感", "")
    step1_results = step1_data.get("匹配结果列表", [])
    if not step1_results:
        print("❌ step1 结果为空")
        sys.exit(1)
    print(f"灵感: {actual_inspiration}")
    # Process top1 by default; selecting other ranks could be supported later.
    result_index = 0  # use the 1st match result (top1)
    selected_result = step1_results[result_index]
    print(f"处理第 {result_index + 1} 个匹配结果(Top{result_index + 1})\n")
    # Run the core business logic.
    output = await process_step2_incremental_match(
        step1_top1=selected_result,
        persona_data=persona_data,
        inspiration=actual_inspiration,
        current_time=current_time,
        log_url=log_url
    )
    # Record which step1 match was used (1-based) in the metadata.
    output["元数据"]["step1_匹配索引"] = result_index + 1
    # Save the result next to the step1 input file.
    output_dir = os.path.join(persona_dir, "how", "灵感点", test_inspiration)
    model_name_short = MODEL_NAME.replace("google/", "").replace("/", "_")
    # Extract the step1 scope prefix ("all", "top10", ...) so output filenames stay aligned.
    scope_prefix = step1_basename.split("_")[0]
    output_filename = f"{scope_prefix}_step2_top{result_index + 1}_增量词匹配_{model_name_short}.json"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, output_filename)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)
    print(f"\n完成!结果已保存到: {output_file}")
    if log_url:
        print(f"Trace: {log_url}\n")
- if __name__ == "__main__":
- # 设置 trace
- current_time, log_url = set_trace()
- # 使用 trace 上下文包裹整个执行流程
- with trace("Step2: 增量词匹配分析"):
- asyncio.run(main(current_time, log_url))
|