step2_incremental_match.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. """
  2. Step2: 增量词在人设中的匹配分析
  3. 基于 Step1 的匹配结果(取 Top1),分析增量词在人设系统中的匹配情况
  4. """
  5. import os
  6. import sys
  7. import json
  8. import asyncio
  9. from pathlib import Path
  10. from agents import trace
  11. from agents.tracing.create import custom_span
  12. from lib.my_trace import set_trace_smith as set_trace
  13. from lib.match_analyzer import match_single
  14. from lib.data_loader import load_persona_data, load_inspiration_list, select_inspiration
# Model configuration: this identifier is passed to match_single as
# model_name and is also used to derive the step1/step2 file names.
MODEL_NAME = "google/gemini-2.5-pro"
  17. def format_persona_system(persona_data: dict) -> str:
  18. """格式化完整人设系统为参考文本
  19. Args:
  20. persona_data: 人设数据
  21. Returns:
  22. 格式化的人设系统文本
  23. """
  24. lines = ["# 人设系统"]
  25. # 处理三个部分:灵感点列表、目的点、关键点列表
  26. for section_key, section_title in [
  27. ("灵感点列表", "【灵感点】灵感的来源和性质"),
  28. ("目的点", "【目的点】创作的目的和价值导向"),
  29. ("关键点列表", "【关键点】内容的核心主体和表达方式")
  30. ]:
  31. section_data = persona_data.get(section_key, [])
  32. if not section_data:
  33. continue
  34. lines.append(f"\n## {section_title}\n")
  35. for perspective in section_data:
  36. perspective_name = perspective.get("视角名称", "")
  37. lines.append(f"\n### 视角:{perspective_name}")
  38. for pattern in perspective.get("模式列表", []):
  39. pattern_name = pattern.get("分类名称", "")
  40. pattern_def = pattern.get("核心定义", "")
  41. lines.append(f"\n 【一级】{pattern_name}")
  42. if pattern_def:
  43. lines.append(f" 定义:{pattern_def}")
  44. # 二级细分
  45. for sub in pattern.get("二级细分", []):
  46. sub_name = sub.get("分类名称", "")
  47. sub_def = sub.get("分类定义", "")
  48. lines.append(f" 【二级】{sub_name}:{sub_def}")
  49. return "\n".join(lines)
  50. def find_step1_file(persona_dir: str, inspiration: str, model_name: str) -> str:
  51. """查找 step1 输出文件
  52. Args:
  53. persona_dir: 人设目录
  54. inspiration: 灵感点名称
  55. model_name: 模型名称
  56. Returns:
  57. step1 文件路径
  58. Raises:
  59. SystemExit: 找不到文件时退出
  60. """
  61. step1_dir = os.path.join(persona_dir, "how", "灵感点", inspiration)
  62. model_name_short = model_name.replace("google/", "").replace("/", "_")
  63. step1_file_pattern = f"*_step1_*_{model_name_short}.json"
  64. step1_files = list(Path(step1_dir).glob(step1_file_pattern))
  65. if not step1_files:
  66. print(f"❌ 找不到 step1 输出文件")
  67. print(f"查找路径: {step1_dir}/{step1_file_pattern}")
  68. sys.exit(1)
  69. return str(step1_files[0])
  70. async def process_step2_incremental_match(
  71. step1_top1: dict,
  72. persona_data: dict,
  73. inspiration: str,
  74. current_time: str = None,
  75. log_url: str = None
  76. ) -> dict:
  77. """执行增量词匹配分析(核心业务逻辑)
  78. Args:
  79. step1_top1: step1 的 top1 匹配结果
  80. persona_data: 人设数据
  81. inspiration: 灵感名称
  82. current_time: 当前时间戳
  83. log_url: trace URL
  84. Returns:
  85. 匹配结果字典
  86. """
  87. # 从 step1 结果中提取信息
  88. business_info = step1_top1.get("业务信息", {})
  89. match_result = step1_top1.get("匹配结果", {})
  90. step1_inspiration = business_info.get("灵感", "")
  91. matched_element = business_info.get("匹配要素", "")
  92. incremental_parts = match_result.get("增量部分", {})
  93. incremental_words = list(incremental_parts.keys())
  94. # 格式化人设系统
  95. persona_system_text = format_persona_system(persona_data)
  96. # 构建补充上下文(B_Context - 统一构造一次)
  97. b_context = f"""这些增量词来自灵感「{step1_inspiration}」,
  98. 在 step1 匹配中,与人设要素「{matched_element}」匹配时产生的增量部分。"""
  99. if not incremental_words:
  100. print("⚠️ Top1 结果没有增量词,跳过分析")
  101. return {
  102. "元数据": {
  103. "current_time": current_time,
  104. "log_url": log_url,
  105. "model": MODEL_NAME,
  106. "步骤": "Step2: 增量词在人设中的匹配"
  107. },
  108. "灵感": step1_inspiration,
  109. "输入信息": {
  110. "B": "", # 空字符串
  111. "A": persona_system_text,
  112. "B_Context": b_context, # 使用统一构造的 context
  113. "A_Context": ""
  114. },
  115. "step1_结果": step1_top1,
  116. "匹配结果": {
  117. "score": 0.0,
  118. "score说明": "无增量词",
  119. "相同部分": {},
  120. "增量部分": {}
  121. }
  122. }
  123. print(f"\n开始增量词匹配分析: {step1_inspiration}")
  124. print(f"匹配要素: {matched_element}")
  125. print(f"增量词数量: {len(incremental_words)}, 模型: {MODEL_NAME}\n")
  126. # 将增量词列表拼接成一个字符串(用换行符分隔)
  127. b_content = "\n".join(incremental_words)
  128. # 使用 custom_span 标识整个流程
  129. with custom_span(
  130. name=f"Step2: 增量词匹配 - {step1_inspiration}",
  131. data={
  132. "灵感": step1_inspiration,
  133. "匹配要素": matched_element,
  134. "增量词数量": len(incremental_words),
  135. "模型": MODEL_NAME,
  136. "步骤": "增量词在人设中的匹配分析"
  137. }
  138. ):
  139. # 调用通用匹配模块(单次调用)
  140. match_result = await match_single(
  141. b_content=b_content,
  142. a_content=persona_system_text,
  143. model_name=MODEL_NAME,
  144. b_context=b_context
  145. )
  146. # 构建输出(使用统一构造的变量)
  147. return {
  148. "元数据": {
  149. "current_time": current_time,
  150. "log_url": log_url,
  151. "model": MODEL_NAME,
  152. "步骤": "Step2: 增量词在人设中的匹配"
  153. },
  154. "灵感": step1_inspiration,
  155. "输入信息": {
  156. "B": b_content, # 拼接后的字符串
  157. "A": persona_system_text,
  158. "B_Context": b_context, # 使用统一构造的 context
  159. "A_Context": ""
  160. },
  161. "匹配结果": match_result, # 单个匹配结果对象
  162. "step1_结果": step1_top1,
  163. }
  164. async def main(current_time: str, log_url: str, force: bool = False):
  165. """主函数
  166. Args:
  167. current_time: 当前时间戳
  168. log_url: 日志链接
  169. force: 是否强制重新执行(跳过已存在文件检查)
  170. """
  171. # 解析命令行参数
  172. persona_dir = sys.argv[1] if len(sys.argv) > 1 else "data/阿里多多酱/out/人设_1110"
  173. inspiration_arg = sys.argv[2] if len(sys.argv) > 2 else "0"
  174. # 第三个参数:force(如果从命令行调用且有该参数,则覆盖函数参数)
  175. if len(sys.argv) > 3 and sys.argv[3] == "force":
  176. force = True
  177. print(f"{'=' * 80}")
  178. print(f"Step2: 增量词在人设中的匹配分析(Top1)")
  179. print(f"{'=' * 80}")
  180. print(f"人设目录: {persona_dir}")
  181. print(f"灵感参数: {inspiration_arg}")
  182. # 加载数据
  183. persona_data = load_persona_data(persona_dir)
  184. inspiration_list = load_inspiration_list(persona_dir)
  185. test_inspiration = select_inspiration(inspiration_arg, inspiration_list)
  186. # 查找并加载 step1 结果
  187. step1_file = find_step1_file(persona_dir, test_inspiration, MODEL_NAME)
  188. step1_filename = os.path.basename(step1_file)
  189. step1_basename = os.path.splitext(step1_filename)[0]
  190. print(f"Step1 输入文件: {step1_file}")
  191. # 构建输出文件路径
  192. output_dir = os.path.join(persona_dir, "how", "灵感点", test_inspiration)
  193. model_name_short = MODEL_NAME.replace("google/", "").replace("/", "_")
  194. scope_prefix = step1_basename.split("_")[0] # 提取 "all" 或 "top10" 等
  195. result_index = 0 # 使用第 1 个匹配结果(top1)
  196. output_filename = f"{scope_prefix}_step2_top{result_index + 1}_增量词匹配_{model_name_short}.json"
  197. output_file = os.path.join(output_dir, output_filename)
  198. # 检查文件是否已存在
  199. if not force and os.path.exists(output_file):
  200. print(f"\n✓ 输出文件已存在,跳过执行: {output_file}")
  201. print(f"提示: 如需重新执行,请添加 'force' 参数\n")
  202. return
  203. with open(step1_file, 'r', encoding='utf-8') as f:
  204. step1_data = json.load(f)
  205. actual_inspiration = step1_data.get("灵感", "")
  206. step1_results = step1_data.get("匹配结果列表", [])
  207. if not step1_results:
  208. print("❌ step1 结果为空")
  209. sys.exit(1)
  210. print(f"灵感: {actual_inspiration}")
  211. # 默认处理 top1
  212. selected_result = step1_results[result_index]
  213. print(f"处理第 {result_index + 1} 个匹配结果(Top{result_index + 1})\n")
  214. # 执行核心业务逻辑
  215. output = await process_step2_incremental_match(
  216. step1_top1=selected_result,
  217. persona_data=persona_data,
  218. inspiration=actual_inspiration,
  219. current_time=current_time,
  220. log_url=log_url
  221. )
  222. # 在元数据中添加 step1 匹配索引
  223. output["元数据"]["step1_匹配索引"] = result_index + 1
  224. # 保存结果
  225. os.makedirs(output_dir, exist_ok=True)
  226. with open(output_file, 'w', encoding='utf-8') as f:
  227. json.dump(output, f, ensure_ascii=False, indent=2)
  228. print(f"\n完成!结果已保存到: {output_file}")
  229. if log_url:
  230. print(f"Trace: {log_url}\n")
if __name__ == "__main__":
    # Initialise tracing; the two returned values are forwarded to main()
    # as current_time and log_url.
    current_time, log_url = set_trace()
    # Run the whole async flow inside a single trace context.
    with trace("Step2: 增量词匹配分析"):
        asyncio.run(main(current_time, log_url))