run_inspiration_analysis.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532
  1. """
  2. 主流程脚本:串联 Step1、搜索和 Step2
  3. 执行完整的灵感分析流程:
  4. 1. Step1: 灵感与人设匹配(调用 step1 main,自动保存结果)
  5. 2. Step1.5: 基于 Top1 匹配要素进行小红书搜索(使用 search_xiaohongshu)
  6. 3. Step2: 增量词在人设中的匹配(调用 step2 main,自动保存结果)
  7. 4. 生成流程汇总文件
  8. """
  9. import os
  10. import sys
  11. import json
  12. import asyncio
  13. import random
  14. import argparse
  15. from agents import trace
  16. from lib.my_trace import set_trace_smith as set_trace
  17. from lib.data_loader import load_inspiration_list, select_inspiration
  18. from lib.utils import read_json
  19. # 导入 step1 和 step2 的 main 函数
  20. import step1_inspiration_match
  21. import step2_incremental_match
  22. # 导入搜索功能
  23. from script.search import search_xiaohongshu
  24. def find_step1_output(persona_dir: str, inspiration: str, max_tasks: int = None) -> str:
  25. """查找 step1 输出文件
  26. Args:
  27. persona_dir: 人设目录
  28. inspiration: 灵感点名称
  29. max_tasks: 任务数限制
  30. Returns:
  31. step1 文件路径
  32. """
  33. from pathlib import Path
  34. step1_dir = os.path.join(persona_dir, "how", "灵感点", inspiration)
  35. scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
  36. step1_pattern = f"{scope_prefix}_step1_*.json"
  37. step1_files = list(Path(step1_dir).glob(step1_pattern))
  38. if not step1_files:
  39. raise FileNotFoundError(f"找不到 step1 输出文件: {step1_dir}/{step1_pattern}")
  40. return str(step1_files[0])
  41. def find_step2_output(persona_dir: str, inspiration: str, max_tasks: int = None) -> str:
  42. """查找 step2 输出文件
  43. Args:
  44. persona_dir: 人设目录
  45. inspiration: 灵感点名称
  46. max_tasks: 任务数限制
  47. Returns:
  48. step2 文件路径
  49. """
  50. from pathlib import Path
  51. step2_dir = os.path.join(persona_dir, "how", "灵感点", inspiration)
  52. scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
  53. step2_pattern = f"{scope_prefix}_step2_*.json"
  54. step2_files = list(Path(step2_dir).glob(step2_pattern))
  55. if not step2_files:
  56. raise FileNotFoundError(f"找不到 step2 输出文件: {step2_dir}/{step2_pattern}")
  57. return str(step2_files[0])
  58. def get_inspiration_score(persona_dir: str, inspiration: str, max_tasks: int = None) -> float:
  59. """获取灵感的 Step1 Top1 分数
  60. Args:
  61. persona_dir: 人设目录
  62. inspiration: 灵感点名称
  63. max_tasks: 任务数限制
  64. Returns:
  65. Step1 Top1 的 score,如果文件不存在返回 -1
  66. """
  67. try:
  68. step1_file = find_step1_output(persona_dir, inspiration, max_tasks)
  69. step1_data = read_json(step1_file)
  70. results = step1_data.get("匹配结果列表", [])
  71. if results:
  72. return results[0].get('匹配结果', {}).get('score', 0)
  73. return 0
  74. except (FileNotFoundError, Exception):
  75. return -1
  76. def sort_inspirations_by_score(
  77. persona_dir: str,
  78. inspiration_list: list,
  79. max_tasks: int = None
  80. ) -> list:
  81. """根据 Step1 结果分数对灵感列表排序
  82. Args:
  83. persona_dir: 人设目录
  84. inspiration_list: 灵感列表
  85. max_tasks: 任务数限制
  86. Returns:
  87. 排序后的灵感列表(按分数降序)
  88. """
  89. print(f"\n{'─' * 80}")
  90. print(f"正在读取现有 Step1 结果文件...")
  91. print(f"{'─' * 80}")
  92. inspiration_scores = []
  93. for inspiration in inspiration_list:
  94. score = get_inspiration_score(persona_dir, inspiration, max_tasks)
  95. inspiration_scores.append({
  96. "inspiration": inspiration,
  97. "score": score,
  98. "has_result": score >= 0
  99. })
  100. # 统计
  101. has_result_count = sum(1 for item in inspiration_scores if item["has_result"])
  102. print(f"找到 {has_result_count}/{len(inspiration_list)} 个灵感的 Step1 结果")
  103. # 排序:有结果的按分数降序,无结果的放最后(保持原顺序)
  104. sorted_items = sorted(
  105. inspiration_scores,
  106. key=lambda x: (x["has_result"], x["score"]),
  107. reverse=True
  108. )
  109. # 显示排序结果(前10个)
  110. print(f"\n排序后的灵感列表(前10个):")
  111. for i, item in enumerate(sorted_items[:10], 1):
  112. status = f"score={item['score']:.2f}" if item['has_result'] else "无结果"
  113. print(f" {i}. [{status}] {item['inspiration']}")
  114. if len(sorted_items) > 10:
  115. print(f" ... 还有 {len(sorted_items) - 10} 个")
  116. return [item["inspiration"] for item in sorted_items]
  117. async def run_full_analysis(
  118. persona_dir: str,
  119. inspiration: str,
  120. max_tasks: int = None,
  121. force: bool = False,
  122. current_time: str = None,
  123. log_url: str = None,
  124. enable_step2: bool = False
  125. ) -> dict:
  126. """执行完整的灵感分析流程(Step1 + 搜索 + Step2)
  127. Args:
  128. persona_dir: 人设目录路径
  129. inspiration: 灵感点文本
  130. max_tasks: step1 最大任务数(None 表示不限制)
  131. force: 是否强制重新执行(跳过文件存在检查)
  132. current_time: 当前时间戳
  133. log_url: 日志链接
  134. enable_step2: 是否执行 Step2(默认 False)
  135. Returns:
  136. 包含文件路径和状态的字典
  137. """
  138. print(f"\n{'=' * 80}")
  139. print(f"开始完整分析流程: {inspiration}")
  140. print(f"{'=' * 80}\n")
  141. # ========== Step1: 灵感与人设匹配 ==========
  142. print(f"{'─' * 80}")
  143. print(f"Step1: 灵感与人设匹配")
  144. print(f"{'─' * 80}\n")
  145. # 临时修改 sys.argv 来传递参数给 step1
  146. original_argv = sys.argv.copy()
  147. sys.argv = [
  148. "step1_inspiration_match.py",
  149. persona_dir,
  150. inspiration,
  151. str(max_tasks) if max_tasks is not None else "all"
  152. ]
  153. try:
  154. # 调用 step1 的 main 函数(通过参数传递 force)
  155. await step1_inspiration_match.main(current_time, log_url, force=force)
  156. finally:
  157. # 恢复原始参数
  158. sys.argv = original_argv
  159. # 查找 step1 输出文件
  160. step1_file = find_step1_output(persona_dir, inspiration, max_tasks)
  161. print(f"✓ Step1 完成,结果文件: {step1_file}\n")
  162. # 读取 step1 结果
  163. step1_data = read_json(step1_file)
  164. step1_results = step1_data.get("匹配结果列表", [])
  165. if not step1_results:
  166. print("⚠️ Step1 结果为空,终止流程")
  167. return {
  168. "step1_file": step1_file,
  169. "step2_file": None,
  170. "summary_file": None,
  171. "status": "step1_empty"
  172. }
  173. step1_top1 = step1_results[0]
  174. step1_score = step1_top1.get('匹配结果', {}).get('score', 0)
  175. step1_element = step1_top1.get("业务信息", {}).get("匹配要素", "")
  176. print(f"Top1 匹配要素: {step1_element}, score: {step1_score:.2f}")
  177. # ========== Step1.5: 小红书搜索 ==========
  178. print(f"\n{'─' * 80}")
  179. print(f"Step1.5: 基于 Top1 匹配要素进行小红书搜索")
  180. print(f"{'─' * 80}\n")
  181. search_keyword = step1_element
  182. print(f"搜索关键词: {search_keyword}")
  183. # 执行搜索
  184. try:
  185. search_result = search_xiaohongshu(search_keyword)
  186. search_notes_count = len(search_result.get('notes', []))
  187. print(f"✓ 搜索完成,找到 {search_notes_count} 条笔记")
  188. # 保存搜索结果
  189. search_dir = os.path.join(persona_dir, "how", "灵感点", inspiration, "search")
  190. os.makedirs(search_dir, exist_ok=True)
  191. scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
  192. search_filename = f"{scope_prefix}_search_{search_keyword[:20]}.json" # 截取关键词前20字符避免文件名过长
  193. search_file = os.path.join(search_dir, search_filename)
  194. with open(search_file, 'w', encoding='utf-8') as f:
  195. json.dump(search_result, f, ensure_ascii=False, indent=2)
  196. print(f"✓ 搜索结果已保存: {search_file}\n")
  197. except Exception as e:
  198. print(f"⚠️ 搜索失败: {e}")
  199. search_file = None
  200. search_notes_count = 0
  201. # ========== Step2: 增量词匹配 ==========
  202. step2_file = None
  203. step2_score = None
  204. step2_word_count = None
  205. if enable_step2:
  206. print(f"\n{'─' * 80}")
  207. print(f"Step2: 增量词在人设中的匹配")
  208. print(f"{'─' * 80}\n")
  209. # 临时修改 sys.argv 来传递参数给 step2
  210. sys.argv = [
  211. "step2_incremental_match.py",
  212. persona_dir,
  213. inspiration
  214. ]
  215. try:
  216. # 调用 step2 的 main 函数(通过参数传递 force)
  217. await step2_incremental_match.main(current_time, log_url, force=force)
  218. finally:
  219. # 恢复原始参数
  220. sys.argv = original_argv
  221. # 查找 step2 输出文件
  222. step2_file = find_step2_output(persona_dir, inspiration, max_tasks)
  223. print(f"✓ Step2 完成,结果文件: {step2_file}\n")
  224. # 读取 step2 结果
  225. step2_data = read_json(step2_file)
  226. step2_score = step2_data.get("匹配结果", {}).get("score", 0)
  227. step2_b_content = step2_data.get("输入信息", {}).get("B", "")
  228. step2_word_count = len(step2_b_content.split("\n")) if step2_b_content else 0
  229. else:
  230. print(f"\n{'─' * 80}")
  231. print(f"Step2: 已跳过(使用 --enable-step2 启用)")
  232. print(f"{'─' * 80}\n")
  233. # ========== 保存流程汇总 ==========
  234. output_dir = os.path.join(persona_dir, "how", "灵感点", inspiration)
  235. scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
  236. # 从 step1 文件名提取模型名称
  237. step1_filename = os.path.basename(step1_file)
  238. model_short = step1_filename.split("_")[-1].replace(".json", "")
  239. summary_filename = f"{scope_prefix}_summary_完整流程_{model_short}.json"
  240. summary_file = os.path.join(output_dir, summary_filename)
  241. # 构建流程描述
  242. workflow = "Step1 + 搜索"
  243. if enable_step2:
  244. workflow += " + Step2"
  245. summary = {
  246. "元数据": {
  247. "current_time": current_time,
  248. "log_url": log_url,
  249. "流程": workflow,
  250. "step1_model": step1_data.get("元数据", {}).get("model", ""),
  251. "step2_model": step2_data.get("元数据", {}).get("model", "") if enable_step2 and 'step2_data' in locals() else None
  252. },
  253. "灵感": inspiration,
  254. "文件路径": {
  255. "step1": step1_file,
  256. "search": search_file if 'search_file' in locals() else None,
  257. "step2": step2_file
  258. },
  259. "关键指标": {
  260. "step1_top1_score": step1_score,
  261. "step1_top1_匹配要素": step1_element,
  262. "search_keyword": search_keyword if 'search_keyword' in locals() else None,
  263. "search_notes_count": search_notes_count if 'search_notes_count' in locals() else 0,
  264. "step2_增量词数量": step2_word_count,
  265. "step2_score": step2_score
  266. }
  267. }
  268. with open(summary_file, 'w', encoding='utf-8') as f:
  269. json.dump(summary, f, ensure_ascii=False, indent=2)
  270. print(f"{'=' * 80}")
  271. print(f"完整流程执行完成")
  272. print(f"{'=' * 80}")
  273. print(f"\n结果文件:")
  274. print(f" Step1: {step1_file}")
  275. if 'search_file' in locals() and search_file:
  276. print(f" 搜索: {search_file}")
  277. if enable_step2 and step2_file:
  278. print(f" Step2: {step2_file}")
  279. print(f" 汇总: {summary_file}\n")
  280. return {
  281. "step1_file": step1_file,
  282. "search_file": search_file if 'search_file' in locals() else None,
  283. "step2_file": step2_file,
  284. "summary_file": summary_file,
  285. "status": "success"
  286. }
  287. async def main():
  288. """主函数"""
  289. # 解析命令行参数
  290. parser = argparse.ArgumentParser(
  291. description="灵感分析主流程 (Step1 + 搜索 + Step2)",
  292. formatter_class=argparse.RawDescriptionHelpFormatter,
  293. epilog="""
  294. 使用示例:
  295. # 处理第1个灵感(Step1 + 搜索,默认不执行 Step2)
  296. python run_inspiration_analysis.py --dir data/阿里多多酱/out/人设_1110 --count 1
  297. # 启用 Step2 完整流程(Step1 + 搜索 + Step2)
  298. python run_inspiration_analysis.py --count 1 --enable-step2
  299. # 随机处理5个灵感
  300. python run_inspiration_analysis.py --count 5 --shuffle
  301. # 按 Step1 分数排序,处理前10个高分灵感
  302. python run_inspiration_analysis.py --count 10 --sort-by-score
  303. # 处理所有灵感,强制重新执行
  304. python run_inspiration_analysis.py --count all --force
  305. # 处理前10个灵感,step1只处理前20个任务
  306. python run_inspiration_analysis.py --count 10 --max-tasks 20
  307. """
  308. )
  309. parser.add_argument(
  310. "--dir",
  311. default="data/阿里多多酱/out/人设_1110",
  312. help="人设目录路径 (默认: data/阿里多多酱/out/人设_1110)"
  313. )
  314. parser.add_argument(
  315. "--count",
  316. default="1",
  317. help="处理的灵感数量,可以是数字或 'all' (默认: 1)"
  318. )
  319. parser.add_argument(
  320. "--max-tasks",
  321. type=str,
  322. default="all",
  323. help="Step1 处理的最大任务数,可以是数字或 'all' (默认: all)"
  324. )
  325. parser.add_argument(
  326. "--force",
  327. action="store_true",
  328. help="强制重新执行,覆盖已存在的文件"
  329. )
  330. parser.add_argument(
  331. "--shuffle",
  332. action="store_true",
  333. help="随机选择灵感,而不是按顺序"
  334. )
  335. parser.add_argument(
  336. "--sort-by-score",
  337. action="store_true",
  338. help="根据 Step1 结果分数排序(降序),优先处理高分灵感"
  339. )
  340. parser.add_argument(
  341. "--enable-step2",
  342. action="store_true",
  343. help="启用 Step2 增量词匹配(默认关闭)"
  344. )
  345. args = parser.parse_args()
  346. persona_dir = args.dir
  347. force = args.force
  348. shuffle = args.shuffle
  349. sort_by_score = args.sort_by_score
  350. enable_step2 = args.enable_step2
  351. # 处理 max_tasks
  352. max_tasks = None if args.max_tasks == "all" else int(args.max_tasks)
  353. # 动态流程名称
  354. workflow_name = "Step1 + 搜索"
  355. if enable_step2:
  356. workflow_name += " + Step2"
  357. print(f"{'=' * 80}")
  358. print(f"灵感分析主流程 ({workflow_name})")
  359. print(f"{'=' * 80}")
  360. print(f"人设目录: {persona_dir}")
  361. # 加载灵感列表
  362. inspiration_list = load_inspiration_list(persona_dir)
  363. # 确定要处理的灵感数量
  364. if args.count == "all":
  365. inspiration_count = len(inspiration_list)
  366. print(f"处理灵感: 全部 ({inspiration_count} 个)")
  367. else:
  368. inspiration_count = int(args.count)
  369. print(f"处理灵感: 前 {inspiration_count} 个")
  370. if max_tasks:
  371. print(f"Step1 任务数限制: {max_tasks}")
  372. if force:
  373. print(f"强制模式: 重新执行所有步骤")
  374. if shuffle:
  375. print(f"随机模式: 随机选择灵感")
  376. if sort_by_score:
  377. print(f"分数排序: 根据 Step1 结果按分数降序处理")
  378. if enable_step2:
  379. print(f"Step2: 启用增量词匹配")
  380. else:
  381. print(f"Step2: 已关闭(使用 --enable-step2 启用)")
  382. # 选择要处理的灵感列表
  383. if sort_by_score:
  384. # 根据 Step1 结果分数排序
  385. sorted_list = sort_inspirations_by_score(persona_dir, inspiration_list, max_tasks)
  386. inspirations_to_process = sorted_list[:inspiration_count]
  387. elif shuffle:
  388. # 随机打乱灵感列表后选择
  389. shuffled_list = inspiration_list.copy()
  390. random.shuffle(shuffled_list)
  391. inspirations_to_process = shuffled_list[:inspiration_count]
  392. else:
  393. # 按顺序选择前 N 个
  394. inspirations_to_process = inspiration_list[:inspiration_count]
  395. print(f"\n将处理以下灵感:")
  396. for i, insp in enumerate(inspirations_to_process, 1):
  397. print(f" {i}. {insp}")
  398. # 批量执行流程
  399. results = []
  400. for i, inspiration in enumerate(inspirations_to_process, 1):
  401. print(f"\n{'#' * 80}")
  402. print(f"处理第 {i}/{len(inspirations_to_process)} 个灵感: {inspiration}")
  403. print(f"{'#' * 80}")
  404. # 为每个灵感创建独立的 trace
  405. insp_time, insp_log_url = set_trace()
  406. with trace(f"灵感分析: {inspiration}"):
  407. result = await run_full_analysis(
  408. persona_dir=persona_dir,
  409. inspiration=inspiration,
  410. max_tasks=max_tasks,
  411. force=force,
  412. current_time=insp_time,
  413. log_url=insp_log_url,
  414. enable_step2=enable_step2
  415. )
  416. results.append(result)
  417. if insp_log_url:
  418. print(f"本次 Trace: {insp_log_url}")
  419. # 输出最终汇总
  420. print(f"\n{'=' * 80}")
  421. print(f"批量处理完成")
  422. print(f"{'=' * 80}")
  423. success_count = sum(1 for r in results if r["status"] == "success")
  424. print(f"\n成功: {success_count}/{len(results)}")
  425. for i, (insp, result) in enumerate(zip(inspirations_to_process, results), 1):
  426. status_icon = "✓" if result["status"] == "success" else "✗"
  427. print(f" {status_icon} [{i}] {insp}")
if __name__ == "__main__":
    # The main flow does not set up a trace; each inspiration sets its own.
    asyncio.run(main())