run_inspiration_analysis.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601
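"""
Main pipeline script: chains Step1, the search step and (optionally) Step2.

Runs the inspiration-analysis flow:
1. Step1: match the inspiration against the persona (calls the step1 main, which saves its own result)
2. Step1.5: search Xiaohongshu for the Top1 matched element (uses search_xiaohongshu)
3. Step2: match incremental words against the persona (calls the step2 main, which saves its own result);
   only executed when --enable-step2 is given
4. Write a summary file for the run (skipped in --search-only mode)
"""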
  9. import os
  10. import sys
  11. import json
  12. import asyncio
  13. import random
  14. import argparse
  15. from agents import trace
  16. from lib.my_trace import set_trace_smith as set_trace
  17. from lib.data_loader import load_inspiration_list, select_inspiration
  18. from lib.utils import read_json
  19. # 导入 step1 和 step2 的 main 函数
  20. import step1_inspiration_match
  21. import step2_incremental_match
  22. # 导入搜索功能
  23. from script.search import search_xiaohongshu
  24. def find_step1_output(persona_dir: str, inspiration: str, max_tasks: int = None) -> str:
  25. """查找 step1 输出文件
  26. Args:
  27. persona_dir: 人设目录
  28. inspiration: 灵感点名称
  29. max_tasks: 任务数限制
  30. Returns:
  31. step1 文件路径
  32. """
  33. from pathlib import Path
  34. step1_dir = os.path.join(persona_dir, "how", "灵感点", inspiration)
  35. scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
  36. step1_pattern = f"{scope_prefix}_step1_*.json"
  37. step1_files = list(Path(step1_dir).glob(step1_pattern))
  38. if not step1_files:
  39. raise FileNotFoundError(f"找不到 step1 输出文件: {step1_dir}/{step1_pattern}")
  40. return str(step1_files[0])
  41. def find_step2_output(persona_dir: str, inspiration: str, max_tasks: int = None) -> str:
  42. """查找 step2 输出文件
  43. Args:
  44. persona_dir: 人设目录
  45. inspiration: 灵感点名称
  46. max_tasks: 任务数限制
  47. Returns:
  48. step2 文件路径
  49. """
  50. from pathlib import Path
  51. step2_dir = os.path.join(persona_dir, "how", "灵感点", inspiration)
  52. scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
  53. step2_pattern = f"{scope_prefix}_step2_*.json"
  54. step2_files = list(Path(step2_dir).glob(step2_pattern))
  55. if not step2_files:
  56. raise FileNotFoundError(f"找不到 step2 输出文件: {step2_dir}/{step2_pattern}")
  57. return str(step2_files[0])
  58. def get_inspiration_score(persona_dir: str, inspiration: str, max_tasks: int = None) -> float:
  59. """获取灵感的 Step1 Top1 分数
  60. Args:
  61. persona_dir: 人设目录
  62. inspiration: 灵感点名称
  63. max_tasks: 任务数限制
  64. Returns:
  65. Step1 Top1 的 score,如果文件不存在返回 -1
  66. """
  67. try:
  68. step1_file = find_step1_output(persona_dir, inspiration, max_tasks)
  69. step1_data = read_json(step1_file)
  70. results = step1_data.get("匹配结果列表", [])
  71. if results:
  72. return results[0].get('匹配结果', {}).get('score', 0)
  73. return 0
  74. except (FileNotFoundError, Exception):
  75. return -1
  76. def sort_inspirations_by_score(
  77. persona_dir: str,
  78. inspiration_list: list,
  79. max_tasks: int = None
  80. ) -> list:
  81. """根据 Step1 结果分数对灵感列表排序
  82. Args:
  83. persona_dir: 人设目录
  84. inspiration_list: 灵感列表
  85. max_tasks: 任务数限制
  86. Returns:
  87. 排序后的灵感列表(按分数降序)
  88. """
  89. print(f"\n{'─' * 80}")
  90. print(f"正在读取现有 Step1 结果文件...")
  91. print(f"{'─' * 80}")
  92. inspiration_scores = []
  93. for inspiration in inspiration_list:
  94. score = get_inspiration_score(persona_dir, inspiration, max_tasks)
  95. inspiration_scores.append({
  96. "inspiration": inspiration,
  97. "score": score,
  98. "has_result": score >= 0
  99. })
  100. # 统计
  101. has_result_count = sum(1 for item in inspiration_scores if item["has_result"])
  102. print(f"找到 {has_result_count}/{len(inspiration_list)} 个灵感的 Step1 结果")
  103. # 排序:有结果的按分数降序,无结果的放最后(保持原顺序)
  104. sorted_items = sorted(
  105. inspiration_scores,
  106. key=lambda x: (x["has_result"], x["score"]),
  107. reverse=True
  108. )
  109. # 显示排序结果(前10个)
  110. print(f"\n排序后的灵感列表(前10个):")
  111. for i, item in enumerate(sorted_items[:10], 1):
  112. status = f"score={item['score']:.2f}" if item['has_result'] else "无结果"
  113. print(f" {i}. [{status}] {item['inspiration']}")
  114. if len(sorted_items) > 10:
  115. print(f" ... 还有 {len(sorted_items) - 10} 个")
  116. return [item["inspiration"] for item in sorted_items]
  117. async def run_full_analysis(
  118. persona_dir: str,
  119. inspiration: str,
  120. max_tasks: int = None,
  121. force: bool = False,
  122. current_time: str = None,
  123. log_url: str = None,
  124. enable_step2: bool = False,
  125. search_only: bool = False
  126. ) -> dict:
  127. """执行完整的灵感分析流程(Step1 + 搜索 + Step2)
  128. Args:
  129. persona_dir: 人设目录路径
  130. inspiration: 灵感点文本
  131. max_tasks: step1 最大任务数(None 表示不限制)
  132. force: 是否强制重新执行(跳过文件存在检查)
  133. current_time: 当前时间戳
  134. log_url: 日志链接
  135. enable_step2: 是否执行 Step2(默认 False)
  136. search_only: 是否只执行搜索(跳过 Step1 和 Step2,默认 False)
  137. Returns:
  138. 包含文件路径和状态的字典
  139. """
  140. print(f"\n{'=' * 80}")
  141. print(f"开始{'仅搜索' if search_only else '完整分析'}流程: {inspiration}")
  142. print(f"{'=' * 80}\n")
  143. # ========== Step1: 灵感与人设匹配 ==========
  144. if not search_only:
  145. print(f"{'─' * 80}")
  146. print(f"Step1: 灵感与人设匹配")
  147. print(f"{'─' * 80}\n")
  148. # 临时修改 sys.argv 来传递参数给 step1
  149. original_argv = sys.argv.copy()
  150. sys.argv = [
  151. "step1_inspiration_match.py",
  152. persona_dir,
  153. inspiration,
  154. str(max_tasks) if max_tasks is not None else "all"
  155. ]
  156. try:
  157. # 调用 step1 的 main 函数(通过参数传递 force)
  158. await step1_inspiration_match.main(current_time, log_url, force=force)
  159. finally:
  160. # 恢复原始参数
  161. sys.argv = original_argv
  162. # 查找 step1 输出文件
  163. step1_file = find_step1_output(persona_dir, inspiration, max_tasks)
  164. print(f"✓ Step1 完成,结果文件: {step1_file}\n")
  165. else:
  166. print(f"{'─' * 80}")
  167. print(f"Step1: 跳过(仅搜索模式)")
  168. print(f"{'─' * 80}\n")
  169. # 查找已有的 step1 输出文件
  170. try:
  171. step1_file = find_step1_output(persona_dir, inspiration, max_tasks)
  172. print(f"✓ 找到已有 Step1 结果: {step1_file}\n")
  173. except FileNotFoundError as e:
  174. print(f"⚠️ {e}")
  175. return {
  176. "step1_file": None,
  177. "search_file": None,
  178. "step2_file": None,
  179. "summary_file": None,
  180. "status": "step1_not_found"
  181. }
  182. # 读取 step1 结果
  183. step1_data = read_json(step1_file)
  184. step1_results = step1_data.get("匹配结果列表", [])
  185. if not step1_results:
  186. print("⚠️ Step1 结果为空,终止流程")
  187. return {
  188. "step1_file": step1_file,
  189. "step2_file": None,
  190. "summary_file": None,
  191. "status": "step1_empty"
  192. }
  193. step1_top1 = step1_results[0]
  194. step1_score = step1_top1.get('匹配结果', {}).get('score', 0)
  195. step1_element = step1_top1.get("业务信息", {}).get("匹配要素", "")
  196. print(f"Top1 匹配要素: {step1_element}, score: {step1_score:.2f}")
  197. # ========== Step1.5: 小红书搜索 ==========
  198. print(f"\n{'─' * 80}")
  199. print(f"Step1.5: 基于 Top1 匹配要素进行小红书搜索")
  200. print(f"{'─' * 80}\n")
  201. search_keyword = step1_element
  202. print(f"搜索关键词: {search_keyword}")
  203. # 执行搜索
  204. try:
  205. search_result = search_xiaohongshu(search_keyword)
  206. search_notes_count = len(search_result.get('notes', []))
  207. print(f"✓ 搜索完成,找到 {search_notes_count} 条笔记")
  208. # 保存搜索结果
  209. search_dir = os.path.join(persona_dir, "how", "灵感点", inspiration, "search")
  210. os.makedirs(search_dir, exist_ok=True)
  211. scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
  212. # 清理文件名中的非法字符
  213. safe_keyword = search_keyword[:20].replace('/', '_').replace('\\', '_').replace(':', '_')
  214. search_filename = f"{scope_prefix}_search_{safe_keyword}.json"
  215. search_file = os.path.join(search_dir, search_filename)
  216. with open(search_file, 'w', encoding='utf-8') as f:
  217. json.dump(search_result, f, ensure_ascii=False, indent=2)
  218. print(f"✓ 搜索结果已保存: {search_file}\n")
  219. except Exception as e:
  220. print(f"⚠️ 搜索失败: {e}")
  221. search_file = None
  222. search_notes_count = 0
  223. # ========== Step2: 增量词匹配 ==========
  224. step2_file = None
  225. step2_score = None
  226. step2_word_count = None
  227. if enable_step2 and not search_only:
  228. print(f"\n{'─' * 80}")
  229. print(f"Step2: 增量词在人设中的匹配")
  230. print(f"{'─' * 80}\n")
  231. # 临时修改 sys.argv 来传递参数给 step2
  232. sys.argv = [
  233. "step2_incremental_match.py",
  234. persona_dir,
  235. inspiration
  236. ]
  237. try:
  238. # 调用 step2 的 main 函数(通过参数传递 force)
  239. await step2_incremental_match.main(current_time, log_url, force=force)
  240. finally:
  241. # 恢复原始参数
  242. sys.argv = original_argv
  243. # 查找 step2 输出文件
  244. step2_file = find_step2_output(persona_dir, inspiration, max_tasks)
  245. print(f"✓ Step2 完成,结果文件: {step2_file}\n")
  246. # 读取 step2 结果
  247. step2_data = read_json(step2_file)
  248. step2_score = step2_data.get("匹配结果", {}).get("score", 0)
  249. step2_b_content = step2_data.get("输入信息", {}).get("B", "")
  250. step2_word_count = len(step2_b_content.split("\n")) if step2_b_content else 0
  251. elif not search_only:
  252. print(f"\n{'─' * 80}")
  253. print(f"Step2: 已跳过(使用 --enable-step2 启用)")
  254. print(f"{'─' * 80}\n")
  255. # ========== 保存流程汇总 ==========
  256. # search_only 模式不保存汇总文件
  257. if not search_only:
  258. output_dir = os.path.join(persona_dir, "how", "灵感点", inspiration)
  259. scope_prefix = f"top{max_tasks}" if max_tasks is not None else "all"
  260. # 从 step1 文件名提取模型名称
  261. step1_filename = os.path.basename(step1_file)
  262. model_short = step1_filename.split("_")[-1].replace(".json", "")
  263. summary_filename = f"{scope_prefix}_summary_完整流程_{model_short}.json"
  264. summary_file = os.path.join(output_dir, summary_filename)
  265. # 构建流程描述
  266. workflow = "Step1 + 搜索"
  267. if enable_step2:
  268. workflow += " + Step2"
  269. summary = {
  270. "元数据": {
  271. "current_time": current_time,
  272. "log_url": log_url,
  273. "流程": workflow,
  274. "step1_model": step1_data.get("元数据", {}).get("model", ""),
  275. "step2_model": step2_data.get("元数据", {}).get("model", "") if enable_step2 and 'step2_data' in locals() else None
  276. },
  277. "灵感": inspiration,
  278. "文件路径": {
  279. "step1": step1_file,
  280. "search": search_file if 'search_file' in locals() else None,
  281. "step2": step2_file
  282. },
  283. "关键指标": {
  284. "step1_top1_score": step1_score,
  285. "step1_top1_匹配要素": step1_element,
  286. "search_keyword": search_keyword if 'search_keyword' in locals() else None,
  287. "search_notes_count": search_notes_count if 'search_notes_count' in locals() else 0,
  288. "step2_增量词数量": step2_word_count,
  289. "step2_score": step2_score
  290. }
  291. }
  292. with open(summary_file, 'w', encoding='utf-8') as f:
  293. json.dump(summary, f, ensure_ascii=False, indent=2)
  294. else:
  295. summary_file = None
  296. print(f"{'=' * 80}")
  297. print(f"{'仅搜索' if search_only else '完整流程'}执行完成")
  298. print(f"{'=' * 80}")
  299. print(f"\n结果文件:")
  300. if not search_only:
  301. print(f" Step1: {step1_file}")
  302. if 'search_file' in locals() and search_file:
  303. print(f" 搜索: {search_file}")
  304. if enable_step2 and step2_file:
  305. print(f" Step2: {step2_file}")
  306. if summary_file:
  307. print(f" 汇总: {summary_file}")
  308. print()
  309. return {
  310. "step1_file": step1_file if not search_only else None,
  311. "search_file": search_file if 'search_file' in locals() else None,
  312. "step2_file": step2_file,
  313. "summary_file": summary_file,
  314. "status": "success"
  315. }
  316. async def main():
  317. """主函数"""
  318. # 解析命令行参数
  319. parser = argparse.ArgumentParser(
  320. description="灵感分析主流程 (Step1 + 搜索 + Step2)",
  321. formatter_class=argparse.RawDescriptionHelpFormatter,
  322. epilog="""
  323. 使用示例:
  324. # 处理第1个灵感(Step1 + 搜索,默认不执行 Step2)
  325. python run_inspiration_analysis.py --dir data/阿里多多酱/out/人设_1110 --count 1
  326. # 启用 Step2 完整流程(Step1 + 搜索 + Step2)
  327. python run_inspiration_analysis.py --count 1 --enable-step2
  328. # 随机处理5个灵感
  329. python run_inspiration_analysis.py --count 5 --shuffle
  330. # 按 Step1 分数排序,处理前10个高分灵感
  331. python run_inspiration_analysis.py --count 10 --sort-by-score
  332. # 仅搜索模式:基于已有 Step1 结果,按分数降序搜索前10个
  333. python run_inspiration_analysis.py --search-only --count 10
  334. # 处理所有灵感,强制重新执行
  335. python run_inspiration_analysis.py --count all --force
  336. # 处理前10个灵感,step1只处理前20个任务
  337. python run_inspiration_analysis.py --count 10 --max-tasks 20
  338. """
  339. )
  340. parser.add_argument(
  341. "--dir",
  342. default="data/阿里多多酱/out/人设_1110",
  343. help="人设目录路径 (默认: data/阿里多多酱/out/人设_1110)"
  344. )
  345. parser.add_argument(
  346. "--count",
  347. default="1",
  348. help="处理的灵感数量,可以是数字或 'all' (默认: 1)"
  349. )
  350. parser.add_argument(
  351. "--max-tasks",
  352. type=str,
  353. default="all",
  354. help="Step1 处理的最大任务数,可以是数字或 'all' (默认: all)"
  355. )
  356. parser.add_argument(
  357. "--force",
  358. action="store_true",
  359. help="强制重新执行,覆盖已存在的文件"
  360. )
  361. parser.add_argument(
  362. "--shuffle",
  363. action="store_true",
  364. help="随机选择灵感,而不是按顺序"
  365. )
  366. parser.add_argument(
  367. "--sort-by-score",
  368. action="store_true",
  369. help="根据 Step1 结果分数排序(降序),优先处理高分灵感"
  370. )
  371. parser.add_argument(
  372. "--enable-step2",
  373. action="store_true",
  374. help="启用 Step2 增量词匹配(默认关闭)"
  375. )
  376. parser.add_argument(
  377. "--search-only",
  378. action="store_true",
  379. help="仅执行搜索(跳过 Step1 和 Step2,基于已有 Step1 结果,自动按分数降序)"
  380. )
  381. args = parser.parse_args()
  382. persona_dir = args.dir
  383. force = args.force
  384. shuffle = args.shuffle
  385. sort_by_score = args.sort_by_score
  386. enable_step2 = args.enable_step2
  387. search_only = args.search_only
  388. # search_only 模式自动启用分数排序
  389. if search_only:
  390. sort_by_score = True
  391. enable_step2 = False # 搜索模式下强制禁用 step2
  392. if shuffle:
  393. print("⚠️ 警告: --search-only 模式会自动按分数排序,忽略 --shuffle 参数")
  394. shuffle = False
  395. # 处理 max_tasks
  396. max_tasks = None if args.max_tasks == "all" else int(args.max_tasks)
  397. # 动态流程名称
  398. if search_only:
  399. workflow_name = "仅搜索"
  400. else:
  401. workflow_name = "Step1 + 搜索"
  402. if enable_step2:
  403. workflow_name += " + Step2"
  404. print(f"{'=' * 80}")
  405. print(f"灵感分析主流程 ({workflow_name})")
  406. print(f"{'=' * 80}")
  407. print(f"人设目录: {persona_dir}")
  408. # 加载灵感列表
  409. inspiration_list = load_inspiration_list(persona_dir)
  410. # 确定要处理的灵感数量
  411. if args.count == "all":
  412. inspiration_count = len(inspiration_list)
  413. print(f"处理灵感: 全部 ({inspiration_count} 个)")
  414. else:
  415. inspiration_count = int(args.count)
  416. print(f"处理灵感: 前 {inspiration_count} 个")
  417. if max_tasks:
  418. print(f"Step1 任务数限制: {max_tasks}")
  419. if search_only:
  420. print(f"搜索模式: 仅搜索(跳过 Step1 和 Step2)")
  421. print(f"分数排序: 根据已有 Step1 结果按分数降序处理")
  422. else:
  423. if force:
  424. print(f"强制模式: 重新执行所有步骤")
  425. if shuffle:
  426. print(f"随机模式: 随机选择灵感")
  427. if sort_by_score:
  428. print(f"分数排序: 根据 Step1 结果按分数降序处理")
  429. if enable_step2:
  430. print(f"Step2: 启用增量词匹配")
  431. else:
  432. print(f"Step2: 已关闭(使用 --enable-step2 启用)")
  433. # 选择要处理的灵感列表
  434. if sort_by_score:
  435. # 根据 Step1 结果分数排序
  436. sorted_list = sort_inspirations_by_score(persona_dir, inspiration_list, max_tasks)
  437. inspirations_to_process = sorted_list[:inspiration_count]
  438. elif shuffle:
  439. # 随机打乱灵感列表后选择
  440. shuffled_list = inspiration_list.copy()
  441. random.shuffle(shuffled_list)
  442. inspirations_to_process = shuffled_list[:inspiration_count]
  443. else:
  444. # 按顺序选择前 N 个
  445. inspirations_to_process = inspiration_list[:inspiration_count]
  446. print(f"\n将处理以下灵感:")
  447. for i, insp in enumerate(inspirations_to_process, 1):
  448. print(f" {i}. {insp}")
  449. # 批量执行流程
  450. results = []
  451. for i, inspiration in enumerate(inspirations_to_process, 1):
  452. print(f"\n{'#' * 80}")
  453. print(f"处理第 {i}/{len(inspirations_to_process)} 个灵感: {inspiration}")
  454. print(f"{'#' * 80}")
  455. # search_only 模式不创建 trace
  456. if search_only:
  457. result = await run_full_analysis(
  458. persona_dir=persona_dir,
  459. inspiration=inspiration,
  460. max_tasks=max_tasks,
  461. force=force,
  462. current_time=None,
  463. log_url=None,
  464. enable_step2=enable_step2,
  465. search_only=search_only
  466. )
  467. else:
  468. # 为每个灵感创建独立的 trace
  469. insp_time, insp_log_url = set_trace()
  470. with trace(f"灵感分析: {inspiration}"):
  471. result = await run_full_analysis(
  472. persona_dir=persona_dir,
  473. inspiration=inspiration,
  474. max_tasks=max_tasks,
  475. force=force,
  476. current_time=insp_time,
  477. log_url=insp_log_url,
  478. enable_step2=enable_step2,
  479. search_only=search_only
  480. )
  481. if insp_log_url:
  482. print(f"本次 Trace: {insp_log_url}")
  483. results.append(result)
  484. # 输出最终汇总
  485. print(f"\n{'=' * 80}")
  486. print(f"批量处理完成")
  487. print(f"{'=' * 80}")
  488. success_count = sum(1 for r in results if r["status"] == "success")
  489. print(f"\n成功: {success_count}/{len(results)}")
  490. for i, (insp, result) in enumerate(zip(inspirations_to_process, results), 1):
  491. status_icon = "✓" if result["status"] == "success" else "✗"
  492. print(f" {status_icon} [{i}] {insp}")
if __name__ == "__main__":
    # No trace is set up at this level; each inspiration sets up its own
    asyncio.run(main())