batch_3forms.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
"""
三形式 query 批量搜索 + 多模态评估

针对 evaluation/high_priority_queries_full.json 的前 N 条高优 query,每条 query 用三种形式搜索 + 评估:
  形式 A(原词组合):直接用 item["q"],如 "反推 提示词 教程"
  形式 B(句子填充):gemini flash 把词组改写成自然搜索短句,**禁止注入具体工具/品牌/示例**
  形式 C(同义替换):按 synonym_pools 对 动作/类型/知识词 各取同义词重组

输出(按 query 分文件夹,文件夹名为 4 位补零的绝对下标):
  output_dir/
    q0000/ form_A.json form_B.json form_C.json
    q0001/ ...
    ...
    forms_preview.json  # 三形式 query 预览
    summary.json        # 三形式对比汇总

每个 form_X.json = {query 词} ↔ {帖子源信息 + 评估结果}(一对多)。
搜索 / 评估 / 多模态图片逻辑复用 script/search_eval/search_and_evaluate.py。

用法:
  python batch_3forms.py --count 10 --platforms xhs,gzh,zhihu --max-count 10 \
      --output-dir runs/3forms_001
"""
  20. import argparse
  21. import asyncio
  22. import json
  23. import random
  24. import sys
  25. from pathlib import Path
  26. from typing import Any, Callable, Dict, List, Optional, Tuple
  27. _PROJECT_ROOT = Path(__file__).resolve().parents[4]
  28. if str(_PROJECT_ROOT) not in sys.path:
  29. sys.path.insert(0, str(_PROJECT_ROOT))
  30. from examples.process_pipeline.script.llm_helper import call_llm_with_retry
  31. from examples.process_pipeline.script.search_eval.search_and_evaluate import (
  32. search_all, evaluate_posts, transcribe_video_posts, build_query_overrides,
  33. )
  34. from examples.process_pipeline.script.llm_evaluate_sources import build_eval_llm_call
# Default query source: evaluation/high_priority_queries_full.json next to this script.
_HIGH_PRIORITY = Path(__file__).resolve().parent / "evaluation" / "high_priority_queries_full.json"
# synonym_pools: the primary copy is script/search_eval/evaluation/ (the copy edited in the
# IDE is the one read at runtime);
# if the primary copy is missing, fall back to the historical copy under test_script/evaluation/.
_SYNONYM_POOLS = Path(__file__).resolve().parent / "evaluation" / "synonym_pools.json"
if not _SYNONYM_POOLS.exists():
    _SYNONYM_POOLS = _PROJECT_ROOT / "examples" / "process_pipeline" / "test_script" / "evaluation" / "synonym_pools.json"
  41. # ── 形式 A:原词组合 ─────────────────────────────────────────────────────────────
  42. def form_a(items: List[Dict[str, Any]]) -> List[str]:
  43. return [it["q"] for it in items]
  44. # ── 形式 B:gemini 句子化(禁止注入示例)─────────────────────────────────────────
  45. def _validate_sentences(data: Dict[str, Any], n: int) -> Optional[str]:
  46. qs = data.get("sentences")
  47. if not isinstance(qs, list):
  48. return "sentences 必须是数组"
  49. if len(qs) != n:
  50. return f"sentences 长度应为 {n},得到 {len(qs)}"
  51. if not all(isinstance(x, str) and x.strip() for x in qs):
  52. return "sentences 每项必须是非空字符串"
  53. return None
  54. async def form_b(items: List[Dict[str, Any]], llm_call: Callable, model: str) -> Tuple[List[str], float]:
  55. """把每条词组改写成自然搜索短句(一次批量调用,按序对齐)。"""
  56. words = [it["q"] for it in items]
  57. system = (
  58. "你是中文搜索词改写器。把每个『关键词组』改写成一句自然、口语、适合在内容平台"
  59. "搜索框输入的短句。严格要求:只表达词组本身的意图,"
  60. "**绝不添加任何具体工具名 / 品牌 / 产品 / 模型名 / 风格名 / 数字示例**"
  61. "(如 Midjourney、赛博朋克、SD 等都禁止出现)。只输出 JSON。"
  62. )
  63. user = (
  64. "把下面每个词组改写成一句自然搜索短句,顺序一一对应,输出:\n"
  65. '{"sentences": ["短句1", "短句2", ...]}\n\n'
  66. f"词组列表(共 {len(words)} 个):\n{json.dumps(words, ensure_ascii=False, indent=2)}"
  67. )
  68. data, cost = await call_llm_with_retry(
  69. llm_call=llm_call, messages=[{"role": "system", "content": system},
  70. {"role": "user", "content": user}],
  71. model=model, temperature=0.4, max_tokens=2000,
  72. validate_fn=lambda d: _validate_sentences(d, len(words)), task_name="FormB",
  73. )
  74. if not data:
  75. print(" ⚠️ form B 生成失败,回退用原词组")
  76. return list(words), cost
  77. return [s.strip() for s in data["sentences"]], cost
  78. # ── 形式 C:同义替换重组 ─────────────────────────────────────────────────────────
  79. class SynonymComposer:
  80. def __init__(self, pools: Dict[str, Any], rng: random.Random):
  81. self.action = pools.get("action_leaves", {})
  82. self.types = pools.get("types", {})
  83. self.knowledge = pools.get("knowledge", {})
  84. self.tool_type = pools.get("tool_type", {})
  85. self.rng = rng
  86. def _pick(self, pool: Any, fallback: str) -> str:
  87. pool = [x for x in pool if isinstance(x, str)] if isinstance(pool, list) else []
  88. return self.rng.choice(pool) if pool else fallback
  89. def compose(self, item: Dict[str, Any]) -> str:
  90. """按 synonym_pools._usage:[模态/工具前缀] 动作 类型 知识词。"""
  91. parts: List[str] = []
  92. c = item.get("constraint")
  93. if isinstance(c, dict):
  94. if c.get("kind") == "模态" and c.get("value"):
  95. parts.append(str(c["value"]))
  96. elif c.get("kind") == "工具类型":
  97. parts.append(self._pick(self.tool_type.get(c.get("value")), str(c.get("限定词") or "")))
  98. parts.append(self._pick(self.action.get(item.get("action", "")), item.get("action", "")))
  99. parts.append(self._pick(self.types.get(item.get("type", "")), item.get("type", "")))
  100. gx = self.knowledge.get("工序", {})
  101. parts.append(self._pick(gx.get("单步") if isinstance(gx, dict) else None, "教程"))
  102. return " ".join(p for p in parts if p)
  103. def form_c(items: List[Dict[str, Any]], seed: int) -> List[str]:
  104. pools = json.loads(_SYNONYM_POOLS.read_text(encoding="utf-8"))
  105. composer = SynonymComposer(pools, random.Random(seed))
  106. return [composer.compose(it) for it in items]
  107. # ── 单个 (query, form) 的搜索 + 评估 + 落盘 ──────────────────────────────────────
  108. async def run_one(
  109. qtext: str, form_key: str, original_q: str,
  110. args, eval_llm, eval_model_id, out_file: Path,
  111. query_overrides=None,
  112. ) -> Dict[str, Any]:
  113. platforms = [p.strip() for p in args.platforms.split(",") if p.strip()]
  114. sources = await search_all(platforms, [qtext], args.max_count, args.max_concurrent,
  115. query_overrides=query_overrides)
  116. try:
  117. from examples.process_pipeline.script.extract_sources import _convert_timestamps
  118. _convert_timestamps(sources)
  119. except Exception:
  120. pass
  121. # 视频帖转写:把字幕并入正文再评估(默认开)
  122. if not args.no_transcribe and sources:
  123. n = await transcribe_video_posts(sources, concurrency=args.max_concurrent)
  124. if n:
  125. print(f" 🎙️ 视频转写 {n} 条")
  126. cost = 0.0
  127. if not args.no_eval and sources:
  128. # 评估只看 query 词 + 帖子:把该形式的搜索词 qtext 作为检索锚点
  129. sources, cost = await evaluate_posts(
  130. sources, "", eval_llm, eval_model_id, args.max_concurrent,
  131. include_images=not args.no_images, max_images=args.max_images,
  132. image_mode=args.image_mode, query=qtext,
  133. )
  134. for s in sources:
  135. imgs = s.pop("_image_data_urls", None)
  136. if imgs is not None:
  137. s["images_sent"] = len(imgs)
  138. # discard 交给前端按 schema 字段动态过滤,后端只统计 LLM 评估失败数
  139. failed = sum(1 for s in sources if (s.get("llm_evaluation") or {}).get("_error"))
  140. out_file.parent.mkdir(parents=True, exist_ok=True)
  141. out_file.write_text(json.dumps({
  142. "form": form_key,
  143. "query": qtext, # 该形式实际搜索用的词(也是评估的检索锚点)
  144. "original_q": original_q, # 原词组(形式 A 的基准)
  145. "platforms": platforms,
  146. "total": len(sources), "failed": failed,
  147. "results": sources, # 帖子源信息 + llm_evaluation,一对多
  148. }, ensure_ascii=False, indent=2), encoding="utf-8")
  149. print(f" [{form_key}] {qtext!r} → total={len(sources)} failed={failed} "
  150. f"cost=${cost:.4f}")
  151. return {"form": form_key, "total": len(sources),
  152. "failed": failed, "cost": round(cost, 4)}
  153. async def reeval_existing(args, eval_llm, eval_model_id) -> None:
  154. """只重跑评估、覆盖旧评估,不重新搜索。
  155. 读 output_dir 下已有的 q*/form_*.json,对里面已抓到的 post 重新评估(评估锚点 = 文件里
  156. 记录的该形式 query 词),原地覆盖 llm_evaluation 后写回。适合改了评估 prompt / 模型后复评。
  157. 用 --start / --count 在 q 编号层(自然数序)切片限制范围,与主流程同语义;每个 q 文件夹下
  158. 的所有 form_A/B/C.json 一起复评(三种形式可比性)。
  159. """
  160. import re
  161. output_dir = Path(args.output_dir)
  162. # 按 q 编号自然数排序:避免 "q10" < "q2" 这种字符串误排(与 server.py _qnum 同口径)
  163. def _qnum(p):
  164. m = re.search(r"\d+", p.name)
  165. return (int(m.group()) if m else 0, p.name)
  166. q_dirs = sorted([d for d in output_dir.glob("q*") if d.is_dir()], key=_qnum)
  167. if not q_dirs:
  168. print(f"❌ {output_dir} 下没有 q*/ 子目录,无可复评内容"); return
  169. # --reeval-q 优先于 --start/--count:直接按 q 名过滤(接 "q01" 或 "q01,q05,q12" 多选)
  170. reeval_q = getattr(args, 'reeval_q', None)
  171. if reeval_q:
  172. wanted = {x.strip() for x in reeval_q.split(',') if x.strip()}
  173. sliced = [d for d in q_dirs if d.name in wanted]
  174. if not sliced:
  175. print(f"[X] 指定 q ({reeval_q}) 在 {output_dir} 下不存在"); return
  176. range_label = f"q={','.join(d.name for d in sliced)}"
  177. else:
  178. sliced = q_dirs[args.start : args.start + args.count]
  179. range_label = f"q[{args.start}:{args.start + args.count}]"
  180. files = [f for qd in sliced for f in sorted(qd.glob("form_*.json"))]
  181. if not files:
  182. print(f"❌ {output_dir} 切片 {range_label} 下没有 form_*.json"); return
  183. print(f"♻️ 复评模式:{range_label} → {len(sliced)} 个 query / "
  184. f"{len(files)} 个文件,模型 {eval_model_id}(不重新搜索)")
  185. for f in files:
  186. d = json.loads(f.read_text(encoding="utf-8"))
  187. results = d.get("results", [])
  188. if not results:
  189. print(f" - {f.relative_to(output_dir)}: 空,跳过"); continue
  190. # 清掉旧评估痕迹,重新评
  191. for s in results:
  192. s.pop("llm_evaluation", None)
  193. s.pop("images_sent", None)
  194. s.pop("_image_data_urls", None)
  195. qtext = d.get("query", "") # 该形式实际搜索词 = 评估检索锚点
  196. if not args.no_transcribe and results:
  197. await transcribe_video_posts(results, concurrency=args.max_concurrent)
  198. results, cost = await evaluate_posts(
  199. results, "", eval_llm, eval_model_id, args.max_concurrent,
  200. include_images=not args.no_images, max_images=args.max_images,
  201. image_mode=args.image_mode, query=qtext,
  202. )
  203. for s in results:
  204. imgs = s.pop("_image_data_urls", None)
  205. if imgs is not None:
  206. s["images_sent"] = len(imgs)
  207. failed = sum(1 for s in results if (s.get("llm_evaluation") or {}).get("_error"))
  208. d.update({"results": results, "total": len(results), "failed": failed})
  209. # 旧字段清掉, 避免误读
  210. for k in ("report", "discard", "requirement"):
  211. d.pop(k, None)
  212. f.write_text(json.dumps(d, ensure_ascii=False, indent=2), encoding="utf-8")
  213. print(f" ✓ {f.relative_to(output_dir)}: total={len(results)} failed={failed} "
  214. f"cost=${cost:.4f}")
  215. print("♻️ 复评完成(已覆盖原文件)")
  216. async def append_existing(args, eval_llm, eval_model_id, gen_llm, gen_model_id) -> None:
  217. """往已有 q*/form_*.json 追加新渠道结果,不重搜旧渠道。
  218. 用文件里存的 query 词、只搜 --platforms 指定的新渠道,评估后按 (平台, id) 去重合并进
  219. 原 results,旧渠道结果原样保留。适合先跑了中文渠道、再补 youtube/x 等。
  220. """
  221. from examples.process_pipeline.script.extract_sources import _convert_timestamps
  222. output_dir = Path(args.output_dir)
  223. files = sorted(output_dir.glob("q*/form_*.json"))
  224. if not files:
  225. print(f"❌ {output_dir} 下没有 q*/form_*.json,无可追加目标"); return
  226. new_plats = [p.strip() for p in args.platforms.split(",") if p.strip()]
  227. print(f"➕ 追加模式:{len(files)} 个文件追加渠道 {new_plats}(不重搜旧渠道)")
  228. # 英文平台一次性翻译所有 query
  229. queries = list(dict.fromkeys(json.loads(f.read_text(encoding="utf-8")).get("query", "") for f in files))
  230. overrides = await build_query_overrides(new_plats, queries, gen_llm, gen_model_id)
  231. for f in files:
  232. d = json.loads(f.read_text(encoding="utf-8"))
  233. qtext = d.get("query", "")
  234. existing = d.get("results", [])
  235. existing_keys = {(r.get("platform"), r.get("channel_content_id")) for r in existing}
  236. new_sources = await search_all(new_plats, [qtext], args.max_count, args.max_concurrent,
  237. query_overrides=overrides)
  238. new_sources = [s for s in new_sources
  239. if (s.get("platform"), s.get("channel_content_id")) not in existing_keys]
  240. try:
  241. _convert_timestamps(new_sources)
  242. except Exception:
  243. pass
  244. if not args.no_transcribe and new_sources:
  245. await transcribe_video_posts(new_sources, concurrency=args.max_concurrent)
  246. cost = 0.0
  247. if not args.no_eval and new_sources:
  248. new_sources, cost = await evaluate_posts(
  249. new_sources, "", eval_llm, eval_model_id, args.max_concurrent,
  250. include_images=not args.no_images, max_images=args.max_images,
  251. image_mode=args.image_mode, query=qtext,
  252. )
  253. for s in new_sources:
  254. imgs = s.pop("_image_data_urls", None)
  255. if imgs is not None:
  256. s["images_sent"] = len(imgs)
  257. merged = existing + new_sources
  258. plats_union = list(dict.fromkeys((d.get("platforms") or []) + new_plats))
  259. failed = sum(1 for s in merged if (s.get("llm_evaluation") or {}).get("_error"))
  260. d.update({"platforms": plats_union, "results": merged,
  261. "total": len(merged), "failed": failed})
  262. for k in ("report", "discard"):
  263. d.pop(k, None)
  264. f.write_text(json.dumps(d, ensure_ascii=False, indent=2), encoding="utf-8")
  265. print(f" ✓ {f.relative_to(output_dir)}: +{len(new_sources)} 新帖 → total={len(merged)} "
  266. f"failed={failed} cost=${cost:.4f}")
  267. print("➕ 追加完成(已并入原文件)")
  268. async def run(args) -> None:
  269. eval_llm0, eval_model0 = build_eval_llm_call(args.eval_model)
  270. if args.reeval:
  271. await reeval_existing(args, eval_llm0, eval_model0)
  272. return
  273. if args.append:
  274. gen_llm0, gen_model0 = build_eval_llm_call(args.gen_model)
  275. await append_existing(args, eval_llm0, eval_model0, gen_llm0, gen_model0)
  276. return
  277. queries_file = Path(args.queries_file) if getattr(args, "queries_file", None) else _HIGH_PRIORITY
  278. all_items = json.loads(queries_file.read_text(encoding="utf-8"))["queries"]
  279. print(f"📂 query 源: {queries_file.name} ({len(all_items)} 条)")
  280. only_q = getattr(args, "only_q", None)
  281. if only_q:
  282. # 支持 "1,5,51" 或 "q01,q05,q51";优先级高于 --start/--count
  283. import re as _re
  284. raw = [t.strip() for t in only_q.split(",") if t.strip()]
  285. idxs = []
  286. for t in raw:
  287. m = _re.match(r"q?(\d+)$", t)
  288. if not m:
  289. print(f"⚠️ 忽略无法解析的 q: {t!r}"); continue
  290. i = int(m.group(1))
  291. if 0 <= i < len(all_items):
  292. idxs.append(i)
  293. else:
  294. print(f"⚠️ idx {i} 超出范围 [0,{len(all_items)}),忽略")
  295. idxs = sorted(dict.fromkeys(idxs)) # 去重保序
  296. if not idxs:
  297. print("❌ --only-q 没有合法索引可用"); return
  298. items = [all_items[i] for i in idxs]
  299. print(f"📋 取 high_priority 指定 {len(idxs)} 条 query (idx={','.join(map(str, idxs))})"
  300. f" | 渠道 {args.platforms} | 每渠道≤{args.max_count} 帖")
  301. else:
  302. start = args.start
  303. items = all_items[start:start + args.count]
  304. idxs = list(range(start, start + len(items))) # 绝对下标,用于文件夹命名
  305. print(f"📋 取 high_priority 第 {start}~{start+len(items)-1} 条 query(共 {len(items)} 条)"
  306. f" | 渠道 {args.platforms} | 每渠道≤{args.max_count} 帖")
  307. eval_llm, eval_model_id = build_eval_llm_call(args.eval_model)
  308. gen_llm, gen_model_id = build_eval_llm_call(args.gen_model)
  309. print(f"🧠 评估模型 {args.eval_model}->{eval_model_id} | form B 生成 {args.gen_model}->{gen_model_id}")
  310. output_dir = Path(args.output_dir)
  311. output_dir.mkdir(parents=True, exist_ok=True)
  312. # ── 不覆盖原 index:选中区间里 form_A/B/C.json 都已落地的 q 整体跳过 ─────────
  313. # 设计意图:默认搜+评模式不覆盖原有数据,省钱省时间;要重评请走 --reeval(只覆盖
  314. # llm_evaluation 字段,保留 post / found_by_queries);要补渠道走 --append。
  315. # 部分形式缺失(如只有 form_A.json,B/C 没跑)的 q 保留进队列,下面循环里再按 form 粒度跳。
  316. def _q_fully_done(absi: int) -> bool:
  317. qd = output_dir / f"q{absi:04d}"
  318. return all((qd / f"form_{fk}.json").exists() for fk in ("A", "B", "C"))
  319. pairs = [(it, absi) for it, absi in zip(items, idxs) if not _q_fully_done(absi)]
  320. skipped_full = len(items) - len(pairs)
  321. if skipped_full:
  322. print(f"⏭️ {skipped_full} 个 q 三形式已全在 -> 跳过(不覆盖;如需重评 --reeval)")
  323. if not pairs:
  324. print("✅ 选中区间内所有 q 都已完成,无新搜任务"); return
  325. items = [it for it, _ in pairs]
  326. idxs = [absi for _, absi in pairs]
  327. qa = form_a(items)
  328. qb, b_cost = await form_b(items, gen_llm, gen_model_id)
  329. qc = form_c(items, args.seed)
  330. # forms_preview 用绝对下标做 key,多次区间跑不会互相覆盖
  331. preview_path = output_dir / "forms_preview.json"
  332. preview = {}
  333. if preview_path.exists():
  334. try:
  335. loaded = json.loads(preview_path.read_text(encoding="utf-8"))
  336. if isinstance(loaded, dict):
  337. preview = loaded # 旧版本写成 list,非 dict 一律重置
  338. except Exception:
  339. preview = {}
  340. for j, absi in enumerate(idxs):
  341. preview[str(absi)] = {"idx": absi, "A": qa[j], "B": qb[j], "C": qc[j]}
  342. preview_path.write_text(json.dumps(preview, ensure_ascii=False, indent=2), encoding="utf-8")
  343. print("📝 三形式预览 → forms_preview.json")
  344. for j, absi in enumerate(idxs):
  345. print(f" [{absi}] A={qa[j]!r} B={qb[j]!r} C={qc[j]!r}")
  346. # 英文平台(youtube/x):对全部形式的 query 一次性翻成英文
  347. platforms = [p.strip() for p in args.platforms.split(",") if p.strip()]
  348. all_q = list(dict.fromkeys(qa + qb + qc))
  349. overrides = await build_query_overrides(platforms, all_q, gen_llm, gen_model_id)
  350. summary = []
  351. for j, absi in enumerate(idxs):
  352. qdir = output_dir / f"q{absi:04d}"
  353. print(f"\n▶ q{absi:04d} 原词={qa[j]!r}")
  354. per_form = {}
  355. for fk, qtext in (("A", qa[j]), ("B", qb[j]), ("C", qc[j])):
  356. ff = qdir / f"form_{fk}.json"
  357. # form 粒度的不覆盖:部分形式补漏时只跑缺的那个,已存在的整体保留
  358. if ff.exists():
  359. print(f" [{fk}] {ff.name} 已存在 -> 跳过")
  360. continue
  361. stat = await run_one(qtext, fk, qa[j], args, eval_llm, eval_model_id,
  362. ff, query_overrides=overrides)
  363. per_form[fk] = stat
  364. summary.append({"idx": absi, "q": qa[j], "forms": per_form})
  365. (output_dir / "summary.json").write_text(json.dumps({
  366. "count": len(items), "platforms": args.platforms, "eval_model": eval_model_id,
  367. "form_b_gen_cost": round(b_cost, 4), "per_query": summary,
  368. }, ensure_ascii=False, indent=2), encoding="utf-8")
  369. # 形式聚合对比 (discard 由前端按 schema 字段动态过滤, 后端只看总量和评估失败数)
  370. print(f"\n{'='*60}\n📊 三形式对比 (total / failed 合计)")
  371. for fk in ("A", "B", "C"):
  372. tot = sum(s["forms"][fk]["total"] for s in summary if fk in s["forms"])
  373. fail = sum(s["forms"][fk].get("failed", 0) for s in summary if fk in s["forms"])
  374. print(f" 形式 {fk}: total={tot} failed={fail}")
  375. print(f"→ {output_dir/'summary.json'}")
  376. def main() -> None:
  377. from dotenv import load_dotenv
  378. load_dotenv()
  379. from examples.process_pipeline.script.llm_evaluate_sources import EVAL_MODELS
  380. p = argparse.ArgumentParser(description="三形式 query 批量搜索 + 多模态评估")
  381. p.add_argument("--start", type=int, default=0, help="起始 query 下标(0-based,默认 0)")
  382. p.add_argument("--count", type=int, default=10, help="从 --start 起取几条 query(默认 10)")
  383. p.add_argument("--only-q", default=None,
  384. help="离散指定要跑的 q(如 '51,55,331' 或 'q51,q55,q331'),优先于 --start/--count")
  385. p.add_argument("--queries-file", default=None,
  386. help="自定义 query 源 JSON 路径(结构需含 queries[...]),默认读 evaluation/high_priority_queries_full.json")
  387. p.add_argument("--platforms", default="xhs,gzh,zhihu", help="逗号分隔渠道(默认 xhs,gzh,zhihu)")
  388. p.add_argument("--max-count", type=int, default=10, help="每个 (渠道,query) 取几条帖子(默认 10)")
  389. p.add_argument("--output-dir", required=True, help="输出目录")
  390. p.add_argument("--eval-model", default="gemini-flash-lite", choices=list(EVAL_MODELS),
  391. help="评估模型(默认 gemini-flash-lite,多模态)")
  392. p.add_argument("--gen-model", default="gemini-flash-lite", choices=list(EVAL_MODELS),
  393. help="form B 句子生成模型(默认 gemini-flash-lite)")
  394. p.add_argument("--max-concurrent", type=int, default=3, help="搜索 / 评估并发上限")
  395. p.add_argument("--max-images", type=int, default=4, help="每帖最多发给模型几张配图")
  396. p.add_argument("--image-mode", choices=["url", "base64"], default="url",
  397. help="图片传输:url 直传(快,默认) / base64 下载内嵌(稳)")
  398. p.add_argument("--no-images", action="store_true", help="不发图(纯文本评估)")
  399. p.add_argument("--no-transcribe", action="store_true",
  400. help="不对视频帖转写(默认会转写并把字幕并入正文再评估)")
  401. p.add_argument("--no-eval", action="store_true", help="只搜不评估")
  402. p.add_argument("--reeval", action="store_true",
  403. help="只重跑评估、覆盖 output-dir 下已有 q*/form_*.json(不重新搜索);"
  404. "用 --start / --count 在 q 编号层限范围,或 --reeval-q 直接指定")
  405. p.add_argument("--reeval-q", default=None,
  406. help="仅复评指定的 q(如 'q01' 或 'q01,q05,q12'),优先于 --start/--count")
  407. p.add_argument("--append", action="store_true",
  408. help="往已有 q*/form_*.json 追加 --platforms 指定的新渠道结果(不重搜旧渠道)")
  409. p.add_argument("--seed", type=int, default=42, help="form C 同义替换随机种子")
  410. args = p.parse_args()
  411. asyncio.run(run(args))
  412. if __name__ == "__main__":
  413. main()