# eval_one_sample.py
"""Single-sample evaluation tool.

sample.md = template + the literal rendering of one post; this tool reads that
literal text and sends it to the LLM.

sample.md contains 2 `=== BLOCK ===` blocks (pure rendered output):
    === SYSTEM ===  literal system message
    === USER ===    literal user message (query / rubric_md / rubric_json /
                    post JSON / output requirements already filled in).
                    Multimodal image URLs are embedded in the post JSON's
                    post.images field -- at execute time that JSON is extracted
                    from the USER literal and post.images is attached to the
                    LLM call as an image_url array.

Workflow:
    template + one post from form_*.json -> `render` -> sample.md -> `execute` -> evaluation JSON from the LLM

Usage:
    # 1. Render: take one post from form_*.json + the current template/rubric -> write a new sample.md
    python eval_one_sample.py render \\
        --form runs/q01/form_A.json --case-id xhs_abc123 --out my_sample.md
    # Same, but select the case by index
    python eval_one_sample.py render --form runs/q01/form_A.json --index 0 --out my_sample.md

    # 2. Execute: read the USER literal + take post.images from the embedded JSON as multimodal
    #    attachments -> LLM; the result is saved to <sample>.eval.json (model / cost / timestamp /
    #    evaluation JSON). Nothing is written when the evaluation fails.
    python eval_one_sample.py execute my_sample.md --model qwen
    python eval_one_sample.py execute my_sample.md --model qwen --out my_result.json

    # 3. Dump: print the SYSTEM/USER literals to stdout (no LLM call, no file written)
    python eval_one_sample.py dump my_sample.md
"""
  23. import argparse
  24. import asyncio
  25. import json
  26. import re
  27. import sys
  28. from datetime import datetime
  29. from pathlib import Path
  30. from typing import Any, Dict, List, Optional, Tuple
# Put the project root on sys.path so the absolute `examples.*` imports below resolve
# when this file is run directly as a script.
# parents[4] is the fifth ancestor directory of this file; per the original note,
# search_eval/ sits under process_pipeline/test_script/, hence one extra level of depth.
_PROJECT_ROOT = Path(__file__).resolve().parents[4]
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))
  34. from examples.process_pipeline.script.llm_helper import call_llm_with_retry
  35. from examples.process_pipeline.script.llm_evaluate_sources import (
  36. _format_post_for_eval, _build_eval_messages,
  37. build_eval_llm_call, DEFAULT_EVAL_MODEL, EVAL_MODELS,
  38. )
  39. def _validate_minimal(data: Any) -> Optional[str]:
  40. """eval_one_sample 用的宽松校验:仅要求 LLM 输出是非空 JSON 对象。
  41. 跟 llm_evaluate_sources._validate_eval(写死了英文 schema 字段)不同 ——
  42. execute 是单样本调试工具,目的是看 LLM 在自定义 prompt 下的原始输出,不该耦合
  43. 任何特定字段名(支持中文 schema 的 mod.md 等)。严格 schema 校验留给批量评估管线。
  44. """
  45. if not isinstance(data, dict):
  46. return "输出不是 JSON 对象"
  47. if not data:
  48. return "输出是空对象"
  49. return None
  50. # ── sample.md 解析 ────────────────────────────────────────────────────────────
  51. # 块分隔符兼容两种格式:
  52. # - `=== BLOCK_NAME ===` 单独成行 (旧 sample.md / template.md 格式)
  53. # - `# BLOCK_NAME` markdown H1 标题 (mod.md 风格)
  54. # BLOCK_NAME 限定为大写英文/下划线 token, 行末无其他字符 —— 避免跟"# 中文注释"
  55. # 或"# query: ..."等含冒号/中文的行混淆。
  56. _BLOCK_HEADER_RE = re.compile(r"^(?:===\s+([A-Z_]+)\s+===|#\s+([A-Z_]+))\s*$")
  57. def parse_sample(path: Path) -> Dict[str, str]:
  58. """读 sample.md → 各块 raw 文本 dict。支持两种块分隔符,见 _BLOCK_HEADER_RE。"""
  59. text = path.read_text(encoding="utf-8")
  60. blocks: Dict[str, List[str]] = {}
  61. current: Optional[str] = None
  62. for line in text.splitlines():
  63. m = _BLOCK_HEADER_RE.match(line)
  64. if m:
  65. current = m.group(1) or m.group(2) # group(1)=== style; group(2)=# style
  66. blocks[current] = []
  67. continue
  68. if current:
  69. blocks[current].append(line)
  70. return {k: "\n".join(v).strip("\n") for k, v in blocks.items()}
  71. # markdown 编辑器有时会把 url 自动转成 `[url](url)`(如 mod.md 里 source_url / images);
  72. # 这层包裹要剥掉才能被 LLM 多模态 / requests 当作真 URL 用。
  73. _MD_LINK_RE = re.compile(r"^\s*\[(.+?)\]\(\s*(.+?)\s*\)\s*$")
  74. def _unwrap_md_link(s: str) -> str:
  75. """处理 markdown 链接污染: '[url](url)' → 'url'。原样返回非链接字符串。"""
  76. if not isinstance(s, str):
  77. return s
  78. m = _MD_LINK_RE.match(s)
  79. return m.group(2).strip() if m else s
  80. def _strip_comments(raw: str) -> str:
  81. """去掉以 `#` 开头的注释行(用于 POST/IMAGES 块解析)。"""
  82. return "\n".join(ln for ln in raw.splitlines() if not ln.lstrip().startswith("#")).strip()
  83. def parse_meta(raw: str) -> Dict[str, str]:
  84. """META 块: key: value 一行一条。"""
  85. meta: Dict[str, str] = {}
  86. for line in raw.splitlines():
  87. s = line.strip()
  88. if not s or s.startswith("#"):
  89. continue
  90. k, sep, v = s.partition(":")
  91. if sep:
  92. meta[k.strip()] = v.strip()
  93. return meta
  94. def parse_post(raw: str) -> Dict[str, Any]:
  95. """POST 块: 一段 JSON。"""
  96. text = _strip_comments(raw)
  97. if not text:
  98. raise ValueError("=== POST === 块为空")
  99. return json.loads(text)
  100. # markdown 编辑器(Typora/Obsidian 等)会自动在 `_` `*` `#` `[` `]` `(` `)` 这种字符前
  101. # 加反斜杠转义,防它们触发 markdown 语法(如 `_X_` 渲染斜体)。但 JSON 字符串里 `\X`
  102. # 只能是有限几种合法转义(`\n` `\t` `\"` `\\` 等),其他会让 json.loads 报错。
  103. # 这个正则只剥上面那几个字符前的反斜杠——它们都不可能是 JSON 合法转义目标,剥了零风险。
  104. _MD_AUTOESC_RE = re.compile(r"\\(?=[_*#\[\]()])")
  105. def extract_post_json_from_user(user_text: str) -> Optional[Dict[str, Any]]:
  106. """从 USER 块字面里提取嵌入的帖子 JSON(找『【待评估帖子』标记后的第一个完整 {...})。
  107. 用 brace counter 而不是正则——能正确处理嵌套对象 / 字符串里的 `{` `}`。
  108. json.loads 失败时尝试剥 markdown 编辑器自动转义(`\\_` 等)后 retry——sample.md 的常见坑。
  109. """
  110. marker_idx = user_text.find("【待评估帖子")
  111. if marker_idx < 0:
  112. return None
  113. start = user_text.find("{", marker_idx)
  114. if start < 0:
  115. return None
  116. depth = 0
  117. in_str = False
  118. escape = False
  119. for i in range(start, len(user_text)):
  120. c = user_text[i]
  121. if escape:
  122. escape = False
  123. continue
  124. if c == "\\":
  125. escape = True
  126. continue
  127. if c == '"':
  128. in_str = not in_str
  129. continue
  130. if in_str:
  131. continue
  132. if c == "{":
  133. depth += 1
  134. elif c == "}":
  135. depth -= 1
  136. if depth == 0:
  137. raw = user_text[start : i + 1]
  138. try:
  139. return json.loads(raw)
  140. except json.JSONDecodeError:
  141. # 试剥 markdown 自动转义后 retry
  142. try:
  143. return json.loads(_MD_AUTOESC_RE.sub("", raw))
  144. except json.JSONDecodeError:
  145. return None
  146. return None
# ── render: form_*.json + template → sample.md (two literal blocks: SYSTEM / USER) ──
  148. def _pick_case(form_path: Path, case_id: Optional[str], index: int) -> Tuple[Dict[str, Any], Dict[str, Any], str]:
  149. """从 form_*.json 选一条 source。返回 (form_data, source, case_label)。"""
  150. form_data = json.loads(form_path.read_text(encoding="utf-8"))
  151. results = form_data.get("results", [])
  152. if not results:
  153. sys.exit(f"❌ {form_path} 的 results 为空")
  154. if case_id:
  155. source = next((r for r in results if r.get("case_id") == case_id), None)
  156. if source is None:
  157. sys.exit(f"❌ {form_path} 没找到 case_id={case_id!r};前 5 个: "
  158. f"{[r.get('case_id') for r in results[:5]]}")
  159. return form_data, source, case_id
  160. if index < 0 or index >= len(results):
  161. sys.exit(f"❌ {form_path} 共 {len(results)} 条,--index {index} 越界")
  162. source = results[index]
  163. return form_data, source, source.get("case_id", f"index_{index}")
  164. def write_rendered_sample(out_path: Path, src_label: str, query: str,
  165. system: str, user_text: str) -> None:
  166. """把 SYSTEM / USER 两块写到 sample.md,顶部加追溯注释。
  167. 不再单独写 IMAGES 块——多模态图片 URL 已在 USER 字面里(post.images 数组),
  168. execute 时直接从那里提取,避免一份数据存两处。
  169. """
  170. header = (
  171. f"# 已渲染的评估 prompt sample(template + 一条帖子 → 字面产物)\n"
  172. f"# 源: {src_label}\n"
  173. f"# query: {query}\n"
  174. f"# 工具: python eval_one_sample.py execute {out_path.name}\n"
  175. f"#\n"
  176. f"# 多模态图片 URL 内嵌在 USER 块的帖子 JSON 的 post.images 字段;execute 自动提取。\n"
  177. f"# 改 template / rubric / 换帖子后,重新跑 render 生成新 sample。\n"
  178. )
  179. parts = [
  180. header,
  181. "=== SYSTEM ===",
  182. system,
  183. "",
  184. "=== USER ===",
  185. user_text,
  186. "",
  187. ]
  188. out_path.write_text("\n".join(parts), encoding="utf-8")
  189. def cmd_render(args: argparse.Namespace) -> None:
  190. form_data, source, case_label = _pick_case(args.form, args.case_id, args.index)
  191. query = args.query or form_data.get("query", "")
  192. requirement = args.requirement or ""
  193. post_block = _format_post_for_eval(source)
  194. # 不传 image_urls —— USER 文本里不拼 IMAGE_HINT;execute 自己从 post.images 取图时再追加 hint
  195. # rubric 已固化进 eval_prompt_template.md, 不再 load 外部 rubric 文件
  196. messages = _build_eval_messages(
  197. requirement=requirement, post_block=post_block,
  198. image_urls=None, query=query,
  199. )
  200. system = messages[0]["content"]
  201. user_text = messages[1]["content"]
  202. if isinstance(user_text, list):
  203. user_text = next(b["text"] for b in user_text if b["type"] == "text")
  204. n_images = len((source.get("post") or {}).get("images") or [])
  205. src_label = f"{args.form.name}#{case_label}"
  206. write_rendered_sample(args.out, src_label, query, system, user_text)
  207. print(f"✓ render → {args.out}")
  208. print(f" 源: {src_label}")
  209. print(f" query: {query!r}")
  210. print(f" system {len(system)} chars / user {len(user_text)} chars / "
  211. f"images {n_images} 张(内嵌在 post.images,execute 时自动取)")
# ── execute: read the SYSTEM/USER blocks, assemble messages → LLM ─────────────
  213. def build_messages_from_blocks(blocks: Dict[str, str], include_images: bool,
  214. max_images: int = 10
  215. ) -> Tuple[List[Dict[str, Any]], int]:
  216. """从 sample.md 拼 messages: SYSTEM/USER 块字面 + 从 USER 内嵌 JSON 取 post.images。
  217. 多模态时:
  218. 1) 从 USER 块字面提取嵌入的帖子 JSON(extract_post_json_from_user);
  219. 2) 取 post.images URL 列表(截到 max_images 防 token 爆);
  220. 3) user content 末尾追加 USER_IMAGE_HINT 提示 LLM 下方有图;
  221. 4) 拼 image_url 数组挂在 user content list 后。
  222. """
  223. from examples.process_pipeline.script.llm_evaluate_sources import load_prompt_template
  224. system = blocks.get("SYSTEM", "").strip()
  225. user_text = blocks.get("USER", "").strip()
  226. if not system or not user_text:
  227. raise ValueError("sample.md 缺 SYSTEM / USER 块——先跑 `render` 生成")
  228. image_urls: List[str] = []
  229. if include_images:
  230. post = extract_post_json_from_user(user_text)
  231. if post:
  232. raw_urls = (post.get("post") or {}).get("images") or []
  233. for u in raw_urls:
  234. if not isinstance(u, str):
  235. continue
  236. u = _unwrap_md_link(u) # mod.md 的 markdown 链接污染清洗
  237. if u.startswith("http"):
  238. image_urls.append(u)
  239. image_urls = image_urls[:max_images]
  240. if image_urls:
  241. hint = load_prompt_template().get("USER_IMAGE_HINT", "")
  242. text = user_text + ("\n\n" + hint if hint else "")
  243. user_content: List[Dict[str, Any]] = [{"type": "text", "text": text}]
  244. for u in image_urls:
  245. user_content.append({"type": "image_url", "image_url": {"url": u}})
  246. msgs = [{"role": "system", "content": system},
  247. {"role": "user", "content": user_content}]
  248. else:
  249. msgs = [{"role": "system", "content": system},
  250. {"role": "user", "content": user_text}]
  251. return msgs, len(image_urls)
  252. async def cmd_execute(args: argparse.Namespace) -> None:
  253. from dotenv import load_dotenv
  254. load_dotenv()
  255. blocks = parse_sample(args.sample)
  256. messages, n_images = build_messages_from_blocks(
  257. blocks, include_images=not args.no_images, max_images=args.max_images,
  258. )
  259. llm_call, model_id = build_eval_llm_call(args.model)
  260. print(f"=== 执行 {args.sample.name} | 模型: {model_id} | 图片: {n_images} 张 ===\n")
  261. data, cost = await call_llm_with_retry(
  262. llm_call=llm_call, messages=messages, model=model_id,
  263. temperature=0.1, max_tokens=2000,
  264. validate_fn=_validate_minimal, task_name=f"OneSample[{args.sample.stem}]",
  265. )
  266. print(f"\n--- 评估结果(cost ${cost:.4f})---")
  267. if data is None:
  268. print("❌ 评估失败(校验未通过或重试耗尽,见上方日志);不写文件")
  269. return
  270. print(json.dumps(data, ensure_ascii=False, indent=2))
  271. # 落盘:默认 <sample>.eval.json (同目录),含元数据 + 评估 JSON; 评估失败不写,避免覆盖好结果
  272. out_path = args.out or (args.sample.parent / f"{args.sample.stem}.eval.json")
  273. payload = {
  274. "sample": args.sample.name,
  275. "model": model_id,
  276. "image_count": n_images,
  277. "cost": round(cost, 4),
  278. "timestamp": datetime.now().isoformat(timespec="seconds"),
  279. "evaluation": data,
  280. }
  281. out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
  282. print(f"\n💾 evaluation 落盘 → {out_path}")
# ── dump: print the SYSTEM/USER literals ──────────────────────────────────────
  284. def cmd_dump(args: argparse.Namespace) -> None:
  285. blocks = parse_sample(args.sample)
  286. system = blocks.get("SYSTEM", "").strip()
  287. user_text = blocks.get("USER", "").strip()
  288. if not system or not user_text:
  289. sys.exit("❌ sample.md 缺 SYSTEM/USER 块——先跑 `render` 生成")
  290. # 从 USER 字面提帖子 JSON, 显示 post.images 数量(execute 会取这些图作多模态)
  291. post = extract_post_json_from_user(user_text)
  292. n_images = len((post.get("post") or {}).get("images") or []) if post else 0
  293. print(f"=== SYSTEM ({len(system)} chars) ===\n")
  294. print(system)
  295. print(f"\n\n=== USER ({len(user_text)} chars) ===\n")
  296. print(user_text)
  297. if n_images:
  298. print(f"\n\n[内嵌帖子 JSON 含 {n_images} 张图;execute 时将作多模态附件取用]")
  299. # ── CLI ───────────────────────────────────────────────────────────────────────
  300. def main() -> None:
  301. sys.stdout.reconfigure(encoding="utf-8")
  302. parser = argparse.ArgumentParser(description="单样本评估:render / execute / dump")
  303. sub = parser.add_subparsers(dest="cmd", required=True)
  304. p_render = sub.add_parser("render", help="从 form_*.json 取一条帖子 → 当前 template/rubric 渲染 → 写出 sample.md")
  305. p_render.add_argument("--form", type=Path, required=True, help="form_*.json 路径(含 results 数组)")
  306. p_render.add_argument("--case-id", default=None, help="按 case_id 选 case(优先于 --index)")
  307. p_render.add_argument("--index", type=int, default=0, help="按下标选 case(默认 0;--case-id 提供时忽略)")
  308. p_render.add_argument("--out", type=Path, required=True, help="输出 sample.md 路径")
  309. p_render.add_argument("--query", default="", help="覆盖 form.query")
  310. p_render.add_argument("--requirement", default="", help="评估时的 requirement(默认空)")
  311. p_exec = sub.add_parser("execute", help="读 sample.md 字面调 LLM 评估")
  312. p_exec.add_argument("sample", type=Path)
  313. p_exec.add_argument("--model", default=DEFAULT_EVAL_MODEL,
  314. help=("shortcut: " + ", ".join(EVAL_MODELS) +
  315. "; 也可直接传 raw 模型 id (如 google/gemini-3.1-flash-lite / openai/gpt-5.4)"))
  316. p_exec.add_argument("--no-images", action="store_true", help="不发图(纯文本评估)")
  317. p_exec.add_argument("--max-images", type=int, default=10,
  318. help="多模态最多发几张图(默认 10;防 token 爆)")
  319. p_exec.add_argument("--out", type=Path, default=None,
  320. help="评估结果 JSON 输出路径(默认 <sample>.eval.json,跟 sample 同目录;评估失败不写)")
  321. p_dump = sub.add_parser("dump", help="打印 SYSTEM/USER 字面(不调 LLM、不写文件)")
  322. p_dump.add_argument("sample", type=Path)
  323. args = parser.parse_args()
  324. if args.cmd == "render":
  325. cmd_render(args)
  326. elif args.cmd == "execute":
  327. asyncio.run(cmd_execute(args))
  328. else: # dump
  329. cmd_dump(args)
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()