| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393 |
- """单样本评估工具:sample.md = template + 一条帖子的字面产物,工具直接读字面调 LLM。
- sample.md 含 2 个 `=== BLOCK ===` 块(纯产物):
- === SYSTEM === system message 字面
- === USER === user message 字面(含填好的 query / rubric_md / rubric_json /
- 帖子 JSON / 输出要求)。多模态图片 URL 内嵌在帖子 JSON 的 post.images
- 字段——execute 时从 USER 字面提取那段 JSON,把 post.images 作为
- image_url 数组附给 LLM。
- 工作流:
- template + form_*.json 某条帖子 → `render` → 生成 sample.md → `execute` → LLM 输出评估 JSON
- 用法:
- # 1. 编译:从 form_*.json 取一条帖子 + 当前 template/rubric → 写出新 sample.md
- python eval_one_sample.py render \\
- --form runs/q01/form_A.json --case-id xhs_abc123 --out my_sample.md
- # 同上但按下标选 case
- python eval_one_sample.py render --form runs/q01/form_A.json --index 0 --out my_sample.md
- # 2. 执行:读 USER 字面 + 从内嵌 JSON 提 post.images 作多模态附件 → LLM;
- # 结果落盘 <sample>.eval.json (含 model / cost / timestamp / 评估 JSON), 评估失败不写。
- python eval_one_sample.py execute my_sample.md --model qwen
- python eval_one_sample.py execute my_sample.md --model qwen --out my_result.json
- # 3. dump:打印 SYSTEM/USER 字面到 stdout(不调 LLM、不写文件)
- python eval_one_sample.py dump my_sample.md
- """
- import argparse
- import asyncio
- import json
- import re
- import sys
- from datetime import datetime
- from pathlib import Path
- from typing import Any, Dict, List, Optional, Tuple
- _PROJECT_ROOT = Path(__file__).resolve().parents[4] # search_eval/ 在 process_pipeline/test_script/ 下,深度 +1
- if str(_PROJECT_ROOT) not in sys.path:
- sys.path.insert(0, str(_PROJECT_ROOT))
- from examples.process_pipeline.script.llm_helper import call_llm_with_retry
- from examples.process_pipeline.script.llm_evaluate_sources import (
- _format_post_for_eval, _build_eval_messages,
- build_eval_llm_call, DEFAULT_EVAL_MODEL, EVAL_MODELS,
- )
def _validate_minimal(data: Any) -> Optional[str]:
    """Loosely validate an LLM reply for the single-sample tool.

    The only requirement is that the reply is a non-empty JSON object.
    Unlike ``llm_evaluate_sources._validate_eval`` (which hard-codes the
    English schema field names), this check is deliberately not coupled to any
    field name: ``execute`` is a debugging tool whose purpose is to show the
    raw LLM output under a custom prompt (e.g. a Chinese-schema mod.md).
    Strict schema validation is left to the batch evaluation pipeline.

    Args:
        data: The decoded LLM output.

    Returns:
        ``None`` when the output is acceptable, otherwise a short message
        describing why it was rejected.
    """
    if isinstance(data, dict):
        # An empty object carries no evaluation at all, so it is rejected too.
        return None if data else "输出是空对象"
    return "输出不是 JSON 对象"
# ── sample.md parsing ────────────────────────────────────────────────────────
# Two block-delimiter styles are accepted:
#   - `=== BLOCK_NAME ===` on a line of its own (older sample.md / template.md format)
#   - `# BLOCK_NAME` markdown H1 heading (mod.md style)
# BLOCK_NAME is restricted to upper-case ASCII letters / underscores with nothing
# else on the line, so a header is not confused with comment lines that contain
# non-ASCII text or a colon (e.g. "# query: ...").
_BLOCK_HEADER_RE = re.compile(r"^(?:===\s+([A-Z_]+)\s+===|#\s+([A-Z_]+))\s*$")


def parse_sample(path: Path) -> Dict[str, str]:
    """Read a sample.md file and return the raw text of each block.

    Both delimiter styles described next to ``_BLOCK_HEADER_RE`` are supported.
    Lines that appear before the first header are ignored. When a block name
    occurs more than once, the last occurrence wins.

    Args:
        path: Location of the sample.md file (read as UTF-8).

    Returns:
        Mapping of block name to block body, with leading and trailing newlines
        removed from each body.
    """
    text = path.read_text(encoding="utf-8")
    blocks: Dict[str, List[str]] = {}
    current: Optional[str] = None
    # Split on "\n" only. str.splitlines() additionally breaks on U+2028,
    # U+2029, \x0b, \x0c, \x1c-\x1e and \x85; re-joining with "\n" would then
    # rewrite the block literal and put a raw newline inside any embedded JSON
    # string that contained one of those characters. read_text() has already
    # normalised "\r\n" / "\r" to "\n" (universal newlines).
    for line in text.split("\n"):
        m = _BLOCK_HEADER_RE.match(line)
        if m:
            # group(1) is the `=== NAME ===` style, group(2) the `# NAME` style.
            current = m.group(1) or m.group(2)
            blocks[current] = []
            continue
        if current is not None:
            blocks[current].append(line)
    return {name: "\n".join(lines).strip("\n") for name, lines in blocks.items()}
# Markdown editors sometimes auto-convert a bare URL into `[url](url)` (seen for
# source_url / images in mod.md). That wrapper has to be removed before the value
# can be used as a real URL by the multimodal LLM call or by requests.
_MD_LINK_RE = re.compile(r"^\s*\[(.+?)\]\(\s*(.+?)\s*\)\s*$")


def _unwrap_md_link(s: str) -> str:
    """Undo markdown link wrapping: ``'[url](url)'`` becomes ``'url'``.

    Args:
        s: Candidate URL string. Non-string values are tolerated.

    Returns:
        The link target (whitespace-trimmed) when ``s`` is exactly one markdown
        link; otherwise ``s`` itself, unchanged.
    """
    if isinstance(s, str):
        found = _MD_LINK_RE.match(s)
        if found is not None:
            # group(2) is the part inside the parentheses, i.e. the target.
            return found.group(2).strip()
    return s
def _strip_comments(raw: str) -> str:
    """Drop comment lines (first non-blank character is ``#``) from a block body.

    Used when parsing the POST / IMAGES style blocks.

    Args:
        raw: Block body text.

    Returns:
        The remaining lines joined with ``"\\n"``, with surrounding whitespace
        stripped from the result.
    """
    kept = [line for line in raw.splitlines() if line.lstrip()[:1] != "#"]
    return "\n".join(kept).strip()
def parse_meta(raw: str) -> Dict[str, str]:
    """Parse a META block made of one ``key: value`` entry per line.

    Blank lines, lines starting with ``#`` and lines without a colon are
    skipped. Only the first colon separates key from value, so values may
    themselves contain colons. A repeated key keeps its last value.

    Args:
        raw: Block body text.

    Returns:
        Mapping of whitespace-trimmed keys to whitespace-trimmed values.
    """
    entries: Dict[str, str] = {}
    for stripped in map(str.strip, raw.splitlines()):
        if stripped == "" or stripped[0] == "#":
            continue
        key, colon, value = stripped.partition(":")
        if not colon:
            continue
        entries[key.strip()] = value.strip()
    return entries
def parse_post(raw: str) -> Dict[str, Any]:
    """Parse a POST block, whose body is a single JSON document.

    Comment lines (starting with ``#``) are removed before decoding.

    Args:
        raw: Block body text.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If nothing is left after removing comments. Malformed JSON
            surfaces as ``json.JSONDecodeError`` (a ``ValueError`` subclass).
    """
    payload = _strip_comments(raw)
    if payload:
        return json.loads(payload)
    raise ValueError("=== POST === 块为空")
# Markdown editors (Typora / Obsidian, ...) automatically put a backslash in front
# of characters such as `_` `*` `#` `[` `]` `(` `)` so that they do not trigger
# markdown syntax (e.g. `_X_` rendering as italics). Inside a JSON string, however,
# `\X` is only legal for a handful of escapes (`\n` `\t` `\"` `\\` ...); anything
# else makes json.loads fail.
# This pattern matches a single backslash directly in front of one of those
# characters. None of them is a legal JSON escape target, but the backslash may
# itself be the second half of a legitimate `\\` pair -- see
# _drop_unpaired_md_backslash, which decides whether a match is really removed.
_MD_AUTOESC_RE = re.compile(r"\\(?=[_*#\[\]()])")


def _drop_unpaired_md_backslash(m: "re.Match[str]") -> str:
    """Replacement callback: remove the matched backslash only when it is unpaired."""
    text = m.string
    run_start = m.start()
    # Walk back over the run of backslashes that immediately precedes the match.
    while run_start > 0 and text[run_start - 1] == "\\":
        run_start -= 1
    preceding = m.start() - run_start
    # Odd count before us: we complete a valid `\\` escape, so keep the backslash.
    # Even count: we are a stray markdown escape in front of `_`, `*`, ... -> drop.
    return "\\" if preceding % 2 else ""


def extract_post_json_from_user(user_text: str) -> Optional[Dict[str, Any]]:
    """Extract the post JSON embedded in the literal USER block.

    The JSON is located as the first complete ``{...}`` that follows the
    ``【待评估帖子`` marker. A brace counter is used instead of a regex so that
    nested objects and ``{`` / ``}`` inside string values are handled.

    When ``json.loads`` rejects the extracted span, decoding is retried once
    after removing markdown auto-escapes (``\\_`` etc.), a common artefact of
    editing sample.md in a markdown editor. Backslashes that belong to a valid
    ``\\\\`` pair are left alone, so a legitimately escaped backslash followed
    by ``_`` does not get corrupted by the clean-up.

    Args:
        user_text: Full text of the USER block.

    Returns:
        The decoded object, or ``None`` when the marker or an opening brace is
        missing, the braces never balance, or both decoding attempts fail.
    """
    marker_idx = user_text.find("【待评估帖子")
    if marker_idx < 0:
        return None
    start = user_text.find("{", marker_idx)
    if start < 0:
        return None

    depth = 0
    in_str = False
    escape = False
    for i in range(start, len(user_text)):
        c = user_text[i]
        if escape:
            # Character following a backslash: never structural.
            escape = False
            continue
        if c == "\\":
            escape = True
            continue
        if c == '"':
            in_str = not in_str
            continue
        if in_str:
            continue
        if c == "{":
            depth += 1
        elif c == "}":
            depth -= 1
            if depth == 0:
                raw = user_text[start : i + 1]
                try:
                    return json.loads(raw)
                except json.JSONDecodeError:
                    pass
                # Second attempt with markdown auto-escapes removed.
                cleaned = _MD_AUTOESC_RE.sub(_drop_unpaired_md_backslash, raw)
                try:
                    return json.loads(cleaned)
                except json.JSONDecodeError:
                    return None
    return None
# ── render: form_*.json + template → sample.md (literal SYSTEM / USER blocks) ──
def _pick_case(form_path: Path, case_id: Optional[str], index: int) -> Tuple[Dict[str, Any], Dict[str, Any], str]:
    """Select one source entry from a form_*.json file.

    A non-empty ``case_id`` takes precedence over ``index``. Every failure
    (empty ``results``, unknown ``case_id``, index out of range) terminates the
    process through ``sys.exit`` with an explanatory message.

    Args:
        form_path: Path of the form JSON file; its ``results`` key holds the cases.
        case_id: Value to match against each result's ``case_id`` (or None / "").
        index: Position in ``results`` used when ``case_id`` is not given.

    Returns:
        ``(form_data, source, case_label)``. The label is ``case_id`` when
        selecting by id; otherwise the chosen entry's own ``case_id``, falling
        back to ``index_<n>`` when the entry has none.
    """
    form_data = json.loads(form_path.read_text(encoding="utf-8"))
    results = form_data.get("results", [])
    if not results:
        sys.exit(f"❌ {form_path} 的 results 为空")

    if not case_id:
        # Positional selection.
        if not 0 <= index < len(results):
            sys.exit(f"❌ {form_path} 共 {len(results)} 条,--index {index} 越界")
        chosen = results[index]
        return form_data, chosen, chosen.get("case_id", f"index_{index}")

    # Selection by id: first match wins.
    for candidate in results:
        if candidate.get("case_id") == case_id:
            return form_data, candidate, case_id
    preview = [r.get("case_id") for r in results[:5]]
    sys.exit(f"❌ {form_path} 没找到 case_id={case_id!r};前 5 个: {preview}")
def write_rendered_sample(out_path: Path, src_label: str, query: str,
                          system: str, user_text: str) -> None:
    """Write the SYSTEM and USER blocks to a sample.md file.

    A ``#``-prefixed provenance header (source label, query, the command that
    executes the sample) is placed at the top of the file. No separate IMAGES
    block is written: the image URLs already live inside the USER literal (the
    post JSON's ``post.images`` array) and are extracted from there at execute
    time, so the data is stored in one place only.

    Args:
        out_path: Destination file (written as UTF-8, overwritten if present).
        src_label: Where the post came from, shown in the header.
        query: Search query, shown in the header.
        system: Literal system message.
        user_text: Literal user message.
    """
    header_lines = [
        "# 已渲染的评估 prompt sample(template + 一条帖子 → 字面产物)",
        f"# 源: {src_label}",
        f"# query: {query}",
        f"# 工具: python eval_one_sample.py execute {out_path.name}",
        "#",
        "# 多模态图片 URL 内嵌在 USER 块的帖子 JSON 的 post.images 字段;execute 自动提取。",
        "# 改 template / rubric / 换帖子后,重新跑 render 生成新 sample。",
    ]
    header = "".join(line + "\n" for line in header_lines)
    # One blank line separates the header from SYSTEM and SYSTEM from USER;
    # the file ends with a single trailing newline.
    document = f"{header}\n=== SYSTEM ===\n{system}\n\n=== USER ===\n{user_text}\n"
    out_path.write_text(document, encoding="utf-8")
def cmd_render(args: argparse.Namespace) -> None:
    """Handle the ``render`` sub-command: one post + current template → sample.md.

    Picks a case from ``args.form``, builds the evaluation messages for it and
    writes the literal SYSTEM / USER text to ``args.out``, then prints a short
    summary. ``args.query`` overrides the form's ``query`` when non-empty.
    """
    form_data, source, case_label = _pick_case(args.form, args.case_id, args.index)
    query = args.query if args.query else form_data.get("query", "")
    requirement = args.requirement if args.requirement else ""

    # image_urls is deliberately None, so the USER text is built without the
    # image hint; `execute` appends the hint itself when it pulls the images
    # out of post.images. The rubric is baked into eval_prompt_template.md,
    # so no external rubric file is loaded here.
    messages = _build_eval_messages(
        requirement=requirement,
        post_block=_format_post_for_eval(source),
        image_urls=None,
        query=query,
    )
    system = messages[0]["content"]
    user_text = messages[1]["content"]
    if isinstance(user_text, list):
        # Multi-part content: keep only the first text part.
        user_text = next(part["text"] for part in user_text if part["type"] == "text")

    post = source.get("post") or {}
    n_images = len(post.get("images") or [])
    src_label = f"{args.form.name}#{case_label}"
    write_rendered_sample(args.out, src_label, query, system, user_text)

    print(f"✓ render → {args.out}")
    print(f" 源: {src_label}")
    print(f" query: {query!r}")
    print(f" system {len(system)} chars / user {len(user_text)} chars / "
          f"images {n_images} 张(内嵌在 post.images,execute 时自动取)")
# ── execute: read the SYSTEM / USER blocks, assemble messages → LLM ───────────
def build_messages_from_blocks(blocks: Dict[str, str], include_images: bool,
                               max_images: int = 10
                               ) -> Tuple[List[Dict[str, Any]], int]:
    """Build chat messages from parsed sample.md blocks.

    The SYSTEM / USER blocks are used literally. In multimodal mode:
      1) the post JSON embedded in the USER literal is extracted
         (``extract_post_json_from_user``);
      2) ``post.images`` is read, keeping string entries that start with
         ``http`` after markdown-link unwrapping, capped at ``max_images`` to
         keep the token count bounded;
      3) the template's ``USER_IMAGE_HINT`` (if any) is appended to the user
         text to tell the LLM that images follow;
      4) the URLs are attached as ``image_url`` parts after the text part.

    Args:
        blocks: Output of ``parse_sample``.
        include_images: When False, a text-only message list is produced.
        max_images: Upper bound on attached images. Zero or a negative value
            attaches none (a negative value used to be passed straight into a
            slice, which silently meant "all but the last N").

    Returns:
        ``(messages, number_of_images_attached)``.

    Raises:
        ValueError: If the SYSTEM or USER block is missing or blank.
    """
    system = blocks.get("SYSTEM", "").strip()
    user_text = blocks.get("USER", "").strip()
    if not system or not user_text:
        raise ValueError("sample.md 缺 SYSTEM / USER 块——先跑 `render` 生成")

    image_urls: List[str] = []
    if include_images and max_images > 0:
        post = extract_post_json_from_user(user_text)
        inner = post.get("post") if isinstance(post, dict) else None
        raw_urls = inner.get("images") if isinstance(inner, dict) else None
        if isinstance(raw_urls, list):
            for u in raw_urls:
                if not isinstance(u, str):
                    continue
                u = _unwrap_md_link(u)  # undo markdown link pollution from mod.md
                if u.startswith("http"):
                    image_urls.append(u)
        image_urls = image_urls[:max_images]

    if not image_urls:
        text_only = [{"role": "system", "content": system},
                     {"role": "user", "content": user_text}]
        return text_only, 0

    # The template is only needed for the image hint, so it is imported lazily:
    # text-only runs never touch it.
    from examples.process_pipeline.script.llm_evaluate_sources import load_prompt_template
    hint = load_prompt_template().get("USER_IMAGE_HINT", "")
    text = user_text + ("\n\n" + hint if hint else "")
    user_content: List[Dict[str, Any]] = [{"type": "text", "text": text}]
    user_content.extend(
        {"type": "image_url", "image_url": {"url": u}} for u in image_urls
    )
    msgs = [{"role": "system", "content": system},
            {"role": "user", "content": user_content}]
    return msgs, len(image_urls)
async def cmd_execute(args: argparse.Namespace) -> None:
    """Handle the ``execute`` sub-command: send the sample to the LLM and save the result.

    Loads environment variables via dotenv, builds the messages from
    ``args.sample`` (with images unless ``args.no_images``), calls the model
    selected by ``args.model`` and prints the evaluation. On success the result
    plus metadata is written to ``args.out`` or, by default, to
    ``<sample stem>.eval.json`` next to the sample. Nothing is written when the
    evaluation fails, so an earlier good result is never overwritten.
    """
    from dotenv import load_dotenv

    load_dotenv()

    sample_path = args.sample
    messages, n_images = build_messages_from_blocks(
        parse_sample(sample_path),
        include_images=not args.no_images,
        max_images=args.max_images,
    )
    llm_call, model_id = build_eval_llm_call(args.model)
    print(f"=== 执行 {args.sample.name} | 模型: {model_id} | 图片: {n_images} 张 ===\n")

    data, cost = await call_llm_with_retry(
        llm_call=llm_call,
        messages=messages,
        model=model_id,
        temperature=0.1,
        max_tokens=2000,
        validate_fn=_validate_minimal,
        task_name=f"OneSample[{args.sample.stem}]",
    )
    print(f"\n--- 评估结果(cost ${cost:.4f})---")

    if data is not None:
        print(json.dumps(data, ensure_ascii=False, indent=2))

        # Persist: metadata + evaluation JSON, default location is alongside the sample.
        out_path = args.out or (sample_path.parent / f"{sample_path.stem}.eval.json")
        payload = dict(
            sample=sample_path.name,
            model=model_id,
            image_count=n_images,
            cost=round(cost, 4),
            timestamp=datetime.now().isoformat(timespec="seconds"),
            evaluation=data,
        )
        serialized = json.dumps(payload, ensure_ascii=False, indent=2)
        out_path.write_text(serialized, encoding="utf-8")
        print(f"\n💾 evaluation 落盘 → {out_path}")
    else:
        print("❌ 评估失败(校验未通过或重试耗尽,见上方日志);不写文件")
# ── dump: print the literal SYSTEM / USER text ────────────────────────────────
def cmd_dump(args: argparse.Namespace) -> None:
    """Handle the ``dump`` sub-command: print both blocks of ``args.sample`` to stdout.

    Exits through ``sys.exit`` when either block is missing or blank. If the
    post JSON embedded in the USER literal lists images, a closing note reports
    how many entries ``post.images`` contains. No LLM call is made and no file
    is written.
    """
    blocks = parse_sample(args.sample)
    system, user_text = (blocks.get(name, "").strip() for name in ("SYSTEM", "USER"))
    if not (system and user_text):
        sys.exit("❌ sample.md 缺 SYSTEM/USER 块——先跑 `render` 生成")

    # Count the entries of post.images in the embedded post JSON.
    post = extract_post_json_from_user(user_text)
    n_images = 0
    if post:
        n_images = len((post.get("post") or {}).get("images") or [])

    print(f"=== SYSTEM ({len(system)} chars) ===\n")
    print(system)
    print(f"\n\n=== USER ({len(user_text)} chars) ===\n")
    print(user_text)
    if n_images:
        print(f"\n\n[内嵌帖子 JSON 含 {n_images} 张图;execute 时将作多模态附件取用]")
# ── CLI ───────────────────────────────────────────────────────────────────────
def main() -> None:
    """Parse the command line and run the chosen sub-command (render / execute / dump)."""
    # Force UTF-8 on stdout: the tool prints non-ASCII text.
    sys.stdout.reconfigure(encoding="utf-8")

    parser = argparse.ArgumentParser(description="单样本评估:render / execute / dump")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # render
    render_parser = commands.add_parser(
        "render",
        help="从 form_*.json 取一条帖子 → 当前 template/rubric 渲染 → 写出 sample.md",
    )
    render_parser.add_argument("--form", type=Path, required=True,
                               help="form_*.json 路径(含 results 数组)")
    render_parser.add_argument("--case-id", default=None,
                               help="按 case_id 选 case(优先于 --index)")
    render_parser.add_argument("--index", type=int, default=0,
                               help="按下标选 case(默认 0;--case-id 提供时忽略)")
    render_parser.add_argument("--out", type=Path, required=True,
                               help="输出 sample.md 路径")
    render_parser.add_argument("--query", default="", help="覆盖 form.query")
    render_parser.add_argument("--requirement", default="",
                               help="评估时的 requirement(默认空)")

    # execute
    model_help = (
        f"shortcut: {', '.join(EVAL_MODELS)}"
        "; 也可直接传 raw 模型 id (如 google/gemini-3.1-flash-lite / openai/gpt-5.4)"
    )
    execute_parser = commands.add_parser("execute", help="读 sample.md 字面调 LLM 评估")
    execute_parser.add_argument("sample", type=Path)
    execute_parser.add_argument("--model", default=DEFAULT_EVAL_MODEL, help=model_help)
    execute_parser.add_argument("--no-images", action="store_true",
                                help="不发图(纯文本评估)")
    execute_parser.add_argument("--max-images", type=int, default=10,
                                help="多模态最多发几张图(默认 10;防 token 爆)")
    execute_parser.add_argument(
        "--out", type=Path, default=None,
        help="评估结果 JSON 输出路径(默认 <sample>.eval.json,跟 sample 同目录;评估失败不写)",
    )

    # dump
    dump_parser = commands.add_parser("dump", help="打印 SYSTEM/USER 字面(不调 LLM、不写文件)")
    dump_parser.add_argument("sample", type=Path)

    args = parser.parse_args()
    if args.cmd == "execute":
        # The only asynchronous command; it awaits the LLM call.
        asyncio.run(cmd_execute(args))
    elif args.cmd == "render":
        cmd_render(args)
    else:  # dump
        cmd_dump(args)


if __name__ == "__main__":
    main()
|