| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475 |
# -*- coding: utf-8 -*-
"""One-off script: re-evaluate a single stored post with the current
eval_prompt_template.md, reusing the production evaluation pipeline
(evaluate_posts).

Supports --escalate-model to demo the sonnet + flash-lite combination
(escalation inside the ambiguous score band).
"""
import argparse
import asyncio
import json
import sys
from pathlib import Path

# The repository root (…/Agent) must be on sys.path before the project
# imports below can be resolved.
PROJECT_ROOT = Path(__file__).resolve().parents[3]
sys.path.insert(0, str(PROJECT_ROOT))

from dotenv import load_dotenv

# Load .env settings before the project modules are imported.
load_dotenv()

# Directory containing this script; put first on sys.path, presumably so the
# sibling `db` module is importable by its bare name.
MW = Path(__file__).resolve().parent
sys.path.insert(0, str(MW))

import db
from examples.process_pipeline.script.search_eval.search_and_evaluate import evaluate_posts
from examples.process_pipeline.script.llm_evaluate_sources import (
    _EVAL_PRODUCT_FIELDS,
    DEFAULT_EVAL_MODEL,
    build_eval_llm_call,
)
def _load(query_id):
    """Read and parse the saved search-process record for ``query_id``.

    The record is read from ``MW / "runs" / "search_process" /
    "<query_id>.json"`` and decoded as UTF-8.

    Args:
        query_id: Base name (without ``.json``) of the record file.

    Returns:
        The parsed JSON document.

    Raises:
        FileNotFoundError: If the record file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    record_path = MW / "runs" / "search_process" / f"{query_id}.json"
    with record_path.open(encoding="utf-8") as handle:
        return json.load(handle)
async def main():
    """Re-evaluate one stored post and print its scores and adoption verdict.

    Command-line options:
        --query-id        (required) name of the saved search-process record.
        --case-id         (required) ``case_id`` of the post to re-evaluate.
        --query           query text; falls back to the record's ``query``.
        --model           first-pass model (default ``DEFAULT_EVAL_MODEL``).
        --escalate-model  optional second model handed to ``evaluate_posts``
                          as the escalation model.
        --escalate-band   two floats (default ``4.0 6.0``) forwarded as
                          ``escalate_band``.
        --max-images      forwarded as ``max_images`` (default 4).

    Raises:
        SystemExit: If the record file is missing, the ``case_id`` is not in
            the record, or the evaluation returned no ``llm_evaluation``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--query-id", required=True)
    parser.add_argument("--case-id", required=True)
    parser.add_argument("--query", default="")
    parser.add_argument("--model", default=DEFAULT_EVAL_MODEL)
    parser.add_argument("--escalate-model", default="")
    parser.add_argument("--escalate-band", type=float, nargs=2, default=[4.0, 6.0])
    parser.add_argument("--max-images", type=int, default=4)
    a = parser.parse_args()

    # A mistyped --query-id should end with a short message, like a mistyped
    # --case-id below, rather than a FileNotFoundError traceback.
    try:
        data = _load(a.query_id)
    except FileNotFoundError:
        raise SystemExit(f"未找到 query_id={a.query_id}") from None

    query = a.query or data.get("query", "")

    # `or []` also covers a record whose "results" key is present but null.
    results = data.get("results") or []
    src = next((s for s in results if s.get("case_id") == a.case_id), None)
    if not src:
        raise SystemExit(f"未找到 case_id={a.case_id}")

    # Remove the fields listed in _EVAL_PRODUCT_FIELDS before re-scoring —
    # presumably the outputs of a previous evaluation; confirm against
    # llm_evaluate_sources.
    for field in _EVAL_PRODUCT_FIELDS:
        src.pop(field, None)

    llm_call, model_id = build_eval_llm_call(a.model)
    esc_llm = esc_model = None
    if a.escalate_model:
        esc_llm, esc_model = build_eval_llm_call(a.escalate_model)

    print(f"▶ 重评 {a.case_id} 初评={model_id}"
          + (f" 升级={esc_model} 带[{a.escalate_band[0]:g},{a.escalate_band[1]:g}]" if esc_model else "")
          + f" query={query!r}\n")

    # The second positional argument is passed empty, exactly as before; its
    # meaning is defined by evaluate_posts.
    sources, cost = await evaluate_posts(
        [src], "", llm_call, model_id, max_concurrent=1,
        include_images=True, max_images=a.max_images, image_mode="url", query=query,
        escalate_llm=esc_llm, escalate_model=esc_model, escalate_band=tuple(a.escalate_band),
    )

    # Only the cases that previously crashed with IndexError/KeyError are
    # intercepted here; every other result flows through unchanged.
    if not sources or "llm_evaluation" not in sources[0]:
        raise SystemExit(f"评估失败:未返回 llm_evaluation case_id={a.case_id}")

    result = sources[0]
    ev = result["llm_evaluation"]
    overall = db.overall_score(ev)
    pub = (src.get("post") or {}).get("publish_timestamp", "")
    adopted = db.is_adopted(overall, ev, pub)

    print("\n" + "=" * 60)
    print(f"最终评估模型 = {result.get('_escalated') or model_id}")
    print(f"综合分(overall_score) = {overall}")
    print(f" · 和内容制作知识相关 = {((ev.get('相关性') or {}).get('和内容制作知识相关') or {}).get('得分')}")
    print(f" · 可复现性 = {db._fixed_dim_score(ev, '可复现性')} (门槛 <4 → 不采纳)")
    print(f" · 意图可控性 = {db._fixed_dim_score(ev, '意图可控性')} (暂只采分)")
    print(f"采纳判定(is_adopted) = {adopted}")
    print(f"总成本 ≈ ${cost:.4f}")
if __name__ == "__main__":
    # Script entry point: run the async main() to completion.
    asyncio.run(main())
|