server.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. # -*- coding: utf-8 -*-
  2. """搜索评估案例查看 server。
  3. 沿用 图文排版搜索评估.html 的版式(卡片 + dialog 详情 + rubric 评分条),
  4. 数据实时扫描 runs/*/form_*.json —— runs 下每新增一个 q 文件夹,刷新即出现。
  5. 分页:query → 三种形式(A/B/C) → 三个渠道 三行从上到下。
  6. 用法:python server.py [port] 默认 8770,浏览器开 http://0.0.0.0:8770
  7. """
  8. import json, re, glob, sys, pathlib, subprocess
  9. from datetime import datetime
  10. from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
  11. try: # Windows 控制台默认 cp1252,中文 print 会崩,统一切 utf-8
  12. sys.stdout.reconfigure(encoding="utf-8")
  13. except Exception:
  14. pass
  15. HERE = pathlib.Path(__file__).parent
  16. PORT = int(sys.argv[1]) if len(sys.argv) > 1 else 8770
  17. PLAT = {"xhs": "小红书", "gzh": "公众号", "zhihu": "知乎", "x": "X", "bili": "B站", "douyin": "抖音",
  18. "sph": "视频号", "youtube": "YouTube", "github": "GitHub", "toutiao": "头条", "weibo": "微博"}
  19. KT = {"procedure": "工序", "step": "步骤", "tool": "工具"}
  20. # 从 taxonomy 取动作叶子/类型名,用于把 original_q 解析回原始维度(动作×类型 正交)
  21. EVALDIR = HERE.parent.parent / "test_script" / "evaluation"
  22. if not EVALDIR.exists():
  23. EVALDIR = HERE.parent / "evaluation"
  24. try:
  25. _jm = json.load(open(EVALDIR / "judged_matrix.json", encoding="utf-8"))
  26. ACT_L1 = {a["name"]: a["l1"] for a in _jm["actions"]}
  27. ACTION_SET = set(ACT_L1)
  28. TYPE_SET = {t["name"] for t in _jm["types"]}
  29. ACTIONS_TAX = [{"name": a["name"], "l1": a["l1"], "l2": a.get("l2", "")} for a in _jm["actions"]]
  30. TYPES_TAX = [{"name": t["name"], "l1": t["l1"]} for t in _jm["types"]]
  31. # taxonomy 顺序沿用 judged_matrix(严格版);矩阵分值改用 type_action_scores(宽松版) —
  32. # 两份是同一组 27×50 cell 的独立 gemini judging,前者只 53 格到 tier3,后者 156 格到 score3
  33. _tas = json.load(open(EVALDIR / "type_action_scores.json", encoding="utf-8"))["scores"]
  34. _MATRIX = []
  35. for a in _jm["actions"]:
  36. row = []
  37. for t in _jm["types"]:
  38. rec = _tas.get(t["name"], {}).get(a["name"])
  39. row.append({"tier": rec["score"], "r": rec.get("reason", "")} if rec else {})
  40. _MATRIX.append(row)
  41. except Exception:
  42. ACT_L1, ACTION_SET, TYPE_SET, ACTIONS_TAX, TYPES_TAX, _MATRIX = {}, set(), set(), [], [], []
  43. MODSET = {"文", "图", "视频", "音频"}
  44. TOOLQUAL = {"AI": "AI 模型", "软件": "桌面 APP", "电脑端": "桌面 APP", "在线": "云端 Web",
  45. "网页版": "云端 Web", "代码": "API·CLI", "命令行": "API·CLI", "插件": "插件扩展"}
  46. def parse_dims(oq):
  47. """把组合 query(如 '文 元素生成 提示词 教程')解析回 {动作, 类型, 动作L1, 约束}。"""
  48. toks = (oq or "").split()
  49. action = next((t for t in toks if t in ACTION_SET), None)
  50. type_ = next((t for t in toks if t in TYPE_SET), None)
  51. cons = None
  52. if toks:
  53. t0 = toks[0]
  54. if t0 in MODSET:
  55. cons = {"kind": "模态", "value": t0}
  56. elif t0 in TOOLQUAL:
  57. cons = {"kind": "工具类型", "value": TOOLQUAL[t0]}
  58. return {"action": action, "type": type_, "action_l1": ACT_L1.get(action, ""), "constraint": cons}
  59. def flat_scores(sc):
  60. f = {}
  61. for k, v in (sc or {}).items():
  62. if isinstance(v, dict):
  63. for kk, vv in v.items():
  64. try: f[kk] = int(vv)
  65. except Exception: pass
  66. else:
  67. try: f[k] = int(v)
  68. except Exception: pass
  69. return f
  70. def _recency_hard(date_str):
  71. """按 publish_timestamp 头 10 字符(YYYY-MM-DD)算硬时效:半年内=3 / 两年内=2 / 更早=1。
  72. 取代原 LLM 评的 recency 维度——脚本算更稳,发布时间在帖子抓取时就有,无需 LLM token。
  73. """
  74. try:
  75. d = datetime.strptime((date_str or "")[:10], "%Y-%m-%d")
  76. except (ValueError, TypeError):
  77. return None
  78. days = (datetime.now() - d).days
  79. if days <= 180: return 3
  80. if days <= 730: return 2
  81. return 1
  82. def adapt(r):
  83. p = r.get("post", {}); e = r.get("llm_evaluation", {})
  84. # 1. 判定是否为新版中文 schema
  85. is_new_schema = "评分" in e or "知识类型" in e or "制作相关性" in e
  86. # 2. 解析 知识类型 (knowledge_type)
  87. kt = []
  88. if is_new_schema:
  89. kt_raw = e.get("知识类型") or []
  90. for k in kt_raw:
  91. if k in ("工序", "procedure"): kt.append("procedure")
  92. elif k in ("步骤", "step"): kt.append("step")
  93. elif k in ("工具", "tool"): kt.append("tool")
  94. else:
  95. kt = e.get("knowledge_type") or []
  96. # 3. 解析 评分 (scores)
  97. CN_TO_EN = {
  98. "相关性": "relevance",
  99. "成品质量": "result_quality",
  100. "可信度": "credibility",
  101. "具体用例": "concrete_use_case",
  102. "完整性": "completeness",
  103. "步骤结构": "step_structure",
  104. "步骤可复现": "step_reproducibility",
  105. "步骤可复现性": "step_reproducibility",
  106. "能力定义": "capability_definition",
  107. "实现深度": "implementation_depth",
  108. "边界失败": "boundary_failure_eval",
  109. "通用性": "generality",
  110. "能力覆盖": "capability_coverage",
  111. "有效对比": "effective_comparison",
  112. "参数具体": "param_specificity",
  113. "实操示例": "worked_example",
  114. "实操用例": "worked_example",
  115. "示例完整": "worked_example",
  116. "版本限制": "version_limits",
  117. "版本说明": "version_limits",
  118. "限制说明": "version_limits",
  119. }
  120. fs = {}
  121. score_reasons = {}
  122. if is_new_schema:
  123. # 新版嵌套结构: "评分": { "通用": { "相关性": { "得分": 5, "理由": "..." } } }
  124. pf = e.get("评分") or {}
  125. for cat, metrics in pf.items():
  126. if isinstance(metrics, dict):
  127. for metric, val in metrics.items():
  128. en_key = CN_TO_EN.get(metric, metric)
  129. if isinstance(val, dict) and "得分" in val:
  130. try: fs[en_key] = int(val["得分"])
  131. except Exception: pass
  132. elif isinstance(val, (int, float)):
  133. fs[en_key] = int(val)
  134. if isinstance(val, dict) and "理由" in val:
  135. score_reasons[en_key] = val["理由"]
  136. else:
  137. fs = flat_scores(e.get("scores", {}))
  138. # 计算均分 (overall)
  139. overall = round(sum(fs.values()) / len(fs), 1) if fs else 0
  140. anomaly = bool(e.get("error")) or not fs
  141. grade = p.get("_quality_grade", "")
  142. fb = r.get("found_by_queries", [])
  143. # 4. 解析 制作相关性 (production_relevance)
  144. if is_new_schema:
  145. pr_block = e.get("制作相关性") or {}
  146. pr_raw = pr_block.get("得分") if isinstance(pr_block, dict) else pr_block
  147. else:
  148. pr_raw = e.get("production_relevance")
  149. try: production_relevance = int(float(pr_raw)) if pr_raw is not None else None
  150. except (TypeError, ValueError): production_relevance = None
  151. recency_hard = _recency_hard(p.get("publish_timestamp", ""))
  152. # 5. 解析 判定决策 (decision) 和 理由 (reason)
  153. reason = e.get("判定理由") or e.get("reason") or ""
  154. # 根据过滤指标决定是否保留 (过滤指标判定逻辑优先,不依赖文字匹配)
  155. is_discard = False
  156. # 制作相关性低于2则丢弃(非空且 < 2,1分丢弃,兼容旧版本不含该指标的情况)
  157. if production_relevance is not None and production_relevance < 2:
  158. is_discard = True
  159. # 时效性低于2被丢弃(非空且 < 2,1分丢弃,发布时间超两年的老帖)
  160. elif recency_hard is not None and recency_hard < 2:
  161. is_discard = True
  162. # 综合均分低于3被丢弃
  163. elif overall < 3:
  164. is_discard = True
  165. decision = "discard" if is_discard else "report"
  166. return {
  167. "platform": PLAT.get(r.get("platform"), r.get("platform")), "platformKey": r.get("platform"),
  168. "title": p.get("title", "") or "(无标题)", "date": (p.get("publish_timestamp", "") or "")[:10],
  169. "url": r.get("source_url", ""), "engagement": f'{p.get("like_count", 0)} 赞',
  170. "knowledge_type": kt, "decision": decision,
  171. "tools": [KT.get(k, k) for k in kt] + ([f"质量 {grade}"] if grade else []), "found_by": fb,
  172. "images": (p.get("images") or [])[:6], "text": p.get("body_text", "") or "",
  173. "scores": fs, "overall": overall, "reason": reason, "score_reasons": score_reasons,
  174. "grade": grade, "qscore": p.get("_quality_score", 0), "anomaly": anomaly,
  175. "production_relevance": production_relevance, "recency_hard": recency_hard,
  176. }
  177. def scan_runs():
  178. runs = {}
  179. for f in sorted(glob.glob(str(HERE / "runs" / "*" / "form_*.json"))):
  180. try:
  181. d = json.load(open(f, encoding="utf-8"))
  182. except Exception:
  183. continue
  184. run = pathlib.Path(f).parent.name
  185. results = [adapt(r) for r in d.get("results", [])]
  186. report_val = sum(1 for r in results if r.get("decision") == "report" and not r.get("anomaly"))
  187. discard_val = sum(1 for r in results if r.get("decision") == "discard" and not r.get("anomaly"))
  188. runs.setdefault(run, []).append({
  189. "form": d.get("form"), "query": d.get("query"), "original_q": d.get("original_q", ""),
  190. "requirement": d.get("requirement", ""),
  191. "platforms": d.get("platforms", []), "total": d.get("total"),
  192. "report": report_val, "discard": discard_val,
  193. "results": results,
  194. })
  195. for v in runs.values():
  196. v.sort(key=lambda x: x.get("form") or "")
  197. def _qnum(name): # "q156" → 156,按数字排,避免 "q156" < "q99" 的字符串误排
  198. m = re.search(r"\d+", name)
  199. return (int(m.group()) if m else 0, name)
  200. out = []
  201. for k, v in sorted(runs.items(), key=lambda kv: _qnum(kv[0])):
  202. oq = v[0].get("original_q") or v[0].get("query") or ""
  203. seen, hits = set(), 0 # 知识命中数 = 各形式采纳(report)且非异常、按 url 去重后的帖子数
  204. for f in v:
  205. for r in f.get("results", []):
  206. if r.get("decision") == "report" and not r.get("anomaly") and r.get("url") not in seen:
  207. seen.add(r.get("url")); hits += 1
  208. out.append({"key": k, "forms": v, "dims": parse_dims(oq), "original_q": oq,
  209. "hits": hits, "tot": sum((f.get("total") or 0) for f in v)})
  210. return {"queries": out, "actions": ACTIONS_TAX, "types": TYPES_TAX, "matrix": _MATRIX}
  211. class H(BaseHTTPRequestHandler):
  212. def _send(self, code, body, ctype):
  213. b = body.encode("utf-8") if isinstance(body, str) else body
  214. self.send_response(code); self.send_header("Content-Type", ctype + "; charset=utf-8")
  215. self.send_header("Content-Length", str(len(b))); self.end_headers(); self.wfile.write(b)
  216. def do_GET(self):
  217. if self.path in ("/", "/index.html"):
  218. try:
  219. page = (HERE / "index.html").read_text(encoding="utf-8")
  220. self._send(200, page, "text/html")
  221. except Exception as e:
  222. self._send(500, f"Error reading index.html: {e}", "text/plain")
  223. elif self.path.startswith("/api/data"):
  224. self._send(200, json.dumps(scan_runs(), ensure_ascii=False), "application/json")
  225. else:
  226. self._send(404, "not found", "text/plain")
  227. def do_POST(self):
  228. # /api/reeval —— 后台启动 batch_3forms.py 只对指定 q 复评,立即返回(不等结果)
  229. # 复评是 LLM 调用、几十秒到几分钟;浏览器侧用 fetch 启动 + 提示用户稍后刷新,不阻塞
  230. if self.path != "/api/reeval":
  231. self._send(404, json.dumps({"error": "not found"}), "application/json"); return
  232. length = int(self.headers.get("Content-Length") or 0)
  233. raw = self.rfile.read(length).decode("utf-8") if length > 0 else "{}"
  234. try:
  235. payload = json.loads(raw)
  236. except Exception as e:
  237. self._send(400, json.dumps({"error": f"bad json: {e}"}), "application/json"); return
  238. q = (payload.get("q") or "").strip()
  239. # 限定 qNN 形式避免路径注入
  240. if not re.match(r"^q\d+$", q):
  241. self._send(400, json.dumps({"error": f"bad q (expect 'qNN'): {q!r}"},
  242. ensure_ascii=False), "application/json"); return
  243. q_dir = HERE / "runs" / q
  244. if not q_dir.is_dir():
  245. self._send(404, json.dumps({"error": f"runs/{q} not found"}, ensure_ascii=False),
  246. "application/json"); return
  247. # 后台跑 batch_3forms.py,stdout/stderr 合并写到 q_dir/_reeval.log(可 tail 看进度)
  248. log_path = q_dir / "_reeval.log"
  249. try:
  250. log_fh = open(log_path, "w", encoding="utf-8", buffering=1)
  251. cmd = [sys.executable, "-u", str(HERE / "batch_3forms.py"),
  252. "--reeval", "--reeval-q", q, "--output-dir", str(HERE / "runs")]
  253. flags = subprocess.CREATE_NEW_PROCESS_GROUP if sys.platform == "win32" else 0
  254. proc = subprocess.Popen(cmd, stdout=log_fh, stderr=subprocess.STDOUT,
  255. cwd=str(HERE), creationflags=flags)
  256. self._send(200, json.dumps(
  257. {"status": "started", "pid": proc.pid, "q": q,
  258. "log": str(log_path.relative_to(HERE))},
  259. ensure_ascii=False), "application/json")
  260. except Exception as e:
  261. self._send(500, json.dumps({"error": f"failed to start: {e}"},
  262. ensure_ascii=False), "application/json")
  263. def log_message(self, *a): pass
  264. if __name__ == "__main__":
  265. n = len(scan_runs()["queries"])
  266. print(f"搜索评估查看 server:http://0.0.0.0:{PORT} (runs/ 下 {n} 个 query,实时扫描)")
  267. ThreadingHTTPServer(("0.0.0.0", PORT), H).serve_forever()