server.py 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879
  1. # -*- coding: utf-8 -*-
  2. """搜索评估案例查看 server。
  3. 沿用 图文排版搜索评估.html 的版式(卡片 + dialog 详情 + rubric 评分条),
  4. 数据实时扫描 runs_full/*/form_*.json —— runs_full 下每新增一个 q 文件夹,刷新即出现。
  5. 分页:query → 三种形式(A/B/C) → 三个渠道 三行从上到下。
  6. 用法:python server.py [port] 默认 8770,浏览器开 http://0.0.0.0:8770
  7. """
  8. import json, re, glob, sys, pathlib, subprocess, threading
  9. from datetime import datetime
  10. from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
  11. from urllib.parse import urlparse, parse_qs
  12. try: # Windows 控制台默认 cp1252,中文 print 会崩,统一切 utf-8
  13. sys.stdout.reconfigure(encoding="utf-8")
  14. except Exception:
  15. pass
  16. HERE = pathlib.Path(__file__).parent
  17. sys.path.insert(0, str(HERE))
  18. PORT = int(sys.argv[1]) if len(sys.argv) > 1 else 8770
  19. PLAT = {"xhs": "小红书", "gzh": "公众号", "zhihu": "知乎", "x": "X", "bili": "B站", "douyin": "抖音",
  20. "sph": "视频号", "youtube": "YouTube", "github": "GitHub", "toutiao": "头条", "weibo": "微博"}
  21. KT = {"procedure": "工序", "step": "步骤", "tool": "工具"}
  22. # 从 taxonomy 取动作叶子/类型名,用于把 original_q 解析回原始维度(动作×类型 正交)
  23. # 路径优先级:search_eval/evaluation/(主源,IDE 编辑那份就是 runtime 实际读的)
  24. # → test_script/evaluation/(历史副本兜底)→ script/evaluation/(更老兜底)
  25. # 谁也找不到时整目录扫空,server 仍能起。
  26. EVALDIR = HERE / "evaluation"
  27. if not EVALDIR.exists():
  28. EVALDIR = HERE.parent.parent / "test_script" / "evaluation"
  29. if not EVALDIR.exists():
  30. EVALDIR = HERE.parent / "evaluation"
  31. try:
  32. _jm = json.load(open(EVALDIR / "judged_matrix.json", encoding="utf-8"))
  33. ACT_L1 = {a["name"]: a["l1"] for a in _jm["actions"]}
  34. ACTION_SET = set(ACT_L1)
  35. TYPE_SET = {t["name"] for t in _jm["types"]}
  36. ACTIONS_TAX = [{"name": a["name"], "l1": a["l1"], "l2": a.get("l2", "")} for a in _jm["actions"]]
  37. TYPES_TAX = [{"name": t["name"], "l1": t["l1"]} for t in _jm["types"]]
  38. # taxonomy 顺序沿用 judged_matrix(严格版);矩阵分值改用 type_action_scores(宽松版) —
  39. # 两份是同一组 27×50 cell 的独立 gemini judging,前者只 53 格到 tier3,后者 156 格到 score3
  40. _tas = json.load(open(EVALDIR / "type_action_scores.json", encoding="utf-8"))["scores"]
  41. _MATRIX = []
  42. for a in _jm["actions"]:
  43. row = []
  44. for t in _jm["types"]:
  45. rec = _tas.get(t["name"], {}).get(a["name"])
  46. row.append({"tier": rec["score"], "r": rec.get("reason", "")} if rec else {})
  47. _MATRIX.append(row)
  48. except Exception:
  49. ACT_L1, ACTION_SET, TYPE_SET, ACTIONS_TAX, TYPES_TAX, _MATRIX = {}, set(), set(), [], [], []
# Procedure-extraction tasks keyed by "{q}/{folder_name}". do_POST registers each
# entry as {"status", "start_time", "pid", "error"}; the worker thread updates it.
ACTIVE_TASKS: dict = {}
# Re-evaluation subprocesses keyed by run name q; do_POST registers each entry
# as {"status", "pid", "error"}.
ACTIVE_REEVALS: dict = {}
  52. from batch_extract_procedures import _short_case, _source_to_dsl_input, _write_meta, _composite_score
  53. def run_extraction_task(q, folder_name, src_path, out_dir, engine, model):
  54. task_key = f"{q}/{folder_name}"
  55. log_path = out_dir / "_extract.log"
  56. try:
  57. out_dir.mkdir(parents=True, exist_ok=True)
  58. if engine == "cyber_runner":
  59. script_path = HERE / "procedure-dsl" / "run_cyber.py"
  60. else:
  61. script_path = HERE / "procedure-dsl" / "run_procedure_dsl.py"
  62. cmd = [
  63. sys.executable, "-u", str(script_path),
  64. str(src_path),
  65. "--out-dir", str(out_dir),
  66. "--model", model,
  67. "--max-turns", "300"
  68. ]
  69. if engine != "cyber_runner":
  70. cmd.extend(["--max-retries", "3"])
  71. flags = subprocess.CREATE_NEW_PROCESS_GROUP if sys.platform == "win32" else 0
  72. with open(log_path, "w", encoding="utf-8", buffering=1) as log_fh:
  73. proc = subprocess.Popen(cmd, stdout=log_fh, stderr=subprocess.STDOUT,
  74. cwd=str(HERE), creationflags=flags)
  75. ACTIVE_TASKS[task_key]["pid"] = proc.pid
  76. proc.wait()
  77. if proc.returncode == 0:
  78. try:
  79. import build_workflows
  80. build_workflows.write_one(q, folder_name, runs_dir=HERE / "runs_full")
  81. ACTIVE_TASKS[task_key]["status"] = "success"
  82. except Exception as ex:
  83. ACTIVE_TASKS[task_key]["status"] = "failed"
  84. ACTIVE_TASKS[task_key]["error"] = f"Workflow compilation failed: {ex}"
  85. with open(log_path, "a", encoding="utf-8") as f_err:
  86. f_err.write(f"\n[server error] Workflow compilation failed: {ex}\n")
  87. else:
  88. ACTIVE_TASKS[task_key]["status"] = "failed"
  89. ACTIVE_TASKS[task_key]["error"] = f"Runner failed with exit code {proc.returncode}"
  90. except Exception as e:
  91. ACTIVE_TASKS[task_key]["status"] = "failed"
  92. ACTIVE_TASKS[task_key]["error"] = str(e)
  93. try:
  94. with open(log_path, "a", encoding="utf-8") as f_err:
  95. f_err.write(f"\n[server error] Extraction failed: {e}\n")
  96. except Exception:
  97. pass
  98. MODSET = {"文", "图", "视频", "音频"}
  99. TOOLQUAL = {"AI": "AI 模型", "软件": "桌面 APP", "电脑端": "桌面 APP", "在线": "云端 Web",
  100. "网页版": "云端 Web", "代码": "API·CLI", "命令行": "API·CLI", "插件": "插件扩展"}
  101. def parse_dims(oq):
  102. """把组合 query(如 '文 元素生成 提示词 教程')解析回 {动作, 类型, 动作L1, 约束}。"""
  103. toks = (oq or "").split()
  104. action = next((t for t in toks if t in ACTION_SET), None)
  105. type_ = next((t for t in toks if t in TYPE_SET), None)
  106. cons = None
  107. if toks:
  108. t0 = toks[0]
  109. if t0 in MODSET:
  110. cons = {"kind": "模态", "value": t0}
  111. elif t0 in TOOLQUAL:
  112. cons = {"kind": "工具类型", "value": TOOLQUAL[t0]}
  113. return {"action": action, "type": type_, "action_l1": ACT_L1.get(action, ""), "constraint": cons}
  114. def flat_scores(sc):
  115. f = {}
  116. for k, v in (sc or {}).items():
  117. if isinstance(v, dict):
  118. for kk, vv in v.items():
  119. try: f[kk] = int(vv)
  120. except Exception: pass
  121. else:
  122. try: f[k] = int(v)
  123. except Exception: pass
  124. return f
  125. def _recency_hard(date_str):
  126. """按 publish_timestamp 头 10 字符(YYYY-MM-DD)算硬时效:半年内=3 / 两年内=2 / 更早=1。
  127. 取代原 LLM 评的 recency 维度——脚本算更稳,发布时间在帖子抓取时就有,无需 LLM token。
  128. """
  129. try:
  130. d = datetime.strptime((date_str or "")[:10], "%Y-%m-%d")
  131. except (ValueError, TypeError):
  132. return None
  133. days = (datetime.now() - d).days
  134. if days <= 180: return 3
  135. if days <= 730: return 2
  136. return 1
  137. def adapt(r, run, form_name=None):
  138. p = r.get("post", {}); e = r.get("llm_evaluation", {})
  139. # 1. 解析 知识类型 (knowledge_type)
  140. kt = []
  141. kt_raw = e.get("知识类型") or e.get("knowledge_type") or []
  142. for k in kt_raw:
  143. if k in ("工序", "procedure"): kt.append("procedure")
  144. elif k in ("能力", "步骤", "step"): kt.append("step")
  145. elif k in ("工具", "tool"): kt.append("tool")
  146. fs = {}
  147. score_reasons = {}
  148. # 检测是否为 eval_prompt_sample-mod 里的新版 0-10 分数 schema
  149. is_mod_schema = "相关性" in e and isinstance(e["相关性"], dict) and ("和内容制作知识相关" in e["相关性"] or "和 query 相关" in e["相关性"])
  150. if is_mod_schema:
  151. # 新版 0-10 分数格式解析
  152. # 1. 相关性
  153. rel = e.get("相关性") or {}
  154. for subkey, item in rel.items():
  155. if isinstance(item, dict):
  156. score_val = item.get("得分")
  157. reason_val = item.get("理由")
  158. code_key = None
  159. if "内容制作" in subkey or "知识" in subkey:
  160. code_key = "relevance_production"
  161. elif "query" in subkey or "检索" in subkey:
  162. code_key = "relevance_query"
  163. if code_key and score_val is not None:
  164. try:
  165. fs[code_key] = float(score_val)
  166. if reason_val:
  167. score_reasons[code_key] = reason_val
  168. except Exception:
  169. pass
  170. # 2. 质量
  171. q_block = e.get("质量") or {}
  172. fixed = q_block.get("固定维度") or {}
  173. # 固定维度
  174. fixed_keys = {
  175. "时效性": "recency",
  176. "热度性": "popularity",
  177. "评论反馈": "feedback"
  178. }
  179. for cn, code in fixed_keys.items():
  180. item = fixed.get(cn)
  181. if isinstance(item, dict):
  182. score_val = item.get("得分")
  183. reason_val = item.get("理由")
  184. if score_val is not None:
  185. try:
  186. fs[code] = float(score_val)
  187. if reason_val:
  188. score_reasons[code] = reason_val
  189. except Exception:
  190. pass
  191. # 用例 (真实感, 表现力)
  192. usecase = fixed.get("用例") or {}
  193. usecase_keys = {
  194. "真实感": "realism",
  195. "表现力": "expressiveness"
  196. }
  197. for cn, code in usecase_keys.items():
  198. item = usecase.get(cn)
  199. if isinstance(item, dict):
  200. score_val = item.get("得分")
  201. reason_val = item.get("理由")
  202. if score_val is not None:
  203. try:
  204. fs[code] = float(score_val)
  205. if reason_val:
  206. score_reasons[code] = reason_val
  207. except Exception:
  208. pass
  209. # 动态维度
  210. dynamic = q_block.get("动态维度") or {}
  211. # 工序
  212. proc = dynamic.get("工序") or {}
  213. if proc:
  214. item = proc.get("流程完整性")
  215. if isinstance(item, dict):
  216. score_val = item.get("得分")
  217. reason_val = item.get("理由")
  218. if score_val is not None:
  219. try:
  220. fs["procedure_completeness"] = float(score_val)
  221. if reason_val:
  222. score_reasons["procedure_completeness"] = reason_val
  223. except Exception:
  224. pass
  225. field = proc.get("字段完整性") or {}
  226. field_keys = {
  227. "输入完整性": "procedure_input",
  228. "实现完整性": "procedure_implementation",
  229. "输出完整性": "procedure_output"
  230. }
  231. for cn, code in field_keys.items():
  232. item = field.get(cn)
  233. if isinstance(item, dict):
  234. score_val = item.get("得分")
  235. reason_val = item.get("理由")
  236. if score_val is not None:
  237. try:
  238. fs[code] = float(score_val)
  239. if reason_val:
  240. score_reasons[code] = reason_val
  241. except Exception:
  242. pass
  243. item = proc.get("泛化性")
  244. if isinstance(item, dict):
  245. score_val = item.get("得分")
  246. reason_val = item.get("理由")
  247. if score_val is not None:
  248. try:
  249. fs["procedure_generality"] = float(score_val)
  250. if reason_val:
  251. score_reasons["procedure_generality"] = reason_val
  252. except Exception:
  253. pass
  254. # 能力
  255. cap = dynamic.get("能力") or dynamic.get("步骤") or {}
  256. if cap:
  257. field = cap.get("字段完整性") or {}
  258. field_keys = {
  259. "输入完整性": "step_input",
  260. "实现完整性": "step_implementation",
  261. "输出完整性": "step_output"
  262. }
  263. for cn, code in field_keys.items():
  264. item = field.get(cn)
  265. if isinstance(item, dict):
  266. score_val = item.get("得分")
  267. reason_val = item.get("理由")
  268. if score_val is not None:
  269. try:
  270. fs[code] = float(score_val)
  271. if reason_val:
  272. score_reasons[code] = reason_val
  273. except Exception:
  274. pass
  275. item = cap.get("泛化性")
  276. if isinstance(item, dict):
  277. score_val = item.get("得分")
  278. reason_val = item.get("理由")
  279. if score_val is not None:
  280. try:
  281. fs["step_generality"] = float(score_val)
  282. if reason_val:
  283. score_reasons["step_generality"] = reason_val
  284. except Exception:
  285. pass
  286. # 工具
  287. tool = dynamic.get("工具") or {}
  288. if tool:
  289. tool_keys = {
  290. "能力边界覆盖": "tool_boundary",
  291. "有效比较": "tool_comparison",
  292. "参数/接口具体性": "tool_specificity",
  293. "实操示例": "tool_example",
  294. "版本&限制": "tool_limits"
  295. }
  296. for cn, code in tool_keys.items():
  297. item = tool.get(cn)
  298. if isinstance(item, dict):
  299. score_val = item.get("得分")
  300. reason_val = item.get("理由")
  301. if score_val is not None:
  302. try:
  303. fs[code] = float(score_val)
  304. if reason_val:
  305. score_reasons[code] = reason_val
  306. except Exception:
  307. pass
  308. else:
  309. # 兼容老版 1-5 分数 schema (带 "评分" 或 old-style flatness)
  310. is_new_schema = "评分" in e or "知识类型" in e or "制作相关性" in e
  311. CN_TO_EN = {
  312. "相关性": "relevance",
  313. "成品质量": "result_quality",
  314. "可信度": "credibility",
  315. "具体用例": "concrete_use_case",
  316. "完整性": "completeness",
  317. "步骤结构": "step_structure",
  318. "步骤可复现": "step_reproducibility",
  319. "步骤可复现性": "step_reproducibility",
  320. "能力定义": "capability_definition",
  321. "实现深度": "implementation_depth",
  322. "边界失败": "boundary_failure_eval",
  323. "通用性": "generality",
  324. "能力覆盖": "capability_coverage",
  325. "有效对比": "effective_comparison",
  326. "参数具体": "param_specificity",
  327. "实操示例": "worked_example",
  328. "实操用例": "worked_example",
  329. "示例完整": "worked_example",
  330. "版本限制": "version_limits",
  331. "版本说明": "version_limits",
  332. "限制说明": "version_limits",
  333. }
  334. if is_new_schema:
  335. pf = e.get("评分") or {}
  336. for cat, metrics in pf.items():
  337. if isinstance(metrics, dict):
  338. for metric, val in metrics.items():
  339. en_key = CN_TO_EN.get(metric, metric)
  340. if isinstance(val, dict) and "得分" in val:
  341. try: fs[en_key] = int(val["得分"])
  342. except Exception: pass
  343. elif isinstance(val, (int, float)):
  344. fs[en_key] = int(val)
  345. if isinstance(val, dict) and "理由" in val:
  346. score_reasons[en_key] = val["理由"]
  347. else:
  348. fs = flat_scores(e.get("scores", {}))
  349. # 计算均分 (overall)
  350. if is_mod_schema:
  351. rel_keys = {"relevance_production", "relevance_query"}
  352. rel_vals = [v for k, v in fs.items() if k in rel_keys]
  353. qual_vals = [v for k, v in fs.items() if k not in rel_keys]
  354. rel_avg = sum(rel_vals) / len(rel_vals) if rel_vals else None
  355. qual_avg = sum(qual_vals) / len(qual_vals) if qual_vals else None
  356. if rel_avg is not None and qual_avg is not None:
  357. overall = round((rel_avg + qual_avg) / 2, 1)
  358. elif rel_avg is not None:
  359. overall = round(rel_avg, 1)
  360. elif qual_avg is not None:
  361. overall = round(qual_avg, 1)
  362. else:
  363. overall = 0.0
  364. else:
  365. overall = round(sum(fs.values()) / len(fs), 1) if fs else 0
  366. anomaly = bool(e.get("error")) or not fs
  367. grade = p.get("_quality_grade", "")
  368. fb = r.get("found_by_queries", [])
  369. # 4. 解析 制作相关性 (production_relevance)
  370. if is_mod_schema:
  371. # 新版使用 "相关性" 中的 "和内容制作知识相关" 代表制作相关性
  372. production_relevance = fs.get("relevance_production")
  373. else:
  374. if is_new_schema:
  375. pr_block = e.get("制作相关性") or {}
  376. pr_raw = pr_block.get("得分") if isinstance(pr_block, dict) else pr_block
  377. if isinstance(pr_block, dict) and "理由" in pr_block:
  378. score_reasons["production_relevance"] = pr_block["理由"]
  379. else:
  380. pr_raw = e.get("production_relevance")
  381. try: production_relevance = int(float(pr_raw)) if pr_raw is not None else None
  382. except (TypeError, ValueError): production_relevance = None
  383. recency_hard = _recency_hard(p.get("publish_timestamp", ""))
  384. # 5. 解析 判定决策 (decision) 和 理由 (reason)
  385. reason = e.get("判定理由") or e.get("reason") or ""
  386. # 根据过滤指标决定是否保留 (过滤指标判定逻辑优先,不依赖文字匹配)
  387. is_discard = False
  388. # 制作相关性低于阈值则丢弃 (新版 0-10 满分,因此低于 4 丢弃;老版低于 2 丢弃)
  389. if production_relevance is not None:
  390. threshold = 4 if is_mod_schema else 2
  391. if production_relevance < threshold:
  392. is_discard = True
  393. # 时效性低于 2 被丢弃(发布时间超两年的老帖)
  394. if recency_hard is not None and recency_hard < 2:
  395. is_discard = True
  396. # 综合均分低于阈值被丢弃 (新版低于 6 丢弃;老版低于 3 丢弃)
  397. if overall is not None:
  398. threshold_ov = 6 if is_mod_schema else 3
  399. if overall < threshold_ov:
  400. is_discard = True
  401. decision = "discard" if is_discard else "report"
  402. # Find matching procedure html
  403. procedure_html = None
  404. case_id = r.get("case_id", "")
  405. title = p.get("title", "")
  406. run_dir = HERE / "runs_full" / run
  407. if run_dir.is_dir():
  408. # 1. 优先扫描该帖子对应的文件夹下的任何 HTML 文件 (不限名称)
  409. # 文件夹名格式: {form}_{platform}_{channel_content_id[:8]}
  410. content_id = r.get("channel_content_id") or ""
  411. if not content_id and case_id and "_" in case_id:
  412. content_id = case_id.split("_", 1)[1]
  413. plat_key = r.get("platform") or ""
  414. if form_name and plat_key and content_id:
  415. folder_name = f"{form_name}_{plat_key}_{content_id[:8]}"
  416. case_dir = run_dir / "procedures" / folder_name
  417. if case_dir.is_dir():
  418. html_files = list(case_dir.glob("*.html"))
  419. if html_files:
  420. procedure_html = f"runs_full/{run}/procedures/{folder_name}/{html_files[0].name}"
  421. # 2. 其次匹配标准文件名: case-{case_id}.html 或 {case_id}.html
  422. candidate_dirs = [run_dir, run_dir / "procedures"]
  423. if not procedure_html and case_id:
  424. named_files = [f"case-{case_id}.html", f"{case_id}.html"]
  425. for d_dir in candidate_dirs:
  426. if d_dir.is_dir():
  427. for name in named_files:
  428. if (d_dir / name).is_file():
  429. procedure_html = f"runs_full/{run}/procedures/{name}" if d_dir.name == "procedures" else f"runs_full/{run}/{name}"
  430. break
  431. if procedure_html:
  432. break
  433. # 3. 再次匹配 HTML 内部的标准声明 (meta 标签或 HTML 注释)
  434. if not procedure_html and case_id:
  435. for d_dir in candidate_dirs:
  436. if d_dir.is_dir():
  437. for html_path in d_dir.glob("*.html"):
  438. try:
  439. content = html_path.read_text(encoding="utf-8")
  440. if f'name="case-id" content="{case_id}"' in content or \
  441. f'name="case_id" content="{case_id}"' in content or \
  442. f'<!-- case_id: {case_id} -->' in content or \
  443. f'<!-- case-id: {case_id} -->' in content:
  444. procedure_html = f"runs_full/{run}/procedures/{html_path.name}" if d_dir.name == "procedures" else f"runs_full/{run}/{html_path.name}"
  445. break
  446. except Exception:
  447. continue
  448. if procedure_html:
  449. break
  450. # 4. 最后使用标题作为兜底模糊匹配
  451. if not procedure_html and title:
  452. for d_dir in candidate_dirs:
  453. if d_dir.is_dir():
  454. for html_path in d_dir.glob("*.html"):
  455. try:
  456. content = html_path.read_text(encoding="utf-8")
  457. if title in content:
  458. procedure_html = f"runs_full/{run}/procedures/{html_path.name}" if d_dir.name == "procedures" else f"runs_full/{run}/{html_path.name}"
  459. break
  460. except Exception:
  461. continue
  462. if procedure_html:
  463. break
  464. return {
  465. "case_id": r.get("case_id", ""),
  466. "platform": PLAT.get(r.get("platform"), r.get("platform")), "platformKey": r.get("platform"),
  467. "title": p.get("title", "") or "(无标题)", "date": (p.get("publish_timestamp", "") or "")[:10],
  468. "url": r.get("source_url", ""), "engagement": f'{p.get("like_count", 0)} 赞',
  469. "knowledge_type": kt, "decision": decision,
  470. "tools": [KT.get(k, k) for k in kt] + ([f"质量 {grade}"] if grade else []), "found_by": fb,
  471. "images": (p.get("images") or [])[:6], "text": p.get("body_text", "") or "",
  472. "scores": fs, "overall": overall, "reason": reason, "score_reasons": score_reasons,
  473. "grade": grade, "qscore": p.get("_quality_score", 0), "anomaly": anomaly,
  474. "production_relevance": production_relevance, "recency_hard": recency_hard,
  475. "run": run, "procedure_html": procedure_html,
  476. }
  477. def scan_runs():
  478. runs = {}
  479. for f in sorted(glob.glob(str(HERE / "runs_full" / "*" / "form_*.json"))):
  480. try:
  481. d = json.load(open(f, encoding="utf-8"))
  482. except Exception:
  483. continue
  484. run = pathlib.Path(f).parent.name
  485. form_name = d.get("form") or ""
  486. results = [adapt(r, run, form_name) for r in d.get("results", [])]
  487. report_val = sum(1 for r in results if r.get("decision") == "report" and not r.get("anomaly"))
  488. discard_val = sum(1 for r in results if r.get("decision") == "discard" and not r.get("anomaly"))
  489. runs.setdefault(run, []).append({
  490. "form": d.get("form"), "query": d.get("query"), "original_q": d.get("original_q", ""),
  491. "requirement": d.get("requirement", ""),
  492. "platforms": d.get("platforms", []), "total": d.get("total"),
  493. "report": report_val, "discard": discard_val,
  494. "results": results,
  495. })
  496. for v in runs.values():
  497. v.sort(key=lambda x: x.get("form") or "")
  498. def _qnum(name): # "q156" → 156,按数字排,避免 "q156" < "q99" 的字符串误排
  499. m = re.search(r"\d+", name)
  500. return (int(m.group()) if m else 0, name)
  501. out = []
  502. for k, v in sorted(runs.items(), key=lambda kv: _qnum(kv[0])):
  503. oq = v[0].get("original_q") or v[0].get("query") or ""
  504. seen, hits = set(), 0 # 知识命中数 = 各形式采纳(report)且非异常、按 url 去重后的帖子数
  505. for f in v:
  506. for r in f.get("results", []):
  507. if r.get("decision") == "report" and not r.get("anomaly") and r.get("url") not in seen:
  508. seen.add(r.get("url")); hits += 1
  509. out.append({"key": k, "forms": v, "dims": parse_dims(oq), "original_q": oq,
  510. "hits": hits, "tot": sum((f.get("total") or 0) for f in v)})
  511. active_reevals = {k: v["status"] for k, v in ACTIVE_REEVALS.items()}
  512. return {"queries": out, "actions": ACTIONS_TAX, "types": TYPES_TAX, "matrix": _MATRIX, "active_reevals": active_reevals}
  513. class H(BaseHTTPRequestHandler):
  514. def _send(self, code, body, ctype):
  515. b = body.encode("utf-8") if isinstance(body, str) else body
  516. self.send_response(code)
  517. if ctype.startswith("text/") or ctype == "application/json" or ctype == "application/javascript":
  518. self.send_header("Content-Type", ctype + "; charset=utf-8")
  519. else:
  520. self.send_header("Content-Type", ctype)
  521. self.send_header("Content-Length", str(len(b))); self.end_headers(); self.wfile.write(b)
  522. def do_GET(self):
  523. parsed = urlparse(self.path)
  524. path = parsed.path
  525. params = parse_qs(parsed.query)
  526. if path in ("/", "/index.html"):
  527. try:
  528. page = (HERE / "index.html").read_text(encoding="utf-8")
  529. self._send(200, page, "text/html")
  530. except Exception as e:
  531. self._send(500, f"Error reading index.html: {e}", "text/plain")
  532. elif path == "/api/data":
  533. self._send(200, json.dumps(scan_runs(), ensure_ascii=False), "application/json")
  534. elif path == "/api/procedure_status":
  535. q = (params.get("q") or [""])[0].strip()
  536. form = (params.get("form") or [""])[0].strip()
  537. case_id = (params.get("case_id") or [""])[0].strip()
  538. if not q or not form or not case_id:
  539. self._send(400, "missing q, form, or case_id", "text/plain")
  540. return
  541. folder_name = f"{form}_{_short_case(case_id)}"
  542. task_key = f"{q}/{folder_name}"
  543. if task_key in ACTIVE_TASKS:
  544. task = ACTIVE_TASKS[task_key]
  545. res = {
  546. "status": task["status"],
  547. "error": task["error"]
  548. }
  549. if task["status"] == "success":
  550. out_dir = HERE / "runs_full" / q / "procedures" / folder_name
  551. html_files = list(out_dir.glob("*.html")) if out_dir.is_dir() else []
  552. if html_files:
  553. res["procedure_html"] = f"runs_full/{q}/procedures/{folder_name}/{html_files[0].name}"
  554. self._send(200, json.dumps(res, ensure_ascii=False), "application/json")
  555. return
  556. out_dir = HERE / "runs_full" / q / "procedures" / folder_name
  557. html_files = list(out_dir.glob("*.html")) if out_dir.is_dir() else []
  558. if html_files:
  559. self._send(200, json.dumps({
  560. "status": "success",
  561. "procedure_html": f"runs_full/{q}/procedures/{folder_name}/{html_files[0].name}"
  562. }, ensure_ascii=False), "application/json")
  563. return
  564. log_path = out_dir / "_extract.log"
  565. if log_path.is_file():
  566. self._send(200, json.dumps({"status": "failed", "error": "Not running, but no HTML output found (possibly crashed)."}, ensure_ascii=False), "application/json")
  567. return
  568. self._send(200, json.dumps({"status": "not_started"}, ensure_ascii=False), "application/json")
  569. elif path == "/api/procedure_log":
  570. q = (params.get("q") or [""])[0].strip()
  571. form = (params.get("form") or [""])[0].strip()
  572. case_id = (params.get("case_id") or [""])[0].strip()
  573. if not q or not form or not case_id:
  574. self._send(400, "missing q, form, or case_id", "text/plain")
  575. return
  576. folder_name = f"{form}_{_short_case(case_id)}"
  577. log_path = HERE / "runs_full" / q / "procedures" / folder_name / "_extract.log"
  578. if not log_path.is_file():
  579. self._send(200, json.dumps({"log": ""}, ensure_ascii=False), "application/json")
  580. return
  581. try:
  582. content = log_path.read_text(encoding="utf-8", errors="replace")
  583. self._send(200, json.dumps({"log": content}, ensure_ascii=False), "application/json")
  584. except Exception as e:
  585. self._send(500, json.dumps({"error": str(e)}, ensure_ascii=False), "application/json")
  586. elif path == "/api/spec_content":
  587. file_name = (params.get("file") or [""])[0].strip()
  588. allowed = [
  589. "README.md",
  590. "tools.md",
  591. "extraction/phase1-skeleton.md",
  592. "extraction/phase2-normalize.md",
  593. "extraction/phase3-finalize.md",
  594. "taxonomy/type_suggestions.md"
  595. ]
  596. if file_name not in allowed:
  597. self._send(400, "invalid file parameter", "text/plain")
  598. return
  599. target_path = HERE / "procedure-dsl" / "spec" / file_name
  600. if not target_path.is_file():
  601. self._send(404, "spec file not found", "text/plain")
  602. return
  603. try:
  604. content = target_path.read_text(encoding="utf-8", errors="replace")
  605. self._send(200, json.dumps({"content": content}, ensure_ascii=False), "application/json")
  606. except Exception as e:
  607. self._send(500, json.dumps({"error": str(e)}, ensure_ascii=False), "application/json")
  608. elif path == "/api/reeval_status":
  609. q = (params.get("q") or [""])[0].strip()
  610. if not q:
  611. self._send(400, "missing q", "text/plain")
  612. return
  613. if q in ACTIVE_REEVALS:
  614. self._send(200, json.dumps({
  615. "status": ACTIVE_REEVALS[q]["status"],
  616. "error": ACTIVE_REEVALS[q].get("error")
  617. }, ensure_ascii=False), "application/json")
  618. else:
  619. self._send(200, json.dumps({"status": "not_started"}, ensure_ascii=False), "application/json")
  620. elif self.path.startswith("/runs_full/"):
  621. try:
  622. clean_path = self.path.split("?")[0]
  623. parts = clean_path.strip("/").split("/")
  624. target_file = HERE
  625. for part in parts:
  626. target_file = target_file / part
  627. runs_dir = HERE / "runs_full"
  628. if runs_dir.resolve() in target_file.resolve().parents and target_file.is_file():
  629. content = target_file.read_bytes()
  630. ext = target_file.suffix.lower()
  631. ctype = "text/html"
  632. if ext in (".png", ".webp"):
  633. ctype = f"image/{ext[1:]}"
  634. elif ext in (".jpg", ".jpeg"):
  635. ctype = "image/jpeg"
  636. elif ext == ".json":
  637. ctype = "application/json"
  638. elif ext == ".js":
  639. ctype = "application/javascript"
  640. elif ext == ".css":
  641. ctype = "text/css"
  642. self._send(200, content, ctype)
  643. else:
  644. self._send(404, "not found", "text/plain")
  645. except Exception as e:
  646. self._send(500, f"Error: {e}", "text/plain")
  647. else:
  648. self._send(404, "not found", "text/plain")
  649. def do_POST(self):
  650. if self.path == "/api/generate_procedure":
  651. length = int(self.headers.get("Content-Length") or 0)
  652. raw = self.rfile.read(length).decode("utf-8") if length > 0 else "{}"
  653. try:
  654. payload = json.loads(raw)
  655. except Exception as e:
  656. self._send(400, json.dumps({"error": f"bad json: {e}"}), "application/json"); return
  657. q = (payload.get("q") or "").strip()
  658. form = (payload.get("form") or "").strip()
  659. case_id = (payload.get("case_id") or "").strip()
  660. engine = (payload.get("engine") or "cyber_runner").strip()
  661. model = (payload.get("model") or "google/gemini-3.1-flash-lite").strip()
  662. if not re.match(r"^q\d+$", q):
  663. self._send(400, json.dumps({"error": f"bad q (expect 'qNN'): {q!r}"}, ensure_ascii=False), "application/json"); return
  664. if form not in ("A", "B", "C"):
  665. self._send(400, json.dumps({"error": f"bad form: {form!r}"}, ensure_ascii=False), "application/json"); return
  666. if not case_id:
  667. self._send(400, json.dumps({"error": "missing case_id"}, ensure_ascii=False), "application/json"); return
  668. q_dir = HERE / "runs_full" / q
  669. form_file = q_dir / f"form_{form}.json"
  670. if not form_file.is_file():
  671. self._send(404, json.dumps({"error": f"form file not found: {form_file.name}"}, ensure_ascii=False), "application/json"); return
  672. try:
  673. with open(form_file, encoding="utf-8") as f:
  674. form_data = json.load(f)
  675. except Exception as e:
  676. self._send(500, json.dumps({"error": f"failed to read form: {e}"}, ensure_ascii=False), "application/json"); return
  677. matching_result = None
  678. for r in form_data.get("results", []):
  679. if r.get("case_id") == case_id:
  680. matching_result = r
  681. break
  682. if not matching_result:
  683. self._send(404, json.dumps({"error": f"case_id {case_id} not found in form {form}"}, ensure_ascii=False), "application/json"); return
  684. folder_name = f"{form}_{_short_case(case_id)}"
  685. out_dir = q_dir / "procedures" / folder_name
  686. out_dir.mkdir(parents=True, exist_ok=True)
  687. src_path = out_dir / "_source.json"
  688. try:
  689. with open(src_path, "w", encoding="utf-8") as f:
  690. json.dump(_source_to_dsl_input(matching_result), f, ensure_ascii=False, indent=2)
  691. score = _composite_score(matching_result.get("llm_evaluation") or {})
  692. _write_meta(out_dir, case_id=case_id, from_q=q, form=form, score=score)
  693. except Exception as e:
  694. self._send(500, json.dumps({"error": f"failed to write inputs: {e}"}, ensure_ascii=False), "application/json"); return
  695. task_key = f"{q}/{folder_name}"
  696. ACTIVE_TASKS[task_key] = {
  697. "status": "running",
  698. "start_time": datetime.now().isoformat(),
  699. "pid": None,
  700. "error": None
  701. }
  702. t = threading.Thread(target=run_extraction_task, args=(q, folder_name, src_path, out_dir, engine, model))
  703. t.daemon = True
  704. t.start()
  705. self._send(200, json.dumps({
  706. "status": "started",
  707. "task_key": task_key,
  708. "log": f"runs_full/{q}/procedures/{folder_name}/_extract.log"
  709. }, ensure_ascii=False), "application/json")
  710. elif self.path == "/api/reeval":
  711. length = int(self.headers.get("Content-Length") or 0)
  712. raw = self.rfile.read(length).decode("utf-8") if length > 0 else "{}"
  713. try:
  714. payload = json.loads(raw)
  715. except Exception as e:
  716. self._send(400, json.dumps({"error": f"bad json: {e}"}), "application/json"); return
  717. q = (payload.get("q") or "").strip()
  718. if not re.match(r"^q\d+$", q):
  719. self._send(400, json.dumps({"error": f"bad q (expect 'qNN'): {q!r}"},
  720. ensure_ascii=False), "application/json"); return
  721. q_dir = HERE / "runs_full" / q
  722. if not q_dir.is_dir():
  723. self._send(404, json.dumps({"error": f"runs_full/{q} not found"}, ensure_ascii=False),
  724. "application/json"); return
  725. log_path = q_dir / "_reeval.log"
  726. try:
  727. log_fh = open(log_path, "w", encoding="utf-8", buffering=1)
  728. cmd = [sys.executable, "-u", str(HERE / "batch_3forms.py"),
  729. "--reeval", "--reeval-q", q, "--output-dir", str(HERE / "runs_full")]
  730. flags = subprocess.CREATE_NEW_PROCESS_GROUP if sys.platform == "win32" else 0
  731. proc = subprocess.Popen(cmd, stdout=log_fh, stderr=subprocess.STDOUT,
  732. cwd=str(HERE), creationflags=flags)
  733. ACTIVE_REEVALS[q] = {
  734. "status": "running",
  735. "pid": proc.pid,
  736. "error": None
  737. }
  def wait_reeval(q_key, p_obj, fh):
      """Wait for a re-evaluation subprocess and record how it ended.

      Meant to be used as a thread target: it blocks until the process
      finishes, then updates the matching ``ACTIVE_REEVALS`` entry.

      Args:
          q_key: Key of the ``ACTIVE_REEVALS`` entry to update.
          p_obj: Process object exposing ``wait()`` and ``returncode``.
          fh: Log file handle handed to the subprocess; always closed here.

      Outcome written to the entry:
          * exit code 0      -> ``status = "success"``
          * non-zero exit    -> ``status = "failed"`` plus an ``error`` text
          * exception raised -> ``status = "failed"``, ``error = str(ex)``
      """
      try:
          p_obj.wait()
          exit_code = p_obj.returncode
          record = ACTIVE_REEVALS[q_key]
          if exit_code != 0:
              record["status"] = "failed"
              record["error"] = f"Subprocess exited with code {exit_code}"
          else:
              record["status"] = "success"
      except Exception as ex:
          # Waiting or bookkeeping failed; surface the reason on the entry.
          record = ACTIVE_REEVALS[q_key]
          record["status"] = "failed"
          record["error"] = str(ex)
      finally:
          # Best-effort close: a failure here must not mask the outcome above.
          try:
              fh.close()
          except Exception:
              pass
  754. t = threading.Thread(target=wait_reeval, args=(q, proc, log_fh))
  755. t.daemon = True
  756. t.start()
  757. self._send(200, json.dumps(
  758. {"status": "started", "pid": proc.pid, "q": q,
  759. "log": str(log_path.relative_to(HERE))},
  760. ensure_ascii=False), "application/json")
  761. except Exception as e:
  762. self._send(500, json.dumps({"error": f"failed to start: {e}"},
  763. ensure_ascii=False), "application/json")
  764. elif self.path == "/api/save_spec":
  765. length = int(self.headers.get("Content-Length") or 0)
  766. raw = self.rfile.read(length).decode("utf-8") if length > 0 else "{}"
  767. try:
  768. payload = json.loads(raw)
  769. except Exception as e:
  770. self._send(400, json.dumps({"error": f"bad json: {e}"}), "application/json"); return
  771. file_name = (payload.get("file") or "").strip()
  772. content = payload.get("content") or ""
  773. allowed = [
  774. "README.md",
  775. "tools.md",
  776. "extraction/phase1-skeleton.md",
  777. "extraction/phase2-normalize.md",
  778. "extraction/phase3-finalize.md",
  779. "taxonomy/type_suggestions.md"
  780. ]
  781. if file_name not in allowed:
  782. self._send(400, json.dumps({"error": "invalid file parameter"}), "application/json"); return
  783. target_path = HERE / "procedure-dsl" / "spec" / file_name
  784. try:
  785. target_path.parent.mkdir(parents=True, exist_ok=True)
  786. target_path.write_text(content, encoding="utf-8")
  787. self._send(200, json.dumps({"status": "ok"}, ensure_ascii=False), "application/json")
  788. except Exception as e:
  789. self._send(500, json.dumps({"error": str(e)}, ensure_ascii=False), "application/json")
  790. else:
  791. self._send(404, json.dumps({"error": "not found"}), "application/json")
  def log_message(self, *a):
      """Discard log output: accept any positional arguments and do nothing.

      NOTE(review): presumably overrides ``BaseHTTPRequestHandler.log_message``
      to keep per-request lines off stderr — confirm against the class header.
      """
      return None
  if __name__ == "__main__":
      # Number of entries under the "queries" key returned by scan_runs();
      # used only for the startup banner below.
      n = len(scan_runs()["queries"])
      print(f"搜索评估查看 server:http://0.0.0.0:{PORT} (runs_full/ 下 {n} 个 query,实时扫描)")
      # Bind on all interfaces (0.0.0.0). The context manager closes the
      # listening socket as soon as serve_forever() returns or raises
      # (e.g. KeyboardInterrupt on Ctrl-C) instead of leaving the cleanup to
      # interpreter shutdown; the exception itself still propagates.
      with ThreadingHTTPServer(("0.0.0.0", PORT), H) as httpd:
          httpd.serve_forever()