# build_full_queries.py
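# -*- coding: utf-8 -*-
"""One-off script: expand the 156 high-priority cells into
156 × 3 lenses × 6 tool-type constraints = 2808 queries.

Inputs:
- judged_matrix.json (same directory): supplies the actions / types lists
  whose cross product is scanned for cells.
- type_action_scores.json (same directory): supplies the score of every
  (type, action) pair; the 156 cells are the pairs with score == 3.
- high_priority_queries.json (parent directory): the existing file, used
  only to back-fill gemini_q.

Output:
- high_priority_queries_full.json: a flat ``queries`` array ordered by
  (cell, lens, constraint).

Six constraint levels: null (unconstrained) +
{AI 模型, 桌面 APP, 云端 Web, API·CLI, 插件扩展}.
Lens suffixes: 工序→教程 / 工具→工具 / 能力→技巧.

Note: modality constraints (text / image / video / audio) are deprecated and
no longer listed. The historical modality folders under runs/ are kept as old
experiments; the new table's indexing scheme is not tied to them, and the
``unmatched`` section of runs_to_full_mapping.json flags them.
"""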
  13. import json
  14. from pathlib import Path
  15. HERE = Path(__file__).parent
  16. SRC_HP = HERE.parent / "high_priority_queries.json" # 原 468 条,用它的 gemini_q
  17. JM = HERE / "judged_matrix.json"
  18. TAS = HERE / "type_action_scores.json"
  19. OUT = HERE / "high_priority_queries_full.json"
  20. LENS_SUFFIX = {"工序": "教程", "工具": "工具", "能力": "技巧"}
  21. # (首词, constraint dict);首词为 None 表示无约束
  22. CONSTRAINTS = [
  23. (None, None),
  24. ("AI", {"kind": "工具类型", "value": "AI 模型", "限定词": "AI"}),
  25. ("软件", {"kind": "工具类型", "value": "桌面 APP", "限定词": "软件"}),
  26. ("在线", {"kind": "工具类型", "value": "云端 Web", "限定词": "在线"}),
  27. ("代码", {"kind": "工具类型", "value": "API·CLI", "限定词": "代码"}),
  28. ("插件", {"kind": "工具类型", "value": "插件扩展", "限定词": "插件"}),
  29. ]
  30. def main():
  31. jm = json.load(open(JM, encoding="utf-8"))
  32. actions = jm["actions"] # 27 条;带 name / l1 / (l2)
  33. types = jm["types"] # 50 条;带 name / l1
  34. act_l1 = {a["name"]: a["l1"] for a in actions}
  35. tas = json.load(open(TAS, encoding="utf-8"))["scores"]
  36. # 取 score==3 的 (type, action) cell,对齐 high_priority_queries.json 的"156 高优"集合
  37. cells = []
  38. for t in types:
  39. for a in actions:
  40. rec = tas.get(t["name"], {}).get(a["name"])
  41. if rec and rec.get("score") == 3:
  42. cells.append((a["name"], t["name"]))
  43. # 从原文件读 gemini_q:以 (action, type) 为键,每 cell 取首条
  44. gemini_by_cell = {}
  45. if SRC_HP.exists():
  46. for it in json.load(open(SRC_HP, encoding="utf-8"))["queries"]:
  47. key = (it["action"], it["type"])
  48. gemini_by_cell.setdefault(key, it.get("gemini_q", ""))
  49. # 生成:cell × lens × constraint
  50. queries = []
  51. for action, type_ in cells:
  52. for lens, suffix in LENS_SUFFIX.items():
  53. for qual, cons in CONSTRAINTS:
  54. tokens = [qual, action, type_, suffix] if qual else [action, type_, suffix]
  55. queries.append({
  56. "q": " ".join(tokens),
  57. "lens": lens,
  58. "type": type_,
  59. "action": action,
  60. "action_l1": act_l1.get(action, ""),
  61. "score": 3,
  62. "constraint": cons,
  63. "gemini_q": gemini_by_cell.get((action, type_), ""),
  64. })
  65. out = {
  66. "_doc": (
  67. "高优先级 query 全展开:156 cell × 3 lens × 6 工具类型约束 = 2808 条。"
  68. "lens 后缀 工序→教程 / 工具→工具 / 能力→技巧;约束 6 档 = 无约束 + 5 工具类型。"
  69. "模态约束已废弃,runs/ 下历史模态 q 文件夹不在本索引内。"
  70. "gemini_q 按 (action, type) cell 复用同一句,跨 lens / 约束不变。"
  71. ),
  72. "model": "gemini-3.1-flash-lite",
  73. "threshold": 3,
  74. "lenses": list(LENS_SUFFIX),
  75. "tool_constraints": [c["value"] if c else None for _, c in CONSTRAINTS],
  76. "cells": len(cells),
  77. "per_cell": len(LENS_SUFFIX) * len(CONSTRAINTS), # 18
  78. "total": len(queries),
  79. "queries": queries,
  80. }
  81. with open(OUT, "w", encoding="utf-8") as f:
  82. json.dump(out, f, ensure_ascii=False, indent=2)
  83. # 自检:156 × 3 × 6 = 2808
  84. assert len(cells) == 156, f"cells={len(cells)} 不是 156,检查 type_action_scores 阈值"
  85. expected = 156 * 3 * 6
  86. assert len(queries) == expected, f"queries={len(queries)} != {expected}"
  87. # 抽样打印
  88. print(f"✅ cells={len(cells)} queries={len(queries)} -> {OUT}")
  89. print()
  90. print("== 抽样:第 1 个 cell 的 18 条 ==")
  91. for q in queries[:18]:
  92. cv = (q["constraint"] or {}).get("value", "")
  93. print(f" [{q['lens']}/{cv or '无约束':6}] {q['q']}")
# Run the expansion only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()