# -*- coding: utf-8 -*-
"""One-off script: expand the high-priority cells into a flat query list.

Every (action, type) cell whose score equals 3 is expanded into
3 lenses x 6 tool-type constraints = 18 queries. With the expected 156 cells
that is 156 x 3 x 6 = 2808 queries.

Inputs:
- judged_matrix.json (same directory): supplies the ``actions`` and ``types``
  lists.
- type_action_scores.json (same directory): supplies the score of each
  (type, action) pair; pairs with score == 3 become cells.
- high_priority_queries.json (parent directory, optional): used to back-fill
  ``gemini_q`` per cell.

Output:
- high_priority_queries_full.json: a flat ``queries`` array ordered by
  (cell, lens, constraint).

Constraints come in 6 levels: None (unconstrained) plus the five tool types
listed in ``CONSTRAINTS``.
Lens suffixes are given by ``LENS_SUFFIX``.

Note from the original author: modality constraints (text / image / video /
audio) are deprecated and no longer listed. Historical modality folders under
runs/ are kept as old experiments and are not tied to the new index scheme;
the ``unmatched`` section of runs_to_full_mapping.json flags them.
"""
import json
from pathlib import Path

HERE = Path(__file__).parent
# Original file (468 entries per the original note); only its gemini_q is reused.
SRC_HP = HERE.parent / "high_priority_queries.json"
JM = HERE / "judged_matrix.json"
TAS = HERE / "type_action_scores.json"
OUT = HERE / "high_priority_queries_full.json"

LENS_SUFFIX = {"工序": "教程", "工具": "工具", "能力": "技巧"}

# (leading word, constraint dict); a leading word of None means "no constraint".
CONSTRAINTS = [
    (None, None),
    ("AI", {"kind": "工具类型", "value": "AI 模型", "限定词": "AI"}),
    ("软件", {"kind": "工具类型", "value": "桌面 APP", "限定词": "软件"}),
    ("在线", {"kind": "工具类型", "value": "云端 Web", "限定词": "在线"}),
    ("代码", {"kind": "工具类型", "value": "API·CLI", "限定词": "代码"}),
    ("插件", {"kind": "工具类型", "value": "插件扩展", "限定词": "插件"}),
]

# Score a (type, action) pair must have to count as a high-priority cell.
_SCORE_THRESHOLD = 3


def _load_json(path):
    """Parse a UTF-8 JSON file and close the handle before returning."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _select_cells(types, actions, scores):
    """Return (action name, type name) pairs scoring exactly the threshold.

    Iteration is type-major (outer loop over ``types``, inner over
    ``actions``), which fixes the order of the generated queries.
    """
    cells = []
    for t in types:
        per_type = scores.get(t["name"], {})
        for a in actions:
            rec = per_type.get(a["name"])
            if rec and rec.get("score") == _SCORE_THRESHOLD:
                cells.append((a["name"], t["name"]))
    return cells


def _build_queries(cells, act_l1, gemini_by_cell):
    """Expand each cell into one query per (lens, constraint) combination."""
    queries = []
    for action, type_ in cells:
        action_l1 = act_l1.get(action, "")
        gemini_q = gemini_by_cell.get((action, type_), "")
        for lens, suffix in LENS_SUFFIX.items():
            for qual, cons in CONSTRAINTS:
                # The qualifier word is prepended only for constrained variants.
                tokens = [qual, action, type_, suffix] if qual else [action, type_, suffix]
                queries.append({
                    "q": " ".join(tokens),
                    "lens": lens,
                    "type": type_,
                    "action": action,
                    "action_l1": action_l1,
                    "score": _SCORE_THRESHOLD,
                    "constraint": cons,
                    "gemini_q": gemini_q,
                })
    return queries


def main(jm_path=JM, tas_path=TAS, src_hp_path=SRC_HP, out_path=OUT,
         expected_cells=156):
    """Build the fully expanded query file and run a size self-check.

    Args:
        jm_path: JSON file with ``actions`` and ``types`` lists; each entry
            must have ``name``, and each action must also have ``l1``.
        tas_path: JSON file with a ``scores`` mapping of
            type name -> action name -> record carrying ``score``.
        src_hp_path: Optional JSON file with a ``queries`` list; the first
            ``gemini_q`` seen per (action, type) is reused. Skipped when the
            file does not exist.
        out_path: Destination of the generated JSON.
        expected_cells: Number of cells the self-check requires.

    Raises:
        AssertionError: If the number of cells or queries differs from the
            expected size. The output file is written before this check, so
            a failing run still leaves the file on disk.
    """
    jm = _load_json(jm_path)
    actions = jm["actions"]  # original note: 27 entries with name / l1 / (l2)
    types = jm["types"]      # original note: 50 entries with name / l1
    act_l1 = {a["name"]: a["l1"] for a in actions}
    scores = _load_json(tas_path)["scores"]

    cells = _select_cells(types, actions, scores)

    # Back-fill gemini_q from the original file: keyed by (action, type),
    # keeping the first entry encountered for each cell.
    gemini_by_cell = {}
    if Path(src_hp_path).exists():
        for it in _load_json(src_hp_path)["queries"]:
            key = (it["action"], it["type"])
            gemini_by_cell.setdefault(key, it.get("gemini_q", ""))

    queries = _build_queries(cells, act_l1, gemini_by_cell)
    per_cell = len(LENS_SUFFIX) * len(CONSTRAINTS)  # 18

    out = {
        "_doc": (
            "高优先级 query 全展开:156 cell × 3 lens × 6 工具类型约束 = 2808 条。"
            "lens 后缀 工序→教程 / 工具→工具 / 能力→技巧;约束 6 档 = 无约束 + 5 工具类型。"
            "模态约束已废弃,runs/ 下历史模态 q 文件夹不在本索引内。"
            "gemini_q 按 (action, type) cell 复用同一句,跨 lens / 约束不变。"
        ),
        "model": "gemini-3.1-flash-lite",
        "threshold": _SCORE_THRESHOLD,
        "lenses": list(LENS_SUFFIX),
        "tool_constraints": [c["value"] if c else None for _, c in CONSTRAINTS],
        "cells": len(cells),
        "per_cell": per_cell,
        "total": len(queries),
        "queries": queries,
    }
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    # Self-check (156 x 3 x 6 = 2808 by default). Raised explicitly rather
    # than via `assert` so the check still runs under `python -O`.
    if len(cells) != expected_cells:
        raise AssertionError(
            f"cells={len(cells)} 不是 {expected_cells},检查 type_action_scores 阈值"
        )
    expected = expected_cells * per_cell
    if len(queries) != expected:
        raise AssertionError(f"queries={len(queries)} != {expected}")

    # Print a sample: all queries of the first cell.
    print(f"✅ cells={len(cells)} queries={len(queries)} -> {out_path}")
    print()
    print("== 抽样:第 1 个 cell 的 18 条 ==")
    for q in queries[:per_cell]:
        cv = (q["constraint"] or {}).get("value", "")
        print(f" [{q['lens']}/{cv or '无约束':6}] {q['q']}")


if __name__ == "__main__":
    main()