| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
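# -*- coding: utf-8 -*-
"""One-off script: expand the 156 high-priority cells into 156 x 3 lenses x 6 tool-type constraints = 2808 queries.

Inputs:
- judged_matrix.json (same directory): supplies the actions / types lists.
- type_action_scores.json (same directory): supplies the scores; the
  (action, type) cells with score == 3 are the 156 high-priority cells.
- high_priority_queries.json (parent directory, optional): the existing file,
  used only to backfill gemini_q.

Output:
- high_priority_queries_full.json: a flat `queries` array ordered by
  (cell, lens, constraint).

Constraints, 6 levels: null (unconstrained) + {AI 模型, 桌面 APP, 云端 Web, API·CLI, 插件扩展}.
Lens suffixes: 工序→教程 / 工具→工具 / 能力→技巧.

Note: modality constraints (text / image / video / audio) are deprecated and
no longer listed. The historical modality folders under runs/ are kept as old
experiments; the new index scheme is not tied to them, and the `unmatched`
section of runs_to_full_mapping.json flags them.
"""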
- import json
- from pathlib import Path
# Every input / output file is resolved relative to this script's directory.
HERE = Path(__file__).parent
JM = HERE / "judged_matrix.json"
TAS = HERE / "type_action_scores.json"
OUT = HERE / "high_priority_queries_full.json"
# One directory up: the original 468 entries, reused for their gemini_q.
SRC_HP = HERE.parent / "high_priority_queries.json"

# lens -> suffix word appended to the end of the query text.
LENS_SUFFIX = {
    "工序": "教程",
    "工具": "工具",
    "能力": "技巧",
}

# List of (leading word, constraint dict) pairs. A leading word of None means
# "no constraint"; otherwise the same word is stored under "限定词".
CONSTRAINTS = [(None, None)] + [
    (word, {"kind": "工具类型", "value": value, "限定词": word})
    for word, value in (
        ("AI", "AI 模型"),
        ("软件", "桌面 APP"),
        ("在线", "云端 Web"),
        ("代码", "API·CLI"),
        ("插件", "插件扩展"),
    )
]
def _load_json(path):
    """Return the parsed content of the UTF-8 JSON file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes instead of being left for the garbage collector.
    """
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def main():
    """Expand every high-priority cell into cell x lens x constraint queries.

    Reads ``JM`` for the action / type lists and ``TAS`` for the scores, keeps
    the (action, type) cells whose score is exactly 3, backfills ``gemini_q``
    from ``SRC_HP`` when that file exists, writes the flat result to ``OUT``
    and finally prints the first 18 generated queries as a sample.

    Raises:
        AssertionError: if the number of cells is not 156, or the number of
            generated queries is not 156 * 3 * 6. The check runs *before*
            ``OUT`` is written, so a failed run never overwrites an existing
            output file with inconsistent data.
        KeyError: if an input file lacks one of the keys indexed directly
            below ("actions", "types", "scores", "queries", "name", "l1",
            "action", "type").
    """
    jm = _load_json(JM)
    actions = jm["actions"]  # original note: 27 entries, each with name / l1 / (l2)
    types = jm["types"]  # original note: 50 entries, each with name / l1
    act_l1 = {a["name"]: a["l1"] for a in actions}
    tas = _load_json(TAS)["scores"]

    # Keep the (action, type) cells whose score is exactly 3; this is meant to
    # line up with the "156 high-priority" set of high_priority_queries.json.
    # Iteration is type-major, which fixes the order of the output.
    cells = []
    for t in types:
        scores_for_type = tas.get(t["name"], {})
        for a in actions:
            rec = scores_for_type.get(a["name"])
            if rec and rec.get("score") == 3:
                cells.append((a["name"], t["name"]))

    # gemini_q lookup keyed by (action, type); setdefault keeps the first
    # entry seen for each cell. The source file is optional.
    gemini_by_cell = {}
    if SRC_HP.exists():
        for it in _load_json(SRC_HP)["queries"]:
            key = (it["action"], it["type"])
            gemini_by_cell.setdefault(key, it.get("gemini_q", ""))

    # Generate cell x lens x constraint, in that nesting order.
    queries = []
    for action, type_ in cells:
        for lens, suffix in LENS_SUFFIX.items():
            for qual, cons in CONSTRAINTS:
                # The qualifier word, when present, leads the query text.
                tokens = [qual, action, type_, suffix] if qual else [action, type_, suffix]
                queries.append({
                    "q": " ".join(tokens),
                    "lens": lens,
                    "type": type_,
                    "action": action,
                    "action_l1": act_l1.get(action, ""),
                    "score": 3,
                    "constraint": cons,
                    "gemini_q": gemini_by_cell.get((action, type_), ""),
                })

    # Self-check: 156 x 3 x 6 = 2808. Raised explicitly (rather than via the
    # assert statement) so the check still runs under ``python -O``, and done
    # before writing so bad data never reaches OUT.
    if len(cells) != 156:
        raise AssertionError(f"cells={len(cells)} 不是 156,检查 type_action_scores 阈值")
    expected = 156 * 3 * 6
    if len(queries) != expected:
        raise AssertionError(f"queries={len(queries)} != {expected}")

    out = {
        "_doc": (
            "高优先级 query 全展开:156 cell × 3 lens × 6 工具类型约束 = 2808 条。"
            "lens 后缀 工序→教程 / 工具→工具 / 能力→技巧;约束 6 档 = 无约束 + 5 工具类型。"
            "模态约束已废弃,runs/ 下历史模态 q 文件夹不在本索引内。"
            "gemini_q 按 (action, type) cell 复用同一句,跨 lens / 约束不变。"
        ),
        "model": "gemini-3.1-flash-lite",
        "threshold": 3,
        "lenses": list(LENS_SUFFIX),
        "tool_constraints": [c["value"] if c else None for _, c in CONSTRAINTS],
        "cells": len(cells),
        "per_cell": len(LENS_SUFFIX) * len(CONSTRAINTS),  # 18
        "total": len(queries),
        "queries": queries,
    }
    with open(OUT, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    # Print a sample: the 18 queries of the first cell.
    print(f"✅ cells={len(cells)} queries={len(queries)} -> {OUT}")
    print()
    print("== 抽样:第 1 个 cell 的 18 条 ==")
    for q in queries[:18]:
        cv = (q["constraint"] or {}).get("value", "")
        print(f" [{q['lens']}/{cv or '无约束':6}] {q['q']}")


if __name__ == "__main__":
    main()
|