remap_runs_full.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. # -*- coding: utf-8 -*-
  2. """把 runs_full/qXXXX 里 idx 错位的 q 目录原地搬到正确 idx 位。
  3. 错位成因:旧 batch_3forms.py 默认吃 high_priority_queries.json(468 条),写入 runs_full
  4. 时用的是那个文件的 idx。后来换默认到 high_priority_queries_full.json(2808 条),
  5. 两套 idx 表是不同的,于是 runs_full/q0002 装的 original_q 在新 full 表里其实属于
  6. 另一个 idx。本脚本读每个 q 的 form_A.json original_q,查新 full 表的等值 q,
  7. mv 到正确目录 q{正确 idx:04d}/。
  8. 规则:
  9. - 已在正确位的: 跳过
  10. - 错位但目标位空闲: shutil.move 改名(顺手更新 _manifest.json)
  11. - 错位但目标位已占用: 不动,写入 _conflicts 段供人工裁决
  12. - original_q 在新 full 表里找不到: 标记 unmatched(应该不会发生,前面 build_runs_mapping 已过滤)
  13. """
  14. import json
  15. import re
  16. import shutil
  17. from pathlib import Path
  18. HERE = Path(__file__).parent
  19. SEARCH_EVAL = HERE.parent
  20. RUNS_FULL = SEARCH_EVAL / "runs_full"
  21. FULL = HERE / "high_priority_queries_full.json"
  22. MANIFEST = RUNS_FULL / "_manifest.json"
  23. def _qnum(name: str) -> int:
  24. m = re.search(r"\d+", name)
  25. return int(m.group()) if m else -1
  26. def load_original_q(qd: Path):
  27. for fk in ("A", "B", "C"):
  28. f = qd / f"form_{fk}.json"
  29. if not f.exists():
  30. continue
  31. try:
  32. d = json.load(open(f, encoding="utf-8"))
  33. oq = d.get("original_q") or d.get("query") or ""
  34. if oq:
  35. return oq, fk
  36. except Exception:
  37. continue
  38. return "", ""
  39. def main():
  40. full = json.load(open(FULL, encoding="utf-8"))["queries"]
  41. q_to_idx = {item["q"]: i for i, item in enumerate(full)}
  42. q_dirs = sorted([d for d in RUNS_FULL.iterdir() if d.is_dir() and d.name.startswith("q")],
  43. key=lambda p: _qnum(p.name))
  44. already_ok, moved, conflicts, unmatched = [], [], [], []
  45. # Plan: 先扫描决策,再执行;避免边扫边动导致后续判断错乱
  46. plan = []
  47. for qd in q_dirs:
  48. cur_idx = _qnum(qd.name)
  49. oq, src_form = load_original_q(qd)
  50. if not oq:
  51. unmatched.append({"name": qd.name, "reason": "form_*.json 缺失或 original_q 空"})
  52. continue
  53. correct_idx = q_to_idx.get(oq)
  54. if correct_idx is None:
  55. unmatched.append({"name": qd.name, "original_q": oq,
  56. "reason": "在新 full 表中找不到等值 q(可能是历史模态)"})
  57. continue
  58. if correct_idx == cur_idx:
  59. already_ok.append((qd.name, oq))
  60. continue
  61. plan.append((qd, cur_idx, correct_idx, oq))
  62. # 两轮执行:先把没冲突的全搬走,再回头看冲突(可能搬走后冲突就消解了)
  63. pending = list(plan)
  64. progress = True
  65. while pending and progress:
  66. progress = False
  67. next_pending = []
  68. for qd, cur_idx, correct_idx, oq in pending:
  69. dst = RUNS_FULL / f"q{correct_idx:04d}"
  70. if dst.exists():
  71. next_pending.append((qd, cur_idx, correct_idx, oq))
  72. continue
  73. shutil.move(str(qd), str(dst))
  74. moved.append({"from": qd.name, "to": dst.name, "original_q": oq})
  75. progress = True
  76. pending = next_pending
  77. # 仍有冲突的:目标位真的被占用且非空(让人工看)
  78. for qd, cur_idx, correct_idx, oq in pending:
  79. dst = RUNS_FULL / f"q{correct_idx:04d}"
  80. dst_oq, _ = load_original_q(dst) if dst.exists() else ("", "")
  81. conflicts.append({
  82. "src": qd.name, "src_idx": cur_idx, "src_original_q": oq,
  83. "dst": dst.name, "dst_idx": correct_idx, "dst_original_q": dst_oq,
  84. })
  85. # 更新 manifest:用搬完后的 runs_full 状态重写
  86. new_manifest_items = []
  87. for qd in sorted([d for d in RUNS_FULL.iterdir() if d.is_dir() and d.name.startswith("q")],
  88. key=lambda p: _qnum(p.name)):
  89. oq, _ = load_original_q(qd)
  90. idx = _qnum(qd.name)
  91. if idx >= 0 and idx < len(full):
  92. item = full[idx]
  93. new_manifest_items.append({
  94. "qdir": qd.name, "full_idx": idx,
  95. "original_q": oq,
  96. "cell_idx": idx // 18,
  97. "lens": item["lens"],
  98. "constraint_value": (item.get("constraint") or {}).get("value") or "无约束",
  99. })
  100. MANIFEST.write_text(json.dumps({
  101. "_doc": "runs_full/qXXXX 的当前状态快照;XXXX = high_priority_queries_full.json idx 4 位 zero-pad",
  102. "items": new_manifest_items,
  103. }, ensure_ascii=False, indent=2), encoding="utf-8")
  104. print(f"=== runs_full 原地重映射结果 ===")
  105. print(f" 原本就对的: {len(already_ok)}")
  106. print(f" 移动到位: {len(moved)}")
  107. print(f" 冲突待人工: {len(conflicts)}")
  108. print(f" 未匹配: {len(unmatched)}")
  109. print(f"→ manifest 已重写: {MANIFEST}")
  110. print()
  111. if moved[:8]:
  112. print("=== 已搬动(抽样)===")
  113. for m in moved[:8]:
  114. print(f" {m['from']:<7} → {m['to']:<7} [{m['original_q']}]")
  115. if conflicts:
  116. print()
  117. print("=== ⚠️ 冲突(目标位被别的 original_q 占住,未搬)===")
  118. for c in conflicts:
  119. print(f" {c['src']} (idx={c['src_idx']}, q={c['src_original_q']!r})")
  120. print(f" -> 想去 {c['dst']} (idx={c['dst_idx']}, 但被 {c['dst_original_q']!r} 占)")
  121. if unmatched:
  122. print()
  123. print("=== ⚠️ 在新 full 中找不到 ===")
  124. for u in unmatched:
  125. print(f" {u}")
# Run the remapping only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()