# backfill_req_cap.py
#!/usr/bin/env python3
"""Systematically backfill requirement_capability to the full research set (plan A).

For each requirement (folder), read every cap from its
capabilities_extracted.json, resolve each to a canonical capability_id via
the alias map, then INSERT into requirement_capability ON CONFLICT DO NOTHING.
Most existing req_cap rows are only the strategy workflow_outline subset;
this script backfills every research-discovered cap.

Data sources:
- 94 folders: /Users/sunlit/Downloads/output 2/<NN>/
- 5 folders (re-run data): /Users/sunlit/Downloads/5/<NN>/

Non-standard schema handling:
- 008/034: capabilities_extracted.json has no extracted_capabilities key;
  use the 'capabilities' list instead
- 044: old schema — uses capability_id / capability_name
- 053: extracted_capabilities=[]; skip (the existing strategy subset is
  already the full set)
- 077: JSON parse error; fall back to regex extraction of name + id
- 004/031/066/070: strategy format is anomalous, but the caps file is standard
"""
  20. import hashlib
  21. import json
  22. import re
  23. import sys
  24. from pathlib import Path
  25. sys.path.insert(0, str(Path(__file__).parent.parent.parent))
  26. from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
  27. from knowhub.scripts.merge_capabilities import MERGE_CLUSTERS
  28. from knowhub.scripts.rename_merged_capabilities import RENAMES
  29. from knowhub.scripts.llm_renames import LLM_RENAMES
# Primary research output (94 folders) and the re-run directory (5 folders).
OUTPUT_DIR = Path('/Users/sunlit/Downloads/output 2')
RERUN_DIR = Path('/Users/sunlit/Downloads/5')
# Folder keys whose data must be read from RERUN_DIR instead of OUTPUT_DIR.
RERUN_FOLDERS = {'032', '046', '069', '085', '097'}
  33. def norm(s):
  34. return (s or '').strip().lower()
  35. def build_alias_map(cur):
  36. member_to_canonical = {}
  37. for canonical, members in MERGE_CLUSTERS.items():
  38. for m in members:
  39. member_to_canonical[m] = canonical
  40. def final(cid, limit=10):
  41. seen = set()
  42. while cid in member_to_canonical and cid not in seen and limit > 0:
  43. seen.add(cid); cid = member_to_canonical[cid]; limit -= 1
  44. return cid
  45. for m in list(member_to_canonical.keys()):
  46. member_to_canonical[m] = final(m)
  47. alias = {}
  48. cur.execute('SELECT id, name FROM capability')
  49. for r in cur.fetchall():
  50. alias[norm(r['name'])] = r['id']
  51. for cid, (new_name, _) in RENAMES.items():
  52. alias[norm(new_name)] = final(cid)
  53. # LLM-generated renames (aliases for LLM-invented cap names that are actually dups)
  54. for llm_name, canonical in LLM_RENAMES.items():
  55. alias[norm(llm_name)] = final(canonical)
  56. return alias
  57. def extract_caps_from_file(folder_path):
  58. """Return list of {id, name} cap dicts from capabilities_extracted.json, or []."""
  59. fp = folder_path / 'capabilities_extracted.json'
  60. if not fp.exists():
  61. return [], 'no_file'
  62. folder_key = folder_path.name
  63. text = fp.read_text(encoding='utf-8')
  64. # Special: 077 JSON parse error — regex fallback
  65. try:
  66. data = json.loads(text)
  67. except Exception:
  68. names = re.findall(r'"name"\s*:\s*"([^"]+)"', text)
  69. ids = re.findall(r'"id"\s*:\s*(?:"([^"]+)"|null)', text)
  70. caps = []
  71. for i, n in enumerate(names):
  72. caps.append({'id': ids[i] if i < len(ids) else None, 'name': n})
  73. return caps, 'parse_err_regex'
  74. # 008/034: 'capabilities' key
  75. if 'extracted_capabilities' not in data and 'capabilities' in data:
  76. ec = data['capabilities']
  77. return [{'id': c.get('id') or c.get('cap_id'), 'name': c.get('name', '')}
  78. for c in ec if isinstance(c, dict)], 'alt_key'
  79. ec = data.get('extracted_capabilities', [])
  80. if not ec:
  81. return [], 'empty'
  82. # 044: old schema capability_id/capability_name
  83. first = ec[0] if ec else {}
  84. if isinstance(first, dict) and 'capability_name' in first and 'name' not in first:
  85. return [{'id': c.get('capability_id'),
  86. 'name': c.get('capability_name', '')}
  87. for c in ec if isinstance(c, dict)], 'old_schema'
  88. return [{'id': c.get('id'), 'name': c.get('name', '')}
  89. for c in ec if isinstance(c, dict)], 'standard'
  90. def get_req_text_and_id(folder_path, cur):
  91. """Try blueprint.json → strategy.json → capabilities_extracted.json for requirement text."""
  92. for fn in ['blueprint.json', 'strategy.json', 'capabilities_extracted.json']:
  93. fp = folder_path / fn
  94. if not fp.exists():
  95. continue
  96. try:
  97. d = json.loads(fp.read_text(encoding='utf-8'))
  98. rt = d.get('requirement', '')
  99. if rt:
  100. cur.execute('SELECT id FROM requirement WHERE description = %s LIMIT 1', (rt,))
  101. row = cur.fetchone()
  102. if row:
  103. return rt, row['id']
  104. except Exception:
  105. continue
  106. return None, None
  107. def main():
  108. s = PostgreSQLCapabilityStore()
  109. cur = s._get_cursor()
  110. try:
  111. print('Building alias map...', flush=True)
  112. alias = build_alias_map(cur)
  113. print(f' alias entries: {len(alias)}', flush=True)
  114. # resolve source folder per requirement folder
  115. folders = []
  116. for d in sorted(OUTPUT_DIR.iterdir()):
  117. if not d.is_dir():
  118. continue
  119. key = d.name
  120. if key in RERUN_FOLDERS:
  121. folders.append(RERUN_DIR / key) # use re-run data
  122. else:
  123. folders.append(d)
  124. totals = {'folders_processed': 0, 'folders_no_req': 0,
  125. 'caps_resolved': 0, 'caps_unresolved': 0,
  126. 'inserted': 0, 'already_present': 0,
  127. 'schema_counts': {}, 'unresolved_names': []}
  128. for folder in folders:
  129. fk = folder.name
  130. req_text, req_id = get_req_text_and_id(folder, cur)
  131. if not req_id:
  132. totals['folders_no_req'] += 1
  133. print(f'[{fk}] ⚠️ no matching req', flush=True)
  134. continue
  135. caps, schema = extract_caps_from_file(folder)
  136. totals['schema_counts'][schema] = totals['schema_counts'].get(schema, 0) + 1
  137. # pre-count current req_cap for this req
  138. cur.execute('SELECT COUNT(*) c FROM requirement_capability WHERE requirement_id=%s',
  139. (req_id,))
  140. before = cur.fetchone()['c']
  141. resolved_ids = set()
  142. unresolved = []
  143. for cap in caps:
  144. cid = cap.get('id')
  145. name = cap.get('name', '')
  146. found = None
  147. # (1) id exists in DB?
  148. if cid:
  149. cur.execute('SELECT 1 FROM capability WHERE id = %s', (cid,))
  150. if cur.fetchone():
  151. found = cid
  152. # (2) alias by name?
  153. if not found and name:
  154. cand = alias.get(norm(name))
  155. if cand:
  156. cur.execute('SELECT 1 FROM capability WHERE id = %s', (cand,))
  157. if cur.fetchone():
  158. found = cand
  159. if found:
  160. resolved_ids.add(found)
  161. totals['caps_resolved'] += 1
  162. else:
  163. unresolved.append(f'{cid}/{name[:30]}')
  164. totals['caps_unresolved'] += 1
  165. inserted = 0
  166. for cid in resolved_ids:
  167. cur.execute("""INSERT INTO requirement_capability (requirement_id, capability_id)
  168. VALUES (%s, %s) ON CONFLICT DO NOTHING""", (req_id, cid))
  169. inserted += cur.rowcount or 0
  170. cur.execute('SELECT COUNT(*) c FROM requirement_capability WHERE requirement_id=%s',
  171. (req_id,))
  172. after = cur.fetchone()['c']
  173. totals['inserted'] += (after - before)
  174. totals['folders_processed'] += 1
  175. if unresolved:
  176. totals['unresolved_names'].extend(unresolved[:3])
  177. print(f'[{fk}] ({schema}) req={req_id} caps_in_file={len(caps)} resolved={len(resolved_ids)} '
  178. f'new_inserted={after-before} (was {before} → now {after}) '
  179. f'unresolved={len(unresolved)}',
  180. flush=True)
  181. print(f'\n{"="*60}\nTotals:', flush=True)
  182. for k, v in totals.items():
  183. if isinstance(v, list):
  184. print(f' {k}: {len(v)} (sample: {v[:5]})', flush=True)
  185. else:
  186. print(f' {k}: {v}', flush=True)
  187. finally:
  188. cur.close()
  189. s.close()
  190. if __name__ == '__main__':
  191. main()