dump_unresolved_caps.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. #!/usr/bin/env python3
  2. """
  3. 把 backfill_req_cap.py 没匹配到 canonical 的 cap 全部 dump 出来,
  4. 附带 folder / is_new / description / implements,供人工判断。
  5. """
  6. import hashlib
  7. import json
  8. import re
  9. import sys
  10. from collections import defaultdict
  11. from pathlib import Path
  12. sys.path.insert(0, str(Path(__file__).parent.parent.parent))
  13. from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
  14. from knowhub.scripts.merge_capabilities import MERGE_CLUSTERS
  15. from knowhub.scripts.rename_merged_capabilities import RENAMES
  16. OUTPUT_DIR = Path('/Users/sunlit/Downloads/output 2')
  17. RERUN_DIR = Path('/Users/sunlit/Downloads/5')
  18. RERUN_FOLDERS = {'032', '046', '069', '085', '097'}
  19. def norm(s):
  20. return (s or '').strip().lower()
  21. def build_alias(cur):
  22. m2c = {}
  23. for canonical, members in MERGE_CLUSTERS.items():
  24. for m in members:
  25. m2c[m] = canonical
  26. def final(cid, limit=10):
  27. seen = set()
  28. while cid in m2c and cid not in seen and limit > 0:
  29. seen.add(cid); cid = m2c[cid]; limit -= 1
  30. return cid
  31. for m in list(m2c.keys()):
  32. m2c[m] = final(m)
  33. alias = {}
  34. cur.execute('SELECT id, name FROM capability')
  35. for r in cur.fetchall():
  36. alias[norm(r['name'])] = r['id']
  37. for cid, (new_name, _) in RENAMES.items():
  38. alias[norm(new_name)] = final(cid)
  39. return alias
  40. def extract_caps(folder):
  41. fp = folder / 'capabilities_extracted.json'
  42. if not fp.exists():
  43. return []
  44. text = fp.read_text(encoding='utf-8')
  45. try:
  46. data = json.loads(text)
  47. except Exception:
  48. names = re.findall(r'"name"\s*:\s*"([^"]+)"', text)
  49. ids = re.findall(r'"id"\s*:\s*(?:"([^"]+)"|null)', text)
  50. # is_new on same cap boundary — harder, skip for regex case
  51. return [{'id': ids[i] if i < len(ids) else None,
  52. 'name': n, 'is_new': None, 'description': '', 'implements': {}}
  53. for i, n in enumerate(names)]
  54. ec = data.get('extracted_capabilities', data.get('capabilities', []))
  55. out = []
  56. for c in ec:
  57. if not isinstance(c, dict):
  58. continue
  59. out.append({
  60. 'id': c.get('id') or c.get('cap_id') or c.get('capability_id'),
  61. 'name': c.get('name') or c.get('capability_name', ''),
  62. 'is_new': c.get('is_new'),
  63. 'description': c.get('description', '') or c.get('why_needed', '')
  64. or c.get('relevance_reason', ''),
  65. 'implements': c.get('implements') or c.get('suggested_tools', []),
  66. })
  67. return out
  68. def main():
  69. s = PostgreSQLCapabilityStore()
  70. cur = s._get_cursor()
  71. try:
  72. alias = build_alias(cur)
  73. # Group unresolved by normalized name (to see duplicates across folders)
  74. by_name = defaultdict(lambda: {'folders': [], 'desc': '', 'impl': '', 'is_new_votes': []})
  75. folders = []
  76. for d in sorted(OUTPUT_DIR.iterdir()):
  77. if not d.is_dir(): continue
  78. if d.name in RERUN_FOLDERS:
  79. folders.append(RERUN_DIR / d.name)
  80. else:
  81. folders.append(d)
  82. for folder in folders:
  83. caps = extract_caps(folder)
  84. for cap in caps:
  85. cid = cap.get('id'); name = cap.get('name', '')
  86. if not name: continue
  87. resolved = None
  88. if cid:
  89. cur.execute('SELECT 1 FROM capability WHERE id=%s', (cid,))
  90. if cur.fetchone(): resolved = cid
  91. if not resolved:
  92. cand = alias.get(norm(name))
  93. if cand:
  94. cur.execute('SELECT 1 FROM capability WHERE id=%s', (cand,))
  95. if cur.fetchone(): resolved = cand
  96. if resolved: continue
  97. key = norm(name)
  98. by_name[key]['name'] = name
  99. by_name[key]['folders'].append(folder.name)
  100. if not by_name[key]['desc'] and cap.get('description'):
  101. by_name[key]['desc'] = cap['description'][:300]
  102. if not by_name[key]['impl'] and cap.get('implements'):
  103. by_name[key]['impl'] = str(cap['implements'])[:200]
  104. by_name[key]['is_new_votes'].append(cap.get('is_new'))
  105. # Sort by frequency (most common first)
  106. sorted_list = sorted(by_name.items(), key=lambda x: -len(x[1]['folders']))
  107. total_occ = sum(len(v['folders']) for v in by_name.values())
  108. print(f'UNIQUE UNRESOLVED CAPS: {len(sorted_list)}', flush=True)
  109. print(f'TOTAL OCCURRENCES: {total_occ}', flush=True)
  110. print()
  111. # Write to file for easier review
  112. out_path = Path('/tmp/unresolved_caps.md')
  113. with out_path.open('w') as f:
  114. f.write(f'# Unresolved Caps ({len(sorted_list)} unique)\n\n')
  115. for key, v in sorted_list:
  116. f.write(f'## {v["name"]}\n')
  117. f.write(f'- folders ({len(v["folders"])}): {v["folders"]}\n')
  118. f.write(f'- is_new votes: {v["is_new_votes"]}\n')
  119. if v['desc']:
  120. f.write(f'- desc: {v["desc"]}\n')
  121. if v['impl']:
  122. f.write(f'- impl: {v["impl"]}\n')
  123. f.write('\n')
  124. print(f'Written: {out_path}', flush=True)
  125. # print top 30 to terminal
  126. for i, (key, v) in enumerate(sorted_list[:30]):
  127. n_fold = len(v['folders']); nm = v['name']; ds = v['desc'][:120] if v['desc'] else ''
  128. print(f'{i+1:3d}. [{n_fold}x] {nm}', flush=True)
  129. if ds: print(f' desc: {ds}', flush=True)
  130. finally:
  131. cur.close(); s.close()
  132. if __name__ == '__main__':
  133. main()