phase5_fuzzy_fill_remaining.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. #!/usr/bin/env python3
  2. """
  3. 用 Chinese 2-gram fuzzy 匹配补齐最后 12 条 0-cap knowledge 的 knowledge_capability。
  4. 这些 knowledge 的源 strategy 用了 LLM 自造的 cap ID(如 aigc_color_unification),
  5. canonical alias 解析失败,改用名字 2-gram Jaccard + 阈值 0.3 做 best-effort。
  6. 打印 match 决策供审核,然后 INSERT。
  7. """
  8. import json
  9. import re
  10. import sys
  11. from pathlib import Path
  12. sys.path.insert(0, str(Path(__file__).parent.parent.parent))
  13. from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
  14. OUTPUT_DIR = Path('/Users/sunlit/Downloads/output 2')
  15. RERUN_DIR = Path('/Users/sunlit/Downloads/5')
  16. RERUN_FOLDERS = {'032', '046', '069', '085', '097'}
  17. THRESHOLD = 0.3
  18. def ch_bigrams(s):
  19. s = re.sub(r'\s+', '', s or '')
  20. return set(s[i:i+2] for i in range(len(s)-1))
  21. def name_sim(a, b):
  22. ba, bb = ch_bigrams(a), ch_bigrams(b)
  23. if not ba or not bb: return 0
  24. return len(ba & bb) / len(ba | bb)
  25. def main():
  26. s = PostgreSQLCapabilityStore()
  27. cur = s._get_cursor()
  28. try:
  29. cur.execute('SELECT id, name FROM capability')
  30. db_caps = [(r['id'], r['name']) for r in cur.fetchall()]
  31. print(f'db_caps: {len(db_caps)}', flush=True)
  32. # 找 0-cap 的 howard_strategy_instance knowledge
  33. cur.execute("""SELECT k.id, k.tags FROM knowledge k
  34. WHERE k.version='howard_strategy_instance'
  35. AND NOT EXISTS (SELECT 1 FROM knowledge_capability kc WHERE kc.knowledge_id=k.id)""")
  36. zero_cap = cur.fetchall()
  37. print(f'0-cap knowledge: {len(zero_cap)}', flush=True)
  38. # 找 req_text
  39. cur.execute('SELECT id, description FROM requirement')
  40. req_desc = {r['id']: r['description'] for r in cur.fetchall()}
  41. # 收集 req_id → folder
  42. req_to_folder = {}
  43. for folder in sorted(OUTPUT_DIR.iterdir()) + sorted(RERUN_DIR.iterdir()):
  44. if not folder.is_dir(): continue
  45. if folder.name in RERUN_FOLDERS and folder.parent == OUTPUT_DIR:
  46. continue # 跳过 output 2/ 的 rerun 坏版本
  47. strat_path = folder / 'strategy.json'
  48. if not strat_path.exists(): continue
  49. try: sd = json.loads(strat_path.read_text(encoding='utf-8'))
  50. except: continue
  51. if not isinstance(sd, dict): continue
  52. rt = sd.get('requirement')
  53. for rid, d in req_desc.items():
  54. if d == rt:
  55. req_to_folder[rid] = folder
  56. break
  57. total_ins = 0
  58. for k in zero_cap:
  59. tags = k['tags'] if isinstance(k['tags'], dict) else json.loads(k['tags'] or '{}')
  60. orig_name = tags.get('original_strategy_name','')
  61. # req_id
  62. cur.execute('SELECT requirement_id FROM requirement_knowledge WHERE knowledge_id=%s', (k['id'],))
  63. r = cur.fetchone()
  64. if not r: continue
  65. req_id = r['requirement_id']
  66. folder = req_to_folder.get(req_id)
  67. if not folder:
  68. print(f' [{req_id}] no folder', flush=True); continue
  69. # 读源
  70. try:
  71. sd = json.loads((folder/'strategy.json').read_text(encoding='utf-8'))
  72. except:
  73. continue
  74. target_strat = None
  75. for s_data in sd.get('strategies', []):
  76. if isinstance(s_data, dict) and s_data.get('name','') == orig_name:
  77. target_strat = s_data; break
  78. if not target_strat:
  79. print(f' [{req_id}] no match for {orig_name!r}', flush=True); continue
  80. # 提取 workflow_outline caps 的名字(因 id 无效),逐个 fuzzy 匹配到 canonical
  81. wo = target_strat.get('workflow_outline', [])
  82. print(f'\n[{k["id"]}] {req_id} / {orig_name[:50]}', flush=True)
  83. matched_caps = set()
  84. for ph in wo:
  85. if not isinstance(ph, dict): continue
  86. for c in ph.get('capabilities', []):
  87. if not isinstance(c, dict): continue
  88. src_name = c.get('name','')
  89. if not src_name: continue
  90. # fuzzy match
  91. best_id = None; best_name = None; best_sim = 0
  92. for cid, cname in db_caps:
  93. sim = name_sim(src_name, cname)
  94. if sim > best_sim:
  95. best_sim = sim; best_id = cid; best_name = cname
  96. if best_sim >= THRESHOLD:
  97. print(f' "{src_name}" → {best_id} "{best_name}" (sim={best_sim:.2f})', flush=True)
  98. matched_caps.add(best_id)
  99. else:
  100. print(f' "{src_name}" → NO MATCH (best={best_name} sim={best_sim:.2f})', flush=True)
  101. # constraint check: 必须 ∈ req_cap
  102. for cid in matched_caps:
  103. cur.execute('SELECT 1 FROM requirement_capability WHERE requirement_id=%s AND capability_id=%s',
  104. (req_id, cid))
  105. if cur.fetchone():
  106. cur.execute("""INSERT INTO knowledge_capability (knowledge_id, capability_id)
  107. VALUES (%s, %s) ON CONFLICT DO NOTHING""", (k['id'], cid))
  108. total_ins += cur.rowcount or 0
  109. else:
  110. print(f' skip {cid}: not in req_cap for {req_id}', flush=True)
  111. print(f'\nTotal inserted: {total_ins}', flush=True)
  112. # Re-check
  113. cur.execute("""SELECT COUNT(*) c FROM knowledge
  114. WHERE version='howard_strategy_instance'
  115. AND NOT EXISTS (SELECT 1 FROM knowledge_capability kc WHERE kc.knowledge_id=knowledge.id)""")
  116. print(f'still 0-cap: {cur.fetchone()["c"]}', flush=True)
  117. cur.execute('SELECT COUNT(*) c FROM knowledge_capability')
  118. print(f'knowledge_capability total: {cur.fetchone()["c"]}', flush=True)
  119. finally:
  120. cur.close(); s.close()
  121. if __name__ == '__main__':
  122. main()