#!/usr/bin/env python3 """ 用 Chinese 2-gram fuzzy 匹配补齐最后 12 条 0-cap knowledge 的 knowledge_capability。 这些 knowledge 的源 strategy 用了 LLM 自造的 cap ID(如 aigc_color_unification), canonical alias 解析失败,改用名字 2-gram Jaccard + 阈值 0.3 做 best-effort。 打印 match 决策供审核,然后 INSERT。 """ import json import re import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore OUTPUT_DIR = Path('/Users/sunlit/Downloads/output 2') RERUN_DIR = Path('/Users/sunlit/Downloads/5') RERUN_FOLDERS = {'032', '046', '069', '085', '097'} THRESHOLD = 0.3 def ch_bigrams(s): s = re.sub(r'\s+', '', s or '') return set(s[i:i+2] for i in range(len(s)-1)) def name_sim(a, b): ba, bb = ch_bigrams(a), ch_bigrams(b) if not ba or not bb: return 0 return len(ba & bb) / len(ba | bb) def main(): s = PostgreSQLCapabilityStore() cur = s._get_cursor() try: cur.execute('SELECT id, name FROM capability') db_caps = [(r['id'], r['name']) for r in cur.fetchall()] print(f'db_caps: {len(db_caps)}', flush=True) # 找 0-cap 的 howard_strategy_instance knowledge cur.execute("""SELECT k.id, k.tags FROM knowledge k WHERE k.version='howard_strategy_instance' AND NOT EXISTS (SELECT 1 FROM knowledge_capability kc WHERE kc.knowledge_id=k.id)""") zero_cap = cur.fetchall() print(f'0-cap knowledge: {len(zero_cap)}', flush=True) # 找 req_text cur.execute('SELECT id, description FROM requirement') req_desc = {r['id']: r['description'] for r in cur.fetchall()} # 收集 req_id → folder req_to_folder = {} for folder in sorted(OUTPUT_DIR.iterdir()) + sorted(RERUN_DIR.iterdir()): if not folder.is_dir(): continue if folder.name in RERUN_FOLDERS and folder.parent == OUTPUT_DIR: continue # 跳过 output 2/ 的 rerun 坏版本 strat_path = folder / 'strategy.json' if not strat_path.exists(): continue try: sd = json.loads(strat_path.read_text(encoding='utf-8')) except: continue if not isinstance(sd, dict): continue rt = sd.get('requirement') for rid, d in req_desc.items(): if d == rt: req_to_folder[rid] = folder break total_ins = 0 for k in zero_cap: tags = k['tags'] if isinstance(k['tags'], dict) else json.loads(k['tags'] or '{}') orig_name = tags.get('original_strategy_name','') # req_id cur.execute('SELECT requirement_id FROM requirement_knowledge WHERE knowledge_id=%s', (k['id'],)) r = cur.fetchone() if not r: continue req_id = r['requirement_id'] folder = req_to_folder.get(req_id) if not folder: print(f' [{req_id}] no folder', flush=True); continue # 读源 try: sd = json.loads((folder/'strategy.json').read_text(encoding='utf-8')) except: continue target_strat = None for s_data in sd.get('strategies', []): if isinstance(s_data, dict) and s_data.get('name','') == orig_name: target_strat = s_data; break if not target_strat: print(f' [{req_id}] no match for {orig_name!r}', flush=True); continue # 提取 workflow_outline caps 的名字(因 id 无效),逐个 fuzzy 匹配到 canonical wo = target_strat.get('workflow_outline', []) print(f'\n[{k["id"]}] {req_id} / {orig_name[:50]}', flush=True) matched_caps = set() for ph in wo: if not isinstance(ph, dict): continue for c in ph.get('capabilities', []): if not isinstance(c, dict): continue src_name = c.get('name','') if not src_name: continue # fuzzy match best_id = None; best_name = None; best_sim = 0 for cid, cname in db_caps: sim = name_sim(src_name, cname) if sim > best_sim: best_sim = sim; best_id = cid; best_name = cname if best_sim >= THRESHOLD: print(f' "{src_name}" → {best_id} "{best_name}" (sim={best_sim:.2f})', flush=True) matched_caps.add(best_id) else: print(f' "{src_name}" → NO MATCH (best={best_name} sim={best_sim:.2f})', flush=True) # constraint check: 必须 ∈ req_cap for cid in matched_caps: cur.execute('SELECT 1 FROM requirement_capability WHERE requirement_id=%s AND capability_id=%s', (req_id, cid)) if cur.fetchone(): cur.execute("""INSERT INTO knowledge_capability (knowledge_id, capability_id) VALUES (%s, %s) ON CONFLICT DO NOTHING""", (k['id'], cid)) total_ins += cur.rowcount or 0 else: print(f' skip {cid}: not in req_cap for {req_id}', flush=True) print(f'\nTotal inserted: {total_ins}', flush=True) # Re-check cur.execute("""SELECT COUNT(*) c FROM knowledge WHERE version='howard_strategy_instance' AND NOT EXISTS (SELECT 1 FROM knowledge_capability kc WHERE kc.knowledge_id=knowledge.id)""") print(f'still 0-cap: {cur.fetchone()["c"]}', flush=True) cur.execute('SELECT COUNT(*) c FROM knowledge_capability') print(f'knowledge_capability total: {cur.fetchone()["c"]}', flush=True) finally: cur.close(); s.close() if __name__ == '__main__': main()