| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 |
- #!/usr/bin/env python3
- """
- 用 Chinese 2-gram fuzzy 匹配补齐最后 12 条 0-cap knowledge 的 knowledge_capability。
- 这些 knowledge 的源 strategy 用了 LLM 自造的 cap ID(如 aigc_color_unification),
- canonical alias 解析失败,改用名字 2-gram Jaccard + 阈值 0.3 做 best-effort。
- 打印 match 决策供审核,然后 INSERT。
- """
- import json
- import re
- import sys
- from pathlib import Path
- sys.path.insert(0, str(Path(__file__).parent.parent.parent))
- from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
- OUTPUT_DIR = Path('/Users/sunlit/Downloads/output 2')
- RERUN_DIR = Path('/Users/sunlit/Downloads/5')
- RERUN_FOLDERS = {'032', '046', '069', '085', '097'}
- THRESHOLD = 0.3
- def ch_bigrams(s):
- s = re.sub(r'\s+', '', s or '')
- return set(s[i:i+2] for i in range(len(s)-1))
- def name_sim(a, b):
- ba, bb = ch_bigrams(a), ch_bigrams(b)
- if not ba or not bb: return 0
- return len(ba & bb) / len(ba | bb)
- def main():
- s = PostgreSQLCapabilityStore()
- cur = s._get_cursor()
- try:
- cur.execute('SELECT id, name FROM capability')
- db_caps = [(r['id'], r['name']) for r in cur.fetchall()]
- print(f'db_caps: {len(db_caps)}', flush=True)
- # 找 0-cap 的 howard_strategy_instance knowledge
- cur.execute("""SELECT k.id, k.tags FROM knowledge k
- WHERE k.version='howard_strategy_instance'
- AND NOT EXISTS (SELECT 1 FROM knowledge_capability kc WHERE kc.knowledge_id=k.id)""")
- zero_cap = cur.fetchall()
- print(f'0-cap knowledge: {len(zero_cap)}', flush=True)
- # 找 req_text
- cur.execute('SELECT id, description FROM requirement')
- req_desc = {r['id']: r['description'] for r in cur.fetchall()}
- # 收集 req_id → folder
- req_to_folder = {}
- for folder in sorted(OUTPUT_DIR.iterdir()) + sorted(RERUN_DIR.iterdir()):
- if not folder.is_dir(): continue
- if folder.name in RERUN_FOLDERS and folder.parent == OUTPUT_DIR:
- continue # 跳过 output 2/ 的 rerun 坏版本
- strat_path = folder / 'strategy.json'
- if not strat_path.exists(): continue
- try: sd = json.loads(strat_path.read_text(encoding='utf-8'))
- except: continue
- if not isinstance(sd, dict): continue
- rt = sd.get('requirement')
- for rid, d in req_desc.items():
- if d == rt:
- req_to_folder[rid] = folder
- break
- total_ins = 0
- for k in zero_cap:
- tags = k['tags'] if isinstance(k['tags'], dict) else json.loads(k['tags'] or '{}')
- orig_name = tags.get('original_strategy_name','')
- # req_id
- cur.execute('SELECT requirement_id FROM requirement_knowledge WHERE knowledge_id=%s', (k['id'],))
- r = cur.fetchone()
- if not r: continue
- req_id = r['requirement_id']
- folder = req_to_folder.get(req_id)
- if not folder:
- print(f' [{req_id}] no folder', flush=True); continue
- # 读源
- try:
- sd = json.loads((folder/'strategy.json').read_text(encoding='utf-8'))
- except:
- continue
- target_strat = None
- for s_data in sd.get('strategies', []):
- if isinstance(s_data, dict) and s_data.get('name','') == orig_name:
- target_strat = s_data; break
- if not target_strat:
- print(f' [{req_id}] no match for {orig_name!r}', flush=True); continue
- # 提取 workflow_outline caps 的名字(因 id 无效),逐个 fuzzy 匹配到 canonical
- wo = target_strat.get('workflow_outline', [])
- print(f'\n[{k["id"]}] {req_id} / {orig_name[:50]}', flush=True)
- matched_caps = set()
- for ph in wo:
- if not isinstance(ph, dict): continue
- for c in ph.get('capabilities', []):
- if not isinstance(c, dict): continue
- src_name = c.get('name','')
- if not src_name: continue
- # fuzzy match
- best_id = None; best_name = None; best_sim = 0
- for cid, cname in db_caps:
- sim = name_sim(src_name, cname)
- if sim > best_sim:
- best_sim = sim; best_id = cid; best_name = cname
- if best_sim >= THRESHOLD:
- print(f' "{src_name}" → {best_id} "{best_name}" (sim={best_sim:.2f})', flush=True)
- matched_caps.add(best_id)
- else:
- print(f' "{src_name}" → NO MATCH (best={best_name} sim={best_sim:.2f})', flush=True)
- # constraint check: 必须 ∈ req_cap
- for cid in matched_caps:
- cur.execute('SELECT 1 FROM requirement_capability WHERE requirement_id=%s AND capability_id=%s',
- (req_id, cid))
- if cur.fetchone():
- cur.execute("""INSERT INTO knowledge_capability (knowledge_id, capability_id)
- VALUES (%s, %s) ON CONFLICT DO NOTHING""", (k['id'], cid))
- total_ins += cur.rowcount or 0
- else:
- print(f' skip {cid}: not in req_cap for {req_id}', flush=True)
- print(f'\nTotal inserted: {total_ins}', flush=True)
- # Re-check
- cur.execute("""SELECT COUNT(*) c FROM knowledge
- WHERE version='howard_strategy_instance'
- AND NOT EXISTS (SELECT 1 FROM knowledge_capability kc WHERE kc.knowledge_id=knowledge.id)""")
- print(f'still 0-cap: {cur.fetchone()["c"]}', flush=True)
- cur.execute('SELECT COUNT(*) c FROM knowledge_capability')
- print(f'knowledge_capability total: {cur.fetchone()["c"]}', flush=True)
- finally:
- cur.close(); s.close()
- if __name__ == '__main__':
- main()
|