#!/usr/bin/env python3
"""Systematically backfill requirement_capability to the full research set (Plan A).

For each requirement (folder), read every cap from its
capabilities_extracted.json, resolve each one to a canonical capability_id
via the alias map, then INSERT into requirement_capability with
ON CONFLICT DO NOTHING.

Existing req_cap rows are mostly just the strategy's workflow_outline subset;
this script adds all research-discovered caps as well.

Source data:
- 94 folders: /Users/sunlit/Downloads/output 2/
- 5 folders (re-run data): /Users/sunlit/Downloads/5/

Non-standard schema handling:
- 008/034: capabilities_extracted.json has no extracted_capabilities key,
  use the 'capabilities' list instead
- 044: old schema, uses capability_id / capability_name
- 053: extracted_capabilities=[], skipped (existing strategy subset already is the full set)
- 077: JSON parse error, regex fallback extracts name + id
- 004/031/066/070: strategy format is abnormal, but the caps file is standard
"""
import hashlib
import json
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
from knowhub.scripts.merge_capabilities import MERGE_CLUSTERS
from knowhub.scripts.rename_merged_capabilities import RENAMES
from knowhub.scripts.llm_renames import LLM_RENAMES

OUTPUT_DIR = Path('/Users/sunlit/Downloads/output 2')
RERUN_DIR = Path('/Users/sunlit/Downloads/5')
# Folders whose data was re-generated; read these from RERUN_DIR instead.
RERUN_FOLDERS = {'032', '046', '069', '085', '097'}


def norm(s):
    """Normalize a capability name for alias lookup (None-safe, trimmed, lowercase)."""
    return (s or '').strip().lower()


def build_alias_map(cur):
    """Build a {normalized name -> canonical capability id} lookup.

    Combines three sources: current DB capability names, RENAMES (renames of
    merged capabilities), and LLM_RENAMES (LLM-invented cap names that are
    actually duplicates). Merge-cluster membership is followed transitively so
    every alias resolves to its final canonical id.
    """
    member_to_canonical = {}
    for canonical, members in MERGE_CLUSTERS.items():
        for m in members:
            member_to_canonical[m] = canonical

    def final(cid, limit=10):
        """Follow merge-cluster links to the terminal canonical id (cycle/depth guarded)."""
        seen = set()
        while cid in member_to_canonical and cid not in seen and limit > 0:
            seen.add(cid)
            cid = member_to_canonical[cid]
            limit -= 1
        return cid

    # Flatten chains so every member points directly at its final canonical id.
    for m in list(member_to_canonical.keys()):
        member_to_canonical[m] = final(m)

    alias = {}
    cur.execute('SELECT id, name FROM capability')
    for r in cur.fetchall():
        alias[norm(r['name'])] = r['id']
    for cid, (new_name, _) in RENAMES.items():
        alias[norm(new_name)] = final(cid)
    # LLM-generated renames (aliases for LLM-invented cap names that are actually dups)
    for llm_name, canonical in LLM_RENAMES.items():
        alias[norm(llm_name)] = final(canonical)
    return alias


def extract_caps_from_file(folder_path):
    """Return (caps, schema_tag) from folder_path/capabilities_extracted.json.

    caps is a list of {'id', 'name'} dicts (possibly empty); schema_tag names
    the variant detected: 'standard', 'alt_key', 'old_schema',
    'parse_err_regex', 'empty', or 'no_file'.
    """
    fp = folder_path / 'capabilities_extracted.json'
    if not fp.exists():
        return [], 'no_file'
    text = fp.read_text(encoding='utf-8')
    # Special: 077 JSON parse error — regex fallback pairs "name"/"id" by position.
    try:
        data = json.loads(text)
    except Exception:
        names = re.findall(r'"name"\s*:\s*"([^"]+)"', text)
        ids = re.findall(r'"id"\s*:\s*(?:"([^"]+)"|null)', text)
        caps = []
        for i, n in enumerate(names):
            # findall yields '' (not None) when the null alternative matched;
            # normalize so a missing id is always None downstream.
            cap_id = ids[i] if i < len(ids) else None
            caps.append({'id': cap_id or None, 'name': n})
        return caps, 'parse_err_regex'
    # 008/034: caps live under 'capabilities' instead of 'extracted_capabilities'.
    if 'extracted_capabilities' not in data and 'capabilities' in data:
        ec = data['capabilities']
        return [{'id': c.get('id') or c.get('cap_id'), 'name': c.get('name', '')}
                for c in ec if isinstance(c, dict)], 'alt_key'
    ec = data.get('extracted_capabilities', [])
    if not ec:
        return [], 'empty'
    # 044: old schema uses capability_id / capability_name keys.
    first = ec[0]
    if isinstance(first, dict) and 'capability_name' in first and 'name' not in first:
        return [{'id': c.get('capability_id'), 'name': c.get('capability_name', '')}
                for c in ec if isinstance(c, dict)], 'old_schema'
    return [{'id': c.get('id'), 'name': c.get('name', '')}
            for c in ec if isinstance(c, dict)], 'standard'


def get_req_text_and_id(folder_path, cur):
    """Find the requirement text and its DB id for a folder.

    Tries blueprint.json → strategy.json → capabilities_extracted.json for a
    'requirement' field, then matches it exactly against
    requirement.description. Returns (text, id) or (None, None).
    """
    for fn in ('blueprint.json', 'strategy.json', 'capabilities_extracted.json'):
        fp = folder_path / fn
        if not fp.exists():
            continue
        try:
            d = json.loads(fp.read_text(encoding='utf-8'))
            rt = d.get('requirement', '')
            if rt:
                cur.execute('SELECT id FROM requirement WHERE description = %s LIMIT 1', (rt,))
                row = cur.fetchone()
                if row:
                    return rt, row['id']
        except Exception:
            # Best effort: a malformed file just falls through to the next candidate.
            continue
    return None, None


def main():
    """Backfill requirement_capability for every folder and print a summary.

    NOTE(review): no explicit commit is issued here — assumes the store's
    cursor/connection autocommits (or commits on close); verify against
    PostgreSQLCapabilityStore before relying on persistence.
    """
    s = PostgreSQLCapabilityStore()
    cur = s._get_cursor()
    try:
        print('Building alias map...', flush=True)
        alias = build_alias_map(cur)
        print(f' alias entries: {len(alias)}', flush=True)

        # Fetch the set of valid capability ids once, instead of issuing one
        # SELECT per cap inside the folder loop (same result, far fewer round trips).
        cur.execute('SELECT id FROM capability')
        known_ids = {r['id'] for r in cur.fetchall()}

        # Resolve source folder per requirement folder (re-run data takes precedence).
        folders = []
        for d in sorted(OUTPUT_DIR.iterdir()):
            if not d.is_dir():
                continue
            key = d.name
            folders.append(RERUN_DIR / key if key in RERUN_FOLDERS else d)

        totals = {'folders_processed': 0, 'folders_no_req': 0,
                  'caps_resolved': 0, 'caps_unresolved': 0,
                  'inserted': 0, 'already_present': 0,
                  'schema_counts': {}, 'unresolved_names': []}

        for folder in folders:
            fk = folder.name
            _, req_id = get_req_text_and_id(folder, cur)
            if not req_id:
                totals['folders_no_req'] += 1
                print(f'[{fk}] ⚠️ no matching req', flush=True)
                continue
            caps, schema = extract_caps_from_file(folder)
            totals['schema_counts'][schema] = totals['schema_counts'].get(schema, 0) + 1

            # Pre-count current req_cap rows so new insertions can be measured.
            cur.execute('SELECT COUNT(*) c FROM requirement_capability WHERE requirement_id=%s',
                        (req_id,))
            before = cur.fetchone()['c']

            resolved_ids = set()
            unresolved = []
            for cap in caps:
                cid = cap.get('id')
                name = cap.get('name', '')
                found = None
                # (1) cap's own id is already a valid capability id?
                if cid and cid in known_ids:
                    found = cid
                # (2) otherwise resolve by normalized name through the alias map.
                if not found and name:
                    cand = alias.get(norm(name))
                    if cand and cand in known_ids:
                        found = cand
                if found:
                    resolved_ids.add(found)
                    totals['caps_resolved'] += 1
                else:
                    unresolved.append(f'{cid}/{name[:30]}')
                    totals['caps_unresolved'] += 1

            for cid in resolved_ids:
                cur.execute(
                    """INSERT INTO requirement_capability (requirement_id, capability_id)
                       VALUES (%s, %s) ON CONFLICT DO NOTHING""",
                    (req_id, cid))
            # Measure actual growth via a second COUNT (rowcount bookkeeping was
            # dead code in the original — the before/after delta is what's reported).
            cur.execute('SELECT COUNT(*) c FROM requirement_capability WHERE requirement_id=%s',
                        (req_id,))
            after = cur.fetchone()['c']
            totals['inserted'] += (after - before)
            totals['folders_processed'] += 1
            if unresolved:
                totals['unresolved_names'].extend(unresolved[:3])
            print(f'[{fk}] ({schema}) req={req_id} caps_in_file={len(caps)} resolved={len(resolved_ids)} '
                  f'new_inserted={after-before} (was {before} → now {after}) '
                  f'unresolved={len(unresolved)}', flush=True)

        print(f'\n{"="*60}\nTotals:', flush=True)
        for k, v in totals.items():
            if isinstance(v, list):
                print(f' {k}: {len(v)} (sample: {v[:5]})', flush=True)
            else:
                print(f' {k}: {v}', flush=True)
    finally:
        cur.close()
        s.close()


if __name__ == '__main__':
    main()