| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227 |
#!/usr/bin/env python3
"""
Systematically backfill requirement_capability to the full "research set" (plan A).

For each requirement (folder), read every cap in its capabilities_extracted.json,
resolve it to a canonical capability_id via the alias map, and
INSERT requirement_capability ON CONFLICT DO NOTHING.
Most existing req_cap rows are only the workflow_outline subset from strategy;
this script adds every research-discovered cap.

Data sources:
- 94 folders: /Users/sunlit/Downloads/output 2/<NN>/
- 5 folders (re-run data): /Users/sunlit/Downloads/5/<NN>/

Non-standard schema handling:
- 008/034: capabilities_extracted.json has no extracted_capabilities key;
  use the 'capabilities' list instead
- 044: old schema; use capability_id / capability_name
- 053: extracted_capabilities=[], skip (the existing strategy subset is already the full set)
- 077: JSON parse error; regex fallback extracts name + id
- 004/031/066/070: strategy format is abnormal, but the caps file is standard
"""
- import hashlib
- import json
- import re
- import sys
- from pathlib import Path
- sys.path.insert(0, str(Path(__file__).parent.parent.parent))
- from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
- from knowhub.scripts.merge_capabilities import MERGE_CLUSTERS
- from knowhub.scripts.rename_merged_capabilities import RENAMES
- from knowhub.scripts.llm_renames import LLM_RENAMES
# Primary research output: one sub-folder per requirement.
OUTPUT_DIR = Path('/Users/sunlit/Downloads/output 2')
# Re-run output for the five folders whose first run produced bad data.
RERUN_DIR = Path('/Users/sunlit/Downloads/5')
# Folder keys whose data must be read from RERUN_DIR instead of OUTPUT_DIR.
RERUN_FOLDERS = {'032', '046', '069', '085', '097'}
def norm(s):
    """Normalize a capability name for alias lookup: strip whitespace, lowercase.

    None (and any other falsy value) normalizes to ''.
    """
    if not s:
        return ''
    return s.strip().lower()
def build_alias_map(cur):
    """Build a {normalized name -> canonical capability id} lookup table.

    Combines three sources: live capability rows from the DB, renamed
    capabilities (RENAMES), and LLM-invented duplicate names (LLM_RENAMES).
    MERGE_CLUSTERS membership is flattened first so that every alias maps
    straight to its terminal canonical id.
    """
    # Flatten MERGE_CLUSTERS into member -> direct canonical.
    direct = {}
    for canonical, members in MERGE_CLUSTERS.items():
        for member in members:
            direct[member] = canonical

    def canonicalize(cap_id, max_hops=10):
        # Follow merge chains to the terminal id; guard against cycles
        # with both a visited set and a hop limit.
        visited = set()
        while max_hops > 0 and cap_id in direct and cap_id not in visited:
            visited.add(cap_id)
            cap_id = direct[cap_id]
            max_hops -= 1
        return cap_id

    for member in list(direct):
        direct[member] = canonicalize(member)

    cur.execute('SELECT id, name FROM capability')
    alias = {norm(row['name']): row['id'] for row in cur.fetchall()}
    for cap_id, (new_name, _) in RENAMES.items():
        alias[norm(new_name)] = canonicalize(cap_id)
    # LLM-generated renames (aliases for LLM-invented cap names that are actually dups)
    for llm_name, canonical in LLM_RENAMES.items():
        alias[norm(llm_name)] = canonicalize(canonical)
    return alias
def extract_caps_from_file(folder_path):
    """Read caps from <folder_path>/capabilities_extracted.json.

    Returns a 2-tuple ``(caps, schema_tag)`` — NOT a bare list as the old
    docstring claimed — where ``caps`` is a list of ``{'id', 'name'}`` dicts
    (id may be None) and ``schema_tag`` labels the parsing path taken:
    'no_file', 'parse_err_regex', 'alt_key', 'empty', 'old_schema' or
    'standard'.
    """
    fp = folder_path / 'capabilities_extracted.json'
    if not fp.exists():
        return [], 'no_file'
    text = fp.read_text(encoding='utf-8')
    try:
        data = json.loads(text)
    except Exception:
        # Special: 077 has a JSON parse error — fall back to regex scraping.
        # NOTE: a null id is captured as '' by the alternation; both are falsy
        # downstream, so resolution then proceeds by name.
        names = re.findall(r'"name"\s*:\s*"([^"]+)"', text)
        ids = re.findall(r'"id"\s*:\s*(?:"([^"]+)"|null)', text)
        caps = [{'id': ids[i] if i < len(ids) else None, 'name': n}
                for i, n in enumerate(names)]
        return caps, 'parse_err_regex'
    # 008/034: caps live under a 'capabilities' key instead.
    if 'extracted_capabilities' not in data and 'capabilities' in data:
        return [{'id': c.get('id') or c.get('cap_id'), 'name': c.get('name', '')}
                for c in data['capabilities'] if isinstance(c, dict)], 'alt_key'
    ec = data.get('extracted_capabilities', [])
    if not ec:
        # 053 ships an empty list on purpose — caller treats this as "skip".
        return [], 'empty'
    # 044: old schema uses capability_id / capability_name field names.
    first = ec[0]
    if isinstance(first, dict) and 'capability_name' in first and 'name' not in first:
        return [{'id': c.get('capability_id'),
                 'name': c.get('capability_name', '')}
                for c in ec if isinstance(c, dict)], 'old_schema'
    return [{'id': c.get('id'), 'name': c.get('name', '')}
            for c in ec if isinstance(c, dict)], 'standard'
def get_req_text_and_id(folder_path, cur):
    """Look up the requirement row that matches this folder.

    Tries blueprint.json, then strategy.json, then capabilities_extracted.json
    for a 'requirement' text and matches it against requirement.description.
    Returns (requirement_text, requirement_id), or (None, None) when no
    candidate file yields a DB match. Any per-file error is swallowed and the
    next candidate is tried.
    """
    candidates = ('blueprint.json', 'strategy.json', 'capabilities_extracted.json')
    for filename in candidates:
        path = folder_path / filename
        if not path.exists():
            continue
        try:
            payload = json.loads(path.read_text(encoding='utf-8'))
            req_text = payload.get('requirement', '')
            if not req_text:
                continue
            cur.execute('SELECT id FROM requirement WHERE description = %s LIMIT 1',
                        (req_text,))
            row = cur.fetchone()
            if row:
                return req_text, row['id']
        except Exception:
            continue
    return None, None
def main() -> None:
    """Backfill requirement_capability links for every research folder.

    Opens a PostgreSQLCapabilityStore cursor, resolves each folder's
    extracted caps to canonical capability ids (by id, then by alias name),
    and inserts any missing requirement_capability rows — idempotent via
    ON CONFLICT DO NOTHING. Prints per-folder progress and a final summary.
    """
    s = PostgreSQLCapabilityStore()
    cur = s._get_cursor()
    try:
        print('Building alias map...', flush=True)
        alias = build_alias_map(cur)
        print(f' alias entries: {len(alias)}', flush=True)
        # resolve source folder per requirement folder
        folders = []
        for d in sorted(OUTPUT_DIR.iterdir()):
            if not d.is_dir():
                continue
            key = d.name
            if key in RERUN_FOLDERS:
                folders.append(RERUN_DIR / key)  # use re-run data
            else:
                folders.append(d)
        # Aggregate counters for the end-of-run summary.
        totals = {'folders_processed': 0, 'folders_no_req': 0,
                  'caps_resolved': 0, 'caps_unresolved': 0,
                  'inserted': 0, 'already_present': 0,
                  'schema_counts': {}, 'unresolved_names': []}
        for folder in folders:
            fk = folder.name
            req_text, req_id = get_req_text_and_id(folder, cur)
            if not req_id:
                totals['folders_no_req'] += 1
                print(f'[{fk}] ⚠️ no matching req', flush=True)
                continue
            caps, schema = extract_caps_from_file(folder)
            totals['schema_counts'][schema] = totals['schema_counts'].get(schema, 0) + 1
            # pre-count current req_cap for this req
            cur.execute('SELECT COUNT(*) c FROM requirement_capability WHERE requirement_id=%s',
                        (req_id,))
            before = cur.fetchone()['c']
            resolved_ids = set()
            unresolved = []
            for cap in caps:
                cid = cap.get('id')
                name = cap.get('name', '')
                found = None
                # (1) id exists in DB?
                if cid:
                    cur.execute('SELECT 1 FROM capability WHERE id = %s', (cid,))
                    if cur.fetchone():
                        found = cid
                # (2) alias by name?
                if not found and name:
                    cand = alias.get(norm(name))
                    if cand:
                        # alias may point at a merged-away id; confirm it still exists
                        cur.execute('SELECT 1 FROM capability WHERE id = %s', (cand,))
                        if cur.fetchone():
                            found = cand
                if found:
                    resolved_ids.add(found)
                    totals['caps_resolved'] += 1
                else:
                    unresolved.append(f'{cid}/{name[:30]}')
                    totals['caps_unresolved'] += 1
            inserted = 0
            for cid in resolved_ids:
                # Idempotent insert: existing links are left untouched.
                cur.execute("""INSERT INTO requirement_capability (requirement_id, capability_id)
VALUES (%s, %s) ON CONFLICT DO NOTHING""", (req_id, cid))
                inserted += cur.rowcount or 0
            # Re-count and diff instead of trusting rowcount for the summary.
            cur.execute('SELECT COUNT(*) c FROM requirement_capability WHERE requirement_id=%s',
                        (req_id,))
            after = cur.fetchone()['c']
            totals['inserted'] += (after - before)
            totals['folders_processed'] += 1
            if unresolved:
                # keep only a small sample per folder to bound summary size
                totals['unresolved_names'].extend(unresolved[:3])
            print(f'[{fk}] ({schema}) req={req_id} caps_in_file={len(caps)} resolved={len(resolved_ids)} '
                  f'new_inserted={after-before} (was {before} → now {after}) '
                  f'unresolved={len(unresolved)}',
                  flush=True)
        print(f'\n{"="*60}\nTotals:', flush=True)
        for k, v in totals.items():
            if isinstance(v, list):
                print(f' {k}: {len(v)} (sample: {v[:5]})', flush=True)
            else:
                print(f' {k}: {v}', flush=True)
    finally:
        # Always release the cursor and store, even on error mid-run.
        cur.close()
        s.close()
if __name__ == '__main__':
    main()
|