#!/usr/bin/env python3
"""Inspect the ``output-new`` export folder against the v0 requirements in the DB.

1. Check that every folder maps onto an existing v0 requirement row, matching
   on the requirement text.
2. Tally extracted capabilities, strategies, strategy->capability links and
   raw cases across the matched folders.
3. Report anomalies (missing files, unparsable JSON, missing requirement text,
   unexpected JSON shapes) instead of crashing on them.
"""
import json
import sys
from pathlib import Path

# Put the directory three levels above this file on sys.path before importing
# ``knowhub`` (presumably the repository root -- TODO confirm).
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore  # noqa: E402

OUTPUT = Path('/Users/sunlit/Downloads/output-new')


def _load_v0_requirements():
    """Return a ``description -> id`` dict built from the ``requirement`` table."""
    store = PostgreSQLCapabilityStore()
    try:
        cursor = store._get_cursor()
        try:
            cursor.execute('SELECT id, description FROM requirement')
            return {row['description']: row['id'] for row in cursor.fetchall()}
        finally:
            cursor.close()
    finally:
        # Runs even if acquiring or closing the cursor fails, so the store is
        # never left open.
        store.close()


def _read_json(path):
    """Parse *path* as UTF-8 JSON.

    Raises:
        OSError: the file cannot be read.
        ValueError: the bytes are not valid UTF-8 or not valid JSON
            (``UnicodeDecodeError`` and ``json.JSONDecodeError`` are both
            ``ValueError`` subclasses).
    """
    return json.loads(path.read_text(encoding='utf-8'))


def _count_capability_links(strategies):
    """Sum the ``capabilities`` entries over every dict phase of every strategy."""
    links = 0
    for strategy in strategies:
        if not isinstance(strategy, dict):
            continue
        for phase in strategy.get('workflow_outline', []) or []:
            if isinstance(phase, dict):
                links += len(phase.get('capabilities', []) or [])
    return links


def _count_raw_cases(folder, skipped):
    """Return the number of ``cases`` entries found under ``folder/raw_cases``.

    Files that cannot be read, parsed or measured contribute nothing to the
    count; each one is appended to *skipped* as a ``(relative name, reason)``
    tuple so the caller can report it.
    """
    raw_dir = folder / 'raw_cases'
    if not raw_dir.is_dir():
        return 0
    total = 0
    for case_file in sorted(raw_dir.iterdir()):
        try:
            doc = _read_json(case_file)
            total += len(doc.get('cases', []))
        except (OSError, ValueError, TypeError, AttributeError) as exc:
            # Best effort: one bad file must not abort the survey, but it is
            # recorded rather than silently dropped.
            skipped.append((f'{folder.name}/{case_file.name}', str(exc)))
    return total


def main():
    """Survey ``OUTPUT`` and print match statistics, totals and anomalies.

    Each sub-folder is expected to hold ``strategy.json`` and
    ``capabilities_extracted.json``; a folder counts as matched when its
    requirement text equals the ``description`` of a row in the DB.
    Unmatched folders are collected with a reason and the first ten are
    printed.
    """
    v0_reqs = _load_v0_requirements()
    print(f'v0 reqs in DB: {len(v0_reqs)}', flush=True)

    folders = sorted(entry for entry in OUTPUT.iterdir() if entry.is_dir())
    print(f'tao_dev folders: {len(folders)}', flush=True)

    matched = 0
    unmatched = []
    skipped_cases = []
    cap_total = strat_total = res_total = 0
    cap_is_new_true = 0
    strat_cap_links = 0

    for folder in folders:
        strategy_path = folder / 'strategy.json'
        caps_path = folder / 'capabilities_extracted.json'
        if not (strategy_path.exists() and caps_path.exists()):
            unmatched.append((folder.name, 'missing file'))
            continue

        try:
            strategy_doc = _read_json(strategy_path)
            caps_doc = _read_json(caps_path)
        except (OSError, ValueError) as exc:
            unmatched.append((folder.name, f'parse: {exc}'))
            continue

        if not (isinstance(strategy_doc, dict) and isinstance(caps_doc, dict)):
            unmatched.append((folder.name, 'schema: top-level JSON is not an object'))
            continue

        req_text = strategy_doc.get('requirement') or caps_doc.get('requirement')
        if not isinstance(req_text, str):
            # Missing (None) or non-text requirement: report it instead of
            # failing on the dict lookup or on the slice below.
            unmatched.append((folder.name, 'missing requirement text'))
            continue

        orig_req_id = v0_reqs.get(req_text)
        if not orig_req_id:
            unmatched.append((folder.name, f'no match: {req_text[:40]!r}...'))
            continue
        matched += 1

        # ``or []`` tolerates an explicit JSON null, as the phase handling does.
        caps = caps_doc.get('extracted_capabilities', []) or []
        cap_total += len(caps)
        cap_is_new_true += sum(
            1 for cap in caps if isinstance(cap, dict) and cap.get('is_new')
        )

        strategies = strategy_doc.get('strategies', []) or []
        strat_total += len(strategies)
        strat_cap_links += _count_capability_links(strategies)

        res_total += _count_raw_cases(folder, skipped_cases)

    print(f'\n匹配到 v0 req: {matched}/{len(folders)}', flush=True)
    print(f'未匹配: {len(unmatched)}', flush=True)
    for n, why in unmatched[:10]:
        print(f' {n}: {why}', flush=True)

    print(f'\n总计:', flush=True)
    print(f' extracted_capabilities: {cap_total} (其中 is_new=true: {cap_is_new_true})', flush=True)
    print(f' strategies: {strat_total} (平均 {strat_total/max(matched,1):.1f}/req)', flush=True)
    print(f' strategy→cap 链接: {strat_cap_links}', flush=True)
    print(f' raw_cases 案例: {res_total} (平均 {res_total/max(matched,1):.1f}/req)', flush=True)

    # Only printed when something was skipped, so a clean run's output is
    # unchanged.
    if skipped_cases:
        print(f'\nraw_cases files skipped: {len(skipped_cases)}', flush=True)
        for name, why in skipped_cases[:10]:
            print(f' {name}: {why}', flush=True)


if __name__ == '__main__':
    main()