| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- #!/usr/bin/env python3
- """
- 探查 output-new 文件夹:
- 1. 99 folders 是否一一对应到现有 v0 的 REQ_xxx(按 requirement 文本匹配)
- 2. 统计每 folder 的 cap / strat / resource 数量
- 3. 检测有无异常(req 文本缺失、JSON schema 不规范等)
- """
- import json, sys
- from pathlib import Path
- sys.path.insert(0, str(Path(__file__).parent.parent.parent))
- from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
- OUTPUT = Path('/Users/sunlit/Downloads/output-new')
def _load_json_object(path):
    """Read *path* as UTF-8 JSON and return it only if it is a JSON object.

    Args:
        path: ``pathlib.Path`` of the file to parse.

    Returns:
        The parsed top-level ``dict``.

    Raises:
        OSError: the file cannot be read (missing, a directory, permissions).
        ValueError: the bytes are not valid UTF-8 / JSON, or the top-level
            value is not an object. Callers use ``.get`` on the result, so a
            list or scalar is reported here instead of failing later.
    """
    data = json.loads(path.read_text(encoding='utf-8'))
    if not isinstance(data, dict):
        raise ValueError(
            f'{path.name}: top-level JSON is {type(data).__name__}, expected object'
        )
    return data


def _count_strategy_cap_links(strategies):
    """Count capability entries over all ``workflow_outline`` phases.

    Entries that are not dicts (strategies or phases) are skipped; a ``null``
    ``workflow_outline`` or ``capabilities`` value counts as empty.
    """
    links = 0
    for strategy in strategies:
        if not isinstance(strategy, dict):
            continue
        for phase in strategy.get('workflow_outline') or []:
            if isinstance(phase, dict):
                links += len(phase.get('capabilities') or [])
    return links


def _count_raw_cases(raw_dir):
    """Sum the ``cases`` lengths of every JSON file directly under *raw_dir*.

    Returns:
        ``(case_count, skipped)`` where *skipped* is the number of directory
        entries that could not be read, parsed, or counted. They are tallied
        rather than silently ignored so the caller can surface them.
    """
    cases = skipped = 0
    for case_file in raw_dir.iterdir():
        try:
            payload = _load_json_object(case_file)
            cases += len(payload.get('cases') or [])
        except (OSError, ValueError, TypeError):
            # TypeError: ``cases`` holds a value without a length (e.g. a number).
            skipped += 1
    return cases, skipped


def main():
    """Survey the folders under ``OUTPUT`` and print summary statistics.

    Steps:
      1. Load ``id``/``description`` of every row in the ``requirement`` table
         and index the ids by description text.
      2. For each sub-folder of ``OUTPUT``, read ``strategy.json`` and
         ``capabilities_extracted.json`` and look up the folder's
         ``requirement`` text in that index.
      3. For matched folders, accumulate counts of extracted capabilities
         (and how many are flagged ``is_new``), strategies, strategy-to-
         capability links, and cases found under ``raw_cases/``.
      4. Print the totals and the first 10 unmatched folders with the reason.

    Folders with missing files, unparsable or non-object JSON, missing
    requirement text, or no matching requirement are reported as unmatched
    instead of aborting the run.
    """
    store = PostgreSQLCapabilityStore()
    try:
        cur = store._get_cursor()
        try:
            cur.execute('SELECT id, description FROM requirement')
            # Keyed by description: rows sharing a description collapse to one
            # entry, so the count printed below is of distinct descriptions.
            v0_reqs = {r['description']: r['id'] for r in cur.fetchall()}
            print(f'v0 reqs in DB: {len(v0_reqs)}', flush=True)
        finally:
            cur.close()
    finally:
        # Nested so the store is closed even if the cursor fails to open/close.
        store.close()

    folders = sorted(f for f in OUTPUT.iterdir() if f.is_dir())
    print(f'tao_dev folders: {len(folders)}', flush=True)

    matched, unmatched = 0, []
    cap_total = strat_total = res_total = 0
    cap_is_new_true = 0
    strat_cap_links = 0
    raw_case_files_skipped = 0

    for folder in folders:
        sp = folder / 'strategy.json'
        cp = folder / 'capabilities_extracted.json'
        if not sp.exists() or not cp.exists():
            unmatched.append((folder.name, 'missing file'))
            continue
        try:
            sd = _load_json_object(sp)
            cd = _load_json_object(cp)
        except (OSError, ValueError) as e:
            unmatched.append((folder.name, f'parse: {e}'))
            continue

        req_text = sd.get('requirement') or cd.get('requirement')
        if not isinstance(req_text, str) or not req_text:
            # Guard before slicing/lookup: a missing or non-string requirement
            # would otherwise raise TypeError below.
            unmatched.append((folder.name, 'no requirement text'))
            continue
        orig_req_id = v0_reqs.get(req_text)
        if orig_req_id is None:
            unmatched.append((folder.name, f'no match: {req_text[:40]!r}...'))
            continue
        matched += 1

        # ``or []`` also covers an explicit JSON null for these keys.
        caps = cd.get('extracted_capabilities') or []
        cap_total += len(caps)
        cap_is_new_true += sum(
            1 for c in caps if isinstance(c, dict) and c.get('is_new')
        )

        strats = sd.get('strategies') or []
        strat_total += len(strats)
        strat_cap_links += _count_strategy_cap_links(strats)

        rc = folder / 'raw_cases'
        if rc.is_dir():
            case_count, skipped = _count_raw_cases(rc)
            res_total += case_count
            raw_case_files_skipped += skipped

    print(f'\n匹配到 v0 req: {matched}/{len(folders)}', flush=True)
    print(f'未匹配: {len(unmatched)}', flush=True)
    for n, why in unmatched[:10]:
        print(f' {n}: {why}', flush=True)

    # max(matched, 1) avoids division by zero when nothing matched.
    per_req = max(matched, 1)
    print('\n总计:', flush=True)
    print(f' extracted_capabilities: {cap_total} (其中 is_new=true: {cap_is_new_true})', flush=True)
    print(f' strategies: {strat_total} (平均 {strat_total/per_req:.1f}/req)', flush=True)
    print(f' strategy→cap 链接: {strat_cap_links}', flush=True)
    print(f' raw_cases 案例: {res_total} (平均 {res_total/per_req:.1f}/req)', flush=True)
    if raw_case_files_skipped:
        print(f' raw_cases files skipped (unreadable/invalid): {raw_case_files_skipped}', flush=True)
# Run the survey only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|