taodev_probe.py 3.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
#!/usr/bin/env python3
"""
Probe the output-new folder:
1. Check whether the 99 folders map one-to-one onto the existing v0 REQ_xxx
   requirements (matched by requirement text).
2. Count the cap / strat / resource entries of each folder.
3. Detect anomalies (missing requirement text, non-conforming JSON schema, etc.).
"""
import json, sys
from pathlib import Path

# Put the directory three `.parent` hops above this file at the front of
# sys.path so the `knowhub` import below resolves when this file is run
# directly (presumably that directory is the repository root -- TODO confirm).
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore

# Folder whose immediate sub-directories main() scans.
OUTPUT = Path('/Users/sunlit/Downloads/output-new')
  13. def main():
  14. s = PostgreSQLCapabilityStore()
  15. cur = s._get_cursor()
  16. try:
  17. cur.execute('SELECT id, description FROM requirement')
  18. v0_reqs = {r['description']: r['id'] for r in cur.fetchall()}
  19. print(f'v0 reqs in DB: {len(v0_reqs)}', flush=True)
  20. finally:
  21. cur.close(); s.close()
  22. folders = sorted([f for f in OUTPUT.iterdir() if f.is_dir()])
  23. print(f'tao_dev folders: {len(folders)}', flush=True)
  24. matched, unmatched = 0, []
  25. cap_total = strat_total = res_total = 0
  26. cap_is_new_true = 0
  27. strat_cap_links = 0
  28. for folder in folders:
  29. sp = folder / 'strategy.json'
  30. cp = folder / 'capabilities_extracted.json'
  31. if not sp.exists() or not cp.exists():
  32. unmatched.append((folder.name, 'missing file')); continue
  33. try:
  34. sd = json.loads(sp.read_text(encoding='utf-8'))
  35. cd = json.loads(cp.read_text(encoding='utf-8'))
  36. except Exception as e:
  37. unmatched.append((folder.name, f'parse: {e}')); continue
  38. req_text = sd.get('requirement') or cd.get('requirement')
  39. orig_req_id = v0_reqs.get(req_text)
  40. if not orig_req_id:
  41. unmatched.append((folder.name, f'no match: {req_text[:40]!r}...')); continue
  42. matched += 1
  43. caps = cd.get('extracted_capabilities', [])
  44. cap_total += len(caps)
  45. cap_is_new_true += sum(1 for c in caps if c.get('is_new'))
  46. strats = sd.get('strategies', [])
  47. strat_total += len(strats)
  48. for st in strats:
  49. for ph in st.get('workflow_outline', []) or []:
  50. if isinstance(ph, dict):
  51. strat_cap_links += len(ph.get('capabilities', []) or [])
  52. rc = folder / 'raw_cases'
  53. if rc.exists():
  54. for cf in rc.iterdir():
  55. try:
  56. cj = json.loads(cf.read_text(encoding='utf-8'))
  57. res_total += len(cj.get('cases', []))
  58. except Exception:
  59. pass
  60. print(f'\n匹配到 v0 req: {matched}/{len(folders)}', flush=True)
  61. print(f'未匹配: {len(unmatched)}', flush=True)
  62. for n, why in unmatched[:10]:
  63. print(f' {n}: {why}', flush=True)
  64. print(f'\n总计:', flush=True)
  65. print(f' extracted_capabilities: {cap_total} (其中 is_new=true: {cap_is_new_true})', flush=True)
  66. print(f' strategies: {strat_total} (平均 {strat_total/max(matched,1):.1f}/req)', flush=True)
  67. print(f' strategy→cap 链接: {strat_cap_links}', flush=True)
  68. print(f' raw_cases 案例: {res_total} (平均 {res_total/max(matched,1):.1f}/req)', flush=True)
# Run the probe only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()