import_initial_data.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148
  1. #!/usr/bin/env python3
  2. """
  3. 填充 atomic_capability 和 requirement_table 的初始数据
  4. 数据来源:
  5. - atomic_capabilities.md -> atomic_capability
  6. - requirements_sorted.json -> requirement_table
  7. """
  8. import os, sys, json
  9. import psycopg2
  10. from psycopg2.extras import RealDictCursor
  11. from dotenv import load_dotenv
  12. _dir = os.path.dirname(os.path.abspath(__file__))
  13. _root = os.path.normpath(os.path.join(_dir, '..', '..'))
  14. load_dotenv(os.path.join(_root, '.env'))
  15. def get_conn():
  16. return psycopg2.connect(
  17. host=os.getenv('KNOWHUB_DB'),
  18. port=int(os.getenv('KNOWHUB_PORT', 5432)),
  19. user=os.getenv('KNOWHUB_USER'),
  20. password=os.getenv('KNOWHUB_PASSWORD'),
  21. database=os.getenv('KNOWHUB_DB_NAME'),
  22. connect_timeout=10
  23. )
  24. # ── 解析 atomic_capabilities.md ──
  25. def parse_caps(path):
  26. caps = []
  27. cur = None
  28. with open(path, 'r', encoding='utf-8') as f:
  29. for line in f:
  30. line = line.rstrip()
  31. if line.startswith('### CAP-'):
  32. if cur:
  33. caps.append(cur)
  34. parts = line.split(':', 1)
  35. cap_id = parts[0].replace('### ', '').strip()
  36. cap_name = parts[1].strip() if len(parts) > 1 else ''
  37. cur = {'id': cap_id, 'name': cap_name, 'criterion': '', 'description': '', 'implements': {}, 'tools': [], 'source_knowledge': [], 'requirements': []}
  38. elif cur:
  39. if line.startswith('- **'):
  40. val = line.split(':', 1)[1].strip() if ':' in line else ''
  41. if '功能描述' in line:
  42. cur['description'] = val
  43. elif '判定标准' in line:
  44. cur['criterion'] = val
  45. elif line.strip().startswith('- ComfyUI') or line.strip().startswith('- FLUX') or \
  46. line.strip().startswith('- Midjourney') or line.strip().startswith('- Nano Banana') or \
  47. line.strip().startswith('- Seedream'):
  48. t = line.strip().lstrip('- ')
  49. name = t.split(':')[0].strip()
  50. desc = t.split(':', 1)[1].strip() if ':' in t else ''
  51. cur['implements'][name] = desc
  52. if cur:
  53. caps.append(cur)
  54. return caps
  55. # ── 解析 requirements_sorted.json ──
  56. def parse_reqs(path):
  57. with open(path, 'r', encoding='utf-8') as f:
  58. data = json.load(f)
  59. out = []
  60. for r in data.get('requirements', []):
  61. nodes = [{'node_name': n, 'posts': r.get('source_posts', [])} for n in r.get('source_nodes', [])]
  62. status = '已满足' if r.get('match_status') == '完全满足' else '未满足'
  63. parts = []
  64. if r.get('capability_combination'): parts.append(r['capability_combination'])
  65. if r.get('research_note'): parts.append(r['research_note'])
  66. out.append({
  67. 'id': r['requirement_id'],
  68. 'description': r['requirement_text'],
  69. 'atomics': r.get('matched_capabilities', []),
  70. 'source_nodes': nodes,
  71. 'status': status,
  72. 'match_result': '\n'.join(parts),
  73. })
  74. return out
  75. # ── 主流程 ──
  76. def main():
  77. conn = get_conn()
  78. conn.autocommit = True
  79. cur = conn.cursor(cursor_factory=RealDictCursor)
  80. print('Connected.')
  81. # 1. 填充 atomic_capability
  82. md = os.path.join(_root, 'examples', 'tool_research', 'atomic_capabilities.md')
  83. print(f'\n--- atomic_capability ---')
  84. if not os.path.exists(md):
  85. print(f'File not found: {md}')
  86. else:
  87. caps = parse_caps(md)
  88. ok = 0
  89. for c in caps:
  90. try:
  91. cur.execute("""
  92. INSERT INTO atomic_capability (id, name, criterion, description, requirements, implements, tools, source_knowledge)
  93. VALUES (%s,%s,%s,%s,%s,%s,%s,%s)
  94. ON CONFLICT (id) DO UPDATE SET
  95. name=EXCLUDED.name, criterion=EXCLUDED.criterion, description=EXCLUDED.description, implements=EXCLUDED.implements
  96. """, (c['id'], c['name'], c['criterion'], c['description'],
  97. json.dumps(c['requirements']), json.dumps(c['implements']),
  98. json.dumps(c['tools']), json.dumps(c['source_knowledge'])))
  99. ok += 1
  100. except Exception as e:
  101. print(f' FAIL {c["id"]}: {e}')
  102. print(f'Inserted/updated {ok}/{len(caps)} capabilities.')
  103. # 2. 填充 requirement_table
  104. jf = os.path.join(_root, 'examples', 'tool_research', 'requirements_sorted.json')
  105. print(f'\n--- requirement_table ---')
  106. if not os.path.exists(jf):
  107. print(f'File not found: {jf}')
  108. else:
  109. reqs = parse_reqs(jf)
  110. ok = 0
  111. for r in reqs:
  112. try:
  113. cur.execute("""
  114. INSERT INTO requirement_table (id, description, atomics, source_nodes, status, match_result)
  115. VALUES (%s,%s,%s,%s,%s,%s)
  116. ON CONFLICT (id) DO UPDATE SET
  117. description=EXCLUDED.description, atomics=EXCLUDED.atomics,
  118. source_nodes=EXCLUDED.source_nodes, status=EXCLUDED.status, match_result=EXCLUDED.match_result
  119. """, (r['id'], r['description'], json.dumps(r['atomics']),
  120. json.dumps(r['source_nodes']), r['status'], r['match_result']))
  121. ok += 1
  122. except Exception as e:
  123. print(f' FAIL {r["id"]}: {e}')
  124. print(f'Inserted/updated {ok}/{len(reqs)} requirements.')
  125. # 3. 验证
  126. print('\n--- verify ---')
  127. for t in ['atomic_capability', 'requirement_table']:
  128. cur.execute(f'SELECT COUNT(*) as cnt FROM {t}')
  129. print(f'{t}: {cur.fetchone()["cnt"]} rows')
  130. cur.close()
  131. conn.close()
  132. print('\nDone.')
  133. if __name__ == '__main__':
  134. main()