fill_knowledge_tools.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. #!/usr/bin/env python3
  2. """
  3. 补全 knowledge.tools 的反向映射:
  4. 从 tool_table 的 tool_knowledge / case_knowledge / process_knowledge 提取知识引用,
  5. 反向构建 knowledge_id -> [tool_ids] 的映射,写入 knowledge.tools 字段。
  6. 用法:
  7. python fill_knowledge_tools.py # 正常执行
  8. python fill_knowledge_tools.py --dry-run # 仅预览,不写入数据库
  9. """
  10. import os
  11. import sys
  12. import io
  13. import json
  14. import psycopg2
  15. from psycopg2.extras import RealDictCursor
  16. from dotenv import load_dotenv
  17. # 解决 Windows 终端编码问题
  18. sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
  19. sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
  20. _dir = os.path.dirname(os.path.abspath(__file__))
  21. _root = os.path.normpath(os.path.join(_dir, '..', '..'))
  22. load_dotenv(os.path.join(_root, '.env'))
  23. def get_conn():
  24. conn = psycopg2.connect(
  25. host=os.getenv('KNOWHUB_DB'),
  26. port=int(os.getenv('KNOWHUB_PORT', 5432)),
  27. user=os.getenv('KNOWHUB_USER'),
  28. password=os.getenv('KNOWHUB_PASSWORD'),
  29. database=os.getenv('KNOWHUB_DB_NAME'),
  30. connect_timeout=10
  31. )
  32. conn.autocommit = True
  33. return conn
  34. def parse_jsonb_list(val):
  35. """将 JSONB 字段安全解析为 list"""
  36. if isinstance(val, list):
  37. return val
  38. if isinstance(val, str):
  39. try:
  40. parsed = json.loads(val)
  41. return parsed if isinstance(parsed, list) else []
  42. except json.JSONDecodeError:
  43. return []
  44. return []
  45. def main():
  46. dry_run = '--dry-run' in sys.argv
  47. conn = get_conn()
  48. cur = conn.cursor(cursor_factory=RealDictCursor)
  49. print("Connected.\n")
  50. # ── Step 1: 加载所有工具及其知识引用 ──
  51. print("=== [1] Loading tool_table knowledge references ===")
  52. cur.execute("""
  53. SELECT id, name, tool_knowledge, case_knowledge, process_knowledge
  54. FROM tool_table
  55. ORDER BY id
  56. """)
  57. tools = cur.fetchall()
  58. print(f" Found {len(tools)} tools")
  59. # ── Step 2: 构建 knowledge_id -> [tool_ids] 反向映射 ──
  60. print("\n=== [2] Building reverse mapping: knowledge_id -> tool_ids ===")
  61. # knowledge_id -> set of tool_ids
  62. knowledge_to_tools = {}
  63. # 统计
  64. tool_with_refs = 0
  65. total_refs = 0
  66. for tool in tools:
  67. tool_id = tool['id']
  68. tool_name = tool['name']
  69. # 合并三种知识引用
  70. tk = parse_jsonb_list(tool['tool_knowledge'])
  71. ck = parse_jsonb_list(tool['case_knowledge'])
  72. pk = parse_jsonb_list(tool['process_knowledge'])
  73. all_knowledge_ids = set(tk + ck + pk)
  74. if not all_knowledge_ids:
  75. continue
  76. tool_with_refs += 1
  77. total_refs += len(all_knowledge_ids)
  78. for kid in all_knowledge_ids:
  79. if kid not in knowledge_to_tools:
  80. knowledge_to_tools[kid] = set()
  81. knowledge_to_tools[kid].add(tool_id)
  82. print(f" Tools with knowledge refs: {tool_with_refs}")
  83. print(f" Total tool->knowledge references: {total_refs}")
  84. print(f" Unique knowledge IDs referenced: {len(knowledge_to_tools)}")
  85. # ── Step 3: 验证知识 ID 是否存在于 knowledge 表 ──
  86. print("\n=== [3] Validating knowledge IDs ===")
  87. cur.execute("SELECT id FROM knowledge")
  88. existing_knowledge_ids = {r['id'] for r in cur.fetchall()}
  89. print(f" Knowledge entries in DB: {len(existing_knowledge_ids)}")
  90. missing = set(knowledge_to_tools.keys()) - existing_knowledge_ids
  91. valid = set(knowledge_to_tools.keys()) & existing_knowledge_ids
  92. if missing:
  93. print(f" [!] {len(missing)} knowledge IDs referenced but not found in DB:")
  94. for kid in sorted(missing)[:10]:
  95. tools_ref = sorted(knowledge_to_tools[kid])
  96. print(f" {kid} (referenced by {tools_ref[:3]}{'...' if len(tools_ref)>3 else ''})")
  97. if len(missing) > 10:
  98. print(f" ... and {len(missing) - 10} more")
  99. else:
  100. print(f" All {len(valid)} referenced knowledge IDs exist in DB")
  101. # ── Step 4: 写入 knowledge.tools ──
  102. print(f"\n=== [4] Updating knowledge.tools {'(DRY RUN)' if dry_run else ''} ===")
  103. updated = 0
  104. skipped = 0
  105. for kid in sorted(valid):
  106. tool_ids = sorted(knowledge_to_tools[kid])
  107. if not dry_run:
  108. cur.execute(
  109. "UPDATE knowledge SET tools = %s WHERE id = %s",
  110. (json.dumps(tool_ids), kid)
  111. )
  112. updated += 1
  113. for kid in sorted(missing):
  114. skipped += 1
  115. print(f" Updated: {updated} knowledge entries")
  116. print(f" Skipped: {skipped} (knowledge ID not found)")
  117. # ── Step 5: 验证 ──
  118. if not dry_run:
  119. print("\n=== [5] Verification ===")
  120. cur.execute("""
  121. SELECT COUNT(*) as cnt FROM knowledge
  122. WHERE tools != '[]'::jsonb AND tools IS NOT NULL
  123. """)
  124. filled = cur.fetchone()['cnt']
  125. print(f" Knowledge entries with non-empty tools: {filled}")
  126. print("\n -- Sample (first 10) --")
  127. cur.execute("""
  128. SELECT id, tools
  129. FROM knowledge
  130. WHERE tools != '[]'::jsonb AND tools IS NOT NULL
  131. ORDER BY id LIMIT 10
  132. """)
  133. for r in cur.fetchall():
  134. tools_list = parse_jsonb_list(r['tools'])
  135. print(f" {r['id']}: tools={tools_list}")
  136. # 统计分布
  137. cur.execute("""
  138. SELECT jsonb_array_length(tools) as tool_count, COUNT(*) as cnt
  139. FROM knowledge
  140. WHERE tools != '[]'::jsonb AND tools IS NOT NULL
  141. GROUP BY jsonb_array_length(tools)
  142. ORDER BY tool_count
  143. """)
  144. print("\n -- Distribution: how many tools per knowledge --")
  145. for r in cur.fetchall():
  146. print(f" {r['tool_count']} tool(s): {r['cnt']} knowledge entries")
  147. # ── Summary ──
  148. print(f"\n=== Summary ===")
  149. print(f" Knowledge entries updated: {updated}")
  150. print(f" Missing knowledge IDs: {skipped}")
  151. if dry_run:
  152. print(f"\n (DRY RUN mode - no changes written to database)")
  153. cur.close()
  154. conn.close()
  155. print("\nDone.")
  156. if __name__ == '__main__':
  157. main()