clean_invalid_knowledge_refs.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. #!/usr/bin/env python3
  2. """
  3. 清理 tool_table 中无效的 knowledge 引用
  4. """
  5. import os
  6. import json
  7. import psycopg2
  8. from psycopg2.extras import RealDictCursor
  9. from dotenv import load_dotenv
  10. load_dotenv()
  11. def clean_invalid_refs():
  12. conn = psycopg2.connect(
  13. host=os.getenv('KNOWHUB_DB'),
  14. port=int(os.getenv('KNOWHUB_PORT', 5432)),
  15. user=os.getenv('KNOWHUB_USER'),
  16. password=os.getenv('KNOWHUB_PASSWORD'),
  17. database=os.getenv('KNOWHUB_DB_NAME')
  18. )
  19. conn.autocommit = False
  20. cur = conn.cursor(cursor_factory=RealDictCursor)
  21. # 查所有工具
  22. cur.execute("SELECT id, name, knowledge FROM tool_table WHERE knowledge IS NOT NULL AND knowledge::text != '[]'")
  23. tools = cur.fetchall()
  24. print(f"检查 {len(tools)} 个工具...")
  25. cleaned_count = 0
  26. total_removed = 0
  27. for tool in tools:
  28. knowledge_ids = tool['knowledge'] if isinstance(tool['knowledge'], list) else json.loads(tool['knowledge'] or '[]')
  29. if not knowledge_ids:
  30. continue
  31. # 检查哪些存在
  32. cur.execute('SELECT id FROM knowledge WHERE id = ANY(%s)', (knowledge_ids,))
  33. existing = {r['id'] for r in cur.fetchall()}
  34. # 过滤出有效的
  35. valid_ids = [kid for kid in knowledge_ids if kid in existing]
  36. if len(valid_ids) < len(knowledge_ids):
  37. removed = len(knowledge_ids) - len(valid_ids)
  38. print(f"✅ {tool['name']}: 移除 {removed} 个无效引用,保留 {len(valid_ids)} 个")
  39. cur.execute(
  40. "UPDATE tool_table SET knowledge = %s WHERE id = %s",
  41. (json.dumps(valid_ids), tool['id'])
  42. )
  43. cleaned_count += 1
  44. total_removed += removed
  45. conn.commit()
  46. print(f"\n完成:清理了 {cleaned_count} 个工具,共移除 {total_removed} 个无效引用")
  47. conn.close()
  48. if __name__ == "__main__":
  49. clean_invalid_refs()