clean_resource_knowledge_refs.py 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. #!/usr/bin/env python3
  2. """
  3. 清理 resources 表中 metadata.knowledge_ids 的无效引用
  4. """
  5. import os
  6. import json
  7. import psycopg2
  8. from psycopg2.extras import RealDictCursor
  9. from dotenv import load_dotenv
  10. load_dotenv()
  11. def clean_resource_knowledge_refs():
  12. conn = psycopg2.connect(
  13. host=os.getenv('KNOWHUB_DB'),
  14. port=int(os.getenv('KNOWHUB_PORT', 5432)),
  15. user=os.getenv('KNOWHUB_USER'),
  16. password=os.getenv('KNOWHUB_PASSWORD'),
  17. database=os.getenv('KNOWHUB_DB_NAME')
  18. )
  19. conn.autocommit = False
  20. cur = conn.cursor(cursor_factory=RealDictCursor)
  21. # 查所有有 knowledge_ids 的资源
  22. cur.execute("SELECT id, title, metadata FROM resources WHERE metadata IS NOT NULL")
  23. resources = cur.fetchall()
  24. print(f"检查 {len(resources)} 个资源...")
  25. cleaned_count = 0
  26. total_removed = 0
  27. for res in resources:
  28. metadata = res['metadata'] if isinstance(res['metadata'], dict) else json.loads(res['metadata'] or '{}')
  29. knowledge_ids = metadata.get('knowledge_ids', [])
  30. if not knowledge_ids:
  31. continue
  32. # 检查哪些存在
  33. cur.execute('SELECT id FROM knowledge WHERE id = ANY(%s)', (knowledge_ids,))
  34. existing = {r['id'] for r in cur.fetchall()}
  35. # 过滤出有效的
  36. valid_ids = [kid for kid in knowledge_ids if kid in existing]
  37. if len(valid_ids) < len(knowledge_ids):
  38. removed = len(knowledge_ids) - len(valid_ids)
  39. print(f"✅ {res['title']}: 移除 {removed} 个无效引用,保留 {len(valid_ids)} 个")
  40. metadata['knowledge_ids'] = valid_ids
  41. cur.execute(
  42. "UPDATE resources SET metadata = %s WHERE id = %s",
  43. (json.dumps(metadata), res['id'])
  44. )
  45. cleaned_count += 1
  46. total_removed += removed
  47. conn.commit()
  48. print(f"\n完成:清理了 {cleaned_count} 个资源,共移除 {total_removed} 个无效引用")
  49. conn.close()
  50. if __name__ == "__main__":
  51. clean_resource_knowledge_refs()