#!/usr/bin/env python3
"""
Fix the embedding migration mistake in the ``knowledge`` table.

The legacy embeddings were generated from ``task``, so they should have been
migrated into ``task_embedding`` rather than ``content_embedding``.

1. Copy ``content_embedding`` into ``task_embedding`` (only where
   ``task_embedding`` is empty).
2. Clear ``content_embedding`` (fill_embeddings.py regenerates it afterwards).

Both steps run inside a single transaction, so a failure part-way through
is rolled back instead of leaving the table half-migrated.
"""
import os
from typing import Any, Dict, Mapping, Optional

_dir = os.path.dirname(os.path.abspath(__file__))
_root = os.path.normpath(os.path.join(_dir, '..', '..'))

# Settings that must be present before we connect. psycopg2 ignores keyword
# arguments whose value is None, so a missing host or database name would
# silently fall back to libpq's defaults and these blanket UPDATEs could hit
# an unintended database. The password is deliberately not listed: libpq can
# obtain it from other sources (e.g. a password file).
_REQUIRED_ENV = ('KNOWHUB_DB', 'KNOWHUB_USER', 'KNOWHUB_DB_NAME')


def build_connection_params(
    env: Optional[Mapping[str, str]] = None,
) -> Dict[str, Any]:
    """Build the keyword arguments for ``psycopg2.connect``.

    Args:
        env: Mapping to read the ``KNOWHUB_*`` settings from. Defaults to
            ``os.environ``.

    Returns:
        A dict with ``host``, ``port``, ``user``, ``password``, ``database``
        and ``connect_timeout`` keys.

    Raises:
        RuntimeError: If a required setting is unset or empty.
        ValueError: If ``KNOWHUB_PORT`` is set but is not an integer.
    """
    if env is None:
        env = os.environ

    missing = [name for name in _REQUIRED_ENV if not env.get(name)]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    return {
        'host': env['KNOWHUB_DB'],
        # `or` also covers an empty-string KNOWHUB_PORT, which int() rejects.
        'port': int(env.get('KNOWHUB_PORT') or 5432),
        'user': env['KNOWHUB_USER'],
        'password': env.get('KNOWHUB_PASSWORD'),
        'database': env['KNOWHUB_DB_NAME'],
        'connect_timeout': 10,
    }


def run_migration(cur: Any) -> None:
    """Run both migration steps and print verification counts.

    Args:
        cur: An open DB-API cursor on the database holding ``knowledge``.
            Transaction control (commit / rollback) is left to the caller.
    """
    # Step 1: copy content_embedding into task_embedding, but only where
    # task_embedding is still empty so existing values are never overwritten.
    print("[1] Moving content_embedding -> task_embedding ...")
    cur.execute("""
        UPDATE knowledge
        SET task_embedding = content_embedding
        WHERE task_embedding IS NULL AND content_embedding IS NOT NULL
    """)
    print(f" Moved {cur.rowcount} rows.")

    # Step 2: clear content_embedding so fill_embeddings.py can regenerate it
    # from the actual content.
    print("[2] Clearing content_embedding ...")
    cur.execute(
        "UPDATE knowledge SET content_embedding = NULL "
        "WHERE content_embedding IS NOT NULL"
    )
    print(f" Cleared {cur.rowcount} rows.")

    # Verify: report how many rows carry each embedding after the migration.
    print("\n[Verify]")
    cur.execute("SELECT COUNT(*) FROM knowledge WHERE task_embedding IS NOT NULL")
    print(f" task_embedding: {cur.fetchone()[0]} rows have data")
    cur.execute("SELECT COUNT(*) FROM knowledge WHERE content_embedding IS NOT NULL")
    print(f" content_embedding: {cur.fetchone()[0]} rows have data (should be 0)")


def main() -> None:
    """Load ``.env``, connect, and run the migration in one transaction."""
    # Imported here rather than at module level so the helpers above can be
    # imported (e.g. for testing) without the database driver installed.
    import psycopg2
    from dotenv import load_dotenv

    load_dotenv(os.path.join(_root, '.env'))

    conn = psycopg2.connect(**build_connection_params())
    try:
        print("Connected.\n")
        # `with conn` makes the block one transaction: it commits on success
        # and rolls back if anything inside raises. It does not close the
        # connection, hence the surrounding try/finally.
        with conn:
            with conn.cursor() as cur:
                run_migration(cur)
    finally:
        conn.close()

    print("\nDone. Now run fill_embeddings.py to generate content_embedding.")


if __name__ == '__main__':
    main()