migrate_knowledge.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. #!/usr/bin/env python3
  2. """
  3. 补全 knowledge 表的剩余变更:
  4. 1. 添加 tools 列
  5. 2. 添加 task_embedding, content_embedding 列
  6. 3. 将 embedding 数据迁移到 content_embedding
  7. 4. 删除旧 embedding 列
  8. 注意:步骤3涉及 387 条 x 1536维向量的 UPDATE,可能需要几十秒。
  9. 如果中途卡住不要 kill 进程,等它跑完,否则会死锁。
  10. """
  11. import os, psycopg2
  12. from dotenv import load_dotenv
  13. _dir = os.path.dirname(os.path.abspath(__file__))
  14. _root = os.path.normpath(os.path.join(_dir, '..', '..'))
  15. load_dotenv(os.path.join(_root, '.env'))
  16. conn = psycopg2.connect(
  17. host=os.getenv('KNOWHUB_DB'),
  18. port=int(os.getenv('KNOWHUB_PORT', 5432)),
  19. user=os.getenv('KNOWHUB_USER'),
  20. password=os.getenv('KNOWHUB_PASSWORD'),
  21. database=os.getenv('KNOWHUB_DB_NAME'),
  22. connect_timeout=10
  23. )
  24. conn.autocommit = True
  25. cur = conn.cursor()
  26. print("Connected.\n")
  27. def col_exists(table, column):
  28. cur.execute("""
  29. SELECT 1 FROM pg_catalog.pg_attribute a
  30. JOIN pg_catalog.pg_class c ON a.attrelid = c.oid
  31. JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid
  32. WHERE c.relname = %s AND n.nspname = 'public'
  33. AND a.attname = %s AND a.attnum > 0 AND NOT a.attisdropped
  34. """, (table, column))
  35. return cur.fetchone() is not None
  36. # Step 1: 添加 tools 列
  37. print("[1] Add tools column...")
  38. if not col_exists('knowledge', 'tools'):
  39. cur.execute("ALTER TABLE knowledge ADD COLUMN tools JSONB DEFAULT '[]'")
  40. print(" + Added tools")
  41. else:
  42. print(" . tools already exists")
  43. # Step 2: 添加 task_embedding 和 content_embedding
  44. print("[2] Add task_embedding, content_embedding columns...")
  45. if not col_exists('knowledge', 'task_embedding'):
  46. cur.execute("ALTER TABLE knowledge ADD COLUMN task_embedding real[]")
  47. print(" + Added task_embedding")
  48. else:
  49. print(" . task_embedding already exists")
  50. if not col_exists('knowledge', 'content_embedding'):
  51. cur.execute("ALTER TABLE knowledge ADD COLUMN content_embedding real[]")
  52. print(" + Added content_embedding")
  53. else:
  54. print(" . content_embedding already exists")
  55. # Step 3: 迁移 embedding -> content_embedding
  56. if col_exists('knowledge', 'embedding'):
  57. print("[3] Migrating embedding -> content_embedding (this may take a while)...")
  58. cur.execute("""
  59. UPDATE knowledge
  60. SET content_embedding = embedding
  61. WHERE content_embedding IS NULL AND embedding IS NOT NULL
  62. """)
  63. print(" Migrated.")
  64. # Step 4: 删除旧 embedding
  65. print("[4] Dropping old embedding column...")
  66. cur.execute("ALTER TABLE knowledge DROP COLUMN embedding")
  67. print(" Dropped.")
  68. else:
  69. print("[3] embedding column already removed, skip migration.")
  70. print("[4] skip.")
  71. print("\nDone. Run check_table_structure.py to verify.")
  72. cur.close()
  73. conn.close()