#!/usr/bin/env python3
"""
Fix the embedding migration error in the ``knowledge`` table.

The old embeddings were generated from ``task``, so they belong in
``task_embedding`` rather than ``content_embedding``.

1. Copy ``content_embedding`` into ``task_embedding`` (only where
   ``task_embedding`` is empty).
2. Clear ``content_embedding`` (to be regenerated by fill_embeddings.py).

Both updates run inside a single transaction, so a failure part-way through
leaves the table untouched instead of half-migrated.

NOTE: step 2 clears every non-NULL ``content_embedding``. Re-running this
script after ``content_embedding`` has been regenerated would clear the
regenerated values as well.
"""
import os

import psycopg2
from dotenv import load_dotenv

# The .env file is looked up two directories above this script.
_dir = os.path.dirname(os.path.abspath(__file__))
_root = os.path.normpath(os.path.join(_dir, '..', '..'))
load_dotenv(os.path.join(_root, '.env'))

conn = psycopg2.connect(
    host=os.getenv('KNOWHUB_DB'),
    port=int(os.getenv('KNOWHUB_PORT', 5432)),
    user=os.getenv('KNOWHUB_USER'),
    password=os.getenv('KNOWHUB_PASSWORD'),
    database=os.getenv('KNOWHUB_DB_NAME'),
    connect_timeout=10
)
try:
    # The cursor context manager closes the cursor even if a statement fails.
    with conn.cursor() as cur:
        print("Connected.\n")

        # `with conn` commits when the block succeeds and rolls back when it
        # raises (it does not close the connection), which makes the two
        # UPDATE statements atomic.
        with conn:
            # Step 1: copy content_embedding into task_embedding
            # (only where task_embedding is empty).
            print("[1] Moving content_embedding -> task_embedding ...")
            cur.execute("""
                UPDATE knowledge
                SET task_embedding = content_embedding
                WHERE task_embedding IS NULL AND content_embedding IS NOT NULL
            """)
            print(f" Moved {cur.rowcount} rows.")

            # Step 2: clear content_embedding so fill_embeddings.py can
            # regenerate it from content.
            print("[2] Clearing content_embedding ...")
            cur.execute("UPDATE knowledge SET content_embedding = NULL WHERE content_embedding IS NOT NULL")
            print(f" Cleared {cur.rowcount} rows.")

        # Verify: read-only checks, run after the migration has been
        # committed so a failure here cannot undo it.
        print("\n[Verify]")
        cur.execute("SELECT COUNT(*) FROM knowledge WHERE task_embedding IS NOT NULL")
        print(f" task_embedding: {cur.fetchone()[0]} rows have data")
        cur.execute("SELECT COUNT(*) FROM knowledge WHERE content_embedding IS NOT NULL")
        print(f" content_embedding: {cur.fetchone()[0]} rows have data (should be 0)")
finally:
    # Always release the connection, including on error paths.
    conn.close()

print("\nDone. Now run fill_embeddings.py to generate content_embedding.")