| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 |
- #!/usr/bin/env python3
- """
- 清理 resources 表中 metadata.knowledge_ids 的无效引用
- """
- import os
- import json
- import psycopg2
- from psycopg2.extras import RealDictCursor
- from dotenv import load_dotenv
- load_dotenv()
- def clean_resource_knowledge_refs():
- conn = psycopg2.connect(
- host=os.getenv('KNOWHUB_DB'),
- port=int(os.getenv('KNOWHUB_PORT', 5432)),
- user=os.getenv('KNOWHUB_USER'),
- password=os.getenv('KNOWHUB_PASSWORD'),
- database=os.getenv('KNOWHUB_DB_NAME')
- )
- conn.autocommit = False
- cur = conn.cursor(cursor_factory=RealDictCursor)
- # 查所有有 knowledge_ids 的资源
- cur.execute("SELECT id, title, metadata FROM resources WHERE metadata IS NOT NULL")
- resources = cur.fetchall()
- print(f"检查 {len(resources)} 个资源...")
- cleaned_count = 0
- total_removed = 0
- for res in resources:
- metadata = res['metadata'] if isinstance(res['metadata'], dict) else json.loads(res['metadata'] or '{}')
- knowledge_ids = metadata.get('knowledge_ids', [])
- if not knowledge_ids:
- continue
- # 检查哪些存在
- cur.execute('SELECT id FROM knowledge WHERE id = ANY(%s)', (knowledge_ids,))
- existing = {r['id'] for r in cur.fetchall()}
- # 过滤出有效的
- valid_ids = [kid for kid in knowledge_ids if kid in existing]
- if len(valid_ids) < len(knowledge_ids):
- removed = len(knowledge_ids) - len(valid_ids)
- print(f"✅ {res['title']}: 移除 {removed} 个无效引用,保留 {len(valid_ids)} 个")
- metadata['knowledge_ids'] = valid_ids
- cur.execute(
- "UPDATE resources SET metadata = %s WHERE id = %s",
- (json.dumps(metadata), res['id'])
- )
- cleaned_count += 1
- total_removed += removed
- conn.commit()
- print(f"\n完成:清理了 {cleaned_count} 个资源,共移除 {total_removed} 个无效引用")
- conn.close()
- if __name__ == "__main__":
- clean_resource_knowledge_refs()
|