#!/usr/bin/env python3 """ 清理 resources 表中 metadata.knowledge_ids 的无效引用 """ import os import json import psycopg2 from psycopg2.extras import RealDictCursor from dotenv import load_dotenv load_dotenv() def clean_resource_knowledge_refs(): conn = psycopg2.connect( host=os.getenv('KNOWHUB_DB'), port=int(os.getenv('KNOWHUB_PORT', 5432)), user=os.getenv('KNOWHUB_USER'), password=os.getenv('KNOWHUB_PASSWORD'), database=os.getenv('KNOWHUB_DB_NAME') ) conn.autocommit = False cur = conn.cursor(cursor_factory=RealDictCursor) # 查所有有 knowledge_ids 的资源 cur.execute("SELECT id, title, metadata FROM resources WHERE metadata IS NOT NULL") resources = cur.fetchall() print(f"检查 {len(resources)} 个资源...") cleaned_count = 0 total_removed = 0 for res in resources: metadata = res['metadata'] if isinstance(res['metadata'], dict) else json.loads(res['metadata'] or '{}') knowledge_ids = metadata.get('knowledge_ids', []) if not knowledge_ids: continue # 检查哪些存在 cur.execute('SELECT id FROM knowledge WHERE id = ANY(%s)', (knowledge_ids,)) existing = {r['id'] for r in cur.fetchall()} # 过滤出有效的 valid_ids = [kid for kid in knowledge_ids if kid in existing] if len(valid_ids) < len(knowledge_ids): removed = len(knowledge_ids) - len(valid_ids) print(f"✅ {res['title']}: 移除 {removed} 个无效引用,保留 {len(valid_ids)} 个") metadata['knowledge_ids'] = valid_ids cur.execute( "UPDATE resources SET metadata = %s WHERE id = %s", (json.dumps(metadata), res['id']) ) cleaned_count += 1 total_removed += removed conn.commit() print(f"\n完成:清理了 {cleaned_count} 个资源,共移除 {total_removed} 个无效引用") conn.close() if __name__ == "__main__": clean_resource_knowledge_refs()