#!/usr/bin/env python3 """ 合并 tao_dev_1 版本内的重复 capability,并删除 VCAP 占位条目。 策略: 1. 对每个 (canonical, members) 簇: 对 5 个 junction (requirement_capability / capability_tool / capability_knowledge / capability_resource / strategy_capability) - 先删除会和 canonical 冲突的 member 引用(避免 PK 冲突) - 再 UPDATE member → canonical 2. 删除被合并的 member 行 + VCAP 占位行 3. 全程打印进度(flush=True),不管道给 tail 用法: python knowhub/scripts/merge_capabilities.py --dry-run # 只打印计划 python knowhub/scripts/merge_capabilities.py --execute # 真正执行 """ import argparse import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent.parent)) from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore # canonical_id -> [member_ids to merge into canonical] MERGE_CLUSTERS = { # ── Round C (跨版本:v0 foundation 吸收 tao_dev_1 同义条) ──── # C1 动画帧/文生视频 "CAP-009": ["CAP-a95da8d7"], # C2 图像上色 "CAP-007": ["CAP-75ba0eac"], # C3 图像细节增强与高清放大 "CAP-006": ["CAP-767e7848"], # C4 草图/3D → 效果图 "CAP-019": ["CAP-c7c8479a"], # C5 图像内文字翻译/重写替换 "CAP-021": ["CAP-44ada771"], # C6 主体/角色一致性(borderline 合并) "CAP-003": ["CAP-1c71b52e"], # ── Round 2 ────────────────────────────────────────── # B1 室内多光源 "CAP-5fb6dd66": ["CAP-aa75f198", "CAP-4da21eb2"], # B2 室内色调 "CAP-3bf86ae3": ["CAP-20e2e0e6"], # B3 室内自然材质 "CAP-56368e3a": ["CAP-e8b46ca2"], # B4 电影级光照 "CAP-aaaef688": ["CAP-9658a2d9"], # B5 虚拟相机运动 "CAP-49175b92": ["CAP-943f7709"], # B6 结构化 Prompt 框架 "CAP-5b000814": ["CAP-f0d52dd1", "CAP-28983111"], # B7 多景别 prompt "CAP-4f15a85f": ["CAP-1da1bf7a", "CAP-0e64f839"], # B8 微距/极端特写 "CAP-26100ea8": ["CAP-b93f5cf9", "CAP-2382899f"], # B9 图文卡片自动排版 "CAP-87ba3b7d": ["CAP-25c0eef3", "CAP-a648fb4a"], # B10 海报多元素排版 "CAP-562d91c1": ["CAP-2df55478", "CAP-cee93f12", "CAP-dba33b2d", "CAP-02b573f4"], # B11 空间透视景深 "CAP-3c49ff0a": ["CAP-edb949a3", "CAP-ee3bac1d", "CAP-19e77a40", "CAP-eb5e0ad2"], # B12 产品摄影光照 "CAP-c2c42fc7": ["CAP-be42acba"], # B13 材质质感 "CAP-0dc2a15b": ["CAP-7f63a4f5"], # B14 霓虹/光晕/发光 "CAP-a35e7966": ["CAP-ec490469", "CAP-ef8c3e8b", "CAP-9dea4396"], # B15 数据图表生成 "CAP-3ee6c232": ["CAP-84a60ca0", "CAP-d02af2bd"], # B16 复古印刷质感 "CAP-9359b49f": ["CAP-59414657", "CAP-718daa01"], # B17 超现实空间 "CAP-1f898cd9": ["CAP-d1a4709c"], # B18 文字透视融合 "CAP-bd4828fc": ["CAP-f3d42bb0"], # B19 拟人化角色 "CAP-e962c3ef": ["CAP-93b35d43", "CAP-5c7abd0c"], # B20 镜面水面反射 "CAP-3b51102e": ["CAP-bb547523"], # B21 粒子光效 "CAP-8467736a": ["CAP-a6c6d4fc"], # B22 自然光时刻 "CAP-e8a77f70": ["CAP-7f2d1a72"], # B23 经典布光模式 "CAP-ed4b506e": ["CAP-08c54a3c"], # B24 几何蒙版裁剪 "CAP-47d6893f": ["CAP-7889940a"], # B25 批量系列海报 "CAP-832e80ac": ["CAP-bb063798"], # B26 文生视频通用 "CAP-a95da8d7": ["CAP-523c8623"], # ── Round 1(已执行完毕,保留以保障脚本幂等)─────── # A1 抠图 "CAP-12d2aa10": ["CAP-b9c2cafc", "CAP-2a40d757", "CAP-88816f1e", "CAP-2ba2bc19", "CAP-f0137cfa", "CAP-ccb3a2fd", "CAP-ad3fd294"], # A2 多图拼贴(后处理) "CAP-41ac8100": ["CAP-17732b2b", "CAP-409eefd9", "CAP-6f791b59", "CAP-bdc6c7c8", "CAP-c009dcce", "CAP-e7467ebf", "CAP-462897e2", "CAP-a815960c", "CAP-f3d22954"], # A2b 大图切割 "CAP-fddd3349": ["CAP-3cf6ab47", "CAP-336fe318"], # A3 鱼眼/广角畸变 "CAP-0e3d61ca": ["CAP-71de6ed3", "CAP-4e9c99fa", "CAP-d49daa5b"], # A4 选择性着色 "CAP-3178172e": ["CAP-1dcb6702", "CAP-d18b8a24"], # A5 丁达尔/体积光 "CAP-3086677b": ["CAP-c6ee82db", "CAP-f07aa0df", "CAP-81f097e1"], # A6 胶片光学 "CAP-6c14041c": ["CAP-2c85b37d", "CAP-8ce1a9c6", "CAP-c9b3c7a5"], # A7 摄影参数 "CAP-ef0a4c0c": ["CAP-17ffe5ea", "CAP-84d68e1b", "CAP-074db966"], # A8 戏剧明暗/Chiaroscuro "CAP-1649b549": ["CAP-e06eeb84", "CAP-e707003d", "CAP-aa25bdf9"], # A9 唇形同步 "CAP-98490894": ["CAP-6fe5aecb", "CAP-89b9875b", "CAP-ad6d8e0d"], # A10 虚拟试衣/服装迁移 "CAP-d92ffc99": ["CAP-1fdb00c2", "CAP-35974014", "CAP-b56d72c7", "CAP-f697cb22"], # A11 AI 配乐 "CAP-5f9644fb": ["CAP-9fe2869e"], # A12 角色多视图(prompt 驱动) "CAP-5342ad19": ["CAP-2aee7861", "CAP-6566ec42", "CAP-faa53945", "CAP-ea7f3b27", "CAP-9edfec88"], # A12b 参数化多视角(ControlNet/坐标) "CAP-ee7df476": ["CAP-939c3610", "CAP-1374aa64", "CAP-3758e4a5"], # A13 逆光/轮廓光 # CAP-47151d87 单独保留(VCAP 会在最后删除) # A14 冷暖对比 "CAP-a185d6d2": ["CAP-6994f914", "CAP-a6d5ef60"], # A15 双重曝光 "CAP-19e5402a": ["CAP-7841b44d", "CAP-6a7ebaa3"], # A16 对话气泡 "CAP-fc2bd5cf": ["CAP-fd64812b"], # A17 文字描边/阴影 "CAP-16c5174b": ["CAP-c13f0764", "CAP-e67c259b"], # A18 LLM 提示词扩写 "CAP-4d8ba002": ["CAP-dd8c832f", "CAP-cc8d042f", "CAP-eeb71b76"], # A18b LLM 故事脚本 "CAP-da51c2ec": ["CAP-da6ef3eb", "CAP-88752108", "CAP-2880810c", "CAP-3313a654", "CAP-34567777"], # A19 360 全景 "CAP-1b3e966f": ["CAP-fd4786c5"], # A20 信息层级排版 "CAP-067edd94": ["CAP-c321e41d", "CAP-bb78b124", "CAP-5753e9dd"], # A21 卡片版式 "CAP-6e77db54": ["CAP-33a038e1"], # A22 单色调锁定(prompt) "CAP-2bd87e28": ["CAP-8dfe77e3", "CAP-7f123558", "CAP-a3985fda"], # A22b 后处理色调映射 "CAP-79590b09": ["CAP-39e1bfda", "CAP-eccd1ce8", "CAP-f08eb0eb", "CAP-a0e2c93a", "CAP-dd9e6d34"], # A23 低饱和/去色 "CAP-298dcb55": ["CAP-a5fb0745"], # A24 高饱和多色并置 "CAP-2de278d6": ["CAP-3ba4cb6e", "CAP-76b117fc", "CAP-252b422a"], # A26 极端视角 "CAP-0c30af82": ["CAP-fecf1f7d", "CAP-a0800d7d"], # A28 手部修复 "CAP-0ba3159e": ["CAP-db60d72e"], # A29 皮肤真实感 "CAP-3b0de1ce": ["CAP-ad971785", "CAP-b36560ff"], # A30 AI 一次生成多格图 "CAP-306c15fe": ["CAP-033b21b5", "CAP-ae5276f1", "CAP-611ac7bd", "CAP-e5fbbe2a", "CAP-8e4dbefa", "CAP-a1e8fd4e", "CAP-56175058", "CAP-5bb77d65", "CAP-e861c293", "CAP-07a8b9a2"], # A31 多格一致性 "CAP-e9b763d2": ["CAP-ddb5e870", "CAP-e1e9a807", "CAP-08b34a6b", "CAP-020d2c05", "CAP-6fa6bd25", "CAP-38d84d76"], # A32 多情绪表情矩阵 "CAP-5a1ac59d": ["CAP-92f70ebf", "CAP-164ecc20"], # A33 配色方案推荐 "CAP-689bac61": ["CAP-4bd95998", "CAP-86887ed5", "CAP-d6954059"], # A38 轮播分页 "CAP-20409fa6": ["CAP-716dd15a", "CAP-e4a5c708", "CAP-a9d3293f"], } # VCAP 占位 + 空描述(单独删除) VCAP_AND_EMPTY = [ "CAP-47310932", "CAP-tao_dev_1-00-01", "CAP-tao_dev_1-00-02", "CAP-tao_dev_1-00-03", "CAP-tao_dev_1-00-04", "CAP-tao_dev_1-01-01", "CAP-tao_dev_1-01-02", "CAP-tao_dev_1-01-03", "CAP-tao_dev_1-02-01", "CAP-tao_dev_1-02-02", "CAP-tao_dev_1-02-03", "CAP-tao_dev_1-03-01", "CAP-tao_dev_1-03-02", "CAP-tao_dev_1-03-03", "CAP-tao_dev_1-03-04", "CAP-tao_dev_1-03-05", "CAP-tao_dev_1-03-06", "CAP-tao_dev_1-04-01", "CAP-tao_dev_1-04-02", "CAP-tao_dev_1-04-03", "CAP-tao_dev_1-04-04", "CAP-tao_dev_1-04-05", ] JUNCTIONS = [ ("requirement_capability", "capability_id"), ("capability_tool", "capability_id"), ("capability_knowledge", "capability_id"), ("capability_resource", "capability_id"), ("strategy_capability", "capability_id"), ] def sanity_check(cur): """校验所有 canonical/member 都真实存在于 DB。""" all_ids = set(MERGE_CLUSTERS.keys()) for members in MERGE_CLUSTERS.values(): all_ids.update(members) all_ids.update(VCAP_AND_EMPTY) cur.execute("SELECT id FROM capability WHERE id = ANY(%s)", (list(all_ids),)) existing = {r['id'] for r in cur.fetchall()} missing = all_ids - existing if missing: print(f"⚠️ 以下 ID 在 DB 中不存在(会跳过):{sorted(missing)}", flush=True) return existing def merge_one_cluster(cur, canonical, members, existing, dry_run): """把 members 合并到 canonical。""" members = [m for m in members if m in existing] if not members: print(f" [skip] {canonical}: 所有 members 都已不存在", flush=True) return 0, 0, 0 junction_updates = 0 junction_dups_removed = 0 # 逐个 member 处理 —— 避免 members 之间互相冲突(都指向同一 other_id) for member in members: for table, col in JUNCTIONS: other_col = { "requirement_capability": "requirement_id", "capability_tool": "tool_id", "capability_knowledge": "knowledge_id", "capability_resource": "resource_id", "strategy_capability": "strategy_id", }[table] # 删除会和 canonical 当前占用冲突的 member 引用 del_sql = ( f"DELETE FROM {table} WHERE {col} = %s " f"AND {other_col} IN (" f" SELECT {other_col} FROM {table} WHERE {col} = %s" f")" ) if dry_run: cur.execute( f"SELECT COUNT(*) AS c FROM {table} WHERE {col} = %s " f"AND {other_col} IN (" f" SELECT {other_col} FROM {table} WHERE {col} = %s)", (member, canonical), ) dup_n = cur.fetchone()['c'] else: cur.execute(del_sql, (member, canonical)) dup_n = cur.rowcount or 0 junction_dups_removed += dup_n # UPDATE 剩余 member 引用 → canonical upd_sql = f"UPDATE {table} SET {col} = %s WHERE {col} = %s" if dry_run: cur.execute( f"SELECT COUNT(*) AS c FROM {table} WHERE {col} = %s", (member,), ) upd_n = cur.fetchone()['c'] else: cur.execute(upd_sql, (canonical, member)) upd_n = cur.rowcount or 0 junction_updates += upd_n # 删除 member 行 del_caps_sql = "DELETE FROM capability WHERE id = ANY(%s)" if dry_run: n_del = len(members) else: cur.execute(del_caps_sql, (members,)) n_del = cur.rowcount or 0 print(f" [{'DRY' if dry_run else 'OK '}] {canonical} ← {len(members)} members | " f"junction upd={junction_updates}, dup_removed={junction_dups_removed}, " f"cap_deleted={n_del}", flush=True) return junction_updates, junction_dups_removed, n_del def delete_vcap(cur, ids_to_delete, existing, dry_run): ids = [i for i in ids_to_delete if i in existing] if not ids: print(" [skip] VCAP: 都已不存在", flush=True) return 0 total_j = 0 for table, col in JUNCTIONS: if dry_run: cur.execute( f"SELECT COUNT(*) AS c FROM {table} WHERE {col} = ANY(%s)", (ids,) ) n = cur.fetchone()['c'] else: cur.execute(f"DELETE FROM {table} WHERE {col} = ANY(%s)", (ids,)) n = cur.rowcount or 0 total_j += n if dry_run: n_cap = len(ids) else: cur.execute("DELETE FROM capability WHERE id = ANY(%s)", (ids,)) n_cap = cur.rowcount or 0 print(f" [{'DRY' if dry_run else 'OK '}] VCAP delete: " f"{len(ids)} caps, junction_deleted={total_j}, cap_deleted={n_cap}", flush=True) return n_cap def main(): ap = argparse.ArgumentParser() g = ap.add_mutually_exclusive_group(required=True) g.add_argument("--dry-run", action="store_true") g.add_argument("--execute", action="store_true") args = ap.parse_args() dry = args.dry_run print(f"\n{'='*60}\n{'DRY RUN' if dry else 'EXECUTE'} — capability 合并\n" f"{'='*60}", flush=True) store = PostgreSQLCapabilityStore() cur = store._get_cursor() try: # 前置统计 cur.execute("SELECT COUNT(*) AS c FROM capability WHERE version='tao_dev_1'") before = cur.fetchone()['c'] print(f"📊 合并前 tao_dev_1 capability 数: {before}\n", flush=True) existing = sanity_check(cur) print(f"📋 待合并簇: {len(MERGE_CLUSTERS)},VCAP 待删: " f"{len(VCAP_AND_EMPTY)}\n", flush=True) print("── 阶段 1: 簇内合并 ─────────────", flush=True) tot_upd = tot_dup = tot_del = 0 for idx, (canonical, members) in enumerate(MERGE_CLUSTERS.items(), 1): print(f"\n[{idx}/{len(MERGE_CLUSTERS)}] canonical={canonical}", flush=True) u, d, dl = merge_one_cluster(cur, canonical, members, existing, dry) tot_upd += u tot_dup += d tot_del += dl print("\n── 阶段 2: 删除 VCAP + 空数据 ─────", flush=True) n_vcap = delete_vcap(cur, VCAP_AND_EMPTY, existing, dry) # 事后统计 cur.execute("SELECT COUNT(*) AS c FROM capability WHERE version='tao_dev_1'") after = cur.fetchone()['c'] print(f"\n{'='*60}\n汇总:", flush=True) print(f" junction UPDATE: {tot_upd}", flush=True) print(f" junction 去重 DELETE: {tot_dup}", flush=True) print(f" capability 簇合并删除: {tot_del}", flush=True) print(f" VCAP 删除: {n_vcap}", flush=True) print(f" 合并前: {before} 合并{'后(预测)' if dry else '后'}: {after}", flush=True) if dry: print(f" 预期合并后: {before - tot_del - n_vcap}", flush=True) print(f"{'='*60}\n", flush=True) finally: cur.close() store.close() if __name__ == "__main__": main()