| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372 |
- #!/usr/bin/env python3
- """
- 合并 tao_dev_1 版本内的重复 capability,并删除 VCAP 占位条目。
- 策略:
- 1. 对每个 (canonical, members) 簇:
- 对 5 个 junction (requirement_capability / capability_tool /
- capability_knowledge / capability_resource / strategy_capability)
- - 先删除会和 canonical 冲突的 member 引用(避免 PK 冲突)
- - 再 UPDATE member → canonical
- 2. 删除被合并的 member 行 + VCAP 占位行
- 3. 全程打印进度(flush=True),不管道给 tail
- 用法:
- python knowhub/scripts/merge_capabilities.py --dry-run # 只打印计划
- python knowhub/scripts/merge_capabilities.py --execute # 真正执行
- """
- import argparse
- import sys
- from pathlib import Path
- sys.path.insert(0, str(Path(__file__).parent.parent.parent))
- from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
- # canonical_id -> [member_ids to merge into canonical]
- MERGE_CLUSTERS = {
- # ── Round C (跨版本:v0 foundation 吸收 tao_dev_1 同义条) ────
- # C1 动画帧/文生视频
- "CAP-009": ["CAP-a95da8d7"],
- # C2 图像上色
- "CAP-007": ["CAP-75ba0eac"],
- # C3 图像细节增强与高清放大
- "CAP-006": ["CAP-767e7848"],
- # C4 草图/3D → 效果图
- "CAP-019": ["CAP-c7c8479a"],
- # C5 图像内文字翻译/重写替换
- "CAP-021": ["CAP-44ada771"],
- # C6 主体/角色一致性(borderline 合并)
- "CAP-003": ["CAP-1c71b52e"],
- # ── Round 2 ──────────────────────────────────────────
- # B1 室内多光源
- "CAP-5fb6dd66": ["CAP-aa75f198", "CAP-4da21eb2"],
- # B2 室内色调
- "CAP-3bf86ae3": ["CAP-20e2e0e6"],
- # B3 室内自然材质
- "CAP-56368e3a": ["CAP-e8b46ca2"],
- # B4 电影级光照
- "CAP-aaaef688": ["CAP-9658a2d9"],
- # B5 虚拟相机运动
- "CAP-49175b92": ["CAP-943f7709"],
- # B6 结构化 Prompt 框架
- "CAP-5b000814": ["CAP-f0d52dd1", "CAP-28983111"],
- # B7 多景别 prompt
- "CAP-4f15a85f": ["CAP-1da1bf7a", "CAP-0e64f839"],
- # B8 微距/极端特写
- "CAP-26100ea8": ["CAP-b93f5cf9", "CAP-2382899f"],
- # B9 图文卡片自动排版
- "CAP-87ba3b7d": ["CAP-25c0eef3", "CAP-a648fb4a"],
- # B10 海报多元素排版
- "CAP-562d91c1": ["CAP-2df55478", "CAP-cee93f12", "CAP-dba33b2d",
- "CAP-02b573f4"],
- # B11 空间透视景深
- "CAP-3c49ff0a": ["CAP-edb949a3", "CAP-ee3bac1d", "CAP-19e77a40",
- "CAP-eb5e0ad2"],
- # B12 产品摄影光照
- "CAP-c2c42fc7": ["CAP-be42acba"],
- # B13 材质质感
- "CAP-0dc2a15b": ["CAP-7f63a4f5"],
- # B14 霓虹/光晕/发光
- "CAP-a35e7966": ["CAP-ec490469", "CAP-ef8c3e8b", "CAP-9dea4396"],
- # B15 数据图表生成
- "CAP-3ee6c232": ["CAP-84a60ca0", "CAP-d02af2bd"],
- # B16 复古印刷质感
- "CAP-9359b49f": ["CAP-59414657", "CAP-718daa01"],
- # B17 超现实空间
- "CAP-1f898cd9": ["CAP-d1a4709c"],
- # B18 文字透视融合
- "CAP-bd4828fc": ["CAP-f3d42bb0"],
- # B19 拟人化角色
- "CAP-e962c3ef": ["CAP-93b35d43", "CAP-5c7abd0c"],
- # B20 镜面水面反射
- "CAP-3b51102e": ["CAP-bb547523"],
- # B21 粒子光效
- "CAP-8467736a": ["CAP-a6c6d4fc"],
- # B22 自然光时刻
- "CAP-e8a77f70": ["CAP-7f2d1a72"],
- # B23 经典布光模式
- "CAP-ed4b506e": ["CAP-08c54a3c"],
- # B24 几何蒙版裁剪
- "CAP-47d6893f": ["CAP-7889940a"],
- # B25 批量系列海报
- "CAP-832e80ac": ["CAP-bb063798"],
- # B26 文生视频通用
- "CAP-a95da8d7": ["CAP-523c8623"],
- # ── Round 1(已执行完毕,保留以保障脚本幂等)───────
- # A1 抠图
- "CAP-12d2aa10": ["CAP-b9c2cafc", "CAP-2a40d757", "CAP-88816f1e",
- "CAP-2ba2bc19", "CAP-f0137cfa", "CAP-ccb3a2fd",
- "CAP-ad3fd294"],
- # A2 多图拼贴(后处理)
- "CAP-41ac8100": ["CAP-17732b2b", "CAP-409eefd9", "CAP-6f791b59",
- "CAP-bdc6c7c8", "CAP-c009dcce", "CAP-e7467ebf",
- "CAP-462897e2", "CAP-a815960c", "CAP-f3d22954"],
- # A2b 大图切割
- "CAP-fddd3349": ["CAP-3cf6ab47", "CAP-336fe318"],
- # A3 鱼眼/广角畸变
- "CAP-0e3d61ca": ["CAP-71de6ed3", "CAP-4e9c99fa", "CAP-d49daa5b"],
- # A4 选择性着色
- "CAP-3178172e": ["CAP-1dcb6702", "CAP-d18b8a24"],
- # A5 丁达尔/体积光
- "CAP-3086677b": ["CAP-c6ee82db", "CAP-f07aa0df", "CAP-81f097e1"],
- # A6 胶片光学
- "CAP-6c14041c": ["CAP-2c85b37d", "CAP-8ce1a9c6", "CAP-c9b3c7a5"],
- # A7 摄影参数
- "CAP-ef0a4c0c": ["CAP-17ffe5ea", "CAP-84d68e1b", "CAP-074db966"],
- # A8 戏剧明暗/Chiaroscuro
- "CAP-1649b549": ["CAP-e06eeb84", "CAP-e707003d", "CAP-aa25bdf9"],
- # A9 唇形同步
- "CAP-98490894": ["CAP-6fe5aecb", "CAP-89b9875b", "CAP-ad6d8e0d"],
- # A10 虚拟试衣/服装迁移
- "CAP-d92ffc99": ["CAP-1fdb00c2", "CAP-35974014", "CAP-b56d72c7",
- "CAP-f697cb22"],
- # A11 AI 配乐
- "CAP-5f9644fb": ["CAP-9fe2869e"],
- # A12 角色多视图(prompt 驱动)
- "CAP-5342ad19": ["CAP-2aee7861", "CAP-6566ec42", "CAP-faa53945",
- "CAP-ea7f3b27", "CAP-9edfec88"],
- # A12b 参数化多视角(ControlNet/坐标)
- "CAP-ee7df476": ["CAP-939c3610", "CAP-1374aa64", "CAP-3758e4a5"],
- # A13 逆光/轮廓光
- # CAP-47151d87 单独保留(VCAP 会在最后删除)
- # A14 冷暖对比
- "CAP-a185d6d2": ["CAP-6994f914", "CAP-a6d5ef60"],
- # A15 双重曝光
- "CAP-19e5402a": ["CAP-7841b44d", "CAP-6a7ebaa3"],
- # A16 对话气泡
- "CAP-fc2bd5cf": ["CAP-fd64812b"],
- # A17 文字描边/阴影
- "CAP-16c5174b": ["CAP-c13f0764", "CAP-e67c259b"],
- # A18 LLM 提示词扩写
- "CAP-4d8ba002": ["CAP-dd8c832f", "CAP-cc8d042f", "CAP-eeb71b76"],
- # A18b LLM 故事脚本
- "CAP-da51c2ec": ["CAP-da6ef3eb", "CAP-88752108", "CAP-2880810c",
- "CAP-3313a654", "CAP-34567777"],
- # A19 360 全景
- "CAP-1b3e966f": ["CAP-fd4786c5"],
- # A20 信息层级排版
- "CAP-067edd94": ["CAP-c321e41d", "CAP-bb78b124", "CAP-5753e9dd"],
- # A21 卡片版式
- "CAP-6e77db54": ["CAP-33a038e1"],
- # A22 单色调锁定(prompt)
- "CAP-2bd87e28": ["CAP-8dfe77e3", "CAP-7f123558", "CAP-a3985fda"],
- # A22b 后处理色调映射
- "CAP-79590b09": ["CAP-39e1bfda", "CAP-eccd1ce8", "CAP-f08eb0eb",
- "CAP-a0e2c93a", "CAP-dd9e6d34"],
- # A23 低饱和/去色
- "CAP-298dcb55": ["CAP-a5fb0745"],
- # A24 高饱和多色并置
- "CAP-2de278d6": ["CAP-3ba4cb6e", "CAP-76b117fc", "CAP-252b422a"],
- # A26 极端视角
- "CAP-0c30af82": ["CAP-fecf1f7d", "CAP-a0800d7d"],
- # A28 手部修复
- "CAP-0ba3159e": ["CAP-db60d72e"],
- # A29 皮肤真实感
- "CAP-3b0de1ce": ["CAP-ad971785", "CAP-b36560ff"],
- # A30 AI 一次生成多格图
- "CAP-306c15fe": ["CAP-033b21b5", "CAP-ae5276f1", "CAP-611ac7bd",
- "CAP-e5fbbe2a", "CAP-8e4dbefa", "CAP-a1e8fd4e",
- "CAP-56175058", "CAP-5bb77d65", "CAP-e861c293",
- "CAP-07a8b9a2"],
- # A31 多格一致性
- "CAP-e9b763d2": ["CAP-ddb5e870", "CAP-e1e9a807", "CAP-08b34a6b",
- "CAP-020d2c05", "CAP-6fa6bd25", "CAP-38d84d76"],
- # A32 多情绪表情矩阵
- "CAP-5a1ac59d": ["CAP-92f70ebf", "CAP-164ecc20"],
- # A33 配色方案推荐
- "CAP-689bac61": ["CAP-4bd95998", "CAP-86887ed5", "CAP-d6954059"],
- # A38 轮播分页
- "CAP-20409fa6": ["CAP-716dd15a", "CAP-e4a5c708", "CAP-a9d3293f"],
- }
- # VCAP 占位 + 空描述(单独删除)
- VCAP_AND_EMPTY = [
- "CAP-47310932",
- "CAP-tao_dev_1-00-01", "CAP-tao_dev_1-00-02", "CAP-tao_dev_1-00-03",
- "CAP-tao_dev_1-00-04", "CAP-tao_dev_1-01-01", "CAP-tao_dev_1-01-02",
- "CAP-tao_dev_1-01-03", "CAP-tao_dev_1-02-01", "CAP-tao_dev_1-02-02",
- "CAP-tao_dev_1-02-03", "CAP-tao_dev_1-03-01", "CAP-tao_dev_1-03-02",
- "CAP-tao_dev_1-03-03", "CAP-tao_dev_1-03-04", "CAP-tao_dev_1-03-05",
- "CAP-tao_dev_1-03-06", "CAP-tao_dev_1-04-01", "CAP-tao_dev_1-04-02",
- "CAP-tao_dev_1-04-03", "CAP-tao_dev_1-04-04", "CAP-tao_dev_1-04-05",
- ]
- JUNCTIONS = [
- ("requirement_capability", "capability_id"),
- ("capability_tool", "capability_id"),
- ("capability_knowledge", "capability_id"),
- ("capability_resource", "capability_id"),
- ("strategy_capability", "capability_id"),
- ]
- def sanity_check(cur):
- """校验所有 canonical/member 都真实存在于 DB。"""
- all_ids = set(MERGE_CLUSTERS.keys())
- for members in MERGE_CLUSTERS.values():
- all_ids.update(members)
- all_ids.update(VCAP_AND_EMPTY)
- cur.execute("SELECT id FROM capability WHERE id = ANY(%s)", (list(all_ids),))
- existing = {r['id'] for r in cur.fetchall()}
- missing = all_ids - existing
- if missing:
- print(f"⚠️ 以下 ID 在 DB 中不存在(会跳过):{sorted(missing)}", flush=True)
- return existing
- def merge_one_cluster(cur, canonical, members, existing, dry_run):
- """把 members 合并到 canonical。"""
- members = [m for m in members if m in existing]
- if not members:
- print(f" [skip] {canonical}: 所有 members 都已不存在", flush=True)
- return 0, 0, 0
- junction_updates = 0
- junction_dups_removed = 0
- # 逐个 member 处理 —— 避免 members 之间互相冲突(都指向同一 other_id)
- for member in members:
- for table, col in JUNCTIONS:
- other_col = {
- "requirement_capability": "requirement_id",
- "capability_tool": "tool_id",
- "capability_knowledge": "knowledge_id",
- "capability_resource": "resource_id",
- "strategy_capability": "strategy_id",
- }[table]
- # 删除会和 canonical 当前占用冲突的 member 引用
- del_sql = (
- f"DELETE FROM {table} WHERE {col} = %s "
- f"AND {other_col} IN ("
- f" SELECT {other_col} FROM {table} WHERE {col} = %s"
- f")"
- )
- if dry_run:
- cur.execute(
- f"SELECT COUNT(*) AS c FROM {table} WHERE {col} = %s "
- f"AND {other_col} IN ("
- f" SELECT {other_col} FROM {table} WHERE {col} = %s)",
- (member, canonical),
- )
- dup_n = cur.fetchone()['c']
- else:
- cur.execute(del_sql, (member, canonical))
- dup_n = cur.rowcount or 0
- junction_dups_removed += dup_n
- # UPDATE 剩余 member 引用 → canonical
- upd_sql = f"UPDATE {table} SET {col} = %s WHERE {col} = %s"
- if dry_run:
- cur.execute(
- f"SELECT COUNT(*) AS c FROM {table} WHERE {col} = %s",
- (member,),
- )
- upd_n = cur.fetchone()['c']
- else:
- cur.execute(upd_sql, (canonical, member))
- upd_n = cur.rowcount or 0
- junction_updates += upd_n
- # 删除 member 行
- del_caps_sql = "DELETE FROM capability WHERE id = ANY(%s)"
- if dry_run:
- n_del = len(members)
- else:
- cur.execute(del_caps_sql, (members,))
- n_del = cur.rowcount or 0
- print(f" [{'DRY' if dry_run else 'OK '}] {canonical} ← {len(members)} members | "
- f"junction upd={junction_updates}, dup_removed={junction_dups_removed}, "
- f"cap_deleted={n_del}",
- flush=True)
- return junction_updates, junction_dups_removed, n_del
- def delete_vcap(cur, ids_to_delete, existing, dry_run):
- ids = [i for i in ids_to_delete if i in existing]
- if not ids:
- print(" [skip] VCAP: 都已不存在", flush=True)
- return 0
- total_j = 0
- for table, col in JUNCTIONS:
- if dry_run:
- cur.execute(
- f"SELECT COUNT(*) AS c FROM {table} WHERE {col} = ANY(%s)", (ids,)
- )
- n = cur.fetchone()['c']
- else:
- cur.execute(f"DELETE FROM {table} WHERE {col} = ANY(%s)", (ids,))
- n = cur.rowcount or 0
- total_j += n
- if dry_run:
- n_cap = len(ids)
- else:
- cur.execute("DELETE FROM capability WHERE id = ANY(%s)", (ids,))
- n_cap = cur.rowcount or 0
- print(f" [{'DRY' if dry_run else 'OK '}] VCAP delete: "
- f"{len(ids)} caps, junction_deleted={total_j}, cap_deleted={n_cap}",
- flush=True)
- return n_cap
- def main():
- ap = argparse.ArgumentParser()
- g = ap.add_mutually_exclusive_group(required=True)
- g.add_argument("--dry-run", action="store_true")
- g.add_argument("--execute", action="store_true")
- args = ap.parse_args()
- dry = args.dry_run
- print(f"\n{'='*60}\n{'DRY RUN' if dry else 'EXECUTE'} — capability 合并\n"
- f"{'='*60}", flush=True)
- store = PostgreSQLCapabilityStore()
- cur = store._get_cursor()
- try:
- # 前置统计
- cur.execute("SELECT COUNT(*) AS c FROM capability WHERE version='tao_dev_1'")
- before = cur.fetchone()['c']
- print(f"📊 合并前 tao_dev_1 capability 数: {before}\n", flush=True)
- existing = sanity_check(cur)
- print(f"📋 待合并簇: {len(MERGE_CLUSTERS)},VCAP 待删: "
- f"{len(VCAP_AND_EMPTY)}\n", flush=True)
- print("── 阶段 1: 簇内合并 ─────────────", flush=True)
- tot_upd = tot_dup = tot_del = 0
- for idx, (canonical, members) in enumerate(MERGE_CLUSTERS.items(), 1):
- print(f"\n[{idx}/{len(MERGE_CLUSTERS)}] canonical={canonical}", flush=True)
- u, d, dl = merge_one_cluster(cur, canonical, members, existing, dry)
- tot_upd += u
- tot_dup += d
- tot_del += dl
- print("\n── 阶段 2: 删除 VCAP + 空数据 ─────", flush=True)
- n_vcap = delete_vcap(cur, VCAP_AND_EMPTY, existing, dry)
- # 事后统计
- cur.execute("SELECT COUNT(*) AS c FROM capability WHERE version='tao_dev_1'")
- after = cur.fetchone()['c']
- print(f"\n{'='*60}\n汇总:", flush=True)
- print(f" junction UPDATE: {tot_upd}", flush=True)
- print(f" junction 去重 DELETE: {tot_dup}", flush=True)
- print(f" capability 簇合并删除: {tot_del}", flush=True)
- print(f" VCAP 删除: {n_vcap}", flush=True)
- print(f" 合并前: {before} 合并{'后(预测)' if dry else '后'}: {after}",
- flush=True)
- if dry:
- print(f" 预期合并后: {before - tot_del - n_vcap}", flush=True)
- print(f"{'='*60}\n", flush=True)
- finally:
- cur.close()
- store.close()
- if __name__ == "__main__":
- main()
|