merge_capabilities.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. #!/usr/bin/env python3
  2. """
  3. 合并 tao_dev_1 版本内的重复 capability,并删除 VCAP 占位条目。
  4. 策略:
  5. 1. 对每个 (canonical, members) 簇:
  6. 对 5 个 junction (requirement_capability / capability_tool /
  7. capability_knowledge / capability_resource / strategy_capability)
  8. - 先删除会和 canonical 冲突的 member 引用(避免 PK 冲突)
  9. - 再 UPDATE member → canonical
  10. 2. 删除被合并的 member 行 + VCAP 占位行
  11. 3. 全程打印进度(flush=True),不管道给 tail
  12. 用法:
  13. python knowhub/scripts/merge_capabilities.py --dry-run # 只打印计划
  14. python knowhub/scripts/merge_capabilities.py --execute # 真正执行
  15. """
  16. import argparse
  17. import sys
  18. from pathlib import Path
  19. sys.path.insert(0, str(Path(__file__).parent.parent.parent))
  20. from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
  21. # canonical_id -> [member_ids to merge into canonical]
  22. MERGE_CLUSTERS = {
  23. # ── Round C (跨版本:v0 foundation 吸收 tao_dev_1 同义条) ────
  24. # C1 动画帧/文生视频
  25. "CAP-009": ["CAP-a95da8d7"],
  26. # C2 图像上色
  27. "CAP-007": ["CAP-75ba0eac"],
  28. # C3 图像细节增强与高清放大
  29. "CAP-006": ["CAP-767e7848"],
  30. # C4 草图/3D → 效果图
  31. "CAP-019": ["CAP-c7c8479a"],
  32. # C5 图像内文字翻译/重写替换
  33. "CAP-021": ["CAP-44ada771"],
  34. # C6 主体/角色一致性(borderline 合并)
  35. "CAP-003": ["CAP-1c71b52e"],
  36. # ── Round 2 ──────────────────────────────────────────
  37. # B1 室内多光源
  38. "CAP-5fb6dd66": ["CAP-aa75f198", "CAP-4da21eb2"],
  39. # B2 室内色调
  40. "CAP-3bf86ae3": ["CAP-20e2e0e6"],
  41. # B3 室内自然材质
  42. "CAP-56368e3a": ["CAP-e8b46ca2"],
  43. # B4 电影级光照
  44. "CAP-aaaef688": ["CAP-9658a2d9"],
  45. # B5 虚拟相机运动
  46. "CAP-49175b92": ["CAP-943f7709"],
  47. # B6 结构化 Prompt 框架
  48. "CAP-5b000814": ["CAP-f0d52dd1", "CAP-28983111"],
  49. # B7 多景别 prompt
  50. "CAP-4f15a85f": ["CAP-1da1bf7a", "CAP-0e64f839"],
  51. # B8 微距/极端特写
  52. "CAP-26100ea8": ["CAP-b93f5cf9", "CAP-2382899f"],
  53. # B9 图文卡片自动排版
  54. "CAP-87ba3b7d": ["CAP-25c0eef3", "CAP-a648fb4a"],
  55. # B10 海报多元素排版
  56. "CAP-562d91c1": ["CAP-2df55478", "CAP-cee93f12", "CAP-dba33b2d",
  57. "CAP-02b573f4"],
  58. # B11 空间透视景深
  59. "CAP-3c49ff0a": ["CAP-edb949a3", "CAP-ee3bac1d", "CAP-19e77a40",
  60. "CAP-eb5e0ad2"],
  61. # B12 产品摄影光照
  62. "CAP-c2c42fc7": ["CAP-be42acba"],
  63. # B13 材质质感
  64. "CAP-0dc2a15b": ["CAP-7f63a4f5"],
  65. # B14 霓虹/光晕/发光
  66. "CAP-a35e7966": ["CAP-ec490469", "CAP-ef8c3e8b", "CAP-9dea4396"],
  67. # B15 数据图表生成
  68. "CAP-3ee6c232": ["CAP-84a60ca0", "CAP-d02af2bd"],
  69. # B16 复古印刷质感
  70. "CAP-9359b49f": ["CAP-59414657", "CAP-718daa01"],
  71. # B17 超现实空间
  72. "CAP-1f898cd9": ["CAP-d1a4709c"],
  73. # B18 文字透视融合
  74. "CAP-bd4828fc": ["CAP-f3d42bb0"],
  75. # B19 拟人化角色
  76. "CAP-e962c3ef": ["CAP-93b35d43", "CAP-5c7abd0c"],
  77. # B20 镜面水面反射
  78. "CAP-3b51102e": ["CAP-bb547523"],
  79. # B21 粒子光效
  80. "CAP-8467736a": ["CAP-a6c6d4fc"],
  81. # B22 自然光时刻
  82. "CAP-e8a77f70": ["CAP-7f2d1a72"],
  83. # B23 经典布光模式
  84. "CAP-ed4b506e": ["CAP-08c54a3c"],
  85. # B24 几何蒙版裁剪
  86. "CAP-47d6893f": ["CAP-7889940a"],
  87. # B25 批量系列海报
  88. "CAP-832e80ac": ["CAP-bb063798"],
  89. # B26 文生视频通用
  90. "CAP-a95da8d7": ["CAP-523c8623"],
  91. # ── Round 1(已执行完毕,保留以保障脚本幂等)───────
  92. # A1 抠图
  93. "CAP-12d2aa10": ["CAP-b9c2cafc", "CAP-2a40d757", "CAP-88816f1e",
  94. "CAP-2ba2bc19", "CAP-f0137cfa", "CAP-ccb3a2fd",
  95. "CAP-ad3fd294"],
  96. # A2 多图拼贴(后处理)
  97. "CAP-41ac8100": ["CAP-17732b2b", "CAP-409eefd9", "CAP-6f791b59",
  98. "CAP-bdc6c7c8", "CAP-c009dcce", "CAP-e7467ebf",
  99. "CAP-462897e2", "CAP-a815960c", "CAP-f3d22954"],
  100. # A2b 大图切割
  101. "CAP-fddd3349": ["CAP-3cf6ab47", "CAP-336fe318"],
  102. # A3 鱼眼/广角畸变
  103. "CAP-0e3d61ca": ["CAP-71de6ed3", "CAP-4e9c99fa", "CAP-d49daa5b"],
  104. # A4 选择性着色
  105. "CAP-3178172e": ["CAP-1dcb6702", "CAP-d18b8a24"],
  106. # A5 丁达尔/体积光
  107. "CAP-3086677b": ["CAP-c6ee82db", "CAP-f07aa0df", "CAP-81f097e1"],
  108. # A6 胶片光学
  109. "CAP-6c14041c": ["CAP-2c85b37d", "CAP-8ce1a9c6", "CAP-c9b3c7a5"],
  110. # A7 摄影参数
  111. "CAP-ef0a4c0c": ["CAP-17ffe5ea", "CAP-84d68e1b", "CAP-074db966"],
  112. # A8 戏剧明暗/Chiaroscuro
  113. "CAP-1649b549": ["CAP-e06eeb84", "CAP-e707003d", "CAP-aa25bdf9"],
  114. # A9 唇形同步
  115. "CAP-98490894": ["CAP-6fe5aecb", "CAP-89b9875b", "CAP-ad6d8e0d"],
  116. # A10 虚拟试衣/服装迁移
  117. "CAP-d92ffc99": ["CAP-1fdb00c2", "CAP-35974014", "CAP-b56d72c7",
  118. "CAP-f697cb22"],
  119. # A11 AI 配乐
  120. "CAP-5f9644fb": ["CAP-9fe2869e"],
  121. # A12 角色多视图(prompt 驱动)
  122. "CAP-5342ad19": ["CAP-2aee7861", "CAP-6566ec42", "CAP-faa53945",
  123. "CAP-ea7f3b27", "CAP-9edfec88"],
  124. # A12b 参数化多视角(ControlNet/坐标)
  125. "CAP-ee7df476": ["CAP-939c3610", "CAP-1374aa64", "CAP-3758e4a5"],
  126. # A13 逆光/轮廓光
  127. # CAP-47151d87 单独保留(VCAP 会在最后删除)
  128. # A14 冷暖对比
  129. "CAP-a185d6d2": ["CAP-6994f914", "CAP-a6d5ef60"],
  130. # A15 双重曝光
  131. "CAP-19e5402a": ["CAP-7841b44d", "CAP-6a7ebaa3"],
  132. # A16 对话气泡
  133. "CAP-fc2bd5cf": ["CAP-fd64812b"],
  134. # A17 文字描边/阴影
  135. "CAP-16c5174b": ["CAP-c13f0764", "CAP-e67c259b"],
  136. # A18 LLM 提示词扩写
  137. "CAP-4d8ba002": ["CAP-dd8c832f", "CAP-cc8d042f", "CAP-eeb71b76"],
  138. # A18b LLM 故事脚本
  139. "CAP-da51c2ec": ["CAP-da6ef3eb", "CAP-88752108", "CAP-2880810c",
  140. "CAP-3313a654", "CAP-34567777"],
  141. # A19 360 全景
  142. "CAP-1b3e966f": ["CAP-fd4786c5"],
  143. # A20 信息层级排版
  144. "CAP-067edd94": ["CAP-c321e41d", "CAP-bb78b124", "CAP-5753e9dd"],
  145. # A21 卡片版式
  146. "CAP-6e77db54": ["CAP-33a038e1"],
  147. # A22 单色调锁定(prompt)
  148. "CAP-2bd87e28": ["CAP-8dfe77e3", "CAP-7f123558", "CAP-a3985fda"],
  149. # A22b 后处理色调映射
  150. "CAP-79590b09": ["CAP-39e1bfda", "CAP-eccd1ce8", "CAP-f08eb0eb",
  151. "CAP-a0e2c93a", "CAP-dd9e6d34"],
  152. # A23 低饱和/去色
  153. "CAP-298dcb55": ["CAP-a5fb0745"],
  154. # A24 高饱和多色并置
  155. "CAP-2de278d6": ["CAP-3ba4cb6e", "CAP-76b117fc", "CAP-252b422a"],
  156. # A26 极端视角
  157. "CAP-0c30af82": ["CAP-fecf1f7d", "CAP-a0800d7d"],
  158. # A28 手部修复
  159. "CAP-0ba3159e": ["CAP-db60d72e"],
  160. # A29 皮肤真实感
  161. "CAP-3b0de1ce": ["CAP-ad971785", "CAP-b36560ff"],
  162. # A30 AI 一次生成多格图
  163. "CAP-306c15fe": ["CAP-033b21b5", "CAP-ae5276f1", "CAP-611ac7bd",
  164. "CAP-e5fbbe2a", "CAP-8e4dbefa", "CAP-a1e8fd4e",
  165. "CAP-56175058", "CAP-5bb77d65", "CAP-e861c293",
  166. "CAP-07a8b9a2"],
  167. # A31 多格一致性
  168. "CAP-e9b763d2": ["CAP-ddb5e870", "CAP-e1e9a807", "CAP-08b34a6b",
  169. "CAP-020d2c05", "CAP-6fa6bd25", "CAP-38d84d76"],
  170. # A32 多情绪表情矩阵
  171. "CAP-5a1ac59d": ["CAP-92f70ebf", "CAP-164ecc20"],
  172. # A33 配色方案推荐
  173. "CAP-689bac61": ["CAP-4bd95998", "CAP-86887ed5", "CAP-d6954059"],
  174. # A38 轮播分页
  175. "CAP-20409fa6": ["CAP-716dd15a", "CAP-e4a5c708", "CAP-a9d3293f"],
  176. }
  177. # VCAP 占位 + 空描述(单独删除)
  178. VCAP_AND_EMPTY = [
  179. "CAP-47310932",
  180. "CAP-tao_dev_1-00-01", "CAP-tao_dev_1-00-02", "CAP-tao_dev_1-00-03",
  181. "CAP-tao_dev_1-00-04", "CAP-tao_dev_1-01-01", "CAP-tao_dev_1-01-02",
  182. "CAP-tao_dev_1-01-03", "CAP-tao_dev_1-02-01", "CAP-tao_dev_1-02-02",
  183. "CAP-tao_dev_1-02-03", "CAP-tao_dev_1-03-01", "CAP-tao_dev_1-03-02",
  184. "CAP-tao_dev_1-03-03", "CAP-tao_dev_1-03-04", "CAP-tao_dev_1-03-05",
  185. "CAP-tao_dev_1-03-06", "CAP-tao_dev_1-04-01", "CAP-tao_dev_1-04-02",
  186. "CAP-tao_dev_1-04-03", "CAP-tao_dev_1-04-04", "CAP-tao_dev_1-04-05",
  187. ]
  188. JUNCTIONS = [
  189. ("requirement_capability", "capability_id"),
  190. ("capability_tool", "capability_id"),
  191. ("capability_knowledge", "capability_id"),
  192. ("capability_resource", "capability_id"),
  193. ("strategy_capability", "capability_id"),
  194. ]
  195. def sanity_check(cur):
  196. """校验所有 canonical/member 都真实存在于 DB。"""
  197. all_ids = set(MERGE_CLUSTERS.keys())
  198. for members in MERGE_CLUSTERS.values():
  199. all_ids.update(members)
  200. all_ids.update(VCAP_AND_EMPTY)
  201. cur.execute("SELECT id FROM capability WHERE id = ANY(%s)", (list(all_ids),))
  202. existing = {r['id'] for r in cur.fetchall()}
  203. missing = all_ids - existing
  204. if missing:
  205. print(f"⚠️ 以下 ID 在 DB 中不存在(会跳过):{sorted(missing)}", flush=True)
  206. return existing
  207. def merge_one_cluster(cur, canonical, members, existing, dry_run):
  208. """把 members 合并到 canonical。"""
  209. members = [m for m in members if m in existing]
  210. if not members:
  211. print(f" [skip] {canonical}: 所有 members 都已不存在", flush=True)
  212. return 0, 0, 0
  213. junction_updates = 0
  214. junction_dups_removed = 0
  215. # 逐个 member 处理 —— 避免 members 之间互相冲突(都指向同一 other_id)
  216. for member in members:
  217. for table, col in JUNCTIONS:
  218. other_col = {
  219. "requirement_capability": "requirement_id",
  220. "capability_tool": "tool_id",
  221. "capability_knowledge": "knowledge_id",
  222. "capability_resource": "resource_id",
  223. "strategy_capability": "strategy_id",
  224. }[table]
  225. # 删除会和 canonical 当前占用冲突的 member 引用
  226. del_sql = (
  227. f"DELETE FROM {table} WHERE {col} = %s "
  228. f"AND {other_col} IN ("
  229. f" SELECT {other_col} FROM {table} WHERE {col} = %s"
  230. f")"
  231. )
  232. if dry_run:
  233. cur.execute(
  234. f"SELECT COUNT(*) AS c FROM {table} WHERE {col} = %s "
  235. f"AND {other_col} IN ("
  236. f" SELECT {other_col} FROM {table} WHERE {col} = %s)",
  237. (member, canonical),
  238. )
  239. dup_n = cur.fetchone()['c']
  240. else:
  241. cur.execute(del_sql, (member, canonical))
  242. dup_n = cur.rowcount or 0
  243. junction_dups_removed += dup_n
  244. # UPDATE 剩余 member 引用 → canonical
  245. upd_sql = f"UPDATE {table} SET {col} = %s WHERE {col} = %s"
  246. if dry_run:
  247. cur.execute(
  248. f"SELECT COUNT(*) AS c FROM {table} WHERE {col} = %s",
  249. (member,),
  250. )
  251. upd_n = cur.fetchone()['c']
  252. else:
  253. cur.execute(upd_sql, (canonical, member))
  254. upd_n = cur.rowcount or 0
  255. junction_updates += upd_n
  256. # 删除 member 行
  257. del_caps_sql = "DELETE FROM capability WHERE id = ANY(%s)"
  258. if dry_run:
  259. n_del = len(members)
  260. else:
  261. cur.execute(del_caps_sql, (members,))
  262. n_del = cur.rowcount or 0
  263. print(f" [{'DRY' if dry_run else 'OK '}] {canonical} ← {len(members)} members | "
  264. f"junction upd={junction_updates}, dup_removed={junction_dups_removed}, "
  265. f"cap_deleted={n_del}",
  266. flush=True)
  267. return junction_updates, junction_dups_removed, n_del
  268. def delete_vcap(cur, ids_to_delete, existing, dry_run):
  269. ids = [i for i in ids_to_delete if i in existing]
  270. if not ids:
  271. print(" [skip] VCAP: 都已不存在", flush=True)
  272. return 0
  273. total_j = 0
  274. for table, col in JUNCTIONS:
  275. if dry_run:
  276. cur.execute(
  277. f"SELECT COUNT(*) AS c FROM {table} WHERE {col} = ANY(%s)", (ids,)
  278. )
  279. n = cur.fetchone()['c']
  280. else:
  281. cur.execute(f"DELETE FROM {table} WHERE {col} = ANY(%s)", (ids,))
  282. n = cur.rowcount or 0
  283. total_j += n
  284. if dry_run:
  285. n_cap = len(ids)
  286. else:
  287. cur.execute("DELETE FROM capability WHERE id = ANY(%s)", (ids,))
  288. n_cap = cur.rowcount or 0
  289. print(f" [{'DRY' if dry_run else 'OK '}] VCAP delete: "
  290. f"{len(ids)} caps, junction_deleted={total_j}, cap_deleted={n_cap}",
  291. flush=True)
  292. return n_cap
  293. def main():
  294. ap = argparse.ArgumentParser()
  295. g = ap.add_mutually_exclusive_group(required=True)
  296. g.add_argument("--dry-run", action="store_true")
  297. g.add_argument("--execute", action="store_true")
  298. args = ap.parse_args()
  299. dry = args.dry_run
  300. print(f"\n{'='*60}\n{'DRY RUN' if dry else 'EXECUTE'} — capability 合并\n"
  301. f"{'='*60}", flush=True)
  302. store = PostgreSQLCapabilityStore()
  303. cur = store._get_cursor()
  304. try:
  305. # 前置统计
  306. cur.execute("SELECT COUNT(*) AS c FROM capability WHERE version='tao_dev_1'")
  307. before = cur.fetchone()['c']
  308. print(f"📊 合并前 tao_dev_1 capability 数: {before}\n", flush=True)
  309. existing = sanity_check(cur)
  310. print(f"📋 待合并簇: {len(MERGE_CLUSTERS)},VCAP 待删: "
  311. f"{len(VCAP_AND_EMPTY)}\n", flush=True)
  312. print("── 阶段 1: 簇内合并 ─────────────", flush=True)
  313. tot_upd = tot_dup = tot_del = 0
  314. for idx, (canonical, members) in enumerate(MERGE_CLUSTERS.items(), 1):
  315. print(f"\n[{idx}/{len(MERGE_CLUSTERS)}] canonical={canonical}", flush=True)
  316. u, d, dl = merge_one_cluster(cur, canonical, members, existing, dry)
  317. tot_upd += u
  318. tot_dup += d
  319. tot_del += dl
  320. print("\n── 阶段 2: 删除 VCAP + 空数据 ─────", flush=True)
  321. n_vcap = delete_vcap(cur, VCAP_AND_EMPTY, existing, dry)
  322. # 事后统计
  323. cur.execute("SELECT COUNT(*) AS c FROM capability WHERE version='tao_dev_1'")
  324. after = cur.fetchone()['c']
  325. print(f"\n{'='*60}\n汇总:", flush=True)
  326. print(f" junction UPDATE: {tot_upd}", flush=True)
  327. print(f" junction 去重 DELETE: {tot_dup}", flush=True)
  328. print(f" capability 簇合并删除: {tot_del}", flush=True)
  329. print(f" VCAP 删除: {n_vcap}", flush=True)
  330. print(f" 合并前: {before} 合并{'后(预测)' if dry else '后'}: {after}",
  331. flush=True)
  332. if dry:
  333. print(f" 预期合并后: {before - tot_del - n_vcap}", flush=True)
  334. print(f"{'='*60}\n", flush=True)
  335. finally:
  336. cur.close()
  337. store.close()
  338. if __name__ == "__main__":
  339. main()