dedup_howard_round4.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. #!/usr/bin/env python3
  2. """
  3. Round 4 cleanup: 合并 howard_dedup 里因 alias miss 产生的重复 capability.
  4. 所有合并都是 member → canonical:junction 全部改向 canonical,member 删除。
  5. """
  6. import argparse
  7. import sys
  8. from pathlib import Path
  9. sys.path.insert(0, str(Path(__file__).parent.parent.parent))
  10. from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
  11. # canonical -> [members]. canonical 必须存在且是"更好"的版本(含原始 id 或更详细 desc)
  12. CLEANUP = {
  13. # === 视频/动画 ===
  14. "CAP-d92ffc99": ["CAP-bcdfba3c"], # AI 虚拟换装
  15. "CAP-017": ["CAP-1f7f4d72"], # 全向参考
  16. "CAP-3b8df701": ["CAP-974c9781"], # AI VFX 合成
  17. "CAP-98490894": ["CAP-e5934ab7", "CAP-c020af1d"], # 唇形同步
  18. "CAP-7b9d2baf": ["CAP-8c7b1114"], # 首尾帧衔接
  19. "CAP-49175b92": ["CAP-22768ee3"], # 电影镜头运动
  20. # === 图像/构图 ===
  21. "CAP-c9426dcc": ["CAP-8b3d4b8d"], # 产品爆炸分解图
  22. "CAP-7c8532dc": ["CAP-059c8d22"], # 宠物服装虚拟上身
  23. "CAP-fddd3349": ["CAP-7abc6773"], # 网格/全景大图切割
  24. "CAP-1649b549": ["CAP-69f5a034"], # 戏剧性明暗对比
  25. "CAP-d043d289": ["CAP-14378e6f"], # 手持物体精准细节
  26. "CAP-5a1ac59d": ["CAP-5f4aa6b7", "CAP-d1234756", "CAP-82851c39"], # 多情绪表情
  27. "CAP-d93a0ac2": ["CAP-50f66faa"], # 场景光影一致性校正
  28. "CAP-3c49ff0a": ["CAP-f5ea6d52"], # 空间透视
  29. "CAP-d1f429ff": ["CAP-7d07e293"], # 科技感元素
  30. "CAP-8467736a": ["CAP-d1757626"], # 粒子光效
  31. "CAP-5b000814": ["CAP-92c7cc81"], # 结构化 Prompt
  32. "CAP-a35e7966": ["CAP-0545b3ca"], # 霓虹发光
  33. "CAP-2de278d6": ["CAP-6485105e"], # 高饱和配色
  34. "CAP-562d91c1": ["CAP-2ba237a5", "CAP-85ed8dc9", "CAP-a5f8e368"], # 海报多元素排版
  35. "CAP-792fd807": ["CAP-fb3906c8", "CAP-d9f9692f"], # 景深虚化
  36. "CAP-8d69865f": ["CAP-421b1002"], # 照片转绘插画
  37. # === 文字 ===
  38. "CAP-16c5174b": ["CAP-db0cb47e"], # 文字描边阴影
  39. "CAP-00c474e2": ["CAP-e714656e", "CAP-920091f9"], # 视频动效字幕/文字动画
  40. "CAP-ce113c41" if False else "CAP-7423a8b2": [], # (placeholder)
  41. # === 拼贴 ===
  42. "CAP-bc67d346" if False else "CAP-6f73062a": [], # (skip, 6f73062a is current)
  43. # === 信息层级 ===
  44. "CAP-067edd94": ["CAP-64f5a261"], # 层级排版 + 标题字号
  45. # === 数据模板 ===
  46. "CAP-a08749c3": ["CAP-89bf9fd7"], # 模板数据批量套版
  47. # === 文字特效 ===
  48. "CAP-a4b638a6": [], # keep alone (文字特效与动画渲染)
  49. # === 毛绒 ===
  50. "CAP-96182b8f": ["CAP-9692aa0a"], # 毛绒材质渲染
  51. # === 复古大字报 ===
  52. # CAP-c6dfb2b0 特定风格,保留独立
  53. }
  54. # Remove placeholder keys with empty member list
  55. CLEANUP = {k: v for k, v in CLEANUP.items() if v}
  56. JUNCTIONS = [
  57. ("requirement_capability", "capability_id", "requirement_id"),
  58. ("capability_tool", "capability_id", "tool_id"),
  59. ("capability_knowledge", "capability_id", "knowledge_id"),
  60. ("capability_resource", "capability_id", "resource_id"),
  61. ("strategy_capability", "capability_id", "strategy_id"),
  62. ]
  63. def merge_one(cur, canonical, members):
  64. stats = {'junction_upd': 0, 'junction_dup_del': 0, 'cap_del': 0, 'skipped_missing': []}
  65. for member in members:
  66. cur.execute('SELECT 1 FROM capability WHERE id = %s', (member,))
  67. if not cur.fetchone():
  68. stats['skipped_missing'].append(member)
  69. continue
  70. for table, col, other_col in JUNCTIONS:
  71. # delete dups (member has (other_col) that canonical already has)
  72. cur.execute(
  73. f"DELETE FROM {table} WHERE {col} = %s AND {other_col} IN ("
  74. f" SELECT {other_col} FROM {table} WHERE {col} = %s)",
  75. (member, canonical))
  76. stats['junction_dup_del'] += cur.rowcount or 0
  77. # update remaining member refs to canonical
  78. cur.execute(f"UPDATE {table} SET {col} = %s WHERE {col} = %s",
  79. (canonical, member))
  80. stats['junction_upd'] += cur.rowcount or 0
  81. cur.execute('DELETE FROM capability WHERE id = %s', (member,))
  82. stats['cap_del'] += cur.rowcount or 0
  83. return stats
  84. def main():
  85. ap = argparse.ArgumentParser()
  86. g = ap.add_mutually_exclusive_group(required=True)
  87. g.add_argument('--dry-run', action='store_true')
  88. g.add_argument('--execute', action='store_true')
  89. args = ap.parse_args()
  90. s = PostgreSQLCapabilityStore()
  91. cur = s._get_cursor()
  92. try:
  93. cur.execute("SELECT COUNT(*) c FROM capability")
  94. before = cur.fetchone()['c']
  95. print(f'before: {before} caps', flush=True)
  96. total = {'upd': 0, 'dup': 0, 'del': 0, 'skip': []}
  97. for canonical, members in CLEANUP.items():
  98. cur.execute('SELECT name FROM capability WHERE id = %s', (canonical,))
  99. r = cur.fetchone()
  100. if not r:
  101. print(f'❌ canonical {canonical} missing, SKIP cluster', flush=True)
  102. continue
  103. if args.dry_run:
  104. # just check members exist
  105. exists = [m for m in members
  106. if (cur.execute('SELECT 1 FROM capability WHERE id=%s',(m,)),
  107. cur.fetchone())[1]]
  108. print(f'[DRY] {canonical} ({r["name"]}) ← {len(exists)}/{len(members)} members to merge',
  109. flush=True)
  110. else:
  111. stats = merge_one(cur, canonical, members)
  112. total['upd'] += stats['junction_upd']
  113. total['dup'] += stats['junction_dup_del']
  114. total['del'] += stats['cap_del']
  115. total['skip'].extend(stats['skipped_missing'])
  116. print(f'✓ {canonical} ({r["name"][:40]}) ← {stats["cap_del"]} merged '
  117. f'(upd={stats["junction_upd"]}, dup={stats["junction_dup_del"]})',
  118. flush=True)
  119. cur.execute("SELECT COUNT(*) c FROM capability")
  120. after = cur.fetchone()['c']
  121. print(f'\n{"="*50}')
  122. print(f'{"DRY-RUN" if args.dry_run else "EXECUTE"}: {before} → {after} (-{before-after})', flush=True)
  123. if not args.dry_run:
  124. print(f'junction UPDATE={total["upd"]}, dup DELETE={total["dup"]}, cap DELETE={total["del"]}',
  125. flush=True)
  126. if total['skip']:
  127. print(f'skipped (missing): {total["skip"]}', flush=True)
  128. finally:
  129. cur.close()
  130. s.close()
  131. if __name__ == '__main__':
  132. main()