salvage_placeholder_strategies.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. #!/usr/bin/env python3
  2. """
  3. 修复 5 个占位 strategy(REQ_004 / 031 / 053 / 066 / 070):
  4. 把各自的非标准 body schema 正规化成标准 workflow_outline 结构,
  5. 并写 strategy_capability junction(之前 0 条)。
  6. 不改 strategy.id / strategy.name / strategy.description(保留原调研的 rich 描述)。
  7. body 里原 schema 字段也保留(作为 salvage_original),只是增加 workflow_outline。
  8. 各 folder 的 schema 变体:
  9. REQ_004: body.strategy.phases + body.capability_mapping (cap by name, phase_id 映射)
  10. REQ_031: body.strategy.phases[*].capabilities_used (含 'CAP-XXX 名字' 组合字符串)
  11. REQ_053: body.phases[*].capabilities (dict with id/name) + body.core_workflow
  12. REQ_066: body.execution_phases[*].capabilities_used (混合 id 和 name)
  13. REQ_070: body.selected_blueprint.phases[*].capabilities_used
  14. Cap 引用处理:
  15. 1. 若是 'CAP-XXX' 形式且 XXX 存在于 DB → 直接用
  16. 2. 若是 'CAP-tao_dev_1-XX-YY' 老 ID → 通过 MERGE_CLUSTERS 传递闭包解析
  17. 3. 若是纯名字 → 通过 alias map(含 LLM_RENAMES)解析
  18. 4. 解析不到 → 记录到 unresolved,不写 junction
  19. """
  20. import hashlib
  21. import json
  22. import re
  23. import sys
  24. from pathlib import Path
  25. sys.path.insert(0, str(Path(__file__).parent.parent.parent))
  26. from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
  27. from knowhub.scripts.merge_capabilities import MERGE_CLUSTERS
  28. from knowhub.scripts.rename_merged_capabilities import RENAMES
  29. from knowhub.scripts.llm_renames import LLM_RENAMES
  30. TARGET_REQS = ['REQ_004', 'REQ_031', 'REQ_053', 'REQ_066', 'REQ_070']
  31. # 占位 strategy 里特有的老 ID / 含混名字 → canonical
  32. # 这些是 MERGE_CLUSTERS 和 LLM_RENAMES 都没覆盖的 salvage-specific refs
  33. LEGACY_REFS = {
  34. # 老 tao_dev ID
  35. 'CAP-tao_dev_1-02-03': 'CAP-792fd807', # 景深虚化光学模拟
  36. 'CAP-tao_dev_1-03-01': 'CAP-1649b549', # 戏剧性明暗对比
  37. 'CAP-tao_dev_1-00-02': 'CAP-008ee6c9', # 真实感提示词注入
  38. # REQ_004 的名字变体
  39. '结构化提示词工程(PROMPT MASTER)': 'CAP-5b000814',
  40. '参考图融合控制(Omni-Reference)': 'CAP-017',
  41. '手持道具细节强化': 'CAP-d043d289',
  42. # REQ_031 的 is_new cap(现已有 canonical)
  43. '跨物种形态融合生成(is_new)': 'CAP-24dd762b',
  44. '跨物种形态融合生成': 'CAP-24dd762b',
  45. # REQ_053 的变体
  46. '景深虚化光学模拟(前景虚化)': 'CAP-792fd807',
  47. }
  48. def norm(s):
  49. return (s or '').strip().lower()
  50. def build_alias_and_member(cur):
  51. """Return (alias_name→canonical, member_id→canonical) with transitive closure."""
  52. m2c = {}
  53. for canonical, members in MERGE_CLUSTERS.items():
  54. for m in members:
  55. m2c[m] = canonical
  56. def final(cid, limit=10):
  57. seen = set()
  58. while cid in m2c and cid not in seen and limit > 0:
  59. seen.add(cid); cid = m2c[cid]; limit -= 1
  60. return cid
  61. for m in list(m2c.keys()):
  62. m2c[m] = final(m)
  63. alias = {}
  64. cur.execute('SELECT id, name FROM capability')
  65. db_caps = {r['id']: r['name'] for r in cur.fetchall()}
  66. for cid, name in db_caps.items():
  67. alias[norm(name)] = cid
  68. for cid, (new_name, _) in RENAMES.items():
  69. alias[norm(new_name)] = final(cid)
  70. for llm_name, canonical in LLM_RENAMES.items():
  71. alias[norm(llm_name)] = final(canonical)
  72. return alias, m2c, db_caps
  73. def resolve_cap(cap_ref, alias, m2c, db_caps, unresolved):
  74. """cap_ref can be: an id 'CAP-XXX', a name, or 'CAP-XXX 名字' combo."""
  75. if not cap_ref:
  76. return None
  77. cap_ref = str(cap_ref).strip()
  78. # 0. LEGACY_REFS 优先(占位 strategy 特有的老 ID 和 含混名字)
  79. if cap_ref in LEGACY_REFS:
  80. cand = LEGACY_REFS[cap_ref]
  81. if cand in db_caps:
  82. return cand
  83. # Extract leading CAP-... if present
  84. id_match = re.match(r'^(CAP-[\w\-]+)', cap_ref)
  85. candidate_id = id_match.group(1) if id_match else None
  86. # Extract name part
  87. if id_match:
  88. name_part = cap_ref[id_match.end():].strip()
  89. else:
  90. name_part = cap_ref
  91. # 1. Direct ID if exists in DB
  92. if candidate_id and candidate_id in db_caps:
  93. return candidate_id
  94. # 2. LEGACY tao_dev_id
  95. if candidate_id and candidate_id in LEGACY_REFS:
  96. return LEGACY_REFS[candidate_id]
  97. # 3. tao_dev old ID through member->canonical transitive closure
  98. if candidate_id and candidate_id in m2c:
  99. return m2c[candidate_id]
  100. # 4. Name alias
  101. if name_part:
  102. cand = alias.get(norm(name_part))
  103. if cand and cand in db_caps:
  104. return cand
  105. # 5. Try whole ref as name (for cases where no prefix)
  106. cand = alias.get(norm(cap_ref))
  107. if cand and cand in db_caps:
  108. return cand
  109. unresolved.append(cap_ref)
  110. return None
  111. # ═══════════════════════════════════════════════════════════
  112. # Each folder's schema normalizer
  113. def salvage_req_004(body, alias, m2c, db_caps, unresolved):
  114. """body.strategy.phases + body.capability_mapping (phase_id -> caps)."""
  115. strat = body.get('strategy', {})
  116. phases = strat.get('phases', []) if isinstance(strat, dict) else []
  117. cap_map = body.get('capability_mapping', [])
  118. # Build phase_id -> [cap_name, ...]
  119. pid_to_caps = {}
  120. for cm in cap_map:
  121. if not isinstance(cm, dict): continue
  122. cap_name = cm.get('capability')
  123. for pid in cm.get('used_in_phases', []):
  124. # pid can be like 'P1', 'P2(备选)' — take prefix P[digit]
  125. m = re.match(r'(P\d+)', str(pid))
  126. if m:
  127. pid_to_caps.setdefault(m.group(1), []).append(cap_name)
  128. wo = []
  129. for ph in phases:
  130. if not isinstance(ph, dict): continue
  131. pid = ph.get('phase_id', '')
  132. caps_names = pid_to_caps.get(pid, [])
  133. resolved = []
  134. seen = set()
  135. for n in caps_names:
  136. r = resolve_cap(n, alias, m2c, db_caps, unresolved)
  137. if r and r not in seen:
  138. resolved.append({'id': r, 'name': db_caps.get(r, n)})
  139. seen.add(r)
  140. wo.append({
  141. 'phase': ph.get('phase', ''),
  142. 'description': ph.get('description', ''),
  143. 'capabilities': resolved,
  144. })
  145. return wo
  146. def salvage_req_031(body, alias, m2c, db_caps, unresolved):
  147. """body.strategy.phases[*].capabilities_used (strings like 'CAP-003 图像主体一致性保持')."""
  148. strat = body.get('strategy', {})
  149. phases = strat.get('phases', []) if isinstance(strat, dict) else []
  150. wo = []
  151. for ph in phases:
  152. if not isinstance(ph, dict): continue
  153. caps_used = ph.get('capabilities_used', [])
  154. resolved = []
  155. seen = set()
  156. for cu in caps_used:
  157. r = resolve_cap(cu, alias, m2c, db_caps, unresolved)
  158. if r and r not in seen:
  159. resolved.append({'id': r, 'name': db_caps.get(r, cu)})
  160. seen.add(r)
  161. wo.append({
  162. 'phase': ph.get('phase', ''),
  163. 'description': ph.get('description', ''),
  164. 'capabilities': resolved,
  165. })
  166. return wo
  167. def salvage_req_053(body, alias, m2c, db_caps, unresolved):
  168. """body.phases (top-level) with capabilities[*] dict."""
  169. phases = body.get('phases', [])
  170. wo = []
  171. for ph in phases:
  172. if not isinstance(ph, dict): continue
  173. caps_list = ph.get('capabilities', [])
  174. resolved = []
  175. seen = set()
  176. for c in caps_list:
  177. if not isinstance(c, dict): continue
  178. cid = c.get('id')
  179. name = c.get('name', '')
  180. # combined ref
  181. ref = cid if cid else name
  182. r = resolve_cap(ref, alias, m2c, db_caps, unresolved)
  183. if not r and name:
  184. r = resolve_cap(name, alias, m2c, db_caps, unresolved)
  185. if r and r not in seen:
  186. resolved.append({'id': r, 'name': db_caps.get(r, name)})
  187. seen.add(r)
  188. wo.append({
  189. 'phase': ph.get('phase', ''),
  190. 'description': ph.get('description', ''),
  191. 'capabilities': resolved,
  192. })
  193. return wo
  194. def salvage_req_066(body, alias, m2c, db_caps, unresolved):
  195. """body.execution_phases[*].capabilities_used (mixed id/name strings)."""
  196. phases = body.get('execution_phases', [])
  197. wo = []
  198. for ph in phases:
  199. if not isinstance(ph, dict): continue
  200. caps_used = ph.get('capabilities_used', [])
  201. resolved = []
  202. seen = set()
  203. for cu in caps_used:
  204. r = resolve_cap(cu, alias, m2c, db_caps, unresolved)
  205. if r and r not in seen:
  206. resolved.append({'id': r, 'name': db_caps.get(r, cu)})
  207. seen.add(r)
  208. wo.append({
  209. 'phase': ph.get('phase', ''),
  210. 'description': ph.get('description', ''),
  211. 'capabilities': resolved,
  212. })
  213. return wo
  214. def salvage_req_070(body, alias, m2c, db_caps, unresolved):
  215. """body.selected_blueprint.phases[*].capabilities_used."""
  216. sbp = body.get('selected_blueprint', {})
  217. if isinstance(sbp, str):
  218. try: sbp = json.loads(sbp)
  219. except: sbp = {}
  220. phases = sbp.get('phases', []) if isinstance(sbp, dict) else []
  221. wo = []
  222. for ph in phases:
  223. if not isinstance(ph, dict): continue
  224. caps_used = ph.get('capabilities_used', [])
  225. resolved = []
  226. seen = set()
  227. for cu in caps_used:
  228. r = resolve_cap(cu, alias, m2c, db_caps, unresolved)
  229. if r and r not in seen:
  230. resolved.append({'id': r, 'name': db_caps.get(r, cu)})
  231. seen.add(r)
  232. wo.append({
  233. 'phase': ph.get('phase', ''),
  234. 'description': ph.get('description', ''),
  235. 'capabilities': resolved,
  236. })
  237. return wo
  238. SALVAGERS = {
  239. 'REQ_004': salvage_req_004,
  240. 'REQ_031': salvage_req_031,
  241. 'REQ_053': salvage_req_053,
  242. 'REQ_066': salvage_req_066,
  243. 'REQ_070': salvage_req_070,
  244. }
  245. # ═══════════════════════════════════════════════════════════
  246. def main():
  247. s = PostgreSQLCapabilityStore()
  248. cur = s._get_cursor()
  249. try:
  250. alias, m2c, db_caps = build_alias_and_member(cur)
  251. print(f'alias entries: {len(alias)}, members: {len(m2c)}, db caps: {len(db_caps)}', flush=True)
  252. for req_id in TARGET_REQS:
  253. print(f'\n=== {req_id} ===', flush=True)
  254. cur.execute("""SELECT s.id, s.name, s.body FROM strategy s
  255. JOIN requirement_strategy rs ON rs.strategy_id=s.id
  256. WHERE rs.requirement_id=%s""", (req_id,))
  257. row = cur.fetchone()
  258. if not row:
  259. print(f' ⚠️ no strategy found for {req_id}', flush=True)
  260. continue
  261. strat_id, strat_name = row['id'], row['name']
  262. body = row['body'] if isinstance(row['body'], dict) else json.loads(row['body'] or '{}')
  263. unresolved = []
  264. salvager = SALVAGERS[req_id]
  265. wo = salvager(body, alias, m2c, db_caps, unresolved)
  266. phase_count = len(wo)
  267. cap_total = sum(len(ph['capabilities']) for ph in wo)
  268. unique_caps = set()
  269. for ph in wo:
  270. for c in ph['capabilities']:
  271. unique_caps.add(c['id'])
  272. print(f' strategy: {strat_id} ({strat_name})', flush=True)
  273. print(f' produced: {phase_count} phases, {cap_total} cap slots ({len(unique_caps)} unique)', flush=True)
  274. if unresolved:
  275. print(f' unresolved refs: {len(unresolved)}', flush=True)
  276. for u in unresolved[:5]: print(f' - {u!r}', flush=True)
  277. # Add workflow_outline to body (preserve all original fields)
  278. body['workflow_outline'] = wo
  279. body['_salvaged_at'] = '2026-04-22'
  280. body['_salvage_source'] = 'salvage_placeholder_strategies.py'
  281. # Update strategy.body
  282. cur.execute('UPDATE strategy SET body = %s WHERE id = %s',
  283. (json.dumps(body, ensure_ascii=False), strat_id))
  284. # Write strategy_capability junction
  285. # First remove existing (should be 0 but be safe)
  286. cur.execute('DELETE FROM strategy_capability WHERE strategy_id = %s', (strat_id,))
  287. for cap_id in unique_caps:
  288. cur.execute("""INSERT INTO strategy_capability (strategy_id, capability_id, relation_type)
  289. VALUES (%s, %s, 'compose') ON CONFLICT DO NOTHING""", (strat_id, cap_id))
  290. print(f' wrote strategy_capability: {len(unique_caps)} rows', flush=True)
  291. # Verify: count strat_cap rows
  292. cur.execute('SELECT COUNT(*) c FROM strategy_capability WHERE strategy_id=%s', (strat_id,))
  293. print(f' strategy_capability after: {cur.fetchone()["c"]}', flush=True)
  294. # Final verification
  295. print(f'\n{"="*60}', flush=True)
  296. print('All 5 placeholder strategies after salvage:', flush=True)
  297. for req_id in TARGET_REQS:
  298. cur.execute("""SELECT s.id, s.name,
  299. (SELECT COUNT(*) FROM strategy_capability sc WHERE sc.strategy_id=s.id) cap_n
  300. FROM strategy s
  301. JOIN requirement_strategy rs ON rs.strategy_id=s.id
  302. WHERE rs.requirement_id=%s""", (req_id,))
  303. r = cur.fetchone()
  304. if r:
  305. print(f' [{req_id}] {r["id"]} ({r["name"]}): strat_cap={r["cap_n"]}', flush=True)
  306. finally:
  307. cur.close()
  308. s.close()
  309. if __name__ == '__main__':
  310. main()