phase5_fill_alt_knowledge_capability.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. #!/usr/bin/env python3
  2. """
  3. 补齐 94 个 alt knowledge 的 knowledge_capability 行。
  4. 原因:bk_20260422_strategy_capability 是 Phase 2 之前的快照,只有 99 个 selected strategy 的 cap。
  5. Phase 2 插入了 94 个 alt 的 cap 进 strategy_capability,Phase 4 迁移时读的是 live(包括 alt),
  6. 但 phase5_add_knowledge_capability.py 用的是 backup(丢 alt)。
  7. 修复方式:重新从源 strategy.json 解析每条 alt 的 workflow_outline caps,
  8. 然后用 (req_id, strategy_name) 匹配 knowledge,INSERT knowledge_capability。
  9. """
  10. import hashlib
  11. import json
  12. import re
  13. import sys
  14. from pathlib import Path
  15. sys.path.insert(0, str(Path(__file__).parent.parent.parent))
  16. from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
  17. from knowhub.scripts.merge_capabilities import MERGE_CLUSTERS
  18. from knowhub.scripts.rename_merged_capabilities import RENAMES
  19. from knowhub.scripts.llm_renames import LLM_RENAMES
  20. from knowhub.scripts.salvage_placeholder_strategies import LEGACY_REFS
  21. OUTPUT_DIR = Path('/Users/sunlit/Downloads/output 2')
  22. RERUN_DIR = Path('/Users/sunlit/Downloads/5')
  23. RERUN_FOLDERS = {'032', '046', '069', '085', '097'}
  24. def norm(s): return (s or '').strip().lower()
  25. def build_alias_and_member(cur):
  26. m2c = {}
  27. for canonical, members in MERGE_CLUSTERS.items():
  28. for m in members: m2c[m] = canonical
  29. def final(cid, limit=10):
  30. seen = set()
  31. while cid in m2c and cid not in seen and limit > 0:
  32. seen.add(cid); cid = m2c[cid]; limit -= 1
  33. return cid
  34. for m in list(m2c.keys()): m2c[m] = final(m)
  35. alias = {}
  36. cur.execute('SELECT id, name FROM capability')
  37. db_caps = {r['id']: r['name'] for r in cur.fetchall()}
  38. for cid, name in db_caps.items():
  39. alias[norm(name)] = cid
  40. for cid, (new_name, _) in RENAMES.items():
  41. alias[norm(new_name)] = final(cid)
  42. for llm_name, canonical in LLM_RENAMES.items():
  43. alias[norm(llm_name)] = final(canonical)
  44. return alias, m2c, db_caps
  45. def resolve_cap_ref(cap_ref, alias, m2c, db_caps):
  46. if not cap_ref: return None
  47. if isinstance(cap_ref, dict):
  48. cid = cap_ref.get('id')
  49. if cid and cid in db_caps: return cid
  50. if cid and cid in LEGACY_REFS: return LEGACY_REFS[cid]
  51. if cid in m2c: return m2c[cid]
  52. name = cap_ref.get('name', '')
  53. if name:
  54. cand = alias.get(norm(name))
  55. if cand and cand in db_caps: return cand
  56. return None
  57. if isinstance(cap_ref, str):
  58. if cap_ref in LEGACY_REFS: return LEGACY_REFS[cap_ref]
  59. m = re.match(r'^(CAP-[\w\-]+)', cap_ref)
  60. if m:
  61. if m.group(1) in db_caps: return m.group(1)
  62. if m.group(1) in LEGACY_REFS: return LEGACY_REFS[m.group(1)]
  63. cand = alias.get(norm(cap_ref))
  64. if cand and cand in db_caps: return cand
  65. return None
  66. def extract_strat_caps(s_data, alias, m2c, db_caps):
  67. wo = s_data.get('workflow_outline') or []
  68. caps = set()
  69. if isinstance(wo, list):
  70. for ph in wo:
  71. if not isinstance(ph, dict): continue
  72. for c in ph.get('capabilities', []) or []:
  73. r = resolve_cap_ref(c, alias, m2c, db_caps)
  74. if r: caps.add(r)
  75. return caps
  76. def main():
  77. s = PostgreSQLCapabilityStore()
  78. cur = s._get_cursor()
  79. try:
  80. alias, m2c, db_caps = build_alias_and_member(cur)
  81. print(f'alias={len(alias)}, m2c={len(m2c)}, db_caps={len(db_caps)}', flush=True)
  82. # 先识别 0-cap 的 knowledge(绝大多是 alt strategies)
  83. cur.execute("""SELECT k.id, k.source FROM knowledge k
  84. WHERE k.version='howard_strategy_instance'
  85. AND NOT EXISTS (SELECT 1 FROM knowledge_capability kc
  86. WHERE kc.knowledge_id = k.id)""")
  87. zero_cap_knowledge = cur.fetchall()
  88. print(f'knowledge with 0 cap links: {len(zero_cap_knowledge)}', flush=True)
  89. # 为这些 knowledge 重新解析原 strategy 的 caps
  90. # knowledge.source.original_strategy_id → 老具体 strategy id(已被删除)
  91. # 但 knowledge.tags.original_strategy_name 保留了名字
  92. # 可以通过 req_id + name 在源 strategy.json 里定位
  93. # 1) 从源文件读所有 (req_id, strategy_name) → caps
  94. source_cap_map = {} # (req_id, strategy_name) → set(cap_id)
  95. # 获取 req_id → req_text 的映射,再 req_text → folder
  96. cur.execute('SELECT id, description FROM requirement')
  97. req_by_id = {r['id']: r['description'] for r in cur.fetchall()}
  98. folders = []
  99. for d in sorted(OUTPUT_DIR.iterdir()):
  100. if not d.is_dir(): continue
  101. if d.name in RERUN_FOLDERS:
  102. folders.append(RERUN_DIR / d.name)
  103. else:
  104. folders.append(d)
  105. for folder in folders:
  106. strat_path = folder / 'strategy.json'
  107. if not strat_path.exists(): continue
  108. try:
  109. sd = json.loads(strat_path.read_text(encoding='utf-8'))
  110. except Exception:
  111. continue
  112. if not isinstance(sd, dict): continue
  113. req_text = sd.get('requirement', '')
  114. if not req_text: continue
  115. # Find req_id by exact match
  116. matched_req = None
  117. for rid, rtext in req_by_id.items():
  118. if rtext == req_text:
  119. matched_req = rid; break
  120. if not matched_req: continue
  121. for s_data in sd.get('strategies', []):
  122. if not isinstance(s_data, dict): continue
  123. name = s_data.get('name', '')
  124. caps = extract_strat_caps(s_data, alias, m2c, db_caps)
  125. source_cap_map[(matched_req, name)] = caps
  126. print(f'source (req, name) → caps 映射共 {len(source_cap_map)} 条', flush=True)
  127. # 2) 匹配 0-cap knowledge 到 source,插入 knowledge_capability
  128. total_ins = 0
  129. not_matched = []
  130. for k in zero_cap_knowledge:
  131. src = k['source'] if isinstance(k['source'], dict) else json.loads(k['source'] or '{}')
  132. # tags 里有 original_strategy_name
  133. # 我们从 knowledge 里额外查
  134. cur.execute('SELECT tags FROM knowledge WHERE id = %s', (k['id'],))
  135. tags = cur.fetchone()['tags']
  136. tags = tags if isinstance(tags, dict) else json.loads(tags or '{}')
  137. orig_name = tags.get('original_strategy_name', '')
  138. # req_id 从 requirement_knowledge 查
  139. cur.execute("""SELECT requirement_id FROM requirement_knowledge
  140. WHERE knowledge_id = %s""", (k['id'],))
  141. row = cur.fetchone()
  142. if not row: continue
  143. req_id = row['requirement_id']
  144. caps = source_cap_map.get((req_id, orig_name), set())
  145. if not caps:
  146. not_matched.append((k['id'], req_id, orig_name))
  147. continue
  148. for cap_id in caps:
  149. cur.execute("""INSERT INTO knowledge_capability (knowledge_id, capability_id)
  150. VALUES (%s, %s) ON CONFLICT DO NOTHING""", (k['id'], cap_id))
  151. total_ins += 1
  152. print(f'\ninserted: {total_ins}', flush=True)
  153. print(f'still not matched: {len(not_matched)}', flush=True)
  154. for item in not_matched[:8]:
  155. print(f' {item}', flush=True)
  156. # 3) 验证
  157. cur.execute("""SELECT COUNT(*) c FROM knowledge
  158. WHERE version='howard_strategy_instance'
  159. AND NOT EXISTS (SELECT 1 FROM knowledge_capability kc
  160. WHERE kc.knowledge_id = knowledge.id)""")
  161. still_zero = cur.fetchone()['c']
  162. print(f'\n仍 0 cap 的 knowledge: {still_zero}', flush=True)
  163. # 约束验证
  164. cur.execute("""SELECT COUNT(*) c FROM knowledge_capability kc
  165. JOIN requirement_knowledge rk ON rk.knowledge_id = kc.knowledge_id
  166. LEFT JOIN requirement_capability rc
  167. ON rc.requirement_id = rk.requirement_id
  168. AND rc.capability_id = kc.capability_id
  169. WHERE rc.capability_id IS NULL""")
  170. print(f'knowledge_cap ⊄ req_cap 违规: {cur.fetchone()["c"]}', flush=True)
  171. cur.execute('SELECT COUNT(*) c FROM knowledge_capability')
  172. print(f'knowledge_capability 总行数: {cur.fetchone()["c"]}', flush=True)
  173. finally:
  174. cur.close()
  175. s.close()
  176. if __name__ == '__main__':
  177. main()