#!/usr/bin/env python3
"""Phase 3: cluster the 193 concrete strategies to prepare for abstraction.

Per-strategy features:
- cap signature: set of canonical capability_ids (deduplicated)
- phase_count: int
- tool_set: set of primary tool IDs from workflow_outline[*].capabilities[*].implements
- name_bigrams: char 2-grams of the strategy name

Clustering (multi-signal composite):
1. Pairwise composite similarity:
       composite = CAP_WEIGHT * cap_jaccard(IDF-weighted)
                 + TOOL_WEIGHT * tool_jaccard
                 + NAME_WEIGHT * name_bigram_jaccard
                 + PHASE_WEIGHT * phase_count_proximity
   (actual values come from the weight constants below)
2. Candidate screen: composite >= COMPOSITE_THRESHOLD (coarse, recall-first)
3. Transitive closure over strong edges only (>= STRONG_LINK_THRESHOLD)
   yields the clusters.

Writes /tmp/strategy_clusters.md containing, per cluster:
- N member strategies (req_id / name / is_selected / coverage_score)
- caps held by a majority (50%+) of members
- common tools and common name 2-grams
- hints for choosing an abstract name
"""
import json
import math
import re
import sys
from collections import defaultdict, Counter
from itertools import combinations
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore

# Composite-similarity weights; they sum to 1.0.
CAP_WEIGHT = 0.55
TOOL_WEIGHT = 0.25
NAME_WEIGHT = 0.15
PHASE_WEIGHT = 0.05
COMPOSITE_THRESHOLD = 0.40    # coarse candidate screen
STRONG_LINK_THRESHOLD = 0.55  # transitive closure follows only strong edges


def norm(s):
    """Lowercase *s* and strip all whitespace. (Helper; not currently called.)"""
    return re.sub(r'\s+', '', (s or '').strip().lower())


def ch_bigrams(s):
    """Return the set of character 2-grams of *s* after removing whitespace."""
    s = re.sub(r'\s+', '', s or '')
    return set(s[i:i + 2] for i in range(len(s) - 1))


def jaccard(a, b):
    """Plain Jaccard similarity of two sets; 0 when either set is empty."""
    if not a or not b:
        return 0
    return len(a & b) / len(a | b)


def phase_proximity(a, b):
    """The smaller the phase-count difference, the higher the similarity."""
    if a == 0 and b == 0:
        return 0  # both empty = no signal
    d = abs(a - b)
    if d == 0:
        return 1.0
    if d == 1:
        return 0.7
    if d == 2:
        return 0.4
    return 0.1


def extract_strategy_features(cur):
    """Return (features, db_caps).

    features is a list of
    {id, req_id, name, is_selected, coverage_score, caps, tools, phases,
     name_bigrams}.

    caps are read from the strategy_capability junction table (already
    alias-resolved, hence accurate); tools and phases are extracted from
    body.workflow_outline.
    """
    db_caps = {}
    cur.execute('SELECT id, name FROM capability')
    for r in cur.fetchall():
        db_caps[r['id']] = r['name']

    # strategy_id -> set of cap_ids
    strat_caps_map = defaultdict(set)
    cur.execute('SELECT strategy_id, capability_id FROM strategy_capability')
    for r in cur.fetchall():
        strat_caps_map[r['strategy_id']].add(r['capability_id'])

    cur.execute("""
        SELECT s.id, s.name, s.body, rs.requirement_id, rs.is_selected, rs.coverage_score
        FROM strategy s JOIN requirement_strategy rs ON rs.strategy_id=s.id
        ORDER BY rs.requirement_id, rs.is_selected DESC""")
    rows = cur.fetchall()

    features = []
    for row in rows:
        # body may arrive as an already-decoded dict or as a JSON string/NULL.
        body = row['body'] if isinstance(row['body'], dict) else json.loads(row['body'] or '{}')
        wo = body.get('workflow_outline') or []
        tools = set()
        phases = 0
        if isinstance(wo, list):
            phases = len(wo)
            for ph in wo:
                if not isinstance(ph, dict):
                    continue
                for c in ph.get('capabilities', []) or []:
                    if not isinstance(c, dict):
                        continue
                    # implements is either {tool_id: ...} or [tool_id, ...].
                    impl = c.get('implements')
                    if isinstance(impl, dict):
                        tools.update(impl)
                    elif isinstance(impl, list):
                        tools.update(t for t in impl if isinstance(t, str))
        features.append({
            'id': row['id'],
            'req_id': row['requirement_id'],
            'name': row['name'],
            'is_selected': row['is_selected'],
            'coverage_score': row['coverage_score'],
            'caps': strat_caps_map[row['id']],  # from the junction table — accurate
            'tools': tools,
            'phases': phases,
            'name_bigrams': ch_bigrams(row['name']),
        })
    return features, db_caps


def compute_cap_idf(features):
    """Return (idf, df): idf = log(N / df(cap)) — rare caps get higher weight."""
    N = len(features)
    df = Counter()
    for s in features:
        for c in s['caps']:
            df[c] += 1
    idf = {c: math.log(N / n) for c, n in df.items()}
    return idf, df


def weighted_jaccard(a, b, idf):
    """Sum of IDF weights over intersection / union."""
    inter = a & b
    union = a | b
    if not union:
        return 0
    w_inter = sum(idf.get(c, 0) for c in inter)
    w_union = sum(idf.get(c, 0) for c in union)
    return w_inter / w_union if w_union > 0 else 0


def composite_sim(a, b, cap_idf):
    """IDF-weighted cap similarity combined with tool/name/phase signals.

    Returns (sim, cap_j, tool_j, name_j, phase_p) so callers can inspect
    the individual components.
    """
    cap_j = weighted_jaccard(a['caps'], b['caps'], cap_idf)
    tool_j = jaccard(a['tools'], b['tools'])
    name_j = jaccard(a['name_bigrams'], b['name_bigrams'])
    phase_p = phase_proximity(a['phases'], b['phases'])
    sim = (CAP_WEIGHT * cap_j + TOOL_WEIGHT * tool_j
           + NAME_WEIGHT * name_j + PHASE_WEIGHT * phase_p)
    return sim, cap_j, tool_j, name_j, phase_p


def cluster_with_strong_links(features, candidate_threshold, strong_threshold, cap_idf):
    """Two tiers: candidate edges (for reference) + strong edges (for clusters).

    Only strong edges feed the transitive closure, which avoids chaining
    through weak links. Returns (clusters, all_edges) where clusters is a
    list of member-index lists and all_edges holds every candidate edge
    with its component similarities.
    """
    n = len(features)
    strong_adj = defaultdict(set)
    all_edges = []
    for i, j in combinations(range(n), 2):
        sim, cap_j, tool_j, name_j, phase_p = composite_sim(features[i], features[j], cap_idf)
        if sim >= candidate_threshold:
            all_edges.append((i, j, sim, cap_j, tool_j, name_j, phase_p))
            if sim >= strong_threshold:
                strong_adj[i].add(j)
                strong_adj[j].add(i)

    # Connected components over strong edges (iterative DFS).
    visited = set()
    clusters = []
    for i in range(n):
        if i in visited:
            continue
        if i not in strong_adj:  # no strong neighbors → singleton
            clusters.append([i])
            visited.add(i)
            continue
        comp = []
        stack = [i]
        while stack:
            x = stack.pop()
            if x in visited:
                continue
            visited.add(x)
            comp.append(x)
            for y in strong_adj[x]:
                if y not in visited:
                    stack.append(y)
        clusters.append(comp)
    return clusters, all_edges


def summarize_cluster(idx, members, features, db_caps, out):
    """Write cluster summary to markdown file out."""
    strats = [features[i] for i in members]
    out.write(f'\n## 簇 {idx}({len(strats)} 条 strategies)\n\n')

    # Frequency stats across members.
    cap_count = Counter()
    for s in strats:
        for c in s['caps']:
            cap_count[c] += 1
    tool_count = Counter()
    for s in strats:
        for t in s['tools']:
            tool_count[t] += 1
    name_bigram_count = Counter()
    for s in strats:
        for g in s['name_bigrams']:
            name_bigram_count[g] += 1

    total = len(strats)
    # A cap is "majority" when held by >= 50% of members (and at least 2).
    threshold_majority = max(2, total * 0.5)
    majority_caps = [(c, n) for c, n in cap_count.most_common() if n >= threshold_majority]
    common_tools = [(t, n) for t, n in tool_count.most_common(8)]
    common_bigrams = [(g, n) for g, n in name_bigram_count.most_common(6) if n >= 2]

    # Members table
    out.write('| req | is_sel | cov | phases | name |\n')
    out.write('|---|---|---|---|---|\n')
    for s in strats:
        cov = f'{s["coverage_score"]:.2f}' if s['coverage_score'] is not None else '—'
        is_sel = '✓' if s['is_selected'] else ' '
        out.write(f'| {s["req_id"]} | {is_sel} | {cov} | {s["phases"]} | {s["name"][:60]} |\n')

    out.write(f'\n**Majority caps**(≥{threshold_majority:.0f}/{total}成员持有):\n')
    for cid, n in majority_caps[:15]:
        out.write(f'- {n}/{total}: `{cid}` {db_caps.get(cid, "?")}\n')
    if common_tools:
        out.write('\n**Common tools**:\n')
        for t, n in common_tools:
            out.write(f'- {n}/{total}: `{t}`\n')
    if common_bigrams:
        out.write('\n**Common name bigrams** (≥2): ')
        out.write(' '.join(f'`{g}`({n})' for g, n in common_bigrams) + '\n')


def main():
    s = PostgreSQLCapabilityStore()
    cur = s._get_cursor()
    try:
        features, db_caps = extract_strategy_features(cur)
        print(f'Total strategies: {len(features)}', flush=True)

        cap_idf, cap_df = compute_cap_idf(features)
        # Print top non-distinctive caps (high df → low IDF).
        print('\nTop 10 most common caps (low IDF):')
        for c, df in sorted(cap_df.items(), key=lambda x: -x[1])[:10]:
            print(f' df={df:3d} idf={cap_idf[c]:.2f} {c} {db_caps.get(c,"?")[:40]}')

        clusters, edges = cluster_with_strong_links(
            features, COMPOSITE_THRESHOLD, STRONG_LINK_THRESHOLD, cap_idf)

        # Sort clusters by size desc
        clusters.sort(key=lambda c: -len(c))
        multi = [c for c in clusters if len(c) >= 2]
        singleton = [c for c in clusters if len(c) == 1]
        print(f'Clusters: {len(multi)} multi-strategy + {len(singleton)} singletons', flush=True)

        out_path = Path('/tmp/strategy_clusters.md')
        with out_path.open('w') as out:
            out.write(f'# Strategy Clusters (composite >= {COMPOSITE_THRESHOLD})\n\n')
            out.write(f'Total strategies: {len(features)}\n')
            out.write(f'Multi-member clusters: {len(multi)}(覆盖 {sum(len(c) for c in multi)} strategies)\n')
            out.write(f'Singletons: {len(singleton)}\n\n')
            out.write(f'Weights: cap={CAP_WEIGHT}, tool={TOOL_WEIGHT}, name={NAME_WEIGHT}, phase={PHASE_WEIGHT}\n\n')
            out.write('---\n\n# Multi-member clusters\n')
            for idx, members in enumerate(multi, 1):
                summarize_cluster(idx, members, features, db_caps, out)
            out.write('\n\n---\n\n# Singletons(独立 strategy)\n\n')
            out.write(f'{len(singleton)} strategies 没找到相似伙伴。\n\n')
            for c in singleton:
                s_ = features[c[0]]
                cov = f'{s_["coverage_score"]:.2f}' if s_['coverage_score'] is not None else '—'
                out.write(f'- [{s_["req_id"]}] caps={len(s_["caps"])}, phases={s_["phases"]}, cov={cov}: {s_["name"][:60]}\n')
        print(f'Written: {out_path}', flush=True)

        # Print summary
        print('\n== Multi-member clusters ==')
        for idx, members in enumerate(multi[:20], 1):
            req_ids = [features[i]['req_id'] for i in members]
            print(f' [{idx}] {len(members)} members: {", ".join(sorted(set(req_ids)))}')
        if len(multi) > 20:
            print(f' ... ({len(multi)-20} more)')
    finally:
        cur.close()
        s.close()


if __name__ == '__main__':
    main()