| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286 |
- #!/usr/bin/env python3
- """
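- Phase 3: cluster the 193 concrete strategies in preparation for abstraction.
- Features:
- - cap_signature: set of canonical capability_ids (deduplicated)
- - phase_count: int
- - tool_set: set of primary tool IDs from workflow_outline[*].capabilities[*].implements
- - name_bigrams: char 2-grams of strategy name
- Clustering approach (combination of several signals):
- 1. Build a pairwise composite similarity:
- composite = CAP_WEIGHT (0.55) * cap_jaccard (IDF-weighted)
- + TOOL_WEIGHT (0.25) * tool_jaccard
- + NAME_WEIGHT (0.15) * name_bigram_jaccard
- + PHASE_WEIGHT (0.05) * phase_count_proximity
- 2. Coarse filter: composite >= COMPOSITE_THRESHOLD (0.40) marks a candidate edge (coarse filter -> recall first)
- 3. Transitive closure over strong edges only (composite >= STRONG_LINK_THRESHOLD, 0.55) yields the candidate clusters
- Output /tmp/strategy_clusters.md containing:
- - the N strategies of each cluster (req_id / name / is_selected / coverage_score)
- - capability intersection of the cluster (majority: held by 50%+ of the members)
- - common tools of the cluster
- - common name 2-grams of the cluster
- - suggested abstract name + judgement hints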
- """
- import json
- import math
- import re
- import sys
- from collections import defaultdict, Counter
- from itertools import combinations
- from pathlib import Path
- sys.path.insert(0, str(Path(__file__).parent.parent.parent))
- from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore
- CAP_WEIGHT = 0.55 # weight of the IDF-weighted capability Jaccard in composite_sim
- TOOL_WEIGHT = 0.25 # weight of the tool-set Jaccard in composite_sim
- NAME_WEIGHT = 0.15 # weight of the name-bigram Jaccard in composite_sim
- PHASE_WEIGHT = 0.05 # weight of the phase-count proximity in composite_sim
- COMPOSITE_THRESHOLD = 0.40 # coarse filter: minimum composite similarity for a candidate edge
- STRONG_LINK_THRESHOLD = 0.55 # the transitive closure only follows edges at or above this value
def norm(s):
    """Return ``s`` lowercased with every whitespace run removed.

    A falsy ``s`` (``None`` or the empty string) yields ``''``.
    """
    text = s if s else ''
    lowered = text.strip().lower()
    return re.sub(r'\s+', '', lowered)
def ch_bigrams(s):
    """Return the set of character 2-grams of ``s`` once whitespace is removed.

    A falsy ``s`` or a string with fewer than two non-whitespace characters
    produces an empty set.
    """
    compact = re.sub(r'\s+', '', s or '')
    # Pairing the string with itself shifted by one yields every adjacent pair.
    return {left + right for left, right in zip(compact, compact[1:])}
def jaccard(a, b):
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two sets.

    When either argument is empty (or otherwise falsy) the result is ``0``,
    so a missing signal never counts as a match.
    """
    if a and b:
        shared = a & b
        combined = a | b
        return len(shared) / len(combined)
    return 0
def phase_proximity(a, b):
    """Score how close two phase counts are; a smaller gap scores higher.

    Returns ``0`` when both counts are zero (no signal on either side),
    ``1.0`` / ``0.7`` / ``0.4`` for a gap of 0 / 1 / 2, and ``0.1`` for any
    larger gap.
    """
    if a == 0 and b == 0:
        return 0  # both empty = no signal
    score_by_gap = {0: 1.0, 1: 0.7, 2: 0.4}
    return score_by_gap.get(abs(a - b), 0.1)
def _workflow_tools(workflow_outline):
    """Return ``(tools, phase_count)`` extracted from one ``workflow_outline`` value.

    ``tools`` collects the keys of every dict-valued ``implements`` entry and the
    string items of every list-valued one; ``phase_count`` is the length of the
    outline. Anything that is not a list yields ``(set(), 0)``.
    """
    tools = set()
    if not isinstance(workflow_outline, list):
        return tools, 0
    for phase in workflow_outline:
        if not isinstance(phase, dict):
            continue
        for cap in phase.get('capabilities') or []:
            if not isinstance(cap, dict):
                continue
            impl = cap.get('implements')
            if isinstance(impl, dict):
                tools.update(impl)
            elif isinstance(impl, list):
                tools.update(t for t in impl if isinstance(t, str))
    return tools, len(workflow_outline)


def extract_strategy_features(cur):
    """Load every (strategy, requirement) row and derive its clustering features.

    Args:
        cur: database cursor whose rows are indexable by column name
            (``execute`` / ``fetchall`` are the only methods used).

    Returns:
        A ``(features, db_caps)`` tuple. ``features`` is a list of dicts with the
        keys ``id``, ``req_id``, ``name``, ``is_selected``, ``coverage_score``,
        ``caps``, ``tools``, ``phases`` and ``name_bigrams``, ordered by
        requirement id with selected strategies first. ``db_caps`` maps
        capability id to capability name.

    ``caps`` is read from the ``strategy_capability`` table; ``tools`` and
    ``phases`` are extracted from ``body.workflow_outline``.
    """
    cur.execute('SELECT id, name FROM capability')
    db_caps = {r['id']: r['name'] for r in cur.fetchall()}

    # strategy_id -> set of capability ids
    strat_caps_map = defaultdict(set)
    cur.execute('SELECT strategy_id, capability_id FROM strategy_capability')
    for r in cur.fetchall():
        strat_caps_map[r['strategy_id']].add(r['capability_id'])

    cur.execute("""
        SELECT s.id, s.name, s.body,
        rs.requirement_id, rs.is_selected, rs.coverage_score
        FROM strategy s
        JOIN requirement_strategy rs ON rs.strategy_id=s.id
        ORDER BY rs.requirement_id, rs.is_selected DESC""")

    features = []
    for row in cur.fetchall():
        raw_body = row['body']
        body = raw_body if isinstance(raw_body, dict) else json.loads(raw_body or '{}')
        if not isinstance(body, dict):
            # A JSON body that decodes to a list/scalar carries no outline.
            body = {}
        tools, phases = _workflow_tools(body.get('workflow_outline') or [])
        features.append({
            'id': row['id'],
            'req_id': row['requirement_id'],
            'name': row['name'],
            'is_selected': row['is_selected'],
            'coverage_score': row['coverage_score'],
            # .get avoids inserting empty entries into the defaultdict on read.
            'caps': strat_caps_map.get(row['id'], set()),
            'tools': tools,
            'phases': phases,
            'name_bigrams': ch_bigrams(row['name']),
        })
    return features, db_caps
def compute_cap_idf(features):
    """Compute inverse document frequency for every capability id.

    Each entry of ``features`` counts as one document; a capability's weight is
    ``log(N / df)``, so rarer capabilities weigh more and a capability present
    in every entry weighs ``0``.

    Returns:
        ``(idf, df)`` where ``idf`` maps capability id to its weight and ``df``
        is a ``Counter`` of how many entries hold each capability.
    """
    total = len(features)
    doc_freq = Counter(cap for strategy in features for cap in strategy['caps'])
    weights = {cap: math.log(total / count) for cap, count in doc_freq.items()}
    return weights, doc_freq
def weighted_jaccard(a, b, idf):
    """Return the IDF-weighted Jaccard similarity of two capability sets.

    The result is the summed weight of the intersection divided by the summed
    weight of the union. Ids missing from ``idf`` weigh ``0``. Returns ``0``
    when the union is empty or its total weight is not positive.
    """
    union = a | b
    if not union:
        return 0
    inter = a & b
    shared_weight = sum(idf.get(c, 0) for c in inter)
    union_weight = sum(idf.get(c, 0) for c in union)
    if union_weight > 0:
        return shared_weight / union_weight
    return 0
def composite_sim(a, b, cap_idf):
    """Combine the four similarity signals of two strategy feature dicts.

    Returns:
        ``(sim, cap_j, tool_j, name_j, phase_p)`` where ``sim`` is the weighted
        sum of the IDF-weighted capability Jaccard, the tool Jaccard, the
        name-bigram Jaccard and the phase-count proximity.
    """
    cap_j = weighted_jaccard(a['caps'], b['caps'], cap_idf)
    tool_j = jaccard(a['tools'], b['tools'])
    name_j = jaccard(a['name_bigrams'], b['name_bigrams'])
    phase_p = phase_proximity(a['phases'], b['phases'])

    # Accumulate left to right so the floating-point result is unchanged.
    sim = CAP_WEIGHT * cap_j
    sim = sim + TOOL_WEIGHT * tool_j
    sim = sim + NAME_WEIGHT * name_j
    sim = sim + PHASE_WEIGHT * phase_p
    return sim, cap_j, tool_j, name_j, phase_p
def cluster_with_strong_links(features, candidate_threshold, strong_threshold, cap_idf):
    """Cluster strategies by following only strong similarity edges.

    Two tiers of edges are computed for every pair of entries in ``features``:
    candidate edges (kept for reference) and strong edges (used to build the
    clusters). Only strong edges take part in the transitive closure, which
    keeps chains of weak links from merging unrelated strategies.

    Args:
        features: list of strategy feature dicts accepted by ``composite_sim``.
        candidate_threshold: minimum composite similarity for a pair to be
            recorded in the returned edge list.
        strong_threshold: minimum composite similarity for a pair to be linked
            into the same cluster.
        cap_idf: capability id -> IDF weight, forwarded to ``composite_sim``.

    Returns:
        ``(clusters, all_edges)``. ``clusters`` is a list of lists of indices
        into ``features`` (every index appears in exactly one cluster;
        unlinked entries form singletons). ``all_edges`` is a list of
        ``(i, j, sim, cap_j, tool_j, name_j, phase_p)`` tuples.
    """
    n = len(features)
    strong_adj = defaultdict(set)
    all_edges = []
    for i, j in combinations(range(n), 2):
        sim, cap_j, tool_j, name_j, phase_p = composite_sim(features[i], features[j], cap_idf)
        if sim >= candidate_threshold:
            all_edges.append((i, j, sim, cap_j, tool_j, name_j, phase_p))
        # Tested independently of the candidate threshold so a strong link is
        # never lost if the two thresholds are configured out of order.
        if sim >= strong_threshold:
            strong_adj[i].add(j)
            strong_adj[j].add(i)

    visited = set()
    clusters = []
    for start in range(n):
        if start in visited:
            continue
        # Iterative depth-first walk over the strong-edge graph; a node with
        # no strong edges simply yields a one-element component.
        component = []
        stack = [start]
        while stack:
            node = stack.pop()
            if node in visited:
                continue
            visited.add(node)
            component.append(node)
            # .get avoids inserting empty entries into the defaultdict on read.
            stack.extend(nb for nb in strong_adj.get(node, ()) if nb not in visited)
        clusters.append(component)
    return clusters, all_edges
def summarize_cluster(idx, members, features, db_caps, out):
    """Write the markdown summary of one cluster to ``out``.

    The summary contains a member table, the capabilities held by a majority
    of the members, the most common tools and the most common name bigrams.

    Args:
        idx: cluster number shown in the heading.
        members: indices into ``features`` of the strategies in this cluster.
        features: list of strategy feature dicts (keys ``req_id``, ``name``,
            ``is_selected``, ``coverage_score``, ``caps``, ``tools``,
            ``phases``, ``name_bigrams``).
        db_caps: capability id -> capability name; unknown ids print ``?``.
        out: writable text stream.
    """
    strats = [features[i] for i in members]
    total = len(strats)
    out.write(f'\n## 簇 {idx}({total} 条 strategies)\n\n')

    # Stats
    cap_count = Counter(c for s in strats for c in s['caps'])
    tool_count = Counter(t for s in strats for t in s['tools'])
    name_bigram_count = Counter(g for s in strats for g in s['name_bigrams'])

    # Integer ceiling of total / 2 (at least 2): counts are integers, so this
    # selects the same caps as comparing against total * 0.5, and the label
    # written below now shows the count that is really required (a ".0f"
    # format of 2.5 would print 2 although 3 members are needed).
    threshold_majority = max(2, (total + 1) // 2)
    majority_caps = [(c, n) for c, n in cap_count.most_common() if n >= threshold_majority]
    common_tools = tool_count.most_common(8)
    common_bigrams = [(g, n) for g, n in name_bigram_count.most_common(6) if n >= 2]

    # Members table
    out.write('| req | is_sel | cov | phases | name |\n')
    out.write('|---|---|---|---|---|\n')
    for s in strats:
        cov = f'{s["coverage_score"]:.2f}' if s['coverage_score'] is not None else '—'
        is_sel = '✓' if s['is_selected'] else ' '
        name = (s['name'] or '')[:60]  # a strategy name may be missing
        out.write(f'| {s["req_id"]} | {is_sel} | {cov} | {s["phases"]} | {name} |\n')

    out.write(f'\n**Majority caps**(≥{threshold_majority}/{total}成员持有):\n')
    for cid, n in majority_caps[:15]:
        out.write(f'- {n}/{total}: `{cid}` {db_caps.get(cid, "?")}\n')

    if common_tools:
        out.write('\n**Common tools**:\n')
        for t, n in common_tools:
            out.write(f'- {n}/{total}: `{t}`\n')

    if common_bigrams:
        out.write('\n**Common name bigrams** (≥2): ')
        out.write(' '.join(f'`{g}`({n})' for g, n in common_bigrams) + '\n')
def main():
    """Run the clustering pipeline and write the markdown report.

    Loads the strategy features through a ``PostgreSQLCapabilityStore`` cursor,
    computes capability IDF weights, clusters the strategies over strong
    links, writes the full report to ``/tmp/strategy_clusters.md`` and prints
    a short summary to stdout. The cursor and the store are closed on exit,
    including when an error occurs.
    """
    store = PostgreSQLCapabilityStore()
    cur = None
    try:
        # Acquired inside the try so the store is still closed if this raises.
        cur = store._get_cursor()
        features, db_caps = extract_strategy_features(cur)
        print(f'Total strategies: {len(features)}', flush=True)
        cap_idf, cap_df = compute_cap_idf(features)

        # Print top non-distinctive caps (high df)
        print('\nTop 10 most common caps (low IDF):')
        for c, df in sorted(cap_df.items(), key=lambda x: -x[1])[:10]:
            print(f' df={df:3d} idf={cap_idf[c]:.2f} {c} {db_caps.get(c,"?")[:40]}')

        clusters, edges = cluster_with_strong_links(
            features, COMPOSITE_THRESHOLD, STRONG_LINK_THRESHOLD, cap_idf)

        # Sort clusters by size desc
        clusters.sort(key=lambda c: -len(c))
        multi = [c for c in clusters if len(c) >= 2]
        singleton = [c for c in clusters if len(c) == 1]
        print(f'Clusters: {len(multi)} multi-strategy + {len(singleton)} singletons', flush=True)

        out_path = Path('/tmp/strategy_clusters.md')
        # The report contains non-ASCII text, so the encoding is pinned rather
        # than left to the platform's locale default.
        with out_path.open('w', encoding='utf-8') as out:
            out.write(f'# Strategy Clusters (composite >= {COMPOSITE_THRESHOLD})\n\n')
            out.write(f'Total strategies: {len(features)}\n')
            out.write(f'Multi-member clusters: {len(multi)}(覆盖 {sum(len(c) for c in multi)} strategies)\n')
            out.write(f'Singletons: {len(singleton)}\n\n')
            out.write(f'Weights: cap={CAP_WEIGHT}, tool={TOOL_WEIGHT}, name={NAME_WEIGHT}, phase={PHASE_WEIGHT}\n\n')
            out.write('---\n\n# Multi-member clusters\n')
            for idx, members in enumerate(multi, 1):
                summarize_cluster(idx, members, features, db_caps, out)
            out.write('\n\n---\n\n# Singletons(独立 strategy)\n\n')
            out.write(f'{len(singleton)} strategies 没找到相似伙伴。\n\n')
            for c in singleton:
                s_ = features[c[0]]
                cov = f'{s_["coverage_score"]:.2f}' if s_['coverage_score'] is not None else '—'
                out.write(f'- [{s_["req_id"]}] caps={len(s_["caps"])}, phases={s_["phases"]}, cov={cov}: {s_["name"][:60]}\n')
        print(f'Written: {out_path}', flush=True)

        # Print summary
        print('\n== Multi-member clusters ==')
        for idx, members in enumerate(multi[:20], 1):
            req_ids = [features[i]['req_id'] for i in members]
            print(f' [{idx}] {len(members)} members: {", ".join(sorted(set(req_ids)))}')
        if len(multi) > 20:
            print(f' ... ({len(multi)-20} more)')
    finally:
        if cur is not None:
            cur.close()
        store.close()


if __name__ == '__main__':
    main()
|