#!/usr/bin/env python3
"""Phase 3: cluster the 193 concrete strategies to prepare for abstraction.

Per-strategy features:
- cap signature: set of canonical capability_ids (deduplicated)
- phase_count: int
- tool_set: set of primary tool IDs from workflow_outline[*].capabilities[*].implements
- name_bigrams: char 2-grams of the strategy name

Clustering (multi-signal composite):
1. Pairwise composite similarity:
       composite = CAP_WEIGHT * cap_jaccard(IDF-weighted)
                 + TOOL_WEIGHT * tool_jaccard
                 + NAME_WEIGHT * name_bigram_jaccard
                 + PHASE_WEIGHT * phase_count_proximity
   (actual values come from the weight constants below)
2. Candidate screen: composite >= COMPOSITE_THRESHOLD (coarse, recall-first)
3. Transitive closure over strong edges only (>= STRONG_LINK_THRESHOLD)
   yields the clusters.

Writes /tmp/strategy_clusters.md containing, per cluster:
- N member strategies (req_id / name / is_selected / coverage_score)
- caps held by a majority (50%+) of members
- common tools and common name 2-grams
- hints for choosing an abstract name
"""
import json
import math
import re
import sys
from collections import defaultdict, Counter
from itertools import combinations
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore

# Composite-similarity weights; they sum to 1.0.
CAP_WEIGHT = 0.55
TOOL_WEIGHT = 0.25
NAME_WEIGHT = 0.15
PHASE_WEIGHT = 0.05
COMPOSITE_THRESHOLD = 0.40    # coarse candidate screen
STRONG_LINK_THRESHOLD = 0.55  # transitive closure follows only strong edges


def norm(s):
    """Lowercase *s* and strip all whitespace. (Helper; not currently called.)"""
    return re.sub(r'\s+', '', (s or '').strip().lower())


def ch_bigrams(s):
    """Return the set of character 2-grams of *s* after removing whitespace."""
    s = re.sub(r'\s+', '', s or '')
    return set(s[i:i + 2] for i in range(len(s) - 1))


def jaccard(a, b):
    """Plain Jaccard similarity of two sets; 0 when either set is empty."""
    if not a or not b:
        return 0
    return len(a & b) / len(a | b)


def phase_proximity(a, b):
    """The smaller the phase-count difference, the higher the similarity."""
    if a == 0 and b == 0:
        return 0  # both empty = no signal
    d = abs(a - b)
    if d == 0:
        return 1.0
    if d == 1:
        return 0.7
    if d == 2:
        return 0.4
    return 0.1


def extract_strategy_features(cur):
    """Return (features, db_caps).

    features is a list of
    {id, req_id, name, is_selected, coverage_score, caps, tools, phases,
     name_bigrams}.

    caps are read from the strategy_capability junction table (already
    alias-resolved, hence accurate); tools and phases are extracted from
    body.workflow_outline.
    """
    db_caps = {}
    cur.execute('SELECT id, name FROM capability')
    for r in cur.fetchall():
        db_caps[r['id']] = r['name']

    # strategy_id -> set of cap_ids
    strat_caps_map = defaultdict(set)
    cur.execute('SELECT strategy_id, capability_id FROM strategy_capability')
    for r in cur.fetchall():
        strat_caps_map[r['strategy_id']].add(r['capability_id'])

    cur.execute("""
        SELECT s.id, s.name, s.body, rs.requirement_id, rs.is_selected, rs.coverage_score
        FROM strategy s JOIN requirement_strategy rs ON rs.strategy_id=s.id
        ORDER BY rs.requirement_id, rs.is_selected DESC""")
    rows = cur.fetchall()

    features = []
    for row in rows:
        # body may arrive as an already-decoded dict or as a JSON string/NULL.
        body = row['body'] if isinstance(row['body'], dict) else json.loads(row['body'] or '{}')
        wo = body.get('workflow_outline') or []
        tools = set()
        phases = 0
        if isinstance(wo, list):
            phases = len(wo)
            for ph in wo:
                if not isinstance(ph, dict):
                    continue
                for c in ph.get('capabilities', []) or []:
                    if not isinstance(c, dict):
                        continue
                    # implements is either {tool_id: ...} or [tool_id, ...].
                    impl = c.get('implements')
                    if isinstance(impl, dict):
                        tools.update(impl)
                    elif isinstance(impl, list):
                        tools.update(t for t in impl if isinstance(t, str))
        features.append({
            'id': row['id'],
            'req_id': row['requirement_id'],
            'name': row['name'],
            'is_selected': row['is_selected'],
            'coverage_score': row['coverage_score'],
            'caps': strat_caps_map[row['id']],  # from the junction table — accurate
            'tools': tools,
            'phases': phases,
            'name_bigrams': ch_bigrams(row['name']),
        })
    return features, db_caps


def compute_cap_idf(features):
    """Return (idf, df): idf = log(N / df(cap)) — rare caps get higher weight."""
    N = len(features)
    df = Counter()
    for s in features:
        for c in s['caps']:
            df[c] += 1
    idf = {c: math.log(N / n) for c, n in df.items()}
    return idf, df


def weighted_jaccard(a, b, idf):
    """Sum of IDF weights over intersection / union."""
    inter = a & b
    union = a | b
    if not union:
        return 0
    w_inter = sum(idf.get(c, 0) for c in inter)
    w_union = sum(idf.get(c, 0) for c in union)
    return w_inter / w_union if w_union > 0 else 0


def composite_sim(a, b, cap_idf):
    """IDF-weighted cap similarity combined with tool/name/phase signals.

    Returns (sim, cap_j, tool_j, name_j, phase_p) so callers can inspect
    the individual components.
    """
    cap_j = weighted_jaccard(a['caps'], b['caps'], cap_idf)
    tool_j = jaccard(a['tools'], b['tools'])
    name_j = jaccard(a['name_bigrams'], b['name_bigrams'])
    phase_p = phase_proximity(a['phases'], b['phases'])
    sim = (CAP_WEIGHT * cap_j + TOOL_WEIGHT * tool_j
           + NAME_WEIGHT * name_j + PHASE_WEIGHT * phase_p)
    return sim, cap_j, tool_j, name_j, phase_p


def cluster_with_strong_links(features, candidate_threshold, strong_threshold, cap_idf):
    """Two tiers: candidate edges (for reference) + strong edges (for clusters).

    Only strong edges feed the transitive closure, which avoids chaining
    through weak links. Returns (clusters, all_edges) where clusters is a
    list of member-index lists and all_edges holds every candidate edge
    with its component similarities.
    """
    n = len(features)
    strong_adj = defaultdict(set)
    all_edges = []
    for i, j in combinations(range(n), 2):
        sim, cap_j, tool_j, name_j, phase_p = composite_sim(features[i], features[j], cap_idf)
        if sim >= candidate_threshold:
            all_edges.append((i, j, sim, cap_j, tool_j, name_j, phase_p))
            if sim >= strong_threshold:
                strong_adj[i].add(j)
                strong_adj[j].add(i)

    # Connected components over strong edges (iterative DFS).
    visited = set()
    clusters = []
    for i in range(n):
        if i in visited:
            continue
        if i not in strong_adj:  # no strong neighbors → singleton
            clusters.append([i])
            visited.add(i)
            continue
        comp = []
        stack = [i]
        while stack:
            x = stack.pop()
            if x in visited:
                continue
            visited.add(x)
            comp.append(x)
            for y in strong_adj[x]:
                if y not in visited:
                    stack.append(y)
        clusters.append(comp)
    return clusters, all_edges


def summarize_cluster(idx, members, features, db_caps, out):
    """Write cluster summary to markdown file out."""
    strats = [features[i] for i in members]
    out.write(f'\n## 簇 {idx}({len(strats)} 条 strategies)\n\n')

    # Frequency stats across members.
    cap_count = Counter()
    for s in strats:
        for c in s['caps']:
            cap_count[c] += 1
    tool_count = Counter()
    for s in strats:
        for t in s['tools']:
            tool_count[t] += 1
    name_bigram_count = Counter()
    for s in strats:
        for g in s['name_bigrams']:
            name_bigram_count[g] += 1

    total = len(strats)
    # A cap is "majority" when held by >= 50% of members (and at least 2).
    threshold_majority = max(2, total * 0.5)
    majority_caps = [(c, n) for c, n in cap_count.most_common() if n >= threshold_majority]
    common_tools = [(t, n) for t, n in tool_count.most_common(8)]
    common_bigrams = [(g, n) for g, n in name_bigram_count.most_common(6) if n >= 2]

    # Members table
    out.write('| req | is_sel | cov | phases | name |\n')
    out.write('|---|---|---|---|---|\n')
    for s in strats:
        cov = f'{s["coverage_score"]:.2f}' if s['coverage_score'] is not None else '—'
        is_sel = '✓' if s['is_selected'] else ' '
        out.write(f'| {s["req_id"]} | {is_sel} | {cov} | {s["phases"]} | {s["name"][:60]} |\n')

    out.write(f'\n**Majority caps**(≥{threshold_majority:.0f}/{total}成员持有):\n')
    for cid, n in majority_caps[:15]:
        out.write(f'- {n}/{total}: `{cid}` {db_caps.get(cid, "?")}\n')
    if common_tools:
        out.write('\n**Common tools**:\n')
        for t, n in common_tools:
            out.write(f'- {n}/{total}: `{t}`\n')
    if common_bigrams:
        out.write('\n**Common name bigrams** (≥2): ')
        out.write(' '.join(f'`{g}`({n})' for g, n in common_bigrams) + '\n')


def main():
    s = PostgreSQLCapabilityStore()
    cur = s._get_cursor()
    try:
        features, db_caps = extract_strategy_features(cur)
        print(f'Total strategies: {len(features)}', flush=True)

        cap_idf, cap_df = compute_cap_idf(features)
        # Print top non-distinctive caps (high df → low IDF).
        print('\nTop 10 most common caps (low IDF):')
        for c, df in sorted(cap_df.items(), key=lambda x: -x[1])[:10]:
            print(f' df={df:3d} idf={cap_idf[c]:.2f} {c} {db_caps.get(c,"?")[:40]}')

        clusters, edges = cluster_with_strong_links(
            features, COMPOSITE_THRESHOLD, STRONG_LINK_THRESHOLD, cap_idf)

        # Sort clusters by size desc
        clusters.sort(key=lambda c: -len(c))
        multi = [c for c in clusters if len(c) >= 2]
        singleton = [c for c in clusters if len(c) == 1]
        print(f'Clusters: {len(multi)} multi-strategy + {len(singleton)} singletons', flush=True)

        out_path = Path('/tmp/strategy_clusters.md')
        with out_path.open('w') as out:
            out.write(f'# Strategy Clusters (composite >= {COMPOSITE_THRESHOLD})\n\n')
            out.write(f'Total strategies: {len(features)}\n')
            out.write(f'Multi-member clusters: {len(multi)}(覆盖 {sum(len(c) for c in multi)} strategies)\n')
            out.write(f'Singletons: {len(singleton)}\n\n')
            out.write(f'Weights: cap={CAP_WEIGHT}, tool={TOOL_WEIGHT}, name={NAME_WEIGHT}, phase={PHASE_WEIGHT}\n\n')
            out.write('---\n\n# Multi-member clusters\n')
            for idx, members in enumerate(multi, 1):
                summarize_cluster(idx, members, features, db_caps, out)
            out.write('\n\n---\n\n# Singletons(独立 strategy)\n\n')
            out.write(f'{len(singleton)} strategies 没找到相似伙伴。\n\n')
            for c in singleton:
                s_ = features[c[0]]
                cov = f'{s_["coverage_score"]:.2f}' if s_['coverage_score'] is not None else '—'
                out.write(f'- [{s_["req_id"]}] caps={len(s_["caps"])}, phases={s_["phases"]}, cov={cov}: {s_["name"][:60]}\n')
        print(f'Written: {out_path}', flush=True)

        # Print summary
        print('\n== Multi-member clusters ==')
        for idx, members in enumerate(multi[:20], 1):
            req_ids = [features[i]['req_id'] for i in members]
            print(f' [{idx}] {len(members)} members: {", ".join(sorted(set(req_ids)))}')
        if len(multi) > 20:
            print(f' ... ({len(multi)-20} more)')
    finally:
        cur.close()
        s.close()


if __name__ == '__main__':
    main()