howard
/
Agent


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
							#!/usr/bin/env python3
"""
用 Chinese 2-gram fuzzy 匹配补齐最后 12 条 0-cap knowledge 的 knowledge_capability。
这些 knowledge 的源 strategy 用了 LLM 自造的 cap ID（如 aigc_color_unification），
canonical alias 解析失败，改用名字 2-gram Jaccard + 阈值 0.3 做 best-effort。

打印 match 决策供审核，然后 INSERT。
"""
import json
import re
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from knowhub.knowhub_db.pg_capability_store import PostgreSQLCapabilityStore

OUTPUT_DIR = Path('/Users/sunlit/Downloads/output 2')
RERUN_DIR = Path('/Users/sunlit/Downloads/5')
RERUN_FOLDERS = {'032', '046', '069', '085', '097'}
THRESHOLD = 0.3


def ch_bigrams(s):
    s = re.sub(r'\s+', '', s or '')
    return set(s[i:i+2] for i in range(len(s)-1))


def name_sim(a, b):
    ba, bb = ch_bigrams(a), ch_bigrams(b)
    if not ba or not bb: return 0
    return len(ba & bb) / len(ba | bb)


def main():
    s = PostgreSQLCapabilityStore()
    cur = s._get_cursor()
    try:
        cur.execute('SELECT id, name FROM capability')
        db_caps = [(r['id'], r['name']) for r in cur.fetchall()]
        print(f'db_caps: {len(db_caps)}', flush=True)

        # 找 0-cap 的 howard_strategy_instance knowledge
        cur.execute("""SELECT k.id, k.tags FROM knowledge k
                       WHERE k.version='howard_strategy_instance'
                         AND NOT EXISTS (SELECT 1 FROM knowledge_capability kc WHERE kc.knowledge_id=k.id)""")
        zero_cap = cur.fetchall()
        print(f'0-cap knowledge: {len(zero_cap)}', flush=True)

        # 找 req_text
        cur.execute('SELECT id, description FROM requirement')
        req_desc = {r['id']: r['description'] for r in cur.fetchall()}

        # 收集 req_id → folder
        req_to_folder = {}
        for folder in sorted(OUTPUT_DIR.iterdir()) + sorted(RERUN_DIR.iterdir()):
            if not folder.is_dir(): continue
            if folder.name in RERUN_FOLDERS and folder.parent == OUTPUT_DIR:
                continue  # 跳过 output 2/ 的 rerun 坏版本
            strat_path = folder / 'strategy.json'
            if not strat_path.exists(): continue
            try: sd = json.loads(strat_path.read_text(encoding='utf-8'))
            except: continue
            if not isinstance(sd, dict): continue
            rt = sd.get('requirement')
            for rid, d in req_desc.items():
                if d == rt:
                    req_to_folder[rid] = folder
                    break

        total_ins = 0
        for k in zero_cap:
            tags = k['tags'] if isinstance(k['tags'], dict) else json.loads(k['tags'] or '{}')
            orig_name = tags.get('original_strategy_name','')
            # req_id
            cur.execute('SELECT requirement_id FROM requirement_knowledge WHERE knowledge_id=%s', (k['id'],))
            r = cur.fetchone()
            if not r: continue
            req_id = r['requirement_id']
            folder = req_to_folder.get(req_id)
            if not folder:
                print(f'  [{req_id}] no folder', flush=True); continue
            # 读源
            try:
                sd = json.loads((folder/'strategy.json').read_text(encoding='utf-8'))
            except:
                continue
            target_strat = None
            for s_data in sd.get('strategies', []):
                if isinstance(s_data, dict) and s_data.get('name','') == orig_name:
                    target_strat = s_data; break
            if not target_strat:
                print(f'  [{req_id}] no match for {orig_name!r}', flush=True); continue

            # 提取 workflow_outline caps 的名字（因 id 无效），逐个 fuzzy 匹配到 canonical
            wo = target_strat.get('workflow_outline', [])
            print(f'\n[{k["id"]}] {req_id} / {orig_name[:50]}', flush=True)
            matched_caps = set()
            for ph in wo:
                if not isinstance(ph, dict): continue
                for c in ph.get('capabilities', []):
                    if not isinstance(c, dict): continue
                    src_name = c.get('name','')
                    if not src_name: continue
                    # fuzzy match
                    best_id = None; best_name = None; best_sim = 0
                    for cid, cname in db_caps:
                        sim = name_sim(src_name, cname)
                        if sim > best_sim:
                            best_sim = sim; best_id = cid; best_name = cname
                    if best_sim >= THRESHOLD:
                        print(f'    "{src_name}" → {best_id} "{best_name}" (sim={best_sim:.2f})', flush=True)
                        matched_caps.add(best_id)
                    else:
                        print(f'    "{src_name}" → NO MATCH (best={best_name} sim={best_sim:.2f})', flush=True)

            # constraint check: 必须 ∈ req_cap
            for cid in matched_caps:
                cur.execute('SELECT 1 FROM requirement_capability WHERE requirement_id=%s AND capability_id=%s',
                            (req_id, cid))
                if cur.fetchone():
                    cur.execute("""INSERT INTO knowledge_capability (knowledge_id, capability_id)
                                   VALUES (%s, %s) ON CONFLICT DO NOTHING""", (k['id'], cid))
                    total_ins += cur.rowcount or 0
                else:
                    print(f'    skip {cid}: not in req_cap for {req_id}', flush=True)

        print(f'\nTotal inserted: {total_ins}', flush=True)
        # Re-check
        cur.execute("""SELECT COUNT(*) c FROM knowledge
                       WHERE version='howard_strategy_instance'
                         AND NOT EXISTS (SELECT 1 FROM knowledge_capability kc WHERE kc.knowledge_id=knowledge.id)""")
        print(f'still 0-cap: {cur.fetchone()["c"]}', flush=True)
        cur.execute('SELECT COUNT(*) c FROM knowledge_capability')
        print(f'knowledge_capability total: {cur.fetchone()["c"]}', flush=True)
    finally:
        cur.close(); s.close()


if __name__ == '__main__':
    main()