howard
/
Agent


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
							#!/usr/bin/env python3
"""
数据库迁移 v3：JSONB 软关联 → 关联表（junction tables）

兼容当前混合状态：
- tool_table 已被 RENAME 为 tool（之前的 DDL 部分回滚导致）
- atomic_capability、requirement_table 仍为旧名
- 4 张关联表已创建但为空

步骤：
1. 创建 4 张关联表（幂等，已存在则跳过）
2. 从现有表的 JSONB 字段迁移数据到关联表
3. 用 CREATE TABLE AS SELECT 创建 capability 和 requirement（变相重命名）
4. 删除 knowledge 表的 JSONB 关联字段
5. 删除旧表
"""

import os
import json
import psycopg2
from psycopg2.extras import RealDictCursor
from dotenv import load_dotenv

_script_dir = os.path.dirname(os.path.abspath(__file__))
_project_root = os.path.normpath(os.path.join(_script_dir, '..', '..', '..'))
load_dotenv(os.path.join(_project_root, '.env'))


def get_connection():
    conn = psycopg2.connect(
        host=os.getenv('KNOWHUB_DB'),
        port=int(os.getenv('KNOWHUB_PORT', 5432)),
        user=os.getenv('KNOWHUB_USER'),
        password=os.getenv('KNOWHUB_PASSWORD'),
        database=os.getenv('KNOWHUB_DB_NAME'),
        connect_timeout=10
    )
    conn.autocommit = True
    return conn


def table_exists(cursor, name):
    cursor.execute("SELECT 1 FROM information_schema.tables WHERE table_name = %s", (name,))
    return cursor.fetchone() is not None


def column_exists(cursor, table, column):
    cursor.execute(
        "SELECT 1 FROM information_schema.columns WHERE table_name = %s AND column_name = %s",
        (table, column))
    return cursor.fetchone() is not None


def resolve_table(cursor, new_name, old_name):
    """找到实际可用的表名（处理混合重命名状态）"""
    if table_exists(cursor, new_name):
        return new_name
    if table_exists(cursor, old_name):
        return old_name
    raise RuntimeError(f"Neither {new_name} nor {old_name} exists!")


# ─── Step 1: 创建关联表 ──────────────────────────────────────────────────────

CREATE_JUNCTION_TABLES = [
    """
    CREATE TABLE IF NOT EXISTS requirement_capability (
        requirement_id VARCHAR NOT NULL,
        capability_id  VARCHAR NOT NULL,
        PRIMARY KEY (requirement_id, capability_id)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS capability_tool (
        capability_id VARCHAR NOT NULL,
        tool_id       VARCHAR NOT NULL,
        description   TEXT DEFAULT '',
        PRIMARY KEY (capability_id, tool_id)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS capability_knowledge (
        capability_id VARCHAR NOT NULL,
        knowledge_id  VARCHAR NOT NULL,
        PRIMARY KEY (capability_id, knowledge_id)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS tool_knowledge (
        tool_id      VARCHAR NOT NULL,
        knowledge_id VARCHAR NOT NULL,
        PRIMARY KEY (tool_id, knowledge_id)
    )
    """,
]


# ─── Step 2: 数据迁移 ────────────────────────────────────────────────────────

def _parse_json(val):
    if val is None:
        return []
    if isinstance(val, (list, dict)):
        return val
    try:
        return json.loads(val)
    except (json.JSONDecodeError, TypeError):
        return []


def _insert_junction(cursor, table, col_a, col_b, val_a, val_b, extra_cols=None):
    if extra_cols:
        cols = f"{col_a}, {col_b}, {', '.join(extra_cols.keys())}"
        placeholders = ', '.join(['%s'] * (2 + len(extra_cols)))
        values = [val_a, val_b] + list(extra_cols.values())
    else:
        cols = f"{col_a}, {col_b}"
        placeholders = '%s, %s'
        values = [val_a, val_b]
    cursor.execute(
        f"INSERT INTO {table} ({cols}) VALUES ({placeholders}) ON CONFLICT DO NOTHING",
        values
    )


def migrate_data(cursor, tool_tbl, cap_tbl, req_tbl):
    """从 JSONB 字段迁移数据到关联表。使用实际表名。"""
    skipped = {'dangling': [], 'implements_unmatched': []}

    # 预加载有效 ID
    print("    加载 ID...", flush=True)
    cursor.execute(f"SELECT id FROM {req_tbl}")
    valid_reqs = {r['id'] for r in cursor.fetchall()}
    cursor.execute(f"SELECT id FROM {cap_tbl}")
    valid_caps = {r['id'] for r in cursor.fetchall()}
    cursor.execute(f"SELECT id, name FROM {tool_tbl}")
    tool_rows = cursor.fetchall()
    valid_tools = {r['id'] for r in tool_rows}
    cursor.execute("SELECT id FROM knowledge")
    valid_knowledge = {r['id'] for r in cursor.fetchall()}
    print(f"    OK: {len(valid_reqs)} reqs, {len(valid_caps)} caps, {len(valid_tools)} tools, {len(valid_knowledge)} knowledge", flush=True)

    tool_name_to_id = {}
    for r in tool_rows:
        if r['name']:
            tool_name_to_id[r['name'].lower().strip()] = r['id']

    # ── requirement_capability ──
    print("    requirement_capability...", flush=True)
    cursor.execute(f"SELECT id, atomics FROM {req_tbl}")
    for row in cursor.fetchall():
        for cap_id in _parse_json(row['atomics']):
            if cap_id in valid_caps:
                _insert_junction(cursor, 'requirement_capability',
                                 'requirement_id', 'capability_id', row['id'], cap_id)
            else:
                skipped['dangling'].append(f"req_cap: req={row['id']} → cap={cap_id}")

    cursor.execute(f"SELECT id, requirements FROM {cap_tbl}")
    for row in cursor.fetchall():
        for req_id in _parse_json(row['requirements']):
            if req_id in valid_reqs:
                _insert_junction(cursor, 'requirement_capability',
                                 'requirement_id', 'capability_id', req_id, row['id'])
            else:
                skipped['dangling'].append(f"req_cap: req={req_id} → cap={row['id']}")

    # ── capability_tool ──
    print("    capability_tool...", flush=True)
    cursor.execute(f"SELECT id, tools, implements FROM {cap_tbl}")
    for row in cursor.fetchall():
        tools_list = _parse_json(row['tools'])
        implements_dict = _parse_json(row['implements'])
        if not isinstance(implements_dict, dict):
            implements_dict = {}

        impl_by_tool_id = {}
        for tool_name, desc in implements_dict.items():
            key = tool_name.lower().strip()
            matched_id = tool_name_to_id.get(key)
            if not matched_id:
                for stored_name, stored_id in tool_name_to_id.items():
                    if key in stored_name or stored_name in key:
                        matched_id = stored_id
                        break
            if matched_id:
                impl_by_tool_id[matched_id] = desc
            else:
                skipped['implements_unmatched'].append(
                    f"cap={row['id']}: {tool_name} = {desc[:80]}")

        for tool_id in tools_list:
            if tool_id in valid_tools:
                desc = impl_by_tool_id.pop(tool_id, '')
                _insert_junction(cursor, 'capability_tool',
                                 'capability_id', 'tool_id', row['id'], tool_id,
                                 extra_cols={'description': desc})
            else:
                skipped['dangling'].append(f"cap_tool: cap={row['id']} → tool={tool_id}")

        for tool_id, desc in impl_by_tool_id.items():
            if tool_id in valid_tools:
                _insert_junction(cursor, 'capability_tool',
                                 'capability_id', 'tool_id', row['id'], tool_id,
                                 extra_cols={'description': desc})

    # 反向：tool.capabilities（如果列还在）
    if column_exists(cursor, tool_tbl, 'capabilities'):
        cursor.execute(f"SELECT id, capabilities FROM {tool_tbl}")
        for row in cursor.fetchall():
            for cap_id in _parse_json(row['capabilities']):
                if cap_id in valid_caps:
                    _insert_junction(cursor, 'capability_tool',
                                     'capability_id', 'tool_id', cap_id, row['id'])
                else:
                    skipped['dangling'].append(f"cap_tool: cap={cap_id} → tool={row['id']}")
    else:
        print("      (tool.capabilities 列已丢失，仅从 capability 侧迁移)", flush=True)

    # ── capability_knowledge ──
    print("    capability_knowledge...", flush=True)
    cursor.execute(f"SELECT id, source_knowledge FROM {cap_tbl}")
    for row in cursor.fetchall():
        for kid in _parse_json(row['source_knowledge']):
            if kid in valid_knowledge:
                _insert_junction(cursor, 'capability_knowledge',
                                 'capability_id', 'knowledge_id', row['id'], kid)
            else:
                skipped['dangling'].append(f"cap_know: cap={row['id']} → k={kid}")

    cursor.execute("SELECT id, support_capability FROM knowledge")
    for row in cursor.fetchall():
        for cap_id in _parse_json(row['support_capability']):
            if cap_id in valid_caps:
                _insert_junction(cursor, 'capability_knowledge',
                                 'capability_id', 'knowledge_id', cap_id, row['id'])
            else:
                skipped['dangling'].append(f"cap_know: cap={cap_id} → k={row['id']}")

    # ── tool_knowledge ──
    print("    tool_knowledge...", flush=True)
    # 正向：tool.*_knowledge（如果列还在）
    if column_exists(cursor, tool_tbl, 'tool_knowledge'):
        cursor.execute(f"SELECT id, tool_knowledge, case_knowledge, process_knowledge FROM {tool_tbl}")
        for row in cursor.fetchall():
            all_kids = set()
            for field in ('tool_knowledge', 'case_knowledge', 'process_knowledge'):
                all_kids.update(_parse_json(row[field]))
            for kid in all_kids:
                if kid in valid_knowledge:
                    _insert_junction(cursor, 'tool_knowledge',
                                     'tool_id', 'knowledge_id', row['id'], kid)
                else:
                    skipped['dangling'].append(f"tool_know: tool={row['id']} → k={kid}")
    else:
        print("      (tool.*_knowledge 列已丢失，仅从 knowledge 侧迁移)", flush=True)

    # 反向：knowledge.tools
    cursor.execute("SELECT id, tools FROM knowledge")
    for row in cursor.fetchall():
        for tool_id in _parse_json(row['tools']):
            if tool_id in valid_tools:
                _insert_junction(cursor, 'tool_knowledge',
                                 'tool_id', 'knowledge_id', tool_id, row['id'])
            else:
                skipped['dangling'].append(f"tool_know: tool={tool_id} → k={row['id']}")

    return skipped


# ─── 主流程 ───────────────────────────────────────────────────────────────────

def main():
    print("=" * 60)
    print("KnowHub 数据库迁移 v3: JSONB 软关联 → 关联表")
    print("=" * 60)

    conn = get_connection()
    cursor = conn.cursor(cursor_factory=RealDictCursor)

    # 探测实际表名（处理混合重命名状态）
    print("\n[0] 探测表名...")
    tool_tbl = resolve_table(cursor, 'tool', 'tool_table')
    cap_tbl = resolve_table(cursor, 'capability', 'atomic_capability')
    req_tbl = resolve_table(cursor, 'requirement', 'requirement_table')
    print(f"  tool: {tool_tbl}")
    print(f"  capability: {cap_tbl}")
    print(f"  requirement: {req_tbl}")

    # Step 1: 创建关联表
    print("\n[1/5] 创建关联表...")
    for sql in CREATE_JUNCTION_TABLES:
        cursor.execute(sql)
    print("  OK")

    # Step 2: 迁移 JSONB 数据
    print("\n[2/5] 迁移 JSONB 数据到关联表...")
    skipped = migrate_data(cursor, tool_tbl, cap_tbl, req_tbl)

    if skipped['dangling']:
        print(f"\n  [WARN] 跳过悬空引用 {len(skipped['dangling'])} 条:")
        for s in skipped['dangling'][:30]:
            print(f"    - {s}")
        if len(skipped['dangling']) > 30:
            print(f"    ... 还有 {len(skipped['dangling']) - 30} 条")
    if skipped['implements_unmatched']:
        print(f"\n  [WARN] implements 未匹配 {len(skipped['implements_unmatched'])} 条:")
        for s in skipped['implements_unmatched']:
            print(f"    - {s}")

    print("\n  关联表行数:")
    for t in ('requirement_capability', 'capability_tool', 'capability_knowledge', 'tool_knowledge'):
        cursor.execute(f"SELECT COUNT(*) as count FROM {t}")
        print(f"    {t}: {cursor.fetchone()['count']}")

    # Step 3: 创建新表（对需要重命名的表用 CREATE TABLE AS SELECT）
    print("\n[3/5] 创建新表...")

    # tool 已经是新名了，只需删除 JSONB 关联列
    if tool_tbl == 'tool':
        print("  tool: 已是新名，删除 JSONB 列...")
        for col in ('capabilities', 'tool_knowledge', 'case_knowledge', 'process_knowledge'):
            if column_exists(cursor, 'tool', col):
                cursor.execute(f"ALTER TABLE tool DROP COLUMN {col}")
                print(f"    DROP tool.{col}")
    else:
        # tool_table → tool via copy
        if not table_exists(cursor, 'tool'):
            cursor.execute(f"""
                CREATE TABLE tool AS SELECT
                    id, name, version, introduction, tutorial, input, output,
                    updated_time, status, embedding, implemented_tool_ids
                FROM tool_table
            """)
            cursor.execute("ALTER TABLE tool ADD PRIMARY KEY (id)")
            cursor.execute("SELECT COUNT(*) as count FROM tool")
            print(f"  tool_table → tool: {cursor.fetchone()['count']} rows")
        else:
            print("  tool: 已存在，跳过")

    # atomic_capability → capability
    if cap_tbl == 'capability':
        print("  capability: 已是新名，删除 JSONB 列...")
        for col in ('requirements', 'implements', 'tools', 'source_knowledge'):
            if column_exists(cursor, 'capability', col):
                cursor.execute(f"ALTER TABLE capability DROP COLUMN {col}")
                print(f"    DROP capability.{col}")
    else:
        if not table_exists(cursor, 'capability'):
            cursor.execute(f"""
                CREATE TABLE capability AS SELECT
                    id, name, criterion, description, embedding
                FROM atomic_capability
            """)
            cursor.execute("ALTER TABLE capability ADD PRIMARY KEY (id)")
            cursor.execute("SELECT COUNT(*) as count FROM capability")
            print(f"  atomic_capability → capability: {cursor.fetchone()['count']} rows")
        else:
            print("  capability: 已存在，跳过")

    # requirement_table → requirement
    if req_tbl == 'requirement':
        print("  requirement: 已是新名，删除 JSONB 列...")
        for col in ('atomics',):
            if column_exists(cursor, 'requirement', col):
                cursor.execute(f"ALTER TABLE requirement DROP COLUMN {col}")
                print(f"    DROP requirement.{col}")
    else:
        if not table_exists(cursor, 'requirement'):
            cursor.execute(f"""
                CREATE TABLE requirement AS SELECT
                    id, description, source_nodes, status, match_result, embedding
                FROM requirement_table
            """)
            cursor.execute("ALTER TABLE requirement ADD PRIMARY KEY (id)")
            cursor.execute("SELECT COUNT(*) as count FROM requirement")
            print(f"  requirement_table → requirement: {cursor.fetchone()['count']} rows")
        else:
            print("  requirement: 已存在，跳过")

    # Step 4: 删除 knowledge 的 JSONB 关联字段
    print("\n[4/5] 删除 knowledge 表的 JSONB 关联字段...")
    for col in ('support_capability', 'tools'):
        if column_exists(cursor, 'knowledge', col):
            cursor.execute(f"ALTER TABLE knowledge DROP COLUMN {col}")
            print(f"  DROP knowledge.{col}")
        else:
            print(f"  knowledge.{col} 已不存在")

    # Step 5: 删除旧表
    print("\n[5/5] 删除旧表...")
    for old_name, new_name in [('tool_table', 'tool'), ('atomic_capability', 'capability'), ('requirement_table', 'requirement')]:
        if old_name == new_name:
            continue
        if table_exists(cursor, old_name) and table_exists(cursor, new_name):
            cursor.execute(f"DROP TABLE {old_name}")
            print(f"  DROP {old_name}")
        elif not table_exists(cursor, old_name):
            print(f"  {old_name} 已不存在")
        else:
            print(f"  [!] {new_name} 不存在，保留 {old_name}")

    # 最终验证
    print("\n" + "=" * 60)
    print("最终表结构：")
    print("=" * 60)
    for t in ['knowledge', 'tool', 'capability', 'requirement', 'resources',
              'requirement_capability', 'capability_tool', 'capability_knowledge', 'tool_knowledge']:
        try:
            cursor.execute(f"""
                SELECT column_name FROM information_schema.columns
                WHERE table_name = %s ORDER BY ordinal_position
            """, (t,))
            cols = [r['column_name'] for r in cursor.fetchall()]
            cursor.execute(f"SELECT COUNT(*) as count FROM {t}")
            count = cursor.fetchone()['count']
            print(f"\n  {t} ({count} rows)")
            print(f"    {', '.join(cols)}")
        except Exception as e:
            print(f"\n  {t}: ERROR - {e}")

    print("\n" + "=" * 60)
    print("迁移成功！")
    print("=" * 60)

    cursor.close()
    conn.close()


if __name__ == '__main__':
    main()