howard
/
Agent


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
01702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949
							# -*- coding: utf-8 -*-
"""mode_workflow · MySQL 持久化(DB 为唯一事实源)
================================================================================
读 .env 的 MYSQL_* 连接 MySQL。四张表:
  search_process —— 每行一个 (query, 帖子):工序方向的搜索 + llm 评估结果
  search_tools   —— 同结构,工具方向的搜索结果(方向由表区分,不再用 mode_type 列)
  mode_process   —— 每行一个解构出的工序(steps 等嵌套结构存 JSON 列)
  mode_tools     —— 每行一个解构出的工具

与旧 fixed_query_eval/db.py 的关键差异:本系统 DB 是主存储,写入失败直接 raise,
不做"失败不阻断"。读侧保留防御(返回空/None)。

用法:
  python db.py init    # 建表(幂等)
  python db.py check   # 打印四表行数
  python db.py clear   # 清空四表数据(TRUNCATE)
"""
import json
import os
import sys
from datetime import datetime
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(PROJECT_ROOT))

from dotenv import load_dotenv
load_dotenv()

import pymysql
from pymysql.cursors import DictCursor
from dbutils.pooled_db import PooledDB

# ── 连接池 ──────────────────────────────────────────────────────────────────
# MySQL 是远程 RDS,每次 pymysql.connect() 的 TCP+鉴权握手 ~0.5s。旧实现每个
# 请求新建一条连接,一次"点开帖子"要 2~3 个请求 = 2~3 次握手 ≈ 1s。改用连接池
# 复用长连接后,握手只在池初始化时各发生一次,后续取连接近乎零开销。
# server.py 是 ThreadingHTTPServer(每请求一线程),PooledDB 线程安全,正好匹配。
# 注意:fetch_* 里的 conn.close() 在池连接上语义是"归还池中"而非真正断开。
_POOL = None


def _pool():
    global _POOL
    if _POOL is None:
        if not os.getenv("MYSQL_HOST"):
            raise RuntimeError("缺 MYSQL_HOST:检查 .env 的 MYSQL_* 配置")
        _POOL = PooledDB(
            creator=pymysql,
            mincached=2,          # 启动即预热 2 条,首点不再吃冷握手
            maxcached=5,          # 空闲保留上限
            maxconnections=20,    # 并发上限(ThreadingHTTPServer 线程数)
            blocking=True,        # 连接耗尽时等待而非报错
            ping=1,               # 取用前 ping,自动剔除被 RDS 掐断的死连接
            host=os.getenv("MYSQL_HOST"),
            port=int(os.getenv("MYSQL_PORT", 3306)),
            user=os.getenv("MYSQL_USER"),
            password=os.getenv("MYSQL_PASSWORD"),
            database=os.getenv("MYSQL_DATABASE"),
            charset="utf8mb4", cursorclass=DictCursor,
            autocommit=True, connect_timeout=10,
        )
    return _POOL


def _conn():
    """Check one connection out of the shared pool.

    It is used like a plain connection (``with conn.cursor()``); calling
    ``close()`` on it hands it back to the pool.
    """
    pool = _pool()
    return pool.connection()


# ── DDL ──────────────────────────────────────────────────────────────────────

# Direction ("process" / "tools") -> name of that direction's search-result table.
SEARCH_TABLES = {"process": "search_process", "tools": "search_tools"}
# Direction ("process" / "tools") -> name of that direction's deconstruction-result table.
MODE_TABLES = {"process": "mode_process", "tools": "mode_tools"}


def _search_table(mode_or_table):
    """Resolve a direction ("process"/"tools") or table name to a search table.

    Acts as a whitelist so the result is safe to interpolate into SQL.

    Raises:
        ValueError: if the argument is neither a known direction nor a known
            search table name.
    """
    if mode_or_table in SEARCH_TABLES:
        resolved = SEARCH_TABLES[mode_or_table]
    else:
        resolved = mode_or_table
    if resolved in SEARCH_TABLES.values():
        return resolved
    raise ValueError(f"未知搜索表/模式: {mode_or_table!r}")


def _mode_table(mode_or_table):
    """Resolve a direction ("process"/"tools") or table name to a deconstruction table.

    Acts as a whitelist so the result is safe to interpolate into SQL.

    Raises:
        ValueError: if the argument is neither a known direction nor a known
            deconstruction table name.
    """
    if mode_or_table in MODE_TABLES:
        resolved = MODE_TABLES[mode_or_table]
    else:
        resolved = mode_or_table
    if resolved in MODE_TABLES.values():
        return resolved
    raise ValueError(f"未知解构表/模式: {mode_or_table!r}")


def _ddl_search(table, direction):
    """Build the CREATE TABLE IF NOT EXISTS statement for a search-result table.

    Both search tables share this schema: one row per (query_id, case_id),
    enforced by the unique key ``uk_qid_case``.

    Args:
        table: table name, interpolated into the statement as-is (not
            validated or escaped here).
        direction: label embedded in the table COMMENT (not escaped here).

    Returns:
        The DDL statement as a string.
    """
    return f"""
CREATE TABLE IF NOT EXISTS {table} (
  id            BIGINT AUTO_INCREMENT PRIMARY KEY,
  query_id      VARCHAR(32)   NOT NULL COMMENT 'q0000',
  query_text    VARCHAR(512)  NULL,
  case_id       VARCHAR(128)  NOT NULL COMMENT 'platform_channelContentId',
  platform      VARCHAR(32)   NULL,
  channel_content_id VARCHAR(128) NULL,
  title         VARCHAR(512)  NULL,
  url           VARCHAR(1024) NULL,
  content_type  VARCHAR(32)   NULL,
  body          LONGTEXT      NULL,
  images        JSON          NULL,
  videos        JSON          NULL,
  like_count    INT           NULL,
  publish_time  VARCHAR(64)   NULL,
  quality_score FLOAT         NULL COMMENT 'post._quality_score',
  quality_grade VARCHAR(8)    NULL,
  found_by      JSON          NULL COMMENT '命中的措辞数组',
  knowledge_type JSON         NULL COMMENT '["能力","工序","工具"] 子集',
  overall_score FLOAT         NULL COMMENT '(相关均值+质量均值)/2',
  llm_evaluation JSON         NULL COMMENT '评估全量 blob',
  created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  updated_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  UNIQUE KEY uk_qid_case (query_id, case_id),
  KEY idx_platform (platform)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='搜索+评估结果({direction})';
"""

# DDL for mode_process: one row per deconstructed procedure; nested structures
# (source, declarations, type_registry, steps, tools_used) live in JSON columns.
# There is no unique key, so several versions can coexist for the same case_id.
DDL_PROCESS = """
CREATE TABLE IF NOT EXISTS mode_process (
  id            BIGINT AUTO_INCREMENT PRIMARY KEY,
  query_id      VARCHAR(32)   NOT NULL,
  case_id       VARCHAR(128)  NOT NULL,
  platform      VARCHAR(32)   NULL,
  post_title    VARCHAR(512)  NULL,
  source        JSON          NULL COMMENT '解构返回的 source 块',
  procedure_id  VARCHAR(16)   NULL COMMENT 'p1,p2…',
  name          VARCHAR(255)  NULL,
  purpose       TEXT          NULL,
  category      VARCHAR(32)   NULL COMMENT '产物创造/资产建设/自动化/分析/学习',
  declarations  JSON          NULL,
  type_registry JSON          NULL,
  steps         JSON          NULL COMMENT '步骤数组全量',
  step_count    INT           NULL,
  tools_used    JSON          NULL COMMENT '从 steps[].via 去重提取',
  model         VARCHAR(64)   NULL,
  version       VARCHAR(32)   NULL COMMENT 'v_MMDDHHMM,保留历史;link_* 为跨 query 复制(cost=0)',
  cost_usd      DECIMAL(10,6) NULL COMMENT '本次解构调用成本(同版本各行相同,聚合需按 case+version 去重)',
  duration_s    FLOAT         NULL,
  created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  KEY idx_case_ver (case_id, version),
  KEY idx_qid (query_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工序解构结果(每行一个工序)';
"""

# DDL for mode_tools: one row per deconstructed tool; list/object fields
# (scopes, usage, cases, defects) live in JSON columns. As with mode_process,
# there is no unique key, so several versions can coexist for the same case_id.
DDL_TOOLS = """
CREATE TABLE IF NOT EXISTS mode_tools (
  id            BIGINT AUTO_INCREMENT PRIMARY KEY,
  query_id      VARCHAR(32)   NOT NULL,
  case_id       VARCHAR(128)  NOT NULL,
  platform      VARCHAR(32)   NULL,
  post_title    VARCHAR(512)  NULL,
  tool_name     VARCHAR(255)  NULL,
  substance_scope JSON        NULL COMMENT '实质作用域(数组)',
  form_scope    JSON          NULL COMMENT '形式作用域(数组或null)',
  creation_layer VARCHAR(32)  NULL COMMENT '制作层/创作层',
  source_link   VARCHAR(1024) NULL,
  input_desc    TEXT          NULL,
  output_desc   TEXT          NULL,
  usage_json    JSON          NULL,
  cases_json    JSON          NULL,
  defects_json  JSON          NULL,
  updated_time  VARCHAR(64)   NULL COMMENT '工具最新更新时间',
  model         VARCHAR(64)   NULL,
  version       VARCHAR(32)   NULL COMMENT 'v_MMDDHHMM;link_* 为跨 query 复制(cost=0)',
  cost_usd      DECIMAL(10,6) NULL COMMENT '同 mode_process,聚合按 case+version 去重',
  duration_s    FLOAT         NULL,
  created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  KEY idx_case_ver (case_id, version),
  KEY idx_qid (query_id),
  KEY idx_tool_name (tool_name)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工具解构结果(每行一个工具)';
"""


# Ledger of procedure knowledge already imported into the knowledge base; it
# prevents duplicate uploads (used by import_process_knowledge.py).
# One knowledge item = one procedure of one case (proc_index is 1-based). The
# mode_process version at import time is recorded: a changed version (the post
# was re-deconstructed) means the content changed and should be re-imported;
# an unchanged version counts as "already uploaded" and is skipped.
# The ledger lives in the DB rather than a local file so that switching
# machines or connections still does not write the knowledge base twice.
DDL_INGEST_LOG = """
CREATE TABLE IF NOT EXISTS knowledge_ingest_log (
  id            BIGINT AUTO_INCREMENT PRIMARY KEY,
  case_id       VARCHAR(128)  NOT NULL,
  proc_index    INT           NOT NULL COMMENT '工序序号(1-based),对齐导入脚本枚举',
  version       VARCHAR(32)   NULL COMMENT '导入时 mode_process 版本;变了应重导',
  knowledge_id  VARCHAR(128)  NULL COMMENT '接口返回的 knowledge_id',
  api_url       VARCHAR(255)  NULL,
  ingested_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  updated_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  UNIQUE KEY uk_case_proc (case_id, proc_index)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工序知识已导入台账(防重复上传)';
"""


def init_tables():
    """Create all tables if they are missing, then apply the legacy migration.

    Creates search_process, search_tools, mode_process, mode_tools and
    knowledge_ingest_log (all ``CREATE TABLE IF NOT EXISTS``), so the call is
    safe to repeat. Prints a confirmation line on success; database errors
    propagate to the caller.
    """
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(_ddl_search("search_process", "工序方向"))
            cur.execute(_ddl_search("search_tools", "工具方向"))
            cur.execute(DDL_PROCESS)
            cur.execute(DDL_TOOLS)
            cur.execute(DDL_INGEST_LOG)
            # Legacy migration: widen `version` from VARCHAR(16) to VARCHAR(32)
            # so it can hold the link_* copy versions.
            # MODIFY COLUMN replaces the whole column definition, so running it
            # unconditionally would drop the column COMMENT set by the DDL above.
            # Only migrate tables whose column is still narrower than 32.
            for t in ("mode_process", "mode_tools"):
                cur.execute(
                    "SELECT CHARACTER_MAXIMUM_LENGTH AS max_len "
                    "FROM information_schema.COLUMNS "
                    "WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = %s "
                    "AND COLUMN_NAME = 'version'", (t,))
                col = cur.fetchone()
                if col and col["max_len"] is not None and col["max_len"] < 32:
                    cur.execute(f"ALTER TABLE {t} MODIFY COLUMN version VARCHAR(32) NULL")
        print("✅ 建表完成:search_process, search_tools, mode_process, mode_tools, knowledge_ingest_log")
    finally:
        conn.close()


def clear_tables():
    """TRUNCATE the two search tables and the two deconstruction tables.

    Table structure is kept; one line is printed per table that was cleared.
    """
    targets = ("search_process", "search_tools", "mode_process", "mode_tools")
    conn = _conn()
    try:
        with conn.cursor() as cur:
            for name in targets:
                cur.execute(f"TRUNCATE TABLE {name}")
                print(f"🧹 已清空 {name}")
    finally:
        conn.close()


# ── 工具函数 ──────────────────────────────────────────────────────────────────

def _loads(v, default=None):
    """pymysql 的 JSON 列可能返回字符串,统一解析。"""
    if v is None:
        return default
    if isinstance(v, (list, dict)):
        return v
    try:
        return json.loads(v)
    except Exception:
        return default


def _j(v):
    """写入 JSON 列:None 保持 NULL,其余 dumps。"""
    return None if v is None else json.dumps(v, ensure_ascii=False)


def _collect_scores(node):
    """递归收集嵌套评估里所有「得分」。LLM 直出的得分多为字符串("1"/"4"),
    个别为数字(如 时效性 10),统一按 float 解析;非数值(如 "N/A")跳过不计入。"""
    out = []
    if isinstance(node, dict):
        for k, v in node.items():
            if k == "得分":
                try:
                    out.append(float(v))
                except (TypeError, ValueError):
                    pass
            else:
                out.extend(_collect_scores(v))
    elif isinstance(node, list):
        for v in node:
            out.extend(_collect_scores(v))
    return out


def overall_score(e):
    """Overall score: mean of the per-section means, rounded to 2 decimals.

    The sections are "相关性" (relevance) and "质量" (quality); a section
    with no scores is left out of the average. Returns ``None`` when neither
    section yields any score.
    """
    blob = e or {}
    section_means = []
    for section in ("相关性", "质量"):
        found = _collect_scores(blob.get(section))
        if not found:
            continue
        section_means.append(sum(found) / len(found))
    if not section_means:
        return None
    return round(sum(section_means) / len(section_means), 2)


def _recency_hard(date_str):
    """硬时效(同 mode_procedure/server.py:_recency_hard):半年内=3 / 两年内=2 / 更早=1。
    publish_time 头 10 字符按 YYYY-MM-DD 解析,失败返回 None(不参与判定)。"""
    try:
        d = datetime.strptime(str(date_str or "")[:10], "%Y-%m-%d")
    except (ValueError, TypeError):
        return None
    days = (datetime.now() - d).days
    if days <= 180:
        return 3
    if days <= 730:
        return 2
    return 1


def _fixed_dim_score(evaluation, name):
    """取 质量.固定维度.<name>.得分 标量,缺失/非数值返回 None(不参与判定)。"""
    v = (((evaluation or {}).get("质量") or {}).get("固定维度") or {}).get(name)
    if isinstance(v, dict):
        v = v.get("得分")
    try:
        return float(v) if v is not None else None
    except (TypeError, ValueError):
        return None


def _impl_score(evaluation):
    """取 质量.动态维度.工序.字段完整性.实现完整性.得分 标量,缺失/非数值返回 None。
    新版 prompt 把旧「可复现性」的硬封顶规则并入了「实现完整性」,故采纳门槛改读此处。"""
    v = ((((((evaluation or {}).get("质量") or {}).get("动态维度") or {})
           .get("工序") or {}).get("字段完整性") or {}).get("实现完整性"))
    if isinstance(v, dict):
        v = v.get("得分")
    try:
        return float(v) if v is not None else None
    except (TypeError, ValueError):
        return None


def _repro_score(evaluation):
    """Score used by the reproducibility gate, across both blob schemas.

    Prefers the fixed dimension "可复现性"; when that is absent, falls back
    to the dynamic dimension "实现完整性".
    """
    legacy = _fixed_dim_score(evaluation, "可复现性")
    if legacy is None:
        return _impl_score(evaluation)
    return legacy


def is_adopted(overall, evaluation, publish_time):
    """Decide whether a post counts as adopted (a hit).

    Rejected when any of these holds: relevance ("和内容制作知识相关") below 4,
    reproducibility/implementation score below 4, recency tier below 2, or
    overall score below 6. A gate whose metric is missing is skipped.

    Fails closed: an evaluation that is not a dict, is empty, or carries
    "_error", as well as an ``overall`` of ``None``, is rejected outright.
    """
    usable = (isinstance(evaluation, dict) and bool(evaluation)
              and not evaluation.get("_error"))
    if not usable or overall is None:
        return False

    # Relevance gate: the score is either a bare value or wrapped as {"得分": x}.
    raw_rel = (evaluation.get("相关性") or {}).get("和内容制作知识相关")
    if isinstance(raw_rel, dict):
        raw_rel = raw_rel.get("得分")
    try:
        rel = None if raw_rel is None else float(raw_rel)
    except (TypeError, ValueError):
        rel = None
    if rel is not None and rel < 4:
        return False

    repro = _repro_score(evaluation)
    if repro is not None and repro < 4:
        return False

    tier = _recency_hard(publish_time)
    if tier is not None and tier < 2:
        return False

    return not float(overall) < 6


def is_adopted_rel(overall, rel, publish_time, repro=None):
    """Variant of the adoption check that takes pre-extracted scalar scores.

    ``rel`` and ``repro`` are passed in directly, so the full evaluation blob
    is not needed. Gates: ``rel`` below 4, ``repro`` below 4, recency tier
    below 2, or ``overall`` below 6 reject; a missing or non-numeric
    ``rel``/``repro`` skips that gate. ``overall`` of ``None`` is rejected.
    """
    if overall is None:
        return False

    def as_float(value):
        # None and non-numeric values both mean "metric unavailable".
        if value is None:
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    rel = as_float(rel)
    if rel is not None and rel < 4:
        return False
    repro = as_float(repro)
    if repro is not None and repro < 4:
        return False
    tier = _recency_hard(publish_time)
    if tier is not None and tier < 2:
        return False
    return not float(overall) < 6


# ── search_process / search_tools ────────────────────────────────────────────

def upsert_search_posts(query_id, query_text, results, table="search_process"):
    """Upsert one query's search results into a search table.

    Rows are keyed on (query_id, case_id); an existing row has every other
    column overwritten.

    Args:
        query_id: id of the query the results belong to.
        query_text: text of that query.
        results: iterable of result dicts (each may carry "post" and
            "llm_evaluation" sub-dicts).
        table: "search_process" / "search_tools", or the direction name.

    Returns:
        Number of rows submitted (0 when ``results`` is empty).
    """
    table = _search_table(table)
    if not results:
        return 0

    def to_row(item):
        # Flatten one search result into the column order of the INSERT below.
        post = item.get("post") or {}
        evaluation = item.get("llm_evaluation") or {}
        title = (post.get("title") or post.get("desc") or "")[:500]
        body = post.get("body_text") or post.get("desc") or ""
        published = str(post.get("publish_time") or post.get("publish_timestamp") or "")[:64]
        return (
            query_id, query_text, item.get("case_id"), item.get("platform"),
            item.get("channel_content_id"),
            title,
            item.get("source_url"), post.get("content_type"),
            body,
            _j(post.get("images") or []), _j(post.get("videos") or []),
            post.get("like_count"),
            published,
            post.get("_quality_score"), post.get("_quality_grade"),
            _j(item.get("found_by_queries") or []),
            _j(evaluation.get("知识类型") or []),
            overall_score(evaluation),
            _j(evaluation),
        )

    rows = [to_row(item) for item in results]
    sql = f"""
    INSERT INTO {table}
      (query_id, query_text, case_id, platform, channel_content_id, title, url,
       content_type, body, images, videos, like_count, publish_time,
       quality_score, quality_grade, found_by, knowledge_type,
       overall_score, llm_evaluation)
    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    ON DUPLICATE KEY UPDATE
      query_text=VALUES(query_text), platform=VALUES(platform),
      channel_content_id=VALUES(channel_content_id), title=VALUES(title), url=VALUES(url),
      content_type=VALUES(content_type), body=VALUES(body), images=VALUES(images),
      videos=VALUES(videos), like_count=VALUES(like_count), publish_time=VALUES(publish_time),
      quality_score=VALUES(quality_score), quality_grade=VALUES(quality_grade),
      found_by=VALUES(found_by), knowledge_type=VALUES(knowledge_type),
      overall_score=VALUES(overall_score), llm_evaluation=VALUES(llm_evaluation);
    """
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.executemany(sql, rows)
        return len(rows)
    finally:
        conn.close()


def fetch_queries(mode="process"):
    """List the queries of one direction's search table with their counters.

    Each returned row has query_id, query_text and post_count, plus:
    hit_count (posts passing ``is_adopted``), process_done and tools_done
    (distinct case_ids present in mode_process / mode_tools for that query).
    """
    table = _search_table(mode)
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(f"""SELECT query_id, MAX(query_text) AS query_text,
                                   COUNT(*) AS post_count
                            FROM {table} GROUP BY query_id ORDER BY query_id""")
            queries = cur.fetchall()

            # The adoption verdict is computed in Python, one row at a time.
            cur.execute(f"""SELECT query_id, overall_score, llm_evaluation, publish_time
                            FROM {table}""")
            adopted_per_query = {}
            for rec in cur.fetchall():
                evaluation = _loads(rec["llm_evaluation"])
                if not is_adopted(rec["overall_score"], evaluation, rec["publish_time"]):
                    continue
                qid = rec["query_id"]
                adopted_per_query[qid] = adopted_per_query.get(qid, 0) + 1

            cur.execute("SELECT query_id, COUNT(DISTINCT case_id) AS n FROM mode_process GROUP BY query_id")
            process_done = {}
            for rec in cur.fetchall():
                process_done[rec["query_id"]] = rec["n"]
            cur.execute("SELECT query_id, COUNT(DISTINCT case_id) AS n FROM mode_tools GROUP BY query_id")
            tools_done = {}
            for rec in cur.fetchall():
                tools_done[rec["query_id"]] = rec["n"]
    finally:
        conn.close()
    for query in queries:
        qid = query["query_id"]
        query["hit_count"] = adopted_per_query.get(qid, 0)
        query["process_done"] = process_done.get(qid, 0)
        query["tools_done"] = tools_done.get(qid, 0)
    return queries


def fetch_posts(query_id, mode="process"):
    """Return all posts of one query from a direction's search table.

    JSON columns are decoded, and each row gains ``adopted``, ``has_process``
    and ``has_tools``; the ``created_at`` / ``updated_at`` columns are dropped.
    Rows are ordered by overall_score (descending), then id.
    """
    table = _search_table(mode)
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(f"""SELECT * FROM {table} WHERE query_id=%s
                            ORDER BY overall_score DESC, id""", (query_id,))
            posts = cur.fetchall()
            cur.execute("SELECT DISTINCT case_id FROM mode_process WHERE query_id=%s", (query_id,))
            with_process = set()
            for rec in cur.fetchall():
                with_process.add(rec["case_id"])
            cur.execute("SELECT DISTINCT case_id FROM mode_tools WHERE query_id=%s", (query_id,))
            with_tools = set()
            for rec in cur.fetchall():
                with_tools.add(rec["case_id"])
    finally:
        conn.close()
    json_columns = ("images", "videos", "found_by", "knowledge_type", "llm_evaluation")
    for post in posts:
        for column in json_columns:
            post[column] = _loads(post[column])
        case = post["case_id"]
        post["adopted"] = is_adopted(post["overall_score"], post["llm_evaluation"],
                                     post["publish_time"])
        post["has_process"] = case in with_process
        post["has_tools"] = case in with_tools
        post.pop("created_at", None)
        post.pop("updated_at", None)
    return posts


def fetch_post(query_id, case_id, table="search_process"):
    """Return one post's full row from a search table, or ``None`` if absent.

    JSON columns (images, videos, found_by, knowledge_type, llm_evaluation)
    are decoded before returning.
    """
    table = _search_table(table)
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(f"SELECT * FROM {table} WHERE query_id=%s AND case_id=%s",
                        (query_id, case_id))
            record = cur.fetchone()
    finally:
        conn.close()
    if record:
        for column in ("images", "videos", "found_by", "knowledge_type", "llm_evaluation"):
            record[column] = _loads(record[column])
        return record
    return None


# ── mode_process ─────────────────────────────────────────────────────────────

def replace_process(query_id, case_id, platform, post_title, payload,
                    model, version, cost_usd, duration_s):
    """Store one post's procedure deconstruction for a given version.

    Existing rows for ``(case_id, version)`` are deleted and the new rows
    inserted inside one transaction: re-running the same version is
    idempotent, other versions are kept as history, and a failed insert
    rolls back so the previous rows are not lost.

    Args:
        query_id, case_id, platform, post_title: identity columns for each row
            (``post_title`` is truncated to 500 characters).
        payload: dict with ``source`` and ``procedures`` (a list of dicts).
        model, version, cost_usd, duration_s: run metadata stored on each row.

    Returns:
        Number of procedures in the payload (0 when there are none; the old
        rows of that version are still deleted in that case).
    """
    source = payload.get("source")
    procedures = payload.get("procedures") or []

    # Build all rows before touching the database, so a malformed payload
    # fails before the destructive DELETE.
    rows = []
    for p in procedures:
        steps = p.get("steps") or []
        # Distinct, order-preserving list of the tools referenced by the steps.
        vias = []
        for s in steps:
            v = s.get("via")
            if v and v not in vias:
                vias.append(v)
        rows.append((
            query_id, case_id, platform, (post_title or "")[:500],
            _j(source), p.get("id"), (p.get("name") or "")[:250],
            p.get("purpose"), p.get("category"),
            _j(p.get("declarations")), _j(p.get("type_registry")),
            _j(steps), len(steps), _j(vias),
            model, version, cost_usd, duration_s,
        ))
    insert_sql = """
                INSERT INTO mode_process
                  (query_id, case_id, platform, post_title, source, procedure_id, name,
                   purpose, category, declarations, type_registry, steps, step_count,
                   tools_used, model, version, cost_usd, duration_s)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                """

    conn = _conn()
    try:
        # The pool connects with autocommit on; an explicit transaction makes
        # the delete + insert pair atomic.
        conn.begin()
        try:
            with conn.cursor() as cur:
                cur.execute("DELETE FROM mode_process WHERE case_id=%s AND version=%s",
                            (case_id, version))
                if rows:
                    cur.executemany(insert_sql, rows)
            conn.commit()
        except Exception:
            conn.rollback()
            raise
        return len(procedures)
    finally:
        conn.close()


def fetch_process_versions(case_id):
    """List the mode_process versions stored for a case, newest version first.

    Each row has ``version``, ``n`` (row count) and ``model``.
    """
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute("""SELECT version, COUNT(*) AS n, MAX(model) AS model
                           FROM mode_process WHERE case_id=%s
                           GROUP BY version ORDER BY version DESC""", (case_id,))
            versions = cur.fetchall()
    finally:
        conn.close()
    return versions


def fetch_process(case_id, version=None):
    """Rebuild the procedure payload of a case for one version.

    With ``version=None`` the highest version (by string order, then id) is
    used. Returns ``None`` when the case has no rows for that version.
    """
    conn = _conn()
    try:
        with conn.cursor() as cur:
            if version is None:
                cur.execute("""SELECT version FROM mode_process WHERE case_id=%s
                               ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
                newest = cur.fetchone()
                if not newest:
                    return None
                version = newest["version"]
            cur.execute("""SELECT * FROM mode_process WHERE case_id=%s AND version=%s
                           ORDER BY id""", (case_id, version))
            records = cur.fetchall()
    finally:
        conn.close()
    return _proc_payload(case_id, version, records)


def _proc_payload(case_id, version, rows):
    """mode_process 行集 → {case_id, version, …, procedures:[...]}。无行返回 None。"""
    if not rows:
        return None
    procedures = [{
        "id": r["procedure_id"], "name": r["name"], "purpose": r["purpose"],
        "category": r["category"], "declarations": _loads(r["declarations"]),
        "type_registry": _loads(r["type_registry"]), "steps": _loads(r["steps"], []),
        "tools_used": _loads(r["tools_used"], []),
    } for r in rows]
    return {"case_id": case_id, "version": version, "platform": rows[0]["platform"],
            "title": rows[0]["post_title"], "model": rows[0]["model"],
            "cost_usd": float(rows[0]["cost_usd"]) if rows[0]["cost_usd"] is not None else None,
            "duration_s": rows[0]["duration_s"],
            "source": _loads(rows[0]["source"]), "procedures": procedures}


# ── mode_tools ───────────────────────────────────────────────────────────────

def replace_tools(query_id, case_id, platform, post_title, tools,
                  model, version, cost_usd, duration_s):
    """Store one post's tool deconstruction for a given version.

    Existing rows for ``(case_id, version)`` are deleted and the new rows
    inserted inside one transaction: re-running the same version is
    idempotent, other versions are kept as history, and a failed insert
    rolls back so the previous rows are not lost.

    Args:
        query_id, case_id, platform, post_title: identity columns for each row
            (``post_title`` is truncated to 500 characters).
        tools: list of tool dicts; ``None`` is treated as an empty list.
        model, version, cost_usd, duration_s: run metadata stored on each row.

    Returns:
        Number of tools written (0 when there are none; the old rows of that
        version are still deleted in that case).
    """
    # Normalize first: the final len() used to raise TypeError for None, and
    # only after the DELETE had already run.
    tools = tools or []
    # Build all rows before touching the database, so malformed input fails
    # before the destructive DELETE.
    rows = [(
        query_id, case_id, platform, (post_title or "")[:500],
        (t.get("工具名称") or "")[:250],
        _j(t.get("实质作用域")), _j(t.get("形式作用域")),
        t.get("创作层级"), t.get("来源链接"), t.get("输入"), t.get("输出"),
        _j(t.get("用法")), _j(t.get("案例")), _j(t.get("缺点")),
        t.get("最新更新时间"), model, version, cost_usd, duration_s,
    ) for t in tools]
    insert_sql = """
                INSERT INTO mode_tools
                  (query_id, case_id, platform, post_title, tool_name, substance_scope,
                   form_scope, creation_layer, source_link, input_desc, output_desc,
                   usage_json, cases_json, defects_json, updated_time, model, version,
                   cost_usd, duration_s)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                """

    conn = _conn()
    try:
        # The pool connects with autocommit on; an explicit transaction makes
        # the delete + insert pair atomic.
        conn.begin()
        try:
            with conn.cursor() as cur:
                cur.execute("DELETE FROM mode_tools WHERE case_id=%s AND version=%s",
                            (case_id, version))
                if rows:
                    cur.executemany(insert_sql, rows)
            conn.commit()
        except Exception:
            conn.rollback()
            raise
        return len(tools)
    finally:
        conn.close()


def fetch_tools_versions(case_id):
    """List the mode_tools versions stored for a case, newest version first.

    Each row has ``version``, ``n`` (row count) and ``model``.
    """
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute("""SELECT version, COUNT(*) AS n, MAX(model) AS model
                           FROM mode_tools WHERE case_id=%s
                           GROUP BY version ORDER BY version DESC""", (case_id,))
            versions = cur.fetchall()
    finally:
        conn.close()
    return versions


def fetch_tools(case_id, version=None):
    """Rebuild the tools payload of a case for one version.

    With ``version=None`` the highest version (by string order, then id) is
    used. Returns ``None`` when the case has no rows for that version.
    """
    conn = _conn()
    try:
        with conn.cursor() as cur:
            if version is None:
                cur.execute("""SELECT version FROM mode_tools WHERE case_id=%s
                               ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
                newest = cur.fetchone()
                if not newest:
                    return None
                version = newest["version"]
            cur.execute("""SELECT * FROM mode_tools WHERE case_id=%s AND version=%s
                           ORDER BY id""", (case_id, version))
            records = cur.fetchall()
    finally:
        conn.close()
    return _tools_payload(case_id, version, records)


def _tools_payload(case_id, version, rows):
    """mode_tools 行集 → {case_id, version, …, tools:[...]}。无行返回 None。"""
    if not rows:
        return None
    tools = [{
        "工具名称": r["tool_name"], "实质作用域": _loads(r["substance_scope"]),
        "形式作用域": _loads(r["form_scope"]), "创作层级": r["creation_layer"],
        "来源链接": r["source_link"], "输入": r["input_desc"], "输出": r["output_desc"],
        "用法": _loads(r["usage_json"]), "案例": _loads(r["cases_json"]),
        "缺点": _loads(r["defects_json"]), "最新更新时间": r["updated_time"],
    } for r in rows]
    return {"case_id": case_id, "version": version, "platform": rows[0]["platform"],
            "title": rows[0]["post_title"], "model": rows[0]["model"],
            "cost_usd": float(rows[0]["cost_usd"]) if rows[0]["cost_usd"] is not None else None,
            "duration_s": rows[0]["duration_s"],
            "tool_count": len(tools), "tools": tools}


# ── 点击帖子合一查询(单连接,最少往返;远程 RDS 每次往返 ~80ms,故按次数优化)──

def fetch_extract(mode, case_id, version=None):
    """Fetch the version list and one version's detail on a single connection.

    Args:
        mode: "tools" selects mode_tools; any other value selects mode_process.
        case_id: the post to look up.
        version: version to load; when falsy, the first (highest) entry of the
            version list is used.

    Returns:
        ``{"versions": [...], "data": payload or None, "missing": bool}``.
    """
    if mode == "tools":
        mtable, build_payload = _mode_table("tools"), _tools_payload
    else:
        mtable, build_payload = _mode_table("process"), _proc_payload
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(f"""SELECT version, COUNT(*) AS n, MAX(model) AS model
                            FROM {mtable} WHERE case_id=%s
                            GROUP BY version ORDER BY version DESC""", (case_id,))
            versions = cur.fetchall()
            # The latest version is read off the list just fetched, so no
            # separate "which version is newest" query is needed.
            if version:
                target = version
            elif versions:
                target = versions[0]["version"]
            else:
                target = None
            records = []
            if target is not None:
                cur.execute(f"SELECT * FROM {mtable} WHERE case_id=%s AND version=%s ORDER BY id",
                            (case_id, target))
                records = cur.fetchall()
    finally:
        conn.close()
    data = build_payload(case_id, target, records)
    return {"versions": versions, "data": data, "missing": data is None}


# ── 跨 query 去重 / link 复制(方案A:解构前先去重,避免重复花钱)──────────────
# case_id 是帖子物理身份(platform_channelContentId),与 query 无关。同一帖被多个
# query 搜到时只需真实解构一次;其余 query 用 link_* 复制行补齐关联(cost=0)。

def latest_real_version(case_id, mode="process"):
    """Find the newest non-copied deconstruction of a case, under any query.

    Rows whose version starts with ``link_`` are copies and are ignored.

    Returns:
        ``{"version", "query_id"}`` of the newest such row, or ``None``.
    """
    table = _mode_table(mode)
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(f"""SELECT version, query_id FROM {table}
                            WHERE case_id=%s AND LEFT(version,5) <> 'link_'
                            ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
            newest = cur.fetchone()
    finally:
        conn.close()
    return newest


def link_process(query_id, case_id, mode="process"):
    """Copy a case's newest "real" extraction rows onto the target query.

    The copies get ``version = 'link_' + <source version>`` and
    ``cost_usd = 0``. Rows already present for the same
    (query_id, case_id, link version) are deleted first, so repeating the
    call does not pile up duplicates.

    Args:
        query_id: query the copied rows are attached to.
        case_id: case whose rows are copied.
        mode: passed to ``_mode_table`` to pick the table.

    Returns:
        Number of rows copied; 0 when the case has no real (non ``link_``)
        version to copy from.
    """
    table = _mode_table(mode)
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(f"""SELECT version FROM {table}
                            WHERE case_id=%s AND LEFT(version,5) <> 'link_'
                            ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
            latest = cur.fetchone()
            if not latest:
                return 0
            source_version = latest["version"]
            # Truncated to 32 characters because the version column is VARCHAR(32).
            link_version = ("link_" + source_version)[:32]

            # Copy every column except the auto-increment id and the timestamps.
            cur.execute(f"SHOW COLUMNS FROM {table}")
            columns = []
            for meta in cur.fetchall():
                field = meta["Field"]
                if field not in ("id", "created_at", "updated_at"):
                    columns.append(field)
            column_list = ",".join(columns)

            cur.execute(f"SELECT {column_list} FROM {table} WHERE case_id=%s AND version=%s",
                        (case_id, source_version))
            source_rows = cur.fetchall()
            cur.execute(f"DELETE FROM {table} WHERE query_id=%s AND case_id=%s AND version=%s",
                        (query_id, case_id, link_version))

            placeholders = ",".join(["%s"] * len(columns))
            insert_sql = f"INSERT INTO {table} ({column_list}) VALUES ({placeholders})"
            # Values rewritten on every copied row; all other columns are kept.
            overrides = {"query_id": query_id, "version": link_version, "cost_usd": 0}
            for source_row in source_rows:
                values = [overrides[name] if name in overrides else source_row[name]
                          for name in columns]
                cur.execute(insert_sql, values)
            return len(source_rows)
    finally:
        conn.close()


# ── Dashboard 原始行(指标计算在 server.py)─────────────────────────────────────

# The adoption check only needs the "和内容制作知识相关" relevance score, so that
# single scalar is pulled with SQL JSON_EXTRACT instead of shipping the whole
# llm_evaluation blob (~1.5MB in this database) to Python for parsing. The score
# may be stored as a bare number or wrapped as {"得分": x}; COALESCE over the two
# paths covers both layouts, matching is_adopted.
_REL_SQL = "JSON_UNQUOTE(COALESCE({}))".format(",".join(
    "JSON_EXTRACT(llm_evaluation,'{}')".format(json_path)
    for json_path in (
        '$."相关性"."和内容制作知识相关"."得分"',
        '$."相关性"."和内容制作知识相关"',
    )))

# Reproducibility / implementation threshold, pulled as a scalar (same rule as
# _repro_score in is_adopted). Works with both schemas: the old one stores it
# under 质量.固定维度.可复现性, the new one under
# 质量.动态维度.工序.字段完整性.实现完整性; COALESCE falls back in that order,
# trying the {"得分": x} wrapper before the bare value for each.
_REPRO_SQL = "JSON_UNQUOTE(COALESCE({}))".format(",".join(
    "JSON_EXTRACT(llm_evaluation,'{}')".format(json_path)
    for json_path in (
        '$."质量"."固定维度"."可复现性"."得分"',
        '$."质量"."固定维度"."可复现性"',
        '$."质量"."动态维度"."工序"."字段完整性"."实现完整性"."得分"',
        '$."质量"."动态维度"."工序"."字段完整性"."实现完整性"',
    )))


def fetch_adopted_process_cases(query_id=None):
    """Return the case_ids that are adopted and have a process extraction.

    Used by the knowledge upload script. Adoption is a post-level property
    whose evaluation lives in search_process, while process extractions
    live in mode_process, so the two are joined and only cases present on
    both sides are considered. Adoption itself is decided in Python by
    ``is_adopted_rel`` (same rule as the Dashboard); the scores it needs
    are extracted as scalars by _REL_SQL / _REPRO_SQL, so the full
    llm_evaluation blob is never transferred.

    Args:
        query_id: when truthy, only cases under that search query are
            considered.

    Returns:
        A de-duplicated list of case_ids, sorted ascending.
    """
    sql_parts = [
        "SELECT DISTINCT s.case_id, s.overall_score, s.publish_time, ",
        f"{_REL_SQL} AS rel, {_REPRO_SQL} AS repro ",
        "FROM search_process s ",
        "JOIN (SELECT DISTINCT case_id FROM mode_process) m ON s.case_id = m.case_id",
    ]
    params = ()
    if query_id:
        sql_parts.append(" WHERE s.query_id=%s")
        params = (query_id,)
    sql = "".join(sql_parts)

    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(sql, params)
            rows = cur.fetchall()
    finally:
        conn.close()

    adopted = set()
    for row in rows:
        if is_adopted_rel(row["overall_score"], row["rel"], row["publish_time"], row["repro"]):
            adopted.add(row["case_id"])
    return sorted(adopted)


# ── 评估去重:复用 query 无关分,只重算 query 相关分(search_eval.py 用)──────────

def fetch_existing_eval(case_id, table="search_process"):
    """Return the most recent valid evaluation blob of a case, from any query.

    Used for evaluation de-duplication: when the same post was already
    evaluated under another, similar query, its query-independent scores
    can be reused and only the query-related score has to be recomputed.

    The five most recently updated non-NULL evaluations are inspected in
    order; the first one that decodes to a dict, has no truthy ``_error``
    entry and carries a dict under "相关性" is returned.

    Args:
        case_id: case to look up.
        table: search table name, validated through ``_search_table``.

    Returns:
        The decoded evaluation dict, or None when no valid one is found.
    """
    table = _search_table(table)
    sql = f"""SELECT llm_evaluation FROM {table}
                            WHERE case_id=%s AND llm_evaluation IS NOT NULL
                            ORDER BY updated_at DESC, id DESC LIMIT 5"""
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(sql, (case_id,))
            recent = cur.fetchall()
    finally:
        conn.close()

    for record in recent:
        blob = _loads(record["llm_evaluation"])
        if not isinstance(blob, dict):
            continue
        if blob.get("_error"):
            continue
        if isinstance(blob.get("相关性"), dict):
            return blob
    return None


def update_post_eval(query_id, case_id, evaluation, table="search_process"):
    """Overwrite the evaluation of one (query, case) row and its derived columns.

    Besides ``llm_evaluation`` itself, the derived columns
    ``overall_score`` and ``knowledge_type`` are recomputed from the new
    blob (same rule as upsert_search_posts).

    Args:
        query_id: query of the row to update.
        case_id: case of the row to update.
        evaluation: new evaluation blob; "知识类型" is read from it only
            when it is a dict.
        table: search table name, validated through ``_search_table``.

    Returns:
        The value returned by ``cursor.execute`` for the UPDATE (the
        affected row count).
    """
    table = _search_table(table)
    score = overall_score(evaluation)
    if isinstance(evaluation, dict):
        knowledge_type = evaluation.get("知识类型")
    else:
        knowledge_type = None
    sql = (f"UPDATE {table} SET llm_evaluation=%s, overall_score=%s, knowledge_type=%s "
           "WHERE query_id=%s AND case_id=%s")
    conn = _conn()
    try:
        with conn.cursor() as cur:
            args = (_j(evaluation), score, _j(knowledge_type), query_id, case_id)
            affected = cur.execute(sql, args)
        return affected
    finally:
        conn.close()


# ── 上传去重:知识库已导入台账(import_process_knowledge.py 用)────────────────

def fetch_ingested_map(case_id):
    """Return ``{proc_index: version}`` for the procedures of a case already
    imported into the knowledge base, read from knowledge_ingest_log.

    An empty dict means nothing has been uploaded for this case yet.
    """
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT proc_index, version FROM knowledge_ingest_log WHERE case_id=%s",
                        (case_id,))
            ingested = {}
            for entry in cur.fetchall():
                ingested[entry["proc_index"]] = entry["version"]
            return ingested
    finally:
        conn.close()


def mark_ingested(case_id, proc_index, version, knowledge_id=None, api_url=None):
    """Record one "already imported" entry in knowledge_ingest_log.

    The statement is an upsert: when the key already exists (case_id +
    proc_index is the unique key, per the original note), its version,
    knowledge_id and api_url are replaced with the new values.
    """
    upsert_sql = """INSERT INTO knowledge_ingest_log
                             (case_id, proc_index, version, knowledge_id, api_url)
                           VALUES (%s,%s,%s,%s,%s)
                           ON DUPLICATE KEY UPDATE version=VALUES(version),
                             knowledge_id=VALUES(knowledge_id), api_url=VALUES(api_url)"""
    values = (case_id, proc_index, version, knowledge_id, api_url)
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(upsert_sql, values)
    finally:
        conn.close()


def fetch_dashboard_rows():
    """Fetch the lightweight raw rows the Dashboard metrics are computed from.

    Expected volume is hundreds to a few thousand rows, so aggregating in
    Python is sufficient. Two things keep the transfer small:

    1. the llm_evaluation blob is never selected; only the scalar scores
       needed for the adoption check are extracted in SQL
       (_REL_SQL / _REPRO_SQL);
    2. ``steps`` is only returned for rows whose version equals the
       per-case ``MAX(version)``; every other row gets NULL for it.

    Returns:
        A tuple ``(posts, procs, tools)`` of lists of row dicts:
        posts: rows of search_process and search_tools, each tagged with
            ``mode`` ("process" / "tools") and an ``adopted`` flag computed
            by ``is_adopted_rel``.
        procs: mode_process rows (all versions) with ``steps`` decoded via
            ``_loads``, ``cost_usd`` as float or None, ``created_at`` as
            str or None.
        tools: mode_tools rows with ``substance_scope`` / ``form_scope``
            decoded via ``_loads`` and the same cost/created_at
            normalisation.
    """
    conn = _conn()
    try:
        with conn.cursor() as cur:
            # The progress denominator follows the "adopted" rule; ``mode`` marks
            # which search table (and therefore which direction) a post came from.
            cols = (f"query_id, case_id, platform, overall_score, publish_time, "
                    f"{_REL_SQL} AS rel, {_REPRO_SQL} AS repro")
            # Always accumulate into a real list. NOTE(review): some DB-API
            # drivers (PyMySQL's DictCursor, presumably the one in use here)
            # return an empty *tuple* for an empty result set, which made the
            # previous ``posts += st`` raise TypeError when search_process was
            # empty but search_tools was not.
            posts = []
            for mode, table in (("process", "search_process"), ("tools", "search_tools")):
                cur.execute(f"SELECT {cols} FROM {table}")
                for post in cur.fetchall():
                    post["mode"] = mode
                    posts.append(post)
            # Cost and duration are counted over all versions, while steps are only
            # needed for the newest one, so older versions return NULL to save transfer.
            cur.execute("""SELECT p.case_id, p.version, p.cost_usd, p.duration_s, p.created_at,
                                  CASE WHEN p.version = m.maxv THEN p.steps END AS steps
                           FROM mode_process p
                           JOIN (SELECT case_id, MAX(version) AS maxv
                                 FROM mode_process GROUP BY case_id) m
                             ON p.case_id = m.case_id
                           ORDER BY p.id""")
            procs = list(cur.fetchall())
            cur.execute("""SELECT case_id, version, tool_name, substance_scope,
                                  form_scope, cost_usd, duration_s, created_at
                           FROM mode_tools""")
            tools = list(cur.fetchall())
    finally:
        conn.close()
    for p in posts:
        # Adoption uses the same rule as the post list (is_adopted); it decides
        # which posts count towards the "needs extraction" denominator.
        p["adopted"] = is_adopted_rel(p["overall_score"], p["rel"], p["publish_time"], p["repro"])
    for r in procs:
        r["steps"] = _loads(r["steps"], [])
        r["cost_usd"] = float(r["cost_usd"]) if r["cost_usd"] is not None else None
        r["created_at"] = str(r["created_at"]) if r["created_at"] else None
    for r in tools:
        r["substance_scope"] = _loads(r["substance_scope"], [])
        r["form_scope"] = _loads(r["form_scope"], [])
        r["cost_usd"] = float(r["cost_usd"]) if r["cost_usd"] is not None else None
        r["created_at"] = str(r["created_at"]) if r["created_at"] else None
    return posts, procs, tools


def check():
    """Print the row count of each of the four main tables."""
    table_names = ("search_process", "search_tools", "mode_process", "mode_tools")
    conn = _conn()
    try:
        with conn.cursor() as cur:
            for name in table_names:
                cur.execute(f"SELECT COUNT(*) AS n FROM {name}")
                row_count = cur.fetchone()["n"]
                print(f"{name}: {row_count} 行")
    finally:
        conn.close()


if __name__ == "__main__":
    # Minimal CLI: the first argument selects a maintenance action; a missing
    # or unknown argument prints the usage text instead.
    actions = {"init": init_tables, "check": check, "clear": clear_tables}
    action = actions.get(sys.argv[1]) if len(sys.argv) > 1 else None
    if action is not None:
        action()
    else:
        print("用法:\n  python db.py init    # 建表\n  python db.py check   # 四表行数\n  python db.py clear   # 清空四表数据")