# -*- coding: utf-8 -*-
"""mode_workflow · MySQL 持久化(DB 为唯一事实源)
================================================================================
读 .env 的 MYSQL_* 连接 MySQL。三张表:
  search_data  —— 每行一个 (query, 帖子):搜索 + llm 评估结果
  mode_process —— 每行一个解构出的工序(steps 等嵌套结构存 JSON 列)
  mode_tools   —— 每行一个解构出的工具

与旧 fixed_query_eval/db.py 的关键差异:本系统 DB 是主存储,写入失败直接 raise,
不做"失败不阻断"。读侧保留防御(返回空/None)。

用法:
  python db.py init    # 建表(幂等)
  python db.py check   # 打印三表行数
"""
import json
import os
import sys
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(PROJECT_ROOT))

from dotenv import load_dotenv
load_dotenv()

import pymysql
from pymysql.cursors import DictCursor


def _conn():
    if not os.getenv("MYSQL_HOST"):
        raise RuntimeError("缺 MYSQL_HOST:检查 .env 的 MYSQL_* 配置")
    return pymysql.connect(
        host=os.getenv("MYSQL_HOST"),
        port=int(os.getenv("MYSQL_PORT", 3306)),
        user=os.getenv("MYSQL_USER"),
        password=os.getenv("MYSQL_PASSWORD"),
        database=os.getenv("MYSQL_DATABASE"),
        charset="utf8mb4", cursorclass=DictCursor,
        autocommit=True, connect_timeout=10,
    )


# ── DDL ──────────────────────────────────────────────────────────────────────

# Schema for search_data: one row per (query_id, case_id) pair. The
# uk_qid_case unique key is what makes the INSERT ... ON DUPLICATE KEY UPDATE
# in upsert_search_posts behave as an upsert. List/blob fields are JSON
# columns. CREATE TABLE IF NOT EXISTS makes the statement safe to re-run.
DDL_SEARCH = """
CREATE TABLE IF NOT EXISTS search_data (
  id            BIGINT AUTO_INCREMENT PRIMARY KEY,
  query_id      VARCHAR(32)   NOT NULL COMMENT 'q0000',
  query_text    VARCHAR(512)  NULL,
  case_id       VARCHAR(128)  NOT NULL COMMENT 'platform_channelContentId',
  platform      VARCHAR(32)   NULL,
  channel_content_id VARCHAR(128) NULL,
  title         VARCHAR(512)  NULL,
  url           VARCHAR(1024) NULL,
  content_type  VARCHAR(32)   NULL,
  body          LONGTEXT      NULL,
  images        JSON          NULL,
  videos        JSON          NULL,
  like_count    INT           NULL,
  publish_time  VARCHAR(64)   NULL,
  quality_score FLOAT         NULL COMMENT 'post._quality_score',
  quality_grade VARCHAR(8)    NULL,
  found_by      JSON          NULL COMMENT '命中的措辞数组',
  knowledge_type JSON         NULL COMMENT '["能力","工序","工具"] 子集',
  mode_type     VARCHAR(16)   NULL COMMENT '该 query 的解构方向:工序/工具(空=通用)',
  overall_score FLOAT         NULL COMMENT '(相关均值+质量均值)/2',
  llm_evaluation JSON         NULL COMMENT '评估全量 blob',
  created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  updated_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  UNIQUE KEY uk_qid_case (query_id, case_id),
  KEY idx_platform (platform)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='搜索+评估结果';
"""

# Schema for mode_process: one row per extracted procedure. There is no
# unique key; (case_id, version) is a plain index, so rows from several
# versions of the same case can coexist. Nested structures (source,
# declarations, type_registry, steps, tools_used) are stored as JSON columns.
DDL_PROCESS = """
CREATE TABLE IF NOT EXISTS mode_process (
  id            BIGINT AUTO_INCREMENT PRIMARY KEY,
  query_id      VARCHAR(32)   NOT NULL,
  case_id       VARCHAR(128)  NOT NULL,
  platform      VARCHAR(32)   NULL,
  post_title    VARCHAR(512)  NULL,
  source        JSON          NULL COMMENT '解构返回的 source 块',
  procedure_id  VARCHAR(16)   NULL COMMENT 'p1,p2…',
  name          VARCHAR(255)  NULL,
  purpose       TEXT          NULL,
  category      VARCHAR(32)   NULL COMMENT '产物创造/资产建设/自动化/分析/学习',
  declarations  JSON          NULL,
  type_registry JSON          NULL,
  steps         JSON          NULL COMMENT '步骤数组全量',
  step_count    INT           NULL,
  tools_used    JSON          NULL COMMENT '从 steps[].via 去重提取',
  model         VARCHAR(64)   NULL,
  version       VARCHAR(16)   NULL COMMENT 'v_MMDDHHMM,保留历史',
  cost_usd      DECIMAL(10,6) NULL COMMENT '本次解构调用成本(同版本各行相同,聚合需按 case+version 去重)',
  duration_s    FLOAT         NULL,
  created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  KEY idx_case_ver (case_id, version),
  KEY idx_qid (query_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工序解构结果(每行一个工序)';
"""

# Schema for mode_tools: one row per extracted tool. As with mode_process
# there is no unique key; (case_id, version), query_id and tool_name are
# plain indexes. Scope, usage, cases and defects are stored as JSON columns.
DDL_TOOLS = """
CREATE TABLE IF NOT EXISTS mode_tools (
  id            BIGINT AUTO_INCREMENT PRIMARY KEY,
  query_id      VARCHAR(32)   NOT NULL,
  case_id       VARCHAR(128)  NOT NULL,
  platform      VARCHAR(32)   NULL,
  post_title    VARCHAR(512)  NULL,
  tool_name     VARCHAR(255)  NULL,
  substance_scope JSON        NULL COMMENT '实质作用域(数组)',
  form_scope    JSON          NULL COMMENT '形式作用域(数组或null)',
  creation_layer VARCHAR(32)  NULL COMMENT '制作层/创作层',
  source_link   VARCHAR(1024) NULL,
  input_desc    TEXT          NULL,
  output_desc   TEXT          NULL,
  usage_json    JSON          NULL,
  cases_json    JSON          NULL,
  defects_json  JSON          NULL,
  updated_time  VARCHAR(64)   NULL COMMENT '工具最新更新时间',
  model         VARCHAR(64)   NULL,
  version       VARCHAR(16)   NULL,
  cost_usd      DECIMAL(10,6) NULL COMMENT '同 mode_process,聚合按 case+version 去重',
  duration_s    FLOAT         NULL,
  created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  KEY idx_case_ver (case_id, version),
  KEY idx_qid (query_id),
  KEY idx_tool_name (tool_name)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工具解构结果(每行一个工具)';
"""


def init_tables():
    """Create the three tables if they are missing and run the column migration.

    Executes DDL_SEARCH, DDL_PROCESS and DDL_TOOLS, then adds the
    ``mode_type`` column to ``search_data`` when SHOW COLUMNS does not report
    it. Progress messages are printed to stdout. The connection is always
    closed, even when a statement fails.
    """
    db = _conn()
    try:
        with db.cursor() as cursor:
            for ddl in (DDL_SEARCH, DDL_PROCESS, DDL_TOOLS):
                cursor.execute(ddl)
            # Migration: CREATE TABLE IF NOT EXISTS never alters a table that
            # already exists, so an older search_data needs the column added.
            cursor.execute("SHOW COLUMNS FROM search_data LIKE 'mode_type'")
            has_mode_type = bool(cursor.fetchone())
            if not has_mode_type:
                alter_sql = ("ALTER TABLE search_data ADD COLUMN mode_type VARCHAR(16) NULL "
                             "COMMENT '该 query 的解构方向:工序/工具(空=通用)' AFTER knowledge_type")
                cursor.execute(alter_sql)
                print("🔧 迁移:search_data 已补 mode_type 列")
        print("✅ 建表完成:search_data, mode_process, mode_tools")
    finally:
        db.close()


# ── 工具函数 ──────────────────────────────────────────────────────────────────

def _loads(v, default=None):
    """pymysql 的 JSON 列可能返回字符串,统一解析。"""
    if v is None:
        return default
    if isinstance(v, (list, dict)):
        return v
    try:
        return json.loads(v)
    except Exception:
        return default


def _j(v):
    """写入 JSON 列:None 保持 NULL,其余 dumps。"""
    return None if v is None else json.dumps(v, ensure_ascii=False)


def _collect_scores(node):
    """递归收集嵌套评估里所有数值「得分」。"""
    out = []
    if isinstance(node, dict):
        for k, v in node.items():
            if k == "得分" and isinstance(v, (int, float)):
                out.append(float(v))
            else:
                out.extend(_collect_scores(v))
    elif isinstance(node, list):
        for v in node:
            out.extend(_collect_scores(v))
    return out


def overall_score(e):
    """Compute the overall score of one evaluation blob.

    For each of the "相关性" and "质量" sections, averages all numeric "得分"
    values found anywhere inside it. The result is the mean of the section
    averages that could be computed, rounded to 2 decimals.

    Args:
        e: the evaluation dict. ``None`` or any non-dict value is treated as
            "no evaluation available".

    Returns:
        float | None: the rounded score, or ``None`` when neither section
        yields a score.
    """
    # A failed evaluation may be stored as a string or list; calling .get on
    # it would raise AttributeError instead of returning None as documented.
    if not isinstance(e, dict):
        return None
    section_means = []
    for key in ("相关性", "质量"):
        scores = _collect_scores(e.get(key))
        if scores:
            section_means.append(sum(scores) / len(scores))
    if not section_means:
        return None
    return round(sum(section_means) / len(section_means), 2)


# ── search_data ──────────────────────────────────────────────────────────────

def upsert_search_posts(query_id, query_text, results, mode_type=None):
    """Upsert one query's search results into search_data.

    Each result becomes one row; an existing (query_id, case_id) row has its
    columns overwritten through ON DUPLICATE KEY UPDATE.

    Args:
        query_id: id of the query the results belong to.
        query_text: text of the query.
        results: iterable of result dicts. Each may carry a "post" dict and an
            "llm_evaluation" dict plus case_id / platform /
            channel_content_id / source_url / found_by_queries.
        mode_type: extraction direction for the query. ``None`` leaves a
            previously stored value untouched (COALESCE in the UPDATE clause).

    Returns:
        int: number of rows submitted; 0 when ``results`` is empty or None.
    """
    if not results:
        return 0

    def to_row(item):
        # Flatten one result into the column order of the INSERT below.
        post = item.get("post") or {}
        evaluation = item.get("llm_evaluation") or {}
        title = post.get("title") or post.get("desc") or ""
        body = post.get("body_text") or post.get("desc") or ""
        published = post.get("publish_time") or post.get("publish_timestamp") or ""
        return (
            query_id, query_text, item.get("case_id"), item.get("platform"),
            item.get("channel_content_id"),
            title[:500],
            item.get("source_url"), post.get("content_type"),
            body,
            _j(post.get("images") or []), _j(post.get("videos") or []),
            post.get("like_count"),
            str(published)[:64],
            post.get("_quality_score"), post.get("_quality_grade"),
            _j(item.get("found_by_queries") or []),
            _j(evaluation.get("知识类型") or []),
            mode_type,
            overall_score(evaluation),
            _j(evaluation),
        )

    rows = [to_row(item) for item in results]
    sql = """
    INSERT INTO search_data
      (query_id, query_text, case_id, platform, channel_content_id, title, url,
       content_type, body, images, videos, like_count, publish_time,
       quality_score, quality_grade, found_by, knowledge_type, mode_type,
       overall_score, llm_evaluation)
    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    ON DUPLICATE KEY UPDATE
      query_text=VALUES(query_text), platform=VALUES(platform),
      channel_content_id=VALUES(channel_content_id), title=VALUES(title), url=VALUES(url),
      content_type=VALUES(content_type), body=VALUES(body), images=VALUES(images),
      videos=VALUES(videos), like_count=VALUES(like_count), publish_time=VALUES(publish_time),
      quality_score=VALUES(quality_score), quality_grade=VALUES(quality_grade),
      found_by=VALUES(found_by), knowledge_type=VALUES(knowledge_type),
      mode_type=COALESCE(VALUES(mode_type), mode_type),
      overall_score=VALUES(overall_score), llm_evaluation=VALUES(llm_evaluation);
    """
    db = _conn()
    try:
        with db.cursor() as cursor:
            cursor.executemany(sql, rows)
    finally:
        db.close()
    return len(rows)


def fetch_queries():
    """List every query with its post count and extraction progress.

    Returns:
        One dict per query_id (ordered by query_id) with query_text,
        mode_type, post_count, plus process_done / tools_done: the number of
        distinct case_ids that have rows in mode_process / mode_tools for
        that query (0 when there are none).
    """
    db = _conn()
    try:
        with db.cursor() as cur:
            cur.execute("""SELECT query_id, MAX(query_text) AS query_text,
                                  MAX(mode_type) AS mode_type, COUNT(*) AS post_count
                           FROM search_data GROUP BY query_id ORDER BY query_id""")
            summary = cur.fetchall()

            cur.execute("SELECT query_id, COUNT(DISTINCT case_id) AS n FROM mode_process GROUP BY query_id")
            process_counts = {}
            for rec in cur.fetchall():
                process_counts[rec["query_id"]] = rec["n"]

            cur.execute("SELECT query_id, COUNT(DISTINCT case_id) AS n FROM mode_tools GROUP BY query_id")
            tool_counts = {}
            for rec in cur.fetchall():
                tool_counts[rec["query_id"]] = rec["n"]
    finally:
        db.close()

    for entry in summary:
        qid = entry["query_id"]
        entry.update(
            process_done=process_counts.get(qid, 0),
            tools_done=tool_counts.get(qid, 0),
        )
    return summary


def fetch_posts(query_id):
    """Return all search_data rows of a query, highest overall_score first.

    JSON columns are decoded. Each row gains ``has_process`` / ``has_tools``
    flags telling whether mode_process / mode_tools hold rows for the same
    query_id and case_id. The created_at and updated_at columns are removed
    from the returned rows.
    """
    db = _conn()
    try:
        with db.cursor() as cur:
            cur.execute("""SELECT * FROM search_data WHERE query_id=%s
                           ORDER BY overall_score DESC, id""", (query_id,))
            posts = cur.fetchall()
            cur.execute("SELECT DISTINCT case_id FROM mode_process WHERE query_id=%s", (query_id,))
            with_process = set()
            for rec in cur.fetchall():
                with_process.add(rec["case_id"])
            cur.execute("SELECT DISTINCT case_id FROM mode_tools WHERE query_id=%s", (query_id,))
            with_tools = set()
            for rec in cur.fetchall():
                with_tools.add(rec["case_id"])
    finally:
        db.close()

    json_cols = ("images", "videos", "found_by", "knowledge_type", "llm_evaluation")
    for post in posts:
        post.update({col: _loads(post[col]) for col in json_cols})
        cid = post["case_id"]
        post["has_process"] = cid in with_process
        post["has_tools"] = cid in with_tools
        for stamp in ("created_at", "updated_at"):
            post.pop(stamp, None)
    return posts


def fetch_post(query_id, case_id):
    """Return the full search_data row for one (query_id, case_id), or None.

    JSON columns are decoded; every other column, timestamps included, is
    returned as stored. Per the original note, pipeline scripts use this row
    to rebuild the extraction source.
    """
    db = _conn()
    try:
        with db.cursor() as cur:
            cur.execute("SELECT * FROM search_data WHERE query_id=%s AND case_id=%s",
                        (query_id, case_id))
            post = cur.fetchone()
    finally:
        db.close()

    if post:
        json_cols = ("images", "videos", "found_by", "knowledge_type", "llm_evaluation")
        for col in json_cols:
            post[col] = _loads(post[col])
        return post
    return None


# ── mode_process ─────────────────────────────────────────────────────────────

def replace_process(query_id, case_id, platform, post_title, payload,
                    model, version, cost_usd, duration_s):
    """Store the procedure extraction of one post for one version.

    Rows matching (case_id, version) are deleted and the new rows inserted
    inside a single transaction. Re-running the same version therefore
    replaces its rows, other versions are left alone, and a failure rolls
    back so the previously stored rows survive. Note that the delete matches
    on case_id and version only; query_id is not part of the condition.

    Args:
        query_id, case_id, platform, post_title: identity of the post
            (post_title is cut to 500 characters).
        payload: dict with an optional "source" block and a "procedures"
            list. Each procedure may carry id, name, purpose, category,
            declarations, type_registry and steps.
        model, version, cost_usd, duration_s: run metadata copied onto
            every row.

    Returns:
        int: number of procedures written (0 when there are none, in which
        case only the delete is performed).

    Raises:
        Whatever the driver raises on failure; the transaction is rolled
        back before the exception propagates.
    """
    procedures = payload.get("procedures") or []
    source_json = _j(payload.get("source"))
    title = (post_title or "")[:500]

    # Build every row before touching the database, so a malformed payload
    # fails without having deleted anything.
    rows = []
    for p in procedures:
        steps = p.get("steps") or []
        vias = []
        for s in steps:
            v = s.get("via")
            # List membership keeps first-seen order and also works for
            # unhashable values.
            if v and v not in vias:
                vias.append(v)
        rows.append((
            query_id, case_id, platform, title,
            source_json, p.get("id"), (p.get("name") or "")[:250],
            p.get("purpose"), p.get("category"),
            _j(p.get("declarations")), _j(p.get("type_registry")),
            _j(steps), len(steps), _j(vias),
            model, version, cost_usd, duration_s,
        ))

    conn = _conn()
    try:
        # The connection is opened with autocommit on; an explicit BEGIN makes
        # DELETE + INSERT atomic so a failed insert cannot lose the old rows.
        conn.begin()
        with conn.cursor() as cur:
            cur.execute("DELETE FROM mode_process WHERE case_id=%s AND version=%s",
                        (case_id, version))
            if rows:
                cur.executemany("""
                INSERT INTO mode_process
                  (query_id, case_id, platform, post_title, source, procedure_id, name,
                   purpose, category, declarations, type_registry, steps, step_count,
                   tools_used, model, version, cost_usd, duration_s)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                """, rows)
        conn.commit()
        return len(procedures)
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()


def fetch_process_versions(case_id):
    """List the procedure-extraction versions stored for a case.

    Returns:
        Rows of {version, n, model}, newest version string first, where ``n``
        is the number of mode_process rows in that version and ``model`` is
        MAX(model) over them.
    """
    db = _conn()
    try:
        with db.cursor() as cur:
            cur.execute("""SELECT version, COUNT(*) AS n, MAX(model) AS model
                           FROM mode_process WHERE case_id=%s
                           GROUP BY version ORDER BY version DESC""", (case_id,))
            versions = cur.fetchall()
    finally:
        db.close()
    return versions


def fetch_process(case_id, version=None):
    """Rebuild the procedure-extraction document of a case.

    Args:
        case_id: the post to load.
        version: version string to load. ``None`` selects the latest one
            (highest version string, ties broken by highest id).

    Returns:
        dict with case_id, version, platform, title, model, cost_usd,
        duration_s, source and procedures (one dict per row, in id order),
        or ``None`` when no rows match. Post-level fields are taken from the
        first row.
    """
    db = _conn()
    try:
        with db.cursor() as cur:
            if version is None:
                cur.execute("""SELECT version FROM mode_process WHERE case_id=%s
                               ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
                latest = cur.fetchone()
                if not latest:
                    return None
                version = latest["version"]
            cur.execute("""SELECT * FROM mode_process WHERE case_id=%s AND version=%s
                           ORDER BY id""", (case_id, version))
            records = cur.fetchall()
    finally:
        db.close()

    if not records:
        return None

    procedures = []
    for rec in records:
        procedures.append({
            "id": rec["procedure_id"],
            "name": rec["name"],
            "purpose": rec["purpose"],
            "category": rec["category"],
            "declarations": _loads(rec["declarations"]),
            "type_registry": _loads(rec["type_registry"]),
            "steps": _loads(rec["steps"], []),
            "tools_used": _loads(rec["tools_used"], []),
        })

    head = records[0]
    cost = head["cost_usd"]
    if cost is not None:
        # DECIMAL columns come back as Decimal; expose a plain float.
        cost = float(cost)
    return {
        "case_id": case_id,
        "version": version,
        "platform": head["platform"],
        "title": head["post_title"],
        "model": head["model"],
        "cost_usd": cost,
        "duration_s": head["duration_s"],
        "source": _loads(head["source"]),
        "procedures": procedures,
    }


# ── mode_tools ───────────────────────────────────────────────────────────────

def replace_tools(query_id, case_id, platform, post_title, tools,
                  model, version, cost_usd, duration_s):
    """Store the tool extraction of one post for one version.

    Same semantics as replace_process: rows matching (case_id, version) are
    deleted and the new rows inserted inside a single transaction, so a
    failure rolls back and the previously stored rows survive. The delete
    matches on case_id and version only.

    Args:
        query_id, case_id, platform, post_title: identity of the post
            (post_title is cut to 500 characters).
        tools: list of tool dicts keyed by the Chinese field names read
            below; ``None`` is treated as an empty list.
        model, version, cost_usd, duration_s: run metadata copied onto
            every row.

    Returns:
        int: number of tools written (0 when there are none, in which case
        only the delete is performed).

    Raises:
        Whatever the driver raises on failure; the transaction is rolled
        back before the exception propagates.
    """
    # Normalise None up front: previously tools=None deleted the old rows and
    # then crashed on len(None).
    tools = tools or []
    title = (post_title or "")[:500]

    # Build every row before touching the database, so a malformed tool entry
    # fails without having deleted anything.
    rows = [(
        query_id, case_id, platform, title,
        (t.get("工具名称") or "")[:250],
        _j(t.get("实质作用域")), _j(t.get("形式作用域")),
        t.get("创作层级"), t.get("来源链接"), t.get("输入"), t.get("输出"),
        _j(t.get("用法")), _j(t.get("案例")), _j(t.get("缺点")),
        t.get("最新更新时间"), model, version, cost_usd, duration_s,
    ) for t in tools]

    conn = _conn()
    try:
        # The connection is opened with autocommit on; an explicit BEGIN makes
        # DELETE + INSERT atomic so a failed insert cannot lose the old rows.
        conn.begin()
        with conn.cursor() as cur:
            cur.execute("DELETE FROM mode_tools WHERE case_id=%s AND version=%s",
                        (case_id, version))
            if rows:
                cur.executemany("""
                INSERT INTO mode_tools
                  (query_id, case_id, platform, post_title, tool_name, substance_scope,
                   form_scope, creation_layer, source_link, input_desc, output_desc,
                   usage_json, cases_json, defects_json, updated_time, model, version,
                   cost_usd, duration_s)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                """, rows)
        conn.commit()
        return len(tools)
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()


def fetch_tools_versions(case_id):
    """List the tool-extraction versions stored for a case.

    Returns:
        Rows of {version, n, model}, newest version string first, where ``n``
        is the number of mode_tools rows in that version and ``model`` is
        MAX(model) over them.
    """
    db = _conn()
    try:
        with db.cursor() as cur:
            cur.execute("""SELECT version, COUNT(*) AS n, MAX(model) AS model
                           FROM mode_tools WHERE case_id=%s
                           GROUP BY version ORDER BY version DESC""", (case_id,))
            versions = cur.fetchall()
    finally:
        db.close()
    return versions


def fetch_tools(case_id, version=None):
    """Rebuild the tool-extraction document of a case.

    Args:
        case_id: the post to load.
        version: version string to load. ``None`` selects the latest one
            (highest version string, ties broken by highest id).

    Returns:
        dict with case_id, version, platform, title, model, cost_usd,
        duration_s, tool_count and tools (one dict per row, in id order,
        keyed by the Chinese field names), or ``None`` when no rows match.
        Post-level fields are taken from the first row.
    """
    db = _conn()
    try:
        with db.cursor() as cur:
            if version is None:
                cur.execute("""SELECT version FROM mode_tools WHERE case_id=%s
                               ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
                latest = cur.fetchone()
                if not latest:
                    return None
                version = latest["version"]
            cur.execute("""SELECT * FROM mode_tools WHERE case_id=%s AND version=%s
                           ORDER BY id""", (case_id, version))
            records = cur.fetchall()
    finally:
        db.close()

    if not records:
        return None

    tools = []
    for rec in records:
        tools.append({
            "工具名称": rec["tool_name"],
            "实质作用域": _loads(rec["substance_scope"]),
            "形式作用域": _loads(rec["form_scope"]),
            "创作层级": rec["creation_layer"],
            "来源链接": rec["source_link"],
            "输入": rec["input_desc"],
            "输出": rec["output_desc"],
            "用法": _loads(rec["usage_json"]),
            "案例": _loads(rec["cases_json"]),
            "缺点": _loads(rec["defects_json"]),
            "最新更新时间": rec["updated_time"],
        })

    head = records[0]
    cost = head["cost_usd"]
    if cost is not None:
        # DECIMAL columns come back as Decimal; expose a plain float.
        cost = float(cost)
    return {
        "case_id": case_id,
        "version": version,
        "platform": head["platform"],
        "title": head["post_title"],
        "model": head["model"],
        "cost_usd": cost,
        "duration_s": head["duration_s"],
        "tool_count": len(tools),
        "tools": tools,
    }


# ── Dashboard 原始行(指标计算在 server.py)─────────────────────────────────────

def fetch_dashboard_rows():
    """Fetch the lightweight rows the dashboard aggregates over.

    Per the original note, the tables are expected to hold hundreds to
    thousands of rows, so aggregation is left to Python.

    Returns:
        tuple ``(posts, procs, tools)`` of row lists. JSON columns are
        decoded (falling back to ``[]``), cost_usd is converted to float
        when present, and created_at is rendered with ``str()`` (``None``
        when empty).
    """
    def normalise(rec, json_cols):
        # Shared clean-up for mode_process / mode_tools rows.
        for col in json_cols:
            rec[col] = _loads(rec[col], [])
        if rec["cost_usd"] is not None:
            rec["cost_usd"] = float(rec["cost_usd"])
        rec["created_at"] = str(rec["created_at"]) if rec["created_at"] else None

    db = _conn()
    try:
        with db.cursor() as cur:
            cur.execute("SELECT query_id, case_id, knowledge_type FROM search_data")
            posts = cur.fetchall()
            cur.execute("""SELECT case_id, version, steps, tools_used, cost_usd,
                                  duration_s, created_at FROM mode_process""")
            procs = cur.fetchall()
            cur.execute("""SELECT case_id, version, tool_name, substance_scope,
                                  form_scope, cost_usd, duration_s, created_at
                           FROM mode_tools""")
            tools = cur.fetchall()
    finally:
        db.close()

    for post in posts:
        post["knowledge_type"] = _loads(post["knowledge_type"], [])
    for proc in procs:
        normalise(proc, ("steps", "tools_used"))
    for tool in tools:
        normalise(tool, ("substance_scope", "form_scope"))
    return posts, procs, tools


def check():
    """Print the row count of each of the three tables to stdout."""
    db = _conn()
    try:
        with db.cursor() as cursor:
            # Table names come from this fixed tuple, never from user input.
            for t in ("search_data", "mode_process", "mode_tools"):
                cursor.execute(f"SELECT COUNT(*) AS n FROM {t}")
                n = cursor.fetchone()["n"]
                print(f"{t}: {n} 行")
    finally:
        db.close()


if __name__ == "__main__":
    # CLI entry point: dispatch the first argument to its handler; a missing
    # or unknown command prints the usage text.
    cli_args = sys.argv[1:]
    handlers = {"init": init_tables, "check": check}
    handler = handlers.get(cli_args[0]) if cli_args else None
    if handler is None:
        print("用法:\n  python db.py init    # 建表\n  python db.py check   # 三表行数")
    else:
        handler()