|
|
@@ -29,20 +29,43 @@ load_dotenv()
|
|
|
|
|
|
import threading

import pymysql
from dbutils.pooled_db import PooledDB
from pymysql.cursors import DictCursor
|
|
|
+
|
|
|
+# ── 连接池 ──────────────────────────────────────────────────────────────────
|
|
|
+# MySQL 是远程 RDS,每次 pymysql.connect() 的 TCP+鉴权握手 ~0.5s。旧实现每个
|
|
|
+# 请求新建一条连接,一次"点开帖子"要 2~3 个请求 = 2~3 次握手 ≈ 1s。改用连接池
|
|
|
+# 复用长连接后,握手只在池初始化时各发生一次,后续取连接近乎零开销。
|
|
|
+# server.py 是 ThreadingHTTPServer(每请求一线程),PooledDB 线程安全,正好匹配。
|
|
|
+# 注意:fetch_* 里的 conn.close() 在池连接上语义是"归还池中"而非真正断开。
|
|
|
_POOL = None
# Serialises first-time pool construction. Per the note above, server.py
# serves each request on its own thread, so several threads can call _pool()
# before the pool exists.
_POOL_LOCK = threading.Lock()


def _pool():
    """Return the shared MySQL connection pool, building it on first use.

    The pool is created lazily, so importing this module opens no
    connection. Construction happens under ``_POOL_LOCK`` and ``_POOL`` is
    re-checked once the lock is held; without that, concurrent first
    requests could each build (and pre-warm) a separate pool and all but the
    last one assigned would be orphaned with their connections still open.

    Returns:
        The module-wide ``PooledDB`` instance.

    Raises:
        RuntimeError: if ``MYSQL_HOST`` is missing or empty in the
            environment.
    """
    global _POOL
    if _POOL is None:
        with _POOL_LOCK:
            # Another thread may have finished building the pool while this
            # one was waiting for the lock.
            if _POOL is None:
                host = os.getenv("MYSQL_HOST")
                if not host:
                    raise RuntimeError("缺 MYSQL_HOST:检查 .env 的 MYSQL_* 配置")
                _POOL = PooledDB(
                    creator=pymysql,
                    mincached=2,        # pre-open 2 connections so the first click skips the cold handshake
                    maxcached=5,        # upper bound on idle connections kept in the pool
                    maxconnections=20,  # cap on connections in use at once
                    blocking=True,      # when exhausted, wait for a free connection instead of raising
                    ping=1,             # ping on checkout so connections dropped by the server are replaced
                    host=host,
                    port=int(os.getenv("MYSQL_PORT", 3306)),
                    user=os.getenv("MYSQL_USER"),
                    password=os.getenv("MYSQL_PASSWORD"),
                    database=os.getenv("MYSQL_DATABASE"),
                    charset="utf8mb4",
                    cursorclass=DictCursor,
                    autocommit=True,
                    connect_timeout=10,
                )
    return _POOL
|
|
|
|
|
|
|
|
|
def _conn():
    """Check out one connection from the shared pool.

    Call sites stay as they were (``with conn.cursor()`` followed by
    ``conn.close()``); on a pooled connection ``close()`` hands the
    connection back to the pool instead of tearing it down.
    """
    pool = _pool()
    return pool.connection()
|
|
|
|
|
|
|
|
|
# ── DDL ──────────────────────────────────────────────────────────────────────
|
|
|
@@ -267,6 +290,23 @@ def is_adopted(overall, evaluation, publish_time):
|
|
|
return True
|
|
|
|
|
|
|
|
|
def is_adopted_rel(overall, rel, publish_time):
    """Decide adoption from a pre-extracted relevance score.

    Lightweight sibling of ``is_adopted``: the caller supplies the relevance
    score directly (e.g. pulled out by SQL) instead of the whole evaluation
    blob. Intended to apply the same criteria as ``is_adopted``
    (NOTE(review): ``is_adopted`` is not visible here - confirm parity).

    Args:
        overall: overall score, or None when unknown.
        rel: relevance score as a number or numeric string; None or an
            unparsable value means "unknown" and is not held against the post.
        publish_time: passed to ``_recency_hard`` for the recency check.

    Returns:
        False if a known relevance is below 4, a known recency score is
        below 2, or a known overall score is below 6; True otherwise.
    """
    try:
        relevance = None if rel is None else float(rel)
    except (TypeError, ValueError):
        relevance = None
    if relevance is not None and relevance < 4:
        return False

    recency = _recency_hard(publish_time)
    if recency is not None and recency < 2:
        return False

    overall_too_low = overall is not None and float(overall) < 6
    return not overall_too_low
|
|
|
+
|
|
|
+
|
|
|
# ── search_process / search_tools ────────────────────────────────────────────
|
|
|
|
|
|
def upsert_search_posts(query_id, query_text, results, table="search_process"):
|
|
|
@@ -462,6 +502,11 @@ def fetch_process(case_id, version=None):
|
|
|
rows = cur.fetchall()
|
|
|
finally:
|
|
|
conn.close()
|
|
|
+ return _proc_payload(case_id, version, rows)
|
|
|
+
|
|
|
+
|
|
|
+def _proc_payload(case_id, version, rows):
|
|
|
+ """mode_process 行集 → {case_id, version, …, procedures:[...]}。无行返回 None。"""
|
|
|
if not rows:
|
|
|
return None
|
|
|
procedures = [{
|
|
|
@@ -538,6 +583,11 @@ def fetch_tools(case_id, version=None):
|
|
|
rows = cur.fetchall()
|
|
|
finally:
|
|
|
conn.close()
|
|
|
+ return _tools_payload(case_id, version, rows)
|
|
|
+
|
|
|
+
|
|
|
+def _tools_payload(case_id, version, rows):
|
|
|
+ """mode_tools 行集 → {case_id, version, …, tools:[...]}。无行返回 None。"""
|
|
|
if not rows:
|
|
|
return None
|
|
|
tools = [{
|
|
|
@@ -554,6 +604,33 @@ def fetch_tools(case_id, version=None):
|
|
|
"tool_count": len(tools), "tools": tools}
|
|
|
|
|
|
|
|
|
+# ── 点击帖子合一查询(单连接,最少往返;远程 RDS 每次往返 ~80ms,故按次数优化)──
|
|
|
+
|
|
|
def fetch_extract(mode, case_id, version=None):
    """Fetch the version list and one version's detail over a single connection.

    Args:
        mode: "tools" selects the tools table and payload builder; any other
            value is treated as "process".
        case_id: case whose rows are looked up.
        version: version to load; when falsy, the first entry of the version
            list (ordered by version descending) is used.

    Returns:
        ``{"versions": [...], "data": payload, "missing": bool}`` where
        ``missing`` is True when the payload builder returned None.
    """
    is_proc = mode != "tools"
    mtable = _mode_table("process" if is_proc else "tools")
    build_payload = _proc_payload if is_proc else _tools_payload
    rows = []
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(f"""SELECT version, COUNT(*) AS n, MAX(model) AS model
                           FROM {mtable} WHERE case_id=%s
                           GROUP BY version ORDER BY version DESC""", (case_id,))
            versions = cur.fetchall()
            # The version list is already sorted newest-first, so the default
            # target comes from it rather than from an extra lookup query.
            if version:
                target = version
            elif versions:
                target = versions[0]["version"]
            else:
                target = None
            if target is not None:
                cur.execute(
                    f"SELECT * FROM {mtable} WHERE case_id=%s AND version=%s ORDER BY id",
                    (case_id, target),
                )
                rows = cur.fetchall()
    finally:
        # Runs on success and on error alike.
        conn.close()
    payload = build_payload(case_id, target, rows)
    return {"versions": versions, "data": payload, "missing": payload is None}
|
|
|
+
|
|
|
+
|
|
|
# ── 跨 query 去重 / link 复制(方案A:解构前先去重,避免重复花钱)──────────────
|
|
|
# case_id 是帖子物理身份(platform_channelContentId),与 query 无关。同一帖被多个
|
|
|
# query 搜到时只需真实解构一次;其余 query 用 link_* 复制行补齐关联(cost=0)。
|
|
|
@@ -613,15 +690,23 @@ def link_process(query_id, case_id, mode="process"):
|
|
|
|
|
|
# ── Dashboard 原始行(指标计算在 server.py)─────────────────────────────────────
|
|
|
|
|
|
# The adoption check only needs the "和内容制作知识相关" relevance score, so it is
# pulled out as a single scalar with JSON_EXTRACT instead of shipping the whole
# llm_evaluation column (~1.5MB in this database) to Python for parsing.
# The score may be stored as a bare number or wrapped as {"得分": x}; COALESCE
# over the two paths covers both layouts.
_REL_PATH = '$."相关性"."和内容制作知识相关"'
_REL_SQL = (
    "JSON_UNQUOTE(COALESCE("
    f"JSON_EXTRACT(llm_evaluation,'{_REL_PATH}.\"得分\"'),"
    f"JSON_EXTRACT(llm_evaluation,'{_REL_PATH}')))"
)
|
|
|
+
|
|
|
+
|
|
|
def fetch_dashboard_rows():
|
|
|
- """拉 Dashboard 计算所需的轻量行。数据量级:百~千行,Python 聚合足够。"""
|
|
|
+ """拉 Dashboard 计算所需的轻量行。数据量级:百~千行,Python 聚合足够。
|
|
|
+ 优化:① 不传 llm_evaluation 整块,SQL 只取采纳判定要的相关性得分;
|
|
|
+ ② steps 只取每个 case 的最新版本(覆盖度只看最新版),历史/link_ 版本不传 steps。"""
|
|
|
conn = _conn()
|
|
|
try:
|
|
|
with conn.cursor() as cur:
|
|
|
- # 进度分母走「采纳」口径,需带上 is_adopted 判定所需字段;
|
|
|
- # mode 标方向(工序帖来自 search_process,工具帖来自 search_tools)。
|
|
|
- cols = ("query_id, case_id, platform, knowledge_type, "
|
|
|
- "overall_score, publish_time, llm_evaluation")
|
|
|
+ # 进度分母走「采纳」口径;mode 标方向(工序帖来自 search_process)。
|
|
|
+ cols = f"query_id, case_id, platform, overall_score, publish_time, {_REL_SQL} AS rel"
|
|
|
cur.execute(f"SELECT {cols} FROM search_process")
|
|
|
posts = cur.fetchall()
|
|
|
for p in posts:
|
|
|
@@ -631,8 +716,14 @@ def fetch_dashboard_rows():
|
|
|
for p in st:
|
|
|
p["mode"] = "tools"
|
|
|
posts += st
|
|
|
- cur.execute("""SELECT case_id, version, steps, tools_used, cost_usd,
|
|
|
- duration_s, created_at FROM mode_process""")
|
|
|
+ # 成本/耗时按全部版本计;steps 仅最新版需要 → 非最新版只回 NULL,省传输。
|
|
|
+ cur.execute("""SELECT p.case_id, p.version, p.cost_usd, p.duration_s, p.created_at,
|
|
|
+ CASE WHEN p.version = m.maxv THEN p.steps END AS steps
|
|
|
+ FROM mode_process p
|
|
|
+ JOIN (SELECT case_id, MAX(version) AS maxv
|
|
|
+ FROM mode_process GROUP BY case_id) m
|
|
|
+ ON p.case_id = m.case_id
|
|
|
+ ORDER BY p.id""")
|
|
|
procs = cur.fetchall()
|
|
|
cur.execute("""SELECT case_id, version, tool_name, substance_scope,
|
|
|
form_scope, cost_usd, duration_s, created_at
|
|
|
@@ -641,13 +732,10 @@ def fetch_dashboard_rows():
|
|
|
finally:
|
|
|
conn.close()
|
|
|
for p in posts:
|
|
|
- p["knowledge_type"] = _loads(p["knowledge_type"], [])
|
|
|
# 采纳判定:口径同帖子列表(is_adopted),作为「需解构」分母依据
|
|
|
- p["adopted"] = is_adopted(
|
|
|
- p["overall_score"], _loads(p["llm_evaluation"]), p["publish_time"])
|
|
|
+ p["adopted"] = is_adopted_rel(p["overall_score"], p["rel"], p["publish_time"])
|
|
|
for r in procs:
|
|
|
r["steps"] = _loads(r["steps"], [])
|
|
|
- r["tools_used"] = _loads(r["tools_used"], [])
|
|
|
r["cost_usd"] = float(r["cost_usd"]) if r["cost_usd"] is not None else None
|
|
|
r["created_at"] = str(r["created_at"]) if r["created_at"] else None
|
|
|
for r in tools:
|