db.py 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797
  1. # -*- coding: utf-8 -*-
  2. """mode_workflow · MySQL 持久化(DB 为唯一事实源)
  3. ================================================================================
  4. 读 .env 的 MYSQL_* 连接 MySQL。四张表:
  5. search_process —— 每行一个 (query, 帖子):工序方向的搜索 + llm 评估结果
  6. search_tools —— 同结构,工具方向的搜索结果(方向由表区分,不再用 mode_type 列)
  7. mode_process —— 每行一个解构出的工序(steps 等嵌套结构存 JSON 列)
  8. mode_tools —— 每行一个解构出的工具
  9. 与旧 fixed_query_eval/db.py 的关键差异:本系统 DB 是主存储,写入失败直接 raise,
  10. 不做"失败不阻断"。读侧保留防御(返回空/None)。
  11. 用法:
  12. python db.py init # 建表(幂等)
  13. python db.py check # 打印四表行数
  14. python db.py clear # 清空四表数据(TRUNCATE)
  15. """
  16. import json
  17. import os
  18. import sys
  19. from datetime import datetime
  20. from pathlib import Path
  21. PROJECT_ROOT = Path(__file__).resolve().parents[2]
  22. sys.path.insert(0, str(PROJECT_ROOT))
  23. from dotenv import load_dotenv
  24. load_dotenv()
  25. import pymysql
  26. from pymysql.cursors import DictCursor
  27. from dbutils.pooled_db import PooledDB
  28. # ── 连接池 ──────────────────────────────────────────────────────────────────
  29. # MySQL 是远程 RDS,每次 pymysql.connect() 的 TCP+鉴权握手 ~0.5s。旧实现每个
  30. # 请求新建一条连接,一次"点开帖子"要 2~3 个请求 = 2~3 次握手 ≈ 1s。改用连接池
  31. # 复用长连接后,握手只在池初始化时各发生一次,后续取连接近乎零开销。
  32. # server.py 是 ThreadingHTTPServer(每请求一线程),PooledDB 线程安全,正好匹配。
  33. # 注意:fetch_* 里的 conn.close() 在池连接上语义是"归还池中"而非真正断开。
  34. _POOL = None
  35. def _pool():
  36. global _POOL
  37. if _POOL is None:
  38. if not os.getenv("MYSQL_HOST"):
  39. raise RuntimeError("缺 MYSQL_HOST:检查 .env 的 MYSQL_* 配置")
  40. _POOL = PooledDB(
  41. creator=pymysql,
  42. mincached=2, # 启动即预热 2 条,首点不再吃冷握手
  43. maxcached=5, # 空闲保留上限
  44. maxconnections=20, # 并发上限(ThreadingHTTPServer 线程数)
  45. blocking=True, # 连接耗尽时等待而非报错
  46. ping=1, # 取用前 ping,自动剔除被 RDS 掐断的死连接
  47. host=os.getenv("MYSQL_HOST"),
  48. port=int(os.getenv("MYSQL_PORT", 3306)),
  49. user=os.getenv("MYSQL_USER"),
  50. password=os.getenv("MYSQL_PASSWORD"),
  51. database=os.getenv("MYSQL_DATABASE"),
  52. charset="utf8mb4", cursorclass=DictCursor,
  53. autocommit=True, connect_timeout=10,
  54. )
  55. return _POOL
  56. def _conn():
  57. """从池取一条连接;用法不变(with cursor / conn.close() 归还池)。"""
  58. return _pool().connection()
  59. # ── DDL ──────────────────────────────────────────────────────────────────────
# Mode name -> table name. These two maps are the whitelist used by
# _search_table() / _mode_table() before a table name is put into SQL text.
SEARCH_TABLES = {"process": "search_process", "tools": "search_tools"}
MODE_TABLES = {"process": "mode_process", "tools": "mode_tools"}
  62. def _search_table(mode_or_table):
  63. """mode(process/tools)或表名 → 合法搜索表名(白名单,防 SQL 注入)。"""
  64. t = SEARCH_TABLES.get(mode_or_table, mode_or_table)
  65. if t not in SEARCH_TABLES.values():
  66. raise ValueError(f"未知搜索表/模式: {mode_or_table!r}")
  67. return t
  68. def _mode_table(mode_or_table):
  69. """mode(process/tools)或表名 → 合法解构表名(白名单,防 SQL 注入)。"""
  70. t = MODE_TABLES.get(mode_or_table, mode_or_table)
  71. if t not in MODE_TABLES.values():
  72. raise ValueError(f"未知解构表/模式: {mode_or_table!r}")
  73. return t
def _ddl_search(table, direction):
    """Build the CREATE TABLE statement for one search-result table.

    Args:
        table: table name, interpolated verbatim into the statement.
        direction: label interpolated into the table COMMENT.

    Returns:
        The DDL text. It uses IF NOT EXISTS, and declares (query_id, case_id)
        as a unique key.
    """
    return f"""
    CREATE TABLE IF NOT EXISTS {table} (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    query_id VARCHAR(32) NOT NULL COMMENT 'q0000',
    query_text VARCHAR(512) NULL,
    case_id VARCHAR(128) NOT NULL COMMENT 'platform_channelContentId',
    platform VARCHAR(32) NULL,
    channel_content_id VARCHAR(128) NULL,
    title VARCHAR(512) NULL,
    url VARCHAR(1024) NULL,
    content_type VARCHAR(32) NULL,
    body LONGTEXT NULL,
    images JSON NULL,
    videos JSON NULL,
    like_count INT NULL,
    publish_time VARCHAR(64) NULL,
    quality_score FLOAT NULL COMMENT 'post._quality_score',
    quality_grade VARCHAR(8) NULL,
    found_by JSON NULL COMMENT '命中的措辞数组',
    knowledge_type JSON NULL COMMENT '["能力","工序","工具"] 子集',
    overall_score FLOAT NULL COMMENT '(相关均值+质量均值)/2',
    llm_evaluation JSON NULL COMMENT '评估全量 blob',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    UNIQUE KEY uk_qid_case (query_id, case_id),
    KEY idx_platform (platform)
    ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='搜索+评估结果({direction})';
    """
# DDL for mode_process: one row per extracted procedure. Nested structures
# (source, declarations, type_registry, steps, tools_used) are JSON columns.
# There is no unique key; rows are looked up via (case_id, version) and query_id.
DDL_PROCESS = """
CREATE TABLE IF NOT EXISTS mode_process (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
query_id VARCHAR(32) NOT NULL,
case_id VARCHAR(128) NOT NULL,
platform VARCHAR(32) NULL,
post_title VARCHAR(512) NULL,
source JSON NULL COMMENT '解构返回的 source 块',
procedure_id VARCHAR(16) NULL COMMENT 'p1,p2…',
name VARCHAR(255) NULL,
purpose TEXT NULL,
category VARCHAR(32) NULL COMMENT '产物创造/资产建设/自动化/分析/学习',
declarations JSON NULL,
type_registry JSON NULL,
steps JSON NULL COMMENT '步骤数组全量',
step_count INT NULL,
tools_used JSON NULL COMMENT '从 steps[].via 去重提取',
model VARCHAR(64) NULL,
version VARCHAR(32) NULL COMMENT 'v_MMDDHHMM,保留历史;link_* 为跨 query 复制(cost=0)',
cost_usd DECIMAL(10,6) NULL COMMENT '本次解构调用成本(同版本各行相同,聚合需按 case+version 去重)',
duration_s FLOAT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
KEY idx_case_ver (case_id, version),
KEY idx_qid (query_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工序解构结果(每行一个工序)';
"""
# DDL for mode_tools: one row per extracted tool. Scope, usage, cases and
# defects are JSON columns. Indexed by (case_id, version), query_id and tool_name.
DDL_TOOLS = """
CREATE TABLE IF NOT EXISTS mode_tools (
id BIGINT AUTO_INCREMENT PRIMARY KEY,
query_id VARCHAR(32) NOT NULL,
case_id VARCHAR(128) NOT NULL,
platform VARCHAR(32) NULL,
post_title VARCHAR(512) NULL,
tool_name VARCHAR(255) NULL,
substance_scope JSON NULL COMMENT '实质作用域(数组)',
form_scope JSON NULL COMMENT '形式作用域(数组或null)',
creation_layer VARCHAR(32) NULL COMMENT '制作层/创作层',
source_link VARCHAR(1024) NULL,
input_desc TEXT NULL,
output_desc TEXT NULL,
usage_json JSON NULL,
cases_json JSON NULL,
defects_json JSON NULL,
updated_time VARCHAR(64) NULL COMMENT '工具最新更新时间',
model VARCHAR(64) NULL,
version VARCHAR(32) NULL COMMENT 'v_MMDDHHMM;link_* 为跨 query 复制(cost=0)',
cost_usd DECIMAL(10,6) NULL COMMENT '同 mode_process,聚合按 case+version 去重',
duration_s FLOAT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
KEY idx_case_ver (case_id, version),
KEY idx_qid (query_id),
KEY idx_tool_name (tool_name)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工具解构结果(每行一个工具)';
"""
  157. def init_tables():
  158. conn = _conn()
  159. try:
  160. with conn.cursor() as cur:
  161. cur.execute(_ddl_search("search_process", "工序方向"))
  162. cur.execute(_ddl_search("search_tools", "工具方向"))
  163. cur.execute(DDL_PROCESS)
  164. cur.execute(DDL_TOOLS)
  165. # 历史库迁移:version 由 VARCHAR(16) 放宽到 32,容纳 link_v_mopN_* 复制版本。
  166. # MODIFY 幂等(已是 32 则 MySQL 元数据无操作),建表后表必存在,可安全执行。
  167. for t in ("mode_process", "mode_tools"):
  168. cur.execute(f"ALTER TABLE {t} MODIFY COLUMN version VARCHAR(32) NULL")
  169. print("✅ 建表完成:search_process, search_tools, mode_process, mode_tools")
  170. finally:
  171. conn.close()
  172. def clear_tables():
  173. """清空四张表的数据(TRUNCATE,表结构保留)。"""
  174. conn = _conn()
  175. try:
  176. with conn.cursor() as cur:
  177. for t in ("search_process", "search_tools", "mode_process", "mode_tools"):
  178. cur.execute(f"TRUNCATE TABLE {t}")
  179. print(f"🧹 已清空 {t}")
  180. finally:
  181. conn.close()
  182. # ── 工具函数 ──────────────────────────────────────────────────────────────────
  183. def _loads(v, default=None):
  184. """pymysql 的 JSON 列可能返回字符串,统一解析。"""
  185. if v is None:
  186. return default
  187. if isinstance(v, (list, dict)):
  188. return v
  189. try:
  190. return json.loads(v)
  191. except Exception:
  192. return default
  193. def _j(v):
  194. """写入 JSON 列:None 保持 NULL,其余 dumps。"""
  195. return None if v is None else json.dumps(v, ensure_ascii=False)
  196. def _collect_scores(node):
  197. """递归收集嵌套评估里所有「得分」。LLM 直出的得分多为字符串("1"/"4"),
  198. 个别为数字(如 时效性 10),统一按 float 解析;非数值(如 "N/A")跳过不计入。"""
  199. out = []
  200. if isinstance(node, dict):
  201. for k, v in node.items():
  202. if k == "得分":
  203. try:
  204. out.append(float(v))
  205. except (TypeError, ValueError):
  206. pass
  207. else:
  208. out.extend(_collect_scores(v))
  209. elif isinstance(node, list):
  210. for v in node:
  211. out.extend(_collect_scores(v))
  212. return out
  213. def overall_score(e):
  214. """综合分 = (相关性各项均值 + 质量各项均值) / 可得部分数。算不出返回 None。"""
  215. parts = []
  216. for key in ("相关性", "质量"):
  217. scores = _collect_scores((e or {}).get(key))
  218. if scores:
  219. parts.append(sum(scores) / len(scores))
  220. return round(sum(parts) / len(parts), 2) if parts else None
  221. def _recency_hard(date_str):
  222. """硬时效(同 mode_procedure/server.py:_recency_hard):半年内=3 / 两年内=2 / 更早=1。
  223. publish_time 头 10 字符按 YYYY-MM-DD 解析,失败返回 None(不参与判定)。"""
  224. try:
  225. d = datetime.strptime(str(date_str or "")[:10], "%Y-%m-%d")
  226. except (ValueError, TypeError):
  227. return None
  228. days = (datetime.now() - d).days
  229. if days <= 180:
  230. return 3
  231. if days <= 730:
  232. return 2
  233. return 1
  234. def is_adopted(overall, evaluation, publish_time):
  235. """采纳/命中判定,口径对齐 mode_procedure 的 decision=="report":
  236. 制作相关性<4、发布超两年、综合分<6 —— 任一命中即不采纳;指标缺失不参与判定。"""
  237. rel = None
  238. v = ((evaluation or {}).get("相关性") or {}).get("和内容制作知识相关")
  239. if isinstance(v, dict):
  240. v = v.get("得分")
  241. try:
  242. rel = float(v) if v is not None else None
  243. except (TypeError, ValueError):
  244. rel = None
  245. if rel is not None and rel < 4:
  246. return False
  247. rh = _recency_hard(publish_time)
  248. if rh is not None and rh < 2:
  249. return False
  250. if overall is not None and float(overall) < 6:
  251. return False
  252. return True
  253. def is_adopted_rel(overall, rel, publish_time):
  254. """is_adopted 的轻量版:相关性得分(rel)已由 SQL JSON_EXTRACT 直接取出,
  255. 无需传输/解析整块 llm_evaluation。判定口径与 is_adopted 完全一致。"""
  256. try:
  257. rel = float(rel) if rel is not None else None
  258. except (TypeError, ValueError):
  259. rel = None
  260. if rel is not None and rel < 4:
  261. return False
  262. rh = _recency_hard(publish_time)
  263. if rh is not None and rh < 2:
  264. return False
  265. if overall is not None and float(overall) < 6:
  266. return False
  267. return True
  268. # ── search_process / search_tools ────────────────────────────────────────────
  269. def upsert_search_posts(query_id, query_text, results, table="search_process"):
  270. """一组搜索结果写入指定搜索表(按 (query_id, case_id) upsert)。返回写入条数。
  271. table:search_process(工序方向) / search_tools(工具方向)。"""
  272. table = _search_table(table)
  273. if not results:
  274. return 0
  275. rows = []
  276. for r in results:
  277. post = r.get("post") or {}
  278. e = r.get("llm_evaluation") or {}
  279. rows.append((
  280. query_id, query_text, r.get("case_id"), r.get("platform"),
  281. r.get("channel_content_id"),
  282. (post.get("title") or post.get("desc") or "")[:500],
  283. r.get("source_url"), post.get("content_type"),
  284. post.get("body_text") or post.get("desc") or "",
  285. _j(post.get("images") or []), _j(post.get("videos") or []),
  286. post.get("like_count"),
  287. str(post.get("publish_time") or post.get("publish_timestamp") or "")[:64],
  288. post.get("_quality_score"), post.get("_quality_grade"),
  289. _j(r.get("found_by_queries") or []),
  290. _j(e.get("知识类型") or []),
  291. overall_score(e),
  292. _j(e),
  293. ))
  294. sql = f"""
  295. INSERT INTO {table}
  296. (query_id, query_text, case_id, platform, channel_content_id, title, url,
  297. content_type, body, images, videos, like_count, publish_time,
  298. quality_score, quality_grade, found_by, knowledge_type,
  299. overall_score, llm_evaluation)
  300. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  301. ON DUPLICATE KEY UPDATE
  302. query_text=VALUES(query_text), platform=VALUES(platform),
  303. channel_content_id=VALUES(channel_content_id), title=VALUES(title), url=VALUES(url),
  304. content_type=VALUES(content_type), body=VALUES(body), images=VALUES(images),
  305. videos=VALUES(videos), like_count=VALUES(like_count), publish_time=VALUES(publish_time),
  306. quality_score=VALUES(quality_score), quality_grade=VALUES(quality_grade),
  307. found_by=VALUES(found_by), knowledge_type=VALUES(knowledge_type),
  308. overall_score=VALUES(overall_score), llm_evaluation=VALUES(llm_evaluation);
  309. """
  310. conn = _conn()
  311. try:
  312. with conn.cursor() as cur:
  313. cur.executemany(sql, rows)
  314. return len(rows)
  315. finally:
  316. conn.close()
  317. def fetch_queries(mode="process"):
  318. """某方向搜索表的 query 列表 + 帖子数 + 采纳/命中数 + 解构进度。"""
  319. table = _search_table(mode)
  320. conn = _conn()
  321. try:
  322. with conn.cursor() as cur:
  323. cur.execute(f"""SELECT query_id, MAX(query_text) AS query_text,
  324. COUNT(*) AS post_count
  325. FROM {table} GROUP BY query_id ORDER BY query_id""")
  326. queries = cur.fetchall()
  327. cur.execute(f"""SELECT query_id, overall_score, llm_evaluation, publish_time
  328. FROM {table}""")
  329. hits = {}
  330. for r in cur.fetchall():
  331. if is_adopted(r["overall_score"], _loads(r["llm_evaluation"]), r["publish_time"]):
  332. hits[r["query_id"]] = hits.get(r["query_id"], 0) + 1
  333. cur.execute("SELECT query_id, COUNT(DISTINCT case_id) AS n FROM mode_process GROUP BY query_id")
  334. np = {r["query_id"]: r["n"] for r in cur.fetchall()}
  335. cur.execute("SELECT query_id, COUNT(DISTINCT case_id) AS n FROM mode_tools GROUP BY query_id")
  336. nt = {r["query_id"]: r["n"] for r in cur.fetchall()}
  337. finally:
  338. conn.close()
  339. for q in queries:
  340. q["hit_count"] = hits.get(q["query_id"], 0)
  341. q["process_done"] = np.get(q["query_id"], 0)
  342. q["tools_done"] = nt.get(q["query_id"], 0)
  343. return queries
  344. def fetch_posts(query_id, mode="process"):
  345. """某方向搜索表里某 query 的全部帖子(JSON 列已解析),带 has_process/has_tools 标记。"""
  346. table = _search_table(mode)
  347. conn = _conn()
  348. try:
  349. with conn.cursor() as cur:
  350. cur.execute(f"""SELECT * FROM {table} WHERE query_id=%s
  351. ORDER BY overall_score DESC, id""", (query_id,))
  352. rows = cur.fetchall()
  353. cur.execute("SELECT DISTINCT case_id FROM mode_process WHERE query_id=%s", (query_id,))
  354. hp = {r["case_id"] for r in cur.fetchall()}
  355. cur.execute("SELECT DISTINCT case_id FROM mode_tools WHERE query_id=%s", (query_id,))
  356. ht = {r["case_id"] for r in cur.fetchall()}
  357. finally:
  358. conn.close()
  359. for r in rows:
  360. for col in ("images", "videos", "found_by", "knowledge_type", "llm_evaluation"):
  361. r[col] = _loads(r[col])
  362. r["adopted"] = is_adopted(r["overall_score"], r["llm_evaluation"], r["publish_time"])
  363. r["has_process"] = r["case_id"] in hp
  364. r["has_tools"] = r["case_id"] in ht
  365. r.pop("created_at", None); r.pop("updated_at", None)
  366. return rows
  367. def fetch_post(query_id, case_id, table="search_process"):
  368. """指定搜索表的单帖完整行(给 pipeline 脚本重建 source 用)。无则 None。"""
  369. table = _search_table(table)
  370. conn = _conn()
  371. try:
  372. with conn.cursor() as cur:
  373. cur.execute(f"SELECT * FROM {table} WHERE query_id=%s AND case_id=%s",
  374. (query_id, case_id))
  375. row = cur.fetchone()
  376. finally:
  377. conn.close()
  378. if not row:
  379. return None
  380. for col in ("images", "videos", "found_by", "knowledge_type", "llm_evaluation"):
  381. row[col] = _loads(row[col])
  382. return row
  383. # ── mode_process ─────────────────────────────────────────────────────────────
  384. def replace_process(query_id, case_id, platform, post_title, payload,
  385. model, version, cost_usd, duration_s):
  386. """写入一帖某版本的工序解构结果(payload = {source, procedures})。
  387. 删 (case_id, version) 旧行再插,同版本重跑幂等、跨版本保留历史。返回工序条数。"""
  388. source = payload.get("source")
  389. procedures = payload.get("procedures") or []
  390. conn = _conn()
  391. try:
  392. with conn.cursor() as cur:
  393. cur.execute("DELETE FROM mode_process WHERE case_id=%s AND version=%s",
  394. (case_id, version))
  395. if procedures:
  396. rows = []
  397. for p in procedures:
  398. steps = p.get("steps") or []
  399. vias = []
  400. for s in steps:
  401. v = s.get("via")
  402. if v and v not in vias:
  403. vias.append(v)
  404. rows.append((
  405. query_id, case_id, platform, (post_title or "")[:500],
  406. _j(source), p.get("id"), (p.get("name") or "")[:250],
  407. p.get("purpose"), p.get("category"),
  408. _j(p.get("declarations")), _j(p.get("type_registry")),
  409. _j(steps), len(steps), _j(vias),
  410. model, version, cost_usd, duration_s,
  411. ))
  412. cur.executemany("""
  413. INSERT INTO mode_process
  414. (query_id, case_id, platform, post_title, source, procedure_id, name,
  415. purpose, category, declarations, type_registry, steps, step_count,
  416. tools_used, model, version, cost_usd, duration_s)
  417. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  418. """, rows)
  419. return len(procedures)
  420. finally:
  421. conn.close()
  422. def fetch_process_versions(case_id):
  423. conn = _conn()
  424. try:
  425. with conn.cursor() as cur:
  426. cur.execute("""SELECT version, COUNT(*) AS n, MAX(model) AS model
  427. FROM mode_process WHERE case_id=%s
  428. GROUP BY version ORDER BY version DESC""", (case_id,))
  429. return cur.fetchall()
  430. finally:
  431. conn.close()
  432. def fetch_process(case_id, version=None):
  433. """重建 {case_id, version, model, source, procedures:[...]}。version=None 取最新。"""
  434. conn = _conn()
  435. try:
  436. with conn.cursor() as cur:
  437. if version is None:
  438. cur.execute("""SELECT version FROM mode_process WHERE case_id=%s
  439. ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
  440. row = cur.fetchone()
  441. if not row:
  442. return None
  443. version = row["version"]
  444. cur.execute("""SELECT * FROM mode_process WHERE case_id=%s AND version=%s
  445. ORDER BY id""", (case_id, version))
  446. rows = cur.fetchall()
  447. finally:
  448. conn.close()
  449. return _proc_payload(case_id, version, rows)
  450. def _proc_payload(case_id, version, rows):
  451. """mode_process 行集 → {case_id, version, …, procedures:[...]}。无行返回 None。"""
  452. if not rows:
  453. return None
  454. procedures = [{
  455. "id": r["procedure_id"], "name": r["name"], "purpose": r["purpose"],
  456. "category": r["category"], "declarations": _loads(r["declarations"]),
  457. "type_registry": _loads(r["type_registry"]), "steps": _loads(r["steps"], []),
  458. "tools_used": _loads(r["tools_used"], []),
  459. } for r in rows]
  460. return {"case_id": case_id, "version": version, "platform": rows[0]["platform"],
  461. "title": rows[0]["post_title"], "model": rows[0]["model"],
  462. "cost_usd": float(rows[0]["cost_usd"]) if rows[0]["cost_usd"] is not None else None,
  463. "duration_s": rows[0]["duration_s"],
  464. "source": _loads(rows[0]["source"]), "procedures": procedures}
  465. # ── mode_tools ───────────────────────────────────────────────────────────────
  466. def replace_tools(query_id, case_id, platform, post_title, tools,
  467. model, version, cost_usd, duration_s):
  468. """写入一帖某版本的工具解构结果。语义同 replace_process。返回工具条数。"""
  469. conn = _conn()
  470. try:
  471. with conn.cursor() as cur:
  472. cur.execute("DELETE FROM mode_tools WHERE case_id=%s AND version=%s",
  473. (case_id, version))
  474. if tools:
  475. rows = [(
  476. query_id, case_id, platform, (post_title or "")[:500],
  477. (t.get("工具名称") or "")[:250],
  478. _j(t.get("实质作用域")), _j(t.get("形式作用域")),
  479. t.get("创作层级"), t.get("来源链接"), t.get("输入"), t.get("输出"),
  480. _j(t.get("用法")), _j(t.get("案例")), _j(t.get("缺点")),
  481. t.get("最新更新时间"), model, version, cost_usd, duration_s,
  482. ) for t in tools]
  483. cur.executemany("""
  484. INSERT INTO mode_tools
  485. (query_id, case_id, platform, post_title, tool_name, substance_scope,
  486. form_scope, creation_layer, source_link, input_desc, output_desc,
  487. usage_json, cases_json, defects_json, updated_time, model, version,
  488. cost_usd, duration_s)
  489. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  490. """, rows)
  491. return len(tools)
  492. finally:
  493. conn.close()
  494. def fetch_tools_versions(case_id):
  495. conn = _conn()
  496. try:
  497. with conn.cursor() as cur:
  498. cur.execute("""SELECT version, COUNT(*) AS n, MAX(model) AS model
  499. FROM mode_tools WHERE case_id=%s
  500. GROUP BY version ORDER BY version DESC""", (case_id,))
  501. return cur.fetchall()
  502. finally:
  503. conn.close()
  504. def fetch_tools(case_id, version=None):
  505. """重建 {case_id, version, model, tool_count, tools:[...]}。version=None 取最新。"""
  506. conn = _conn()
  507. try:
  508. with conn.cursor() as cur:
  509. if version is None:
  510. cur.execute("""SELECT version FROM mode_tools WHERE case_id=%s
  511. ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
  512. row = cur.fetchone()
  513. if not row:
  514. return None
  515. version = row["version"]
  516. cur.execute("""SELECT * FROM mode_tools WHERE case_id=%s AND version=%s
  517. ORDER BY id""", (case_id, version))
  518. rows = cur.fetchall()
  519. finally:
  520. conn.close()
  521. return _tools_payload(case_id, version, rows)
  522. def _tools_payload(case_id, version, rows):
  523. """mode_tools 行集 → {case_id, version, …, tools:[...]}。无行返回 None。"""
  524. if not rows:
  525. return None
  526. tools = [{
  527. "工具名称": r["tool_name"], "实质作用域": _loads(r["substance_scope"]),
  528. "形式作用域": _loads(r["form_scope"]), "创作层级": r["creation_layer"],
  529. "来源链接": r["source_link"], "输入": r["input_desc"], "输出": r["output_desc"],
  530. "用法": _loads(r["usage_json"]), "案例": _loads(r["cases_json"]),
  531. "缺点": _loads(r["defects_json"]), "最新更新时间": r["updated_time"],
  532. } for r in rows]
  533. return {"case_id": case_id, "version": version, "platform": rows[0]["platform"],
  534. "title": rows[0]["post_title"], "model": rows[0]["model"],
  535. "cost_usd": float(rows[0]["cost_usd"]) if rows[0]["cost_usd"] is not None else None,
  536. "duration_s": rows[0]["duration_s"],
  537. "tool_count": len(tools), "tools": tools}
  538. # ── 点击帖子合一查询(单连接,最少往返;远程 RDS 每次往返 ~80ms,故按次数优化)──
  539. def fetch_extract(mode, case_id, version=None):
  540. """一次取版本列表 + 解构详情,复用同一条池连接、最少往返。
  541. 返回 {versions, data, missing}。mode: process / tools。"""
  542. is_proc = mode != "tools"
  543. mtable = _mode_table("process" if is_proc else "tools")
  544. conn = _conn()
  545. try:
  546. with conn.cursor() as cur:
  547. cur.execute(f"""SELECT version, COUNT(*) AS n, MAX(model) AS model
  548. FROM {mtable} WHERE case_id=%s
  549. GROUP BY version ORDER BY version DESC""", (case_id,))
  550. versions = cur.fetchall()
  551. # 详情:把"取最新版本"折进同一条 SQL,版本指定时直接用;省一次往返。
  552. target = version or (versions[0]["version"] if versions else None)
  553. rows = []
  554. if target is not None:
  555. cur.execute(f"SELECT * FROM {mtable} WHERE case_id=%s AND version=%s ORDER BY id",
  556. (case_id, target))
  557. rows = cur.fetchall()
  558. finally:
  559. conn.close()
  560. payload = (_proc_payload if is_proc else _tools_payload)(case_id, target, rows)
  561. return {"versions": versions, "data": payload, "missing": payload is None}
  562. # ── 跨 query 去重 / link 复制(方案A:解构前先去重,避免重复花钱)──────────────
  563. # case_id 是帖子物理身份(platform_channelContentId),与 query 无关。同一帖被多个
  564. # query 搜到时只需真实解构一次;其余 query 用 link_* 复制行补齐关联(cost=0)。
  565. def latest_real_version(case_id, mode="process"):
  566. """该 case 是否已有「真实」解构(任意 query;link_* 是复制品,不算源)。
  567. 返回最新一行 {"version","query_id"} 或 None。给解构前去重判定用。"""
  568. table = _mode_table(mode)
  569. conn = _conn()
  570. try:
  571. with conn.cursor() as cur:
  572. cur.execute(f"""SELECT version, query_id FROM {table}
  573. WHERE case_id=%s AND LEFT(version,5) <> 'link_'
  574. ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
  575. return cur.fetchone()
  576. finally:
  577. conn.close()
  578. def link_process(query_id, case_id, mode="process"):
  579. """把 case 在别处最新「真实」版本的解构行复制到目标 query
  580. (version='link_'+源版本, cost_usd=0)。幂等(先删目标同版本)。
  581. 返回复制行数;该 case 从未真实解构过则返回 0(无源可复制)。"""
  582. table = _mode_table(mode)
  583. conn = _conn()
  584. try:
  585. with conn.cursor() as cur:
  586. cur.execute(f"""SELECT version FROM {table}
  587. WHERE case_id=%s AND LEFT(version,5) <> 'link_'
  588. ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
  589. r = cur.fetchone()
  590. if not r:
  591. return 0
  592. srcver = r["version"]
  593. newver = ("link_" + srcver)[:32] # version 列 VARCHAR(32)
  594. # 复制除自增 id / 时间戳外的全部列,改写 query_id / version / cost。
  595. cur.execute(f"SHOW COLUMNS FROM {table}")
  596. cols = [c["Field"] for c in cur.fetchall()
  597. if c["Field"] not in ("id", "created_at", "updated_at")]
  598. cur.execute(f"SELECT {','.join(cols)} FROM {table} WHERE case_id=%s AND version=%s",
  599. (case_id, srcver))
  600. rows = cur.fetchall()
  601. cur.execute(f"DELETE FROM {table} WHERE query_id=%s AND case_id=%s AND version=%s",
  602. (query_id, case_id, newver))
  603. for row in rows:
  604. row = dict(row)
  605. row["query_id"] = query_id
  606. row["version"] = newver
  607. row["cost_usd"] = 0
  608. cur.execute(
  609. f"INSERT INTO {table} ({','.join(cols)}) VALUES ({','.join(['%s']*len(cols))})",
  610. [row[k] for k in cols])
  611. return len(rows)
  612. finally:
  613. conn.close()
  614. # ── Dashboard 原始行(指标计算在 server.py)─────────────────────────────────────
# The adoption rule only needs the "和内容制作知识相关" score, so this SQL
# expression extracts that single value with JSON_EXTRACT instead of pulling
# the whole llm_evaluation blob into Python. The score may be stored bare or
# wrapped as {"得分": x}; COALESCE tries the wrapped path first, then the
# bare one.
_REL_SQL = ("JSON_UNQUOTE(COALESCE("
            "JSON_EXTRACT(llm_evaluation,'$.\"相关性\".\"和内容制作知识相关\".\"得分\"'),"
            "JSON_EXTRACT(llm_evaluation,'$.\"相关性\".\"和内容制作知识相关\"')))")
  621. def fetch_adopted_process_cases(query_id=None):
  622. """返回「已采纳且有工序解构」的 case_id 列表(供知识上传脚本用)。
  623. 采纳是帖子级属性(评估存在 search_process),工序解构存在 mode_process,故二者 JOIN:
  624. 只取两边都有的 case,再用 is_adopted_rel(口径同 Dashboard)在 Python 侧过滤。
  625. relevance 得分由 _REL_SQL 直取标量,不传整块 llm_evaluation。
  626. query_id 给定时只看该搜索任务下的 case。返回去重、按 case_id 排序的列表。
  627. """
  628. sql = (f"SELECT DISTINCT s.case_id, s.overall_score, s.publish_time, "
  629. f"{_REL_SQL} AS rel "
  630. "FROM search_process s "
  631. "JOIN (SELECT DISTINCT case_id FROM mode_process) m ON s.case_id = m.case_id")
  632. params = ()
  633. if query_id:
  634. sql += " WHERE s.query_id=%s"
  635. params = (query_id,)
  636. conn = _conn()
  637. try:
  638. with conn.cursor() as cur:
  639. cur.execute(sql, params)
  640. rows = cur.fetchall()
  641. finally:
  642. conn.close()
  643. cases = [r["case_id"] for r in rows
  644. if is_adopted_rel(r["overall_score"], r["rel"], r["publish_time"])]
  645. return sorted(set(cases))
  646. def fetch_dashboard_rows():
  647. """拉 Dashboard 计算所需的轻量行。数据量级:百~千行,Python 聚合足够。
  648. 优化:① 不传 llm_evaluation 整块,SQL 只取采纳判定要的相关性得分;
  649. ② steps 只取每个 case 的最新版本(覆盖度只看最新版),历史/link_ 版本不传 steps。"""
  650. conn = _conn()
  651. try:
  652. with conn.cursor() as cur:
  653. # 进度分母走「采纳」口径;mode 标方向(工序帖来自 search_process)。
  654. cols = f"query_id, case_id, platform, overall_score, publish_time, {_REL_SQL} AS rel"
  655. cur.execute(f"SELECT {cols} FROM search_process")
  656. posts = cur.fetchall()
  657. for p in posts:
  658. p["mode"] = "process"
  659. cur.execute(f"SELECT {cols} FROM search_tools")
  660. st = cur.fetchall()
  661. for p in st:
  662. p["mode"] = "tools"
  663. posts += st
  664. # 成本/耗时按全部版本计;steps 仅最新版需要 → 非最新版只回 NULL,省传输。
  665. cur.execute("""SELECT p.case_id, p.version, p.cost_usd, p.duration_s, p.created_at,
  666. CASE WHEN p.version = m.maxv THEN p.steps END AS steps
  667. FROM mode_process p
  668. JOIN (SELECT case_id, MAX(version) AS maxv
  669. FROM mode_process GROUP BY case_id) m
  670. ON p.case_id = m.case_id
  671. ORDER BY p.id""")
  672. procs = cur.fetchall()
  673. cur.execute("""SELECT case_id, version, tool_name, substance_scope,
  674. form_scope, cost_usd, duration_s, created_at
  675. FROM mode_tools""")
  676. tools = cur.fetchall()
  677. finally:
  678. conn.close()
  679. for p in posts:
  680. # 采纳判定:口径同帖子列表(is_adopted),作为「需解构」分母依据
  681. p["adopted"] = is_adopted_rel(p["overall_score"], p["rel"], p["publish_time"])
  682. for r in procs:
  683. r["steps"] = _loads(r["steps"], [])
  684. r["cost_usd"] = float(r["cost_usd"]) if r["cost_usd"] is not None else None
  685. r["created_at"] = str(r["created_at"]) if r["created_at"] else None
  686. for r in tools:
  687. r["substance_scope"] = _loads(r["substance_scope"], [])
  688. r["form_scope"] = _loads(r["form_scope"], [])
  689. r["cost_usd"] = float(r["cost_usd"]) if r["cost_usd"] is not None else None
  690. r["created_at"] = str(r["created_at"]) if r["created_at"] else None
  691. return posts, procs, tools
  692. def check():
  693. conn = _conn()
  694. try:
  695. with conn.cursor() as cur:
  696. for t in ("search_process", "search_tools", "mode_process", "mode_tools"):
  697. cur.execute(f"SELECT COUNT(*) AS n FROM {t}")
  698. print(f"{t}: {cur.fetchone()['n']} 行")
  699. finally:
  700. conn.close()
  701. if __name__ == "__main__":
  702. cmd = sys.argv[1] if len(sys.argv) > 1 else ""
  703. if cmd == "init":
  704. init_tables()
  705. elif cmd == "check":
  706. check()
  707. elif cmd == "clear":
  708. clear_tables()
  709. else:
  710. print("用法:\n python db.py init # 建表\n python db.py check # 四表行数\n python db.py clear # 清空四表数据")