db.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513
  1. # -*- coding: utf-8 -*-
  2. """mode_workflow · MySQL 持久化(DB 为唯一事实源)
  3. ================================================================================
  4. 读 .env 的 MYSQL_* 连接 MySQL。三张表:
  5. search_data —— 每行一个 (query, 帖子):搜索 + llm 评估结果
  6. mode_process —— 每行一个解构出的工序(steps 等嵌套结构存 JSON 列)
  7. mode_tools —— 每行一个解构出的工具
  8. 与旧 fixed_query_eval/db.py 的关键差异:本系统 DB 是主存储,写入失败直接 raise,
  9. 不做"失败不阻断"。读侧保留防御(返回空/None)。
  10. 用法:
  11. python db.py init # 建表(幂等)
  12. python db.py check # 打印三表行数
  13. """
  14. import json
  15. import os
  16. import sys
  17. from pathlib import Path
  18. PROJECT_ROOT = Path(__file__).resolve().parents[2]
  19. sys.path.insert(0, str(PROJECT_ROOT))
  20. from dotenv import load_dotenv
  21. load_dotenv()
  22. import pymysql
  23. from pymysql.cursors import DictCursor
  24. def _conn():
  25. if not os.getenv("MYSQL_HOST"):
  26. raise RuntimeError("缺 MYSQL_HOST:检查 .env 的 MYSQL_* 配置")
  27. return pymysql.connect(
  28. host=os.getenv("MYSQL_HOST"),
  29. port=int(os.getenv("MYSQL_PORT", 3306)),
  30. user=os.getenv("MYSQL_USER"),
  31. password=os.getenv("MYSQL_PASSWORD"),
  32. database=os.getenv("MYSQL_DATABASE"),
  33. charset="utf8mb4", cursorclass=DictCursor,
  34. autocommit=True, connect_timeout=10,
  35. )
  36. # ── DDL ──────────────────────────────────────────────────────────────────────
# Schema for search_data: one row per (query, post) pair holding the search hit
# together with its LLM evaluation. (query_id, case_id) is unique, which is what
# the upsert in upsert_search_posts keys on. CREATE ... IF NOT EXISTS keeps
# table creation safe to repeat.
DDL_SEARCH = """
CREATE TABLE IF NOT EXISTS search_data (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    query_id VARCHAR(32) NOT NULL COMMENT 'q0000',
    query_text VARCHAR(512) NULL,
    case_id VARCHAR(128) NOT NULL COMMENT 'platform_channelContentId',
    platform VARCHAR(32) NULL,
    channel_content_id VARCHAR(128) NULL,
    title VARCHAR(512) NULL,
    url VARCHAR(1024) NULL,
    content_type VARCHAR(32) NULL,
    body LONGTEXT NULL,
    images JSON NULL,
    videos JSON NULL,
    like_count INT NULL,
    publish_time VARCHAR(64) NULL,
    quality_score FLOAT NULL COMMENT 'post._quality_score',
    quality_grade VARCHAR(8) NULL,
    found_by JSON NULL COMMENT '命中的措辞数组',
    knowledge_type JSON NULL COMMENT '["能力","工序","工具"] 子集',
    overall_score FLOAT NULL COMMENT '(相关均值+质量均值)/2',
    llm_evaluation JSON NULL COMMENT '评估全量 blob',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    UNIQUE KEY uk_qid_case (query_id, case_id),
    KEY idx_platform (platform)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='搜索+评估结果';
"""
# Schema for mode_process: one row per deconstructed procedure. Nested parts
# (source, declarations, type_registry, steps, tools_used) live in JSON columns.
# There is no unique key; rows are grouped by (case_id, version), and the
# cost_usd value is repeated on every row of the same run (see its COMMENT).
DDL_PROCESS = """
CREATE TABLE IF NOT EXISTS mode_process (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    query_id VARCHAR(32) NOT NULL,
    case_id VARCHAR(128) NOT NULL,
    platform VARCHAR(32) NULL,
    post_title VARCHAR(512) NULL,
    source JSON NULL COMMENT '解构返回的 source 块',
    procedure_id VARCHAR(16) NULL COMMENT 'p1,p2…',
    name VARCHAR(255) NULL,
    purpose TEXT NULL,
    category VARCHAR(32) NULL COMMENT '产物创造/资产建设/自动化/分析/学习',
    declarations JSON NULL,
    type_registry JSON NULL,
    steps JSON NULL COMMENT '步骤数组全量',
    step_count INT NULL,
    tools_used JSON NULL COMMENT '从 steps[].via 去重提取',
    model VARCHAR(64) NULL,
    version VARCHAR(16) NULL COMMENT 'v_MMDDHHMM,保留历史',
    cost_usd DECIMAL(10,6) NULL COMMENT '本次解构调用成本(同版本各行相同,聚合需按 case+version 去重)',
    duration_s FLOAT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    KEY idx_case_ver (case_id, version),
    KEY idx_qid (query_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工序解构结果(每行一个工序)';
"""
# Schema for mode_tools: one row per deconstructed tool. Scope, usage, cases
# and defects are stored as JSON columns. As with mode_process there is no
# unique key; rows are grouped by (case_id, version) and cost_usd is repeated
# on every row of the same run.
DDL_TOOLS = """
CREATE TABLE IF NOT EXISTS mode_tools (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    query_id VARCHAR(32) NOT NULL,
    case_id VARCHAR(128) NOT NULL,
    platform VARCHAR(32) NULL,
    post_title VARCHAR(512) NULL,
    tool_name VARCHAR(255) NULL,
    substance_scope JSON NULL COMMENT '实质作用域(数组)',
    form_scope JSON NULL COMMENT '形式作用域(数组或null)',
    creation_layer VARCHAR(32) NULL COMMENT '制作层/创作层',
    source_link VARCHAR(1024) NULL,
    input_desc TEXT NULL,
    output_desc TEXT NULL,
    usage_json JSON NULL,
    cases_json JSON NULL,
    defects_json JSON NULL,
    updated_time VARCHAR(64) NULL COMMENT '工具最新更新时间',
    model VARCHAR(64) NULL,
    version VARCHAR(16) NULL,
    cost_usd DECIMAL(10,6) NULL COMMENT '同 mode_process,聚合按 case+version 去重',
    duration_s FLOAT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    KEY idx_case_ver (case_id, version),
    KEY idx_qid (query_id),
    KEY idx_tool_name (tool_name)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工具解构结果(每行一个工具)';
"""
  119. def init_tables():
  120. conn = _conn()
  121. try:
  122. with conn.cursor() as cur:
  123. cur.execute(DDL_SEARCH)
  124. cur.execute(DDL_PROCESS)
  125. cur.execute(DDL_TOOLS)
  126. print("✅ 建表完成:search_data, mode_process, mode_tools")
  127. finally:
  128. conn.close()
  129. # ── 工具函数 ──────────────────────────────────────────────────────────────────
  130. def _loads(v, default=None):
  131. """pymysql 的 JSON 列可能返回字符串,统一解析。"""
  132. if v is None:
  133. return default
  134. if isinstance(v, (list, dict)):
  135. return v
  136. try:
  137. return json.loads(v)
  138. except Exception:
  139. return default
  140. def _j(v):
  141. """写入 JSON 列:None 保持 NULL,其余 dumps。"""
  142. return None if v is None else json.dumps(v, ensure_ascii=False)
  143. def _collect_scores(node):
  144. """递归收集嵌套评估里所有数值「得分」。"""
  145. out = []
  146. if isinstance(node, dict):
  147. for k, v in node.items():
  148. if k == "得分" and isinstance(v, (int, float)):
  149. out.append(float(v))
  150. else:
  151. out.extend(_collect_scores(v))
  152. elif isinstance(node, list):
  153. for v in node:
  154. out.extend(_collect_scores(v))
  155. return out
  156. def overall_score(e):
  157. """综合分 = (相关性各项均值 + 质量各项均值) / 可得部分数。算不出返回 None。"""
  158. parts = []
  159. for key in ("相关性", "质量"):
  160. scores = _collect_scores((e or {}).get(key))
  161. if scores:
  162. parts.append(sum(scores) / len(scores))
  163. return round(sum(parts) / len(parts), 2) if parts else None
  164. # ── search_data ──────────────────────────────────────────────────────────────
  165. def upsert_search_posts(query_id, query_text, results):
  166. """一组搜索结果写入 search_data(按 (query_id, case_id) upsert)。返回写入条数。"""
  167. if not results:
  168. return 0
  169. rows = []
  170. for r in results:
  171. post = r.get("post") or {}
  172. e = r.get("llm_evaluation") or {}
  173. rows.append((
  174. query_id, query_text, r.get("case_id"), r.get("platform"),
  175. r.get("channel_content_id"),
  176. (post.get("title") or post.get("desc") or "")[:500],
  177. r.get("source_url"), post.get("content_type"),
  178. post.get("body_text") or post.get("desc") or "",
  179. _j(post.get("images") or []), _j(post.get("videos") or []),
  180. post.get("like_count"),
  181. str(post.get("publish_time") or post.get("publish_timestamp") or "")[:64],
  182. post.get("_quality_score"), post.get("_quality_grade"),
  183. _j(r.get("found_by_queries") or []),
  184. _j(e.get("知识类型") or []),
  185. overall_score(e),
  186. _j(e),
  187. ))
  188. sql = """
  189. INSERT INTO search_data
  190. (query_id, query_text, case_id, platform, channel_content_id, title, url,
  191. content_type, body, images, videos, like_count, publish_time,
  192. quality_score, quality_grade, found_by, knowledge_type, overall_score, llm_evaluation)
  193. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  194. ON DUPLICATE KEY UPDATE
  195. query_text=VALUES(query_text), platform=VALUES(platform),
  196. channel_content_id=VALUES(channel_content_id), title=VALUES(title), url=VALUES(url),
  197. content_type=VALUES(content_type), body=VALUES(body), images=VALUES(images),
  198. videos=VALUES(videos), like_count=VALUES(like_count), publish_time=VALUES(publish_time),
  199. quality_score=VALUES(quality_score), quality_grade=VALUES(quality_grade),
  200. found_by=VALUES(found_by), knowledge_type=VALUES(knowledge_type),
  201. overall_score=VALUES(overall_score), llm_evaluation=VALUES(llm_evaluation);
  202. """
  203. conn = _conn()
  204. try:
  205. with conn.cursor() as cur:
  206. cur.executemany(sql, rows)
  207. return len(rows)
  208. finally:
  209. conn.close()
  210. def fetch_queries():
  211. """query 列表 + 帖子数 + 解构进度。"""
  212. conn = _conn()
  213. try:
  214. with conn.cursor() as cur:
  215. cur.execute("""SELECT query_id, MAX(query_text) AS query_text,
  216. COUNT(*) AS post_count
  217. FROM search_data GROUP BY query_id ORDER BY query_id""")
  218. queries = cur.fetchall()
  219. cur.execute("SELECT query_id, COUNT(DISTINCT case_id) AS n FROM mode_process GROUP BY query_id")
  220. np = {r["query_id"]: r["n"] for r in cur.fetchall()}
  221. cur.execute("SELECT query_id, COUNT(DISTINCT case_id) AS n FROM mode_tools GROUP BY query_id")
  222. nt = {r["query_id"]: r["n"] for r in cur.fetchall()}
  223. finally:
  224. conn.close()
  225. for q in queries:
  226. q["process_done"] = np.get(q["query_id"], 0)
  227. q["tools_done"] = nt.get(q["query_id"], 0)
  228. return queries
  229. def fetch_posts(query_id):
  230. """某 query 下全部帖子(JSON 列已解析),带 has_process/has_tools 标记。"""
  231. conn = _conn()
  232. try:
  233. with conn.cursor() as cur:
  234. cur.execute("""SELECT * FROM search_data WHERE query_id=%s
  235. ORDER BY overall_score DESC, id""", (query_id,))
  236. rows = cur.fetchall()
  237. cur.execute("SELECT DISTINCT case_id FROM mode_process WHERE query_id=%s", (query_id,))
  238. hp = {r["case_id"] for r in cur.fetchall()}
  239. cur.execute("SELECT DISTINCT case_id FROM mode_tools WHERE query_id=%s", (query_id,))
  240. ht = {r["case_id"] for r in cur.fetchall()}
  241. finally:
  242. conn.close()
  243. for r in rows:
  244. for col in ("images", "videos", "found_by", "knowledge_type", "llm_evaluation"):
  245. r[col] = _loads(r[col])
  246. r["has_process"] = r["case_id"] in hp
  247. r["has_tools"] = r["case_id"] in ht
  248. r.pop("created_at", None); r.pop("updated_at", None)
  249. return rows
  250. def fetch_post(query_id, case_id):
  251. """单帖完整行(给 pipeline 脚本重建 source 用)。无则 None。"""
  252. conn = _conn()
  253. try:
  254. with conn.cursor() as cur:
  255. cur.execute("SELECT * FROM search_data WHERE query_id=%s AND case_id=%s",
  256. (query_id, case_id))
  257. row = cur.fetchone()
  258. finally:
  259. conn.close()
  260. if not row:
  261. return None
  262. for col in ("images", "videos", "found_by", "knowledge_type", "llm_evaluation"):
  263. row[col] = _loads(row[col])
  264. return row
  265. # ── mode_process ─────────────────────────────────────────────────────────────
  266. def replace_process(query_id, case_id, platform, post_title, payload,
  267. model, version, cost_usd, duration_s):
  268. """写入一帖某版本的工序解构结果(payload = {source, procedures})。
  269. 删 (case_id, version) 旧行再插,同版本重跑幂等、跨版本保留历史。返回工序条数。"""
  270. source = payload.get("source")
  271. procedures = payload.get("procedures") or []
  272. conn = _conn()
  273. try:
  274. with conn.cursor() as cur:
  275. cur.execute("DELETE FROM mode_process WHERE case_id=%s AND version=%s",
  276. (case_id, version))
  277. if procedures:
  278. rows = []
  279. for p in procedures:
  280. steps = p.get("steps") or []
  281. vias = []
  282. for s in steps:
  283. v = s.get("via")
  284. if v and v not in vias:
  285. vias.append(v)
  286. rows.append((
  287. query_id, case_id, platform, (post_title or "")[:500],
  288. _j(source), p.get("id"), (p.get("name") or "")[:250],
  289. p.get("purpose"), p.get("category"),
  290. _j(p.get("declarations")), _j(p.get("type_registry")),
  291. _j(steps), len(steps), _j(vias),
  292. model, version, cost_usd, duration_s,
  293. ))
  294. cur.executemany("""
  295. INSERT INTO mode_process
  296. (query_id, case_id, platform, post_title, source, procedure_id, name,
  297. purpose, category, declarations, type_registry, steps, step_count,
  298. tools_used, model, version, cost_usd, duration_s)
  299. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  300. """, rows)
  301. return len(procedures)
  302. finally:
  303. conn.close()
  304. def fetch_process_versions(case_id):
  305. conn = _conn()
  306. try:
  307. with conn.cursor() as cur:
  308. cur.execute("""SELECT version, COUNT(*) AS n, MAX(model) AS model
  309. FROM mode_process WHERE case_id=%s
  310. GROUP BY version ORDER BY version DESC""", (case_id,))
  311. return cur.fetchall()
  312. finally:
  313. conn.close()
  314. def fetch_process(case_id, version=None):
  315. """重建 {case_id, version, model, source, procedures:[...]}。version=None 取最新。"""
  316. conn = _conn()
  317. try:
  318. with conn.cursor() as cur:
  319. if version is None:
  320. cur.execute("""SELECT version FROM mode_process WHERE case_id=%s
  321. ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
  322. row = cur.fetchone()
  323. if not row:
  324. return None
  325. version = row["version"]
  326. cur.execute("""SELECT * FROM mode_process WHERE case_id=%s AND version=%s
  327. ORDER BY id""", (case_id, version))
  328. rows = cur.fetchall()
  329. finally:
  330. conn.close()
  331. if not rows:
  332. return None
  333. procedures = [{
  334. "id": r["procedure_id"], "name": r["name"], "purpose": r["purpose"],
  335. "category": r["category"], "declarations": _loads(r["declarations"]),
  336. "type_registry": _loads(r["type_registry"]), "steps": _loads(r["steps"], []),
  337. "tools_used": _loads(r["tools_used"], []),
  338. } for r in rows]
  339. return {"case_id": case_id, "version": version, "platform": rows[0]["platform"],
  340. "title": rows[0]["post_title"], "model": rows[0]["model"],
  341. "cost_usd": float(rows[0]["cost_usd"]) if rows[0]["cost_usd"] is not None else None,
  342. "duration_s": rows[0]["duration_s"],
  343. "source": _loads(rows[0]["source"]), "procedures": procedures}
  344. # ── mode_tools ───────────────────────────────────────────────────────────────
  345. def replace_tools(query_id, case_id, platform, post_title, tools,
  346. model, version, cost_usd, duration_s):
  347. """写入一帖某版本的工具解构结果。语义同 replace_process。返回工具条数。"""
  348. conn = _conn()
  349. try:
  350. with conn.cursor() as cur:
  351. cur.execute("DELETE FROM mode_tools WHERE case_id=%s AND version=%s",
  352. (case_id, version))
  353. if tools:
  354. rows = [(
  355. query_id, case_id, platform, (post_title or "")[:500],
  356. (t.get("工具名称") or "")[:250],
  357. _j(t.get("实质作用域")), _j(t.get("形式作用域")),
  358. t.get("创作层级"), t.get("来源链接"), t.get("输入"), t.get("输出"),
  359. _j(t.get("用法")), _j(t.get("案例")), _j(t.get("缺点")),
  360. t.get("最新更新时间"), model, version, cost_usd, duration_s,
  361. ) for t in tools]
  362. cur.executemany("""
  363. INSERT INTO mode_tools
  364. (query_id, case_id, platform, post_title, tool_name, substance_scope,
  365. form_scope, creation_layer, source_link, input_desc, output_desc,
  366. usage_json, cases_json, defects_json, updated_time, model, version,
  367. cost_usd, duration_s)
  368. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  369. """, rows)
  370. return len(tools)
  371. finally:
  372. conn.close()
  373. def fetch_tools_versions(case_id):
  374. conn = _conn()
  375. try:
  376. with conn.cursor() as cur:
  377. cur.execute("""SELECT version, COUNT(*) AS n, MAX(model) AS model
  378. FROM mode_tools WHERE case_id=%s
  379. GROUP BY version ORDER BY version DESC""", (case_id,))
  380. return cur.fetchall()
  381. finally:
  382. conn.close()
  383. def fetch_tools(case_id, version=None):
  384. """重建 {case_id, version, model, tool_count, tools:[...]}。version=None 取最新。"""
  385. conn = _conn()
  386. try:
  387. with conn.cursor() as cur:
  388. if version is None:
  389. cur.execute("""SELECT version FROM mode_tools WHERE case_id=%s
  390. ORDER BY version DESC, id DESC LIMIT 1""", (case_id,))
  391. row = cur.fetchone()
  392. if not row:
  393. return None
  394. version = row["version"]
  395. cur.execute("""SELECT * FROM mode_tools WHERE case_id=%s AND version=%s
  396. ORDER BY id""", (case_id, version))
  397. rows = cur.fetchall()
  398. finally:
  399. conn.close()
  400. if not rows:
  401. return None
  402. tools = [{
  403. "工具名称": r["tool_name"], "实质作用域": _loads(r["substance_scope"]),
  404. "形式作用域": _loads(r["form_scope"]), "创作层级": r["creation_layer"],
  405. "来源链接": r["source_link"], "输入": r["input_desc"], "输出": r["output_desc"],
  406. "用法": _loads(r["usage_json"]), "案例": _loads(r["cases_json"]),
  407. "缺点": _loads(r["defects_json"]), "最新更新时间": r["updated_time"],
  408. } for r in rows]
  409. return {"case_id": case_id, "version": version, "platform": rows[0]["platform"],
  410. "title": rows[0]["post_title"], "model": rows[0]["model"],
  411. "cost_usd": float(rows[0]["cost_usd"]) if rows[0]["cost_usd"] is not None else None,
  412. "duration_s": rows[0]["duration_s"],
  413. "tool_count": len(tools), "tools": tools}
  414. # ── Dashboard 原始行(指标计算在 server.py)─────────────────────────────────────
  415. def fetch_dashboard_rows():
  416. """拉 Dashboard 计算所需的轻量行。数据量级:百~千行,Python 聚合足够。"""
  417. conn = _conn()
  418. try:
  419. with conn.cursor() as cur:
  420. cur.execute("SELECT query_id, case_id, knowledge_type FROM search_data")
  421. posts = cur.fetchall()
  422. cur.execute("""SELECT case_id, version, steps, tools_used, cost_usd,
  423. duration_s, created_at FROM mode_process""")
  424. procs = cur.fetchall()
  425. cur.execute("""SELECT case_id, version, tool_name, substance_scope,
  426. form_scope, cost_usd, duration_s, created_at
  427. FROM mode_tools""")
  428. tools = cur.fetchall()
  429. finally:
  430. conn.close()
  431. for p in posts:
  432. p["knowledge_type"] = _loads(p["knowledge_type"], [])
  433. for r in procs:
  434. r["steps"] = _loads(r["steps"], [])
  435. r["tools_used"] = _loads(r["tools_used"], [])
  436. r["cost_usd"] = float(r["cost_usd"]) if r["cost_usd"] is not None else None
  437. r["created_at"] = str(r["created_at"]) if r["created_at"] else None
  438. for r in tools:
  439. r["substance_scope"] = _loads(r["substance_scope"], [])
  440. r["form_scope"] = _loads(r["form_scope"], [])
  441. r["cost_usd"] = float(r["cost_usd"]) if r["cost_usd"] is not None else None
  442. r["created_at"] = str(r["created_at"]) if r["created_at"] else None
  443. return posts, procs, tools
  444. def check():
  445. conn = _conn()
  446. try:
  447. with conn.cursor() as cur:
  448. for t in ("search_data", "mode_process", "mode_tools"):
  449. cur.execute(f"SELECT COUNT(*) AS n FROM {t}")
  450. print(f"{t}: {cur.fetchone()['n']} 行")
  451. finally:
  452. conn.close()
  453. if __name__ == "__main__":
  454. cmd = sys.argv[1] if len(sys.argv) > 1 else ""
  455. if cmd == "init":
  456. init_tables()
  457. elif cmd == "check":
  458. check()
  459. else:
  460. print("用法:\n python db.py init # 建表\n python db.py check # 三表行数")