db.py 58 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247
  1. # -*- coding: utf-8 -*-
  2. """mode_workflow · MySQL 持久化(DB 为唯一事实源)
  3. ================================================================================
  4. 读 .env 的 MYSQL_* 连接 MySQL。四张表:
  5. search_process —— 每行一个 (query, 帖子):工序方向的搜索 + llm 评估结果
  6. search_tools —— 同结构,工具方向的搜索结果(方向由表区分,不再用 mode_type 列)
  7. mode_process —— 每行一个解构出的工序(steps 等嵌套结构存 JSON 列)
  8. mode_tools —— 每行一个解构出的工具
  9. 与旧 fixed_query_eval/db.py 的关键差异:本系统 DB 是主存储,写入失败直接 raise,
  10. 不做"失败不阻断"。读侧保留防御(返回空/None)。
  11. 用法:
  12. python db.py init # 建表(幂等)
  13. python db.py check # 打印四表行数
  14. python db.py clear # 清空四表数据(TRUNCATE)
  15. """
  16. import json
  17. import os
  18. import sys
  19. from datetime import datetime
  20. from pathlib import Path
  21. PROJECT_ROOT = Path(__file__).resolve().parents[2]
  22. sys.path.insert(0, str(PROJECT_ROOT))
  23. from dotenv import load_dotenv
  24. load_dotenv()
  25. import pymysql
  26. from pymysql.cursors import DictCursor
  27. from dbutils.pooled_db import PooledDB
  28. # ── 连接池 ──────────────────────────────────────────────────────────────────
  29. # MySQL 是远程 RDS,每次 pymysql.connect() 的 TCP+鉴权握手 ~0.5s。旧实现每个
  30. # 请求新建一条连接,一次"点开帖子"要 2~3 个请求 = 2~3 次握手 ≈ 1s。改用连接池
  31. # 复用长连接后,握手只在池初始化时各发生一次,后续取连接近乎零开销。
  32. # server.py 是 ThreadingHTTPServer(每请求一线程),PooledDB 线程安全,正好匹配。
  33. # 注意:fetch_* 里的 conn.close() 在池连接上语义是"归还池中"而非真正断开。
  34. _POOL = None
  35. def _pool():
  36. global _POOL
  37. if _POOL is None:
  38. if not os.getenv("MYSQL_HOST"):
  39. raise RuntimeError("缺 MYSQL_HOST:检查 .env 的 MYSQL_* 配置")
  40. _POOL = PooledDB(
  41. creator=pymysql,
  42. mincached=2, # 启动即预热 2 条,首点不再吃冷握手
  43. maxcached=5, # 空闲保留上限
  44. maxconnections=20, # 并发上限(ThreadingHTTPServer 线程数)
  45. blocking=True, # 连接耗尽时等待而非报错
  46. ping=1, # 取用前 ping,自动剔除被 RDS 掐断的死连接
  47. host=os.getenv("MYSQL_HOST"),
  48. port=int(os.getenv("MYSQL_PORT", 3306)),
  49. user=os.getenv("MYSQL_USER"),
  50. password=os.getenv("MYSQL_PASSWORD"),
  51. database=os.getenv("MYSQL_DATABASE"),
  52. charset="utf8mb4", cursorclass=DictCursor,
  53. autocommit=True, connect_timeout=10,
  54. )
  55. return _POOL
  56. def _conn():
  57. """从池取一条连接;用法不变(with cursor / conn.close() 归还池)。"""
  58. return _pool().connection()
  59. # ── DDL ──────────────────────────────────────────────────────────────────────
  60. SEARCH_TABLES = {"process": "search_process", "tools": "search_tools"}
  61. MODE_TABLES = {"process": "mode_process", "tools": "mode_tools"}
  62. def _search_table(mode_or_table):
  63. """mode(process/tools)或表名 → 合法搜索表名(白名单,防 SQL 注入)。"""
  64. t = SEARCH_TABLES.get(mode_or_table, mode_or_table)
  65. if t not in SEARCH_TABLES.values():
  66. raise ValueError(f"未知搜索表/模式: {mode_or_table!r}")
  67. return t
  68. def _mode_table(mode_or_table):
  69. """mode(process/tools)或表名 → 合法解构表名(白名单,防 SQL 注入)。"""
  70. t = MODE_TABLES.get(mode_or_table, mode_or_table)
  71. if t not in MODE_TABLES.values():
  72. raise ValueError(f"未知解构表/模式: {mode_or_table!r}")
  73. return t
def _ddl_search(table, direction):
    """Build the CREATE TABLE IF NOT EXISTS statement for a search table.

    Both search tables share this layout: one row per (query_id, case_id),
    enforced by the uk_qid_case unique key.

    Args:
        table: table name, interpolated verbatim into the statement.
        direction: label interpolated into the table-level COMMENT.

    Returns:
        The DDL statement as a string.
    """
    return f"""
CREATE TABLE IF NOT EXISTS {table} (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    query_id VARCHAR(32) NOT NULL COMMENT 'q0000',
    query_text VARCHAR(512) NULL,
    case_id VARCHAR(128) NOT NULL COMMENT 'platform_channelContentId',
    platform VARCHAR(32) NULL,
    channel_content_id VARCHAR(128) NULL,
    title VARCHAR(512) NULL,
    url VARCHAR(1024) NULL,
    content_type VARCHAR(32) NULL,
    body LONGTEXT NULL,
    images JSON NULL,
    videos JSON NULL,
    like_count INT NULL,
    publish_time VARCHAR(64) NULL,
    quality_score FLOAT NULL COMMENT 'post._quality_score',
    quality_grade VARCHAR(8) NULL,
    found_by JSON NULL COMMENT '命中的措辞数组',
    knowledge_type JSON NULL COMMENT '["能力","工序","工具"] 子集',
    overall_score FLOAT NULL COMMENT '(相关均值+质量均值)/2',
    llm_evaluation JSON NULL COMMENT '评估全量 blob',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    UNIQUE KEY uk_qid_case (query_id, case_id),
    KEY idx_platform (platform)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='搜索+评估结果({direction})';
"""
# mode_process: one row per extracted procedure. Nested structures (steps,
# declarations, type_registry, ...) live in JSON columns. Rows are unique on
# (query_id, case_id, version, seq).
DDL_PROCESS = """
CREATE TABLE IF NOT EXISTS mode_process (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    query_id VARCHAR(32) NOT NULL,
    case_id VARCHAR(128) NOT NULL,
    platform VARCHAR(32) NULL,
    post_title VARCHAR(512) NULL,
    source JSON NULL COMMENT '解构返回的 source 块',
    procedure_id VARCHAR(16) NULL COMMENT 'p1,p2…',
    name VARCHAR(255) NULL,
    purpose TEXT NULL,
    category VARCHAR(32) NULL COMMENT '产物创造/资产建设/自动化/分析/学习',
    declarations JSON NULL,
    type_registry JSON NULL,
    steps JSON NULL COMMENT '步骤数组全量',
    step_count INT NULL,
    tools_used JSON NULL COMMENT '从 steps[].via 去重提取',
    model VARCHAR(64) NULL,
    version VARCHAR(32) NULL COMMENT 'v_MMDDHHMM,保留历史;link_* 为跨 query 复制(cost=0)',
    cost_usd DECIMAL(10,6) NULL COMMENT '本次解构调用成本(同版本各行相同,聚合需按 case+version 去重)',
    duration_s FLOAT NULL,
    seq SMALLINT NULL COMMENT '帖内序号(0-based);与 (query_id,case_id,version) 组唯一键防并发/重复写',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE KEY uk_q_case_ver_seq (query_id, case_id, version, seq),
    KEY idx_case_ver (case_id, version),
    KEY idx_qid (query_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工序解构结果(每行一个工序)';
"""
# mode_tools: one row per extracted tool. Same versioning / uniqueness scheme
# as mode_process: unique on (query_id, case_id, version, seq).
DDL_TOOLS = """
CREATE TABLE IF NOT EXISTS mode_tools (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    query_id VARCHAR(32) NOT NULL,
    case_id VARCHAR(128) NOT NULL,
    platform VARCHAR(32) NULL,
    post_title VARCHAR(512) NULL,
    source JSON NULL COMMENT '解构时帖子来源块(tool_extract._row_to_source 产出)',
    tool_name VARCHAR(255) NULL,
    substance_scope JSON NULL COMMENT '实质作用域(数组)',
    form_scope JSON NULL COMMENT '形式作用域(数组或null)',
    creation_layer VARCHAR(32) NULL COMMENT '制作层/创作层',
    source_link VARCHAR(1024) NULL,
    input_desc TEXT NULL,
    output_desc TEXT NULL,
    usage_json JSON NULL,
    cases_json JSON NULL,
    defects_json JSON NULL,
    updated_time VARCHAR(64) NULL COMMENT '工具最新更新时间',
    model VARCHAR(64) NULL,
    version VARCHAR(32) NULL COMMENT 'v_MMDDHHMM;link_* 为跨 query 复制(cost=0)',
    cost_usd DECIMAL(10,6) NULL COMMENT '同 mode_process,聚合按 case+version 去重',
    duration_s FLOAT NULL,
    seq SMALLINT NULL COMMENT '帖内序号(0-based);与 (query_id,case_id,version) 组唯一键防并发/重复写',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    UNIQUE KEY uk_q_case_ver_seq (query_id, case_id, version, seq),
    KEY idx_case_ver (case_id, version),
    KEY idx_qid (query_id),
    KEY idx_tool_name (tool_name)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工具解构结果(每行一个工具)';
"""
# Ledger of procedure knowledge already imported into the knowledge base; it
# prevents duplicate uploads (used by stages/import_process_knowledge.py).
# One knowledge item = one procedure of one case (proc_index is 1-based). The
# mode_process version at import time is recorded: a changed version (the post
# was re-extracted) means the content changed and should be re-imported; an
# unchanged version means "already uploaded" and is skipped.
# The ledger lives in the DB rather than in a local file so that switching
# machines or links does not cause duplicate writes to the knowledge base.
# Note: tool knowledge uses the separate tools_ingest_log table. case_id is the
# physical identity of a post, and the same post may be extracted both as
# procedures and as tools, so sharing one table would collide on (case_id, index).
DDL_INGEST_LOG = """
CREATE TABLE IF NOT EXISTS knowledge_ingest_log (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    case_id VARCHAR(128) NOT NULL,
    proc_index INT NOT NULL COMMENT '工序序号(1-based),对齐导入脚本枚举',
    version VARCHAR(32) NULL COMMENT '导入时 mode_process 版本;变了应重导',
    knowledge_id VARCHAR(128) NULL COMMENT '接口返回的 knowledge_id',
    api_url VARCHAR(255) NULL,
    ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    UNIQUE KEY uk_case_proc (case_id, proc_index)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工序知识已导入台账(防重复上传)';
"""
# Ledger of tool knowledge already imported into the knowledge base. Same
# semantics as knowledge_ingest_log, but kept as its own table for the tools
# direction (used by stages/import_tools_knowledge.py). One knowledge item =
# one tool of one case (tool_index is 1-based). The recorded version is the
# mode_tools version at import time: changed (re-extracted) means re-import,
# unchanged means "already uploaded" and is skipped.
DDL_TOOLS_INGEST_LOG = """
CREATE TABLE IF NOT EXISTS tools_ingest_log (
    id BIGINT AUTO_INCREMENT PRIMARY KEY,
    case_id VARCHAR(128) NOT NULL,
    tool_index INT NOT NULL COMMENT '工具序号(1-based),对齐导入脚本枚举',
    version VARCHAR(32) NULL COMMENT '导入时 mode_tools 版本;变了应重导',
    knowledge_id VARCHAR(128) NULL COMMENT '接口返回的 knowledge_id',
    api_url VARCHAR(255) NULL,
    ingested_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
    UNIQUE KEY uk_case_tool (case_id, tool_index)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='工具知识已导入台账(防重复上传)';
"""
  197. def _ensure_column(cur, table, column, column_ddl):
  198. """给已存在的表幂等补列:列已存在则跳过(MySQL ADD COLUMN 无 IF NOT EXISTS)。
  199. column_ddl 为 ADD COLUMN 后的完整定义,如 \"source JSON NULL ... AFTER post_title\"。"""
  200. cur.execute("""SELECT COUNT(*) AS n FROM information_schema.columns
  201. WHERE table_schema=DATABASE() AND table_name=%s AND column_name=%s""",
  202. (table, column))
  203. if cur.fetchone()["n"] == 0:
  204. cur.execute(f"ALTER TABLE {table} ADD COLUMN {column_ddl}")
  205. def _ensure_unique_index(cur, table, index_name, cols):
  206. """幂等加唯一索引:已存在则跳过(MySQL ADD INDEX 无 IF NOT EXISTS)。
  207. cols 为列表达式,如 "query_id, case_id, version, seq"。加之前需保证无冲突数据。"""
  208. cur.execute("""SELECT COUNT(*) AS n FROM information_schema.statistics
  209. WHERE table_schema=DATABASE() AND table_name=%s AND index_name=%s""",
  210. (table, index_name))
  211. if cur.fetchone()["n"] == 0:
  212. cur.execute(f"ALTER TABLE {table} ADD UNIQUE KEY {index_name} ({cols})")
def init_tables():
    """Create all tables (idempotent) and apply in-place migrations for old databases.

    Steps, in order:
      1. CREATE TABLE IF NOT EXISTS for the two search tables, the two
         extraction tables and the two ingest ledgers.
      2. Widen mode_process/mode_tools.version to VARCHAR(32).
      3. Add mode_tools.source if it is missing.
      4. Add the seq column, backfill it, then add the unique key on
         (query_id, case_id, version, seq). This order is mandatory.
    """
    conn = _conn()
    try:
        with conn.cursor() as cur:
            cur.execute(_ddl_search("search_process", "工序方向"))
            cur.execute(_ddl_search("search_tools", "工具方向"))
            cur.execute(DDL_PROCESS)
            cur.execute(DDL_TOOLS)
            cur.execute(DDL_INGEST_LOG)
            cur.execute(DDL_TOOLS_INGEST_LOG)
            # Legacy migration: widen version from VARCHAR(16) to 32 so that the
            # link_v_mopN_* copied versions fit. MODIFY is idempotent (a metadata
            # no-op when already 32) and the tables exist by now, so it is safe.
            for t in ("mode_process", "mode_tools"):
                cur.execute(f"ALTER TABLE {t} MODIFY COLUMN version VARCHAR(32) NULL")
            # Legacy migration: add the source column to old mode_tools tables.
            # MySQL's ADD COLUMN has no IF NOT EXISTS, so information_schema is
            # checked first and the column is added only when missing.
            _ensure_column(cur, "mode_tools", "source",
                           "source JSON NULL COMMENT '解构时帖子来源块' AFTER post_title")
            # Legacy migration: add seq (index within a post) plus the unique key
            # (query_id, case_id, version, seq) to prevent duplicate rows from
            # concurrent/repeated writes. The order must be: add column → backfill
            # → add unique key. MySQL 5.7 has no window functions, so seq is
            # backfilled in the application, by ascending id within each
            # (query_id, case_id, version) group (existing data is said to have no
            # duplicates at that granularity).
            for t in ("mode_process", "mode_tools"):
                _ensure_column(cur, t, "seq",
                               "seq SMALLINT NULL COMMENT '帖内序号(0-based)' AFTER duration_s")
            for t in ("mode_process", "mode_tools"):
                cur.execute(f"""SELECT id, query_id, case_id, version FROM {t}
                                WHERE seq IS NULL ORDER BY query_id, case_id, version, id""")
                # Rows arrive grouped by key; n restarts at 0 whenever the key changes.
                key, n, ups = None, 0, []
                for r in cur.fetchall():
                    k = (r["query_id"], r["case_id"], r["version"])
                    if k != key:
                        key, n = k, 0
                    ups.append((n, r["id"])); n += 1
                if ups:
                    cur.executemany(f"UPDATE {t} SET seq=%s WHERE id=%s", ups)
                    print(f" ↳ {t}: 回填 seq {len(ups)} 行")
            for t in ("mode_process", "mode_tools"):
                _ensure_unique_index(cur, t, "uk_q_case_ver_seq",
                                     "query_id, case_id, version, seq")
        print("✅ 建表完成:search_process, search_tools, mode_process, mode_tools, "
              "knowledge_ingest_log, tools_ingest_log")
    finally:
        # Always hand the connection back to the pool.
        conn.close()
  256. def clear_tables():
  257. """清空四张表的数据(TRUNCATE,表结构保留)。"""
  258. conn = _conn()
  259. try:
  260. with conn.cursor() as cur:
  261. for t in ("search_process", "search_tools", "mode_process", "mode_tools"):
  262. cur.execute(f"TRUNCATE TABLE {t}")
  263. print(f"🧹 已清空 {t}")
  264. finally:
  265. conn.close()
  266. # ── 工具函数 ──────────────────────────────────────────────────────────────────
  267. def _loads(v, default=None):
  268. """pymysql 的 JSON 列可能返回字符串,统一解析。"""
  269. if v is None:
  270. return default
  271. if isinstance(v, (list, dict)):
  272. return v
  273. try:
  274. return json.loads(v)
  275. except Exception:
  276. return default
  277. def _j(v):
  278. """写入 JSON 列:None 保持 NULL,其余 dumps。"""
  279. return None if v is None else json.dumps(v, ensure_ascii=False)
  280. def _collect_scores(node):
  281. """递归收集嵌套评估里所有「得分」。LLM 直出的得分多为字符串("1"/"4"),
  282. 个别为数字(如 时效性 10),统一按 float 解析;非数值(如 "N/A")跳过不计入。"""
  283. out = []
  284. if isinstance(node, dict):
  285. for k, v in node.items():
  286. if k == "得分":
  287. try:
  288. out.append(float(v))
  289. except (TypeError, ValueError):
  290. pass
  291. else:
  292. out.extend(_collect_scores(v))
  293. elif isinstance(node, list):
  294. for v in node:
  295. out.extend(_collect_scores(v))
  296. return out
  297. def overall_score(e):
  298. """综合分 = (相关性各项均值 + 质量各项均值) / 可得部分数。算不出返回 None。"""
  299. parts = []
  300. for key in ("相关性", "质量"):
  301. scores = _collect_scores((e or {}).get(key))
  302. if scores:
  303. parts.append(sum(scores) / len(scores))
  304. return round(sum(parts) / len(parts), 2) if parts else None
  305. def _recency_hard(date_str):
  306. """硬时效(同 mode_procedure/server.py:_recency_hard):半年内=3 / 两年内=2 / 更早=1。
  307. publish_time 头 10 字符按 YYYY-MM-DD 解析,失败返回 None(不参与判定)。"""
  308. try:
  309. d = datetime.strptime(str(date_str or "")[:10], "%Y-%m-%d")
  310. except (ValueError, TypeError):
  311. return None
  312. days = (datetime.now() - d).days
  313. if days <= 180:
  314. return 3
  315. if days <= 730:
  316. return 2
  317. return 1
  318. def _fixed_dim_score(evaluation, name):
  319. """取 质量.固定维度.<name>.得分 标量,缺失/非数值返回 None(不参与判定)。"""
  320. v = (((evaluation or {}).get("质量") or {}).get("固定维度") or {}).get(name)
  321. if isinstance(v, dict):
  322. v = v.get("得分")
  323. try:
  324. return float(v) if v is not None else None
  325. except (TypeError, ValueError):
  326. return None
  327. def _impl_score(evaluation):
  328. """取 质量.动态维度.工序.字段完整性.实现完整性.得分 标量,缺失/非数值返回 None。
  329. 新版 prompt 把旧「可复现性」的硬封顶规则并入了「实现完整性」,故采纳门槛改读此处。"""
  330. v = ((((((evaluation or {}).get("质量") or {}).get("动态维度") or {})
  331. .get("工序") or {}).get("字段完整性") or {}).get("实现完整性"))
  332. if isinstance(v, dict):
  333. v = v.get("得分")
  334. try:
  335. return float(v) if v is not None else None
  336. except (TypeError, ValueError):
  337. return None
  338. def _repro_score(evaluation):
  339. """采纳门槛用的「可复现/可实现」得分:优先旧版「可复现性」(固定维度),
  340. 缺失则回退新版「实现完整性」(动态维度.工序)。这样新旧两套评估 blob 都能正确判定。"""
  341. v = _fixed_dim_score(evaluation, "可复现性")
  342. return v if v is not None else _impl_score(evaluation)
  343. def is_adopted(overall, evaluation, publish_time):
  344. """采纳/命中判定,口径对齐 mode_procedure 的 decision=="report":
  345. 制作相关性<4、可复现/实现完整性<4、发布超两年、综合分<6 —— 任一命中即不采纳;指标缺失不参与判定。
  346. (意图可控性暂只采分不设门槛,留待阈值标定后再开。)
  347. 可复现/实现门槛兼容新旧 schema:旧版读「可复现性」,新版读「实现完整性」(见 _repro_score)。
  348. fail-closed:评估失败(_error)、blob 缺失/为空、或综合分算不出(None)→ 直接判不采纳。
  349. 评不出的帖子不该混进命中集(此前 fail-open 会因各指标取不到值而误判采纳)。"""
  350. if not isinstance(evaluation, dict) or not evaluation or evaluation.get("_error"):
  351. return False
  352. if overall is None:
  353. return False
  354. rel = None
  355. v = ((evaluation or {}).get("相关性") or {}).get("和内容制作知识相关")
  356. if isinstance(v, dict):
  357. v = v.get("得分")
  358. try:
  359. rel = float(v) if v is not None else None
  360. except (TypeError, ValueError):
  361. rel = None
  362. if rel is not None and rel < 4:
  363. return False
  364. repro = _repro_score(evaluation)
  365. if repro is not None and repro < 4:
  366. return False
  367. rh = _recency_hard(publish_time)
  368. if rh is not None and rh < 2:
  369. return False
  370. if overall is not None and float(overall) < 6:
  371. return False
  372. return True
  373. def is_adopted_rel(overall, rel, publish_time, repro=None):
  374. """is_adopted 的轻量版:相关性得分(rel)、可复现/实现门槛(repro)已由 SQL JSON_EXTRACT
  375. 直接取出(repro 由 _REPRO_SQL 兼容新旧 schema 取值),无需传输/解析整块 llm_evaluation。
  376. 判定口径与 is_adopted 完全一致(含 fail-closed:综合分算不出→不采纳;失败帖的 overall_score 列为 NULL)。"""
  377. if overall is None:
  378. return False
  379. try:
  380. rel = float(rel) if rel is not None else None
  381. except (TypeError, ValueError):
  382. rel = None
  383. if rel is not None and rel < 4:
  384. return False
  385. try:
  386. repro = float(repro) if repro is not None else None
  387. except (TypeError, ValueError):
  388. repro = None
  389. if repro is not None and repro < 4:
  390. return False
  391. rh = _recency_hard(publish_time)
  392. if rh is not None and rh < 2:
  393. return False
  394. if overall is not None and float(overall) < 6:
  395. return False
  396. return True
  397. # ── search_process / search_tools ────────────────────────────────────────────
  398. def upsert_search_posts(query_id, query_text, results, table="search_process"):
  399. """一组搜索结果写入指定搜索表(按 (query_id, case_id) upsert)。返回写入条数。
  400. table:search_process(工序方向) / search_tools(工具方向)。"""
  401. table = _search_table(table)
  402. if not results:
  403. return 0
  404. rows = []
  405. for r in results:
  406. post = r.get("post") or {}
  407. e = r.get("llm_evaluation") or {}
  408. rows.append((
  409. query_id, query_text, r.get("case_id"), r.get("platform"),
  410. r.get("channel_content_id"),
  411. (post.get("title") or post.get("desc") or "")[:500],
  412. r.get("source_url"), post.get("content_type"),
  413. post.get("body_text") or post.get("desc") or "",
  414. _j(post.get("images") or []), _j(post.get("videos") or []),
  415. post.get("like_count"),
  416. str(post.get("publish_time") or post.get("publish_timestamp") or "")[:64],
  417. post.get("_quality_score"), post.get("_quality_grade"),
  418. _j(r.get("found_by_queries") or []),
  419. _j(e.get("知识类型") or []),
  420. overall_score(e),
  421. _j(e),
  422. ))
  423. sql = f"""
  424. INSERT INTO {table}
  425. (query_id, query_text, case_id, platform, channel_content_id, title, url,
  426. content_type, body, images, videos, like_count, publish_time,
  427. quality_score, quality_grade, found_by, knowledge_type,
  428. overall_score, llm_evaluation)
  429. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  430. ON DUPLICATE KEY UPDATE
  431. query_text=VALUES(query_text), platform=VALUES(platform),
  432. channel_content_id=VALUES(channel_content_id), title=VALUES(title), url=VALUES(url),
  433. content_type=VALUES(content_type), body=VALUES(body), images=VALUES(images),
  434. videos=VALUES(videos), like_count=VALUES(like_count), publish_time=VALUES(publish_time),
  435. quality_score=VALUES(quality_score), quality_grade=VALUES(quality_grade),
  436. found_by=VALUES(found_by), knowledge_type=VALUES(knowledge_type),
  437. overall_score=VALUES(overall_score), llm_evaluation=VALUES(llm_evaluation);
  438. """
  439. conn = _conn()
  440. try:
  441. with conn.cursor() as cur:
  442. cur.executemany(sql, rows)
  443. return len(rows)
  444. finally:
  445. conn.close()
  446. def fetch_queries(mode="process"):
  447. """某方向搜索表的 query 列表 + 帖子数 + 采纳/命中数 + 解构进度。"""
  448. table = _search_table(mode)
  449. conn = _conn()
  450. try:
  451. with conn.cursor() as cur:
  452. cur.execute(f"""SELECT query_id, MAX(query_text) AS query_text,
  453. COUNT(*) AS post_count
  454. FROM {table} GROUP BY query_id ORDER BY query_id""")
  455. queries = cur.fetchall()
  456. # 采纳数:SQL 直取 rel/repro 标量算,**不拉整表 llm_evaluation**(旧版全表 blob,切 tab 巨慢)
  457. cur.execute(f"""SELECT query_id, overall_score, publish_time,
  458. {_REL_SQL} AS rel, {_REPRO_SQL} AS repro FROM {table}""")
  459. hits = {}
  460. for r in cur.fetchall():
  461. if is_adopted_rel(r["overall_score"], r["rel"], r["publish_time"], r["repro"]):
  462. hits[r["query_id"]] = hits.get(r["query_id"], 0) + 1
  463. cur.execute("SELECT query_id, COUNT(DISTINCT case_id) AS n FROM mode_process GROUP BY query_id")
  464. np = {r["query_id"]: r["n"] for r in cur.fetchall()}
  465. cur.execute("SELECT query_id, COUNT(DISTINCT case_id) AS n FROM mode_tools GROUP BY query_id")
  466. nt = {r["query_id"]: r["n"] for r in cur.fetchall()}
  467. finally:
  468. conn.close()
  469. for q in queries:
  470. q["hit_count"] = hits.get(q["query_id"], 0)
  471. q["process_done"] = np.get(q["query_id"], 0)
  472. q["tools_done"] = nt.get(q["query_id"], 0)
  473. return queries
  474. def fetch_posts(query_id, mode="process"):
  475. """列表用:只取列表所需列 + SQL 直取 adopted 标量,**不拉 body/videos/llm_evaluation 大字段**
  476. (llm_evaluation ~1.5MB/帖,旧版 SELECT * 切 tab/选 query 要几十 MB 过远程 RDS,故慢)。
  477. 正文/评分等详情按需走 fetch_post。带 adopted/has_process/has_tools;adopted 口径用
  478. is_adopted_rel(与 is_adopted 完全一致,rel/repro 由 _REL_SQL/_REPRO_SQL 直取标量)。"""
  479. table = _search_table(mode)
  480. conn = _conn()
  481. try:
  482. with conn.cursor() as cur:
  483. cur.execute(f"""SELECT id, query_id, query_text, case_id, platform, channel_content_id,
  484. title, url, content_type, images, like_count, publish_time,
  485. quality_score, quality_grade, found_by, knowledge_type, overall_score,
  486. {_REL_SQL} AS rel, {_REPRO_SQL} AS repro
  487. FROM {table} WHERE query_id=%s
  488. ORDER BY overall_score DESC, id""", (query_id,))
  489. rows = cur.fetchall()
  490. cur.execute("SELECT DISTINCT case_id FROM mode_process WHERE query_id=%s", (query_id,))
  491. hp = {r["case_id"] for r in cur.fetchall()}
  492. cur.execute("SELECT DISTINCT case_id FROM mode_tools WHERE query_id=%s", (query_id,))
  493. ht = {r["case_id"] for r in cur.fetchall()}
  494. finally:
  495. conn.close()
  496. for r in rows:
  497. for col in ("images", "found_by", "knowledge_type"):
  498. r[col] = _loads(r[col])
  499. r["adopted"] = is_adopted_rel(r["overall_score"], r.pop("rel", None),
  500. r["publish_time"], r.pop("repro", None))
  501. r["has_process"] = r["case_id"] in hp
  502. r["has_tools"] = r["case_id"] in ht
  503. return rows
  504. def fetch_post(query_id, case_id, table="search_process"):
  505. """指定搜索表的单帖完整行(给 pipeline 脚本重建 source 用)。无则 None。"""
  506. table = _search_table(table)
  507. conn = _conn()
  508. try:
  509. with conn.cursor() as cur:
  510. cur.execute(f"SELECT * FROM {table} WHERE query_id=%s AND case_id=%s",
  511. (query_id, case_id))
  512. row = cur.fetchone()
  513. finally:
  514. conn.close()
  515. if not row:
  516. return None
  517. for col in ("images", "videos", "found_by", "knowledge_type", "llm_evaluation"):
  518. row[col] = _loads(row[col])
  519. return row
  520. def fetch_all_posts(mode="process", *, query_ids=None, adopted_only=False, distinct=False,
  521. limit=None, offset=0):
  522. """某方向「全部帖子」:跨所有 query 的列表(瘦身列,口径同 fetch_posts,不拉
  523. body/videos/llm_evaluation 大字段)。fetch_posts 限定单 query,本函数默认取全表。
  524. - query_ids:选填 query_id 列表,传了就 WHERE query_id IN(...) 只取这些 query
  525. 的帖子(SQL 层过滤,不拉全表);None=全部,[]=空结果。
  526. - adopted_only=True:只返回采纳帖(is_adopted_rel 口径,rel/repro 由
  527. _REL_SQL/_REPRO_SQL 直取标量算,不拉整表 blob)。
  528. - distinct=True:按 case_id 去重(同一帖被多个 query 搜到时,只保留
  529. overall_score 最高的一行——已按 score 降序,取首次出现即最高分)。
  530. - limit/offset:分页(limit=None 不分页)。
  531. 返回 (total, rows):total 为过滤(+去重)后的总条数,rows 为本页切片。"""
  532. table = _search_table(mode)
  533. where, params = "", []
  534. if query_ids is not None:
  535. if not query_ids:
  536. return 0, [] # 显式空列表:直接空结果,不必查库
  537. where = " WHERE query_id IN (" + ",".join(["%s"] * len(query_ids)) + ")"
  538. params = list(query_ids)
  539. conn = _conn()
  540. try:
  541. with conn.cursor() as cur:
  542. cur.execute(f"""SELECT id, query_id, query_text, case_id, platform, channel_content_id,
  543. title, url, content_type, images, like_count, publish_time,
  544. quality_score, quality_grade, found_by, knowledge_type, overall_score,
  545. {_REL_SQL} AS rel, {_REPRO_SQL} AS repro
  546. FROM {table}{where}
  547. ORDER BY overall_score DESC, id""", params)
  548. rows = cur.fetchall()
  549. # has_process/has_tools 全局判定:跨 query 的「该帖是否已解构」,两张解构表各取一次
  550. cur.execute("SELECT DISTINCT case_id FROM mode_process")
  551. hp = {r["case_id"] for r in cur.fetchall()}
  552. cur.execute("SELECT DISTINCT case_id FROM mode_tools")
  553. ht = {r["case_id"] for r in cur.fetchall()}
  554. finally:
  555. conn.close()
  556. out, seen = [], set()
  557. for r in rows:
  558. for col in ("images", "found_by", "knowledge_type"):
  559. r[col] = _loads(r[col])
  560. r["adopted"] = is_adopted_rel(r["overall_score"], r.pop("rel", None),
  561. r["publish_time"], r.pop("repro", None))
  562. if adopted_only and not r["adopted"]:
  563. continue
  564. if distinct:
  565. if r["case_id"] in seen:
  566. continue
  567. seen.add(r["case_id"])
  568. r["has_process"] = r["case_id"] in hp
  569. r["has_tools"] = r["case_id"] in ht
  570. out.append(r)
  571. total = len(out)
  572. if limit is not None:
  573. out = out[offset:offset + limit]
  574. elif offset:
  575. out = out[offset:]
  576. return total, out
  577. def count_executed_queries(mode="process"):
  578. """该方向「已执行」的 query 数 = 搜索表里出现过的 distinct query_id 个数。
  579. 注:一次搜索若 0 命中则不写任何行,故不计入(口径为「已产出结果的 query」)。"""
  580. table = _search_table(mode)
  581. conn = _conn()
  582. try:
  583. with conn.cursor() as cur:
  584. cur.execute(f"SELECT COUNT(DISTINCT query_id) AS n FROM {table}")
  585. return cur.fetchone()["n"]
  586. finally:
  587. conn.close()
  588. # ── mode_process ─────────────────────────────────────────────────────────────
  589. def replace_process(query_id, case_id, platform, post_title, payload,
  590. model, version, cost_usd, duration_s):
  591. """写入一帖某版本的工序解构结果(payload = {source, procedures})。
  592. 删 (case_id, version) 旧行再插,同版本重跑幂等、跨版本保留历史。返回工序条数。"""
  593. source = payload.get("source")
  594. procedures = payload.get("procedures") or []
  595. conn = _conn()
  596. try:
  597. conn.begin() # DELETE+INSERT 原子化:配合 uk_q_case_ver_seq,并发/重复写入不会留下重复行
  598. with conn.cursor() as cur:
  599. cur.execute("DELETE FROM mode_process WHERE case_id=%s AND version=%s",
  600. (case_id, version))
  601. if procedures:
  602. rows = []
  603. for i, p in enumerate(procedures):
  604. steps = p.get("steps") or []
  605. vias = []
  606. for s in steps:
  607. v = s.get("via")
  608. if v and v not in vias:
  609. vias.append(v)
  610. rows.append((
  611. query_id, case_id, platform, (post_title or "")[:500],
  612. _j(source), p.get("id"), (p.get("name") or "")[:250],
  613. p.get("purpose"), p.get("category"),
  614. _j(p.get("declarations")), _j(p.get("type_registry")),
  615. _j(steps), len(steps), _j(vias),
  616. model, version, cost_usd, duration_s, i,
  617. ))
  618. cur.executemany("""
  619. INSERT INTO mode_process
  620. (query_id, case_id, platform, post_title, source, procedure_id, name,
  621. purpose, category, declarations, type_registry, steps, step_count,
  622. tools_used, model, version, cost_usd, duration_s, seq)
  623. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  624. """, rows)
  625. conn.commit()
  626. return len(procedures)
  627. except Exception:
  628. conn.rollback()
  629. raise
  630. finally:
  631. conn.close()
  632. def fetch_process_versions(case_id):
  633. conn = _conn()
  634. try:
  635. with conn.cursor() as cur:
  636. cur.execute("""SELECT version, COUNT(*) AS n, MAX(model) AS model
  637. FROM mode_process WHERE case_id=%s
  638. GROUP BY version
  639. ORDER BY (LEFT(version,5)='link_') ASC, MAX(id) DESC""", (case_id,))
  640. return cur.fetchall()
  641. finally:
  642. conn.close()
  643. def fetch_process(case_id, version=None):
  644. """重建 {case_id, version, model, source, procedures:[...]}。version=None 取最新。"""
  645. conn = _conn()
  646. try:
  647. with conn.cursor() as cur:
  648. if version is None:
  649. cur.execute("""SELECT version FROM mode_process WHERE case_id=%s
  650. ORDER BY (LEFT(version,5)='link_') ASC, id DESC LIMIT 1""", (case_id,))
  651. row = cur.fetchone()
  652. if not row:
  653. return None
  654. version = row["version"]
  655. cur.execute("""SELECT * FROM mode_process WHERE case_id=%s AND version=%s
  656. ORDER BY id""", (case_id, version))
  657. rows = cur.fetchall()
  658. finally:
  659. conn.close()
  660. return _proc_payload(case_id, version, rows)
  661. def fetch_process_by_query(query_id, case_id, version=None):
  662. """同 fetch_process,但用 (query_id, case_id) 精确定位某 query 下该帖的工序
  663. (category-match 用:post_id=query_id / knowledge_id=case_id)。
  664. version=None 取该 (query_id, case_id) 下最新真实版(link_ 排后)。无行返回 None。"""
  665. conn = _conn()
  666. try:
  667. with conn.cursor() as cur:
  668. if version is None:
  669. cur.execute("""SELECT version FROM mode_process WHERE query_id=%s AND case_id=%s
  670. ORDER BY (LEFT(version,5)='link_') ASC, id DESC LIMIT 1""",
  671. (query_id, case_id))
  672. row = cur.fetchone()
  673. if not row:
  674. return None
  675. version = row["version"]
  676. cur.execute("""SELECT * FROM mode_process WHERE query_id=%s AND case_id=%s AND version=%s
  677. ORDER BY seq, id""", (query_id, case_id, version))
  678. rows = cur.fetchall()
  679. finally:
  680. conn.close()
  681. return _proc_payload(case_id, version, rows)
  682. def update_process_steps_by_query(query_id, case_id, version, steps_in_order):
  683. """按工序顺序覆盖某 (query_id, case_id, version) 各行的 steps JSON 列。
  684. steps_in_order 必须与 fetch_process_by_query 返回的 procedures 同序(均按 seq, id 升序);
  685. 按行 id 一一对应更新,稳健于 seq 不连续。行数与工序数不符则报错回滚。返回更新行数。"""
  686. conn = _conn()
  687. try:
  688. conn.begin()
  689. with conn.cursor() as cur:
  690. cur.execute("""SELECT id FROM mode_process
  691. WHERE query_id=%s AND case_id=%s AND version=%s
  692. ORDER BY seq, id""", (query_id, case_id, version))
  693. ids = [r["id"] for r in cur.fetchall()]
  694. if len(ids) != len(steps_in_order):
  695. raise ValueError(f"行数({len(ids)})与工序数({len(steps_in_order)})不一致")
  696. n = 0
  697. for row_id, steps in zip(ids, steps_in_order):
  698. cur.execute("UPDATE mode_process SET steps=%s WHERE id=%s", (_j(steps), row_id))
  699. n += cur.rowcount
  700. conn.commit()
  701. return n
  702. except Exception:
  703. conn.rollback()
  704. raise
  705. finally:
  706. conn.close()
  707. def _proc_payload(case_id, version, rows):
  708. """mode_process 行集 → {case_id, version, …, procedures:[...]}。无行返回 None。"""
  709. if not rows:
  710. return None
  711. procedures = [{
  712. "id": r["procedure_id"], "name": r["name"], "purpose": r["purpose"],
  713. "category": r["category"], "declarations": _loads(r["declarations"]),
  714. "type_registry": _loads(r["type_registry"]), "steps": _loads(r["steps"], []),
  715. "tools_used": _loads(r["tools_used"], []),
  716. } for r in rows]
  717. return {"case_id": case_id, "version": version, "platform": rows[0]["platform"],
  718. "title": rows[0]["post_title"], "model": rows[0]["model"],
  719. "cost_usd": float(rows[0]["cost_usd"]) if rows[0]["cost_usd"] is not None else None,
  720. "duration_s": rows[0]["duration_s"],
  721. "source": _loads(rows[0]["source"]), "procedures": procedures}
  722. # ── mode_tools ───────────────────────────────────────────────────────────────
  723. def replace_tools(query_id, case_id, platform, post_title, tools,
  724. model, version, cost_usd, duration_s, source=None):
  725. """写入一帖某版本的工具解构结果。语义同 replace_process。返回工具条数。
  726. source:帖子来源块(同 mode_process,每行重复存),供知识上传脚本重建 source 用。"""
  727. src = _j(source)
  728. conn = _conn()
  729. try:
  730. conn.begin() # DELETE+INSERT 原子化:配合 uk_q_case_ver_seq,并发/重复写入不会留下重复行
  731. with conn.cursor() as cur:
  732. cur.execute("DELETE FROM mode_tools WHERE case_id=%s AND version=%s",
  733. (case_id, version))
  734. if tools:
  735. rows = [(
  736. query_id, case_id, platform, (post_title or "")[:500], src,
  737. (t.get("工具名称") or "")[:250],
  738. _j(t.get("实质作用域")), _j(t.get("形式作用域")),
  739. t.get("创作层级"), t.get("来源链接"), t.get("输入"), t.get("输出"),
  740. _j(t.get("用法")), _j(t.get("案例")), _j(t.get("缺点")),
  741. t.get("最新更新时间"), model, version, cost_usd, duration_s, i,
  742. ) for i, t in enumerate(tools)]
  743. cur.executemany("""
  744. INSERT INTO mode_tools
  745. (query_id, case_id, platform, post_title, source, tool_name, substance_scope,
  746. form_scope, creation_layer, source_link, input_desc, output_desc,
  747. usage_json, cases_json, defects_json, updated_time, model, version,
  748. cost_usd, duration_s, seq)
  749. VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
  750. """, rows)
  751. conn.commit()
  752. return len(tools)
  753. except Exception:
  754. conn.rollback()
  755. raise
  756. finally:
  757. conn.close()
  758. def fetch_tools_versions(case_id):
  759. conn = _conn()
  760. try:
  761. with conn.cursor() as cur:
  762. cur.execute("""SELECT version, COUNT(*) AS n, MAX(model) AS model
  763. FROM mode_tools WHERE case_id=%s
  764. GROUP BY version
  765. ORDER BY (LEFT(version,5)='link_') ASC, MAX(id) DESC""", (case_id,))
  766. return cur.fetchall()
  767. finally:
  768. conn.close()
  769. def fetch_tools(case_id, version=None):
  770. """重建 {case_id, version, model, tool_count, tools:[...]}。version=None 取最新。"""
  771. conn = _conn()
  772. try:
  773. with conn.cursor() as cur:
  774. if version is None:
  775. cur.execute("""SELECT version FROM mode_tools WHERE case_id=%s
  776. ORDER BY (LEFT(version,5)='link_') ASC, id DESC LIMIT 1""", (case_id,))
  777. row = cur.fetchone()
  778. if not row:
  779. return None
  780. version = row["version"]
  781. cur.execute("""SELECT * FROM mode_tools WHERE case_id=%s AND version=%s
  782. ORDER BY id""", (case_id, version))
  783. rows = cur.fetchall()
  784. finally:
  785. conn.close()
  786. return _tools_payload(case_id, version, rows)
  787. def _tools_payload(case_id, version, rows):
  788. """mode_tools 行集 → {case_id, version, …, tools:[...]}。无行返回 None。"""
  789. if not rows:
  790. return None
  791. tools = [{
  792. "工具名称": r["tool_name"], "实质作用域": _loads(r["substance_scope"]),
  793. "形式作用域": _loads(r["form_scope"]), "创作层级": r["creation_layer"],
  794. "来源链接": r["source_link"], "输入": r["input_desc"], "输出": r["output_desc"],
  795. "用法": _loads(r["usage_json"]), "案例": _loads(r["cases_json"]),
  796. "缺点": _loads(r["defects_json"]), "最新更新时间": r["updated_time"],
  797. } for r in rows]
  798. return {"case_id": case_id, "version": version, "platform": rows[0]["platform"],
  799. "title": rows[0]["post_title"], "model": rows[0]["model"],
  800. "cost_usd": float(rows[0]["cost_usd"]) if rows[0]["cost_usd"] is not None else None,
  801. "duration_s": rows[0]["duration_s"],
  802. "source": _loads(rows[0].get("source")),
  803. "tool_count": len(tools), "tools": tools}
  804. # ── 点击帖子合一查询(单连接,最少往返;远程 RDS 每次往返 ~80ms,故按次数优化)──
  805. def fetch_extract(mode, case_id, version=None):
  806. """一次取版本列表 + 解构详情,复用同一条池连接、最少往返。
  807. 返回 {versions, data, missing}。mode: process / tools。"""
  808. is_proc = mode != "tools"
  809. mtable = _mode_table("process" if is_proc else "tools")
  810. conn = _conn()
  811. try:
  812. with conn.cursor() as cur:
  813. cur.execute(f"""SELECT version, COUNT(*) AS n, MAX(model) AS model
  814. FROM {mtable} WHERE case_id=%s
  815. GROUP BY version
  816. ORDER BY (LEFT(version,5)='link_') ASC, MAX(id) DESC""", (case_id,))
  817. versions = cur.fetchall()
  818. # 详情:把"取最新版本"折进同一条 SQL,版本指定时直接用;省一次往返。
  819. target = version or (versions[0]["version"] if versions else None)
  820. rows = []
  821. if target is not None:
  822. cur.execute(f"SELECT * FROM {mtable} WHERE case_id=%s AND version=%s ORDER BY id",
  823. (case_id, target))
  824. rows = cur.fetchall()
  825. finally:
  826. conn.close()
  827. payload = (_proc_payload if is_proc else _tools_payload)(case_id, target, rows)
  828. return {"versions": versions, "data": payload, "missing": payload is None}
  829. # ── 跨 query 去重 / link 复制(方案A:解构前先去重,避免重复花钱)──────────────
  830. # case_id 是帖子物理身份(platform_channelContentId),与 query 无关。同一帖被多个
  831. # query 搜到时只需真实解构一次;其余 query 用 link_* 复制行补齐关联(cost=0)。
  832. def latest_real_version(case_id, mode="process"):
  833. """该 case 是否已有「真实」解构(任意 query;link_* 是复制品,不算源)。
  834. 返回最新一行 {"version","query_id"} 或 None。给解构前去重判定用。"""
  835. table = _mode_table(mode)
  836. conn = _conn()
  837. try:
  838. with conn.cursor() as cur:
  839. cur.execute(f"""SELECT version, query_id FROM {table}
  840. WHERE case_id=%s AND LEFT(version,5) <> 'link_'
  841. ORDER BY id DESC LIMIT 1""", (case_id,))
  842. return cur.fetchone()
  843. finally:
  844. conn.close()
  845. def link_process(query_id, case_id, mode="process"):
  846. """把 case 在别处最新「真实」版本的解构行复制到目标 query
  847. (version='link_'+源版本, cost_usd=0)。幂等(先删目标同版本)。
  848. 返回复制行数;该 case 从未真实解构过则返回 0(无源可复制)。"""
  849. table = _mode_table(mode)
  850. conn = _conn()
  851. try:
  852. with conn.cursor() as cur:
  853. cur.execute(f"""SELECT version FROM {table}
  854. WHERE case_id=%s AND LEFT(version,5) <> 'link_'
  855. ORDER BY id DESC LIMIT 1""", (case_id,))
  856. r = cur.fetchone()
  857. if not r:
  858. return 0
  859. srcver = r["version"]
  860. newver = ("link_" + srcver)[:32] # version 列 VARCHAR(32)
  861. # 复制除自增 id / 时间戳外的全部列,改写 query_id / version / cost。
  862. cur.execute(f"SHOW COLUMNS FROM {table}")
  863. cols = [c["Field"] for c in cur.fetchall()
  864. if c["Field"] not in ("id", "created_at", "updated_at")]
  865. cur.execute(f"SELECT {','.join(cols)} FROM {table} WHERE case_id=%s AND version=%s",
  866. (case_id, srcver))
  867. rows = cur.fetchall()
  868. cur.execute(f"DELETE FROM {table} WHERE query_id=%s AND case_id=%s AND version=%s",
  869. (query_id, case_id, newver))
  870. for row in rows:
  871. row = dict(row)
  872. row["query_id"] = query_id
  873. row["version"] = newver
  874. row["cost_usd"] = 0
  875. cur.execute(
  876. f"INSERT INTO {table} ({','.join(cols)}) VALUES ({','.join(['%s']*len(cols))})",
  877. [row[k] for k in cols])
  878. return len(rows)
  879. finally:
  880. conn.close()
  881. # ── Dashboard 原始行(指标计算在 server.py)─────────────────────────────────────
  882. # 采纳判定只需「和内容制作知识相关」的得分,用 SQL JSON_EXTRACT 直取这一个标量,
  883. # 避免把整块 llm_evaluation(本库 ~1.5MB)拉到 Python 再解析。得分可能直接是数字,
  884. # 也可能裹在 {"得分": x} 里,COALESCE 两条路径覆盖两种存法,口径同 is_adopted。
  885. _REL_SQL = ("JSON_UNQUOTE(COALESCE("
  886. "JSON_EXTRACT(llm_evaluation,'$.\"相关性\".\"和内容制作知识相关\".\"得分\"'),"
  887. "JSON_EXTRACT(llm_evaluation,'$.\"相关性\".\"和内容制作知识相关\"')))")
  888. # 可复现/实现门槛标量直取(口径同 is_adopted 的 _repro_score):兼容新旧 schema——
  889. # 旧版「质量.固定维度.可复现性」,新版「质量.动态维度.工序.字段完整性.实现完整性」,COALESCE 依次回退。
  890. _REPRO_SQL = ("JSON_UNQUOTE(COALESCE("
  891. "JSON_EXTRACT(llm_evaluation,'$.\"质量\".\"固定维度\".\"可复现性\".\"得分\"'),"
  892. "JSON_EXTRACT(llm_evaluation,'$.\"质量\".\"固定维度\".\"可复现性\"'),"
  893. "JSON_EXTRACT(llm_evaluation,'$.\"质量\".\"动态维度\".\"工序\".\"字段完整性\".\"实现完整性\".\"得分\"'),"
  894. "JSON_EXTRACT(llm_evaluation,'$.\"质量\".\"动态维度\".\"工序\".\"字段完整性\".\"实现完整性\"')))")
  895. def fetch_adopted_process_cases(query_id=None):
  896. """返回「已采纳且有工序解构」的 case_id 列表(供知识上传脚本用)。
  897. 采纳是帖子级属性(评估存在 search_process),工序解构存在 mode_process,故二者 JOIN:
  898. 只取两边都有的 case,再用 is_adopted_rel(口径同 Dashboard)在 Python 侧过滤。
  899. relevance 得分由 _REL_SQL 直取标量,不传整块 llm_evaluation。
  900. query_id 给定时只看该搜索任务下的 case。返回去重、按 case_id 排序的列表。
  901. """
  902. sql = (f"SELECT DISTINCT s.case_id, s.overall_score, s.publish_time, "
  903. f"{_REL_SQL} AS rel, {_REPRO_SQL} AS repro "
  904. "FROM search_process s "
  905. "JOIN (SELECT DISTINCT case_id FROM mode_process) m ON s.case_id = m.case_id")
  906. params = ()
  907. if query_id:
  908. sql += " WHERE s.query_id=%s"
  909. params = (query_id,)
  910. conn = _conn()
  911. try:
  912. with conn.cursor() as cur:
  913. cur.execute(sql, params)
  914. rows = cur.fetchall()
  915. finally:
  916. conn.close()
  917. cases = [r["case_id"] for r in rows
  918. if is_adopted_rel(r["overall_score"], r["rel"], r["publish_time"], r["repro"])]
  919. return sorted(set(cases))
  920. def fetch_adopted_tools_cases(query_id=None):
  921. """返回「已采纳且有工具解构」的 case_id 列表(供工具知识上传脚本用)。
  922. 与 fetch_adopted_process_cases 完全同构,只把搜索/解构表换成工具方向:
  923. 采纳是帖子级属性(评估存在 search_tools),工具解构存在 mode_tools,故二者 JOIN,
  924. 只取两边都有的 case,再用 is_adopted_rel(口径同 Dashboard)在 Python 侧过滤。
  925. query_id 给定时只看该搜索任务下的 case。返回去重、按 case_id 排序的列表。
  926. """
  927. sql = (f"SELECT DISTINCT s.case_id, s.overall_score, s.publish_time, "
  928. f"{_REL_SQL} AS rel, {_REPRO_SQL} AS repro "
  929. "FROM search_tools s "
  930. "JOIN (SELECT DISTINCT case_id FROM mode_tools) m ON s.case_id = m.case_id")
  931. params = ()
  932. if query_id:
  933. sql += " WHERE s.query_id=%s"
  934. params = (query_id,)
  935. conn = _conn()
  936. try:
  937. with conn.cursor() as cur:
  938. cur.execute(sql, params)
  939. rows = cur.fetchall()
  940. finally:
  941. conn.close()
  942. cases = [r["case_id"] for r in rows
  943. if is_adopted_rel(r["overall_score"], r["rel"], r["publish_time"], r["repro"])]
  944. return sorted(set(cases))
  945. def route_tables(knowledge_types):
  946. """知识类型标签 → 落表列表(有序去重)。
  947. 工序/能力 → search_process;工具 → search_tools;两者都含写两表;空/None 兜底 search_process。
  948. 评估是统一一套(同一 llm_evaluation blob),故同帖落多表不重复打分,只是多写一行。"""
  949. kt = set(knowledge_types or [])
  950. tables = []
  951. if kt & {"工具"}:
  952. tables.append("search_tools")
  953. if (kt & {"工序", "能力"}) or not tables: # 工序/能力,或没命中任何已知标签 → 兜底 process
  954. tables.insert(0, "search_process")
  955. return tables
  956. # ── 评估去重:复用 query 无关分,只重算 query 相关分(search_eval.py 用)──────────
  957. def fetch_existing_eval(case_id, table="search_process"):
  958. """返回该 case 在搜索表里最近一条「有效」评估 blob(任意 query)。
  959. 评估去重用:同帖在别的相似 query 下评过时,复用其 query 无关分(质量/通用相关/时效),
  960. 只重算「和 query 相关」。无有效评估(全是 _error 或没评过)返回 None。
  961. 取最近若干条逐一挑出首个非 error、结构完整的 blob。"""
  962. table = _search_table(table)
  963. conn = _conn()
  964. try:
  965. with conn.cursor() as cur:
  966. cur.execute(f"""SELECT llm_evaluation FROM {table}
  967. WHERE case_id=%s AND llm_evaluation IS NOT NULL
  968. ORDER BY updated_at DESC, id DESC LIMIT 5""", (case_id,))
  969. rows = cur.fetchall()
  970. finally:
  971. conn.close()
  972. for r in rows:
  973. e = _loads(r["llm_evaluation"])
  974. if isinstance(e, dict) and not e.get("_error") and isinstance(e.get("相关性"), dict):
  975. return e
  976. return None
  977. def fetch_existing_eval_any(case_id):
  978. """跨两张搜索表找该 case 最近一条有效评估 blob。
  979. 评估与表无关(统一一套),任一表评过即可复用,避免同帖在两表各评一次。无则 None。"""
  980. for table in ("search_process", "search_tools"):
  981. e = fetch_existing_eval(case_id, table)
  982. if e:
  983. return e
  984. return None
  985. def update_post_eval(query_id, case_id, evaluation, table="search_process"):
  986. """用新的评估 blob 覆盖某 (query, case) 行的 llm_evaluation,并同步重算派生列
  987. overall_score、knowledge_type(口径同 upsert_search_posts)。返回受影响行数。"""
  988. table = _search_table(table)
  989. overall = overall_score(evaluation)
  990. ktype = evaluation.get("知识类型") if isinstance(evaluation, dict) else None
  991. conn = _conn()
  992. try:
  993. with conn.cursor() as cur:
  994. n = cur.execute(
  995. f"UPDATE {table} SET llm_evaluation=%s, overall_score=%s, knowledge_type=%s "
  996. "WHERE query_id=%s AND case_id=%s",
  997. (_j(evaluation), overall, _j(ktype), query_id, case_id))
  998. return n
  999. finally:
  1000. conn.close()
  1001. # ── 上传去重:知识库已导入台账(stages/import_process_knowledge.py 用)────────────────
  1002. def fetch_ingested_map(case_id):
  1003. """返回 {proc_index: version} —— 该 case 各工序已导入知识库的版本。空表示没传过。"""
  1004. conn = _conn()
  1005. try:
  1006. with conn.cursor() as cur:
  1007. cur.execute("SELECT proc_index, version FROM knowledge_ingest_log WHERE case_id=%s",
  1008. (case_id,))
  1009. return {r["proc_index"]: r["version"] for r in cur.fetchall()}
  1010. finally:
  1011. conn.close()
  1012. def mark_ingested(case_id, proc_index, version, knowledge_id=None, api_url=None):
  1013. """记一条「已导入」台账(case_id+proc_index 唯一,重导同序号则更新版本/knowledge_id)。"""
  1014. conn = _conn()
  1015. try:
  1016. with conn.cursor() as cur:
  1017. cur.execute("""INSERT INTO knowledge_ingest_log
  1018. (case_id, proc_index, version, knowledge_id, api_url)
  1019. VALUES (%s,%s,%s,%s,%s)
  1020. ON DUPLICATE KEY UPDATE version=VALUES(version),
  1021. knowledge_id=VALUES(knowledge_id), api_url=VALUES(api_url)""",
  1022. (case_id, proc_index, version, knowledge_id, api_url))
  1023. finally:
  1024. conn.close()
  1025. def fetch_tools_ingested_map(case_id):
  1026. """返回 {tool_index: version} —— 该 case 各工具已导入知识库的版本。空表示没传过。
  1027. 工具方向独立台账(tools_ingest_log),与工序的 knowledge_ingest_log 互不干扰。"""
  1028. conn = _conn()
  1029. try:
  1030. with conn.cursor() as cur:
  1031. cur.execute("SELECT tool_index, version FROM tools_ingest_log WHERE case_id=%s",
  1032. (case_id,))
  1033. return {r["tool_index"]: r["version"] for r in cur.fetchall()}
  1034. finally:
  1035. conn.close()
  1036. def mark_tools_ingested(case_id, tool_index, version, knowledge_id=None, api_url=None):
  1037. """记一条工具「已导入」台账(case_id+tool_index 唯一,重导同序号则更新版本/knowledge_id)。"""
  1038. conn = _conn()
  1039. try:
  1040. with conn.cursor() as cur:
  1041. cur.execute("""INSERT INTO tools_ingest_log
  1042. (case_id, tool_index, version, knowledge_id, api_url)
  1043. VALUES (%s,%s,%s,%s,%s)
  1044. ON DUPLICATE KEY UPDATE version=VALUES(version),
  1045. knowledge_id=VALUES(knowledge_id), api_url=VALUES(api_url)""",
  1046. (case_id, tool_index, version, knowledge_id, api_url))
  1047. finally:
  1048. conn.close()
  1049. def fetch_dashboard_rows():
  1050. """拉 Dashboard 计算所需的轻量行。数据量级:百~千行,Python 聚合足够。
  1051. 优化:① 不传 llm_evaluation 整块,SQL 只取采纳判定要的相关性得分;
  1052. ② steps 只取每个 case 的最新版本(覆盖度只看最新版),历史/link_ 版本不传 steps。"""
  1053. conn = _conn()
  1054. try:
  1055. with conn.cursor() as cur:
  1056. # 进度分母走「采纳」口径;mode 标方向(工序帖来自 search_process)。
  1057. cols = (f"query_id, case_id, platform, overall_score, publish_time, "
  1058. f"{_REL_SQL} AS rel, {_REPRO_SQL} AS repro")
  1059. cur.execute(f"SELECT {cols} FROM search_process")
  1060. posts = cur.fetchall()
  1061. for p in posts:
  1062. p["mode"] = "process"
  1063. cur.execute(f"SELECT {cols} FROM search_tools")
  1064. st = cur.fetchall()
  1065. for p in st:
  1066. p["mode"] = "tools"
  1067. posts += st
  1068. # 成本/耗时按全部版本计;steps 仅最新版需要 → 非最新版只回 NULL,省传输。
  1069. cur.execute("""SELECT p.id, p.case_id, p.version, p.cost_usd, p.duration_s, p.created_at,
  1070. CASE WHEN p.version = m.maxv THEN p.steps END AS steps
  1071. FROM mode_process p
  1072. JOIN (SELECT t.case_id, t.version AS maxv FROM mode_process t
  1073. JOIN (SELECT case_id, MAX(id) AS mid FROM mode_process
  1074. WHERE LEFT(version,5) <> 'link_' GROUP BY case_id) x
  1075. ON t.id = x.mid) m
  1076. ON p.case_id = m.case_id
  1077. ORDER BY p.id""")
  1078. procs = cur.fetchall()
  1079. cur.execute("""SELECT id, case_id, version, tool_name, substance_scope,
  1080. form_scope, cost_usd, duration_s, created_at
  1081. FROM mode_tools""")
  1082. tools = cur.fetchall()
  1083. finally:
  1084. conn.close()
  1085. for p in posts:
  1086. # 采纳判定:口径同帖子列表(is_adopted),作为「需解构」分母依据
  1087. p["adopted"] = is_adopted_rel(p["overall_score"], p["rel"], p["publish_time"], p["repro"])
  1088. for r in procs:
  1089. r["steps"] = _loads(r["steps"], [])
  1090. r["cost_usd"] = float(r["cost_usd"]) if r["cost_usd"] is not None else None
  1091. r["created_at"] = str(r["created_at"]) if r["created_at"] else None
  1092. for r in tools:
  1093. r["substance_scope"] = _loads(r["substance_scope"], [])
  1094. r["form_scope"] = _loads(r["form_scope"], [])
  1095. r["cost_usd"] = float(r["cost_usd"]) if r["cost_usd"] is not None else None
  1096. r["created_at"] = str(r["created_at"]) if r["created_at"] else None
  1097. return posts, procs, tools
  1098. def check():
  1099. conn = _conn()
  1100. try:
  1101. with conn.cursor() as cur:
  1102. for t in ("search_process", "search_tools", "mode_process", "mode_tools"):
  1103. cur.execute(f"SELECT COUNT(*) AS n FROM {t}")
  1104. print(f"{t}: {cur.fetchone()['n']} 行")
  1105. finally:
  1106. conn.close()
  1107. if __name__ == "__main__":
  1108. cmd = sys.argv[1] if len(sys.argv) > 1 else ""
  1109. if cmd == "init":
  1110. init_tables()
  1111. elif cmd == "check":
  1112. check()
  1113. elif cmd == "clear":
  1114. clear_tables()
  1115. else:
  1116. print("用法:\n python db.py init # 建表\n python db.py check # 四表行数\n python db.py clear # 清空四表数据")