database_runtime.py 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046
  1. from __future__ import annotations
  2. import json
  3. import os
  4. from dataclasses import dataclass
  5. from datetime import datetime, timezone
  6. from pathlib import Path
  7. from typing import Any, Callable
  8. import pymysql
  9. from content_agent.constants import DB_SCHEMA_VERSION
  10. ConnectionFactory = Callable[[], Any]
  11. FORBIDDEN_RAW_PAYLOAD_KEYS = {
  12. "password",
  13. "token",
  14. "access_token",
  15. "refresh_token",
  16. "api_key",
  17. "apikey",
  18. "secret",
  19. "dsn",
  20. "authorization",
  21. "cookie",
  22. "session",
  23. "credential",
  24. }
  25. RUNTIME_FILE_TABLES = {
  26. "source_context.json": "content_agent_source_contexts",
  27. "pattern_seed_pack.json": "content_agent_pattern_seed_packs",
  28. "search_queries.jsonl": "content_agent_queries",
  29. "discovered_content_items.jsonl": "content_agent_discovered_content_items",
  30. "content_media_records.jsonl": "content_agent_content_media_records",
  31. "pattern_recall_evidence.jsonl": "content_agent_pattern_recall_evidence",
  32. "rule_decisions.jsonl": "content_agent_rule_decisions",
  33. "walk_actions.jsonl": "content_agent_walk_actions",
  34. "run_events.jsonl": "content_agent_run_events",
  35. "source_path_records.jsonl": "content_agent_source_path_records",
  36. "search_clues.jsonl": "content_agent_search_clues",
  37. "final_output.json": "content_agent_final_outputs",
  38. "strategy_review.json": "content_agent_strategy_reviews",
  39. }
  40. JSON_COLUMNS_BY_TABLE = {
  41. "content_agent_source_contexts": {"evidence_pack", "source_context", "raw_demand_content"},
  42. "content_agent_pattern_seed_packs": {"itemset_ids", "seed_terms", "pattern_seed_pack"},
  43. "content_agent_queries": {"pattern_seed_ref", "raw_payload"},
  44. "content_agent_discovered_content_items": {
  45. "statistics",
  46. "tags",
  47. "text_extra",
  48. "source_evidence",
  49. "pattern_match_result",
  50. "content_audience_profile",
  51. "platform_raw_payload",
  52. "raw_payload",
  53. },
  54. "content_agent_content_media_records": {"raw_payload"},
  55. "content_agent_pattern_recall_evidence": {
  56. "matched_terms",
  57. "matched_category_paths",
  58. "decode_elements",
  59. "match_paths_request",
  60. "match_paths_response",
  61. "evidence_summary",
  62. "raw_payload",
  63. },
  64. "content_agent_rule_decisions": {
  65. "triggered_blocking_rules",
  66. "scorecard",
  67. "source_evidence",
  68. "decision_replay_data",
  69. "raw_payload",
  70. },
  71. "content_agent_walk_actions": {"raw_payload"},
  72. "content_agent_source_path_records": {"raw_payload"},
  73. "content_agent_search_clues": {"raw_payload"},
  74. "content_agent_run_events": {"raw_payload"},
  75. "content_agent_final_outputs": {"summary", "final_output"},
  76. "content_agent_publish_jobs": {"request_payload", "response_payload"},
  77. "content_agent_author_assets": {
  78. "content_tags",
  79. "profile_snapshot",
  80. "evidence_refs",
  81. "raw_payload",
  82. },
  83. "content_agent_author_asset_roles": {"raw_payload"},
  84. "content_agent_search_clue_assets": {"summary_metrics", "raw_payload"},
  85. "content_agent_search_clue_asset_evidence": {
  86. "source_path_record_ids",
  87. "decision_ids",
  88. "performance_feedback_refs",
  89. "raw_payload",
  90. },
  91. "content_agent_strategy_reviews": {
  92. "summary",
  93. "effective_search_queries",
  94. "weak_search_queries",
  95. "top_reject_reasons",
  96. "productive_paths",
  97. "suggestions",
  98. "raw_payload",
  99. },
  100. "content_agent_performance_feedback": {"raw_payload"},
  101. "content_agent_runs": {"source_ref", "error_detail"},
  102. "content_agent_policy_runs": {
  103. "strategy_source_ref",
  104. "rule_pack_source_ref",
  105. "metrics",
  106. "decision_summary",
  107. "raw_payload",
  108. },
  109. }
  110. DATETIME_COLUMNS = {"started_at", "completed_at", "created_at", "updated_at"}
  111. JSON_FILE_PAYLOAD_COLUMNS = {
  112. "source_context.json": "source_context",
  113. "pattern_seed_pack.json": "pattern_seed_pack",
  114. "final_output.json": "final_output",
  115. "strategy_review.json": "raw_payload",
  116. }
  117. JSONL_UPSERT_KEYS = {
  118. "search_queries.jsonl": ("run_id", "policy_run_id", "search_query_id"),
  119. "pattern_recall_evidence.jsonl": ("run_id", "policy_run_id", "recall_evidence_id"),
  120. "search_clues.jsonl": ("run_id", "policy_run_id", "clue_id"),
  121. "run_events.jsonl": ("run_id", "policy_run_id", "event_id"),
  122. }
  123. @dataclass(frozen=True)
  124. class ContentSupplyDbConfig:
  125. host: str
  126. port: int
  127. user: str
  128. password: str
  129. database: str
  130. timeout: int = 8
  131. @classmethod
  132. def from_env(
  133. cls,
  134. env_file: str | Path | None = ".env",
  135. ) -> "ContentSupplyDbConfig":
  136. env: dict[str, str] = _load_env_file(env_file)
  137. env.update({
  138. key: os.environ[key]
  139. for key in [
  140. "CONTENT_SUPPLY_DB_HOST",
  141. "CONTENT_SUPPLY_DB_PORT",
  142. "CONTENT_SUPPLY_DB_NAME",
  143. "CONTENT_SUPPLY_DB_USER",
  144. "CONTENT_SUPPLY_DB_PASSWORD",
  145. ]
  146. if os.environ.get(key)
  147. })
  148. required_keys = [
  149. "CONTENT_SUPPLY_DB_HOST",
  150. "CONTENT_SUPPLY_DB_PORT",
  151. "CONTENT_SUPPLY_DB_NAME",
  152. "CONTENT_SUPPLY_DB_USER",
  153. "CONTENT_SUPPLY_DB_PASSWORD",
  154. ]
  155. missing_keys = [key for key in required_keys if not env.get(key)]
  156. if missing_keys:
  157. raise ValueError(f"Missing required CFA db env keys: {', '.join(missing_keys)}")
  158. return cls(
  159. host=env["CONTENT_SUPPLY_DB_HOST"],
  160. port=int(env["CONTENT_SUPPLY_DB_PORT"]),
  161. user=env["CONTENT_SUPPLY_DB_USER"],
  162. password=env["CONTENT_SUPPLY_DB_PASSWORD"],
  163. database=env["CONTENT_SUPPLY_DB_NAME"],
  164. )
  165. def connect(self) -> Any:
  166. return pymysql.connect(
  167. host=self.host,
  168. port=self.port,
  169. user=self.user,
  170. password=self.password,
  171. database=self.database,
  172. charset="utf8mb4",
  173. cursorclass=pymysql.cursors.DictCursor,
  174. connect_timeout=self.timeout,
  175. read_timeout=self.timeout,
  176. write_timeout=self.timeout,
  177. )
  178. class DatabaseRuntimeStore:
  179. def __init__(
  180. self,
  181. config: ContentSupplyDbConfig,
  182. connection_factory: ConnectionFactory | None = None,
  183. ) -> None:
  184. self.config = config
  185. self._connection_factory = connection_factory or config.connect
  186. def prepare_run(self, run_id: str) -> Path:
  187. if self._run_has_runtime_records(run_id):
  188. raise FileExistsError(f"run already exists in database runtime tables: {run_id}")
  189. return self.run_dir(run_id)
  190. def run_dir(self, run_id: str) -> Path:
  191. return Path("database_runtime") / run_id
  192. def write_json(self, run_id: str, filename: str, data: dict[str, Any]) -> Path:
  193. table, record = _record_for_json(filename, data)
  194. if record["run_id"] != run_id:
  195. raise ValueError(f"{filename} run_id does not match runtime run_id")
  196. self._insert(table, record)
  197. return self.run_dir(run_id) / filename
  198. def update_json(self, run_id: str, filename: str, data: dict[str, Any]) -> Path:
  199. table, record = _record_for_json(filename, data)
  200. if record["run_id"] != run_id:
  201. raise ValueError(f"{filename} run_id does not match runtime run_id")
  202. if filename == "final_output.json":
  203. self._upsert(
  204. table,
  205. record,
  206. key_columns=("run_id", "policy_run_id", "output_version"),
  207. )
  208. return self.run_dir(run_id) / filename
  209. self._insert(table, record)
  210. return self.run_dir(run_id) / filename
  211. def append_jsonl(self, run_id: str, filename: str, rows: list[dict[str, Any]]) -> Path:
  212. table = _table_for_runtime_file(filename)
  213. # 整批共用一个连接、一次 commit:避免每行新建连接+commit 的 N 次网络往返。
  214. statements: list[tuple[str, list[Any]]] = []
  215. for row in rows:
  216. if row.get("run_id") != run_id:
  217. raise ValueError(f"{filename} row run_id does not match runtime run_id")
  218. record = _record_for_jsonl(filename, row)
  219. statements.append(
  220. self._row_sql(table, record, key_columns=JSONL_UPSERT_KEYS.get(filename))
  221. )
  222. if statements:
  223. with self._connection_factory() as conn:
  224. with conn.cursor() as cur:
  225. for sql, values in statements:
  226. cur.execute(sql, values)
  227. conn.commit()
  228. return self.run_dir(run_id) / filename
  229. def read_json(self, run_id: str, filename: str) -> dict[str, Any]:
  230. table = _table_for_runtime_file(filename)
  231. payload_column = JSON_FILE_PAYLOAD_COLUMNS.get(filename)
  232. if not payload_column:
  233. raise ValueError(f"{filename} is not a JSON runtime file")
  234. row = self._fetch_one(
  235. f"SELECT `{payload_column}` FROM `{table}` WHERE `run_id` = %s ORDER BY `id` DESC LIMIT 1",
  236. (run_id,),
  237. )
  238. if not row:
  239. raise FileNotFoundError(f"{filename} not found for run: {run_id}")
  240. payload = _decode_json_payload(row[payload_column]) or {}
  241. if payload_column == "raw_payload":
  242. return _runtime_payload(payload)
  243. return payload
  244. def read_jsonl(self, run_id: str, filename: str) -> list[dict[str, Any]]:
  245. table = _table_for_runtime_file(filename)
  246. rows = self._fetch_all(
  247. f"SELECT `raw_payload` FROM `{table}` WHERE `run_id` = %s ORDER BY `id`",
  248. (run_id,),
  249. )
  250. return [_runtime_payload(_decode_json_payload(row["raw_payload"]) or {}) for row in rows]
  251. def read_performance_feedback(
  252. self,
  253. run_id: str,
  254. policy_run_id: str,
  255. ) -> list[dict[str, Any]]:
  256. rows = self._fetch_all(
  257. "SELECT `raw_payload` FROM `content_agent_performance_feedback` "
  258. "WHERE `run_id` = %s AND `policy_run_id` = %s ORDER BY `id`",
  259. (run_id, policy_run_id),
  260. )
  261. return [_runtime_payload(_decode_json_payload(row["raw_payload"]) or {}) for row in rows]
  262. def file_status(self, run_id: str) -> dict[str, bool]:
  263. return {
  264. filename: self._runtime_file_exists(run_id, filename)
  265. for filename in RUNTIME_FILE_TABLES
  266. }
  267. def create_run_record(self, record: dict[str, Any]) -> None:
  268. self._insert("content_agent_runs", _run_record(record))
  269. def update_run_record(self, run_id: str, updates: dict[str, Any]) -> None:
  270. if not updates:
  271. return
  272. record = _sanitize_record("content_agent_runs", updates)
  273. if not record:
  274. return
  275. assignments = ", ".join(f"`{column}` = %s" for column in record)
  276. params = [
  277. _db_value("content_agent_runs", column, value)
  278. for column, value in record.items()
  279. ]
  280. params.append(run_id)
  281. with self._connection_factory() as conn:
  282. with conn.cursor() as cur:
  283. cur.execute(
  284. f"UPDATE `content_agent_runs` SET {assignments} WHERE `run_id` = %s",
  285. params,
  286. )
  287. conn.commit()
  288. def record_policy_run(self, record: dict[str, Any]) -> None:
  289. self._insert("content_agent_policy_runs", _policy_run_record(record))
  290. def append_run_event_records(
  291. self,
  292. run_id: str,
  293. policy_run_id: str,
  294. rows: list[dict[str, Any]],
  295. ) -> None:
  296. prepared_rows = [
  297. {**row, "run_id": run_id, "policy_run_id": row.get("policy_run_id", policy_run_id)}
  298. for row in rows
  299. ]
  300. self.append_jsonl(run_id, "run_events.jsonl", prepared_rows)
  301. def write_publish_jobs(
  302. self,
  303. run_id: str,
  304. policy_run_id: str,
  305. rows: list[dict[str, Any]],
  306. ) -> None:
  307. for row in rows:
  308. record = {
  309. **row,
  310. "run_id": run_id,
  311. "policy_run_id": row.get("policy_run_id", policy_run_id),
  312. }
  313. self._upsert(
  314. "content_agent_publish_jobs",
  315. _with_db_schema(record),
  316. key_columns=("run_id", "policy_run_id", "publish_job_id"),
  317. )
  318. def write_author_assets(self, rows: list[dict[str, Any]]) -> None:
  319. for row in rows:
  320. self._upsert(
  321. "content_agent_author_assets",
  322. _with_db_schema(row),
  323. key_columns=("author_asset_id",),
  324. )
  325. def write_author_asset_roles(self, rows: list[dict[str, Any]]) -> None:
  326. for row in rows:
  327. self._upsert(
  328. "content_agent_author_asset_roles",
  329. _with_db_schema(row),
  330. key_columns=("author_asset_id", "role"),
  331. )
  332. def write_search_clue_assets(self, rows: list[dict[str, Any]]) -> None:
  333. for row in rows:
  334. self._upsert(
  335. "content_agent_search_clue_assets",
  336. _with_db_schema(row),
  337. key_columns=("search_clue_asset_id",),
  338. )
  339. def write_search_clue_asset_evidence(self, rows: list[dict[str, Any]]) -> None:
  340. for row in rows:
  341. self._upsert(
  342. "content_agent_search_clue_asset_evidence",
  343. _with_db_schema(row),
  344. key_columns=("run_id", "policy_run_id", "clue_id"),
  345. )
  346. def _row_sql(
  347. self,
  348. table: str,
  349. record: dict[str, Any],
  350. key_columns: tuple[str, ...] | None = None,
  351. ) -> tuple[str, list[Any]]:
  352. sanitized = _sanitize_record(table, record)
  353. columns = list(sanitized)
  354. placeholders = ", ".join(["%s"] * len(columns))
  355. column_sql = ", ".join(f"`{column}`" for column in columns)
  356. values = [
  357. _db_value(table, column, sanitized[column])
  358. for column in columns
  359. ]
  360. sql = f"INSERT INTO `{table}` ({column_sql}) VALUES ({placeholders})"
  361. if key_columns:
  362. update_columns = [column for column in columns if column not in key_columns]
  363. assignments = ", ".join(f"`{column}` = VALUES(`{column}`)" for column in update_columns)
  364. if assignments:
  365. sql += f" ON DUPLICATE KEY UPDATE {assignments}"
  366. return sql, values
  367. def _insert(self, table: str, record: dict[str, Any]) -> None:
  368. sql, values = self._row_sql(table, record)
  369. with self._connection_factory() as conn:
  370. with conn.cursor() as cur:
  371. cur.execute(sql, values)
  372. conn.commit()
  373. def _upsert(
  374. self,
  375. table: str,
  376. record: dict[str, Any],
  377. key_columns: tuple[str, ...],
  378. ) -> None:
  379. sql, values = self._row_sql(table, record, key_columns=key_columns)
  380. with self._connection_factory() as conn:
  381. with conn.cursor() as cur:
  382. cur.execute(sql, values)
  383. conn.commit()
  384. def _fetch_one(self, sql: str, params: tuple[Any, ...]) -> dict[str, Any] | None:
  385. with self._connection_factory() as conn:
  386. with conn.cursor() as cur:
  387. cur.execute(sql, params)
  388. return cur.fetchone()
  389. def _fetch_all(self, sql: str, params: tuple[Any, ...]) -> list[dict[str, Any]]:
  390. with self._connection_factory() as conn:
  391. with conn.cursor() as cur:
  392. cur.execute(sql, params)
  393. return list(cur.fetchall())
  394. def _run_has_runtime_records(self, run_id: str) -> bool:
  395. tables = [
  396. "content_agent_runs",
  397. "content_agent_policy_runs",
  398. *RUNTIME_FILE_TABLES.values(),
  399. ]
  400. for table in tables:
  401. row = self._fetch_one(
  402. f"SELECT COUNT(*) AS cnt FROM `{table}` WHERE `run_id` = %s",
  403. (run_id,),
  404. )
  405. if row and row.get("cnt", 0):
  406. return True
  407. return False
  408. def _runtime_file_exists(self, run_id: str, filename: str) -> bool:
  409. table = _table_for_runtime_file(filename)
  410. row = self._fetch_one(
  411. f"SELECT COUNT(*) AS cnt FROM `{table}` WHERE `run_id` = %s",
  412. (run_id,),
  413. )
  414. return bool(row and row.get("cnt", 0))
  415. def _record_for_json(filename: str, data: dict[str, Any]) -> tuple[str, dict[str, Any]]:
  416. table = _table_for_runtime_file(filename)
  417. if filename == "source_context.json":
  418. evidence_pack = data.get("ext_data", {}).get("evidence_pack") or {}
  419. return table, {
  420. "schema_version": DB_SCHEMA_VERSION,
  421. "run_id": data["run_id"],
  422. "demand_content_id": _int_or_none(data.get("demand_content_id")),
  423. "pattern_source_system": evidence_pack.get("pattern_source_system"),
  424. "source_kind": evidence_pack.get("source_kind"),
  425. "source_post_id": evidence_pack.get("source_post_id"),
  426. "pattern_execution_id": evidence_pack.get("pattern_execution_id"),
  427. "mining_config_id": evidence_pack.get("mining_config_id"),
  428. "evidence_pack": evidence_pack,
  429. "source_context": data,
  430. "raw_demand_content": data.get("raw_demand_content"),
  431. }
  432. if filename == "pattern_seed_pack.json":
  433. return table, {
  434. "schema_version": DB_SCHEMA_VERSION,
  435. "run_id": data["run_id"],
  436. "policy_run_id": data["policy_run_id"],
  437. "source_post_id": data.get("source_post_id"),
  438. "pattern_execution_id": data.get("pattern_execution_id"),
  439. "itemset_ids": _itemset_ids_from_seed_pack(data),
  440. "seed_terms": data.get("seed_terms"),
  441. "pattern_seed_pack": data,
  442. }
  443. if filename == "final_output.json":
  444. return table, {
  445. "schema_version": DB_SCHEMA_VERSION,
  446. "run_id": data["run_id"],
  447. "policy_run_id": data["policy_run_id"],
  448. "output_version": data.get("output_version", "v1"),
  449. "summary": data.get("summary"),
  450. "final_output": data,
  451. "validation_status": data.get("validation_status"),
  452. }
  453. if filename == "strategy_review.json":
  454. return table, {
  455. "schema_version": DB_SCHEMA_VERSION,
  456. "run_id": data["run_id"],
  457. "policy_run_id": data["policy_run_id"],
  458. "review_id": data["review_id"],
  459. "review_status": data.get("review_status", "generated"),
  460. "summary": data.get("summary"),
  461. "effective_search_queries": data.get("effective_search_queries"),
  462. "weak_search_queries": data.get("weak_search_queries"),
  463. "top_reject_reasons": data.get("top_reject_reasons"),
  464. "productive_paths": data.get("productive_paths"),
  465. "suggestions": data.get("suggestions"),
  466. "raw_payload": data.get("raw_payload", data),
  467. }
  468. raise ValueError(f"unsupported JSON runtime file: {filename}")
  469. def _record_for_jsonl(filename: str, row: dict[str, Any]) -> dict[str, Any]:
  470. if filename == "search_queries.jsonl":
  471. return _with_db_schema(row)
  472. if filename == "discovered_content_items.jsonl":
  473. return _with_db_schema(row)
  474. if filename == "content_media_records.jsonl":
  475. return _with_db_schema(row)
  476. if filename == "pattern_recall_evidence.jsonl":
  477. return _with_db_schema(row)
  478. if filename == "rule_decisions.jsonl":
  479. return _with_db_schema(row)
  480. if filename == "walk_actions.jsonl":
  481. return _with_db_schema(row)
  482. if filename == "run_events.jsonl":
  483. return _with_db_schema(row)
  484. if filename == "source_path_records.jsonl":
  485. return _with_db_schema(row)
  486. if filename == "search_clues.jsonl":
  487. return _with_db_schema(row)
  488. raise ValueError(f"unsupported JSONL runtime file: {filename}")
  489. def _run_record(record: dict[str, Any]) -> dict[str, Any]:
  490. return _with_db_schema(record)
  491. def _policy_run_record(record: dict[str, Any]) -> dict[str, Any]:
  492. return _with_db_schema(record)
  493. def _with_db_schema(record: dict[str, Any]) -> dict[str, Any]:
  494. return {**record, "schema_version": DB_SCHEMA_VERSION}
  495. def _sanitize_record(table: str, record: dict[str, Any]) -> dict[str, Any]:
  496. result = {
  497. column: value
  498. for column, value in record.items()
  499. if column in _allowed_columns(table) and value is not None
  500. }
  501. raw_payload = result.get("raw_payload")
  502. if isinstance(raw_payload, dict):
  503. _assert_no_forbidden_raw_payload_keys(raw_payload)
  504. return result
  505. def _allowed_columns(table: str) -> set[str]:
  506. return TABLE_COLUMNS[table]
  507. TABLE_COLUMNS = {
  508. "content_agent_source_contexts": {
  509. "schema_version",
  510. "run_id",
  511. "demand_content_id",
  512. "pattern_source_system",
  513. "source_kind",
  514. "source_post_id",
  515. "pattern_execution_id",
  516. "mining_config_id",
  517. "evidence_pack",
  518. "source_context",
  519. "raw_demand_content",
  520. "created_at",
  521. },
  522. "content_agent_pattern_seed_packs": {
  523. "schema_version",
  524. "run_id",
  525. "policy_run_id",
  526. "source_post_id",
  527. "pattern_execution_id",
  528. "itemset_ids",
  529. "seed_terms",
  530. "pattern_seed_pack",
  531. "created_at",
  532. },
  533. "content_agent_queries": {
  534. "schema_version",
  535. "run_id",
  536. "policy_run_id",
  537. "search_query_id",
  538. "search_query",
  539. "search_query_generation_method",
  540. "discovery_start_source",
  541. "previous_discovery_step",
  542. "search_query_effect_status",
  543. "pattern_seed_ref",
  544. "raw_payload",
  545. "created_at",
  546. },
  547. "content_agent_discovered_content_items": {
  548. "schema_version",
  549. "run_id",
  550. "policy_run_id",
  551. "content_discovery_id",
  552. "search_query_id",
  553. "platform",
  554. "platform_content_id",
  555. "platform_content_format",
  556. "platform_content_url",
  557. "description",
  558. "platform_author_id",
  559. "author_display_name",
  560. "discovery_start_source",
  561. "previous_discovery_step",
  562. "statistics",
  563. "tags",
  564. "text_extra",
  565. "source_evidence",
  566. "pattern_match_result",
  567. "content_audience_profile",
  568. "platform_raw_payload",
  569. "raw_payload",
  570. "created_at",
  571. },
  572. "content_agent_content_media_records": {
  573. "schema_version",
  574. "run_id",
  575. "policy_run_id",
  576. "platform",
  577. "platform_content_id",
  578. "content_media_status",
  579. "content_metadata_source",
  580. "play_url",
  581. "local_path",
  582. "oss_url",
  583. "raw_payload",
  584. "created_at",
  585. },
  586. "content_agent_pattern_recall_evidence": {
  587. "schema_version",
  588. "run_id",
  589. "policy_run_id",
  590. "recall_evidence_id",
  591. "content_discovery_id",
  592. "platform_content_id",
  593. "decode_status",
  594. "decode_task_id",
  595. "recall_status",
  596. "matched_terms",
  597. "matched_category_paths",
  598. "decode_elements",
  599. "match_paths_request",
  600. "match_paths_response",
  601. "evidence_summary",
  602. "raw_payload",
  603. "created_at",
  604. },
  605. "content_agent_rule_decisions": {
  606. "schema_version",
  607. "run_id",
  608. "policy_run_id",
  609. "decision_id",
  610. "policy_bundle_id",
  611. "rule_pack_id",
  612. "rule_pack_version",
  613. "strategy_version",
  614. "decision_target_type",
  615. "decision_target_id",
  616. "decision_action",
  617. "decision_reason_code",
  618. "search_query_effect_status",
  619. "score",
  620. "age_50_plus_level",
  621. "triggered_blocking_rules",
  622. "scorecard",
  623. "source_evidence",
  624. "decision_replay_data",
  625. "raw_payload",
  626. "created_at",
  627. },
  628. "content_agent_walk_actions": {
  629. "schema_version",
  630. "run_id",
  631. "policy_run_id",
  632. "walk_action_id",
  633. "edge_id",
  634. "edge_type",
  635. "from_node_type",
  636. "from_node_id",
  637. "to_node_type",
  638. "to_node_id",
  639. "walk_action",
  640. "walk_status",
  641. "budget_tier",
  642. "depth",
  643. "page_cursor",
  644. "next_cursor",
  645. "decision_id",
  646. "rule_pack_id",
  647. "rule_pack_version",
  648. "reason_code",
  649. "source_path_record_id",
  650. "raw_payload",
  651. "created_at",
  652. "updated_at",
  653. },
  654. "content_agent_source_path_records": {
  655. "schema_version",
  656. "run_id",
  657. "policy_run_id",
  658. "source_path_record_id",
  659. "source_path_type",
  660. "from_node_type",
  661. "from_node_id",
  662. "to_node_type",
  663. "to_node_id",
  664. "decision_id",
  665. "rule_pack_id",
  666. "discovery_start_source",
  667. "previous_discovery_step",
  668. "origin_path_id",
  669. "source_evidence_ref",
  670. "raw_payload",
  671. "created_at",
  672. },
  673. "content_agent_search_clues": {
  674. "schema_version",
  675. "run_id",
  676. "policy_run_id",
  677. "clue_id",
  678. "search_query_id",
  679. "search_query",
  680. "discovery_start_source",
  681. "previous_discovery_step",
  682. "result_count",
  683. "pooled_content_count",
  684. "review_content_count",
  685. "pending_content_count",
  686. "rejected_content_count",
  687. "search_query_effect_status",
  688. "walk_next_step",
  689. "raw_payload",
  690. "created_at",
  691. },
  692. "content_agent_run_events": {
  693. "schema_version",
  694. "run_id",
  695. "policy_run_id",
  696. "event_id",
  697. "event_type",
  698. "status",
  699. "input_ref",
  700. "output_ref",
  701. "error_code",
  702. "message",
  703. "raw_payload",
  704. "created_at",
  705. },
  706. "content_agent_final_outputs": {
  707. "schema_version",
  708. "run_id",
  709. "policy_run_id",
  710. "output_version",
  711. "summary",
  712. "final_output",
  713. "validation_status",
  714. "created_at",
  715. "updated_at",
  716. },
  717. "content_agent_publish_jobs": {
  718. "schema_version",
  719. "run_id",
  720. "policy_run_id",
  721. "publish_job_id",
  722. "platform_content_id",
  723. "job_status",
  724. "trigger_mode",
  725. "crawler_plan_id",
  726. "produce_plan_id",
  727. "publish_plan_id",
  728. "request_payload",
  729. "response_payload",
  730. "error_code",
  731. "error_message",
  732. "created_at",
  733. "updated_at",
  734. },
  735. "content_agent_author_assets": {
  736. "schema_version",
  737. "author_asset_id",
  738. "platform",
  739. "platform_author_id",
  740. "author_display_name",
  741. "author_profile_url",
  742. "asset_status",
  743. "source_type",
  744. "validation_status",
  745. "eligible_as_source",
  746. "elderly_ratio",
  747. "elderly_tgi",
  748. "content_tags",
  749. "source_run_id",
  750. "source_policy_run_id",
  751. "last_profile_fetch_at",
  752. "last_works_fetch_at",
  753. "last_validated_at",
  754. "profile_snapshot",
  755. "evidence_refs",
  756. "raw_payload",
  757. "created_at",
  758. "updated_at",
  759. },
  760. "content_agent_author_asset_roles": {
  761. "schema_version",
  762. "author_asset_id",
  763. "role",
  764. "role_status",
  765. "role_reason_code",
  766. "assigned_by",
  767. "source_run_id",
  768. "raw_payload",
  769. "created_at",
  770. "updated_at",
  771. },
  772. "content_agent_strategy_reviews": {
  773. "schema_version",
  774. "run_id",
  775. "policy_run_id",
  776. "review_id",
  777. "review_status",
  778. "summary",
  779. "effective_search_queries",
  780. "weak_search_queries",
  781. "top_reject_reasons",
  782. "productive_paths",
  783. "suggestions",
  784. "raw_payload",
  785. "created_at",
  786. },
  787. "content_agent_performance_feedback": {
  788. "schema_version",
  789. "run_id",
  790. "policy_run_id",
  791. "feedback_id",
  792. "platform",
  793. "platform_content_id",
  794. "content_asset_id",
  795. "feedback_source",
  796. "feedback_status",
  797. "metric_window_start",
  798. "metric_window_end",
  799. "completion_rate",
  800. "share_rate",
  801. "average_watch_seconds",
  802. "total_watch_seconds",
  803. "impression_count",
  804. "play_count",
  805. "like_count",
  806. "comment_count",
  807. "share_count",
  808. "collect_count",
  809. "raw_payload",
  810. "created_at",
  811. "updated_at",
  812. },
  813. "content_agent_search_clue_assets": {
  814. "schema_version",
  815. "search_clue_asset_id",
  816. "platform",
  817. "clue_type",
  818. "normalized_clue_text",
  819. "display_clue_text",
  820. "promotion_status",
  821. "reusable_priority",
  822. "can_seed_next_run",
  823. "first_seen_run_id",
  824. "first_seen_policy_run_id",
  825. "last_validated_at",
  826. "summary_metrics",
  827. "raw_payload",
  828. "created_at",
  829. "updated_at",
  830. },
  831. "content_agent_search_clue_asset_evidence": {
  832. "schema_version",
  833. "evidence_id",
  834. "search_clue_asset_id",
  835. "run_id",
  836. "policy_run_id",
  837. "clue_id",
  838. "search_query_id",
  839. "pooled_content_count",
  840. "review_content_count",
  841. "failed_content_count",
  842. "source_path_record_ids",
  843. "decision_ids",
  844. "performance_feedback_refs",
  845. "raw_payload",
  846. "created_at",
  847. },
  848. "content_agent_runs": {
  849. "schema_version",
  850. "run_id",
  851. "demand_content_id",
  852. "run_label",
  853. "platform",
  854. "platform_mode",
  855. "strategy_version",
  856. "status",
  857. "current_step",
  858. "validation_status",
  859. "source_ref",
  860. "error_code",
  861. "error_message",
  862. "error_detail",
  863. "started_at",
  864. "completed_at",
  865. "created_at",
  866. "updated_at",
  867. },
  868. "content_agent_policy_runs": {
  869. "schema_version",
  870. "run_id",
  871. "policy_run_id",
  872. "experiment_name",
  873. "run_role",
  874. "policy_bundle_id",
  875. "rule_pack_id",
  876. "strategy_id",
  877. "strategy_version",
  878. "rule_pack_version",
  879. "walk_strategy_version",
  880. "policy_bundle_hash",
  881. "strategy_source_ref",
  882. "rule_pack_source_ref",
  883. "evidence_bundle_schema_version",
  884. "runtime_record_schema_version",
  885. "status",
  886. "metrics",
  887. "decision_summary",
  888. "raw_payload",
  889. "created_at",
  890. },
  891. }
  892. def _table_for_runtime_file(filename: str) -> str:
  893. try:
  894. return RUNTIME_FILE_TABLES[filename]
  895. except KeyError as exc:
  896. raise ValueError(f"unsupported runtime file: {filename}") from exc
  897. def _db_value(table: str, column: str, value: Any) -> Any:
  898. if column in JSON_COLUMNS_BY_TABLE.get(table, set()):
  899. return _json_dump(value)
  900. if column in DATETIME_COLUMNS:
  901. return _datetime_value(value)
  902. return value
  903. def _json_dump(value: Any) -> str | None:
  904. if value is None:
  905. return None
  906. return json.dumps(value, ensure_ascii=False, separators=(",", ":"))
  907. def _decode_json_payload(value: Any) -> Any:
  908. if value is None:
  909. return None
  910. if isinstance(value, (dict, list)):
  911. return value
  912. return json.loads(value)
  913. def _runtime_payload(payload: dict[str, Any]) -> dict[str, Any]:
  914. if "raw_payload" in payload:
  915. return payload
  916. return {**payload, "raw_payload": dict(payload)}
  917. def _datetime_value(value: Any) -> Any:
  918. if value is None or isinstance(value, datetime):
  919. return value
  920. if isinstance(value, str):
  921. try:
  922. parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
  923. except ValueError:
  924. return value
  925. if parsed.tzinfo:
  926. parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
  927. return parsed
  928. return value
  929. def _itemset_ids_from_seed_pack(data: dict[str, Any]) -> list[Any]:
  930. if data.get("itemset_ids"):
  931. return list(data["itemset_ids"])
  932. itemsets = data.get("itemsets") or []
  933. ids = [
  934. itemset.get("itemset_id")
  935. for itemset in itemsets
  936. if isinstance(itemset, dict) and itemset.get("itemset_id") is not None
  937. ]
  938. return list(dict.fromkeys(ids))
  939. def _int_or_none(value: Any) -> int | None:
  940. if value in (None, ""):
  941. return None
  942. return int(value)
  943. def _assert_no_forbidden_raw_payload_keys(payload: Any) -> None:
  944. if isinstance(payload, list):
  945. for item in payload:
  946. _assert_no_forbidden_raw_payload_keys(item)
  947. return
  948. if not isinstance(payload, dict):
  949. return
  950. for key, value in payload.items():
  951. lowered = str(key).lower()
  952. if lowered in FORBIDDEN_RAW_PAYLOAD_KEYS:
  953. raise ValueError(f"raw_payload contains forbidden key: {key}")
  954. _assert_no_forbidden_raw_payload_keys(value)
  955. def _load_env_file(path_value: str | Path | None) -> dict[str, str]:
  956. if not path_value:
  957. return {}
  958. path = Path(path_value)
  959. if not path.exists():
  960. return {}
  961. result: dict[str, str] = {}
  962. for line in path.read_text(encoding="utf-8").splitlines():
  963. stripped = line.strip()
  964. if not stripped or stripped.startswith("#") or "=" not in stripped:
  965. continue
  966. key, value = stripped.split("=", 1)
  967. result[key.strip()] = value.strip().strip('"').strip("'")
  968. return result