database_runtime.py 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939
  1. from __future__ import annotations
  2. import json
  3. import os
  4. from dataclasses import dataclass
  5. from datetime import datetime, timezone
  6. from pathlib import Path
  7. from typing import Any, Callable
  8. import pymysql
  9. from content_agent.constants import DB_SCHEMA_VERSION
  10. ConnectionFactory = Callable[[], Any]
  11. FORBIDDEN_RAW_PAYLOAD_KEYS = {
  12. "password",
  13. "token",
  14. "access_token",
  15. "refresh_token",
  16. "api_key",
  17. "apikey",
  18. "secret",
  19. "dsn",
  20. "authorization",
  21. "cookie",
  22. "session",
  23. "credential",
  24. }
  25. RUNTIME_FILE_TABLES = {
  26. "source_context.json": "content_agent_source_contexts",
  27. "pattern_seed_pack.json": "content_agent_pattern_seed_packs",
  28. "search_queries.jsonl": "content_agent_queries",
  29. "discovered_content_items.jsonl": "content_agent_discovered_content_items",
  30. "content_media_records.jsonl": "content_agent_content_media_records",
  31. "pattern_recall_evidence.jsonl": "content_agent_pattern_recall_evidence",
  32. "rule_decisions.jsonl": "content_agent_rule_decisions",
  33. "walk_actions.jsonl": "content_agent_walk_actions",
  34. "run_events.jsonl": "content_agent_run_events",
  35. "source_path_records.jsonl": "content_agent_source_path_records",
  36. "search_clues.jsonl": "content_agent_search_clues",
  37. "final_output.json": "content_agent_final_outputs",
  38. "strategy_review.json": "content_agent_strategy_reviews",
  39. }
  40. JSON_COLUMNS_BY_TABLE = {
  41. "content_agent_source_contexts": {"evidence_pack", "source_context", "raw_demand_content"},
  42. "content_agent_pattern_seed_packs": {"itemset_ids", "seed_terms", "pattern_seed_pack"},
  43. "content_agent_queries": {"pattern_seed_ref", "raw_payload"},
  44. "content_agent_discovered_content_items": {
  45. "statistics",
  46. "tags",
  47. "text_extra",
  48. "source_evidence",
  49. "pattern_match_result",
  50. "content_audience_profile",
  51. "platform_raw_payload",
  52. "raw_payload",
  53. },
  54. "content_agent_content_media_records": {"raw_payload"},
  55. "content_agent_pattern_recall_evidence": {
  56. "matched_terms",
  57. "matched_category_paths",
  58. "decode_elements",
  59. "match_paths_request",
  60. "match_paths_response",
  61. "evidence_summary",
  62. "raw_payload",
  63. },
  64. "content_agent_rule_decisions": {
  65. "triggered_blocking_rules",
  66. "scorecard",
  67. "source_evidence",
  68. "decision_replay_data",
  69. "raw_payload",
  70. },
  71. "content_agent_walk_actions": {"raw_payload"},
  72. "content_agent_source_path_records": {"raw_payload"},
  73. "content_agent_search_clues": {"raw_payload"},
  74. "content_agent_run_events": {"raw_payload"},
  75. "content_agent_final_outputs": {"summary", "final_output"},
  76. "content_agent_publish_jobs": {"request_payload", "response_payload"},
  77. "content_agent_author_assets": {
  78. "content_tags",
  79. "profile_snapshot",
  80. "evidence_refs",
  81. "raw_payload",
  82. },
  83. "content_agent_author_asset_roles": {"raw_payload"},
  84. "content_agent_strategy_reviews": {
  85. "summary",
  86. "effective_search_queries",
  87. "weak_search_queries",
  88. "top_reject_reasons",
  89. "productive_paths",
  90. "suggestions",
  91. "raw_payload",
  92. },
  93. "content_agent_runs": {"source_ref", "error_detail"},
  94. "content_agent_policy_runs": {
  95. "strategy_source_ref",
  96. "rule_pack_source_ref",
  97. "metrics",
  98. "decision_summary",
  99. "raw_payload",
  100. },
  101. }
  102. DATETIME_COLUMNS = {"started_at", "completed_at", "created_at", "updated_at"}
  103. JSON_FILE_PAYLOAD_COLUMNS = {
  104. "source_context.json": "source_context",
  105. "pattern_seed_pack.json": "pattern_seed_pack",
  106. "final_output.json": "final_output",
  107. "strategy_review.json": "raw_payload",
  108. }
  109. @dataclass(frozen=True)
  110. class ContentSupplyDbConfig:
  111. host: str
  112. port: int
  113. user: str
  114. password: str
  115. database: str
  116. timeout: int = 8
  117. @classmethod
  118. def from_env(
  119. cls,
  120. env_file: str | Path | None = ".env",
  121. ) -> "ContentSupplyDbConfig":
  122. env: dict[str, str] = _load_env_file(env_file)
  123. env.update({
  124. key: os.environ[key]
  125. for key in [
  126. "CONTENT_SUPPLY_DB_HOST",
  127. "CONTENT_SUPPLY_DB_PORT",
  128. "CONTENT_SUPPLY_DB_NAME",
  129. "CONTENT_SUPPLY_DB_USER",
  130. "CONTENT_SUPPLY_DB_PASSWORD",
  131. ]
  132. if os.environ.get(key)
  133. })
  134. required_keys = [
  135. "CONTENT_SUPPLY_DB_HOST",
  136. "CONTENT_SUPPLY_DB_PORT",
  137. "CONTENT_SUPPLY_DB_NAME",
  138. "CONTENT_SUPPLY_DB_USER",
  139. "CONTENT_SUPPLY_DB_PASSWORD",
  140. ]
  141. missing_keys = [key for key in required_keys if not env.get(key)]
  142. if missing_keys:
  143. raise ValueError(f"Missing required CFA db env keys: {', '.join(missing_keys)}")
  144. return cls(
  145. host=env["CONTENT_SUPPLY_DB_HOST"],
  146. port=int(env["CONTENT_SUPPLY_DB_PORT"]),
  147. user=env["CONTENT_SUPPLY_DB_USER"],
  148. password=env["CONTENT_SUPPLY_DB_PASSWORD"],
  149. database=env["CONTENT_SUPPLY_DB_NAME"],
  150. )
  151. def connect(self) -> Any:
  152. return pymysql.connect(
  153. host=self.host,
  154. port=self.port,
  155. user=self.user,
  156. password=self.password,
  157. database=self.database,
  158. charset="utf8mb4",
  159. cursorclass=pymysql.cursors.DictCursor,
  160. connect_timeout=self.timeout,
  161. read_timeout=self.timeout,
  162. write_timeout=self.timeout,
  163. )
  164. class DatabaseRuntimeStore:
  165. def __init__(
  166. self,
  167. config: ContentSupplyDbConfig,
  168. connection_factory: ConnectionFactory | None = None,
  169. ) -> None:
  170. self.config = config
  171. self._connection_factory = connection_factory or config.connect
  172. def prepare_run(self, run_id: str) -> Path:
  173. if self._run_has_runtime_records(run_id):
  174. raise FileExistsError(f"run already exists in database runtime tables: {run_id}")
  175. return self.run_dir(run_id)
  176. def run_dir(self, run_id: str) -> Path:
  177. return Path("database_runtime") / run_id
  178. def write_json(self, run_id: str, filename: str, data: dict[str, Any]) -> Path:
  179. table, record = _record_for_json(filename, data)
  180. if record["run_id"] != run_id:
  181. raise ValueError(f"{filename} run_id does not match runtime run_id")
  182. self._insert(table, record)
  183. return self.run_dir(run_id) / filename
  184. def update_json(self, run_id: str, filename: str, data: dict[str, Any]) -> Path:
  185. table, record = _record_for_json(filename, data)
  186. if record["run_id"] != run_id:
  187. raise ValueError(f"{filename} run_id does not match runtime run_id")
  188. if filename == "final_output.json":
  189. self._upsert(
  190. table,
  191. record,
  192. key_columns=("run_id", "policy_run_id", "output_version"),
  193. )
  194. return self.run_dir(run_id) / filename
  195. self._insert(table, record)
  196. return self.run_dir(run_id) / filename
  197. def append_jsonl(self, run_id: str, filename: str, rows: list[dict[str, Any]]) -> Path:
  198. table = _table_for_runtime_file(filename)
  199. for row in rows:
  200. if row.get("run_id") != run_id:
  201. raise ValueError(f"{filename} row run_id does not match runtime run_id")
  202. record = _record_for_jsonl(filename, row)
  203. if filename == "pattern_recall_evidence.jsonl":
  204. self._upsert(
  205. table,
  206. record,
  207. key_columns=("run_id", "policy_run_id", "recall_evidence_id"),
  208. )
  209. else:
  210. self._insert(table, record)
  211. return self.run_dir(run_id) / filename
  212. def read_json(self, run_id: str, filename: str) -> dict[str, Any]:
  213. table = _table_for_runtime_file(filename)
  214. payload_column = JSON_FILE_PAYLOAD_COLUMNS.get(filename)
  215. if not payload_column:
  216. raise ValueError(f"{filename} is not a JSON runtime file")
  217. row = self._fetch_one(
  218. f"SELECT `{payload_column}` FROM `{table}` WHERE `run_id` = %s ORDER BY `id` DESC LIMIT 1",
  219. (run_id,),
  220. )
  221. if not row:
  222. raise FileNotFoundError(f"{filename} not found for run: {run_id}")
  223. payload = _decode_json_payload(row[payload_column]) or {}
  224. if payload_column == "raw_payload":
  225. return _runtime_payload(payload)
  226. return payload
  227. def read_jsonl(self, run_id: str, filename: str) -> list[dict[str, Any]]:
  228. table = _table_for_runtime_file(filename)
  229. rows = self._fetch_all(
  230. f"SELECT `raw_payload` FROM `{table}` WHERE `run_id` = %s ORDER BY `id`",
  231. (run_id,),
  232. )
  233. return [_runtime_payload(_decode_json_payload(row["raw_payload"]) or {}) for row in rows]
  234. def file_status(self, run_id: str) -> dict[str, bool]:
  235. return {
  236. filename: self._runtime_file_exists(run_id, filename)
  237. for filename in RUNTIME_FILE_TABLES
  238. }
  239. def create_run_record(self, record: dict[str, Any]) -> None:
  240. self._insert("content_agent_runs", _run_record(record))
  241. def update_run_record(self, run_id: str, updates: dict[str, Any]) -> None:
  242. if not updates:
  243. return
  244. record = _sanitize_record("content_agent_runs", updates)
  245. if not record:
  246. return
  247. assignments = ", ".join(f"`{column}` = %s" for column in record)
  248. params = [
  249. _db_value("content_agent_runs", column, value)
  250. for column, value in record.items()
  251. ]
  252. params.append(run_id)
  253. with self._connection_factory() as conn:
  254. with conn.cursor() as cur:
  255. cur.execute(
  256. f"UPDATE `content_agent_runs` SET {assignments} WHERE `run_id` = %s",
  257. params,
  258. )
  259. conn.commit()
  260. def record_policy_run(self, record: dict[str, Any]) -> None:
  261. self._insert("content_agent_policy_runs", _policy_run_record(record))
  262. def append_run_event_records(
  263. self,
  264. run_id: str,
  265. policy_run_id: str,
  266. rows: list[dict[str, Any]],
  267. ) -> None:
  268. prepared_rows = [
  269. {**row, "run_id": run_id, "policy_run_id": row.get("policy_run_id", policy_run_id)}
  270. for row in rows
  271. ]
  272. self.append_jsonl(run_id, "run_events.jsonl", prepared_rows)
  273. def write_publish_jobs(
  274. self,
  275. run_id: str,
  276. policy_run_id: str,
  277. rows: list[dict[str, Any]],
  278. ) -> None:
  279. for row in rows:
  280. record = {
  281. **row,
  282. "run_id": run_id,
  283. "policy_run_id": row.get("policy_run_id", policy_run_id),
  284. }
  285. self._upsert(
  286. "content_agent_publish_jobs",
  287. _with_db_schema(record),
  288. key_columns=("run_id", "policy_run_id", "publish_job_id"),
  289. )
  290. def write_author_assets(self, rows: list[dict[str, Any]]) -> None:
  291. for row in rows:
  292. self._upsert(
  293. "content_agent_author_assets",
  294. _with_db_schema(row),
  295. key_columns=("author_asset_id",),
  296. )
  297. def write_author_asset_roles(self, rows: list[dict[str, Any]]) -> None:
  298. for row in rows:
  299. self._upsert(
  300. "content_agent_author_asset_roles",
  301. _with_db_schema(row),
  302. key_columns=("author_asset_id", "role"),
  303. )
  304. def _insert(self, table: str, record: dict[str, Any]) -> None:
  305. sanitized = _sanitize_record(table, record)
  306. columns = list(sanitized)
  307. placeholders = ", ".join(["%s"] * len(columns))
  308. column_sql = ", ".join(f"`{column}`" for column in columns)
  309. values = [
  310. _db_value(table, column, sanitized[column])
  311. for column in columns
  312. ]
  313. with self._connection_factory() as conn:
  314. with conn.cursor() as cur:
  315. cur.execute(
  316. f"INSERT INTO `{table}` ({column_sql}) VALUES ({placeholders})",
  317. values,
  318. )
  319. conn.commit()
  320. def _upsert(
  321. self,
  322. table: str,
  323. record: dict[str, Any],
  324. key_columns: tuple[str, ...],
  325. ) -> None:
  326. sanitized = _sanitize_record(table, record)
  327. columns = list(sanitized)
  328. placeholders = ", ".join(["%s"] * len(columns))
  329. column_sql = ", ".join(f"`{column}`" for column in columns)
  330. update_columns = [column for column in columns if column not in key_columns]
  331. assignments = ", ".join(f"`{column}` = VALUES(`{column}`)" for column in update_columns)
  332. values = [
  333. _db_value(table, column, sanitized[column])
  334. for column in columns
  335. ]
  336. sql = f"INSERT INTO `{table}` ({column_sql}) VALUES ({placeholders})"
  337. if assignments:
  338. sql += f" ON DUPLICATE KEY UPDATE {assignments}"
  339. with self._connection_factory() as conn:
  340. with conn.cursor() as cur:
  341. cur.execute(sql, values)
  342. conn.commit()
  343. def _fetch_one(self, sql: str, params: tuple[Any, ...]) -> dict[str, Any] | None:
  344. with self._connection_factory() as conn:
  345. with conn.cursor() as cur:
  346. cur.execute(sql, params)
  347. return cur.fetchone()
  348. def _fetch_all(self, sql: str, params: tuple[Any, ...]) -> list[dict[str, Any]]:
  349. with self._connection_factory() as conn:
  350. with conn.cursor() as cur:
  351. cur.execute(sql, params)
  352. return list(cur.fetchall())
  353. def _run_has_runtime_records(self, run_id: str) -> bool:
  354. tables = [
  355. "content_agent_runs",
  356. "content_agent_policy_runs",
  357. *RUNTIME_FILE_TABLES.values(),
  358. ]
  359. for table in tables:
  360. row = self._fetch_one(
  361. f"SELECT COUNT(*) AS cnt FROM `{table}` WHERE `run_id` = %s",
  362. (run_id,),
  363. )
  364. if row and row.get("cnt", 0):
  365. return True
  366. return False
  367. def _runtime_file_exists(self, run_id: str, filename: str) -> bool:
  368. table = _table_for_runtime_file(filename)
  369. row = self._fetch_one(
  370. f"SELECT COUNT(*) AS cnt FROM `{table}` WHERE `run_id` = %s",
  371. (run_id,),
  372. )
  373. return bool(row and row.get("cnt", 0))
  374. def _record_for_json(filename: str, data: dict[str, Any]) -> tuple[str, dict[str, Any]]:
  375. table = _table_for_runtime_file(filename)
  376. if filename == "source_context.json":
  377. evidence_pack = data.get("ext_data", {}).get("evidence_pack") or {}
  378. return table, {
  379. "schema_version": DB_SCHEMA_VERSION,
  380. "run_id": data["run_id"],
  381. "demand_content_id": _int_or_none(data.get("demand_content_id")),
  382. "pattern_source_system": evidence_pack.get("pattern_source_system"),
  383. "source_kind": evidence_pack.get("source_kind"),
  384. "source_post_id": evidence_pack.get("source_post_id"),
  385. "pattern_execution_id": evidence_pack.get("pattern_execution_id"),
  386. "mining_config_id": evidence_pack.get("mining_config_id"),
  387. "evidence_pack": evidence_pack,
  388. "source_context": data,
  389. "raw_demand_content": data.get("raw_demand_content"),
  390. }
  391. if filename == "pattern_seed_pack.json":
  392. return table, {
  393. "schema_version": DB_SCHEMA_VERSION,
  394. "run_id": data["run_id"],
  395. "policy_run_id": data["policy_run_id"],
  396. "source_post_id": data.get("source_post_id"),
  397. "pattern_execution_id": data.get("pattern_execution_id"),
  398. "itemset_ids": _itemset_ids_from_seed_pack(data),
  399. "seed_terms": data.get("seed_terms"),
  400. "pattern_seed_pack": data,
  401. }
  402. if filename == "final_output.json":
  403. return table, {
  404. "schema_version": DB_SCHEMA_VERSION,
  405. "run_id": data["run_id"],
  406. "policy_run_id": data["policy_run_id"],
  407. "output_version": data.get("output_version", "v1"),
  408. "summary": data.get("summary"),
  409. "final_output": data,
  410. "validation_status": data.get("validation_status"),
  411. }
  412. if filename == "strategy_review.json":
  413. return table, {
  414. "schema_version": DB_SCHEMA_VERSION,
  415. "run_id": data["run_id"],
  416. "policy_run_id": data["policy_run_id"],
  417. "review_id": data["review_id"],
  418. "review_status": data.get("review_status", "generated"),
  419. "summary": data.get("summary"),
  420. "effective_search_queries": data.get("effective_search_queries"),
  421. "weak_search_queries": data.get("weak_search_queries"),
  422. "top_reject_reasons": data.get("top_reject_reasons"),
  423. "productive_paths": data.get("productive_paths"),
  424. "suggestions": data.get("suggestions"),
  425. "raw_payload": data.get("raw_payload", data),
  426. }
  427. raise ValueError(f"unsupported JSON runtime file: {filename}")
  428. def _record_for_jsonl(filename: str, row: dict[str, Any]) -> dict[str, Any]:
  429. if filename == "search_queries.jsonl":
  430. return _with_db_schema(row)
  431. if filename == "discovered_content_items.jsonl":
  432. return _with_db_schema(row)
  433. if filename == "content_media_records.jsonl":
  434. return _with_db_schema(row)
  435. if filename == "pattern_recall_evidence.jsonl":
  436. return _with_db_schema(row)
  437. if filename == "rule_decisions.jsonl":
  438. return _with_db_schema(row)
  439. if filename == "walk_actions.jsonl":
  440. return _with_db_schema(row)
  441. if filename == "run_events.jsonl":
  442. return _with_db_schema(row)
  443. if filename == "source_path_records.jsonl":
  444. return _with_db_schema(row)
  445. if filename == "search_clues.jsonl":
  446. return _with_db_schema(row)
  447. raise ValueError(f"unsupported JSONL runtime file: {filename}")
  448. def _run_record(record: dict[str, Any]) -> dict[str, Any]:
  449. return _with_db_schema(record)
  450. def _policy_run_record(record: dict[str, Any]) -> dict[str, Any]:
  451. return _with_db_schema(record)
  452. def _with_db_schema(record: dict[str, Any]) -> dict[str, Any]:
  453. return {**record, "schema_version": DB_SCHEMA_VERSION}
  454. def _sanitize_record(table: str, record: dict[str, Any]) -> dict[str, Any]:
  455. result = {
  456. column: value
  457. for column, value in record.items()
  458. if column in _allowed_columns(table) and value is not None
  459. }
  460. raw_payload = result.get("raw_payload")
  461. if isinstance(raw_payload, dict):
  462. _assert_no_forbidden_raw_payload_keys(raw_payload)
  463. return result
  464. def _allowed_columns(table: str) -> set[str]:
  465. return TABLE_COLUMNS[table]
  466. TABLE_COLUMNS = {
  467. "content_agent_source_contexts": {
  468. "schema_version",
  469. "run_id",
  470. "demand_content_id",
  471. "pattern_source_system",
  472. "source_kind",
  473. "source_post_id",
  474. "pattern_execution_id",
  475. "mining_config_id",
  476. "evidence_pack",
  477. "source_context",
  478. "raw_demand_content",
  479. "created_at",
  480. },
  481. "content_agent_pattern_seed_packs": {
  482. "schema_version",
  483. "run_id",
  484. "policy_run_id",
  485. "source_post_id",
  486. "pattern_execution_id",
  487. "itemset_ids",
  488. "seed_terms",
  489. "pattern_seed_pack",
  490. "created_at",
  491. },
  492. "content_agent_queries": {
  493. "schema_version",
  494. "run_id",
  495. "policy_run_id",
  496. "search_query_id",
  497. "search_query",
  498. "search_query_generation_method",
  499. "discovery_start_source",
  500. "previous_discovery_step",
  501. "search_query_effect_status",
  502. "pattern_seed_ref",
  503. "raw_payload",
  504. "created_at",
  505. },
  506. "content_agent_discovered_content_items": {
  507. "schema_version",
  508. "run_id",
  509. "policy_run_id",
  510. "content_discovery_id",
  511. "search_query_id",
  512. "platform",
  513. "platform_content_id",
  514. "platform_content_format",
  515. "platform_content_url",
  516. "description",
  517. "platform_author_id",
  518. "author_display_name",
  519. "discovery_start_source",
  520. "previous_discovery_step",
  521. "statistics",
  522. "tags",
  523. "text_extra",
  524. "source_evidence",
  525. "pattern_match_result",
  526. "content_audience_profile",
  527. "platform_raw_payload",
  528. "raw_payload",
  529. "created_at",
  530. },
  531. "content_agent_content_media_records": {
  532. "schema_version",
  533. "run_id",
  534. "policy_run_id",
  535. "platform",
  536. "platform_content_id",
  537. "content_media_status",
  538. "content_metadata_source",
  539. "play_url",
  540. "local_path",
  541. "oss_url",
  542. "raw_payload",
  543. "created_at",
  544. },
  545. "content_agent_pattern_recall_evidence": {
  546. "schema_version",
  547. "run_id",
  548. "policy_run_id",
  549. "recall_evidence_id",
  550. "content_discovery_id",
  551. "platform_content_id",
  552. "decode_status",
  553. "decode_task_id",
  554. "recall_status",
  555. "matched_terms",
  556. "matched_category_paths",
  557. "decode_elements",
  558. "match_paths_request",
  559. "match_paths_response",
  560. "evidence_summary",
  561. "raw_payload",
  562. "created_at",
  563. },
  564. "content_agent_rule_decisions": {
  565. "schema_version",
  566. "run_id",
  567. "policy_run_id",
  568. "decision_id",
  569. "policy_bundle_id",
  570. "rule_pack_id",
  571. "rule_pack_version",
  572. "strategy_version",
  573. "decision_target_type",
  574. "decision_target_id",
  575. "decision_action",
  576. "decision_reason_code",
  577. "search_query_effect_status",
  578. "score",
  579. "age_50_plus_level",
  580. "triggered_blocking_rules",
  581. "scorecard",
  582. "source_evidence",
  583. "decision_replay_data",
  584. "raw_payload",
  585. "created_at",
  586. },
  587. "content_agent_walk_actions": {
  588. "schema_version",
  589. "run_id",
  590. "policy_run_id",
  591. "walk_action_id",
  592. "edge_id",
  593. "edge_type",
  594. "from_node_type",
  595. "from_node_id",
  596. "to_node_type",
  597. "to_node_id",
  598. "walk_action",
  599. "walk_status",
  600. "budget_tier",
  601. "depth",
  602. "page_cursor",
  603. "next_cursor",
  604. "decision_id",
  605. "rule_pack_id",
  606. "rule_pack_version",
  607. "reason_code",
  608. "source_path_record_id",
  609. "raw_payload",
  610. "created_at",
  611. "updated_at",
  612. },
  613. "content_agent_source_path_records": {
  614. "schema_version",
  615. "run_id",
  616. "policy_run_id",
  617. "source_path_record_id",
  618. "source_path_type",
  619. "from_node_type",
  620. "from_node_id",
  621. "to_node_type",
  622. "to_node_id",
  623. "decision_id",
  624. "rule_pack_id",
  625. "discovery_start_source",
  626. "previous_discovery_step",
  627. "origin_path_id",
  628. "source_evidence_ref",
  629. "raw_payload",
  630. "created_at",
  631. },
  632. "content_agent_search_clues": {
  633. "schema_version",
  634. "run_id",
  635. "policy_run_id",
  636. "clue_id",
  637. "search_query_id",
  638. "search_query",
  639. "discovery_start_source",
  640. "previous_discovery_step",
  641. "result_count",
  642. "pooled_content_count",
  643. "review_content_count",
  644. "pending_content_count",
  645. "rejected_content_count",
  646. "search_query_effect_status",
  647. "walk_next_step",
  648. "raw_payload",
  649. "created_at",
  650. },
  651. "content_agent_run_events": {
  652. "schema_version",
  653. "run_id",
  654. "policy_run_id",
  655. "event_id",
  656. "event_type",
  657. "status",
  658. "input_ref",
  659. "output_ref",
  660. "error_code",
  661. "message",
  662. "raw_payload",
  663. "created_at",
  664. },
  665. "content_agent_final_outputs": {
  666. "schema_version",
  667. "run_id",
  668. "policy_run_id",
  669. "output_version",
  670. "summary",
  671. "final_output",
  672. "validation_status",
  673. "created_at",
  674. "updated_at",
  675. },
  676. "content_agent_publish_jobs": {
  677. "schema_version",
  678. "run_id",
  679. "policy_run_id",
  680. "publish_job_id",
  681. "platform_content_id",
  682. "job_status",
  683. "trigger_mode",
  684. "crawler_plan_id",
  685. "produce_plan_id",
  686. "publish_plan_id",
  687. "request_payload",
  688. "response_payload",
  689. "error_code",
  690. "error_message",
  691. "created_at",
  692. "updated_at",
  693. },
  694. "content_agent_author_assets": {
  695. "schema_version",
  696. "author_asset_id",
  697. "platform",
  698. "platform_author_id",
  699. "author_display_name",
  700. "author_profile_url",
  701. "asset_status",
  702. "source_type",
  703. "validation_status",
  704. "eligible_as_source",
  705. "elderly_ratio",
  706. "elderly_tgi",
  707. "content_tags",
  708. "source_run_id",
  709. "source_policy_run_id",
  710. "last_profile_fetch_at",
  711. "last_works_fetch_at",
  712. "last_validated_at",
  713. "profile_snapshot",
  714. "evidence_refs",
  715. "raw_payload",
  716. "created_at",
  717. "updated_at",
  718. },
  719. "content_agent_author_asset_roles": {
  720. "schema_version",
  721. "author_asset_id",
  722. "role",
  723. "role_status",
  724. "role_reason_code",
  725. "assigned_by",
  726. "source_run_id",
  727. "raw_payload",
  728. "created_at",
  729. "updated_at",
  730. },
  731. "content_agent_strategy_reviews": {
  732. "schema_version",
  733. "run_id",
  734. "policy_run_id",
  735. "review_id",
  736. "review_status",
  737. "summary",
  738. "effective_search_queries",
  739. "weak_search_queries",
  740. "top_reject_reasons",
  741. "productive_paths",
  742. "suggestions",
  743. "raw_payload",
  744. "created_at",
  745. },
  746. "content_agent_runs": {
  747. "schema_version",
  748. "run_id",
  749. "demand_content_id",
  750. "run_label",
  751. "platform",
  752. "platform_mode",
  753. "strategy_version",
  754. "status",
  755. "current_step",
  756. "validation_status",
  757. "source_ref",
  758. "error_code",
  759. "error_message",
  760. "error_detail",
  761. "started_at",
  762. "completed_at",
  763. "created_at",
  764. "updated_at",
  765. },
  766. "content_agent_policy_runs": {
  767. "schema_version",
  768. "run_id",
  769. "policy_run_id",
  770. "experiment_name",
  771. "run_role",
  772. "policy_bundle_id",
  773. "rule_pack_id",
  774. "strategy_id",
  775. "strategy_version",
  776. "rule_pack_version",
  777. "walk_strategy_version",
  778. "policy_bundle_hash",
  779. "strategy_source_ref",
  780. "rule_pack_source_ref",
  781. "evidence_bundle_schema_version",
  782. "runtime_record_schema_version",
  783. "status",
  784. "metrics",
  785. "decision_summary",
  786. "raw_payload",
  787. "created_at",
  788. },
  789. }
  790. def _table_for_runtime_file(filename: str) -> str:
  791. try:
  792. return RUNTIME_FILE_TABLES[filename]
  793. except KeyError as exc:
  794. raise ValueError(f"unsupported runtime file: {filename}") from exc
  795. def _db_value(table: str, column: str, value: Any) -> Any:
  796. if column in JSON_COLUMNS_BY_TABLE.get(table, set()):
  797. return _json_dump(value)
  798. if column in DATETIME_COLUMNS:
  799. return _datetime_value(value)
  800. return value
  801. def _json_dump(value: Any) -> str | None:
  802. if value is None:
  803. return None
  804. return json.dumps(value, ensure_ascii=False, separators=(",", ":"))
  805. def _decode_json_payload(value: Any) -> Any:
  806. if value is None:
  807. return None
  808. if isinstance(value, (dict, list)):
  809. return value
  810. return json.loads(value)
  811. def _runtime_payload(payload: dict[str, Any]) -> dict[str, Any]:
  812. if "raw_payload" in payload:
  813. return payload
  814. return {**payload, "raw_payload": dict(payload)}
  815. def _datetime_value(value: Any) -> Any:
  816. if value is None or isinstance(value, datetime):
  817. return value
  818. if isinstance(value, str):
  819. try:
  820. parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
  821. except ValueError:
  822. return value
  823. if parsed.tzinfo:
  824. parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
  825. return parsed
  826. return value
  827. def _itemset_ids_from_seed_pack(data: dict[str, Any]) -> list[Any]:
  828. if data.get("itemset_ids"):
  829. return list(data["itemset_ids"])
  830. itemsets = data.get("itemsets") or []
  831. ids = [
  832. itemset.get("itemset_id")
  833. for itemset in itemsets
  834. if isinstance(itemset, dict) and itemset.get("itemset_id") is not None
  835. ]
  836. return list(dict.fromkeys(ids))
  837. def _int_or_none(value: Any) -> int | None:
  838. if value in (None, ""):
  839. return None
  840. return int(value)
  841. def _assert_no_forbidden_raw_payload_keys(payload: Any) -> None:
  842. if isinstance(payload, list):
  843. for item in payload:
  844. _assert_no_forbidden_raw_payload_keys(item)
  845. return
  846. if not isinstance(payload, dict):
  847. return
  848. for key, value in payload.items():
  849. lowered = str(key).lower()
  850. if lowered in FORBIDDEN_RAW_PAYLOAD_KEYS:
  851. raise ValueError(f"raw_payload contains forbidden key: {key}")
  852. _assert_no_forbidden_raw_payload_keys(value)
  853. def _load_env_file(path_value: str | Path | None) -> dict[str, str]:
  854. if not path_value:
  855. return {}
  856. path = Path(path_value)
  857. if not path.exists():
  858. return {}
  859. result: dict[str, str] = {}
  860. for line in path.read_text(encoding="utf-8").splitlines():
  861. stripped = line.strip()
  862. if not stripped or stripped.startswith("#") or "=" not in stripped:
  863. continue
  864. key, value = stripped.split("=", 1)
  865. result[key.strip()] = value.strip().strip('"').strip("'")
  866. return result