浏览代码

fix(v3): 治理体检高/中优代码债安全集(5 项,行为零变化)

- ffmpeg 压缩加 120s 超时(坏视频不再卡死判定并发线程,超时按 failed 降级)
- DB append_jsonl 整批一个连接一次 commit(原每行新建连接+commit,N 次网络往返;SQL/顺序不变,抽 _row_sql 复用)
- _fail 三份合一进 content_agent/findings.py(validation/walk_strategy_json/walk_graph_json 改 import,调用点零改)
- 判定失败 dict 三处构造统一复用 gemini_video._fail(recall 配额截断+_safe_analyze)
- config_store.load_json 加 mtime+size 缓存(一个 run 读 3+ 次 walk_policy 不再重复读盘/parse;配置全仓只读消费)

基线 326 passed 全绿、config gate pass、real_id45 指纹不变。

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
Sam Lee 1 天之前
父节点
当前提交
6cf025c50d

+ 3 - 14
content_agent/business_modules/content_discovery/pattern_recall/recall_decision.py

@@ -14,6 +14,7 @@ from datetime import datetime, timezone
 from typing import Any
 
 from content_agent.constants import RUNTIME_RECORD_SCHEMA_VERSION
+from content_agent.integrations.gemini_video import _fail
 from content_agent.integrations.walk_graph_json import WalkGraphStore
 from content_agent.interfaces import GeminiVideoClient, RuntimeFileStore
 
@@ -81,13 +82,7 @@ def _collect_judgments(
         future_to_offset = {}
         for offset, item in enumerate(discovered_content_items):
             if offset >= submitted:
-                judgments[offset] = {
-                    "fit_senior_50plus": False,
-                    "fit_confidence": 0.0,
-                    "relevance_score": 0.0,
-                    "reason": "gemini_quota_exhausted",
-                    "status": "failed",
-                }
+                judgments[offset] = _fail("gemini_quota_exhausted")
                 continue
             future = pool.submit(
                 _safe_analyze,
@@ -113,13 +108,7 @@ def _safe_analyze(
     try:
         return client.analyze(item, media, source_context)
     except Exception as exc:
-        return {
-            "fit_senior_50plus": False,
-            "fit_confidence": 0.0,
-            "relevance_score": 0.0,
-            "reason": f"analyze_raised: {type(exc).__name__}",
-            "status": "failed",
-        }
+        return _fail(f"analyze_raised: {type(exc).__name__}")
 
 
 def _resolve_max_workers() -> int:

+ 1 - 2
content_agent/business_modules/run_record/validation.py

@@ -5,6 +5,7 @@ from collections import Counter
 from typing import Any
 
 from content_agent.constants import RUNTIME_RECORD_SCHEMA_VERSION, RUNTIME_SCHEMA_VERSION
+from content_agent.findings import fail as _fail
 from content_agent.integrations.runtime_files import RUNTIME_FILENAMES
 from content_agent.interfaces import RuntimeFileStore
 
@@ -852,5 +853,3 @@ def _result(run_id: str, findings: list[dict[str, Any]]) -> dict[str, Any]:
     }
 
 
-def _fail(findings: list[dict[str, Any]], check_id: str, message: str) -> None:
-    findings.append({"level": "fail", "check_id": check_id, "message": message})

+ 9 - 0
content_agent/findings.py

@@ -0,0 +1,9 @@
+"""校验 findings 的共享构造(原 validation / walk_strategy_json / walk_graph_json 各抄一份)。"""
+
+from __future__ import annotations
+
+from typing import Any
+
+
+def fail(findings: list[dict[str, Any]], check_id: str, message: str) -> None:
+    findings.append({"level": "fail", "check_id": check_id, "message": message})

+ 16 - 2
content_agent/integrations/config_store.py

@@ -19,10 +19,24 @@ def read_text(path: Path) -> str:
     return Path(path).read_text(encoding="utf-8")
 
 
+# mtime+size 键缓存:同一 run 内 walk_policy 等配置会被读 3+ 次(walk_engine + 每次 recall),
+# 文件未变时不重复读盘/parse。调用方约定不就地修改返回的 parsed 对象(全仓配置均只读消费)。
+_CACHE: dict[str, tuple[tuple[int, int], Any, str]] = {}
+
+
 def load_json(path: Path) -> tuple[Any, str]:
     """Return (parsed, raw_text). raw_text is the exact on-disk text for hashing."""
-    raw = read_text(path)
-    return json.loads(raw), raw
+    resolved = Path(path)
+    stat = resolved.stat()
+    key = str(resolved)
+    stamp = (stat.st_mtime_ns, stat.st_size)
+    cached = _CACHE.get(key)
+    if cached and cached[0] == stamp:
+        return cached[1], cached[2]
+    raw = read_text(resolved)
+    parsed = json.loads(raw)
+    _CACHE[key] = (stamp, parsed, raw)
+    return parsed, raw
 
 
 def sha256_text(text: str) -> str:

+ 29 - 26
content_agent/integrations/database_runtime.py

@@ -232,18 +232,21 @@ class DatabaseRuntimeStore:
 
     def append_jsonl(self, run_id: str, filename: str, rows: list[dict[str, Any]]) -> Path:
         table = _table_for_runtime_file(filename)
+        # 整批共用一个连接、一次 commit:避免每行新建连接+commit 的 N 次网络往返。
+        statements: list[tuple[str, list[Any]]] = []
         for row in rows:
             if row.get("run_id") != run_id:
                 raise ValueError(f"{filename} row run_id does not match runtime run_id")
             record = _record_for_jsonl(filename, row)
-            if filename in JSONL_UPSERT_KEYS:
-                self._upsert(
-                    table,
-                    record,
-                    key_columns=JSONL_UPSERT_KEYS[filename],
-                )
-            else:
-                self._insert(table, record)
+            statements.append(
+                self._row_sql(table, record, key_columns=JSONL_UPSERT_KEYS.get(filename))
+            )
+        if statements:
+            with self._connection_factory() as conn:
+                with conn.cursor() as cur:
+                    for sql, values in statements:
+                        cur.execute(sql, values)
+                conn.commit()
         return self.run_dir(run_id) / filename
 
     def read_json(self, run_id: str, filename: str) -> dict[str, Any]:
@@ -376,7 +379,12 @@ class DatabaseRuntimeStore:
                 key_columns=("run_id", "policy_run_id", "clue_id"),
             )
 
-    def _insert(self, table: str, record: dict[str, Any]) -> None:
+    def _row_sql(
+        self,
+        table: str,
+        record: dict[str, Any],
+        key_columns: tuple[str, ...] | None = None,
+    ) -> tuple[str, list[Any]]:
         sanitized = _sanitize_record(table, record)
         columns = list(sanitized)
         placeholders = ", ".join(["%s"] * len(columns))
@@ -385,12 +393,19 @@ class DatabaseRuntimeStore:
             _db_value(table, column, sanitized[column])
             for column in columns
         ]
+        sql = f"INSERT INTO `{table}` ({column_sql}) VALUES ({placeholders})"
+        if key_columns:
+            update_columns = [column for column in columns if column not in key_columns]
+            assignments = ", ".join(f"`{column}` = VALUES(`{column}`)" for column in update_columns)
+            if assignments:
+                sql += f" ON DUPLICATE KEY UPDATE {assignments}"
+        return sql, values
+
+    def _insert(self, table: str, record: dict[str, Any]) -> None:
+        sql, values = self._row_sql(table, record)
         with self._connection_factory() as conn:
             with conn.cursor() as cur:
-                cur.execute(
-                    f"INSERT INTO `{table}` ({column_sql}) VALUES ({placeholders})",
-                    values,
-                )
+                cur.execute(sql, values)
             conn.commit()
 
     def _upsert(
@@ -399,19 +414,7 @@ class DatabaseRuntimeStore:
         record: dict[str, Any],
         key_columns: tuple[str, ...],
     ) -> None:
-        sanitized = _sanitize_record(table, record)
-        columns = list(sanitized)
-        placeholders = ", ".join(["%s"] * len(columns))
-        column_sql = ", ".join(f"`{column}`" for column in columns)
-        update_columns = [column for column in columns if column not in key_columns]
-        assignments = ", ".join(f"`{column}` = VALUES(`{column}`)" for column in update_columns)
-        values = [
-            _db_value(table, column, sanitized[column])
-            for column in columns
-        ]
-        sql = f"INSERT INTO `{table}` ({column_sql}) VALUES ({placeholders})"
-        if assignments:
-            sql += f" ON DUPLICATE KEY UPDATE {assignments}"
+        sql, values = self._row_sql(table, record, key_columns=key_columns)
         with self._connection_factory() as conn:
             with conn.cursor() as cur:
                 cur.execute(sql, values)

+ 13 - 7
content_agent/integrations/video_fetch.py

@@ -28,6 +28,7 @@ _PLATFORM_DOWNLOAD_HEADERS = {
 # 已拍板压缩档:360p / 1fps / 低清,实测 ~4MB(memory/video-multimodal-analysis)。
 _FFMPEG_ARGS = ["-vf", "scale=360:-2,fps=1", "-crf", "33", "-c:a", "aac", "-b:a", "32k", "-ac", "1"]
 MAX_INLINE_BYTES = 30 * 1024 * 1024  # OpenRouter inline base64 平台硬上限
+COMPRESS_TIMEOUT_SECONDS = 120.0  # 实测 64MB/720p 压缩 ~8s,120s 足够余量
 
 
 class VideoFetchError(RuntimeError):
@@ -41,13 +42,18 @@ def _download_headers(platform: str, override: dict[str, str] | None) -> dict[st
 
 
 def _compress(raw: bytes, ffmpeg_exe: str) -> bytes:
-    proc = subprocess.run(
-        [ffmpeg_exe, "-i", "pipe:0", *_FFMPEG_ARGS, "-f", "mp4",
-         "-movflags", "frag_keyframe+empty_moov", "pipe:1"],
-        input=raw,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-    )
+    # 超时保护:坏视频会让 ffmpeg 卡死,进而挂住一个判定并发线程(实测正常压缩 ~8s)。
+    try:
+        proc = subprocess.run(
+            [ffmpeg_exe, "-i", "pipe:0", *_FFMPEG_ARGS, "-f", "mp4",
+             "-movflags", "frag_keyframe+empty_moov", "pipe:1"],
+            input=raw,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=COMPRESS_TIMEOUT_SECONDS,
+        )
+    except subprocess.TimeoutExpired as exc:
+        raise VideoFetchError("ffmpeg compression timeout") from exc
     if proc.returncode != 0 or not proc.stdout:
         raise VideoFetchError("ffmpeg compression failed")
     return proc.stdout

+ 2 - 2
content_agent/integrations/walk_graph_json.py

@@ -10,6 +10,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
+from content_agent.findings import fail as _fail
+
 WALK_GRAPH_PATH = Path("tech_documents/数据接口与来源/walk_graph.json")
 WALK_POLICY_PATH = Path("tech_documents/数据接口与来源/walk_policy.json")
 PROFILE_DIR = Path("tech_documents/数据接口与来源/platform_profiles")
@@ -127,8 +129,6 @@ def _validate_profile(profile: dict[str, Any], edge_ids: set[str]) -> list[dict[
     return findings
 
 
-def _fail(findings: list[dict[str, Any]], check_id: str, message: str) -> None:
-    findings.append({"level": "fail", "check_id": check_id, "message": message})
 
 
 def _raise_on_fail(findings: list[dict[str, Any]], label: str) -> None:

+ 2 - 4
content_agent/integrations/walk_strategy_json.py

@@ -5,6 +5,8 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 
+from content_agent.findings import fail as _fail
+
 
 WALK_STRATEGY_PATH = Path("product_documents/抖音游走策略/douyin_walk_strategy.v1.json")
 RULE_PACK_PATH = Path("product_documents/规则包/douyin_rule_packs.v1.json")
@@ -199,7 +201,3 @@ def _check_rule_pack_bindings(
 
 def _ids(rows: list[dict[str, Any]], field: str) -> set[str]:
     return {str(row[field]) for row in rows if row.get(field)}
-
-
-def _fail(findings: list[dict[str, Any]], check_id: str, message: str) -> None:
-    findings.append({"level": "fail", "check_id": check_id, "message": message})