|
|
@@ -1,18 +1,20 @@
|
|
|
-"""内容判定(V3-M2C):Gemini 直读视频,产出判定字段写进 pattern_match_result。
|
|
|
+"""内容判定(V3-M2C,M5 批并发):Gemini 直读视频,产出判定字段写进 pattern_match_result。
|
|
|
|
|
|
替换原 decode 异步解构 + 分类树匹配。每条内容调一次 gemini_video_client.analyze,
|
|
|
把 4 个判定字段(fit_senior_50plus / fit_confidence / relevance_score / reason)写进
|
|
|
discovered item 的 pattern_match_result,并镜像 fit_senior_50plus 进 content_audience_profile。
|
|
|
-pattern_recall / category_or_element_binding 设为 "matched" 桥接键,仅为 M2→M3 过渡
|
|
|
-(让未重写的旧 hard_gate 不误拒);M3 删除旧门槛后移除这两个桥接键。
|
|
|
+M5:analyze 纯 IO 且每条独立,用 ThreadPool 并发执行、按 offset 归位回收;
|
|
|
+id 编号、三个 list 的组装与落盘全部留主线程按 offset 串行 → 产物与串行逐条等价。
|
|
|
"""
|
|
|
|
|
|
from __future__ import annotations
|
|
|
|
|
|
+from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
from datetime import datetime, timezone
|
|
|
from typing import Any
|
|
|
|
|
|
from content_agent.constants import RUNTIME_RECORD_SCHEMA_VERSION
|
|
|
+from content_agent.integrations.walk_graph_json import WalkGraphStore
|
|
|
from content_agent.interfaces import GeminiVideoClient, RuntimeFileStore
|
|
|
|
|
|
|
|
|
@@ -31,13 +33,15 @@ def run(
|
|
|
media_by_content_id = {
|
|
|
row["platform_content_id"]: row for row in content_media_records
|
|
|
}
|
|
|
+ judgments = _collect_judgments(
|
|
|
+ discovered_content_items, media_by_content_id, source_context, gemini_video_client
|
|
|
+ )
|
|
|
evidence_rows: list[dict[str, Any]] = []
|
|
|
updated_items: list[dict[str, Any]] = []
|
|
|
updated_bundles: list[dict[str, Any]] = []
|
|
|
for offset, item in enumerate(discovered_content_items):
|
|
|
- media = media_by_content_id.get(item["platform_content_id"], {})
|
|
|
recall_evidence_id = f"recall_{start_index + offset:03d}"
|
|
|
- judgment = gemini_video_client.analyze(item, media, source_context)
|
|
|
+ judgment = judgments[offset]
|
|
|
pattern_match_result = _build_pattern_match_result(judgment, recall_evidence_id)
|
|
|
updated_items.append(_update_discovered_item(item, pattern_match_result))
|
|
|
updated_bundles.append(
|
|
|
@@ -56,6 +60,75 @@ def run(
|
|
|
}
|
|
|
|
|
|
|
|
|
def _collect_judgments(
    discovered_content_items: list[dict[str, Any]],
    media_by_content_id: dict[str, dict[str, Any]],
    source_context: dict[str, Any],
    gemini_video_client: GeminiVideoClient,
) -> list[dict[str, Any]]:
    """Run ``analyze`` concurrently and return one judgment per item, in input order.

    Every result is stored at the offset of the item it belongs to, so the
    returned list does not depend on the order in which workers finish.
    Workers only return a judgment and never touch the shared list.

    The quota cut-off is decided by offset before anything is submitted: the
    first ``submitted`` items are analyzed and each later item receives a
    failed ``gemini_quota_exhausted`` judgment without calling the client.

    Args:
        discovered_content_items: Items to judge; each must carry a
            ``platform_content_id`` key.
        media_by_content_id: Media rows keyed by ``platform_content_id``;
            an item with no entry is analyzed with an empty dict.
        source_context: Passed through unchanged to every ``analyze`` call.
        gemini_video_client: Client whose ``analyze`` is invoked. If it
            exposes ``remaining_quota()`` and ``consume(count)`` they are used
            for quota accounting; a client without them (e.g. a test fake)
            is treated as unlimited and is not charged.

    Returns:
        A list with exactly one judgment dict per input item, aligned by index.
    """
    total = len(discovered_content_items)
    judgments: list[dict[str, Any]] = [None] * total  # type: ignore[list-item]
    if not discovered_content_items:
        return judgments

    remaining = getattr(gemini_video_client, "remaining_quota", lambda: total)()
    # Clamp at zero: an overdrawn (negative) quota must not become a negative
    # consume() count, which could hand quota back to the client.
    submitted = max(0, min(remaining, total))

    pending: list[tuple[int, dict[str, Any]]] = []
    for offset, item in enumerate(discovered_content_items):
        if offset < submitted:
            pending.append((offset, item))
            continue
        # Beyond the quota boundary: record the failure without calling Gemini.
        judgments[offset] = {
            "fit_senior_50plus": False,
            "fit_confidence": 0.0,
            "relevance_score": 0.0,
            "reason": "gemini_quota_exhausted",
            "status": "failed",
        }

    # Only load the worker policy and build a pool when there is work to submit.
    if pending:
        with ThreadPoolExecutor(max_workers=_resolve_max_workers()) as pool:
            future_to_offset = {
                pool.submit(
                    _safe_analyze,
                    gemini_video_client,
                    item,
                    media_by_content_id.get(item["platform_content_id"], {}),
                    source_context,
                ): offset
                for offset, item in pending
            }
            # Completion order is arbitrary; the offset map puts each result
            # back into its original slot.
            for future in as_completed(future_to_offset):
                judgments[future_to_offset[future]] = future.result()

    getattr(gemini_video_client, "consume", lambda count: None)(submitted)
    return judgments
|
|
|
+
|
|
|
+
|
|
|
def _safe_analyze(
    client: GeminiVideoClient,
    item: dict[str, Any],
    media: dict[str, Any],
    source_context: dict[str, Any],
) -> dict[str, Any]:
    """Call ``client.analyze`` and convert any unexpected exception into a failed judgment.

    Per the original note, ``analyze`` is expected to report its own failures
    as a returned judgment rather than by raising. This wrapper is the last
    line of defence so that ``future.result()`` never raises in the main thread.

    Returns:
        Whatever ``analyze`` returns, or a failed judgment whose ``reason``
        names the exception type if ``analyze`` raised.
    """
    try:
        judgment = client.analyze(item, media, source_context)
    except Exception as exc:
        # Only the exception class name is recorded, not its message.
        failure_reason = f"analyze_raised: {type(exc).__name__}"
        fallback: dict[str, Any] = {
            "fit_senior_50plus": False,
            "fit_confidence": 0.0,
            "relevance_score": 0.0,
            "reason": failure_reason,
            "status": "failed",
        }
        return fallback
    else:
        return judgment
|
|
|
+
|
|
|
+
|
|
|
def _resolve_max_workers() -> int:
    """Return the thread-pool size for concurrent ``analyze`` calls.

    Reads ``global.gemini_max_workers`` from the policy loaded by
    ``WalkGraphStore``. If the policy cannot be loaded, or the key is missing
    or not convertible to ``int``, the default of 4 is used (deliberate
    best-effort: a bad policy must not block judging).

    Returns:
        A worker count that is always at least 1.
    """
    default_workers = 4
    try:
        configured = int(WalkGraphStore().load_policy()["global"]["gemini_max_workers"])
    except Exception:
        return default_workers
    # ThreadPoolExecutor raises ValueError for max_workers <= 0, so a zero or
    # negative setting falls back to a single worker instead of crashing.
    return max(1, configured)
|
|
|
+
|
|
|
+
|
|
|
def _build_pattern_match_result(judgment: dict[str, Any], recall_evidence_id: str) -> dict[str, Any]:
|
|
|
return {
|
|
|
"fit_senior_50plus": bool(judgment.get("fit_senior_50plus", False)),
|