|
@@ -11,6 +11,7 @@
|
|
|
"""
|
|
"""
|
|
|
|
|
|
|
|
import json
|
|
import json
|
|
|
|
|
+import logging
|
|
|
import re
|
|
import re
|
|
|
from pathlib import Path
|
|
from pathlib import Path
|
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
@@ -18,6 +19,8 @@ import asyncio
|
|
|
import aiohttp
|
|
import aiohttp
|
|
|
from urllib.parse import urlparse, parse_qs, urlencode
|
|
from urllib.parse import urlparse, parse_qs, urlencode
|
|
|
|
|
|
|
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
|
|
+
|
|
|
|
|
|
|
|
# ── URL → (platform, content_id) 解析 ────────────────────────────────
|
|
# ── URL → (platform, content_id) 解析 ────────────────────────────────
|
|
|
|
|
|
|
@@ -141,6 +144,141 @@ def _merge_transcript_into_body(post: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
return merged
|
|
return merged
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+def _needs_transcribe(platform: str, post: Dict[str, Any]) -> bool:
|
|
|
|
|
+ """是否需要给这条 post 跑 Deepgram 转写。
|
|
|
|
|
+
|
|
|
|
|
+ 语义(用户明确约束):
|
|
|
|
|
+ - `video_transcript` 字段**缺失** → 视为"从未尝试",需要跑
|
|
|
|
|
+ - `video_transcript` 字段存在(即使为空字符串 "")→ 视为"已尝试过",跳过
|
|
|
|
|
+ (Deepgram 对纯音乐/无人声视频会返回空,这是合法的"失败"标记,不重跑)
|
|
|
|
|
+ - 必须有视频源(用 transcription.extract_video_url 统一判断,跨平台)
|
|
|
|
|
+ """
|
|
|
|
|
+ if not isinstance(post, dict):
|
|
|
|
|
+ return False
|
|
|
|
|
+ if "video_transcript" in post:
|
|
|
|
|
+ return False
|
|
|
|
|
+ try:
|
|
|
|
|
+ from agent.tools.builtin.content.transcription import extract_video_url
|
|
|
|
|
+ return bool(extract_video_url(platform, post))
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+async def _transcribe_one_post(
|
|
|
|
|
+ platform: str,
|
|
|
|
|
+ post: Dict[str, Any],
|
|
|
|
|
+ sem: asyncio.Semaphore,
|
|
|
|
|
+) -> Optional[str]:
|
|
|
|
|
+ """对一条 post 跑 transcribe_video_from_post,写回 post["video_transcript"]。
|
|
|
|
|
+
|
|
|
|
|
+ 成功 → post["video_transcript"] = transcript
|
|
|
|
|
+ 失败 → post["video_transcript"] = "" # 明确"已尝试"标记,避免后续 backfill 重跑
|
|
|
|
|
+ 返回 transcript 或 None。
|
|
|
|
|
+ """
|
|
|
|
|
+ from agent.tools.builtin.content.transcription import transcribe_video_from_post
|
|
|
|
|
+ async with sem:
|
|
|
|
|
+ try:
|
|
|
|
|
+ text = await transcribe_video_from_post(platform, post)
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning("transcribe failed (%s): %s", platform, e)
|
|
|
|
|
+ text = None
|
|
|
|
|
+ if text:
|
|
|
|
|
+ post["video_transcript"] = text
|
|
|
|
|
+ return text
|
|
|
|
|
+ # 失败也写一个 "",表示"我们尝试过 Deepgram 但没拿到结果"
|
|
|
|
|
+ post["video_transcript"] = ""
|
|
|
|
|
+ return None
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+async def _transcribe_pending_async(
|
|
|
|
|
+ matched: List[Dict[str, Any]],
|
|
|
|
|
+ concurrency: int = 3,
|
|
|
|
|
+) -> Dict[Tuple[str, str], str]:
|
|
|
|
|
+ """对 matched 里所有"缺 video_transcript 字段 + 有视频源"的 post 并发跑 Deepgram。
|
|
|
|
|
+
|
|
|
|
|
+ 返回成功的 {(platform, channel_content_id): transcript_text} 映射,
|
|
|
|
|
+ 供调用方写回 cache 文件(让其他 trace 命中同一 post 时复用)。
|
|
|
|
|
+ """
|
|
|
|
|
+ targets: List[Tuple[str, Dict[str, Any]]] = []
|
|
|
|
|
+ for src in matched:
|
|
|
|
|
+ platform = src.get("platform")
|
|
|
|
|
+ post = src.get("post")
|
|
|
|
|
+ if not isinstance(post, dict) or not platform:
|
|
|
|
|
+ continue
|
|
|
|
|
+ if _needs_transcribe(platform, post):
|
|
|
|
|
+ targets.append((platform, post))
|
|
|
|
|
+
|
|
|
|
|
+ if not targets:
|
|
|
|
|
+ return {}
|
|
|
|
|
+
|
|
|
|
|
+ logger.info("Auto-transcribe: %d post(s) pending", len(targets))
|
|
|
|
|
+ sem = asyncio.Semaphore(concurrency)
|
|
|
|
|
+ results = await asyncio.gather(
|
|
|
|
|
+ *[_transcribe_one_post(p, post, sem) for p, post in targets]
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ updates: Dict[Tuple[str, str], str] = {}
|
|
|
|
|
+ success_n = 0
|
|
|
|
|
+ for (platform, post), text in zip(targets, results):
|
|
|
|
|
+ if text:
|
|
|
|
|
+ success_n += 1
|
|
|
|
|
+ cid = post.get("channel_content_id") or post.get("video_id")
|
|
|
|
|
+ if cid:
|
|
|
|
|
+ updates[(platform, str(cid))] = text
|
|
|
|
|
+ logger.info("Auto-transcribe: %d/%d success", success_n, len(targets))
|
|
|
|
|
+ return updates
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _writeback_transcript_to_cache(
|
|
|
|
|
+ cache_dir: Path,
|
|
|
|
|
+ updates: Dict[Tuple[str, str], str],
|
|
|
|
|
+) -> int:
|
|
|
|
|
+ """把新拿到的 transcript 写回所有 cache 文件里匹配的 post。
|
|
|
|
|
+
|
|
|
|
|
+ 跨 trace 扩散:同一条 video 可能被多个 trace 的搜索引用过,这里一次写回所有
|
|
|
|
|
+ cache 副本,避免下次另一个 trace 跑 extract_sources 时又触发一遍 Deepgram。
|
|
|
|
|
+ 返回 cache 中被更新的 post 总数。
|
|
|
|
|
+ """
|
|
|
|
|
+ if not updates or not cache_dir.exists():
|
|
|
|
|
+ return 0
|
|
|
|
|
+ written = 0
|
|
|
|
|
+ for cf in cache_dir.glob("*.json"):
|
|
|
|
|
+ try:
|
|
|
|
|
+ data = json.loads(cf.read_text(encoding="utf-8"))
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ continue
|
|
|
|
|
+ dirty = False
|
|
|
|
|
+ for key, entry in data.items():
|
|
|
|
|
+ if not key.startswith("search:") or not isinstance(entry, dict):
|
|
|
|
|
+ continue
|
|
|
|
|
+ platform = key.split(":", 1)[1]
|
|
|
|
|
+ post_lists = []
|
|
|
|
|
+ for h in entry.get("history", []) or []:
|
|
|
|
|
+ post_lists.append(h.get("posts", []))
|
|
|
|
|
+ if "posts" in entry and isinstance(entry["posts"], list):
|
|
|
|
|
+ post_lists.append(entry["posts"])
|
|
|
|
|
+ for posts in post_lists:
|
|
|
|
|
+ for post in posts or []:
|
|
|
|
|
+ if not isinstance(post, dict):
|
|
|
|
|
+ continue
|
|
|
|
|
+ cid = post.get("channel_content_id") or post.get("video_id")
|
|
|
|
|
+ if not cid:
|
|
|
|
|
+ continue
|
|
|
|
|
+ text = updates.get((platform, str(cid)))
|
|
|
|
|
+ # 注意:用 "video_transcript" not in post 判断,跟 _needs_transcribe 语义一致
|
|
|
|
|
+ # 已经有字段(即使为空)→ 不覆盖,尊重之前的"已尝试"状态
|
|
|
|
|
+ if text and "video_transcript" not in post:
|
|
|
|
|
+ post["video_transcript"] = text
|
|
|
|
|
+ dirty = True
|
|
|
|
|
+ written += 1
|
|
|
|
|
+ if dirty:
|
|
|
|
|
+ cf.write_text(
|
|
|
|
|
+ json.dumps(data, ensure_ascii=False, indent=2),
|
|
|
|
|
+ encoding="utf-8",
|
|
|
|
|
+ )
|
|
|
|
|
+ return written
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def _is_before_cutoff(source: Dict[str, Any], cutoff_ts: int) -> bool:
|
|
def _is_before_cutoff(source: Dict[str, Any], cutoff_ts: int) -> bool:
|
|
|
"""判断帖子是否早于截止时间戳(秒级)
|
|
"""判断帖子是否早于截止时间戳(秒级)
|
|
|
|
|
|
|
@@ -236,6 +374,27 @@ def _normalize_url(url: str) -> Optional[str]:
|
|
|
return None
|
|
return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+def _normalize_post_in_place(platform: str, post: Dict[str, Any]) -> None:
|
|
|
|
|
+ """对 cache 里读出的 post 做平台相关的字段补齐(in-place)。
|
|
|
|
|
+
|
|
|
|
|
+ 早期 cache 可能在 platform-level normalize 函数加上之前就写入了,此处兜底补救:
|
|
|
|
|
+ YouTube: description_snippet -> body_text / thumbnails -> images / url -> videos / ...
|
|
|
|
|
+ sph: title (caption) -> body_text (视频号 title 字段塞的是整段 caption)
|
|
|
|
|
+ """
|
|
|
|
|
+ if platform == "youtube":
|
|
|
|
|
+ try:
|
|
|
|
|
+ from agent.tools.builtin.content.platforms.youtube import _normalize_youtube_post
|
|
|
|
|
+ _normalize_youtube_post(post)
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ pass
|
|
|
|
|
+ elif platform == "sph":
|
|
|
|
|
+ try:
|
|
|
|
|
+ from agent.tools.builtin.content.platforms.aigc_channel import _normalize_sph_post
|
|
|
|
|
+ _normalize_sph_post(post)
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ pass
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
def build_cache_index(cache_dir: Path, trace_ids: Optional[List[str]] = None) -> Dict[Tuple[str, str], Dict[str, Any]]:
|
|
def build_cache_index(cache_dir: Path, trace_ids: Optional[List[str]] = None) -> Dict[Tuple[str, str], Dict[str, Any]]:
|
|
|
"""
|
|
"""
|
|
|
构建 (platform, channel_content_id) -> post 映射。
|
|
构建 (platform, channel_content_id) -> post 映射。
|
|
@@ -285,9 +444,14 @@ def build_cache_index(cache_dir: Path, trace_ids: Optional[List[str]] = None) ->
|
|
|
for post in posts or []:
|
|
for post in posts or []:
|
|
|
if not isinstance(post, dict):
|
|
if not isinstance(post, dict):
|
|
|
continue
|
|
continue
|
|
|
|
|
+
|
|
|
|
|
+ # 平台字段 normalize:兜底救援早期 cache(normalize 函数加之前写入的)
|
|
|
|
|
+ _normalize_post_in_place(platform, post)
|
|
|
|
|
+
|
|
|
cid = post.get("channel_content_id")
|
|
cid = post.get("channel_content_id")
|
|
|
|
|
|
|
|
# YouTube 平台用 video_id 而非 channel_content_id
|
|
# YouTube 平台用 video_id 而非 channel_content_id
|
|
|
|
|
+ # (normalize 已经处理过,这里是双保险,对早期未 normalize 的 post 也兜底)
|
|
|
if not cid and post.get("video_id"):
|
|
if not cid and post.get("video_id"):
|
|
|
cid = post.get("video_id")
|
|
cid = post.get("video_id")
|
|
|
post["channel_content_id"] = cid # 补全字段
|
|
post["channel_content_id"] = cid # 补全字段
|
|
@@ -318,6 +482,8 @@ def extract_sources_to_json(
|
|
|
min_body_len: int = DEFAULT_MIN_BODY_LEN,
|
|
min_body_len: int = DEFAULT_MIN_BODY_LEN,
|
|
|
min_score: float = DEFAULT_MIN_SCORE,
|
|
min_score: float = DEFAULT_MIN_SCORE,
|
|
|
cutoff_date: Tuple[int, int, int] = DEFAULT_CUTOFF_DATE,
|
|
cutoff_date: Tuple[int, int, int] = DEFAULT_CUTOFF_DATE,
|
|
|
|
|
+ auto_transcribe: bool = True,
|
|
|
|
|
+ transcribe_concurrency: int = 3,
|
|
|
) -> Dict[str, Any]:
|
|
) -> Dict[str, Any]:
|
|
|
"""
|
|
"""
|
|
|
扫描 raw_cases_dir 下的 case_*.json,
|
|
扫描 raw_cases_dir 下的 case_*.json,
|
|
@@ -483,6 +649,45 @@ def extract_sources_to_json(
|
|
|
# 4. 合并已有数据和新匹配的数据
|
|
# 4. 合并已有数据和新匹配的数据
|
|
|
all_sources = existing_sources + matched
|
|
all_sources = existing_sources + matched
|
|
|
|
|
|
|
|
|
|
+ # 4.5. 自动 backfill 视频转写(保底兜底)
|
|
|
|
|
+ # 触发条件:post 有视频源(extract_video_url 非空)且**完全没有 `video_transcript` 字段**
|
|
|
|
|
+ # 空字符串视为"已尝试过"不重跑,跨平台统一。失败也会写 "",下次跳过避免反复浪费 Deepgram 额度。
|
|
|
|
|
+ # 跑完写回所有 cache 文件,让其他 trace 引用同一 post 时直接复用。
|
|
|
|
|
+ auto_transcribe_stats: Dict[str, Any] = {"attempted": 0, "succeeded": 0, "cache_writeback": 0}
|
|
|
|
|
+ if auto_transcribe and all_sources:
|
|
|
|
|
+ try:
|
|
|
|
|
+ transcribe_targets = sum(
|
|
|
|
|
+ 1 for s in all_sources
|
|
|
|
|
+ if isinstance(s.get("post"), dict)
|
|
|
|
|
+ and _needs_transcribe(s.get("platform"), s["post"])
|
|
|
|
|
+ )
|
|
|
|
|
+ if transcribe_targets > 0:
|
|
|
|
|
+ logger.info("extract_sources: auto-transcribe %d post(s)", transcribe_targets)
|
|
|
|
|
+ updates = asyncio.run(
|
|
|
|
|
+ _transcribe_pending_async(all_sources, concurrency=transcribe_concurrency)
|
|
|
|
|
+ )
|
|
|
|
|
+ auto_transcribe_stats["attempted"] = transcribe_targets
|
|
|
|
|
+ auto_transcribe_stats["succeeded"] = len(updates)
|
|
|
|
|
+ # 写回 cache(跨 trace 扩散)
|
|
|
|
|
+ if updates:
|
|
|
|
|
+ n = _writeback_transcript_to_cache(cache_dir, updates)
|
|
|
|
|
+ auto_transcribe_stats["cache_writeback"] = n
|
|
|
|
|
+ # 顺手把 transcript merge 进 body_text,保持跟 _merge_transcript_into_body 一致
|
|
|
|
|
+ for s in all_sources:
|
|
|
|
|
+ post = s.get("post")
|
|
|
|
|
+ if not isinstance(post, dict):
|
|
|
|
|
+ continue
|
|
|
|
|
+ if not post.get("video_transcript"):
|
|
|
|
|
+ continue
|
|
|
|
|
+ merged = _merge_transcript_into_body(post)
|
|
|
|
|
+ if merged is not post:
|
|
|
|
|
+ post["body_text"] = merged.get("body_text", post.get("body_text", ""))
|
|
|
|
|
+ except RuntimeError as e:
|
|
|
|
|
+ # 比如已在 event loop 内 — 跳过 auto-transcribe 不阻塞主流程
|
|
|
|
|
+ logger.warning("auto-transcribe skipped: %s", e)
|
|
|
|
|
+ except Exception as e:
|
|
|
|
|
+ logger.warning("auto-transcribe failed: %s", e)
|
|
|
|
|
+
|
|
|
# 5. 统一过滤:body_text 完整性 / agent 评分 / 时效
|
|
# 5. 统一过滤:body_text 完整性 / agent 评分 / 时效
|
|
|
from datetime import datetime as _dt
|
|
from datetime import datetime as _dt
|
|
|
cutoff_ts = int(_dt(*cutoff_date).timestamp())
|
|
cutoff_ts = int(_dt(*cutoff_date).timestamp())
|
|
@@ -574,6 +779,7 @@ def extract_sources_to_json(
|
|
|
"filtered_reasons": reason_counts,
|
|
"filtered_reasons": reason_counts,
|
|
|
"filtered_details": filtered_details,
|
|
"filtered_details": filtered_details,
|
|
|
"images_downloaded": images_downloaded,
|
|
"images_downloaded": images_downloaded,
|
|
|
|
|
+ "auto_transcribe": auto_transcribe_stats,
|
|
|
"output_file": str(output_file),
|
|
"output_file": str(output_file),
|
|
|
}
|
|
}
|
|
|
|
|
|