1 месяц назад · 1991164813
--- a/.gitignore
+++ b/.gitignore
@@ -48,6 +48,7 @@ CLAUDE.md
 
				 htmlcov/
			
 
				 .tox/
			
 
				 .nox/
			
 
				+scratch/
			
 
				 
			
 
				 # Misc
			
 
				 .DS_Store
			
--- a/agent/tools/builtin/content/platforms/aigc_channel.py
+++ b/agent/tools/builtin/content/platforms/aigc_channel.py
@@ -288,28 +288,56 @@ async def detail(
 
				     # 视频字幕：任何 aigc-channel 平台只要 post.videos 字段非空就触发 Deepgram 转写。
			
 
				     # 下载策略在 transcription._download_video 里按 platform 分支，未指定的平台走
			
 
				     # "yt-dlp on page URL → httpx direct" 两步兜底。
			
 
				+    #
			
 
				+    # 三态语义（跟 extract_sources._needs_transcribe 对齐）：
			
 
				+    #   字段缺失     → 没尝试过，跑 Deepgram
			
 
				+    #   字段 = ""    → 尝试过但失败，跳过（保护 Deepgram 额度）
			
 
				+    #   字段 = text  → 已成功，复用
			
 
				     extras_d = extras or {}
			
 
				-    transcript_text: Optional[str] = post.get("video_transcript")  # cache hit reuse
			
 
				     has_video = bool(post.get("videos"))
			
 
				+    field_present = "video_transcript" in post
			
 
				+    transcript_text: Optional[str] = post.get("video_transcript") or None
			
 
				+
			
 
				     if (
			
 
				-        not transcript_text
			
 
				+        not field_present
			
 
				         and has_video
			
 
				         and extras_d.get("include_transcript", True)
			
 
				     ):
			
 
				         from agent.tools.builtin.content.transcription import transcribe_video_from_post
			
 
				-        transcript_text = await transcribe_video_from_post(platform_id, post)
			
 
				-        if transcript_text:
			
 
				-            post["video_transcript"] = transcript_text
			
 
				-            import os as _os
			
 
				-            from agent.tools.builtin.content import cache as _cache
			
 
				-            trace_id = extras_d.get("__trace_id__") or _os.getenv("TRACE_ID")
			
 
				-            content_id = (
			
 
				-                post.get("channel_content_id")
			
 
				-                or post.get("content_id")
			
 
				-                or post.get("video_id")
			
 
				+        transcribe_error: Optional[str] = None
			
 
				+        try:
			
 
				+            transcript_text = await transcribe_video_from_post(platform_id, post)
			
 
				+        except Exception as e:
			
 
				+            transcript_text = None
			
 
				+            transcribe_error = f"{type(e).__name__}: {e}"
			
 
				+            import logging as _logging
			
 
				+            _logging.getLogger(__name__).warning(
			
 
				+                "transcribe_video_from_post raised for %s: %s", platform_id, e
			
 
				+            )
			
 
				+
			
 
				+        # 三态写回：成功 = text；失败/None = "" 作为"已尝试"标记，下次 cache hit 直接短路。
			
 
				+        final_value = transcript_text or ""
			
 
				+        post["video_transcript"] = final_value
			
 
				+        if not final_value:
			
 
				+            # 失败原因暴露到 output JSON，方便 agent/用户判断要不要重试或换平台
			
 
				+            post["_transcribe_error"] = (
			
 
				+                transcribe_error
			
 
				+                or "transcribe returned None (下载/抽音/Deepgram 任一步失败，见 logger.warning)"
			
 
				+            )
			
 
				+
			
 
				+        # cache writeback 不再以"成功"为前提：失败的 "" 也写回，让下次 cache hit 短路掉
			
 
				+        import os as _os
			
 
				+        from agent.tools.builtin.content import cache as _cache
			
 
				+        trace_id = extras_d.get("__trace_id__") or _os.getenv("TRACE_ID")
			
 
				+        content_id = (
			
 
				+            post.get("channel_content_id")
			
 
				+            or post.get("content_id")
			
 
				+            or post.get("video_id")
			
 
				+        )
			
 
				+        if trace_id and content_id:
			
 
				+            _cache.update_post_field(
			
 
				+                trace_id, platform_id, content_id, "video_transcript", final_value
			
 
				             )
			
 
				-            if trace_id and content_id:
			
 
				-                _cache.update_post_field(trace_id, platform_id, content_id, "video_transcript", transcript_text)
			
 
				 
			
 
				     # transcript already embedded as post["video_transcript"] inside the JSON dump;
			
 
				     # no need to repeat as a separate section.
			
--- a/agent/tools/builtin/content/platforms/youtube.py
+++ b/agent/tools/builtin/content/platforms/youtube.py
@@ -238,13 +238,21 @@ async def detail(post: Dict[str, Any], extras: Optional[Dict[str, Any]] = None)
 
				     """
			
 
				     YouTube 详情：需要额外 HTTP 调用获取字幕/下载等。
			
 
				     post 来自搜索缓存，extras 支持 include_captions / download_video。
			
 
				+
			
 
				+    Graceful degrade: 三条数据通路（/youtube/detail 增强元数据、/youtube/captions 官方字幕、
			
 
				+    Deepgram 自研转写）独立进行，任何一条失败都不影响其他。特别是 Deepgram 走的是
			
 
				+    yt-dlp 下载 watch URL → ffmpeg → Deepgram API，跟 crawler.aiddit.com 后端无关，
			
 
				+    后端宕机时仍应自动跑 transcript。
			
 
				     """
			
 
				     extras = extras or {}
			
 
				-    content_id = post.get("video_id", "")
			
 
				+    content_id = post.get("video_id") or post.get("channel_content_id", "")
			
 
				     include_captions = extras.get("include_captions", True)
			
 
				     download_video = extras.get("download_video", False)
			
 
				     include_transcript = extras.get("include_transcript", True)
			
 
				 
			
 
				+    # ── 1) /youtube/detail：拿增强元数据（标题/描述/点赞等）。失败时用 search post 兜底 ──
			
 
				+    video_info: Dict[str, Any] = {}
			
 
				+    detail_error: Optional[str] = None
			
 
				     try:
			
 
				         async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
			
 
				             resp = await client.post(
			
@@ -253,92 +261,133 @@ async def detail(post: Dict[str, Any], extras: Optional[Dict[str, Any]] = None)
 
				             )
			
 
				             resp.raise_for_status()
			
 
				             detail_data = resp.json()
			
 
				+        if detail_data.get("code") == 0:
			
 
				+            result_data = detail_data.get("data", {})
			
 
				+            video_info = result_data.get("data", {}) if isinstance(result_data, dict) else {}
			
 
				+        else:
			
 
				+            detail_error = detail_data.get("msg") or "未知错误"
			
 
				+    except Exception as e:
			
 
				+        detail_error = str(e)
			
 
				 
			
 
				-        if detail_data.get("code") != 0:
			
 
				-            return ToolResult(title="详情获取失败", output="", error=detail_data.get("msg", "未知错误"))
			
 
				-
			
 
				-        result_data = detail_data.get("data", {})
			
 
				-        video_info = result_data.get("data", {}) if isinstance(result_data, dict) else {}
			
 
				-
			
 
				-        # 字幕
			
 
				-        captions_text = None
			
 
				-        if include_captions or download_video:
			
 
				-            try:
			
 
				-                async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
			
 
				-                    cap_resp = await client.post(
			
 
				-                        f"{CRAWLER_BASE_URL}/youtube/captions",
			
 
				-                        json={"content_id": content_id},
			
 
				-                    )
			
 
				-                    cap_resp.raise_for_status()
			
 
				-                    cap_data = cap_resp.json()
			
 
				-                    if cap_data.get("code") == 0:
			
 
				-                        inner = cap_data.get("data", {})
			
 
				-                        if isinstance(inner, dict):
			
 
				-                            inner2 = inner.get("data", {})
			
 
				-                            if isinstance(inner2, dict):
			
 
				-                                captions_text = inner2.get("content")
			
 
				-            except Exception:
			
 
				-                pass
			
 
				-
			
 
				-        # 下载
			
 
				-        video_path = None
			
 
				-        video_outline = None
			
 
				-        if download_video:
			
 
				-            import asyncio
			
 
				+    # ── 2) /youtube/captions：官方字幕（也走 crawler 后端，同样可能挂） ──
			
 
				+    captions_text: Optional[str] = None
			
 
				+    if include_captions or download_video:
			
 
				+        try:
			
 
				+            async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
			
 
				+                cap_resp = await client.post(
			
 
				+                    f"{CRAWLER_BASE_URL}/youtube/captions",
			
 
				+                    json={"content_id": content_id},
			
 
				+                )
			
 
				+                cap_resp.raise_for_status()
			
 
				+                cap_data = cap_resp.json()
			
 
				+                if cap_data.get("code") == 0:
			
 
				+                    inner = cap_data.get("data", {})
			
 
				+                    if isinstance(inner, dict):
			
 
				+                        inner2 = inner.get("data", {})
			
 
				+                        if isinstance(inner2, dict):
			
 
				+                            captions_text = inner2.get("content")
			
 
				+        except Exception:
			
 
				+            pass
			
 
				+
			
 
				+    # ── 3) 视频文件下载（用户显式 extras.download_video=True 时才跑） ──
			
 
				+    video_path = None
			
 
				+    video_outline = None
			
 
				+    if download_video:
			
 
				+        import asyncio
			
 
				+        try:
			
 
				             from agent.tools.builtin.content.media import download_youtube_video, parse_srt_to_outline
			
 
				             video_path = await asyncio.to_thread(download_youtube_video, content_id)
			
 
				             if captions_text:
			
 
				                 video_outline = parse_srt_to_outline(captions_text)
			
 
				-
			
 
				-        # Deepgram 转写：独立于 captions，无论 captions 是否拿到都会跑（除非显式关掉），
			
 
				-        # 这样面对官方字幕空缺/质量不佳的视频也有兜底。Cache 命中时复用。
			
 
				-        transcript_text: Optional[str] = post.get("video_transcript")
			
 
				-        if not transcript_text and include_transcript:
			
 
				-            from agent.tools.builtin.content.transcription import transcribe_video_from_post
			
 
				-            # transcribe_video_from_post 用 post.get("video_id") 构造 watch URL
			
 
				-            if not post.get("video_id"):
			
 
				-                post["video_id"] = content_id
			
 
				+        except Exception as e:
			
 
				+            import logging
			
 
				+            logging.getLogger(__name__).warning("youtube download_video failed: %s", e)
			
 
				+
			
 
				+    # ── 4) Deepgram 转写：独立于 1)/2)，走 yt-dlp+Deepgram，不依赖 crawler 后端 ──
			
 
				+    #
			
 
				+    # 三态语义（跟 extract_sources / aigc_channel.detail 对齐）：
			
 
				+    #   字段缺失     → 没尝试过，跑 Deepgram
			
 
				+    #   字段 = ""    → 尝试过但失败，跳过（保护 Deepgram 额度）
			
 
				+    #   字段 = text  → 已成功，复用
			
 
				+    transcript_text: Optional[str] = post.get("video_transcript") or None
			
 
				+    field_present = "video_transcript" in post
			
 
				+    transcribe_error: Optional[str] = None
			
 
				+    if not field_present and include_transcript:
			
 
				+        from agent.tools.builtin.content.transcription import transcribe_video_from_post
			
 
				+        if not post.get("video_id"):
			
 
				+            post["video_id"] = content_id
			
 
				+        try:
			
 
				             transcript_text = await transcribe_video_from_post("youtube", post)
			
 
				-            if transcript_text:
			
 
				-                post["video_transcript"] = transcript_text
			
 
				-                import os as _os
			
 
				-                from agent.tools.builtin.content import cache as _cache
			
 
				-                trace_id = extras.get("__trace_id__") or _os.getenv("TRACE_ID")
			
 
				-                if trace_id and content_id:
			
 
				-                    _cache.update_post_field(trace_id, "youtube", content_id, "video_transcript", transcript_text)
			
 
				-
			
 
				-        output_data = {
			
 
				-            "video_id": content_id,
			
 
				-            "title": video_info.get("title", ""),
			
 
				-            "channel": video_info.get("channel_account_name", ""),
			
 
				-            "description": video_info.get("body_text", ""),
			
 
				-            "like_count": video_info.get("like_count"),
			
 
				-            "comment_count": video_info.get("comment_count"),
			
 
				-            "content_link": video_info.get("content_link", ""),
			
 
				-            "captions": captions_text,           # YouTube 官方字幕（可能为空）
			
 
				-            "video_transcript": transcript_text, # Deepgram 转写兜底
			
 
				-        }
			
 
				-        if download_video:
			
 
				-            output_data["video_path"] = video_path
			
 
				-            output_data["video_outline"] = video_outline
			
 
				-
			
 
				-        output_text = json.dumps(output_data, ensure_ascii=False, indent=2)
			
 
				-
			
 
				-        memory_parts = []
			
 
				-        if captions_text:
			
 
				-            memory_parts.append("captions")
			
 
				-        if transcript_text and transcript_text != captions_text:
			
 
				-            memory_parts.append("transcript")
			
 
				-        memory_extra = f" with {'+'.join(memory_parts)}" if memory_parts else ""
			
 
				-
			
 
				-        return ToolResult(
			
 
				-            title=f"YouTube 详情: {video_info.get('title', content_id)}",
			
 
				-            output=output_text,
			
 
				-            long_term_memory=f"YouTube detail for {content_id}{memory_extra}",
			
 
				-        )
			
 
				+        except Exception as e:
			
 
				+            import logging
			
 
				+            logging.getLogger(__name__).warning("youtube transcribe failed: %s", e)
			
 
				+            transcript_text = None
			
 
				+            transcribe_error = f"{type(e).__name__}: {e}"
			
 
				+
			
 
				+        # 三态写回：成功 = text；失败/None = "" 作为"已尝试"标记
			
 
				+        final_value = transcript_text or ""
			
 
				+        post["video_transcript"] = final_value
			
 
				+        if not final_value:
			
 
				+            post["_transcribe_error"] = (
			
 
				+                transcribe_error
			
 
				+                or "transcribe returned None (yt-dlp/Deepgram 任一步失败，见 logger.warning)"
			
 
				+            )
			
 
				 
			
 
				-    except Exception as e:
			
 
				-        return ToolResult(title="YouTube 详情异常", output="", error=str(e))
			
 
				+        # cache writeback 失败的 "" 也写，下次 cache hit 短路
			
 
				+        import os as _os
			
 
				+        from agent.tools.builtin.content import cache as _cache
			
 
				+        trace_id = extras.get("__trace_id__") or _os.getenv("TRACE_ID")
			
 
				+        if trace_id and content_id:
			
 
				+            _cache.update_post_field(trace_id, "youtube", content_id, "video_transcript", final_value)
			
 
				+
			
 
				+    # ── 5) 组装输出：detail 接口的字段优先，缺失时用 search post 兜底 ──
			
 
				+    output_data = {
			
 
				+        "video_id": content_id,
			
 
				+        "title": video_info.get("title") or post.get("title", ""),
			
 
				+        "channel": video_info.get("channel_account_name") or post.get("author", ""),
			
 
				+        "description": (
			
 
				+            video_info.get("body_text")
			
 
				+            or post.get("body_text")
			
 
				+            or post.get("description_snippet", "")
			
 
				+        ),
			
 
				+        "like_count": (
			
 
				+            video_info.get("like_count")
			
 
				+            if video_info.get("like_count") is not None
			
 
				+            else post.get("like_count")
			
 
				+        ),
			
 
				+        "comment_count": video_info.get("comment_count"),
			
 
				+        "content_link": video_info.get("content_link") or post.get("link", ""),
			
 
				+        "captions": captions_text,           # YouTube 官方字幕（可能为空）
			
 
				+        # Deepgram 转写：读 post 字段，三态语义自然透出（"" = 已尝试失败）
			
 
				+        "video_transcript": post.get("video_transcript", ""),
			
 
				+    }
			
 
				+    if detail_error:
			
 
				+        # 显式标记 graceful degrade 状态，让上层知道这次走的是 fallback
			
 
				+        output_data["_detail_backend_error"] = detail_error
			
 
				+    if post.get("_transcribe_error"):
			
 
				+        # Deepgram 这一路失败原因透到 output，方便 agent/用户判断要不要重试
			
 
				+        output_data["_transcribe_error"] = post["_transcribe_error"]
			
 
				+    if download_video:
			
 
				+        output_data["video_path"] = video_path
			
 
				+        output_data["video_outline"] = video_outline
			
 
				+
			
 
				+    output_text = json.dumps(output_data, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+    memory_parts = []
			
 
				+    if captions_text:
			
 
				+        memory_parts.append("captions")
			
 
				+    if transcript_text and transcript_text != captions_text:
			
 
				+        memory_parts.append("transcript")
			
 
				+    if detail_error:
			
 
				+        memory_parts.append(f"degraded(detail backend down)")
			
 
				+    memory_extra = f" with {'+'.join(memory_parts)}" if memory_parts else ""
			
 
				+
			
 
				+    title = video_info.get("title") or post.get("title") or content_id
			
 
				+    return ToolResult(
			
 
				+        title=f"YouTube 详情: {title}",
			
 
				+        output=output_text,
			
 
				+        long_term_memory=f"YouTube detail for {content_id}{memory_extra}",
			
 
				+    )
			
 
				 
			
 
				 
			
 
				 # ── 拼图 ──
			
--- a/agent/tools/builtin/content/transcription.py
+++ b/agent/tools/builtin/content/transcription.py
@@ -24,7 +24,6 @@ import logging
 
				 import os
			
 
				 import re
			
 
				 import subprocess
			
 
				-import tempfile
			
 
				 from pathlib import Path
			
 
				 from typing import Any, Optional
			
 
				 
			
@@ -40,7 +39,10 @@ FFMPEG_TIMEOUT = 600
 
				 UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
			
 
				       "(KHTML, like Gecko) Chrome/124.0 Safari/537.36")
			
 
				 
			
 
				-_TMP_ROOT = Path(tempfile.gettempdir()) / "content_transcribe"
			
 
				+# 项目根目录 / .cache / content_videos —— 不再用系统 %TEMP%，避免被 Windows 偶发清理
			
 
				+# 也避免 8GB+ 视频堆在 AppData\Local\Temp 看不见。
			
 
				+# parents[4]: transcription.py → content/ → builtin/ → tools/ → agent/ → project root
			
 
				+_CACHE_ROOT = Path(__file__).resolve().parents[4] / ".cache" / "content_videos"
			
 
				 _SAFE_RE = re.compile(r"[^A-Za-z0-9._-]+")
			
 
				 # Zero-width lookbehind/lookahead: remove whitespace strictly between CJK chars,
			
 
				 # preserve CJK<->ASCII boundaries (e.g. "Remotion 是工具" stays intact).
			
@@ -58,11 +60,7 @@ _DURATION_PROBE_TIMEOUT = 15
 
				 
			
 
				 
			
 
				 def extract_video_url(platform: str, post: dict[str, Any]) -> Optional[str]:
			
 
				-    """Pluck a video URL (page or direct) out of a platform's raw post dict.
			
 
				-
			
 
				-    Mirrors scratch/crawl_videos.py so the two paths stay in sync; the
			
 
				-    crawler is the source of truth for what shape each platform's post takes.
			
 
				-    """
			
 
				+    """Pluck a video URL (page or direct) out of a platform's raw post dict."""
			
 
				     if platform == "x":
			
 
				         vlist = post.get("video_url_list") or []
			
 
				         if vlist:
			
@@ -93,7 +91,11 @@ def _safe_stem(platform: str, post: dict[str, Any]) -> str:
 
				 def _yt_dlp_download(url: str, target: Path) -> Optional[Path]:
			
 
				     if target.exists() and target.stat().st_size > 0:
			
 
				         return target
			
 
				-    cmd = ["yt-dlp", "-f", "best[ext=mp4]/best", "-o", str(target),
			
 
				+    # Format chain: 优先 muxed mp4（YouTube/X/douyin 通常命中，最快），
			
 
				+    # fallback 到 bestvideo+bestaudio + ffmpeg merge（bili 等 DASH-only 平台），
			
 
				+    # 最后兜底 best。
			
 
				+    cmd = ["yt-dlp", "-f", "best[ext=mp4]/bestvideo+bestaudio/best",
			
 
				+           "-o", str(target),
			
 
				            "--no-playlist", "--quiet", "--no-warnings", url]
			
 
				     try:
			
 
				         r = subprocess.run(cmd, capture_output=True, text=True, timeout=DOWNLOAD_TIMEOUT)
			
@@ -140,7 +142,7 @@ async def _download_video(
 
				 ) -> Optional[Path]:
			
 
				     """Dispatch to the right downloader per platform.
			
 
				 
			
 
				-    Known-good strategies (from scratch/crawl_videos.py):
			
 
				+    Per-platform strategies:
			
 
				       x      : yt-dlp on the tweet page URL (video URLs are signed/rotating)
			
 
				       douyin : httpx direct with douyin.com Referer (video URL is a play API)
			
 
				       sph    : httpx direct with channels.weixin.qq.com Referer (stodownload link)
			
@@ -333,7 +335,7 @@ async def transcribe_video_from_post(
 
				         return None
			
 
				 
			
 
				     stem = _safe_stem(platform, post)
			
 
				-    work_dir = _TMP_ROOT / platform
			
 
				+    work_dir = _CACHE_ROOT / platform
			
 
				     work_dir.mkdir(parents=True, exist_ok=True)
			
 
				     video_path = work_dir / f"{stem}.mp4"
			
 
				     audio_path = work_dir / f"{stem}.m4a"
			
--- a/examples/process_pipeline/db_requirements.json
+++ b/examples/process_pipeline/db_requirements.json
@@ -112,5 +112,7 @@
 
				   "图文排版（优先选择使用终端/API/agent友好工具的帖子）",
			
 
				   "给定一段文案或需求，用 AI 工具生成符合要求的排版图片的方法/教程/工作流",
			
 
				   "给定一段文案或需求，用 AI 工具生成符合要求的排版图片的方法/教程/工作流",
			
 
				-  "帮我找深度的小红书/抖音的内容是如何从选题开始构建的知识，比如如何确定一个选题的方向，逐步填充选题的细节，选题如何扩展成创作脚本，创作脚本创作又有什么技巧的，这种内容构建的知识"
			
 
				+  "给定一段文案或需求，用 AI 工具生成符合要求的排版图片的方法/教程/工作流",
			
 
				+  "用ai生成真实摄影的美女写真组图，要求具有真实感，氛围感，人物一致性保持",
			
 
				+  "520_test"
			
 
				 ]
			
--- a/examples/process_pipeline/run_metrics.json
+++ b/examples/process_pipeline/run_metrics.json
--- a/examples/process_pipeline/script/extract_sources.py
+++ b/examples/process_pipeline/script/extract_sources.py
@@ -11,6 +11,7 @@
 
				 """
			
 
				 
			
 
				 import json
			
 
				+import logging
			
 
				 import re
			
 
				 from pathlib import Path
			
 
				 from typing import Any, Dict, List, Optional, Tuple
			
@@ -18,6 +19,8 @@ import asyncio
 
				 import aiohttp
			
 
				 from urllib.parse import urlparse, parse_qs, urlencode
			
 
				 
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				 
			
 
				 # ── URL → (platform, content_id) 解析 ────────────────────────────────
			
 
				 
			
@@ -141,6 +144,141 @@ def _merge_transcript_into_body(post: Dict[str, Any]) -> Dict[str, Any]:
 
				     return merged
			
 
				 
			
 
				 
			
 
				+def _needs_transcribe(platform: str, post: Dict[str, Any]) -> bool:
			
 
				+    """是否需要给这条 post 跑 Deepgram 转写。
			
 
				+
			
 
				+    语义（用户明确约束）：
			
 
				+      - `video_transcript` 字段**缺失** → 视为"从未尝试"，需要跑
			
 
				+      - `video_transcript` 字段存在（即使为空字符串 ""）→ 视为"已尝试过"，跳过
			
 
				+        （Deepgram 对纯音乐/无人声视频会返回空，这是合法的"失败"标记，不重跑）
			
 
				+      - 必须有视频源（用 transcription.extract_video_url 统一判断，跨平台）
			
 
				+    """
			
 
				+    if not isinstance(post, dict):
			
 
				+        return False
			
 
				+    if "video_transcript" in post:
			
 
				+        return False
			
 
				+    try:
			
 
				+        from agent.tools.builtin.content.transcription import extract_video_url
			
 
				+        return bool(extract_video_url(platform, post))
			
 
				+    except Exception:
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				+async def _transcribe_one_post(
			
 
				+    platform: str,
			
 
				+    post: Dict[str, Any],
			
 
				+    sem: asyncio.Semaphore,
			
 
				+) -> Optional[str]:
			
 
				+    """对一条 post 跑 transcribe_video_from_post，写回 post["video_transcript"]。
			
 
				+
			
 
				+    成功 → post["video_transcript"] = transcript
			
 
				+    失败 → post["video_transcript"] = ""  # 明确"已尝试"标记，避免后续 backfill 重跑
			
 
				+    返回 transcript 或 None。
			
 
				+    """
			
 
				+    from agent.tools.builtin.content.transcription import transcribe_video_from_post
			
 
				+    async with sem:
			
 
				+        try:
			
 
				+            text = await transcribe_video_from_post(platform, post)
			
 
				+        except Exception as e:
			
 
				+            logger.warning("transcribe failed (%s): %s", platform, e)
			
 
				+            text = None
			
 
				+    if text:
			
 
				+        post["video_transcript"] = text
			
 
				+        return text
			
 
				+    # 失败也写一个 ""，表示"我们尝试过 Deepgram 但没拿到结果"
			
 
				+    post["video_transcript"] = ""
			
 
				+    return None
			
 
				+
			
 
				+
			
 
				+async def _transcribe_pending_async(
			
 
				+    matched: List[Dict[str, Any]],
			
 
				+    concurrency: int = 3,
			
 
				+) -> Dict[Tuple[str, str], str]:
			
 
				+    """对 matched 里所有"缺 video_transcript 字段 + 有视频源"的 post 并发跑 Deepgram。
			
 
				+
			
 
				+    返回成功的 {(platform, channel_content_id): transcript_text} 映射，
			
 
				+    供调用方写回 cache 文件（让其他 trace 命中同一 post 时复用）。
			
 
				+    """
			
 
				+    targets: List[Tuple[str, Dict[str, Any]]] = []
			
 
				+    for src in matched:
			
 
				+        platform = src.get("platform")
			
 
				+        post = src.get("post")
			
 
				+        if not isinstance(post, dict) or not platform:
			
 
				+            continue
			
 
				+        if _needs_transcribe(platform, post):
			
 
				+            targets.append((platform, post))
			
 
				+
			
 
				+    if not targets:
			
 
				+        return {}
			
 
				+
			
 
				+    logger.info("Auto-transcribe: %d post(s) pending", len(targets))
			
 
				+    sem = asyncio.Semaphore(concurrency)
			
 
				+    results = await asyncio.gather(
			
 
				+        *[_transcribe_one_post(p, post, sem) for p, post in targets]
			
 
				+    )
			
 
				+
			
 
				+    updates: Dict[Tuple[str, str], str] = {}
			
 
				+    success_n = 0
			
 
				+    for (platform, post), text in zip(targets, results):
			
 
				+        if text:
			
 
				+            success_n += 1
			
 
				+            cid = post.get("channel_content_id") or post.get("video_id")
			
 
				+            if cid:
			
 
				+                updates[(platform, str(cid))] = text
			
 
				+    logger.info("Auto-transcribe: %d/%d success", success_n, len(targets))
			
 
				+    return updates
			
 
				+
			
 
				+
			
 
				+def _writeback_transcript_to_cache(
			
 
				+    cache_dir: Path,
			
 
				+    updates: Dict[Tuple[str, str], str],
			
 
				+) -> int:
			
 
				+    """把新拿到的 transcript 写回所有 cache 文件里匹配的 post。
			
 
				+
			
 
				+    跨 trace 扩散：同一条 video 可能被多个 trace 的搜索引用过，这里一次写回所有
			
 
				+    cache 副本，避免下次另一个 trace 跑 extract_sources 时又触发一遍 Deepgram。
			
 
				+    返回 cache 中被更新的 post 总数。
			
 
				+    """
			
 
				+    if not updates or not cache_dir.exists():
			
 
				+        return 0
			
 
				+    written = 0
			
 
				+    for cf in cache_dir.glob("*.json"):
			
 
				+        try:
			
 
				+            data = json.loads(cf.read_text(encoding="utf-8"))
			
 
				+        except Exception:
			
 
				+            continue
			
 
				+        dirty = False
			
 
				+        for key, entry in data.items():
			
 
				+            if not key.startswith("search:") or not isinstance(entry, dict):
			
 
				+                continue
			
 
				+            platform = key.split(":", 1)[1]
			
 
				+            post_lists = []
			
 
				+            for h in entry.get("history", []) or []:
			
 
				+                post_lists.append(h.get("posts", []))
			
 
				+            if "posts" in entry and isinstance(entry["posts"], list):
			
 
				+                post_lists.append(entry["posts"])
			
 
				+            for posts in post_lists:
			
 
				+                for post in posts or []:
			
 
				+                    if not isinstance(post, dict):
			
 
				+                        continue
			
 
				+                    cid = post.get("channel_content_id") or post.get("video_id")
			
 
				+                    if not cid:
			
 
				+                        continue
			
 
				+                    text = updates.get((platform, str(cid)))
			
 
				+                    # 注意：用 "video_transcript" not in post 判断，跟 _needs_transcribe 语义一致
			
 
				+                    # 已经有字段（即使为空）→ 不覆盖，尊重之前的"已尝试"状态
			
 
				+                    if text and "video_transcript" not in post:
			
 
				+                        post["video_transcript"] = text
			
 
				+                        dirty = True
			
 
				+                        written += 1
			
 
				+        if dirty:
			
 
				+            cf.write_text(
			
 
				+                json.dumps(data, ensure_ascii=False, indent=2),
			
 
				+                encoding="utf-8",
			
 
				+            )
			
 
				+    return written
			
 
				+
			
 
				+
			
 
				 def _is_before_cutoff(source: Dict[str, Any], cutoff_ts: int) -> bool:
			
 
				     """判断帖子是否早于截止时间戳（秒级）
			
 
				 
			
@@ -236,6 +374,27 @@ def _normalize_url(url: str) -> Optional[str]:
 
				         return None
			
 
				 
			
 
				 
			
 
				+def _normalize_post_in_place(platform: str, post: Dict[str, Any]) -> None:
			
 
				+    """对 cache 里读出的 post 做平台相关的字段补齐（in-place）。
			
 
				+
			
 
				+    早期 cache 可能在 platform-level normalize 函数加上之前就写入了，此处兜底补救：
			
 
				+    YouTube: description_snippet -> body_text / thumbnails -> images / url -> videos / ...
			
 
				+    sph:     title (caption) -> body_text （视频号 title 字段塞的是整段 caption）
			
 
				+    """
			
 
				+    if platform == "youtube":
			
 
				+        try:
			
 
				+            from agent.tools.builtin.content.platforms.youtube import _normalize_youtube_post
			
 
				+            _normalize_youtube_post(post)
			
 
				+        except Exception:
			
 
				+            pass
			
 
				+    elif platform == "sph":
			
 
				+        try:
			
 
				+            from agent.tools.builtin.content.platforms.aigc_channel import _normalize_sph_post
			
 
				+            _normalize_sph_post(post)
			
 
				+        except Exception:
			
 
				+            pass
			
 
				+
			
 
				+
			
 
				 def build_cache_index(cache_dir: Path, trace_ids: Optional[List[str]] = None) -> Dict[Tuple[str, str], Dict[str, Any]]:
			
 
				     """
			
 
				     构建 (platform, channel_content_id) -> post 映射。
			
@@ -285,9 +444,14 @@ def build_cache_index(cache_dir: Path, trace_ids: Optional[List[str]] = None) ->
 
				                 for post in posts or []:
			
 
				                     if not isinstance(post, dict):
			
 
				                         continue
			
 
				+
			
 
				+                    # 平台字段 normalize：兜底救援早期 cache（normalize 函数加之前写入的）
			
 
				+                    _normalize_post_in_place(platform, post)
			
 
				+
			
 
				                     cid = post.get("channel_content_id")
			
 
				 
			
 
				                     # YouTube 平台用 video_id 而非 channel_content_id
			
 
				+                    # （normalize 已经处理过，这里是双保险，对早期未 normalize 的 post 也兜底）
			
 
				                     if not cid and post.get("video_id"):
			
 
				                         cid = post.get("video_id")
			
 
				                         post["channel_content_id"] = cid  # 补全字段
			
@@ -318,6 +482,8 @@ def extract_sources_to_json(
 
				     min_body_len: int = DEFAULT_MIN_BODY_LEN,
			
 
				     min_score: float = DEFAULT_MIN_SCORE,
			
 
				     cutoff_date: Tuple[int, int, int] = DEFAULT_CUTOFF_DATE,
			
 
				+    auto_transcribe: bool = True,
			
 
				+    transcribe_concurrency: int = 3,
			
 
				 ) -> Dict[str, Any]:
			
 
				     """
			
 
				     扫描 raw_cases_dir 下的 case_*.json，
			
@@ -483,6 +649,45 @@ def extract_sources_to_json(
 
				     # 4. 合并已有数据和新匹配的数据
			
 
				     all_sources = existing_sources + matched
			
 
				 
			
 
				+    # 4.5. 自动 backfill 视频转写（保底兜底）
			
 
				+    # 触发条件：post 有视频源（extract_video_url 非空）且**完全没有 `video_transcript` 字段**
			
 
				+    # 空字符串视为"已尝试过"不重跑，跨平台统一。失败也会写 ""，下次跳过避免反复浪费 Deepgram 额度。
			
 
				+    # 跑完写回所有 cache 文件，让其他 trace 引用同一 post 时直接复用。
			
 
				+    auto_transcribe_stats: Dict[str, Any] = {"attempted": 0, "succeeded": 0, "cache_writeback": 0}
			
 
				+    if auto_transcribe and all_sources:
			
 
				+        try:
			
 
				+            transcribe_targets = sum(
			
 
				+                1 for s in all_sources
			
 
				+                if isinstance(s.get("post"), dict)
			
 
				+                and _needs_transcribe(s.get("platform"), s["post"])
			
 
				+            )
			
 
				+            if transcribe_targets > 0:
			
 
				+                logger.info("extract_sources: auto-transcribe %d post(s)", transcribe_targets)
			
 
				+                updates = asyncio.run(
			
 
				+                    _transcribe_pending_async(all_sources, concurrency=transcribe_concurrency)
			
 
				+                )
			
 
				+                auto_transcribe_stats["attempted"] = transcribe_targets
			
 
				+                auto_transcribe_stats["succeeded"] = len(updates)
			
 
				+                # 写回 cache（跨 trace 扩散）
			
 
				+                if updates:
			
 
				+                    n = _writeback_transcript_to_cache(cache_dir, updates)
			
 
				+                    auto_transcribe_stats["cache_writeback"] = n
			
 
				+                # 顺手把 transcript merge 进 body_text，保持跟 _merge_transcript_into_body 一致
			
 
				+                for s in all_sources:
			
 
				+                    post = s.get("post")
			
 
				+                    if not isinstance(post, dict):
			
 
				+                        continue
			
 
				+                    if not post.get("video_transcript"):
			
 
				+                        continue
			
 
				+                    merged = _merge_transcript_into_body(post)
			
 
				+                    if merged is not post:
			
 
				+                        post["body_text"] = merged.get("body_text", post.get("body_text", ""))
			
 
				+        except RuntimeError as e:
			
 
				+            # 比如已在 event loop 内 — 跳过 auto-transcribe 不阻塞主流程
			
 
				+            logger.warning("auto-transcribe skipped: %s", e)
			
 
				+        except Exception as e:
			
 
				+            logger.warning("auto-transcribe failed: %s", e)
			
 
				+
			
 
				     # 5. 统一过滤：body_text 完整性 / agent 评分 / 时效
			
 
				     from datetime import datetime as _dt
			
 
				     cutoff_ts = int(_dt(*cutoff_date).timestamp())
			
@@ -574,6 +779,7 @@ def extract_sources_to_json(
 
				         "filtered_reasons": reason_counts,
			
 
				         "filtered_details": filtered_details,
			
 
				         "images_downloaded": images_downloaded,
			
 
				+        "auto_transcribe": auto_transcribe_stats,
			
 
				         "output_file": str(output_file),
			
 
				     }
			
 
				 
			
--- a/examples/process_pipeline/ui/app.js
+++ b/examples/process_pipeline/ui/app.js
@@ -626,6 +626,11 @@ async function fetchRequirementData(index) {
 
				     } catch (e) {
			
 
				         console.error("Failed to fetch data", e);
			
 
				     }
			
 
				+
			
 
				+    // Automatically re-apply search filter on newly loaded data
			
 
				+    if (typeof applySearchFilter === 'function') {
			
 
				+        applySearchFilter();
			
 
				+    }
			
 
				 }
			
 
				 
			
 
				 async function pollStatus() {
			
@@ -1237,8 +1242,52 @@ function setupEventListeners() {
 
				             }
			
 
				         });
			
 
				     }
			
 
				+
			
 
				+    // Search input character matching for Case tab
			
 
				+    const searchInput = document.querySelector('.search-input');
			
 
				+    if (searchInput) {
			
 
				+        searchInput.addEventListener('input', () => {
			
 
				+            applySearchFilter();
			
 
				+        });
			
 
				+    }
			
 
				 }
			
 
				 
			
 
				+window.applySearchFilter = function() {
			
 
				+    const searchInput = document.querySelector('.search-input');
			
 
				+    if (!searchInput) return;
			
 
				+    const query = searchInput.value.toLowerCase().trim();
			
 
				+
			
 
				+    // Filter raw case cards (on "案例" page)
			
 
				+    const cards = document.querySelectorAll('#json-raw .masonry-card');
			
 
				+    cards.forEach(card => {
			
 
				+        const text = card.textContent.toLowerCase();
			
 
				+        if (text.includes(query)) {
			
 
				+            card.style.display = '';
			
 
				+        } else {
			
 
				+            card.style.display = 'none';
			
 
				+        }
			
 
				+    });
			
 
				+
			
 
				+    // Handle empty group headers and grids
			
 
				+    const grids = document.querySelectorAll('#json-raw .masonry-grid');
			
 
				+    grids.forEach(grid => {
			
 
				+        const visibleCards = Array.from(grid.querySelectorAll('.masonry-card')).filter(card => card.style.display !== 'none');
			
 
				+        const prevSibling = grid.previousElementSibling;
			
 
				+        
			
 
				+        if (visibleCards.length > 0) {
			
 
				+            grid.style.display = '';
			
 
				+            if (prevSibling && prevSibling.tagName === 'H3') {
			
 
				+                prevSibling.style.display = '';
			
 
				+            }
			
 
				+        } else {
			
 
				+            grid.style.display = 'none';
			
 
				+            if (prevSibling && prevSibling.tagName === 'H3') {
			
 
				+                prevSibling.style.display = 'none';
			
 
				+            }
			
 
				+        }
			
 
				+    });
			
 
				+};
			
 
				+
			
 
				 // Boot
			
 
				 // ----------------------------------------------------
			
 
				 // Pipeline Chain Visualization Logic