|
|
@@ -0,0 +1,351 @@
|
|
|
+"""Download a post's source video, extract audio, transcribe via Deepgram.
|
|
|
+
|
|
|
+Used by platform detail() implementations whose posts ship raw video URLs
|
|
|
+(X, sph, douyin) and don't already supply captions. YouTube has its own
|
|
|
+captions endpoint and bypasses this module.
|
|
|
+
|
|
|
+Pipeline per video:
|
|
|
+ 1. extract_video_url(platform, post) -> source url (page or direct)
|
|
|
+ 2. download to %TEMP%/content_transcribe/<platform>/<stem>.mp4
|
|
|
+ - X : yt-dlp on the page URL (most robust against rotating video URLs)
|
|
|
+ - douyin: httpx + Referer https://www.douyin.com/
|
|
|
+ - sph : httpx + Referer https://channels.weixin.qq.com/
|
|
|
+ 3. ffmpeg -> 16kHz mono AAC 64kbps m4a (~3% the size of the source mp4)
|
|
|
+ 4. POST to Deepgram /v1/listen, model=whisper-large by default
|
|
|
+ 5. Strip spaces inserted by Deepgram between consecutive CJK characters
|
|
|
+
|
|
|
+Returns transcript text on success, None on any failure (silent fallback).
|
|
|
+"""
|
|
|
+
|
|
|
+from __future__ import annotations
|
|
|
+
|
|
|
+import asyncio
|
|
|
+import logging
|
|
|
+import os
|
|
|
+import re
|
|
|
+import subprocess
|
|
|
+import tempfile
|
|
|
+from pathlib import Path
|
|
|
+from typing import Any, Optional
|
|
|
+
|
|
|
+import httpx
|
|
|
+
|
|
|
logger = logging.getLogger(__name__)

# Deepgram speech-to-text endpoint and request defaults.
DEEPGRAM_URL = "https://api.deepgram.com/v1/listen"
DEEPGRAM_MODEL_DEFAULT = "whisper-large"
DEEPGRAM_REQUEST_TIMEOUT = 600.0  # seconds; whisper-large can be slow on long audio
DOWNLOAD_TIMEOUT = 300  # seconds; applies to both yt-dlp and httpx downloads
FFMPEG_TIMEOUT = 600  # seconds for the audio-extraction subprocess
# Desktop Chrome UA; some video CDNs reject requests without a browser-like agent.
UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/124.0 Safari/537.36")

# All downloads and extracted audio live under <tmp>/content_transcribe/<platform>/.
_TMP_ROOT = Path(tempfile.gettempdir()) / "content_transcribe"
# Characters allowed in temp-file stems; every other run is collapsed to "_".
_SAFE_RE = re.compile(r"[^A-Za-z0-9._-]+")
# Zero-width lookbehind/lookahead: remove whitespace strictly between CJK chars,
# preserve CJK<->ASCII boundaries (e.g. "Remotion 是工具" stays intact).
_CJK_SPACE_RE = re.compile(r"(?<=[一-鿿])\s+(?=[一-鿿])")

# Referer headers required by some CDNs for ffprobe / yt-dlp / httpx to access video URLs.
_PLATFORM_REFERERS = {
    "douyin": "https://www.douyin.com/",
    "sph": "https://channels.weixin.qq.com/",
    "xhs": "https://www.xiaohongshu.com/",
    "bili": "https://www.bilibili.com/",
    "weibo": "https://weibo.com/",
}
_DURATION_PROBE_TIMEOUT = 15  # seconds; the moov-box probe should be near-instant
|
|
|
+
|
|
|
+
|
|
|
def extract_video_url(platform: str, post: dict[str, Any]) -> Optional[str]:
    """Pluck a video URL (page or direct) out of a platform's raw post dict.

    Mirrors scratch/crawl_videos.py so the two paths stay in sync; the
    crawler is the source of truth for what shape each platform's post takes.
    """
    if platform == "x":
        candidates = post.get("video_url_list") or []
        if not candidates:
            return None
        first = candidates[0]
        if isinstance(first, dict):
            return first.get("video_url")
        return first

    if platform == "youtube":
        video_id = post.get("video_id") or post.get("content_id")
        if not video_id:
            return None
        return f"https://www.youtube.com/watch?v={video_id}"

    # Generic: aigc-channel platforms (xhs / gzh / sph / douyin / bili / zhihu /
    # weibo / toutiao / github) all expose video URLs under `videos[0]`.
    generic = post.get("videos") or []
    return generic[0] if generic else None
|
|
|
+
|
|
|
+
|
|
|
+def _safe_stem(platform: str, post: dict[str, Any]) -> str:
|
|
|
+ raw_id = (
|
|
|
+ post.get("channel_content_id")
|
|
|
+ or post.get("video_id")
|
|
|
+ or post.get("content_id")
|
|
|
+ or "item"
|
|
|
+ )
|
|
|
+ return f"{platform}_{_SAFE_RE.sub('_', str(raw_id))[:60]}"
|
|
|
+
|
|
|
+
|
|
|
+def _yt_dlp_download(url: str, target: Path) -> Optional[Path]:
|
|
|
+ if target.exists() and target.stat().st_size > 0:
|
|
|
+ return target
|
|
|
+ cmd = ["yt-dlp", "-f", "best[ext=mp4]/best", "-o", str(target),
|
|
|
+ "--no-playlist", "--quiet", "--no-warnings", url]
|
|
|
+ try:
|
|
|
+ r = subprocess.run(cmd, capture_output=True, text=True, timeout=DOWNLOAD_TIMEOUT)
|
|
|
+ except (subprocess.TimeoutExpired, FileNotFoundError) as e:
|
|
|
+ logger.warning("yt-dlp failed for %s: %s", url, e)
|
|
|
+ return None
|
|
|
+ if r.returncode != 0:
|
|
|
+ logger.warning("yt-dlp non-zero for %s: %s", url, (r.stderr or r.stdout)[:200])
|
|
|
+ return None
|
|
|
+ if target.exists() and target.stat().st_size > 0:
|
|
|
+ return target
|
|
|
+ # yt-dlp may have written with a different extension
|
|
|
+ for f in target.parent.glob(target.stem + ".*"):
|
|
|
+ if f.is_file() and f.stat().st_size > 0:
|
|
|
+ return f
|
|
|
+ return None
|
|
|
+
|
|
|
+
|
|
|
+async def _httpx_download(url: str, target: Path, referer: Optional[str] = None) -> Optional[Path]:
|
|
|
+ if target.exists() and target.stat().st_size > 0:
|
|
|
+ return target
|
|
|
+ headers = {"User-Agent": UA}
|
|
|
+ if referer:
|
|
|
+ headers["Referer"] = referer
|
|
|
+ try:
|
|
|
+ async with httpx.AsyncClient(
|
|
|
+ timeout=DOWNLOAD_TIMEOUT, follow_redirects=True, headers=headers
|
|
|
+ ) as client:
|
|
|
+ async with client.stream("GET", url) as resp:
|
|
|
+ if resp.status_code != 200:
|
|
|
+ logger.warning("download HTTP %s for %s", resp.status_code, url)
|
|
|
+ return None
|
|
|
+ with target.open("wb") as f:
|
|
|
+ async for chunk in resp.aiter_bytes(chunk_size=64 * 1024):
|
|
|
+ f.write(chunk)
|
|
|
+ except Exception as e:
|
|
|
+ logger.warning("httpx download failed for %s: %s", url, e)
|
|
|
+ return None
|
|
|
+ return target if target.exists() and target.stat().st_size > 0 else None
|
|
|
+
|
|
|
+
|
|
|
async def _download_video(
    platform: str, post: dict[str, Any], video_url: str, target: Path
) -> Optional[Path]:
    """Dispatch to the right downloader per platform.

    Known-good strategies (from scratch/crawl_videos.py):
      x      : yt-dlp on the tweet page URL (video URLs are signed/rotating)
      douyin : httpx direct with douyin.com Referer (video URL is a play API)
      sph    : httpx direct with channels.weixin.qq.com Referer (stodownload link)
      youtube: yt-dlp on the watch URL

    For everything else (xhs / bili / weibo / zhihu / gzh / toutiao / github / ...):
    try yt-dlp on the post's page URL first (yt-dlp supports 1000+ sites including
    most aigc-channel platforms via cookies-free extractors), and fall back to
    plain httpx on `videos[0]` if yt-dlp can't handle it.
    """
    if platform == "x":
        page_url = post.get("link") or video_url
        return await asyncio.to_thread(_yt_dlp_download, page_url, target)
    if platform == "douyin":
        return await _httpx_download(video_url, target, referer=_PLATFORM_REFERERS["douyin"])
    if platform == "sph":
        return await _httpx_download(video_url, target, referer=_PLATFORM_REFERERS["sph"])
    if platform == "youtube":
        return await asyncio.to_thread(_yt_dlp_download, video_url, target)

    # Generic two-step fallback for any other platform with a `videos` field.
    page_url = post.get("link")
    if page_url:
        result = await asyncio.to_thread(_yt_dlp_download, page_url, target)
        if result:
            return result
        logger.info("yt-dlp didn't handle %s page URL; falling back to httpx", platform)
    # Bug fix: pass the platform's Referer on the generic path too — per
    # _PLATFORM_REFERERS, xhs/bili/weibo CDNs reject referer-less requests,
    # so the bare-httpx fallback previously could not succeed for them.
    return await _httpx_download(video_url, target, referer=_PLATFORM_REFERERS.get(platform))
|
|
|
+
|
|
|
+
|
|
|
+def _extract_m4a(video_path: Path, audio_path: Path) -> bool:
|
|
|
+ """ffmpeg: video -> 16kHz mono AAC 64kbps m4a. Returns True if file written."""
|
|
|
+ audio_path.parent.mkdir(parents=True, exist_ok=True)
|
|
|
+ if audio_path.exists() and audio_path.stat().st_size > 0:
|
|
|
+ return True
|
|
|
+ cmd = ["ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
|
|
|
+ "-i", str(video_path),
|
|
|
+ "-vn", "-ac", "1", "-ar", "16000",
|
|
|
+ "-c:a", "aac", "-b:a", "64k",
|
|
|
+ str(audio_path)]
|
|
|
+ try:
|
|
|
+ subprocess.run(cmd, check=True, timeout=FFMPEG_TIMEOUT,
|
|
|
+ capture_output=True)
|
|
|
+ except (subprocess.CalledProcessError, subprocess.TimeoutExpired, FileNotFoundError) as e:
|
|
|
+ logger.warning("ffmpeg failed for %s: %s", video_path, e)
|
|
|
+ return False
|
|
|
+ return audio_path.exists() and audio_path.stat().st_size > 0
|
|
|
+
|
|
|
+
|
|
|
async def _transcribe_deepgram(
    audio_path: Path,
    api_key: str,
    model: str = DEEPGRAM_MODEL_DEFAULT,
    language: Optional[str] = None,
) -> Optional[str]:
    """POST *audio_path* to Deepgram /v1/listen; return transcript text or None.

    Language auto-detection is enabled unless *language* is given explicitly.
    Every failure mode (request error, non-200 status, malformed payload)
    logs a warning and yields None, matching the module's silent-fallback
    contract.
    """
    params: dict[str, str] = {
        "model": model,
        "smart_format": "true",
        "punctuate": "true",
    }
    if language:
        params["language"] = language
    else:
        params["detect_language"] = "true"
    headers = {
        "Authorization": f"Token {api_key}",
        "Content-Type": "audio/mp4",  # m4a container produced by _extract_m4a
    }
    try:
        audio_bytes = audio_path.read_bytes()
        async with httpx.AsyncClient(timeout=DEEPGRAM_REQUEST_TIMEOUT) as client:
            r = await client.post(DEEPGRAM_URL, params=params, headers=headers,
                                  content=audio_bytes)
    except Exception as e:
        logger.warning("Deepgram request failed for %s: %s", audio_path.name, e)
        return None
    if r.status_code != 200:
        logger.warning("Deepgram HTTP %s: %s", r.status_code, r.text[:200])
        return None
    try:
        data = r.json()
        alt = data["results"]["channels"][0]["alternatives"][0]
        return alt.get("transcript") or None
    # TypeError added: a JSON payload whose top level is a list/str/None would
    # raise TypeError on the string-key index and escape the silent fallback.
    except (KeyError, IndexError, ValueError, TypeError) as e:
        logger.warning("Deepgram response malformed: %s", e)
        return None
|
|
|
+
|
|
|
+
|
|
|
+def _clean_chinese_spaces(text: str) -> str:
|
|
|
+ """Drop whitespace strictly between two CJK characters."""
|
|
|
+ return _CJK_SPACE_RE.sub("", text)
|
|
|
+
|
|
|
+
|
|
|
def _ffprobe_duration_sync(video_url: str, referer: Optional[str] = None) -> Optional[float]:
    """Read mp4 moov box over HTTP Range; returns duration (seconds) or None.

    Does NOT download the video stream — typically pulls only a few KB even for
    multi-GB files. Designed to be called from search() to enrich posts with
    duration before scoring, without paying the cost of a full download.
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=nw=1:nk=1",
    ]
    if referer:
        # Some CDNs refuse referer-less range requests (see _PLATFORM_REFERERS).
        cmd.extend(["-headers", f"Referer: {referer}\r\n"])
    cmd.append(video_url)

    try:
        proc = subprocess.run(cmd, capture_output=True, text=True,
                              timeout=_DURATION_PROBE_TIMEOUT)
    except (subprocess.TimeoutExpired, FileNotFoundError) as exc:
        logger.info("ffprobe duration probe failed for %s: %s", video_url[:80], exc)
        return None

    raw = (proc.stdout or "").strip()
    if not raw:
        return None
    try:
        duration = float(raw)
    except ValueError:
        return None
    if duration > 0:
        return duration
    return None
|
|
|
+
|
|
|
+
|
|
|
async def probe_video_duration(
    video_url: str, platform: Optional[str] = None
) -> Optional[float]:
    """Async wrapper. Probes mp4 duration via HTTP Range; returns seconds or None.

    Pass `platform` to auto-inject the right Referer header (douyin / sph / xhs / bili
    require it). Safe to call concurrently — uses asyncio.to_thread so subprocesses
    don't block the event loop. Each call is one ffprobe subprocess; cap parallelism
    at the call site if probing many URLs.
    """
    if not video_url:
        return None
    referer = None
    if platform:
        referer = _PLATFORM_REFERERS.get(platform)
    return await asyncio.to_thread(_ffprobe_duration_sync, video_url, referer)
|
|
|
+
|
|
|
+
|
|
|
async def probe_durations_for_posts(
    platform: str, posts: list, concurrency: int = 8
) -> None:
    """In-place: probe each post's video URL and set post["duration_sec"] if found.

    Skips posts with no video URL (image-only posts). Probes happen concurrently
    bounded by `concurrency` to avoid spawning a flood of ffprobe subprocesses.
    Failures are silent (post just won't have duration_sec — evaluator handles).
    """
    gate = asyncio.Semaphore(concurrency)

    async def _probe_one(post: dict) -> None:
        video_url = extract_video_url(platform, post)
        if not video_url:
            return
        async with gate:
            duration = await probe_video_duration(video_url, platform=platform)
            if duration is not None:
                post["duration_sec"] = duration

    tasks = [_probe_one(p) for p in posts if isinstance(p, dict)]
    await asyncio.gather(*tasks)
|
|
|
+
|
|
|
+
|
|
|
+def _get_api_key() -> Optional[str]:
|
|
|
+ key = os.environ.get("DEEPGRAM_KEY") or os.environ.get("DEEPGRAM_API_KEY")
|
|
|
+ if key:
|
|
|
+ return key
|
|
|
+ try:
|
|
|
+ from dotenv import load_dotenv
|
|
|
+ load_dotenv()
|
|
|
+ except ImportError:
|
|
|
+ return None
|
|
|
+ return os.environ.get("DEEPGRAM_KEY") or os.environ.get("DEEPGRAM_API_KEY")
|
|
|
+
|
|
|
+
|
|
|
async def transcribe_video_from_post(
    platform: str,
    post: dict[str, Any],
    *,
    model: str = DEEPGRAM_MODEL_DEFAULT,
    language: Optional[str] = None,
) -> Optional[str]:
    """End-to-end: locate video, download, extract m4a, STT, clean spaces.

    Returns transcript text or None if any step fails (logged at WARNING level).
    Caller can safely ignore None and fall back to whatever body text it has.
    """
    video_url = extract_video_url(platform, post)
    if not video_url:
        return None

    api_key = _get_api_key()
    if not api_key:
        logger.warning("DEEPGRAM_KEY not set; skipping transcription for %s", platform)
        return None

    # Per-platform working directory; stems are sanitized so paths are safe.
    stem = _safe_stem(platform, post)
    work_dir = _TMP_ROOT / platform
    work_dir.mkdir(parents=True, exist_ok=True)
    video_path = work_dir / f"{stem}.mp4"
    audio_path = work_dir / f"{stem}.m4a"

    downloaded = await _download_video(platform, post, video_url, video_path)
    if not downloaded:
        return None
    if not await asyncio.to_thread(_extract_m4a, downloaded, audio_path):
        return None

    raw = await _transcribe_deepgram(audio_path, api_key, model=model, language=language)
    if not raw:
        return None
    return _clean_chinese_spaces(raw).strip()
|