|
|
@@ -12,6 +12,7 @@ like_count / publish_timestamp / images / videos / channel / link
|
|
|
python xhs_fetch/xhs_fetch.py <url> [<url> ...] [--output <subdir>]
|
|
|
python xhs_fetch/xhs_fetch.py --urls-file urls.txt
|
|
|
python xhs_fetch/xhs_fetch.py <url> --stdout # 不写文件,打 JSON 数组到 stdout
|
|
|
+ python xhs_fetch/xhs_fetch.py <video_url> --transcribe # 视频帖跑 Deepgram 转写并入 body_text
|
|
|
|
|
|
退码:0 全成功 / 1 全失败或参数错 / 2 部分失败 / 130 Ctrl+C
|
|
|
|
|
|
@@ -92,6 +93,65 @@ def _coerce_int(v: Any) -> int:
|
|
|
return 0
|
|
|
|
|
|
|
|
|
def extract_video_urls(note: Dict[str, Any]) -> List[str]:
    """Collect playable video URLs from ``note["video"]["media"]["stream"]``.

    ``stream`` is read as a mapping from codec name (``"h264"``, ``"h265"``,
    ``"av1"``) to a list of entries; each entry may carry a ``masterUrl``
    and a list of ``backupUrls``. Codecs are visited in that fixed order, so
    h264 URLs come first in the result.

    Exactly one URL is chosen per entry: its ``masterUrl`` when non-empty,
    otherwise its first non-empty backup. An entry whose chosen URL was
    already collected is skipped as a duplicate; it does NOT fall through to
    a backup URL, which would list the same video twice.

    Args:
        note: The note dictionary parsed from the page state. Missing,
            ``None`` or non-dict intermediate nodes are treated as "no
            video".

    Returns:
        De-duplicated URLs in discovery order; an empty list when the note
        has no video stream subtree.
    """
    # Walk video -> media -> stream defensively: scraped JSON may have None
    # or an unexpected type at any level.
    node: Any = note
    for key in ("video", "media", "stream"):
        node = node.get(key) if isinstance(node, dict) else None
    stream = node if isinstance(node, dict) else {}

    urls: List[str] = []
    seen = set()
    for codec in ("h264", "h265", "av1"):
        entries = stream.get(codec)
        if not isinstance(entries, (list, tuple)):
            continue
        for item in entries:
            if not isinstance(item, dict):
                continue
            candidates = [item.get("masterUrl")]
            backups = item.get("backupUrls")
            if isinstance(backups, (list, tuple)):
                candidates.extend(backups)
            # The entry's identity is its first usable address.
            chosen = next(
                (u for u in candidates if isinstance(u, str) and u), None
            )
            if chosen is None or chosen in seen:
                continue
            seen.add(chosen)
            urls.append(chosen)
    return urls
|
|
|
+
|
|
|
+
|
|
|
_TRANSCRIPT_MARKER = "[视频字幕]"


def merge_transcript_into_body(post: Dict[str, Any], transcript: str) -> None:
    """Append a video transcript to ``post['body_text']`` in place.

    The transcript is placed under a ``_TRANSCRIPT_MARKER`` heading line.
    If the stripped body is non-empty, the heading and transcript follow it
    after one blank line; otherwise they become the whole body.

    The call is a no-op when the stripped transcript is empty, or when the
    existing body already contains the marker, so repeated calls do not
    stack transcripts.

    Args:
        post: Post dictionary; only the ``"body_text"`` key is read and
            written.
        transcript: Transcript text; ``None`` or whitespace-only values are
            ignored.
    """
    text = (transcript or "").strip()
    if text:
        existing = (post.get("body_text") or "").strip()
        if _TRANSCRIPT_MARKER not in existing:
            # Prefix the existing body only when there is one; the join then
            # supplies the blank-line separator.
            sections = [existing] if existing else []
            sections.append(f"{_TRANSCRIPT_MARKER}\n{text}")
            post["body_text"] = "\n\n".join(sections)
|
|
|
+
|
|
|
+
|
|
|
def transcribe_post(post: Dict[str, Any]) -> Optional[str]:
    """Transcribe the video of one xhs post via the project's transcription module.

    The module's ``extract_video_url("xhs", post)`` is checked first; when
    it returns a falsy value, no transcription is attempted and ``None`` is
    returned. Otherwise the ``transcribe_video_from_post("xhs", post)``
    coroutine is run to completion with ``asyncio.run`` and its result is
    returned.

    NOTE(review): this wrapper does not catch exceptions itself. Whether
    failures surface as ``None`` or as a raised exception depends on the
    transcription module, so callers should guard the call. Per the CLI
    help text, transcription needs DEEPGRAM_KEY, yt-dlp and ffmpeg.

    Args:
        post: Post dictionary, passed unchanged to the transcription module.

    Returns:
        Whatever ``transcribe_video_from_post`` returns, or ``None`` when
        the post has no video source.
    """
    # Imported here rather than at module scope, so the dependency is only
    # loaded when transcription is actually requested.
    import asyncio
    from agent.tools.builtin.content.transcription import (
        extract_video_url,
        transcribe_video_from_post,
    )

    channel = "xhs"
    if extract_video_url(channel, post):
        return asyncio.run(transcribe_video_from_post(channel, post))
    return None
|
|
|
+
|
|
|
+
|
|
|
def parse_post(html: str) -> Dict[str, Any]:
|
|
|
"""从 explore 页 HTML 解析出一个 post 字典(source.json 兼容格式)。"""
|
|
|
data = _parse_initial_state(html)
|
|
|
@@ -116,7 +176,7 @@ def parse_post(html: str) -> Dict[str, Any]:
|
|
|
"like_count": _coerce_int(interact.get("likedCount")),
|
|
|
"publish_timestamp": note.get("time") or "",
|
|
|
"images": images,
|
|
|
- "videos": [],
|
|
|
+ "videos": extract_video_urls(note),
|
|
|
"channel": "xhs",
|
|
|
"link": f"https://www.xiaohongshu.com/explore/{nid}",
|
|
|
}
|
|
|
@@ -193,6 +253,10 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
"--stdout", action="store_true",
|
|
|
help="不写文件,把抓到的 post 数组打到 stdout(JSON)",
|
|
|
)
|
|
|
+ p.add_argument(
|
|
|
+ "--transcribe", action="store_true",
|
|
|
+ help="对视频帖跑 Deepgram 转写,把字幕并入 body_text(需 DEEPGRAM_KEY+yt-dlp+ffmpeg,按量计费)",
|
|
|
+ )
|
|
|
return p
|
|
|
|
|
|
|
|
|
@@ -230,10 +294,28 @@ def run(args) -> int:
|
|
|
pass
|
|
|
continue
|
|
|
|
|
|
+ if args.transcribe and post["videos"]:
|
|
|
+ try:
|
|
|
+ transcript = transcribe_post(post)
|
|
|
+ except Exception as e:
|
|
|
+ transcript = None
|
|
|
+ print(
|
|
|
+ f" transcribe error: {type(e).__name__}: {e}",
|
|
|
+ file=sys.stderr,
|
|
|
+ )
|
|
|
+ if transcript:
|
|
|
+ merge_transcript_into_body(post, transcript)
|
|
|
+ print(
|
|
|
+ f" transcript merged ({len(transcript)} chars)",
|
|
|
+ file=sys.stderr,
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ print(" transcript: <none>", file=sys.stderr)
|
|
|
+
|
|
|
print(
|
|
|
f"[info {i}/{len(urls)}] OK id={post['channel_content_id']} "
|
|
|
f"title={post['title'][:30]!r} body={len(post['body_text'])} "
|
|
|
- f"imgs={len(post['images'])}",
|
|
|
+ f"imgs={len(post['images'])} vids={len(post['videos'])}",
|
|
|
file=sys.stderr,
|
|
|
)
|
|
|
|