"""
按 URL 抓取小红书帖子内容。
走 explore 页 HTML 内嵌的 window.__INITIAL_STATE__ JSON,不需要 cookie;
URL 必须带 xsec_token(分享/搜索/explore_feed 链接默认都带)。
输出字段集与 examples/process_pipeline/output//raw_cases/source.json 的
post 子对象对齐:channel_content_id / title / content_type / body_text /
like_count / publish_timestamp / images / videos / channel / link
用法:
python xhs_fetch/xhs_fetch.py <url> [<url> ...] [--output <subdir>]
python xhs_fetch/xhs_fetch.py --urls-file urls.txt
python xhs_fetch/xhs_fetch.py --stdout # 不写文件,打 JSON 数组到 stdout
退码:0 全成功 / 1 全失败或参数错 / 2 部分失败 / 130 Ctrl+C
脚本通过探测 .git/pyproject.toml 自动定位项目根,可以放在仓库内任意位置。
"""
import argparse
import json
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional
import httpx
# Force UTF-8 on the console streams (needed for Chinese output on Windows
# consoles). Streams that have no reconfigure() method, or that reject the
# change, are left as they are.
for _s in (sys.stdout, sys.stderr):
    try:
        _s.reconfigure(encoding="utf-8")
    except (AttributeError, OSError):
        pass
def _find_project_root(start: Path) -> Path:
"""沿父目录上爬找 .git / pyproject.toml。"""
p = start.resolve()
for ancestor in [p, *p.parents]:
if (ancestor / ".git").exists() or (ancestor / "pyproject.toml").exists():
return ancestor
return start.resolve().parent
# Project root, located by searching upward from this file for marker files.
PROJECT_ROOT = _find_project_root(Path(__file__))
# Directory that contains this script.
SCRIPT_DIR = Path(__file__).resolve().parent
# Base directory for written output (see resolve_output_subdir).
OUTPUTS_DIR = SCRIPT_DIR / "outputs"
# Put the project root first on the import search path.
sys.path.insert(0, str(PROJECT_ROOT))
# python-dotenv is optional: load <PROJECT_ROOT>/.env when the package is
# installed, otherwise carry on without it.
try:
    from dotenv import load_dotenv
    load_dotenv(PROJECT_ROOT / ".env")
except ImportError:
    pass
# ── 抓取 / 解析 ─────────────────────────────────────
# Request headers sent with every page fetch: a desktop Chrome User-Agent
# string and an Accept value that asks for HTML.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml",
}
# 注意:用 .*? + 后面 固定锚点。lazy 的 \{.+?\} 在嵌套 JSON 上会断在第一个 }。
INITIAL_STATE_RE = re.compile(
r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})\s*", re.S
)
def _parse_initial_state(html: str) -> Dict[str, Any]:
m = INITIAL_STATE_RE.search(html)
if not m:
raise RuntimeError("INITIAL_STATE not found(页面可能未渲染或被风控)")
# SSR 直接把 JS undefined 当裸字面量塞了进来,json.loads 不接,先替换
raw = re.sub(r":\s*undefined", ": null", m.group(1))
return json.loads(raw)
def _coerce_int(v: Any) -> int:
s = str(v or "").strip()
if s.isdigit():
return int(s)
try:
return int(float(s))
except (ValueError, TypeError):
return 0
def parse_post(html: str) -> Dict[str, Any]:
    """Build a post dict (source.json-compatible layout) from explore-page HTML.

    Only the first entry of ``note.noteDetailMap`` in the embedded state is
    used; its key becomes ``channel_content_id``. Image URLs prefer
    ``urlDefault`` over ``url``, and images with neither are skipped.
    ``videos`` is always emitted as an empty list.

    Raises:
        RuntimeError: If ``noteDetailMap`` is missing or empty (errors from
            parsing the embedded state propagate as well).
    """
    state = _parse_initial_state(html)
    detail_map = (state.get("note") or {}).get("noteDetailMap") or {}
    if not detail_map:
        raise RuntimeError("noteDetailMap empty")

    note_id, entry = next(iter(detail_map.items()))
    note = (entry or {}).get("note") or {}

    image_urls = []
    for image in note.get("imageList") or []:
        url = image.get("urlDefault") or image.get("url")
        if url:
            image_urls.append(url)

    liked = (note.get("interactInfo") or {}).get("likedCount")

    post: Dict[str, Any] = {}
    post["channel_content_id"] = note_id
    post["title"] = note.get("title") or ""
    post["content_type"] = note.get("type") or "note"
    post["body_text"] = note.get("desc") or ""
    post["like_count"] = _coerce_int(liked)
    post["publish_timestamp"] = note.get("time") or ""
    post["images"] = image_urls
    post["videos"] = []
    post["channel"] = "xhs"
    post["link"] = f"https://www.xiaohongshu.com/explore/{note_id}"
    return post
def fetch_one(client: httpx.Client, url: str) -> Dict[str, Any]:
    """Download *url* with *client* and parse the page into a post dict.

    The request sends ``HEADERS``, follows redirects and uses a 30-second
    timeout. ``raise_for_status()`` is called before parsing, so HTTP error
    responses raise instead of being parsed.
    """
    response = client.get(
        url,
        headers=HEADERS,
        follow_redirects=True,
        timeout=30.0,
    )
    response.raise_for_status()
    page_html = response.text
    return parse_post(page_html)
# ── 输出沙盒 ────────────────────────────────────────
def resolve_output_subdir(rel_path: Optional[str]) -> Path:
    """Map the ``--output`` value onto a directory inside ``OUTPUTS_DIR``.

    An empty or missing value selects ``OUTPUTS_DIR`` itself. Absolute paths
    are rejected, and so is any relative path whose resolved location falls
    outside ``OUTPUTS_DIR`` (for example via ``..``).

    Raises:
        SystemExit: With an error message, for an absolute or escaping path.
    """
    if not rel_path:
        return OUTPUTS_DIR

    requested = Path(rel_path)
    if requested.is_absolute():
        raise SystemExit(f"ERROR: --output 必须是相对路径: {rel_path!r}")

    target = OUTPUTS_DIR.joinpath(requested).resolve()
    try:
        # relative_to() raises ValueError when target is not under the sandbox.
        target.relative_to(OUTPUTS_DIR.resolve())
    except ValueError:
        raise SystemExit(f"ERROR: --output 越界到 {target}(不允许 '..')")
    else:
        return target
def safe_filename(post: Dict[str, Any]) -> str:
    """Derive the output file name ``xhs_<id prefix>_<slug>.json`` for *post*.

    The slug comes from the title, falling back to the content id and then to
    ``"untitled"``. Each run of characters outside the allowed set collapses
    to one underscore; the result is cut to 40 characters and stripped of
    leading and trailing underscores. The id prefix is the first 12
    characters of ``channel_content_id``.
    """
    label = post.get("title") or post.get("channel_content_id") or "untitled"
    collapsed = re.sub(r"[^\w一-龥]+", "_", label)
    slug = collapsed[:40].strip("_")
    id_prefix = post["channel_content_id"][:12]
    return "xhs_" + id_prefix + "_" + slug + ".json"
# ── 输入收集 ────────────────────────────────────────
def load_urls(args) -> List[str]:
    """Collect the URLs to fetch from ``--urls-file`` and positional arguments.

    File entries come first, one per line; blank lines and lines whose first
    non-blank character is ``#`` are skipped. Positional URLs are appended
    after them.

    Args:
        args: Parsed arguments exposing ``urls_file`` and ``urls``.

    Returns:
        The combined, non-empty list of URLs.

    Raises:
        SystemExit: If no URL was supplied by either route.
    """
    urls: List[str] = []
    if args.urls_file:
        # utf-8-sig reads plain UTF-8 unchanged but also drops a leading BOM.
        # With plain utf-8 the BOM would stay glued to the first entry.
        text = Path(args.urls_file).read_text(
            encoding="utf-8-sig", errors="replace"
        )
        for line in text.splitlines():
            entry = line.strip()
            if entry and not entry.startswith("#"):
                urls.append(entry)
    urls.extend(args.urls or [])
    if not urls:
        raise SystemExit(
            "ERROR: 请通过位置参数或 --urls-file 提供至少一个 URL"
        )
    return urls
# ── CLI ─────────────────────────────────────────────
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser.

    The module docstring is shown verbatim as the description. Arguments:
    positional ``urls`` (zero or more), ``--urls-file``, ``--output`` and the
    ``--stdout`` flag.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    # (flags, keyword options) per argument, in the order they are registered.
    argument_specs = (
        (
            ("urls",),
            {"nargs": "*", "help": "小红书 explore URL(一个或多个,需带 xsec_token)"},
        ),
        (
            ("--urls-file",),
            {"help": "URL 列表文件路径(每行一个 URL,# 开头为注释)"},
        ),
        (
            ("--output",),
            {"help": "相对 outputs/ 的子目录路径,用于本次输出(默认直接写到 outputs/)"},
        ),
        (
            ("--stdout",),
            {"action": "store_true", "help": "不写文件,把抓到的 post 数组打到 stdout(JSON)"},
        ),
    )
    for flags, options in argument_specs:
        parser.add_argument(*flags, **options)
    return parser
def run(args) -> int:
    """Fetch every requested URL and emit the resulting posts.

    Without ``--stdout`` each post is written to its own JSON file in the
    resolved output directory. With ``--stdout`` the posts are collected and
    printed to stdout as one JSON array. Progress and error messages always
    go to stderr. A URL that fails is reported and skipped.

    Returns:
        ``0`` if every URL succeeded, ``2`` if only some failed, ``1`` if all
        failed.
    """
    urls = load_urls(args)
    total = len(urls)

    out_dir: Optional[Path] = None
    if not args.stdout:
        out_dir = resolve_output_subdir(args.output)
        out_dir.mkdir(parents=True, exist_ok=True)
        print(f"[info] outputs -> {out_dir}", file=sys.stderr)
    print(f"[info] urls={total}", file=sys.stderr)

    collected: List[Dict[str, Any]] = []
    failed = 0
    with httpx.Client() as client:
        for index, url in enumerate(urls, 1):
            try:
                post = fetch_one(client, url)
            except Exception as exc:
                failed += 1
                print(
                    f"[err {index}/{total}] {type(exc).__name__}: {exc} url={url[:80]}",
                    file=sys.stderr,
                )
                # Best effort: show whatever response/body the exception carries.
                for attr in ("response", "body"):
                    payload = getattr(exc, attr, None)
                    if payload is None:
                        continue
                    try:
                        text = payload.text if hasattr(payload, "text") else str(payload)
                        print(f" server body: {text[:400]}", file=sys.stderr)
                    except Exception:
                        pass
                continue

            summary = (
                f"[info {index}/{total}] OK id={post['channel_content_id']} "
                f"title={post['title'][:30]!r} body={len(post['body_text'])} "
                f"imgs={len(post['images'])}"
            )
            print(summary, file=sys.stderr)

            if args.stdout:
                collected.append(post)
                continue

            assert out_dir is not None
            destination = out_dir / safe_filename(post)
            serialized = json.dumps(post, ensure_ascii=False, indent=2)
            destination.write_text(serialized, encoding="utf-8")
            print(f" -> {destination.relative_to(SCRIPT_DIR)}", file=sys.stderr)

    if args.stdout:
        json.dump(collected, sys.stdout, ensure_ascii=False, indent=2)
        sys.stdout.write("\n")

    print(f"[info] done: ok={total - failed} fail={failed}", file=sys.stderr)
    if failed == 0:
        return 0
    return 2 if failed < total else 1
def main():
    """CLI entry point: parse arguments, run, and exit with the status code.

    Exit status is whatever ``run`` returns, ``130`` after Ctrl+C, and ``1``
    after any other unexpected exception (reported with a traceback on
    stderr). ``SystemExit`` raised while parsing or running is passed through
    unchanged.
    """
    try:
        status = run(build_parser().parse_args())
    except KeyboardInterrupt:
        print("\n[info] interrupted by user (Ctrl+C)", file=sys.stderr)
        status = 130
    except SystemExit:
        raise
    except BaseException as exc:
        import traceback
        print(f"\n!!! UNEXPECTED ERROR: {type(exc).__name__}: {exc}", file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        status = 1
    sys.exit(status)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()