scrape_selection_to_eval.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. # -*- coding: utf-8 -*-
  2. """
  3. 把几个公开帖子(小红书 / 微信公众号 / CSDN)的内容爬下来,归一化成
  4. procedure-dsl/input/eval_case-*.json 那套 schema,并跑 SourceQualityEvaluator
  5. 打质量分。
  6. 每条 source 只要给 URL 即可(平台从域名自动识别),无需登录 / 无需后端:
  7. - 小红书 explore 页:解析页面里的 window.__INITIAL_STATE__(含 xsec_token 的
  8. 分享链最稳),拿 title / desc / 点赞 / 图片 / 时间。
  9. - 微信公众号 mp.weixin.qq.com/s/<token>:解析 og:title + #js_content 正文 +
  10. var ct 发布时间 + 正文图片。公众号的阅读/点赞需要登录态 cookie,拿不到,
  11. like_count 置 None。
  12. - CSDN 文章页:解析 #content_views 正文 + 发布时间 + 正文图片。
  13. 输出 schema(与 eval_case-1.json 完全一致、字段同序):
  14. channel_content_id / title / content_type / body_text / like_count /
  15. publish_timestamp(字符串 "YYYY-MM-DD HH:MM:SS") / images / videos /
  16. channel / link / _quality_score / _quality_grade
  17. 用法:
  18. python scrape_selection_to_eval.py # 跑下面 SOURCES 里的全部链接
  19. 输出:
  20. ./scraped_selection/<channel>_<id>.json # 每条一个文件
  21. ./scraped_selection/_all.json # 合并成一个 list
  22. """
  23. from __future__ import annotations
  24. import json
  25. import re
  26. import sys
  27. from datetime import datetime
  28. from pathlib import Path
  29. from typing import Any, Dict, List, Optional
  30. import httpx
  31. from bs4 import BeautifulSoup
  32. # ── 让本脚本能 import 同仓的质量评估器(在上一级 script/ 目录)──
  33. SCRIPT_DIR = Path(__file__).resolve().parent # .../search_eval
  34. sys.path.insert(0, str(SCRIPT_DIR.parent)) # .../script
  35. from evaluate_source_quality import SourceQualityEvaluator # noqa: E402
  36. OUT_DIR = SCRIPT_DIR / "scraped_selection"
  37. UA = (
  38. "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
  39. "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
  40. )
  41. # ── 要爬的链接(按用户给的 4 条)──
  42. SOURCES: List[str] = [
  43. "https://blog.csdn.net/2402_86571652/article/details/153465368",
  44. "https://mp.weixin.qq.com/s/3pZ3BAET1wPGwPV92zEu9A",
  45. "https://mp.weixin.qq.com/s/LWBNsgbwFl1NfDNOYEom-Q",
  46. "https://www.xiaohongshu.com/explore/6981939f000000000e03da6d?xsec_token=ABKp_PMFXV9IuxRL9a_48ovmBpnetn8v2aUefL9JW_E_U=&xsec_source=pc_search&source=web_explore_feed",
  47. ]
  48. # eval_case schema 的字段顺序
  49. FIELD_ORDER = [
  50. "channel_content_id", "title", "content_type", "body_text", "like_count",
  51. "publish_timestamp", "images", "videos", "channel", "link",
  52. "_quality_score", "_quality_grade",
  53. ]
  54. # ── 通用工具 ──
  55. def _get(url: str) -> str:
  56. """GET 一个页面,返回 HTML 文本。"""
  57. with httpx.Client(timeout=40, follow_redirects=True, headers={"User-Agent": UA}) as c:
  58. r = c.get(url)
  59. r.raise_for_status()
  60. return r.text
  61. def _fmt_ts(ms: Optional[int]) -> Optional[str]:
  62. """epoch 毫秒 -> 本地时间 'YYYY-MM-DD HH:MM:SS' 字符串。拿不到返回 None。"""
  63. if not ms:
  64. return None
  65. try:
  66. return datetime.fromtimestamp(int(ms) / 1000).strftime("%Y-%m-%d %H:%M:%S")
  67. except Exception:
  68. return None
  69. def detect_platform(url: str) -> str:
  70. if "xiaohongshu.com" in url:
  71. return "xhs"
  72. if "mp.weixin.qq.com" in url:
  73. return "gzh"
  74. if "csdn.net" in url:
  75. return "csdn"
  76. raise ValueError(f"不支持的链接(无法识别平台):{url}")
  77. # ── 小红书:解析 window.__INITIAL_STATE__ ──
  78. def _xhs_note_id(url: str) -> str:
  79. m = re.search(r"/(?:explore|discovery/item)/([0-9a-fA-F]+)", url)
  80. if not m:
  81. raise ValueError(f"无法从小红书链接解析 note id:{url}")
  82. return m.group(1)
  83. def _xhs_img_url(item: Dict[str, Any]) -> Optional[str]:
  84. """从 imageList 的一项里取一个可用图片 URL。"""
  85. if item.get("urlDefault"):
  86. return item["urlDefault"]
  87. for info in item.get("infoList") or []:
  88. if info.get("url"):
  89. return info["url"]
  90. return None
  91. def scrape_xhs(url: str) -> Dict[str, Any]:
  92. note_id = _xhs_note_id(url)
  93. html = _get(url)
  94. m = re.search(r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})</script>", html, re.S)
  95. if not m:
  96. raise RuntimeError("小红书页面里没找到 __INITIAL_STATE__(可能被风控/需要登录)")
  97. # 页面 JSON 里会有裸 undefined,json 解析不了,替换成 null
  98. data = json.loads(m.group(1).replace("undefined", "null"))
  99. nd = (((data.get("note") or {}).get("noteDetailMap") or {}).get(note_id) or {}).get("note") or {}
  100. if not nd:
  101. raise RuntimeError(f"__INITIAL_STATE__ 里没有 note {note_id} 的详情")
  102. is_video = (nd.get("type") == "video") or bool(nd.get("video"))
  103. images = []
  104. if not is_video:
  105. for it in nd.get("imageList") or []:
  106. u = _xhs_img_url(it)
  107. if u:
  108. images.append(u)
  109. videos = []
  110. if is_video:
  111. # 视频地址藏得较深,能拿到就放,拿不到留空(不影响主流程)
  112. try:
  113. streams = nd["video"]["media"]["stream"]
  114. for codec in streams.values():
  115. for s in codec:
  116. if s.get("masterUrl"):
  117. videos.append(s["masterUrl"])
  118. break
  119. if videos:
  120. break
  121. except Exception:
  122. pass
  123. interact = nd.get("interactInfo") or {}
  124. like = interact.get("likedCount")
  125. try:
  126. like = int(like)
  127. except (TypeError, ValueError):
  128. like = None
  129. return {
  130. "channel_content_id": note_id,
  131. "title": (nd.get("title") or "").strip(),
  132. "content_type": "video" if is_video else "note",
  133. "body_text": (nd.get("desc") or "").strip(),
  134. "like_count": like,
  135. "_ts_ms": nd.get("time"),
  136. "images": images,
  137. "videos": videos,
  138. "channel": "xhs",
  139. "link": f"https://www.xiaohongshu.com/explore/{note_id}",
  140. }
  141. # ── 微信公众号:解析正文 HTML ──
  142. def scrape_gzh(url: str) -> Dict[str, Any]:
  143. html = _get(url)
  144. if "环境异常" in html and "js_content" not in html:
  145. raise RuntimeError("公众号返回风控页(环境异常)——换个网络/加 cookie 再试")
  146. soup = BeautifulSoup(html, "lxml")
  147. def og(prop: str) -> Optional[str]:
  148. tag = soup.find("meta", attrs={"property": prop})
  149. return tag["content"].strip() if tag and tag.get("content") else None
  150. title = og("og:title") or ""
  151. if not title:
  152. h = soup.select_one("#activity-name")
  153. title = h.get_text(strip=True) if h else ""
  154. # 公众号名:页面里的 var nickname / #js_name
  155. acct = None
  156. mnick = re.search(r'var nickname\s*=\s*["\']([^"\']+)["\']', html)
  157. if mnick:
  158. acct = mnick.group(1).strip()
  159. if not acct:
  160. jn = soup.select_one("#js_name")
  161. acct = jn.get_text(strip=True) if jn else None
  162. body_el = soup.select_one("#js_content")
  163. body = body_el.get_text("\n", strip=True) if body_el else ""
  164. images: List[str] = []
  165. if body_el:
  166. for img in body_el.find_all("img"):
  167. src = img.get("data-src") or img.get("src")
  168. if src and src.startswith("http"):
  169. images.append(src)
  170. # 发布时间:var ct = "<秒级 epoch>"
  171. ts_ms = None
  172. mct = re.search(r'var ct\s*=\s*["\'](\d+)["\']', html)
  173. if mct:
  174. ts_ms = int(mct.group(1)) * 1000
  175. # 用链接里的短 token 作为稳定 id
  176. mtok = re.search(r"/s/([A-Za-z0-9_-]+)", url)
  177. cid = mtok.group(1) if mtok else url
  178. return {
  179. "channel_content_id": cid,
  180. "title": title,
  181. "content_type": "article",
  182. "body_text": body,
  183. "like_count": None, # 公众号阅读/点赞需登录态,拿不到
  184. "_ts_ms": ts_ms,
  185. "images": images,
  186. "videos": [],
  187. "channel": "gzh",
  188. "channel_account_name": acct, # 额外信息,写到输出里方便溯源
  189. "link": url.split("#")[0],
  190. }
  191. # ── CSDN:解析文章 HTML ──
  192. def scrape_csdn(url: str) -> Dict[str, Any]:
  193. html = _get(url)
  194. soup = BeautifulSoup(html, "lxml")
  195. def og(prop: str) -> Optional[str]:
  196. tag = soup.find("meta", attrs={"property": prop})
  197. return tag["content"].strip() if tag and tag.get("content") else None
  198. title = og("og:title") or ""
  199. h = soup.select_one("h1.title-article, #articleContentId")
  200. if h:
  201. title = h.get_text(strip=True) or title
  202. title = re.sub(r"[-_]\s*CSDN.*$", "", title).strip()
  203. body_el = soup.select_one("#content_views")
  204. body = body_el.get_text("\n", strip=True) if body_el else ""
  205. images: List[str] = []
  206. if body_el:
  207. for img in body_el.find_all("img"):
  208. src = img.get("src") or img.get("data-src")
  209. if src and src.startswith("http"):
  210. images.append(src)
  211. # 发布时间:meta article:published_time(ISO)或 span.time
  212. ts_ms = None
  213. pub = og("article:published_time")
  214. if pub:
  215. try:
  216. ts_ms = int(datetime.fromisoformat(pub.replace("Z", "+00:00")).timestamp() * 1000)
  217. except Exception:
  218. ts_ms = None
  219. if ts_ms is None:
  220. t = soup.select_one(".time, .article-bar-top .time")
  221. if t:
  222. mdt = re.search(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", t.get_text())
  223. if mdt:
  224. try:
  225. ts_ms = int(datetime.strptime(mdt.group(0), "%Y-%m-%d %H:%M:%S").timestamp() * 1000)
  226. except Exception:
  227. ts_ms = None
  228. # 点赞数:CSDN 静态页常见 span#spanCount / .get-collection 之类,能抓到就抓
  229. like = None
  230. for sel in ["#spanCount", "#is-like-imgactive + span", ".count"]:
  231. el = soup.select_one(sel)
  232. if el and el.get_text(strip=True).isdigit():
  233. like = int(el.get_text(strip=True))
  234. break
  235. mid = re.search(r"/article/details/(\d+)", url)
  236. cid = mid.group(1) if mid else url
  237. return {
  238. "channel_content_id": cid,
  239. "title": title,
  240. "content_type": "article",
  241. "body_text": body,
  242. "like_count": like,
  243. "_ts_ms": ts_ms,
  244. "images": images,
  245. "videos": [],
  246. "channel": "csdn",
  247. "link": url.split("#")[0].split("?")[0],
  248. }
  249. SCRAPERS = {"xhs": scrape_xhs, "gzh": scrape_gzh, "csdn": scrape_csdn}
  250. def normalize_and_score(raw: Dict[str, Any], evaluator: SourceQualityEvaluator) -> Dict[str, Any]:
  251. """跑质量分(用数值时间戳),再把时间戳转成字符串、按 schema 排好字段。"""
  252. ts_ms = raw.pop("_ts_ms", None)
  253. acct = raw.pop("channel_account_name", None)
  254. # 评估器读数值 publish_timestamp(与现有 pipeline 一致:传 epoch 毫秒)
  255. scoring_post = dict(raw)
  256. scoring_post["publish_timestamp"] = ts_ms or 0
  257. res = evaluator.evaluate_post(scoring_post)
  258. raw["_quality_score"] = res["total_score"]
  259. raw["_quality_grade"] = res["grade"]
  260. # 输出用字符串时间戳
  261. raw["publish_timestamp"] = _fmt_ts(ts_ms)
  262. ordered = {k: raw.get(k) for k in FIELD_ORDER}
  263. if acct: # 公众号名放在 schema 字段之后,作为溯源附注
  264. ordered["channel_account_name"] = acct
  265. return ordered
  266. def main() -> None:
  267. OUT_DIR.mkdir(exist_ok=True)
  268. evaluator = SourceQualityEvaluator()
  269. results: List[Dict[str, Any]] = []
  270. for url in SOURCES:
  271. plat = detect_platform(url)
  272. print(f"[{plat}] {url}")
  273. try:
  274. raw = SCRAPERS[plat](url)
  275. item = normalize_and_score(raw, evaluator)
  276. except Exception as e:
  277. print(f" !! 失败: {type(e).__name__}: {e}")
  278. continue
  279. fname = f"{item['channel']}_{item['channel_content_id']}.json"
  280. (OUT_DIR / fname).write_text(
  281. json.dumps(item, ensure_ascii=False, indent=2), encoding="utf-8"
  282. )
  283. results.append(item)
  284. print(f" -> {fname} | {item['_quality_grade']} {item['_quality_score']} "
  285. f"| 正文 {len(item.get('body_text') or '')} 字 | 图 {len(item.get('images') or [])} "
  286. f"| 赞 {item.get('like_count')}")
  287. (OUT_DIR / "_all.json").write_text(
  288. json.dumps(results, ensure_ascii=False, indent=2), encoding="utf-8"
  289. )
  290. print(f"\n完成:{len(results)}/{len(SOURCES)} 条 -> {OUT_DIR}")
  291. if __name__ == "__main__":
  292. main()