fetch_xhs_data.py

#!/usr/bin/env python3
"""
Xiaohongshu (小红书) post data fetching script.

Given a post URL, fetches the post's details and the author's historical
posts, and saves them to a local directory.
"""
import json
import re
from pathlib import Path
from typing import Dict
import sys
import argparse
import shutil

# Shared utility module
from xhs_utils import (
    get_note_detail,
    get_author_history_notes,
    merge_note_data,
    transform_note_data,
)


def extract_note_id_from_url(url: str) -> str:
    """
    Extract the note_id from a Xiaohongshu URL.

    Args:
        url: Xiaohongshu post URL

    Returns:
        note_id: the post ID

    Example:
        https://www.xiaohongshu.com/explore/68c6a924000000001b0336d0?xsec_token=...
        returns: 68c6a924000000001b0336d0
    """
    # Try to extract the ID from the URL path
    pattern = r'/explore/([a-f0-9]+)'
    match = re.search(pattern, url)
    if match:
        return match.group(1)
    # If a bare note_id was passed in, return it as-is
    if re.match(r'^[a-f0-9]{24}$', url):
        return url
    raise ValueError(f"无法从URL中提取note_id: {url}")
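
# Illustrative check of extract_note_id_from_url (uses the script's built-in
# example note_id; both input forms resolve to the same ID):
#
#   extract_note_id_from_url(
#       "https://www.xiaohongshu.com/explore/68c6a924000000001b0336d0?xsec_token=...")
#   # -> "68c6a924000000001b0336d0"
#   extract_note_id_from_url("68c6a924000000001b0336d0")  # bare 24-char hex ID
#   # -> "68c6a924000000001b0336d0"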


def save_note_to_file(note_data: Dict, file_path: Path):
    """
    Save note data to a JSON file.

    Args:
        note_data: the note data
        file_path: destination file path
    """
    # Make sure the directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # Write the JSON file
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(note_data, f, ensure_ascii=False, indent=2)
    print(f"已保存: {file_path}")


def check_note_data_integrity(note_data: dict) -> bool:
    """
    Check the integrity of note data.

    Args:
        note_data: note data dict

    Returns:
        bool: True if at least one of the "images" or "video" fields is
        non-empty, otherwise False.
    """
    images = note_data.get("images", [])
    video = note_data.get("video")
    # "images" must be a non-empty list
    has_images = isinstance(images, list) and len(images) > 0
    # "video" must exist and be non-empty (either a string or a dict)
    has_video = video is not None and video != "" and video != {}
    return has_images or has_video
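
# Expected behavior on some hypothetical payloads (not real API responses):
#
#   check_note_data_integrity({"images": ["a.jpg"]})                   # -> True
#   check_note_data_integrity({"video": "https://example.com/v.mp4"})  # -> True
#   check_note_data_integrity({"images": [], "video": {}})             # -> False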


def check_data_exists(note_id: str, output_dir: str = "examples") -> dict:
    """
    Check whether the data already exists and is complete.

    Args:
        note_id: post ID
        output_dir: output root directory

    Returns:
        dict: check results (existence, completeness, paths, incomplete files)
    """
    result = {
        "exists": False,
        "complete": False,
        "target_note_path": None,
        "history_notes_path": None,
        "incomplete_files": [],
        "note_id": note_id,
    }
    # Build the paths
    input_dir = Path(output_dir) / note_id / "输入"
    target_note_path = input_dir / "待解构帖子.json"
    history_notes_path = input_dir / "作者历史帖子"
    result["target_note_path"] = target_note_path
    result["history_notes_path"] = history_notes_path
    # Does the input directory exist?
    if not input_dir.exists():
        return result
    result["exists"] = True
    # Does the target note exist, and is it complete?
    if not target_note_path.exists():
        result["incomplete_files"].append(str(target_note_path))
        return result
    try:
        with open(target_note_path, 'r', encoding='utf-8') as f:
            target_note_data = json.load(f)
        if not check_note_data_integrity(target_note_data):
            result["incomplete_files"].append(str(target_note_path))
    except Exception as e:
        result["incomplete_files"].append(f"{target_note_path} (读取错误: {e})")
    # Check the history notes directory
    if not history_notes_path.exists():
        result["incomplete_files"].append(str(history_notes_path))
        return result
    # Check the integrity of each history note file
    history_files = list(history_notes_path.glob("*.json"))
    if len(history_files) == 0:
        result["incomplete_files"].append(f"{history_notes_path} (没有历史帖子文件)")
    else:
        # Count the valid history notes
        valid_history_count = 0
        for history_file in history_files:
            try:
                with open(history_file, 'r', encoding='utf-8') as f:
                    history_note_data = json.load(f)
                if not check_note_data_integrity(history_note_data):
                    result["incomplete_files"].append(str(history_file))
                else:
                    valid_history_count += 1
            except Exception as e:
                result["incomplete_files"].append(f"{history_file} (读取错误: {e})")
        # Require more than 4 valid history notes
        if valid_history_count <= 4:
            result["incomplete_files"].append(
                f"{history_notes_path} (有效历史帖子数量 {valid_history_count} ≤ 4,不满足要求)"
            )
    # The data is complete if no incomplete files were recorded
    result["complete"] = len(result["incomplete_files"]) == 0
    return result
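
# Sketch of how a caller might consume the result dict (keys as defined
# above; the note_id is the script's built-in example):
#
#   result = check_data_exists("68c6a924000000001b0336d0")
#   if result["exists"] and not result["complete"]:
#       for path in result["incomplete_files"]:
#           print(f"incomplete: {path}")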


def delete_incomplete_data(note_id: str, output_dir: str = "examples") -> bool:
    """
    Delete an incomplete data directory.

    Args:
        note_id: post ID
        output_dir: output root directory

    Returns:
        bool: True if the deletion succeeded, otherwise False.
    """
    try:
        target_dir = Path(output_dir) / note_id
        if target_dir.exists():
            shutil.rmtree(target_dir)
            print(f"  ✓ 已删除不完整数据目录: {target_dir}")
            return True
        else:
            print(f"  ⚠️ 目录不存在: {target_dir}")
            return False
    except Exception as e:
        print(f"  ✗ 删除目录失败: {e}")
        return False


def fetch_and_save_xhs_data(url: str, output_dir: str = "examples",
                            check_only: bool = False, skip_if_exists: bool = True,
                            clean_incomplete: bool = False):
    """
    Fetch Xiaohongshu post data and save it locally.

    Args:
        url: Xiaohongshu post URL
        output_dir: output directory, defaults to "examples"
        check_only: if True, only check whether the data exists; do not fetch
        skip_if_exists: if True and the data already exists and is complete, skip fetching
        clean_incomplete: if True, automatically delete incomplete data when detected
    """
    print(f"\n{'='*80}")
    print(f"{'[检查模式]' if check_only else '[处理模式]'} 根据帖子URL获取数据")
    print(f"{'='*80}")

    # 1. Extract the note_id
    print(f"正在解析URL: {url}")
    note_id = extract_note_id_from_url(url)
    print(f"提取到note_id: {note_id}")

    # Check whether the data already exists
    check_result = check_data_exists(note_id, output_dir=output_dir)
    if check_result["exists"]:
        if check_result["complete"]:
            print(f"\n✓ 数据已存在且完整")
            print(f"  待解构帖子: {check_result['target_note_path']}")
            print(f"  历史帖子目录: {check_result['history_notes_path']}")
            if check_only or skip_if_exists:
                print('  [检查模式] 跳过获取' if check_only else '  [跳过] 数据已完整')
                return
        else:
            print(f"\n⚠️ 数据存在但不完整")
            print(f"  不完整的文件:")
            for incomplete_file in check_result["incomplete_files"]:
                print(f"    - {incomplete_file}")
            # Optionally clean up the incomplete data
            if clean_incomplete:
                print(f"  [清理模式] 删除不完整数据...")
                delete_incomplete_data(note_id, output_dir)
            if check_only:
                print(f"  [检查模式] 需要重新获取")
                return
            else:
                print(f"  将重新获取数据...")
    else:
        print(f"\nℹ️ 数据不存在")
        if check_only:
            print(f"  [检查模式] 需要获取")
            return

    # In check-only mode, stop here
    if check_only:
        return

    # 2. Fetch the post details
    print(f"正在获取帖子详情...")
    note_detail = get_note_detail(note_id)

    # 3. Transform the data format
    transformed_note = transform_note_data(note_detail)
    account_id = transformed_note["channel_account_id"]

    # 4. Create the directory structure
    base_path = Path(output_dir) / note_id / "输入"
    history_path = base_path / "作者历史帖子"

    # 5. Save the target note
    target_note_path = base_path / "待解构帖子.json"
    save_note_to_file(transformed_note, target_note_path)

    # 6. Fetch the author's historical posts
    if account_id:
        print(f"正在获取作者历史帖子 (账号ID: {account_id})...")
        history_notes = get_author_history_notes(account_id)
        # 7. Process and save each historical post
        if isinstance(history_notes, list):
            print(f"找到 {len(history_notes)} 个历史帖子,正在处理...")
            saved_count = 0
            for idx, note in enumerate(history_notes, 1):
                # Extract the note_id from the history list entry
                history_note_id = note.get("note_id", "")
                if history_note_id:
                    print(f"  [{idx}/{len(history_notes)}] 处理帖子: {history_note_id}")
                    try:
                        # If the history API payload lacks the key text fields
                        # (mainly body_text), fall back to the detail API
                        need_detail = not (note.get("desc") or note.get("note_text") or note.get("body_text"))
                        detail_data = None
                        if need_detail:
                            print(f"    → 缺少正文,调用详情API补充...")
                            detail_data = get_note_detail(history_note_id)
                        # Merge the history API and detail API data
                        merged_note = merge_note_data(note, detail_data)
                        # Save to file
                        history_note_path = history_path / f"{history_note_id}.json"
                        save_note_to_file(merged_note, history_note_path)
                        saved_count += 1
                        # If the target note also appears in the history list,
                        # overwrite it with the merged (richer) version too
                        if transformed_note['channel_content_id'] == merged_note['channel_content_id']:
                            save_note_to_file(merged_note, target_note_path)
                    except Exception as e:
                        print(f"    ⚠️ 处理帖子 {history_note_id} 失败: {e}")
                        continue
            print(f"\n共成功保存 {saved_count} 个历史帖子")
        else:
            print("历史帖子数据格式不正确")
    else:
        print("未找到账号ID,跳过获取历史帖子")

    print(f"\n✓ 数据获取完成!")
    print(f"输出目录: {base_path}")
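
# Minimal programmatic usage sketch, mirroring the CLI defaults below (the
# URL is the script's built-in example; check_only=True is a dry run that
# only reports existence/completeness without fetching):
#
#   fetch_and_save_xhs_data(
#       "https://www.xiaohongshu.com/explore/68c6a924000000001b0336d0",
#       output_dir="examples",
#       check_only=True,
#   )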


def main():
    """Main entry point."""
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(
        description='小红书帖子数据获取脚本',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
使用示例:
  # 获取帖子数据
  python fetch_xhs_data.py <帖子URL>

  # 只检查数据是否存在且完整
  python fetch_xhs_data.py <帖子URL> --check-only

  # 检查并清理不完整数据
  python fetch_xhs_data.py <帖子URL> --check-only --clean-incomplete

  # 强制重新获取(即使数据已存在)
  python fetch_xhs_data.py <帖子URL> --no-skip-if-exists
"""
    )
    parser.add_argument(
        'url',
        nargs='?',
        default='https://www.xiaohongshu.com/explore/68c6a924000000001b0336d0',
        help='小红书帖子URL(可选,默认使用示例URL)'
    )
    parser.add_argument(
        '--check-only',
        action='store_true',
        help='只检查数据是否存在且完整,不执行获取操作'
    )
    parser.add_argument(
        '--no-skip-if-exists',
        action='store_true',
        help='即使数据已存在且完整也重新获取'
    )
    parser.add_argument(
        '--clean-incomplete',
        action='store_true',
        help='自动删除检测到的不完整数据目录'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='examples',
        help='输出根目录 (默认: examples)'
    )
    args = parser.parse_args()

    url = args.url
    check_only = args.check_only
    skip_if_exists = not args.no_skip_if_exists
    clean_incomplete = args.clean_incomplete
    output_dir = args.output_dir

    print(f"{'='*80}")
    print(f"小红书帖子数据{'检查' if check_only else '获取'}脚本")
    print(f"{'='*80}")
    print(f"帖子URL: {url}")
    print(f"模式: {'只检查' if check_only else '获取数据'}")
    print(f"跳过已存在: {'是' if skip_if_exists else '否'}")
    print(f"清理不完整数据: {'是' if clean_incomplete else '否'}")
    print(f"输出目录: {output_dir}")
    print(f"{'='*80}")

    try:
        fetch_and_save_xhs_data(
            url,
            output_dir=output_dir,
            check_only=check_only,
            skip_if_exists=skip_if_exists,
            clean_incomplete=clean_incomplete
        )
    except Exception as e:
        print(f"错误: {e}")
        import traceback
        traceback.print_exc()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())