fetch_xhs_data_by_account.py

#!/usr/bin/env python3
"""
Xiaohongshu account data fetcher.

Given an account URL, fetches all of the account's historical posts and picks
the one with the highest like count as the target post to deconstruct.
"""
import json
import re
import time
from pathlib import Path
from typing import Dict
import sys
import argparse
import shutil

# Import the shared utility module
from xhs_utils import (
    get_note_detail,
    get_author_history_notes,
    merge_note_data,
)
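
# The xhs_utils helpers are assumed (from how this script uses them) to behave
# roughly as follows; see xhs_utils itself for the authoritative signatures:
#   get_author_history_notes(account_id) -> list[dict]   # one dict per note
#   get_note_detail(note_id)             -> dict | None  # detail-API payload
#   merge_note_data(note, detail)        -> dict          # detail fills gaps in note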


def extract_account_id_from_url(url: str) -> str:
    """
    Extract the account_id from a Xiaohongshu account URL.

    Args:
        url: Xiaohongshu account URL

    Returns:
        account_id: the account ID

    Example:
        https://www.xiaohongshu.com/user/profile/5ff3e96a000000000100995a?xsec_token=...
        returns: 5ff3e96a000000000100995a
    """
    # Try to extract the ID from the URL path
    pattern = r'/user/profile/([a-f0-9]+)'
    match = re.search(pattern, url)
    if match:
        return match.group(1)
    # If the input is already a bare account_id, return it unchanged
    if re.match(r'^[a-f0-9]{24}$', url):
        return url
    raise ValueError(f"Could not extract account_id from URL: {url}")
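
# Usage sketch for extract_account_id_from_url (values from the docstring example):
#   extract_account_id_from_url(
#       "https://www.xiaohongshu.com/user/profile/5ff3e96a000000000100995a?xsec_token=...")
#   -> "5ff3e96a000000000100995a"
#   extract_account_id_from_url("5ff3e96a000000000100995a")   # bare 24-char hex ID
#   -> "5ff3e96a000000000100995a"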


def save_note_to_file(note_data: Dict, file_path: Path):
    """
    Save note data to a JSON file.

    Args:
        note_data: note data
        file_path: destination file path
    """
    # Make sure the target directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # Write the JSON file
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(note_data, f, ensure_ascii=False, indent=2)
    print(f"Saved: {file_path}")


def check_note_data_integrity(note_data: dict) -> bool:
    """
    Check the integrity of a note's data.

    Args:
        note_data: note data dict

    Returns:
        bool: True if at least one of the images / video fields is non-empty,
              otherwise False
    """
    images = note_data.get("images", [])
    video = note_data.get("video")
    # images must be a non-empty list
    has_images = isinstance(images, list) and len(images) > 0
    # video must exist and be non-empty (either a string or a dict is fine)
    has_video = video is not None and video != "" and video != {}
    return has_images or has_video
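
# Integrity sketch: either media field alone is enough to pass.
#   check_note_data_integrity({"images": ["a.jpg"], "video": None})  -> True
#   check_note_data_integrity({"images": [], "video": {}})           -> False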


def check_account_data_exists(note_id: str, output_dir: str = "examples") -> dict:
    """
    Check whether the account's data already exists and is complete.

    Args:
        note_id: note ID
        output_dir: output root directory

    Returns:
        dict: the check results
    """
    result = {
        "exists": False,
        "complete": False,
        "target_note_path": None,
        "history_notes_path": None,
        "incomplete_files": [],
        "note_id": note_id
    }
    # Build the paths (directory names stay in Chinese to match the on-disk layout)
    input_dir = Path(output_dir) / note_id / "输入"
    target_note_path = input_dir / "待解构帖子.json"
    history_notes_path = input_dir / "作者历史帖子"
    result["target_note_path"] = target_note_path
    result["history_notes_path"] = history_notes_path
    # Does the input directory exist at all?
    if not input_dir.exists():
        return result
    result["exists"] = True
    # Does the target note exist, and is it complete?
    if not target_note_path.exists():
        result["incomplete_files"].append(str(target_note_path))
        return result
    try:
        with open(target_note_path, 'r', encoding='utf-8') as f:
            target_note_data = json.load(f)
        if not check_note_data_integrity(target_note_data):
            result["incomplete_files"].append(str(target_note_path))
    except Exception as e:
        result["incomplete_files"].append(f"{target_note_path} (read error: {e})")
    # Check the history notes directory
    if not history_notes_path.exists():
        result["incomplete_files"].append(str(history_notes_path))
        return result
    # Check the integrity of every history note file
    history_files = list(history_notes_path.glob("*.json"))
    if len(history_files) == 0:
        result["incomplete_files"].append(f"{history_notes_path} (no history note files)")
    else:
        # Count the valid history notes
        valid_history_count = 0
        for history_file in history_files:
            try:
                with open(history_file, 'r', encoding='utf-8') as f:
                    history_note_data = json.load(f)
                if not check_note_data_integrity(history_note_data):
                    result["incomplete_files"].append(str(history_file))
                else:
                    valid_history_count += 1
            except Exception as e:
                result["incomplete_files"].append(f"{history_file} (read error: {e})")
        # There must be more than 4 valid history notes
        if valid_history_count <= 4:
            result["incomplete_files"].append(
                f"{history_notes_path} (valid history note count {valid_history_count} ≤ 4, "
                f"requirement not met)"
            )
    # The data is complete if nothing was flagged as incomplete
    result["complete"] = len(result["incomplete_files"]) == 0
    return result
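
# On-disk layout the check above expects (the same paths fetch_and_save_by_account writes):
#   <output_dir>/<note_id>/输入/待解构帖子.json       # target note (highest like count)
#   <output_dir>/<note_id>/输入/作者历史帖子/*.json   # history notes (>4 valid required)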


def delete_incomplete_data(note_id: str, output_dir: str = "examples") -> bool:
    """
    Delete an incomplete data directory.

    Args:
        note_id: note ID
        output_dir: output root directory

    Returns:
        bool: True if the deletion succeeded, otherwise False
    """
    try:
        target_dir = Path(output_dir) / note_id
        if target_dir.exists():
            shutil.rmtree(target_dir)
            print(f"  ✓ Deleted incomplete data directory: {target_dir}")
            return True
        else:
            print(f"  ⚠️ Directory does not exist: {target_dir}")
            return False
    except Exception as e:
        print(f"  ✗ Failed to delete directory: {e}")
        return False


def fetch_and_save_by_account(account_url: str, output_dir: str = "examples",
                              check_only: bool = False, skip_if_exists: bool = True,
                              clean_incomplete: bool = False):
    """
    Fetch an account's data by URL and save it locally.

    The post with the highest like count becomes the target post to deconstruct.

    Args:
        account_url: Xiaohongshu account URL
        output_dir: output directory, defaults to "examples"
        check_only: if True, only check whether the data exists; do not fetch
        skip_if_exists: if True, skip fetching when the data already exists and is complete
        clean_incomplete: if True, automatically delete incomplete data when it is detected
    """
    print(f"\n{'='*80}")
    print(f"{'[check mode]' if check_only else '[fetch mode]'} Fetching data by account URL")
    print(f"{'='*80}")

    # 1. Extract the account_id
    print(f"Parsing account URL: {account_url}")
    account_id = extract_account_id_from_url(account_url)
    print(f"Extracted account_id: {account_id}")

    # 2. Fetch all of the account's historical posts
    print("\nFetching the account's history notes...")
    history_notes = get_author_history_notes(account_id)
    if not history_notes:
        print("No history notes found")
        return
    print(f"Found {len(history_notes)} history notes")

    # 3. Find the post with the highest like count
    print("\nAnalyzing like counts...")
    max_like_note = max(history_notes, key=lambda x: x.get("like_count", 0))
    max_like_note_id = max_like_note.get("note_id", "")
    max_like_count = max_like_note.get("like_count", 0)
    print("Post with the highest like count:")
    print(f"  - note ID: {max_like_note_id}")
    print(f"  - title: {max_like_note.get('title', 'untitled')}")
    print(f"  - likes: {max_like_count}")

    # Check whether the data already exists before fetching anything
    check_result = check_account_data_exists(max_like_note_id, output_dir=output_dir)
    if check_result["exists"]:
        if check_result["complete"]:
            print("\n✓ Data already exists and is complete")
            print(f"  Target note: {check_result['target_note_path']}")
            print(f"  History notes directory: {check_result['history_notes_path']}")
            if check_only or skip_if_exists:
                print(f"{'  [check mode] Skipping fetch' if check_only else '  [skip] Data already complete'}")
                return
        else:
            print("\n⚠️ Data exists but is incomplete")
            print("  Incomplete files:")
            for incomplete_file in check_result["incomplete_files"]:
                print(f"    - {incomplete_file}")
            # Clean up the incomplete data if requested
            if clean_incomplete:
                print("  [clean mode] Deleting incomplete data...")
                delete_incomplete_data(max_like_note_id, output_dir)
            if check_only:
                print("  [check mode] Needs re-fetching")
                return
            else:
                print("  Re-fetching the data...")
    else:
        print("\nℹ️ Data does not exist")
        if check_only:
            print("  [check mode] Needs fetching")
            return

    # In check mode we stop here
    if check_only:
        return

    # 4. Process the post with the highest like count (the target note)
    print("\nProcessing the target note...")
    # Call the detail API only if the history API left out the body text
    need_detail = not (max_like_note.get("desc") or max_like_note.get("note_text") or max_like_note.get("body_text"))
    target_note_detail = None
    if need_detail:
        print("  → Body text missing, calling the detail API...")
        target_note_detail = get_note_detail(max_like_note_id)
    # Merge the history-API and detail-API data
    transformed_target = merge_note_data(max_like_note, target_note_detail)

    # 5. Create the directory structure (directory names match the on-disk layout)
    base_path = Path(output_dir) / max_like_note_id / "输入"
    history_path = base_path / "作者历史帖子"

    # 6. Save the target note (the one with the highest like count)
    target_note_path = base_path / "待解构帖子.json"
    save_note_to_file(transformed_target, target_note_path)

    # 7. Process and save every history note
    print("\nProcessing all history notes...")
    for idx, note in enumerate(history_notes, 1):
        history_note_id = note.get("note_id", "")
        if history_note_id:
            print(f"  [{idx}/{len(history_notes)}] Processing note: {history_note_id}")
            try:
                # Does the history-API data lack key fields (mainly body_text)?
                need_detail = not (note.get("desc") or note.get("note_text") or note.get("body_text"))
                detail_data = None
                if need_detail:
                    print("    → Body text missing, calling the detail API...")
                    detail_data = get_note_detail(history_note_id)
                    # Pause between detail requests to avoid calling the API too often
                    if idx < len(history_notes):
                        time.sleep(0.5)
                # Merge the history-API and detail-API data
                merged_note = merge_note_data(note, detail_data)
                # Save to file
                history_note_path = history_path / f"{history_note_id}.json"
                save_note_to_file(merged_note, history_note_path)
            except Exception as e:
                print(f"  ⚠️ Failed to process note {history_note_id}: {e}")
                continue

    print("\n✓ Data fetch complete!")
    print(f"✓ Target note (highest like count): {max_like_note_id}")
    print(f"✓ Saved details for {len(history_notes)} history notes")
    print(f"✓ Output directory: {base_path}")
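
# Programmatic usage sketch (the URL is this script's built-in example default):
#   fetch_and_save_by_account(
#       "https://www.xiaohongshu.com/user/profile/5ff3e96a000000000100995a",
#       output_dir="examples",
#       check_only=True,   # only report whether complete data is already on disk
#   )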


def main():
    """Entry point."""
    # Parse the command-line arguments
    parser = argparse.ArgumentParser(
        description='Xiaohongshu account data fetcher',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Fetch account data
  python fetch_xhs_data_by_account.py <account URL>

  # Only check whether the data exists and is complete
  python fetch_xhs_data_by_account.py <account URL> --check-only

  # Check, and clean up incomplete data
  python fetch_xhs_data_by_account.py <account URL> --check-only --clean-incomplete

  # Force a re-fetch (even if the data already exists)
  python fetch_xhs_data_by_account.py <account URL> --no-skip-if-exists
"""
    )
    parser.add_argument(
        'url',
        nargs='?',
        default='https://www.xiaohongshu.com/user/profile/5ff3e96a000000000100995a',
        help='Xiaohongshu account URL (optional; defaults to the example URL)'
    )
    parser.add_argument(
        '--check-only',
        action='store_true',
        help='Only check whether the data exists and is complete; do not fetch'
    )
    parser.add_argument(
        '--no-skip-if-exists',
        action='store_true',
        help='Re-fetch even if the data already exists and is complete'
    )
    parser.add_argument(
        '--clean-incomplete',
        action='store_true',
        help='Automatically delete incomplete data directories when they are detected'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='examples',
        help='Output root directory (default: examples)'
    )
    args = parser.parse_args()

    url = args.url
    check_only = args.check_only
    skip_if_exists = not args.no_skip_if_exists
    clean_incomplete = args.clean_incomplete
    output_dir = args.output_dir

    print(f"{'='*80}")
    print(f"Xiaohongshu account data {'check' if check_only else 'fetch'} script")
    print(f"{'='*80}")
    print(f"Account URL: {url}")
    print(f"Mode: {'check only' if check_only else 'fetch data'}")
    print(f"Skip if data exists: {'yes' if skip_if_exists else 'no'}")
    print(f"Clean incomplete data: {'yes' if clean_incomplete else 'no'}")
    print(f"Output directory: {output_dir}")
    print(f"{'='*80}")

    try:
        fetch_and_save_by_account(
            url,
            output_dir=output_dir,
            check_only=check_only,
            skip_if_exists=skip_if_exists,
            clean_incomplete=clean_incomplete
        )
    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())