image_downloader.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. 图片下载和本地服务工具
  5. 用于将小红书图片下载到本地,并通过HTTP服务器提供访问
  6. """
  7. import os
  8. import hashlib
  9. import requests
  10. import time
  11. from pathlib import Path
  12. from typing import List, Optional
  13. import logging
  14. logger = logging.getLogger(__name__)
  15. class ImageDownloader:
  16. """图片下载器"""
  17. def __init__(self, download_dir: str = "downloaded_images", max_retries: int = 3):
  18. """
  19. 初始化图片下载器
  20. Args:
  21. download_dir: 图片下载目录
  22. max_retries: 最大重试次数
  23. """
  24. self.download_dir = Path(download_dir)
  25. self.download_dir.mkdir(parents=True, exist_ok=True)
  26. self.max_retries = max_retries
  27. # 请求头,模拟浏览器
  28. self.headers = {
  29. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  30. 'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
  31. 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
  32. 'Referer': 'https://www.xiaohongshu.com/'
  33. }
  34. def get_image_hash(self, url: str) -> str:
  35. """
  36. 根据URL生成唯一的图片文件名
  37. Args:
  38. url: 图片URL
  39. Returns:
  40. 文件名(不含扩展名)
  41. """
  42. return hashlib.md5(url.encode()).hexdigest()
  43. def get_extension_from_url(self, url: str) -> str:
  44. """
  45. 从URL中提取文件扩展名
  46. Args:
  47. url: 图片URL
  48. Returns:
  49. 扩展名(如 .jpg, .png, .webp)
  50. """
  51. # 检查URL中是否指定了format参数
  52. if 'format/jpg' in url or url.endswith('.jpg'):
  53. return '.jpg'
  54. elif 'format/png' in url or url.endswith('.png'):
  55. return '.png'
  56. elif 'format/webp' in url or url.endswith('.webp'):
  57. return '.webp'
  58. elif 'format/jpeg' in url or url.endswith('.jpeg'):
  59. return '.jpeg'
  60. # 默认使用webp
  61. return '.webp'
  62. def download_image(self, url: str) -> Optional[str]:
  63. """
  64. 下载单张图片
  65. Args:
  66. url: 图片URL
  67. Returns:
  68. 本地文件路径,失败返回None
  69. """
  70. if not url:
  71. return None
  72. # 生成本地文件路径
  73. file_hash = self.get_image_hash(url)
  74. extension = self.get_extension_from_url(url)
  75. local_path = self.download_dir / f"{file_hash}{extension}"
  76. # 如果文件已存在,直接返回
  77. if local_path.exists():
  78. logger.debug(f"图片已存在: {local_path}")
  79. return str(local_path)
  80. # 下载图片
  81. for attempt in range(self.max_retries):
  82. try:
  83. logger.debug(f"下载图片 (尝试 {attempt + 1}/{self.max_retries}): {url}")
  84. response = requests.get(
  85. url,
  86. headers=self.headers,
  87. timeout=30,
  88. stream=True # 使用流式下载避免内存问题
  89. )
  90. if response.status_code == 200:
  91. # 写入文件
  92. with open(local_path, 'wb') as f:
  93. for chunk in response.iter_content(chunk_size=8192):
  94. if chunk:
  95. f.write(chunk)
  96. logger.debug(f"✓ 下载成功: {local_path}")
  97. return str(local_path)
  98. else:
  99. logger.warning(f"下载失败,状态码: {response.status_code}")
  100. except requests.Timeout:
  101. logger.warning(f"下载超时 (尝试 {attempt + 1}/{self.max_retries})")
  102. except Exception as e:
  103. logger.warning(f"下载失败: {e} (尝试 {attempt + 1}/{self.max_retries})")
  104. # 等待后重试
  105. if attempt < self.max_retries - 1:
  106. wait_time = 2 ** attempt
  107. time.sleep(wait_time)
  108. logger.error(f"✗ 下载失败(已重试{self.max_retries}次): {url}")
  109. return None
  110. def download_images(self, urls: List[str]) -> List[Optional[str]]:
  111. """
  112. 批量下载图片
  113. Args:
  114. urls: 图片URL列表
  115. Returns:
  116. 本地文件路径列表
  117. """
  118. local_paths = []
  119. for url in urls:
  120. local_path = self.download_image(url)
  121. local_paths.append(local_path)
  122. return local_paths
  123. class LocalImageServer:
  124. """本地图片服务器配置"""
  125. def __init__(self, base_url: str = "http://localhost:8765", image_dir: str = "downloaded_images"):
  126. """
  127. 初始化本地图片服务器配置
  128. Args:
  129. base_url: 服务器基础URL
  130. image_dir: 图片目录名
  131. """
  132. self.base_url = base_url.rstrip('/')
  133. self.image_dir = image_dir
  134. def get_public_url(self, local_path: str) -> str:
  135. """
  136. 将本地路径转换为公开URL
  137. Args:
  138. local_path: 本地文件路径
  139. Returns:
  140. 公开可访问的URL
  141. """
  142. if not local_path:
  143. return ""
  144. # 提取文件名
  145. filename = Path(local_path).name
  146. # 生成公开URL
  147. return f"{self.base_url}/{filename}"
  148. def convert_paths_to_urls(self, local_paths: List[Optional[str]]) -> List[str]:
  149. """
  150. 批量转换本地路径为公开URL
  151. Args:
  152. local_paths: 本地文件路径列表
  153. Returns:
  154. 公开URL列表
  155. """
  156. return [self.get_public_url(path) if path else "" for path in local_paths]
  157. def start_simple_http_server(directory: str = "downloaded_images", port: int = 8765):
  158. """
  159. 启动简单的HTTP文件服务器(用于开发/测试)
  160. Args:
  161. directory: 要服务的目录
  162. port: 端口号
  163. Note:
  164. 这个函数会阻塞当前线程,建议在单独的进程中运行
  165. """
  166. import http.server
  167. import socketserver
  168. import os
  169. os.chdir(directory)
  170. Handler = http.server.SimpleHTTPRequestHandler
  171. # 添加CORS支持
  172. class CORSRequestHandler(Handler):
  173. def end_headers(self):
  174. self.send_header('Access-Control-Allow-Origin', '*')
  175. self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
  176. self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate')
  177. return super().end_headers()
  178. with socketserver.TCPServer(("", port), CORSRequestHandler) as httpd:
  179. print(f"图片服务器运行在 http://localhost:{port}")
  180. print(f"服务目录: {os.getcwd()}")
  181. print("按 Ctrl+C 停止服务器")
  182. httpd.serve_forever()
  183. if __name__ == '__main__':
  184. import sys
  185. if len(sys.argv) > 1 and sys.argv[1] == 'serve':
  186. # 启动HTTP服务器模式
  187. port = int(sys.argv[2]) if len(sys.argv) > 2 else 8765
  188. directory = sys.argv[3] if len(sys.argv) > 3 else "downloaded_images"
  189. print(f"启动图片服务器...")
  190. print(f"目录: {directory}")
  191. print(f"端口: {port}")
  192. start_simple_http_server(directory, port)
  193. else:
  194. # 测试下载功能
  195. test_url = "https://ci.xiaohongshu.com/1040g2sg31e4ln39lh0bg5p8vj7kp2skkvm4jgno?imageView2/2/w/1080/format/webp"
  196. print("测试图片下载功能")
  197. print(f"测试URL: {test_url}")
  198. downloader = ImageDownloader()
  199. local_path = downloader.download_image(test_url)
  200. if local_path:
  201. print(f"✓ 下载成功: {local_path}")
  202. # 测试URL转换
  203. server = LocalImageServer()
  204. public_url = server.get_public_url(local_path)
  205. print(f"公开URL: {public_url}")
  206. print(f"\n要启动图片服务器,运行:")
  207. print(f"python3 image_downloader.py serve 8765")
  208. else:
  209. print("✗ 下载失败")