#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Image download and local serving utilities.

Downloads Xiaohongshu images to a local directory and exposes them through
a simple HTTP file server.
"""

import contextlib
import hashlib
import logging
import os
import time
from pathlib import Path
from typing import List, Optional
from urllib.parse import urlsplit

import requests

logger = logging.getLogger(__name__)


class ImageDownloader:
    """Download images into a local directory, retrying on failure."""

    # Checked in this order; the order decides which extension wins when a
    # URL matches more than one marker.
    _KNOWN_EXTENSIONS = ('.jpg', '.png', '.webp', '.jpeg')
    _DEFAULT_EXTENSION = '.webp'

    def __init__(self, download_dir: str = "downloaded_images", max_retries: int = 3):
        """
        Create the download directory (if missing) and store the settings.

        Args:
            download_dir: Directory the images are saved into.
            max_retries: Maximum number of download attempts per image.
        """
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self.max_retries = max_retries

        # Browser-like request headers sent with every download.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Referer': 'https://www.xiaohongshu.com/'
        }

    def get_image_hash(self, url: str) -> str:
        """
        Derive a file name (without extension) from the URL.

        Args:
            url: Image URL.

        Returns:
            Hex MD5 digest of the URL; used only as a cache key.
        """
        return hashlib.md5(url.encode()).hexdigest()

    def get_extension_from_url(self, url: str) -> str:
        """
        Work out the file extension to use for an image URL.

        Args:
            url: Image URL.

        Returns:
            One of ``.jpg``, ``.png``, ``.webp``, ``.jpeg``; ``.webp`` when
            nothing in the URL identifies the format.
        """
        # First pass: an explicit ``format/<ext>`` marker anywhere in the
        # URL, or the raw URL ending in the extension.
        for extension in self._KNOWN_EXTENSIONS:
            if f"format/{extension[1:]}" in url or url.endswith(extension):
                return extension

        # Second pass: the raw-URL suffix test above misses URLs that carry
        # a query string or fragment (``.../a.jpg?x=1``) or an upper-case
        # suffix, so inspect the parsed path on its own.
        try:
            path = urlsplit(url).path.lower()
        except ValueError:
            # urlsplit rejects some malformed URLs; fall through to default.
            path = ""
        for extension in self._KNOWN_EXTENSIONS:
            if path.endswith(extension):
                return extension

        return self._DEFAULT_EXTENSION

    def _fetch_to_file(self, url: str, destination: Path) -> bool:
        """
        Stream ``url`` into ``destination`` without ever exposing a partial file.

        The body is written to a ``.part`` sibling and moved into place only
        after the whole transfer succeeded, so an interrupted download can
        never be mistaken for a finished one by the "already exists" check.

        Args:
            url: Image URL.
            destination: Final local path of the image.

        Returns:
            True when the image was saved, False on a non-200 response.

        Raises:
            Whatever ``requests`` or the file system raises; the caller
            handles retries.
        """
        partial_path = destination.with_name(destination.name + ".part")

        # The context manager releases the streamed connection on every
        # path, including non-200 responses and mid-transfer errors.
        with requests.get(
            url,
            headers=self.headers,
            timeout=30,
            stream=True  # stream to disk instead of buffering in memory
        ) as response:
            if response.status_code != 200:
                logger.warning(f"下载失败,状态码: {response.status_code}")
                return False

            try:
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                partial_path.replace(destination)
            finally:
                # After a successful replace the partial file is gone and
                # unlink raises; after a failure this removes the leftover.
                # Cleanup is best effort and must not mask the real error.
                with contextlib.suppress(OSError):
                    partial_path.unlink()

        return True

    def download_image(self, url: str) -> Optional[str]:
        """
        Download a single image, reusing an existing local copy if present.

        Args:
            url: Image URL.

        Returns:
            Local file path as a string, or None if the URL is empty or
            every attempt failed.
        """
        if not url:
            return None

        # The local file name is derived from the URL alone.
        file_hash = self.get_image_hash(url)
        extension = self.get_extension_from_url(url)
        local_path = self.download_dir / f"{file_hash}{extension}"

        # Already downloaded: return the existing copy.
        if local_path.exists():
            logger.debug(f"图片已存在: {local_path}")
            return str(local_path)

        for attempt in range(self.max_retries):
            try:
                logger.debug(f"下载图片 (尝试 {attempt + 1}/{self.max_retries}): {url}")
                if self._fetch_to_file(url, local_path):
                    logger.debug(f"✓ 下载成功: {local_path}")
                    return str(local_path)
            except requests.Timeout:
                logger.warning(f"下载超时 (尝试 {attempt + 1}/{self.max_retries})")
            except Exception as e:
                logger.warning(f"下载失败: {e} (尝试 {attempt + 1}/{self.max_retries})")

            # Exponential back-off (1s, 2s, 4s, ...) between attempts; no
            # wait after the final one.
            if attempt < self.max_retries - 1:
                wait_time = 2 ** attempt
                time.sleep(wait_time)

        logger.error(f"✗ 下载失败(已重试{self.max_retries}次): {url}")
        return None

    def download_images(self, urls: List[str]) -> List[Optional[str]]:
        """
        Download several images sequentially.

        Args:
            urls: Image URLs.

        Returns:
            Local paths in the same order as ``urls``; None for each image
            that could not be downloaded.
        """
        return [self.download_image(url) for url in urls]


class LocalImageServer:
    """Configuration for mapping local image files to served URLs."""

    def __init__(self, base_url: str = "http://localhost:8765", image_dir: str = "downloaded_images"):
        """
        Store the server settings.

        Args:
            base_url: Base URL of the server; a trailing slash is stripped.
            image_dir: Name of the image directory.
        """
        self.base_url = base_url.rstrip('/')
        self.image_dir = image_dir

    def get_public_url(self, local_path: str) -> str:
        """
        Convert a local file path into its served URL.

        Args:
            local_path: Local file path.

        Returns:
            ``<base_url>/<file name>``, or an empty string for an empty path.
        """
        if not local_path:
            return ""

        # Only the file name is used; the directory part is dropped.
        filename = Path(local_path).name
        return f"{self.base_url}/{filename}"

    def convert_paths_to_urls(self, local_paths: List[Optional[str]]) -> List[str]:
        """
        Convert a list of local paths into served URLs.

        Args:
            local_paths: Local file paths; entries may be None or empty.

        Returns:
            URLs in the same order, with an empty string for each missing path.
        """
        return [self.get_public_url(path) if path else "" for path in local_paths]


def start_simple_http_server(directory: str = "downloaded_images", port: int = 8765):
    """
    Start a simple HTTP file server (intended for development/testing).

    Args:
        directory: Directory to serve.
        port: Port to listen on.

    Note:
        This function blocks the calling thread, so run it in a separate
        process. It also changes the process working directory to
        ``directory``.
    """
    import http.server
    import socketserver

    # SimpleHTTPRequestHandler serves the current working directory.
    os.chdir(directory)

    Handler = http.server.SimpleHTTPRequestHandler

    class CORSRequestHandler(Handler):
        """File handler that adds CORS and no-cache headers to every response."""

        def end_headers(self):
            self.send_header('Access-Control-Allow-Origin', '*')
            self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
            self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate')
            return super().end_headers()

    class ReusableTCPServer(socketserver.TCPServer):
        """TCPServer that can rebind its port right after a restart."""

        # Without this, restarting shortly after a shutdown can fail with
        # "Address already in use".
        allow_reuse_address = True

    with ReusableTCPServer(("", port), CORSRequestHandler) as httpd:
        print(f"图片服务器运行在 http://localhost:{port}")
        print(f"服务目录: {os.getcwd()}")
        print("按 Ctrl+C 停止服务器")
        httpd.serve_forever()


if __name__ == '__main__':
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == 'serve':
        # Server mode: image_downloader.py serve [port] [directory]
        port = int(sys.argv[2]) if len(sys.argv) > 2 else 8765
        directory = sys.argv[3] if len(sys.argv) > 3 else "downloaded_images"

        print("启动图片服务器...")
        print(f"目录: {directory}")
        print(f"端口: {port}")

        start_simple_http_server(directory, port)
    else:
        # Default mode: smoke-test the downloader with a sample image.
        test_url = "https://ci.xiaohongshu.com/1040g2sg31e4ln39lh0bg5p8vj7kp2skkvm4jgno?imageView2/2/w/1080/format/webp"

        print("测试图片下载功能")
        print(f"测试URL: {test_url}")

        downloader = ImageDownloader()
        local_path = downloader.download_image(test_url)

        if local_path:
            print(f"✓ 下载成功: {local_path}")

            # Show the URL the local server would expose for this file.
            server = LocalImageServer()
            public_url = server.get_public_url(local_path)
            print(f"公开URL: {public_url}")

            print("\n要启动图片服务器,运行:")
            print("python3 image_downloader.py serve 8765")
        else:
            print("✗ 下载失败")