| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- 图片下载和本地服务工具
- 用于将小红书图片下载到本地,并通过HTTP服务器提供访问
- """
- import os
- import hashlib
- import requests
- import time
- from pathlib import Path
- from typing import List, Optional
- import logging
- logger = logging.getLogger(__name__)
class ImageDownloader:
    """Download remote images into a local directory, with retries.

    Every image is stored as ``<md5 of the URL><extension>`` inside
    ``download_dir``. A file that already exists under that name is treated
    as a cache hit and is not downloaded again.
    """

    # Extensions recognised by get_extension_from_url(), in lookup order.
    _KNOWN_EXTENSIONS = ('.jpg', '.png', '.webp', '.jpeg')

    def __init__(self, download_dir: str = "downloaded_images", max_retries: int = 3):
        """Create the download directory if needed and store the settings.

        Args:
            download_dir: Directory the images are saved into.
            max_retries: Maximum number of download attempts per image.
        """
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(parents=True, exist_ok=True)
        self.max_retries = max_retries
        # Request headers imitating a browser; sent with every download.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Referer': 'https://www.xiaohongshu.com/'
        }

    def get_image_hash(self, url: str) -> str:
        """Return the file name stem for *url*: the hex MD5 digest of the URL.

        Args:
            url: Image URL.

        Returns:
            File name without extension.
        """
        return hashlib.md5(url.encode()).hexdigest()

    def get_extension_from_url(self, url: str) -> str:
        """Derive the file extension from *url*.

        A ``format/<type>`` directive anywhere in the URL, or a matching
        suffix on the full URL, is checked first. If neither matches, the URL
        is checked again with its query string and fragment removed, so that
        ``.../a.png?token=1`` is recognised as PNG instead of silently falling
        back to the default.

        Args:
            url: Image URL.

        Returns:
            One of ``.jpg``, ``.png``, ``.webp``, ``.jpeg``; ``.webp`` when
            nothing matches.
        """
        for ext in self._KNOWN_EXTENSIONS:
            if f"format/{ext[1:]}" in url or url.endswith(ext):
                return ext

        # Fallback: ignore "?query" / "#fragment" and letter case.
        bare_url = url.split('#', 1)[0].split('?', 1)[0].lower()
        for ext in self._KNOWN_EXTENSIONS:
            if bare_url.endswith(ext):
                return ext

        # Default to webp.
        return '.webp'

    def _fetch_to_file(self, url: str, local_path: Path) -> bool:
        """Stream *url* into *local_path*; return True only on HTTP 200.

        The body is written to a sibling ``.part`` file and moved into place
        with ``os.replace`` once complete, so an interrupted transfer never
        leaves a truncated file under the final (cached) name. Network and
        file errors propagate to the caller.
        """
        partial_path = local_path.with_name(local_path.name + '.part')
        # The context manager releases the streamed connection on every path.
        with requests.get(
            url,
            headers=self.headers,
            timeout=30,
            stream=True  # stream the body instead of holding it in memory
        ) as response:
            if response.status_code != 200:
                logger.warning(f"下载失败,状态码: {response.status_code}")
                return False
            try:
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                os.replace(partial_path, local_path)
            finally:
                # After a successful replace the .part file is gone; otherwise
                # remove the incomplete data (best effort).
                if partial_path.exists():
                    try:
                        partial_path.unlink()
                    except OSError:
                        pass
        return True

    def download_image(self, url: str) -> Optional[str]:
        """Download a single image, reusing an existing local copy if present.

        Up to ``max_retries`` attempts are made; between failed attempts the
        method sleeps ``2 ** attempt`` seconds (1, 2, 4, ...).

        Args:
            url: Image URL.

        Returns:
            Local file path as a string, or None if *url* is empty or every
            attempt failed.
        """
        if not url:
            return None

        # Build the local file path.
        file_hash = self.get_image_hash(url)
        extension = self.get_extension_from_url(url)
        local_path = self.download_dir / f"{file_hash}{extension}"

        # Cache hit: the file was downloaded earlier.
        if local_path.exists():
            logger.debug(f"图片已存在: {local_path}")
            return str(local_path)

        for attempt in range(self.max_retries):
            try:
                logger.debug(f"下载图片 (尝试 {attempt + 1}/{self.max_retries}): {url}")
                if self._fetch_to_file(url, local_path):
                    logger.debug(f"✓ 下载成功: {local_path}")
                    return str(local_path)
            except requests.Timeout:
                logger.warning(f"下载超时 (尝试 {attempt + 1}/{self.max_retries})")
            except Exception as e:
                logger.warning(f"下载失败: {e} (尝试 {attempt + 1}/{self.max_retries})")

            # Back off before the next attempt (not after the last one).
            if attempt < self.max_retries - 1:
                wait_time = 2 ** attempt
                time.sleep(wait_time)

        logger.error(f"✗ 下载失败(已重试{self.max_retries}次): {url}")
        return None

    def download_images(self, urls: List[str]) -> List[Optional[str]]:
        """Download several images sequentially.

        Args:
            urls: Image URLs.

        Returns:
            A list parallel to *urls* holding the local path of each image,
            or None where the download failed.
        """
        return [self.download_image(url) for url in urls]
class LocalImageServer:
    """Configuration for the local image server: maps local files to URLs."""

    def __init__(self, base_url: str = "http://localhost:8765", image_dir: str = "downloaded_images"):
        """Store the server settings.

        Args:
            base_url: Base URL of the server; trailing slashes are stripped.
            image_dir: Name of the image directory. It is only stored here;
                the URL-building methods of this class do not read it.
        """
        self.base_url = base_url.rstrip('/')
        self.image_dir = image_dir

    def get_public_url(self, local_path: Optional[str]) -> str:
        """Convert a local file path into its public URL.

        Only the file name is used; it is percent-encoded so that names
        containing spaces or non-ASCII characters still yield a valid URL.

        Args:
            local_path: Local file path; may be empty or None.

        Returns:
            ``<base_url>/<file name>``, or an empty string when *local_path*
            is empty or None.
        """
        # Function-scope import, as done elsewhere in this file.
        from urllib.parse import quote

        if not local_path:
            return ""

        filename = Path(local_path).name
        return f"{self.base_url}/{quote(filename)}"

    def convert_paths_to_urls(self, local_paths: List[Optional[str]]) -> List[str]:
        """Convert a list of local paths into public URLs.

        Args:
            local_paths: Local file paths; entries may be None or empty.

        Returns:
            A list parallel to *local_paths*, with an empty string for every
            None or empty entry.
        """
        return [self.get_public_url(path) if path else "" for path in local_paths]
def start_simple_http_server(directory: str = "downloaded_images", port: int = 8765):
    """Start a simple HTTP file server (for development / testing).

    Args:
        directory: Directory to serve.
        port: TCP port to listen on.

    Note:
        This function blocks the calling thread, so running it in a separate
        process is recommended. It also calls ``os.chdir(directory)``, which
        changes the working directory of the whole process.
    """
    import http.server
    import socketserver
    import os

    os.chdir(directory)

    class CORSRequestHandler(http.server.SimpleHTTPRequestHandler):
        """Static file handler that adds CORS and no-cache headers."""

        def end_headers(self):
            self.send_header('Access-Control-Allow-Origin', '*')
            self.send_header('Access-Control-Allow-Methods', 'GET, OPTIONS')
            self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate')
            return super().end_headers()

        def do_OPTIONS(self):
            # OPTIONS is advertised above, but the base handler has no
            # do_OPTIONS and would answer 501; reply with an empty 204 so
            # CORS preflight requests succeed.
            self.send_response(204)
            self.end_headers()

    class ReusableTCPServer(socketserver.TCPServer):
        # Allow rebinding the port right after a restart instead of failing
        # with "Address already in use".
        allow_reuse_address = True

    with ReusableTCPServer(("", port), CORSRequestHandler) as httpd:
        print(f"图片服务器运行在 http://localhost:{port}")
        print(f"服务目录: {os.getcwd()}")
        print("按 Ctrl+C 停止服务器")
        httpd.serve_forever()
if __name__ == '__main__':
    import sys

    if len(sys.argv) > 1 and sys.argv[1] == 'serve':
        # HTTP server mode: serve [port] [directory]
        try:
            port = int(sys.argv[2]) if len(sys.argv) > 2 else 8765
        except ValueError:
            # Exit with a readable message instead of a traceback.
            sys.exit(f"无效的端口号: {sys.argv[2]}")
        directory = sys.argv[3] if len(sys.argv) > 3 else "downloaded_images"
        print("启动图片服务器...")
        print(f"目录: {directory}")
        print(f"端口: {port}")
        try:
            start_simple_http_server(directory, port)
        except KeyboardInterrupt:
            # Ctrl+C is the advertised way to stop the server; exit quietly.
            print("\n服务器已停止")
    else:
        # Smoke test of the download path.
        test_url = "https://ci.xiaohongshu.com/1040g2sg31e4ln39lh0bg5p8vj7kp2skkvm4jgno?imageView2/2/w/1080/format/webp"
        print("测试图片下载功能")
        print(f"测试URL: {test_url}")
        downloader = ImageDownloader()
        local_path = downloader.download_image(test_url)
        if local_path:
            print(f"✓ 下载成功: {local_path}")
            # Show the URL the local server would expose for this file.
            server = LocalImageServer()
            public_url = server.get_public_url(local_path)
            print(f"公开URL: {public_url}")
            print("\n要启动图片服务器,运行:")
            print("python3 image_downloader.py serve 8765")
        else:
            print("✗ 下载失败")
|