import json
import os
import traceback
import uuid
from typing import List, Mapping, Optional

import requests
from loguru import logger

from utils.container import Container
from utils.fei_shu import FeiShu


class Crawler(object):
    """Crawl XiaoHongShu image notes for a keyword, re-host their images, and log results to FeiShu.

    Pipeline per note: search -> download each image (curl, in container) ->
    convert to JPG (ffmpeg) -> upload to OSS -> write a FeiShu table record.
    """

    # Shared FeiShu client used to create the result table and its records.
    feishu = FeiShu()

    @classmethod
    def _search_keyword(cls, keyword: str):
        """Search XiaoHongShu notes for *keyword* via the crawler API.

        Returns the list of note dicts from the API response.
        Raises ValueError when the API reports a non-zero code or an empty
        result set; raises requests.HTTPError on HTTP-level failures.
        """
        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
        payload = {
            'keyword': keyword,
            'content_type': '图文',
            'sort_type': '最热',
            'cursor': '',
        }
        # Fix: a timeout so a hung endpoint cannot block the whole run forever.
        response = requests.post(url=url, json=payload, timeout=60)
        # Fix: fail fast with a clear error on non-2xx instead of a cryptic
        # JSON decode error further down.
        response.raise_for_status()
        obj = response.json()
        if obj['code'] != 0:
            raise ValueError('接口响应状态不为0')
        obj = obj['data']
        if not obj['data']:
            raise ValueError('搜索结果为空')
        return obj['data']

    @classmethod
    def _handle_images(cls, container: Container, image_list: List[Mapping]):
        """Re-host every image in *image_list*, mutating each entry in place.

        Each image gets up to 10 download/convert/upload attempts; if all
        attempts fail, the original `image_url` is left untouched.
        """
        length = len(image_list)
        for index, image in enumerate(image_list):
            logger.info(f'正在处理图片 {index + 1}/{length}...')
            new_image_url = None
            for _ in range(10):  # retry up to 10 times
                download_path = cls._download_image(container, image['image_url'])
                if not download_path:
                    continue
                convert_path = cls._convert_image(container, download_path)
                if not convert_path:
                    continue
                new_image_url = cls._upload_image(container, convert_path)
                break
            if new_image_url:  # on failure, keep the original image link
                image['image_url'] = new_image_url

    @classmethod
    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
        """Download *image_url* inside the container via curl.

        Returns the temp file path on success, None on failure.
        """
        save_path = f'/tmp/{uuid.uuid4()}'
        command = [
            'curl',
            '-L',  # follow redirects
            '-f',  # don't write the output file on HTTP errors
        ]
        # Fix: only pass a proxy when the env var is actually set — the
        # original unconditionally injected os.getenv(...) into the argv
        # list, which put a literal None there when DYNAMIC_HTTP_PROXY
        # was missing and broke the command.
        proxy = os.getenv('DYNAMIC_HTTP_PROXY')
        if proxy:
            command += ['-x', proxy]
        command += [
            '-o', save_path,
            image_url,
            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
            '-H', 'Cache-Control: no-cache',
            '-H', 'Origin: https://www.xiaohongshu.com',
            '-H', 'Pragma: no-cache',
            '-H', 'Referer: https://www.xiaohongshu.com/',
            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
        ]
        exit_code, msg, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None

    @classmethod
    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
        """Convert the downloaded image to JPG via ffmpeg (first frame only).

        Returns the converted file path on success, None on failure.
        """
        save_path = f'/tmp/{uuid.uuid4()}.jpg'
        command = [
            'ffmpeg',
            '-i', origin_path,
            '-frames:v', '1',
            save_path,
        ]
        exit_code, msg, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None

    @classmethod
    def _upload_image(cls, container: Container, convert_path: str) -> str:
        """Upload the converted image to OSS and return its permanent CDN URL."""
        oss_object_key = container.upload_oss(
            bucket_name='art-pubbucket',
            object_key=f'pipeline/image/{uuid.uuid4()}.jpg',
            container_file_path=convert_path,
            # NOTE(review): 'image/jpg' is not a registered MIME type — the
            # standard one is 'image/jpeg'. Kept as-is in case the OSS
            # backend expects this exact value; confirm before changing.
            media_type='image/jpg',
        )
        return f'http://rescdn.yishihui.com/{oss_object_key}'

    @classmethod
    def run(cls, keyword: str):
        """Full pipeline: search *keyword*, re-host images, record notes in FeiShu.

        Notes without images are skipped. Any exception is logged with a full
        traceback; the container is always stopped if it was started.
        """
        container = Container()
        try:
            container.start_container()
            search_result = cls._search_keyword(keyword)
            table_id = cls.feishu.create_table(keyword)
            for note in search_result:
                # Skip notes that carry no images — nothing to re-host.
                if not note.get('image_url_list', []):
                    continue
                logger.info(
                    f'笔记ID: {note["channel_content_id"]}, 标题: {note["title"]}, '
                    f'图片数量: {len(note["image_url_list"])}'
                )
                cls._handle_images(container, note['image_url_list'])
                fields = {
                    '原文链接': {
                        'link': note['content_link'],
                        'text': note['content_link'],
                    },
                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),
                }
                cls.feishu.create_record(table_id, fields)
        except Exception:
            # Top-level boundary: log the full traceback rather than crash.
            logger.error(traceback.format_exc())
        finally:
            # Only stop a container that was actually started.
            if container.container_id:
                container.stop_container()


if __name__ == '__main__':
    Crawler.run('阿比西尼亚猫')