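"""Keyword crawler for Xiaohongshu (XHS) image-and-text notes.

Searches notes by keyword, re-hosts each note's images (download via proxy,
convert to JPEG, upload to OSS), and records the rewritten notes in a
Feishu (Lark) table.
"""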
import json
import os
import traceback
import uuid
from typing import List, Mapping, Optional

import requests
from loguru import logger

from utils.container import Container
from utils.fei_shu import FeiShu


class Crawler:
    feishu = FeiShu()  # shared Feishu (Lark) client
    @classmethod
    def _search_keyword(cls, keyword: str):
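        """Search XHS for notes matching ``keyword`` and return the hit list.

        Raises ValueError when the API reports a non-zero code or the
        result list is empty.
        """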
        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
        payload = {
            'keyword': keyword,
            'content_type': '图文',  # image-and-text notes only
            'sort_type': '最热',  # sort by popularity ("hottest")
            'cursor': '',  # empty cursor: first page of results
        }
        response = requests.post(url=url, json=payload, timeout=30)  # avoid hanging on a stalled request
        obj = response.json()
        if obj['code'] != 0:
            raise ValueError('API response code is not 0')
        obj = obj['data']
        if not obj['data']:
            raise ValueError('Search result is empty')
        return obj['data']
    @classmethod
    def _handle_images(cls, container: Container, image_list: List[Mapping]):
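        """Re-host every image in ``image_list``, rewriting entries in place.

        Each image is downloaded, converted to JPEG, and uploaded to OSS;
        if all attempts fail, the original image URL is kept.
        """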
        total = len(image_list)
        for index, image in enumerate(image_list, start=1):
            logger.info(f'Processing image {index}/{total}...')
            new_image_url = None
            for _ in range(10):  # retry up to 10 times
                download_path = cls._download_image(container, image['image_url'])
                if not download_path:
                    continue
                convert_path = cls._convert_image(container, download_path)
                if not convert_path:
                    continue
                new_image_url = cls._upload_image(container, convert_path)
                break
            if new_image_url:
                image['image_url'] = new_image_url
            # On failure, image['image_url'] keeps the original link.
    @classmethod
    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
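        """Download an image inside the container via curl.

        Returns the downloaded file path, or None if curl failed.
        """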
        save_path = f'/tmp/{uuid.uuid4()}'
        command = [
            'curl',
            '-L',  # follow redirects
            '-f',  # fail on HTTP errors instead of writing an error page
        ]
        proxy = os.getenv('DYNAMIC_HTTP_PROXY')
        if proxy:  # only pass -x when a proxy is configured
            command += ['-x', proxy]
        command += [
            '-o', save_path,
            image_url,
            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
            '-H', 'Cache-Control: no-cache',
            '-H', 'Origin: https://www.xiaohongshu.com',
            '-H', 'Pragma: no-cache',
            '-H', 'Referer: https://www.xiaohongshu.com/',
            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
        ]
        exit_code, _, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None
    @classmethod
    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
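        """Convert the downloaded image to JPEG with ffmpeg.

        Returns the converted file path, or None if ffmpeg failed.
        """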
        save_path = f'/tmp/{uuid.uuid4()}.jpg'
        command = [
            'ffmpeg',
            '-i', origin_path,
            '-frames:v', '1',  # emit a single frame (source may be animated WebP/AVIF)
            save_path,
        ]
        exit_code, _, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None
    @classmethod
    def _upload_image(cls, container: Container, convert_path: str) -> str:
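        """Upload the converted image to OSS and return its permanent URL."""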
        oss_object_key = container.upload_oss(
            bucket_name='art-pubbucket',
            object_key=f'pipeline/image/{uuid.uuid4()}.jpg',
            container_file_path=convert_path,
            media_type='image/jpeg',  # standard JPEG MIME type
        )
        return f'http://rescdn.yishihui.com/{oss_object_key}'
    @classmethod
    def run(cls, keyword: str):
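        """Crawl notes for ``keyword`` and write them to a new Feishu table."""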
        container = Container()
        try:
            container.start_container()
            search_result = cls._search_keyword(keyword)
            table_id = cls.feishu.create_table(keyword)
            for note in search_result:
                if not note.get('image_url_list', []):
                    continue  # skip notes without images
                logger.info(
                    f'Note ID: {note["channel_content_id"]}, '
                    f'title: {note["title"]}, '
                    f'images: {len(note["image_url_list"])}'
                )
                cls._handle_images(container, note['image_url_list'])
                fields = {
                    # Field names must match the Feishu table schema.
                    '原文链接': {  # "original link"
                        'link': note['content_link'],
                        'text': note['content_link'],
                    },
                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),  # "crawl result"
                }
                cls.feishu.create_record(table_id, fields)
        except Exception:
            logger.error(traceback.format_exc())
        finally:
            if container.container_id:  # stop the container only if it was started
                container.stop_container()


if __name__ == '__main__':
    Crawler.run('阿比西尼亚猫')  # example keyword: "Abyssinian cat"