|
@@ -0,0 +1,134 @@
|
|
|
+import json
|
|
|
+import os
|
|
|
+import traceback
|
|
|
+import uuid
|
|
|
+from typing import List, Mapping, Optional
|
|
|
+
|
|
|
+import requests
|
|
|
+from loguru import logger
|
|
|
+
|
|
|
+from utils.container import Container
|
|
|
+from utils.fei_shu import FeiShu
|
|
|
+
|
|
|
+
|
|
|
class Crawler(object):
    """Crawl Xiaohongshu notes for a keyword and record them in FeiShu.

    The workflow (see ``run``) is: search notes by keyword through the
    crawler HTTP service, re-host every note image (download with ``curl``,
    convert to JPG with ``ffmpeg``, upload to OSS — all executed through a
    ``Container``), then write one FeiShu record per note.
    """

    # Shared FeiShu client; instantiated once when the class body is evaluated.
    feishu = FeiShu()

    # (connect, read) timeout in seconds for the search request, so a stalled
    # crawler service cannot block the run forever.
    _SEARCH_TIMEOUT = (10, 120)

    # Maximum download/convert/upload attempts per image.
    _MAX_IMAGE_ATTEMPTS = 10

    @classmethod
    def _search_keyword(cls, keyword: str):
        """Query the keyword-search endpoint and return its result list.

        Args:
            keyword: Search term sent to the crawler service.

        Returns:
            The non-empty ``data['data']`` value of the JSON response
            (``run`` iterates it as a sequence of note dicts).

        Raises:
            ValueError: If the response ``code`` is not 0 or the result list
                is empty.
            requests.RequestException: On timeout, connection failure or an
                HTTP error status.
        """
        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
        payload = {
            'keyword': keyword,
            'content_type': '图文',  # image-and-text notes
            'sort_type': '最热',  # hottest first
            'cursor': '',
        }
        response = requests.post(url=url, json=payload, timeout=cls._SEARCH_TIMEOUT)
        # Surface HTTP-level failures explicitly instead of trying to parse
        # an error page as JSON.
        response.raise_for_status()
        body = response.json()

        if body['code'] != 0:
            raise ValueError('接口响应状态不为0')
        results = body['data']['data']
        if not results:
            raise ValueError('搜索结果为空')
        return results

    @classmethod
    def _handle_images(cls, container: Container, image_list: List[dict]):
        """Re-host every image and rewrite its ``image_url`` in place.

        Each image is downloaded, converted and uploaded, with up to
        ``_MAX_IMAGE_ATTEMPTS`` attempts. If every attempt fails, the entry
        keeps its original ``image_url``.

        Args:
            container: Container used to run the download/convert/upload steps.
            image_list: Image dicts, each holding an ``image_url`` key; the
                dicts are mutated in place.
        """
        total = len(image_list)
        for index, image in enumerate(image_list, start=1):
            logger.info(f'正在处理图片 {index}/{total}...')
            new_image_url = None
            for _ in range(cls._MAX_IMAGE_ATTEMPTS):
                download_path = cls._download_image(container, image['image_url'])
                if not download_path:
                    continue
                convert_path = cls._convert_image(container, download_path)
                if not convert_path:
                    continue
                try:
                    new_image_url = cls._upload_image(container, convert_path)
                except Exception:
                    # An upload error must not abort the whole crawl: treat it
                    # like any other failed attempt so the original link can
                    # still be used as a fallback.
                    logger.warning(traceback.format_exc())
                    continue
                break
            if new_image_url:  # on failure, keep the original image link
                image['image_url'] = new_image_url

    @classmethod
    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
        """Download ``image_url`` inside the container with ``curl``.

        The proxy from the ``DYNAMIC_HTTP_PROXY`` environment variable is used
        when it is set; otherwise no ``-x`` option is passed.

        Args:
            container: Container in which the command is executed.
            image_url: URL of the image to fetch.

        Returns:
            The path the image was saved to (under ``/tmp``), or ``None`` if
            the command exited with a non-zero code.
        """
        save_path = f'/tmp/{uuid.uuid4()}'
        command = [
            'curl',
            '-L',  # follow redirects
            '-f',  # exit non-zero on HTTP errors instead of saving the error body
        ]
        proxy = os.getenv('DYNAMIC_HTTP_PROXY')
        if proxy:
            # Only add the proxy option when configured; an unset variable
            # would otherwise put ``None`` into the argument list.
            command += ['-x', proxy]
        command += [
            '-o', save_path,
            image_url,
            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
            '-H', 'Cache-Control: no-cache',
            '-H', 'Origin: https://www.xiaohongshu.com',
            '-H', 'Pragma: no-cache',
            '-H', 'Referer: https://www.xiaohongshu.com/',
            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
        ]
        exit_code, _, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None

    @classmethod
    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
        """Convert the downloaded image to JPG with ``ffmpeg``.

        Args:
            container: Container in which the command is executed.
            origin_path: Path of the downloaded file inside the container.

        Returns:
            The path of the ``.jpg`` output (under ``/tmp``), or ``None`` if
            the command exited with a non-zero code.
        """
        save_path = f'/tmp/{uuid.uuid4()}.jpg'
        command = [
            'ffmpeg',
            '-i', origin_path,
            '-frames:v', '1',  # emit a single video frame only
            save_path,
        ]
        exit_code, _, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None

    @classmethod
    def _upload_image(cls, container: Container, convert_path: str) -> str:
        """Upload the converted image to OSS and return its public URL.

        Args:
            container: Container that holds the file and performs the upload.
            convert_path: Path of the JPG file inside the container.

        Returns:
            The URL formed from the ``rescdn.yishihui.com`` host and the
            object key returned by ``container.upload_oss``.
        """
        oss_object_key = container.upload_oss(
            bucket_name='art-pubbucket',
            object_key=f'pipeline/image/{uuid.uuid4()}.jpg',
            container_file_path=convert_path,
            # NOTE(review): the registered MIME type is 'image/jpeg'; kept
            # as-is because the upload_oss contract is not visible here.
            media_type='image/jpg',
        )
        return f'http://rescdn.yishihui.com/{oss_object_key}'

    @classmethod
    def run(cls, keyword: str):
        """Crawl notes for ``keyword`` and store them in a new FeiShu table.

        Starts a container, searches for notes, re-hosts the images of every
        note that has an ``image_url_list`` and creates one FeiShu record per
        such note. Any exception is logged with its traceback rather than
        propagated, and the container is stopped afterwards if it was started.

        Args:
            keyword: Search term; also passed to ``feishu.create_table``.
        """
        container = Container()
        try:
            container.start_container()
            search_result = cls._search_keyword(keyword)
            table_id = cls.feishu.create_table(keyword)

            for note in search_result:
                # Notes without images are skipped.
                if not note.get('image_url_list'):
                    continue
                logger.info(
                    f'笔记ID: {note["channel_content_id"]}, 标题: {note["title"]}, '
                    f'图片数量: {len(note["image_url_list"])}'
                )
                cls._handle_images(container, note['image_url_list'])
                fields = {
                    '原文链接': {
                        'link': note['content_link'],
                        'text': note['content_link'],
                    },
                    # Serialized after _handle_images, so it contains the
                    # rewritten image URLs.
                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),
                }
                cls.feishu.create_record(table_id, fields)
        except Exception:
            logger.error(traceback.format_exc())
        finally:
            # Only stop the container if it actually got an id.
            if container.container_id:
                container.stop_container()
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Script entry point: run one crawl for a sample keyword.
    Crawler.run(keyword='阿比西尼亚猫')
|