Browse Source

新增抓取代码

kevin.yang 1 week ago
parent
commit
2436a2c700
2 changed files with 139 additions and 0 deletions
  1. 4 0
      .env
  2. 135 0
      fetch.py

+ 4 - 0
.env

@@ -12,3 +12,7 @@ GEMINI_API_KEY=AIzaSyC0J8gtl5I6-nu6fgvQrfnWkw0QIzfXEWE
 
 # 代理
 DYNAMIC_HTTP_PROXY=http://t10952018781111:1ap37oc3@d844.kdltps.com:15818
+
+# GRPC
+CONTAINER_GRPC_HOST=192.168.203.112
+CONTAINER_GRPC_PORT=50051

+ 135 - 0
fetch.py

@@ -0,0 +1,135 @@
+import json
+import os
+import traceback
+import uuid
+from typing import List, Mapping, Optional
+
+import requests
+from loguru import logger
+
+from utils.container import Container
+from utils.fei_shu import FeiShu
+
+
class Crawler(object):
    """Crawl XiaoHongShu image notes for a keyword and archive them to FeiShu.

    Pipeline: search notes via the crawler API -> for every note, download,
    convert and re-host each image through a worker container -> write the
    note (with rewritten image URLs) into a per-keyword FeiShu table.
    """

    # Shared FeiShu client; all functionality is exposed as classmethods.
    feishu = FeiShu()

    @classmethod
    def _search_keyword(cls, keyword: str) -> List[Mapping]:
        """Search image notes for *keyword* (image content, sorted by hottest).

        Returns:
            The list of note dicts from the crawler API.

        Raises:
            ValueError: if the API reports a non-zero code or an empty result.
            requests.HTTPError: if the API responds with an HTTP error status.
        """
        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
        payload = {
            'keyword': keyword,
            'content_type': '图文',
            'sort_type': '最热',
            'cursor': '',
        }
        # Timeout so a stalled API call cannot hang the whole run forever.
        response = requests.post(url=url, json=payload, timeout=60)
        response.raise_for_status()  # surface HTTP-level failures explicitly
        obj = response.json()

        if obj['code'] != 0:
            raise ValueError('接口响应状态不为0')
        obj = obj['data']
        if not obj['data']:
            raise ValueError('搜索结果为空')
        return obj['data']

    @classmethod
    def _handle_images(cls, container: Container, image_list: List[Mapping]):
        """Re-host every image in *image_list*, mutating each entry in place.

        Each entry's 'image_url' is replaced with the permanent OSS URL on
        success; after 10 failed attempts the original URL is kept.
        """
        length = len(image_list)
        for index, image in enumerate(image_list, start=1):
            logger.info(f'正在处理图片 {index}/{length}...')
            new_image_url = None
            for _ in range(10):  # retry the whole pipeline at most 10 times
                download_path = cls._download_image(container, image['image_url'])
                if not download_path:
                    continue
                convert_path = cls._convert_image(container, download_path)
                if not convert_path:
                    continue
                new_image_url = cls._upload_image(container, convert_path)
                break
            if new_image_url:  # on total failure keep the original link
                image['image_url'] = new_image_url
        # Use the logger (not print) so progress output is uniform.
        logger.info('图片处理完成')

    @classmethod
    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
        """Download *image_url* inside the container via curl through the proxy.

        Returns:
            The container-side file path on success, None on failure.
        """
        save_path = f'/tmp/{str(uuid.uuid4())}'
        command = [
            'curl',
            '-L',  # follow redirects
            '-f',  # do not write the file on an HTTP error
            '-x', os.getenv('DYNAMIC_HTTP_PROXY'),
            '-o', save_path,
            image_url,
            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
            '-H', 'Cache-Control: no-cache',
            '-H', 'Origin: https://www.xiaohongshu.com',
            '-H', 'Pragma: no-cache',
            '-H', 'Referer: https://www.xiaohongshu.com/',
            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
        ]
        exit_code, _msg, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None

    @classmethod
    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
        """Convert the downloaded image to JPEG with ffmpeg.

        Returns:
            The converted file path on success, None on failure.
        """
        save_path = f'/tmp/{str(uuid.uuid4())}.jpg'
        command = [
            'ffmpeg',
            '-y',  # never prompt for overwrite (a prompt would block on stdin)
            '-i', origin_path,
            '-frames:v', '1',  # keep a single frame (handles animated inputs)
            save_path,
        ]
        exit_code, _msg, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None

    @classmethod
    def _upload_image(cls, container: Container, convert_path: str) -> str:
        """Upload the converted image to OSS and return its permanent URL."""
        oss_object_key = container.upload_oss(
            bucket_name='art-pubbucket',
            object_key=f'pipeline/image/{str(uuid.uuid4())}.jpg',
            container_file_path=convert_path,
            # 'image/jpeg' is the registered MIME type ('image/jpg' is not).
            media_type='image/jpeg',
        )
        return f'http://rescdn.yishihui.com/{oss_object_key}'

    @classmethod
    def run(cls, keyword: str):
        """Crawl *keyword* end to end and record every image note in FeiShu.

        Any failure is logged and swallowed (best-effort batch job); the
        worker container is always stopped if it was started.
        """
        container = Container()
        try:
            container.start_container()
            search_result = cls._search_keyword(keyword)
            table_id = cls.feishu.create_table(keyword)

            for note in search_result:
                # Only image notes are wanted; skip notes without images.
                if not note.get('image_url_list', []):
                    continue
                logger.info(f'笔记ID: {note["channel_content_id"]}, 标题: {note["title"]}, 图片数量: {len(note["image_url_list"])}')
                cls._handle_images(container, note['image_url_list'])
                fields = {
                    '原文链接': {
                        'link': note['content_link'],
                        'text': note['content_link'],
                    },
                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),
                }
                cls.feishu.create_record(table_id, fields)
        except Exception:
            # Log at error level (was info) so failures stand out in logs.
            logger.error(traceback.format_exc())
        finally:
            # Only stop a container that actually started.
            if container.container_id:
                container.stop_container()
+
+
# Manual entry point: crawl image notes for a single hard-coded keyword
# ("Abyssinian cat") when the script is run directly.
if __name__ == '__main__':
    Crawler.run('阿比西尼亚猫')