Browse Source

新增抓取代码

kevin.yang 1 week ago
parent
commit
2436a2c700
2 changed files with 139 additions and 0 deletions
  1. 4 0
      .env
  2. 135 0
      fetch.py

+ 4 - 0
.env

@@ -12,3 +12,7 @@ GEMINI_API_KEY=AIzaSyC0J8gtl5I6-nu6fgvQrfnWkw0QIzfXEWE
 
 # 代理
 DYNAMIC_HTTP_PROXY=http://t10952018781111:1ap37oc3@d844.kdltps.com:15818
+
+# GRPC
+CONTAINER_GRPC_HOST=192.168.203.112
+CONTAINER_GRPC_PORT=50051

+ 135 - 0
fetch.py

@@ -0,0 +1,135 @@
+import json
+import os
+import traceback
+import uuid
+from typing import List, Mapping, Optional
+
+import requests
+from loguru import logger
+
+from utils.container import Container
+from utils.fei_shu import FeiShu
+
+
class Crawler(object):
    """Crawl XiaoHongShu image notes for a keyword and archive them to FeiShu.

    Pipeline: search notes via the crawler API -> for every note, download,
    convert and re-host each image through a worker container -> write the
    note (with rewritten image URLs) into a per-keyword FeiShu table.
    """

    # Shared FeiShu client; all functionality is exposed as classmethods.
    feishu = FeiShu()

    @classmethod
    def _search_keyword(cls, keyword: str) -> List[Mapping]:
        """Search image notes for *keyword* (image content, sorted by hottest).

        Returns:
            The list of note dicts from the crawler API.

        Raises:
            ValueError: if the API reports a non-zero code or an empty result.
            requests.HTTPError: if the API responds with an HTTP error status.
        """
        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
        payload = {
            'keyword': keyword,
            'content_type': '图文',
            'sort_type': '最热',
            'cursor': '',
        }
        # Timeout so a stalled API call cannot hang the whole run forever.
        response = requests.post(url=url, json=payload, timeout=60)
        response.raise_for_status()  # surface HTTP-level failures explicitly
        obj = response.json()

        if obj['code'] != 0:
            raise ValueError('接口响应状态不为0')
        obj = obj['data']
        if not obj['data']:
            raise ValueError('搜索结果为空')
        return obj['data']

    @classmethod
    def _handle_images(cls, container: Container, image_list: List[Mapping]):
        """Re-host every image in *image_list*, mutating each entry in place.

        Each entry's 'image_url' is replaced with the permanent OSS URL on
        success; after 10 failed attempts the original URL is kept.
        """
        length = len(image_list)
        for index, image in enumerate(image_list, start=1):
            logger.info(f'正在处理图片 {index}/{length}...')
            new_image_url = None
            for _ in range(10):  # retry the whole pipeline at most 10 times
                download_path = cls._download_image(container, image['image_url'])
                if not download_path:
                    continue
                convert_path = cls._convert_image(container, download_path)
                if not convert_path:
                    continue
                new_image_url = cls._upload_image(container, convert_path)
                break
            if new_image_url:  # on total failure keep the original link
                image['image_url'] = new_image_url
        # Use the logger (not print) so progress output is uniform.
        logger.info('图片处理完成')

    @classmethod
    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
        """Download *image_url* inside the container via curl through the proxy.

        Returns:
            The container-side file path on success, None on failure.
        """
        save_path = f'/tmp/{str(uuid.uuid4())}'
        command = [
            'curl',
            '-L',  # follow redirects
            '-f',  # do not write the file on an HTTP error
            '-x', os.getenv('DYNAMIC_HTTP_PROXY'),
            '-o', save_path,
            image_url,
            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
            '-H', 'Cache-Control: no-cache',
            '-H', 'Origin: https://www.xiaohongshu.com',
            '-H', 'Pragma: no-cache',
            '-H', 'Referer: https://www.xiaohongshu.com/',
            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
        ]
        exit_code, _msg, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None

    @classmethod
    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
        """Convert the downloaded image to JPEG with ffmpeg.

        Returns:
            The converted file path on success, None on failure.
        """
        save_path = f'/tmp/{str(uuid.uuid4())}.jpg'
        command = [
            'ffmpeg',
            '-y',  # never prompt for overwrite (a prompt would block on stdin)
            '-i', origin_path,
            '-frames:v', '1',  # keep a single frame (handles animated inputs)
            save_path,
        ]
        exit_code, _msg, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None

    @classmethod
    def _upload_image(cls, container: Container, convert_path: str) -> str:
        """Upload the converted image to OSS and return its permanent URL."""
        oss_object_key = container.upload_oss(
            bucket_name='art-pubbucket',
            object_key=f'pipeline/image/{str(uuid.uuid4())}.jpg',
            container_file_path=convert_path,
            # 'image/jpeg' is the registered MIME type ('image/jpg' is not).
            media_type='image/jpeg',
        )
        return f'http://rescdn.yishihui.com/{oss_object_key}'

    @classmethod
    def run(cls, keyword: str):
        """Crawl *keyword* end to end and record every image note in FeiShu.

        Any failure is logged and swallowed (best-effort batch job); the
        worker container is always stopped if it was started.
        """
        container = Container()
        try:
            container.start_container()
            search_result = cls._search_keyword(keyword)
            table_id = cls.feishu.create_table(keyword)

            for note in search_result:
                # Only image notes are wanted; skip notes without images.
                if not note.get('image_url_list', []):
                    continue
                logger.info(f'笔记ID: {note["channel_content_id"]}, 标题: {note["title"]}, 图片数量: {len(note["image_url_list"])}')
                cls._handle_images(container, note['image_url_list'])
                fields = {
                    '原文链接': {
                        'link': note['content_link'],
                        'text': note['content_link'],
                    },
                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),
                }
                cls.feishu.create_record(table_id, fields)
        except Exception:
            # Log at error level (was info) so failures stand out in logs.
            logger.error(traceback.format_exc())
        finally:
            # Only stop a container that actually started.
            if container.container_id:
                container.stop_container()
+
+
# Manual entry point: crawl image notes for a single hard-coded keyword
# ("Abyssinian cat") when the script is run directly.
if __name__ == '__main__':
    Crawler.run('阿比西尼亚猫')