3 months ago · c55b9b589f
--- a/1_fetch.py
+++ b/1_fetch.py
@@ -0,0 +1,134 @@
 
				+import json
			
 
				+import os
			
 
				+import traceback
			
 
				+import uuid
			
 
				+from typing import List, Mapping, Optional
			
 
				+
			
 
				+import requests
			
 
				+from loguru import logger
			
 
				+
			
 
				+from utils.container import Container
			
 
				+from utils.fei_shu import FeiShu
			
 
				+
			
 
				+
			
 
				+class Crawler(object):
			
 
				+    feishu = FeiShu()
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _search_keyword(cls, keyword: str):
			
 
				+        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
			
 
				+        payload = {
			
 
				+            'keyword': keyword,
			
 
				+            'content_type': '图文',
			
 
				+            'sort_type': '最热',
			
 
				+            'cursor': '',
			
 
				+        }
			
 
				+        response = requests.post(url=url, json=payload)
			
 
				+        obj = response.json()
			
 
				+
			
 
				+        if obj['code'] != 0:
			
 
				+            raise ValueError('接口响应状态不为0')
			
 
				+        obj = obj['data']
			
 
				+        if not obj['data']:
			
 
				+            raise ValueError('搜索结果为空')
			
 
				+        return obj['data']
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _handle_images(cls, container: Container, image_list: List[Mapping]):
			
 
				+        length = len(image_list)
			
 
				+        for i in range(length):
			
 
				+            logger.info(f'正在处理图片 {i + 1}/{length}...')
			
 
				+            new_image_url = None
			
 
				+            for _ in range(10):  # 最多尝试10次
			
 
				+                download_path = cls._download_image(container, image_list[i]['image_url'])
			
 
				+                if not download_path:
			
 
				+                    continue
			
 
				+                convert_path = cls._convert_image(container, download_path)
			
 
				+                if not convert_path:
			
 
				+                    continue
			
 
				+                new_image_url = cls._upload_image(container, convert_path)
			
 
				+                break
			
 
				+            if new_image_url:  # 失败就使用原图链接
			
 
				+                image_list[i]['image_url'] = new_image_url
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
			
 
				+        # 下载图片
			
 
				+        save_path = f'/tmp/{str(uuid.uuid4())}'
			
 
				+        command = [
			
 
				+            'curl',
			
 
				+            '-L',  # 跟随重定向
			
 
				+            '-f',  # 请求失败时不写入文件
			
 
				+            '-x', os.getenv('DYNAMIC_HTTP_PROXY'),
			
 
				+            '-o', save_path,
			
 
				+            image_url,
			
 
				+            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
			
 
				+            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
			
 
				+            '-H', 'Cache-Control: no-cache',
			
 
				+            '-H', 'Origin: https://www.xiaohongshu.com',
			
 
				+            '-H', 'Pragma: no-cache',
			
 
				+            '-H', 'Referer: https://www.xiaohongshu.com/',
			
 
				+            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
			
 
				+                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
			
 
				+        ]
			
 
				+        exit_code, msg, _ = container.run_command(command)
			
 
				+        if exit_code == 0:
			
 
				+            return save_path
			
 
				+        return None
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
			
 
				+        # 将图片转为jpg格式
			
 
				+        save_path = f'/tmp/{str(uuid.uuid4())}.jpg'
			
 
				+        command = [
			
 
				+            'ffmpeg',
			
 
				+            '-i', origin_path,
			
 
				+            '-frames:v', '1',
			
 
				+            save_path,
			
 
				+        ]
			
 
				+        exit_code, msg, _ = container.run_command(command)
			
 
				+        if exit_code == 0:
			
 
				+            return save_path
			
 
				+        return None
			
 
				+
			
 
				+    @classmethod
			
 
				+    def _upload_image(cls, container: Container, convert_path: str) -> str:
			
 
				+        # 将图片上传到指定oss并返回永久链接
			
 
				+        oss_object_key = container.upload_oss(
			
 
				+            bucket_name='art-pubbucket',
			
 
				+            object_key=f'pipeline/image/{str(uuid.uuid4())}.jpg',
			
 
				+            container_file_path=convert_path,
			
 
				+            media_type='image/jpg',
			
 
				+        )
			
 
				+        return f'http://rescdn.yishihui.com/{oss_object_key}'
			
 
				+
			
 
				+    @classmethod
			
 
				+    def run(cls, keyword: str):
			
 
				+        container = Container()
			
 
				+        try:
			
 
				+            container.start_container()
			
 
				+            search_result = cls._search_keyword(keyword)
			
 
				+            table_id = cls.feishu.create_table(keyword)
			
 
				+
			
 
				+            for note in search_result:
			
 
				+                if not note.get('image_url_list', []):
			
 
				+                    continue
			
 
				+                logger.info(f'笔记ID: {note["channel_content_id"]}, 标题: {note["title"]}, 图片数量: {len(note["image_url_list"])}')
			
 
				+                cls._handle_images(container, note['image_url_list'])
			
 
				+                fields = {
			
 
				+                    '原文链接': {
			
 
				+                        'link': note['content_link'],
			
 
				+                        'text': note['content_link'],
			
 
				+                    },
			
 
				+                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),
			
 
				+                }
			
 
				+                cls.feishu.create_record(table_id, fields)
			
 
				+        except Exception:
			
 
				+            logger.error(traceback.format_exc())
			
 
				+        finally:
			
 
				+            if container.container_id:
			
 
				+                container.stop_container()
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    Crawler.run('阿比西尼亚猫')
			
--- a/fetch.py
+++ b/fetch.py
@@ -1,134 +0,0 @@
 
				-import json
			
 
				-import os
			
 
				-import traceback
			
 
				-import uuid
			
 
				-from typing import List, Mapping, Optional
			
 
				-
			
 
				-import requests
			
 
				-from loguru import logger
			
 
				-
			
 
				-from utils.container import Container
			
 
				-from utils.fei_shu import FeiShu
			
 
				-
			
 
				-
			
 
				-class Crawler(object):
			
 
				-    feishu = FeiShu()
			
 
				-
			
 
				-    @classmethod
			
 
				-    def _search_keyword(cls, keyword: str):
			
 
				-        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
			
 
				-        payload = {
			
 
				-            'keyword': keyword,
			
 
				-            'content_type': '图文',
			
 
				-            'sort_type': '最热',
			
 
				-            'cursor': '',
			
 
				-        }
			
 
				-        response = requests.post(url=url, json=payload)
			
 
				-        obj = response.json()
			
 
				-
			
 
				-        if obj['code'] != 0:
			
 
				-            raise ValueError('接口响应状态不为0')
			
 
				-        obj = obj['data']
			
 
				-        if not obj['data']:
			
 
				-            raise ValueError('搜索结果为空')
			
 
				-        return obj['data']
			
 
				-
			
 
				-    @classmethod
			
 
				-    def _handle_images(cls, container: Container, image_list: List[Mapping]):
			
 
				-        length = len(image_list)
			
 
				-        for i in range(length):
			
 
				-            logger.info(f'正在处理图片 {i + 1}/{length}...')
			
 
				-            new_image_url = None
			
 
				-            for _ in range(10):  # 最多尝试10次
			
 
				-                download_path = cls._download_image(container, image_list[i]['image_url'])
			
 
				-                if not download_path:
			
 
				-                    continue
			
 
				-                convert_path = cls._convert_image(container, download_path)
			
 
				-                if not convert_path:
			
 
				-                    continue
			
 
				-                new_image_url = cls._upload_image(container, convert_path)
			
 
				-                break
			
 
				-            if new_image_url:  # 失败就使用原图链接
			
 
				-                image_list[i]['image_url'] = new_image_url
			
 
				-
			
 
				-    @classmethod
			
 
				-    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
			
 
				-        # 下载图片
			
 
				-        save_path = f'/tmp/{str(uuid.uuid4())}'
			
 
				-        command = [
			
 
				-            'curl',
			
 
				-            '-L',  # 跟随重定向
			
 
				-            '-f',  # 请求失败时不写入文件
			
 
				-            '-x', os.getenv('DYNAMIC_HTTP_PROXY'),
			
 
				-            '-o', save_path,
			
 
				-            image_url,
			
 
				-            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
			
 
				-            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
			
 
				-            '-H', 'Cache-Control: no-cache',
			
 
				-            '-H', 'Origin: https://www.xiaohongshu.com',
			
 
				-            '-H', 'Pragma: no-cache',
			
 
				-            '-H', 'Referer: https://www.xiaohongshu.com/',
			
 
				-            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
			
 
				-                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
			
 
				-        ]
			
 
				-        exit_code, msg, _ = container.run_command(command)
			
 
				-        if exit_code == 0:
			
 
				-            return save_path
			
 
				-        return None
			
 
				-
			
 
				-    @classmethod
			
 
				-    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
			
 
				-        # 将图片转为jpg格式
			
 
				-        save_path = f'/tmp/{str(uuid.uuid4())}.jpg'
			
 
				-        command = [
			
 
				-            'ffmpeg',
			
 
				-            '-i', origin_path,
			
 
				-            '-frames:v', '1',
			
 
				-            save_path,
			
 
				-        ]
			
 
				-        exit_code, msg, _ = container.run_command(command)
			
 
				-        if exit_code == 0:
			
 
				-            return save_path
			
 
				-        return None
			
 
				-
			
 
				-    @classmethod
			
 
				-    def _upload_image(cls, container: Container, convert_path: str) -> str:
			
 
				-        # 将图片上传到指定oss并返回永久链接
			
 
				-        oss_object_key = container.upload_oss(
			
 
				-            bucket_name='art-pubbucket',
			
 
				-            object_key=f'pipeline/image/{str(uuid.uuid4())}.jpg',
			
 
				-            container_file_path=convert_path,
			
 
				-            media_type='image/jpg',
			
 
				-        )
			
 
				-        return f'http://rescdn.yishihui.com/{oss_object_key}'
			
 
				-
			
 
				-    @classmethod
			
 
				-    def run(cls, keyword: str):
			
 
				-        container = Container()
			
 
				-        try:
			
 
				-            container.start_container()
			
 
				-            search_result = cls._search_keyword(keyword)
			
 
				-            table_id = cls.feishu.create_table(keyword)
			
 
				-
			
 
				-            for note in search_result:
			
 
				-                if not note.get('image_url_list', []):
			
 
				-                    continue
			
 
				-                logger.info(f'笔记ID: {note["channel_content_id"]}, 标题: {note["title"]}, 图片数量: {len(note["image_url_list"])}')
			
 
				-                cls._handle_images(container, note['image_url_list'])
			
 
				-                fields = {
			
 
				-                    '原文链接': {
			
 
				-                        'link': note['content_link'],
			
 
				-                        'text': note['content_link'],
			
 
				-                    },
			
 
				-                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),
			
 
				-                }
			
 
				-                cls.feishu.create_record(table_id, fields)
			
 
				-        except Exception:
			
 
				-            logger.error(traceback.format_exc())
			
 
				-        finally:
			
 
				-            if container.container_id:
			
 
				-                container.stop_container()
			
 
				-
			
 
				-
			
 
				-if __name__ == '__main__':
			
 
				-    Crawler.run('阿比西尼亚猫')