Browse Source

Merge branch 'main' of https://git.yishihui.com/ai/knowledge

jihuaqiang 1 week ago
parent
commit
c55b9b589f
2 changed files with 134 additions and 134 deletions
  1. 134 0
      1_fetch.py
  2. 0 134
      fetch.py

+ 134 - 0
1_fetch.py

@@ -0,0 +1,134 @@
+import json
+import os
+import traceback
+import uuid
+from typing import List, Mapping, Optional
+
+import requests
+from loguru import logger
+
+from utils.container import Container
+from utils.fei_shu import FeiShu
+
+
+class Crawler(object):
+    feishu = FeiShu()
+
+    @classmethod
+    def _search_keyword(cls, keyword: str):
+        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
+        payload = {
+            'keyword': keyword,
+            'content_type': '图文',
+            'sort_type': '最热',
+            'cursor': '',
+        }
+        response = requests.post(url=url, json=payload)
+        obj = response.json()
+
+        if obj['code'] != 0:
+            raise ValueError('接口响应状态不为0')
+        obj = obj['data']
+        if not obj['data']:
+            raise ValueError('搜索结果为空')
+        return obj['data']
+
+    @classmethod
+    def _handle_images(cls, container: Container, image_list: List[Mapping]):
+        length = len(image_list)
+        for i in range(length):
+            logger.info(f'正在处理图片 {i + 1}/{length}...')
+            new_image_url = None
+            for _ in range(10):  # 最多尝试10次
+                download_path = cls._download_image(container, image_list[i]['image_url'])
+                if not download_path:
+                    continue
+                convert_path = cls._convert_image(container, download_path)
+                if not convert_path:
+                    continue
+                new_image_url = cls._upload_image(container, convert_path)
+                break
+            if new_image_url:  # 失败就使用原图链接
+                image_list[i]['image_url'] = new_image_url
+
+    @classmethod
+    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
+        # 下载图片
+        save_path = f'/tmp/{str(uuid.uuid4())}'
+        command = [
+            'curl',
+            '-L',  # 跟随重定向
+            '-f',  # 请求失败时不写入文件
+            '-x', os.getenv('DYNAMIC_HTTP_PROXY'),
+            '-o', save_path,
+            image_url,
+            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
+            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
+            '-H', 'Cache-Control: no-cache',
+            '-H', 'Origin: https://www.xiaohongshu.com',
+            '-H', 'Pragma: no-cache',
+            '-H', 'Referer: https://www.xiaohongshu.com/',
+            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
+                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
+        ]
+        exit_code, msg, _ = container.run_command(command)
+        if exit_code == 0:
+            return save_path
+        return None
+
+    @classmethod
+    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
+        # 将图片转为jpg格式
+        save_path = f'/tmp/{str(uuid.uuid4())}.jpg'
+        command = [
+            'ffmpeg',
+            '-i', origin_path,
+            '-frames:v', '1',
+            save_path,
+        ]
+        exit_code, msg, _ = container.run_command(command)
+        if exit_code == 0:
+            return save_path
+        return None
+
+    @classmethod
+    def _upload_image(cls, container: Container, convert_path: str) -> str:
+        # 将图片上传到指定oss并返回永久链接
+        oss_object_key = container.upload_oss(
+            bucket_name='art-pubbucket',
+            object_key=f'pipeline/image/{str(uuid.uuid4())}.jpg',
+            container_file_path=convert_path,
+            media_type='image/jpg',
+        )
+        return f'http://rescdn.yishihui.com/{oss_object_key}'
+
+    @classmethod
+    def run(cls, keyword: str):
+        container = Container()
+        try:
+            container.start_container()
+            search_result = cls._search_keyword(keyword)
+            table_id = cls.feishu.create_table(keyword)
+
+            for note in search_result:
+                if not note.get('image_url_list', []):
+                    continue
+                logger.info(f'笔记ID: {note["channel_content_id"]}, 标题: {note["title"]}, 图片数量: {len(note["image_url_list"])}')
+                cls._handle_images(container, note['image_url_list'])
+                fields = {
+                    '原文链接': {
+                        'link': note['content_link'],
+                        'text': note['content_link'],
+                    },
+                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),
+                }
+                cls.feishu.create_record(table_id, fields)
+        except Exception:
+            logger.error(traceback.format_exc())
+        finally:
+            if container.container_id:
+                container.stop_container()
+
+
+if __name__ == '__main__':
+    Crawler.run('阿比西尼亚猫')

+ 0 - 134
fetch.py

@@ -1,134 +0,0 @@
-import json
-import os
-import traceback
-import uuid
-from typing import List, Mapping, Optional
-
-import requests
-from loguru import logger
-
-from utils.container import Container
-from utils.fei_shu import FeiShu
-
-
-class Crawler(object):
-    feishu = FeiShu()
-
-    @classmethod
-    def _search_keyword(cls, keyword: str):
-        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
-        payload = {
-            'keyword': keyword,
-            'content_type': '图文',
-            'sort_type': '最热',
-            'cursor': '',
-        }
-        response = requests.post(url=url, json=payload)
-        obj = response.json()
-
-        if obj['code'] != 0:
-            raise ValueError('接口响应状态不为0')
-        obj = obj['data']
-        if not obj['data']:
-            raise ValueError('搜索结果为空')
-        return obj['data']
-
-    @classmethod
-    def _handle_images(cls, container: Container, image_list: List[Mapping]):
-        length = len(image_list)
-        for i in range(length):
-            logger.info(f'正在处理图片 {i + 1}/{length}...')
-            new_image_url = None
-            for _ in range(10):  # 最多尝试10次
-                download_path = cls._download_image(container, image_list[i]['image_url'])
-                if not download_path:
-                    continue
-                convert_path = cls._convert_image(container, download_path)
-                if not convert_path:
-                    continue
-                new_image_url = cls._upload_image(container, convert_path)
-                break
-            if new_image_url:  # 失败就使用原图链接
-                image_list[i]['image_url'] = new_image_url
-
-    @classmethod
-    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
-        # 下载图片
-        save_path = f'/tmp/{str(uuid.uuid4())}'
-        command = [
-            'curl',
-            '-L',  # 跟随重定向
-            '-f',  # 请求失败时不写入文件
-            '-x', os.getenv('DYNAMIC_HTTP_PROXY'),
-            '-o', save_path,
-            image_url,
-            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
-            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
-            '-H', 'Cache-Control: no-cache',
-            '-H', 'Origin: https://www.xiaohongshu.com',
-            '-H', 'Pragma: no-cache',
-            '-H', 'Referer: https://www.xiaohongshu.com/',
-            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
-                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
-        ]
-        exit_code, msg, _ = container.run_command(command)
-        if exit_code == 0:
-            return save_path
-        return None
-
-    @classmethod
-    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
-        # 将图片转为jpg格式
-        save_path = f'/tmp/{str(uuid.uuid4())}.jpg'
-        command = [
-            'ffmpeg',
-            '-i', origin_path,
-            '-frames:v', '1',
-            save_path,
-        ]
-        exit_code, msg, _ = container.run_command(command)
-        if exit_code == 0:
-            return save_path
-        return None
-
-    @classmethod
-    def _upload_image(cls, container: Container, convert_path: str) -> str:
-        # 将图片上传到指定oss并返回永久链接
-        oss_object_key = container.upload_oss(
-            bucket_name='art-pubbucket',
-            object_key=f'pipeline/image/{str(uuid.uuid4())}.jpg',
-            container_file_path=convert_path,
-            media_type='image/jpg',
-        )
-        return f'http://rescdn.yishihui.com/{oss_object_key}'
-
-    @classmethod
-    def run(cls, keyword: str):
-        container = Container()
-        try:
-            container.start_container()
-            search_result = cls._search_keyword(keyword)
-            table_id = cls.feishu.create_table(keyword)
-
-            for note in search_result:
-                if not note.get('image_url_list', []):
-                    continue
-                logger.info(f'笔记ID: {note["channel_content_id"]}, 标题: {note["title"]}, 图片数量: {len(note["image_url_list"])}')
-                cls._handle_images(container, note['image_url_list'])
-                fields = {
-                    '原文链接': {
-                        'link': note['content_link'],
-                        'text': note['content_link'],
-                    },
-                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),
-                }
-                cls.feishu.create_record(table_id, fields)
-        except Exception:
-            logger.error(traceback.format_exc())
-        finally:
-            if container.container_id:
-                container.stop_container()
-
-
-if __name__ == '__main__':
-    Crawler.run('阿比西尼亚猫')