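"""Keyword crawler for Xiaohongshu (XHS) image-and-text notes.

Searches notes by keyword, re-hosts each note's images (download via proxy,
convert to JPEG, upload to OSS), and records the rewritten notes in a
Feishu (Lark) table.
"""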
import json
import os
import traceback
import uuid
from typing import List, Mapping, Optional

import requests
from loguru import logger

from utils.container import Container
from utils.fei_shu import FeiShu


class Crawler:
    feishu = FeiShu()  # shared Feishu (Lark) client
    @classmethod
    def _search_keyword(cls, keyword: str):
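        """Search XHS for notes matching ``keyword`` and return the hit list.

        Raises ValueError when the API reports a non-zero code or the
        result list is empty.
        """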
        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
        payload = {
            'keyword': keyword,
            'content_type': '图文',  # image-and-text notes only
            'sort_type': '最热',  # sort by popularity ("hottest")
            'cursor': '',  # empty cursor: first page of results
        }
        response = requests.post(url=url, json=payload, timeout=30)  # avoid hanging on a stalled request
        obj = response.json()
        if obj['code'] != 0:
            raise ValueError('API response code is not 0')
        obj = obj['data']
        if not obj['data']:
            raise ValueError('Search result is empty')
        return obj['data']
    @classmethod
    def _handle_images(cls, container: Container, image_list: List[Mapping]):
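        """Re-host every image in ``image_list``, rewriting entries in place.

        Each image is downloaded, converted to JPEG, and uploaded to OSS;
        if all attempts fail, the original image URL is kept.
        """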
        total = len(image_list)
        for index, image in enumerate(image_list, start=1):
            logger.info(f'Processing image {index}/{total}...')
            new_image_url = None
            for _ in range(10):  # retry up to 10 times
                download_path = cls._download_image(container, image['image_url'])
                if not download_path:
                    continue
                convert_path = cls._convert_image(container, download_path)
                if not convert_path:
                    continue
                new_image_url = cls._upload_image(container, convert_path)
                break
            if new_image_url:
                image['image_url'] = new_image_url
            # On failure, image['image_url'] keeps the original link.
    @classmethod
    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
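        """Download an image inside the container via curl.

        Returns the downloaded file path, or None if curl failed.
        """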
        save_path = f'/tmp/{uuid.uuid4()}'
        command = [
            'curl',
            '-L',  # follow redirects
            '-f',  # fail on HTTP errors instead of writing an error page
        ]
        proxy = os.getenv('DYNAMIC_HTTP_PROXY')
        if proxy:  # only pass -x when a proxy is configured
            command += ['-x', proxy]
        command += [
            '-o', save_path,
            image_url,
            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
            '-H', 'Cache-Control: no-cache',
            '-H', 'Origin: https://www.xiaohongshu.com',
            '-H', 'Pragma: no-cache',
            '-H', 'Referer: https://www.xiaohongshu.com/',
            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
        ]
        exit_code, _, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None
    @classmethod
    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
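        """Convert the downloaded image to JPEG with ffmpeg.

        Returns the converted file path, or None if ffmpeg failed.
        """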
        save_path = f'/tmp/{uuid.uuid4()}.jpg'
        command = [
            'ffmpeg',
            '-i', origin_path,
            '-frames:v', '1',  # emit a single frame (source may be animated WebP/AVIF)
            save_path,
        ]
        exit_code, _, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None
    @classmethod
    def _upload_image(cls, container: Container, convert_path: str) -> str:
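        """Upload the converted image to OSS and return its permanent URL."""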
        oss_object_key = container.upload_oss(
            bucket_name='art-pubbucket',
            object_key=f'pipeline/image/{uuid.uuid4()}.jpg',
            container_file_path=convert_path,
            media_type='image/jpeg',  # standard JPEG MIME type
        )
        return f'http://rescdn.yishihui.com/{oss_object_key}'
    @classmethod
    def run(cls, keyword: str):
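        """Crawl notes for ``keyword`` and write them to a new Feishu table."""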
        container = Container()
        try:
            container.start_container()
            search_result = cls._search_keyword(keyword)
            table_id = cls.feishu.create_table(keyword)
            for note in search_result:
                if not note.get('image_url_list', []):
                    continue  # skip notes without images
                logger.info(
                    f'Note ID: {note["channel_content_id"]}, '
                    f'title: {note["title"]}, '
                    f'images: {len(note["image_url_list"])}'
                )
                cls._handle_images(container, note['image_url_list'])
                fields = {
                    # Field names must match the Feishu table schema.
                    '原文链接': {  # "original link"
                        'link': note['content_link'],
                        'text': note['content_link'],
                    },
                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),  # "crawl result"
                }
                cls.feishu.create_record(table_id, fields)
        except Exception:
            logger.error(traceback.format_exc())
        finally:
            if container.container_id:  # stop the container only if it was started
                container.stop_container()


if __name__ == '__main__':
    Crawler.run('阿比西尼亚猫')  # example keyword: "Abyssinian cat"