# fetch.py
import json
import os
import time
import traceback
import uuid
from typing import List, Mapping, Optional

import requests
from loguru import logger

from utils.container import Container
from utils.fei_shu import FeiShu
  10. class Crawler(object):
  11. feishu = FeiShu()
  12. @classmethod
  13. def _search_keyword(cls, keyword: str):
  14. url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
  15. payload = {
  16. 'keyword': keyword,
  17. 'content_type': '图文',
  18. 'sort_type': '最热',
  19. 'cursor': '',
  20. }
  21. response = requests.post(url=url, json=payload)
  22. obj = response.json()
  23. if obj['code'] != 0:
  24. raise ValueError('接口响应状态不为0')
  25. obj = obj['data']
  26. if not obj['data']:
  27. raise ValueError('搜索结果为空')
  28. return obj['data']
  29. @classmethod
  30. def _handle_images(cls, container: Container, image_list: List[Mapping]):
  31. length = len(image_list)
  32. for i in range(length):
  33. logger.info(f'正在处理图片 {i + 1}/{length}...')
  34. new_image_url = None
  35. for _ in range(10): # 最多尝试10次
  36. download_path = cls._download_image(container, image_list[i]['image_url'])
  37. if not download_path:
  38. continue
  39. convert_path = cls._convert_image(container, download_path)
  40. if not convert_path:
  41. continue
  42. new_image_url = cls._upload_image(container, convert_path)
  43. break
  44. if new_image_url: # 失败就使用原图链接
  45. image_list[i]['image_url'] = new_image_url
  46. @classmethod
  47. def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
  48. # 下载图片
  49. save_path = f'/tmp/{str(uuid.uuid4())}'
  50. command = [
  51. 'curl',
  52. '-L', # 跟随重定向
  53. '-f', # 请求失败时不写入文件
  54. '-x', os.getenv('DYNAMIC_HTTP_PROXY'),
  55. '-o', save_path,
  56. image_url,
  57. '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
  58. '-H', 'Accept-Language: zh-CN,zh;q=0.9',
  59. '-H', 'Cache-Control: no-cache',
  60. '-H', 'Origin: https://www.xiaohongshu.com',
  61. '-H', 'Pragma: no-cache',
  62. '-H', 'Referer: https://www.xiaohongshu.com/',
  63. '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
  64. '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
  65. ]
  66. exit_code, msg, _ = container.run_command(command)
  67. if exit_code == 0:
  68. return save_path
  69. return None
  70. @classmethod
  71. def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
  72. # 将图片转为jpg格式
  73. save_path = f'/tmp/{str(uuid.uuid4())}.jpg'
  74. command = [
  75. 'ffmpeg',
  76. '-i', origin_path,
  77. '-frames:v', '1',
  78. save_path,
  79. ]
  80. exit_code, msg, _ = container.run_command(command)
  81. if exit_code == 0:
  82. return save_path
  83. return None
  84. @classmethod
  85. def _upload_image(cls, container: Container, convert_path: str) -> str:
  86. # 将图片上传到指定oss并返回永久链接
  87. oss_object_key = container.upload_oss(
  88. bucket_name='art-pubbucket',
  89. object_key=f'pipeline/image/{str(uuid.uuid4())}.jpg',
  90. container_file_path=convert_path,
  91. media_type='image/jpg',
  92. )
  93. return f'http://rescdn.yishihui.com/{oss_object_key}'
  94. @classmethod
  95. def run(cls, keyword: str):
  96. container = Container()
  97. try:
  98. container.start_container()
  99. search_result = cls._search_keyword(keyword)
  100. table_id = cls.feishu.create_table(keyword)
  101. for note in search_result:
  102. if not note.get('image_url_list', []):
  103. continue
  104. logger.info(f'笔记ID: {note["channel_content_id"]}, 标题: {note["title"]}, 图片数量: {len(note["image_url_list"])}')
  105. cls._handle_images(container, note['image_url_list'])
  106. fields = {
  107. '原文链接': {
  108. 'link': note['content_link'],
  109. 'text': note['content_link'],
  110. },
  111. '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),
  112. }
  113. cls.feishu.create_record(table_id, fields)
  114. except Exception:
  115. logger.error(traceback.format_exc())
  116. finally:
  117. if container.container_id:
  118. container.stop_container()
  119. if __name__ == '__main__':
  120. Crawler.run('阿比西尼亚猫')