fetch.py

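"""Xiaohongshu keyword crawler.

Searches notes for a keyword through an internal crawler API, re-hosts every
image (download via proxy -> convert to JPEG -> upload to OSS), and writes
each note as a record into a per-keyword Feishu table.
"""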
import json
import os
import traceback
import uuid
from typing import List, Mapping, Optional

import requests
from loguru import logger

from utils.container import Container
from utils.fei_shu import FeiShu


class Crawler(object):
    feishu = FeiShu()

    @classmethod
    def _search_keyword(cls, keyword: str):
        url = 'https://crawler.aiddit.com/crawler/xiao_hong_shu/keyword_v2'
        payload = {
            'keyword': keyword,
            'content_type': '图文',  # image-and-text notes only
            'sort_type': '最热',  # sort by popularity
            'cursor': '',
        }
        response = requests.post(url=url, json=payload)
        obj = response.json()
        if obj['code'] != 0:
            raise ValueError('API response code is not 0')
        obj = obj['data']
        if not obj['data']:
            raise ValueError('Search result is empty')
        return obj['data']
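
    # Expected response envelope, inferred from the checks above and the
    # fields consumed in run() (the exact schema beyond these keys is an
    # assumption):
    # {
    #     "code": 0,
    #     "data": {
    #         "cursor": "...",
    #         "data": [
    #             {"channel_content_id": "...", "title": "...",
    #              "content_link": "...",
    #              "image_url_list": [{"image_url": "..."}, ...]},
    #             ...
    #         ]
    #     }
    # }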

    @classmethod
    def _handle_images(cls, container: Container, image_list: List[Mapping]):
        length = len(image_list)
        for i in range(length):
            logger.info(f'Processing image {i + 1}/{length}...')
            new_image_url = None
            for _ in range(10):  # retry up to 10 times
                download_path = cls._download_image(container, image_list[i]['image_url'])
                if not download_path:
                    continue
                convert_path = cls._convert_image(container, download_path)
                if not convert_path:
                    continue
                new_image_url = cls._upload_image(container, convert_path)
                break
            if new_image_url:  # on failure, keep the original image URL
                image_list[i]['image_url'] = new_image_url
        logger.info('Image processing finished')
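
    # Note: the image dicts are mutated in place, so the re-hosted URLs end up
    # in the note object that run() later serializes into the Feishu record.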

    @classmethod
    def _download_image(cls, container: Container, image_url: str) -> Optional[str]:
        # Download the image into the container
        save_path = f'/tmp/{uuid.uuid4()}'
        command = [
            'curl',
            '-L',  # follow redirects
            '-f',  # fail (write no file) on HTTP errors
            '-x', os.getenv('DYNAMIC_HTTP_PROXY'),  # proxy; assumed to be set in the environment
            '-o', save_path,
            image_url,
            '-H', 'Accept: image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
            '-H', 'Accept-Language: zh-CN,zh;q=0.9',
            '-H', 'Cache-Control: no-cache',
            '-H', 'Origin: https://www.xiaohongshu.com',
            '-H', 'Pragma: no-cache',
            '-H', 'Referer: https://www.xiaohongshu.com/',
            '-H', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
        ]
        exit_code, msg, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None
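
    # The Origin/Referer/User-Agent headers mimic an in-browser request;
    # the image CDN presumably rejects bare requests (hotlink protection),
    # and the dynamic proxy likely helps avoid IP-level rate limits.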

    @classmethod
    def _convert_image(cls, container: Container, origin_path: str) -> Optional[str]:
        # Convert the image to JPEG
        save_path = f'/tmp/{uuid.uuid4()}.jpg'
        command = [
            'ffmpeg',
            '-i', origin_path,
            '-frames:v', '1',  # write a single frame
            save_path,
        ]
        exit_code, msg, _ = container.run_command(command)
        if exit_code == 0:
            return save_path
        return None
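
    # ffmpeg normalizes whatever format the CDN returns (WebP, AVIF, PNG...,
    # codec support depending on the build); '-frames:v 1' keeps only the
    # first frame, so animated sources collapse to a still JPEG.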

    @classmethod
    def _upload_image(cls, container: Container, convert_path: str) -> str:
        # Upload the image to the target OSS bucket and return a permanent URL
        oss_object_key = container.upload_oss(
            bucket_name='art-pubbucket',
            object_key=f'pipeline/image/{uuid.uuid4()}.jpg',
            container_file_path=convert_path,
            media_type='image/jpeg',
        )
        return f'http://rescdn.yishihui.com/{oss_object_key}'

    @classmethod
    def run(cls, keyword: str):
        container = Container()
        try:
            container.start_container()
            search_result = cls._search_keyword(keyword)
            table_id = cls.feishu.create_table(keyword)
            for note in search_result:
                if not note.get('image_url_list', []):
                    continue
                logger.info(
                    f'Note ID: {note["channel_content_id"]}, '
                    f'title: {note["title"]}, '
                    f'images: {len(note["image_url_list"])}'
                )
                cls._handle_images(container, note['image_url_list'])
                fields = {
                    '原文链接': {  # "source link" (Feishu field name)
                        'link': note['content_link'],
                        'text': note['content_link'],
                    },
                    '抓取结果': json.dumps(note, ensure_ascii=False, indent=4),  # "crawl result" (Feishu field name)
                }
                cls.feishu.create_record(table_id, fields)
        except Exception:
            logger.error(traceback.format_exc())
        finally:
            if container.container_id:
                container.stop_container()


if __name__ == '__main__':
    Crawler.run('阿比西尼亚猫')  # example keyword: "Abyssinian cat"
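
# To run directly (assumes utils.container.Container provisions a sandbox with
# curl and ffmpeg available, utils.fei_shu.FeiShu wraps the Feishu table API,
# and DYNAMIC_HTTP_PROXY points at a usable HTTP proxy; the proxy URL below is
# illustrative only):
#     DYNAMIC_HTTP_PROXY=http://127.0.0.1:8888 python fetch.py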