xiaoniangao.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. """
  2. @Author : luojunhui
  3. 小年糕账号爬虫
  4. """
  5. import os
  6. import sys
  7. import json
  8. import time
  9. import uuid
  10. import random
  11. import asyncio
  12. import aiohttp
  13. import datetime
  14. sys.path.append(os.getcwd())
  15. from application.items import VideoItem
  16. from application.pipeline import PiaoQuanPipeline
  17. from application.common.messageQueue import MQ
  18. from application.common.log import AliyunLogger
  19. class XiaoNianGaoAuthor(object):
  20. """
  21. 小年糕账号爬虫
  22. """
  def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
      """
      Store the crawl configuration and build the message-queue and logging helpers.

      :param platform: platform identifier; also handed to AliyunLogger
      :param mode: crawl mode; also handed to AliyunLogger
      :param rule_dict: crawl rules, stored as given
      :param user_list: accounts to crawl, stored as given
      :param env: environment name, appended to the MQ topic name (default "prod")
      """
      # Plain configuration supplied by the caller.
      self.platform, self.mode = platform, mode
      self.rule_dict, self.user_list = rule_dict, user_list
      self.env = env
      # Download counter starts at zero.
      self.download_cnt = 0
      # One ETL topic per environment.
      topic = "topic_crawler_etl_" + env
      self.mq = MQ(topic_name=topic)
      self.expire_flag = False
      self.aliyun_log = AliyunLogger(platform=platform, mode=mode)
  def split_accounts(self):
      """
      Return the accounts to crawl.

      Intended to pick the important accounts out of ``user_list``; the current
      implementation applies no filtering and hands back ``self.user_list`` itself.

      :return: the same object stored in ``self.user_list``
      """
      accounts = self.user_list
      return accounts
  async def get_user_videos(self, user_dict):
      """
      Page through one Xiaoniangao account's public albums, printing every response.

      Each page is requested with a POST to the ``user_public`` endpoint. The
      loop now ends when a page is empty or the paging cursor stops advancing;
      previously it re-sent the first-page request forever because the cursor
      was never updated and there was no exit condition.

      :param user_dict: account record; ``user_dict["link"]`` must be
          convertible to ``int`` and is sent as ``visited_mid``
      :return: None
      :raises KeyError: if ``user_dict`` has no ``"link"`` entry
      :raises ValueError: if ``user_dict["link"]`` is not an integer string
      """
      url = "https://kapi-xng-app.xiaoniangao.cn/v1/album/user_public"
      headers = {
          'Host': 'kapi-xng-app.xiaoniangao.cn',
          'content-type': 'application/json; charset=utf-8',
          'accept': '*/*',
          'authorization': 'hSNQ2s9pvPxvFn4LaQJxKQ6/7Is=',
          'verb': 'POST',
          'content-md5': 'c7b7f8663984e8800e3bcd9b44465083',
          'x-b3-traceid': '2f9da41f960ae077',
          'accept-language': 'zh-cn',
          'date': 'Mon, 19 Jun 2023 06:41:17 GMT',
          'x-token-id': '',
          'x-signaturemethod': 'hmac-sha1',
          'user-agent': 'xngapp/157 CFNetwork/1335.0.3.1 Darwin/21.6.0'
      }
      # The account id is the same for every page, so convert it once.
      visited_mid = int(user_dict["link"])
      async with aiohttp.ClientSession() as session:
          # -1 asks for the first page.
          next_index = -1
          # Only newly published videos are wanted.
          # TODO: the early exit on reaching already-crawled videos is still not implemented.
          while True:
              payload = {
                  "token": "",
                  "limit": 20,
                  "start_t": next_index,
                  "visited_mid": visited_mid,
                  "share_width": 300,
                  "share_height": 240,
              }
              async with session.post(
                  url, headers=headers, data=json.dumps(payload)
              ) as response:
                  data = await response.json()
              print(data)
              # NOTE(review): assumes the body looks like
              # {"data": {"list": [...], "next_t": <cursor>}} - confirm against
              # the live API. If the shape differs, the loop stops after one page.
              body = data.get("data") if isinstance(data, dict) else None
              if not isinstance(body, dict) or not body.get("list"):
                  break
              cursor = body.get("next_t")
              # A missing or unchanged cursor would repeat the same request.
              if cursor is None or cursor == next_index:
                  break
              next_index = cursor
  async def scan_important_accounts(self, accounts):
      """
      Crawl a batch of important accounts concurrently.

      One ``get_user_videos`` coroutine is created per account and all of them
      are awaited together.

      :param accounts: iterable of account records, each passed to ``get_user_videos``
      :return: None
      """
      await asyncio.gather(*map(self.get_user_videos, accounts))
  80. async def run(self):
  81. """
  82. 控制函数代码
  83. :return:
  84. """
  85. self.split_acoounts()