zhuhaoshiduomo.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. import os
  2. import json
  3. import random
  4. import sys
  5. import time
  6. import uuid
  7. import requests
  8. from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
  9. from cryptography.hazmat.backends import default_backend
  10. sys.path.append(os.getcwd())
  11. from application.items import VideoItem
  12. from application.pipeline import PiaoQuanPipelineTest
  13. from application.common.messageQueue import MQ
  14. from application.common.proxies import tunnel_proxies
  15. class AESCipher:
  16. def __init__(self):
  17. self.key = b'50102fa64073ad76' # 用适当的方式转换或直接定义为字节串
  18. self.iv = b'173d023138824bb0' # 同上
  19. def aes_encrypt(self, data):
  20. cipher = Cipher(algorithms.AES(self.key), modes.CBC(self.iv), backend=default_backend())
  21. encryptor = cipher.encryptor()
  22. ct = encryptor.update(self._pad(data).encode()) + encryptor.finalize()
  23. return ct.hex().upper()
  24. def aes_decrypt(self, data):
  25. cipher = Cipher(algorithms.AES(self.key), modes.CBC(self.iv), backend=default_backend())
  26. decryptor = cipher.decryptor()
  27. decrypted_data = decryptor.update(bytes.fromhex(data)) + decryptor.finalize()
  28. return self._unpad(decrypted_data).decode()
  29. def _pad(self, s):
  30. return s + (16 - len(s) % 16) * chr(16 - len(s) % 16)
  31. def _unpad(self, s):
  32. return s[:-ord(s[len(s) - 1:])]
  33. class ZhuHaoShiDuoMoRecommend(object):
  34. """
  35. 祝好事多磨小程序爬虫,测试版本
  36. """
  37. def __init__(self, platform, mode, rule_dict, user_list, env):
  38. self.platform = platform
  39. self.mode = mode
  40. self.rule_dict = rule_dict
  41. self.user_list = user_list
  42. self.env = env
  43. self.download_cnt = 0
  44. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  45. self.expire_flag = False
  46. self.cryptor = AESCipher()
  47. def get_recommend_list(self):
  48. """
  49. 获取推荐流
  50. :return:
  51. """
  52. url = "https://api.lidongze.cn/jeecg-boot/ugc/getVideoListsEn2"
  53. headers = {
  54. 'Host': 'api.lidongze.cn',
  55. 'xweb_xhr': '1',
  56. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.4(0x13080410)XWEB/31009',
  57. 'token': '',
  58. 'content-type': 'application/json',
  59. 'accept': '*/*',
  60. 'referer': 'https://servicewechat.com/wx0afdc2669ed8df2f/3/page-frame.html',
  61. 'accept-language': 'en-US,en;q=0.9'
  62. }
  63. page_index = 1
  64. total_page = 2
  65. while page_index <= total_page:
  66. query = {
  67. "pageNo": page_index,
  68. "pageSize": 10,
  69. "groupId": "1650323161797439489", # 推荐流的 ID
  70. "vn": 1,
  71. "gx": 1,
  72. "appid": "wx0afdc2669ed8df2f",
  73. "type": 0
  74. }
  75. params = {
  76. "v": self.cryptor.aes_encrypt(data=json.dumps(query))
  77. }
  78. response = requests.request("GET", url, headers=headers, params=params, proxies=tunnel_proxies())
  79. result = json.loads(self.cryptor.aes_decrypt(response.text))
  80. total_page = result['list']['pages']
  81. page_index = result['list']['current'] + 1
  82. for index, video_obj in enumerate(result['list']['records']):
  83. self.process_video_obj(video_obj)
  84. def process_video_obj(self, video_obj):
  85. """
  86. 处理视频信息,清洗,规范化,发送至ETL
  87. :param video_obj: 视频信息
  88. :return: None
  89. """
  90. trace_id = self.platform + str(uuid.uuid1())
  91. play_cnt = int(video_obj['playnum'].replace("万+", "0000")) if "万+" in video_obj['playnum'] else int(
  92. video_obj['playnum'])
  93. item = VideoItem()
  94. user_dict = random.choice(self.user_list)
  95. item.add_video_info("video_id", video_obj['id'])
  96. item.add_video_info("video_title", video_obj['vname'])
  97. item.add_video_info("play_cnt", play_cnt)
  98. item.add_video_info("publish_time_stamp", int(time.time()))
  99. item.add_video_info("out_user_id", video_obj['authid'])
  100. item.add_video_info("cover_url", video_obj['shareimg'])
  101. item.add_video_info("like_cnt", int(video_obj['likenum']))
  102. item.add_video_info("video_url", video_obj['videoaddr'])
  103. item.add_video_info("out_video_id", video_obj['id'])
  104. item.add_video_info("platform", self.platform)
  105. item.add_video_info("strategy", self.mode)
  106. item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
  107. item.add_video_info("user_id", user_dict['uid'])
  108. item.add_video_info("user_name", user_dict['link'])
  109. mq_obj = item.produce_item()
  110. pipeline = PiaoQuanPipelineTest(
  111. platform=self.platform,
  112. mode=self.mode,
  113. rule_dict=self.rule_dict,
  114. env=self.env,
  115. item=mq_obj,
  116. trace_id=trace_id,
  117. )
  118. if pipeline.process_item():
  119. print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
  120. self.download_cnt += 1
  121. print(self.download_cnt)
  122. def run(self):
  123. """
  124. 执行函数
  125. """
  126. self.get_recommend_list()
  127. if __name__ == '__main__':
  128. Z = ZhuHaoShiDuoMoRecommend(
  129. platform="zhuwanwufusu",
  130. mode="recommend",
  131. rule_dict={},
  132. user_dict={"uid": 123456, "nick_name": "luojunhuishuaige"},
  133. env="prod"
  134. )
  135. Z.get_recommend_list()