# shayuzhufu.py
"""
鲨鱼祝福——原鲸鱼祝福
"""
  4. import os
  5. import json
  6. import random
  7. import sys
  8. import time
  9. import uuid
  10. import requests
  11. import datetime
  12. from base64 import b64encode, b64decode
  13. from Crypto.Cipher import AES
  14. from Crypto.Util.Padding import pad, unpad
  15. sys.path.append(os.getcwd())
  16. from application.items import VideoItem
  17. from application.pipeline import PiaoQuanPipeline
  18. from application.common.messageQueue import MQ
  19. from application.common.proxies import tunnel_proxies
  20. from application.common.log import AliyunLogger
  21. from application.common.mysql import MysqlHelper
  22. class SharkAES(object):
  23. """
  24. 鲨鱼祝福 aes 解密
  25. """
  26. def __init__(self):
  27. self.key = 'xlc2ze7qnqg8xi1d'.encode() # 需要一个bytes类型的key
  28. self.iv = self.key # 在这个例子中,key和iv是相同的
  29. def encrypt(self, data):
  30. cipher = AES.new(self.key, AES.MODE_CBC, self.iv)
  31. ct_bytes = cipher.encrypt(pad(data.encode('utf-8'), AES.block_size))
  32. ct = b64encode(ct_bytes).decode()
  33. return ct
  34. def decrypt(self, data):
  35. try:
  36. ct = b64decode(data.encode('utf-8'))
  37. cipher = AES.new(self.key, AES.MODE_CBC, self.iv)
  38. pt = unpad(cipher.decrypt(ct), AES.block_size)
  39. return pt.decode()
  40. except Exception as e:
  41. print("Incorrect decryption")
  42. return None
  43. class SharkZhuFuRecommend(object):
  44. """
  45. 鲨鱼祝福推荐爬虫
  46. """
  47. def __init__(self, platform, mode, rule_dict, user_list, env):
  48. self.platform = platform
  49. self.mode = mode
  50. self.rule_dict = rule_dict
  51. self.user_list = user_list
  52. self.env = env
  53. self.download_cnt = 0
  54. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  55. self.limit_flag = False
  56. self.cryptor = SharkAES()
  57. self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)
  58. self.mysql = MysqlHelper(platform=self.platform, mode=self.mode)
  59. def get_video_list(self):
  60. """
  61. :return: 获取视频列表
  62. """
  63. base_url = "https://shanhu.nnapi.cn/videos/api.videos/getItem"
  64. headers = {
  65. "Host": "shanhu.nnapi.cn",
  66. "xweb_xhr": "1",
  67. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.5(0x13080510)XWEB/1100",
  68. "content-type": "application/json",
  69. "accept": "*/*",
  70. "sec-fetch-site": "cross-site",
  71. "sec-fetch-mode": "cors",
  72. "sec-fetch-dest": "empty",
  73. "referer": "https://servicewechat.com/wx7444f6906dbd46b1/2/page-frame.html",
  74. "accept-language": "en-US,en;q=0.9",
  75. "Cookie": "PHPSESSID=562dc39e8e68ad3e76c237f687bd049b; lang=zh-cn",
  76. }
  77. for i in range(100):
  78. time.sleep(random.randint(1, 10))
  79. try:
  80. if self.limit_flag:
  81. self.aliyun_log.logging(
  82. code="2000",
  83. message="本轮已经抓取到足够的数据,自动退出\t{}".format(self.download_cnt),
  84. )
  85. return
  86. else:
  87. params = {"mark": "", "page": i + 1}
  88. response = requests.get(
  89. url=base_url,
  90. headers=headers,
  91. params=params,
  92. )
  93. encrypted_info = response.json()["data"]
  94. decrypted_info = json.loads(
  95. self.cryptor.decrypt(data=encrypted_info)
  96. )
  97. video_list = decrypted_info["list"]
  98. for index, video_obj in enumerate(video_list, 1):
  99. try:
  100. self.aliyun_log.logging(
  101. code="1001",
  102. message="扫描到一条视频",
  103. data=video_obj,
  104. )
  105. self.process_video_obj(video_obj)
  106. except Exception as e:
  107. self.aliyun_log.logging(
  108. code="3000",
  109. data=video_obj,
  110. message="抓取第{}条的时候出现问题, 报错信息是{}".format(index, e),
  111. )
  112. except Exception as e:
  113. self.aliyun_log.logging(
  114. code="3000",
  115. message="抓取第{}页时候出现错误, 报错信息是{}".format(i + 1, e),
  116. )
  117. def process_video_obj(self, video_obj):
  118. """
  119. :param video_obj: 视频 obj
  120. :return: None
  121. """
  122. trace_id = self.platform + str(uuid.uuid1())
  123. our_user = random.choice(self.user_list)
  124. publish_time_stamp = datetime.datetime.strptime(
  125. video_obj["create_at"], "%Y-%m-%d %H:%M:%S"
  126. ).timestamp()
  127. item = VideoItem()
  128. item.add_video_info("user_id", our_user["uid"])
  129. item.add_video_info("user_name", our_user["nick_name"])
  130. item.add_video_info("video_id", video_obj["id"])
  131. item.add_video_info("video_title", video_obj["name"])
  132. item.add_video_info("publish_time_str", video_obj["create_at"])
  133. item.add_video_info("publish_time_stamp", int(publish_time_stamp))
  134. item.add_video_info("video_url", video_obj["cover"])
  135. item.add_video_info(
  136. "cover_url", video_obj["cover"] + "&vframe/png/offset/1/w/200"
  137. )
  138. item.add_video_info("like_cnt", video_obj["num_like"])
  139. item.add_video_info("play_cnt", video_obj["num_read"])
  140. item.add_video_info("comment_cnt", video_obj["num_comment"])
  141. item.add_video_info("out_video_id", video_obj["id"])
  142. item.add_video_info("platform", self.platform)
  143. item.add_video_info("strategy", self.mode)
  144. item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
  145. mq_obj = item.produce_item()
  146. pipeline = PiaoQuanPipeline(
  147. platform=self.platform,
  148. mode=self.mode,
  149. rule_dict=self.rule_dict,
  150. env=self.env,
  151. item=mq_obj,
  152. trace_id=trace_id,
  153. )
  154. if pipeline.process_item():
  155. self.download_cnt += 1
  156. # print(mq_obj)
  157. self.mq.send_msg(mq_obj)
  158. self.aliyun_log.logging(
  159. code="1002",
  160. message="成功发送至 ETL",
  161. data=mq_obj,
  162. )
  163. if self.download_cnt >= int(
  164. self.rule_dict.get("videos_cnt", {}).get("min", 200)
  165. ):
  166. self.limit_flag = True
  167. def run(self):
  168. """
  169. 执行函数
  170. """
  171. self.get_video_list()
# if __name__ == '__main__':
#     S = SharkZhuFuRecommend(
#         platform="shanhuzhufu",
#         mode="recommend",
#         env="dev",
#         rule_dict={},
#         user_list=[{'nick_name': "Ivring", 'uid': "1997"}, {'nick_name': "paul", 'uid': "1998"}]
#     )
#     S.get_video_list()