# zhufusonglaoyou_recommend.py
  1. # -*- coding: utf-8 -*-
  2. # @Author: luojunhui
  3. # @Time: 2023/12/14
  4. import json
  5. import os
  6. import random
  7. import sys
  8. import time
  9. import uuid
  10. import requests
  11. from Crypto.Cipher import AES
  12. from Crypto.Hash import MD5
  13. from Crypto.Util.Padding import pad, unpad
  14. from base64 import b64encode, b64decode
  15. from common.mq import MQ
  16. sys.path.append(os.getcwd())
  17. from common.common import Common
  18. from common.aliyun_log import AliyunLogger
  19. from common.pipeline import PiaoQuanPipeline
  20. from common.public import clean_title
  21. def decrypt(a, e, n):
  22. e = MD5.new(e.encode()).hexdigest()
  23. key = e[16:].encode()
  24. iv = e[:16].encode()
  25. cipher = AES.new(key, AES.MODE_CBC, iv)
  26. if n:
  27. encrypted_data = b64decode(a)
  28. decrypted_data = unpad(cipher.decrypt(encrypted_data), AES.block_size)
  29. return decrypted_data.decode()
  30. else:
  31. padded_data = pad(a.encode(), AES.block_size)
  32. encrypted_data = cipher.encrypt(padded_data)
  33. return b64encode(encrypted_data).decode()
  34. def find_tencent_url(tx_vid):
  35. headers = {
  36. "Host": "h5vv.video.qq.com",
  37. "xweb_xhr": "1",
  38. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
  39. "Content-Type": "application/x-www-form-urlencoded",
  40. "Accept": "*/*",
  41. "Sec-Fetch-Site": "cross-site",
  42. "Sec-Fetch-Mode": "cors",
  43. "Sec-Fetch-Dest": "empty",
  44. "Referer": "https://servicewechat.com/wx5fcd817f3f80aece/3/page-frame.html",
  45. "Accept-Language": "en",
  46. }
  47. video_id = tx_vid
  48. url = "https://h5vv.video.qq.com/getinfo?vid={}&platform=101001&charge=0&otype=json&defn=shd".format(video_id)
  49. response = requests.get(url, headers=headers)
  50. result = json.loads(response.text.replace("QZOutputJson=", "")[:-1])
  51. vl = result["vl"]["vi"][0]
  52. key = vl["fvkey"]
  53. name = vl["fn"]
  54. folder = vl["ul"]["ui"][0]["url"]
  55. url = folder + name + "?vkey=" + key
  56. return url
  57. class ZFSLYScheduling:
  58. def __init__(self, log_type, crawler, rule_dict, env, our_uid):
  59. self.platform = "zhufusonglaoyou"
  60. self.log_type = log_type
  61. self.crawler = crawler
  62. self.rule_dict = rule_dict
  63. self.env = env
  64. self.our_uid = our_uid
  65. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  66. self.download_count = 0
  67. # 获取视频id_list
  68. def get_videoList(self, page_id):
  69. time.sleep(random.randint(5, 10))
  70. url = "https://zhufusonglaoyou2.mengniu99.com/api/getcatevideos"
  71. params = {
  72. "cateid": "video",
  73. "page": page_id,
  74. "timeline": 0,
  75. "version": "9.0.2",
  76. }
  77. headers = {
  78. 'Host': 'zhufusonglaoyou2.mengniu99.com',
  79. 'xweb_xhr': '1',
  80. 'Authorization': 'o7hOQ5XsP-OtIuOK8qAXe368o45E',
  81. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.5(0x13080510)XWEB/1100',
  82. 'Sign': '2c694618acd1218cb0876a825165ca45',
  83. 'Content-Type': 'application/json',
  84. 'Accept': '*/*',
  85. 'Sec-Fetch-Site': 'cross-site',
  86. 'Sec-Fetch-Mode': 'cors',
  87. 'Sec-Fetch-Dest': 'empty',
  88. 'Referer': 'https://servicewechat.com/wx5b38d01fa06bba64/4/page-frame.html',
  89. 'Accept-Language': 'en-US,en;q=0.9'
  90. }
  91. while True:
  92. try:
  93. response = requests.get(url, headers=headers, params=params)
  94. decrypted_data = decrypt(
  95. response.json()["data"][:-2], response.json()["_yyy"], True
  96. )
  97. result = json.loads(decrypted_data)
  98. AliyunLogger.logging(
  99. code="1000",
  100. platform=self.crawler,
  101. mode=self.log_type,
  102. env=self.env,
  103. data={},
  104. message="开始抓取第{}页".format(page_id),
  105. )
  106. break
  107. except:
  108. AliyunLogger.logging(
  109. code="2000",
  110. platform=self.crawler,
  111. mode=self.log_type,
  112. env=self.env,
  113. data={},
  114. message="抓取第{}页,未获取数据,编码错误".format(page_id),
  115. )
  116. Common.logger(self.log_type, self.crawler).info("编码不对,解密失败\n")
  117. return
  118. if "totalCount" not in result:
  119. Common.logger(self.log_type, self.crawler).info(
  120. f"get_videoList:{response.text}\n"
  121. )
  122. AliyunLogger.logging(
  123. code="2000",
  124. platform=self.crawler,
  125. mode=self.log_type,
  126. env=self.env,
  127. data={},
  128. message="抓取第{}页,未获取数据".format(page_id),
  129. )
  130. return
  131. elif len(result["videos"]) == 0:
  132. Common.logger(self.log_type, self.crawler).info(f"没有更多数据啦~\n")
  133. AliyunLogger.logging(
  134. code="2000",
  135. platform=self.crawler,
  136. mode=self.log_type,
  137. env=self.env,
  138. data={},
  139. message="抓取第{}页,没有更多数据啦".format(page_id),
  140. )
  141. return
  142. else:
  143. data_list = result["videos"]
  144. for index, video_obj in enumerate(data_list):
  145. try:
  146. AliyunLogger.logging(
  147. code="1001",
  148. platform=self.crawler,
  149. mode=self.log_type,
  150. env=self.env,
  151. data={},
  152. message="成功扫描到一条视频, 该视频位于第{}页{}条".format(page_id, index + 1),
  153. )
  154. self.process_video_obj(video_obj)
  155. except Exception as e:
  156. Common.logger(self.log_type, self.crawler).error(f"抓取单条视频异常:{e}\n")
  157. AliyunLogger.logging(
  158. code="3000",
  159. platform=self.crawler,
  160. mode=self.log_type,
  161. env=self.env,
  162. data=video_obj,
  163. message="抓取单条视频异常, 报错原因是: {}, 该视频位于第{}页{}条".format(
  164. e, page_id, index + 1
  165. ),
  166. )
  167. AliyunLogger.logging(
  168. code="1000",
  169. platform=self.crawler,
  170. mode=self.log_type,
  171. env=self.env,
  172. data={},
  173. message="完成抓取第{}页".format(page_id),
  174. )
  175. def process_video_obj(self, video_obj):
  176. trace_id = self.platform + str(uuid.uuid1())
  177. video_id = video_obj.get("videoid", 0)
  178. video_title = clean_title(video_obj.get("title", "no title"))
  179. video_time = video_obj.get("v_time", 0)
  180. publish_time_stamp = int(time.time())
  181. publish_time_str = time.strftime(
  182. "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
  183. )
  184. user_name = video_obj["nickname"]
  185. video_dict = {
  186. "video_title": video_title,
  187. "video_id": video_id,
  188. "duration": video_time,
  189. "play_cnt": 0,
  190. "like_cnt": 0,
  191. "comment_cnt": 0,
  192. "share_cnt": 0,
  193. "user_name": user_name,
  194. "publish_time_stamp": publish_time_stamp,
  195. "publish_time_str": publish_time_str,
  196. "update_time_stamp": int(time.time()),
  197. "video_width": 0,
  198. "video_height": 0,
  199. "profile_id": 0,
  200. "profile_mid": 0,
  201. "cover_url": video_obj["cover"],
  202. "session": f"ganggangdouchuan-{int(time.time())}",
  203. }
  204. video_dict["out_video_id"] = str(video_dict["video_id"])
  205. rule_pipeline = PiaoQuanPipeline(
  206. platform=self.platform,
  207. mode=self.log_type,
  208. rule_dict=self.rule_dict,
  209. env=self.env,
  210. item=video_dict,
  211. trace_id=trace_id
  212. )
  213. flag = rule_pipeline.process_item()
  214. if flag:
  215. video_dict["out_user_id"] = video_dict["profile_id"]
  216. video_dict["platform"] = self.crawler
  217. video_dict["strategy"] = self.log_type
  218. video_dict["width"] = video_dict["video_width"]
  219. video_dict["height"] = video_dict["video_height"]
  220. video_dict["crawler_rule"] = json.dumps(self.rule_dict)
  221. video_dict["user_id"] = self.our_uid
  222. video_dict["publish_time"] = video_dict["publish_time_str"]
  223. video_dict["video_url"] = find_tencent_url(video_obj["txvid"])
  224. video_dict["avatar_url"] = video_obj["avatarurl"]
  225. video_dict["cover_url"] = video_obj["cover"]
  226. self.download_count += 1
  227. self.mq.send_msg(video_dict)
  228. AliyunLogger.logging(
  229. code="1002",
  230. platform=self.crawler,
  231. mode=self.log_type,
  232. env=self.env,
  233. data=video_dict,
  234. trace_id=trace_id,
  235. message="成功发送 MQ 至 ETL",
  236. )
  237. if __name__ == "__main__":
  238. ZL =ZFSLYScheduling(
  239. log_type="recommend",
  240. crawler="zhufusonglaoyou",
  241. rule_dict={},
  242. our_uid="luojunhuihaoshuai",
  243. env="dev",
  244. )
  245. for i in range(1):
  246. ZL.get_videoList(page_id=i + 1)
  247. print(ZL.download_count)