ganggangdouchuan_recommend2.py

# -*- coding: utf-8 -*-
# @Author: luojunhui
# @Time: 2023/10/10
import json
import os
import random
import sys
import time
from base64 import b64encode, b64decode

import requests
from Crypto.Cipher import AES
from Crypto.Hash import MD5
from Crypto.Util.Padding import pad, unpad

# Make the project-local "common" package importable before importing from it.
sys.path.append(os.getcwd())
from common.mq import MQ
from common.common import Common
from common.scheduling_db import MysqlHelper
from common.public import get_config_from_mysql, download_rule_v2

# Run requests without any system-level proxy.
proxies = {"http": None, "https": None}


def clean_title(strings):
    """Strip whitespace and remove characters that are unsafe in titles."""
    return (
        strings.strip()
        .replace("\n", "")
        .replace("/", "")
        .replace("\r", "")
        .replace("#", "")
        .replace(".", "。")
        .replace("\\", "")
        .replace("&NBSP", "")
        .replace(":", "")
        .replace("*", "")
        .replace("?", "")
        .replace("？", "")
        .replace('"', "")
        .replace("<", "")
        .replace(">", "")
        .replace("|", "")
        .replace(" ", "")
        .replace("“", "")
        .replace("'", "")
    )
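
# Illustrative example (made-up title):
#     clean_title(' 广场舞/教学 #热门 "第8集"?\n')  ->  "广场舞教学热门第8集"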


def decrypt(a, e, n):
    """AES-CBC helper: decrypt base64 ciphertext a when n is truthy, otherwise
    encrypt plaintext a to base64. Key material is derived from the MD5 hex
    digest of e (last 16 hex chars -> key, first 16 -> IV)."""
    e = MD5.new(e.encode()).hexdigest()
    key = e[16:].encode()
    iv = e[:16].encode()
    cipher = AES.new(key, AES.MODE_CBC, iv)
    if n:
        encrypted_data = b64decode(a)
        decrypted_data = unpad(cipher.decrypt(encrypted_data), AES.block_size)
        return decrypted_data.decode()
    else:
        padded_data = pad(a.encode(), AES.block_size)
        encrypted_data = cipher.encrypt(padded_data)
        return b64encode(encrypted_data).decode()
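
# Minimal round-trip sketch (the key string is hypothetical, not one used by
# the API):
#     token = decrypt("hello", "demo-key", False)  # encrypt -> base64 ciphertext
#     assert decrypt(token, "demo-key", True) == "hello"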


def find_tencent_url(tx_vid):
    """Resolve a Tencent Video vid to a direct playback URL via the h5vv
    getinfo endpoint."""
    headers = {
        "Host": "h5vv.video.qq.com",
        "xweb_xhr": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
        "Content-Type": "application/x-www-form-urlencoded",
        "Accept": "*/*",
        "Sec-Fetch-Site": "cross-site",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://servicewechat.com/wx5fcd817f3f80aece/3/page-frame.html",
        "Accept-Language": "en",
    }
    url = "https://h5vv.video.qq.com/getinfo?vid={}&platform=101001&charge=0&otype=json&defn=shd".format(
        tx_vid
    )
    response = requests.get(url, headers=headers)
    # The endpoint returns JSONP ("QZOutputJson=...;"); strip the wrapper.
    result = json.loads(response.text.replace("QZOutputJson=", "")[:-1])
    vl = result["vl"]["vi"][0]
    key = vl["fvkey"]
    name = vl["fn"]
    folder = vl["ul"]["ui"][0]["url"]
    return folder + name + "?vkey=" + key
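
# Usage sketch (the vid below is a made-up placeholder):
#     play_url = find_tencent_url("a1234bcdefg")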


class GGDCScheduling:
    def __init__(self, log_type, crawler, rule_dict, env, our_uid):
        self.platform = "刚刚都传"
        self.log_type = log_type
        self.crawler = crawler
        self.rule_dict = rule_dict
        self.env = env
        self.our_uid = our_uid
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.download_count = 0

    def repeat_video(self, video_id):
        """Return how many crawler_video rows already hold this out_video_id."""
        sql = f""" select * from crawler_video where platform in ("{self.crawler}","{self.platform}") and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(
            self.log_type, self.crawler, sql, self.env
        )
        return len(repeat_video)
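
    # Dedup sketch: a non-zero count means the video is already in
    # crawler_video, so process_video_obj skips it, e.g.
    #     if self.repeat_video(video_dict["video_id"]) != 0: ...skip...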

    # Fetch one page of the recommend feed and process each video.
    def get_videoList(self, page_id):
        time.sleep(random.randint(5, 10))
        url = "https://ganggangdouchuan2.mengniu99.com/api/getcatevideos"
        params = {
            "cateid": "video",
            "page": page_id,
            "timeline": 0,
            "version": "9.0.2",
            # "timestamp": 1697700674000,
            # "openid": "oNnpe5SwkfGtD5aJAaRbsIKIEdjc",
        }
        headers = {
            "Host": "ganggangdouchuan2.mengniu99.com",
            # "Authorization": "oNnpe5SwkfGtD5aJAaRbsIKIEdjc",
            "xweb_xhr": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
            # "Sign": "88ab143a9401ebac3525562073248231",
            "Content-Type": "application/json",
            "Accept": "*/*",
            "Sec-Fetch-Site": "cross-site",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Dest": "empty",
            "Referer": "https://servicewechat.com/wx5fcd817f3f80aece/3/page-frame.html",
            "Accept-Language": "en",
        }
        # The payload is AES-encrypted; request and retry until it decrypts
        # cleanly. The request sits inside the loop so each retry fetches a
        # fresh response instead of re-decrypting the same bad one forever.
        while True:
            response = requests.get(url, headers=headers, params=params)
            try:
                decrypted_data = decrypt(
                    response.json()["data"], response.json()["_yyy"], True
                )
                result = json.loads(decrypted_data)
                break
            except Exception:
                print("编码不对,解密失败, 等待10秒钟")
                time.sleep(10)
        if "totalCount" not in result or response.status_code != 200:
            Common.logger(self.log_type, self.crawler).info(
                f"get_videoList:{response.text}\n"
            )
            Common.logging(
                self.log_type,
                self.crawler,
                self.env,
                f"get_videoList:{response.text}\n",
            )
            return
        elif len(result["videos"]) == 0:
            Common.logger(self.log_type, self.crawler).info("没有更多数据啦~\n")
            Common.logging(self.log_type, self.crawler, self.env, "没有更多数据啦~\n")
            return
        else:
            for video_obj in result["videos"]:
                try:
                    self.process_video_obj(video_obj)
                except Exception as e:
                    Common.logger(self.log_type, self.crawler).error(
                        f"抓取单条视频异常:{e}\n"
                    )
                    Common.logging(
                        self.log_type, self.crawler, self.env, f"抓取单条视频异常:{e}\n"
                    )

    def process_video_obj(self, video_obj):
        video_id = video_obj.get("videoid", 0)
        video_title = clean_title(video_obj.get("title", "no title"))
        video_time = video_obj.get("v_time", 0)
        publish_time_stamp = int(time.time())
        publish_time_str = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
        )
        user_name = video_obj["nickname"]
        video_dict = {
            "video_title": video_title,
            "video_id": video_id,
            "duration": video_time,
            "play_cnt": 0,
            "like_cnt": 0,
            "comment_cnt": 0,
            "share_cnt": 0,
            "user_name": user_name,
            "publish_time_stamp": publish_time_stamp,
            "publish_time_str": publish_time_str,
            "video_width": 0,
            "video_height": 0,
            "profile_id": 0,
            "profile_mid": 0,
            "cover_url": video_obj["cover"],
            "session": f"ganggangdouchuan-{int(time.time())}",
        }
        for k, v in video_dict.items():
            Common.logger(self.log_type, self.crawler).info(f"{k}:{v}")
        Common.logging(self.log_type, self.crawler, self.env, f"{video_dict}")
        # Skip invalid videos
        if video_title == "" or video_dict["video_id"] == "":
            Common.logger(self.log_type, self.crawler).info("无效视频\n")
            Common.logging(self.log_type, self.crawler, self.env, "无效视频\n")
        # Apply the basic crawl rules
        elif (
            download_rule_v2(
                log_type=self.log_type,
                crawler=self.crawler,
                video_dict=video_dict,
                rule_dict=self.rule_dict,
            )
            is False
        ):
            Common.logger(self.log_type, self.crawler).info("不满足抓取规则\n")
            Common.logging(self.log_type, self.crawler, self.env, "不满足抓取规则\n")
        # Drop titles that contain a configured filter word
        elif any(
            str(word) in video_dict["video_title"]
            for word in get_config_from_mysql(
                log_type=self.log_type,
                source=self.crawler,
                env=self.env,
                text="filter",
                action="",
            )
        ):
            Common.logger(self.log_type, self.crawler).info("已中过滤词\n")
            Common.logging(self.log_type, self.crawler, self.env, "已中过滤词\n")
        elif self.repeat_video(video_dict["video_id"]) != 0:
            Common.logger(self.log_type, self.crawler).info("视频已下载\n")
            Common.logging(self.log_type, self.crawler, self.env, "视频已下载\n")
        else:
            video_dict["out_user_id"] = video_dict["profile_id"]
            video_dict["platform"] = self.crawler
            video_dict["strategy"] = self.log_type
            video_dict["out_video_id"] = str(video_dict["video_id"])
            video_dict["width"] = video_dict["video_width"]
            video_dict["height"] = video_dict["video_height"]
            video_dict["crawler_rule"] = json.dumps(self.rule_dict)
            video_dict["user_id"] = self.our_uid
            video_dict["publish_time"] = video_dict["publish_time_str"]
            video_dict["video_url"] = find_tencent_url(video_obj["txvid"])
            video_dict["avatar_url"] = video_obj["avatarurl"]
            video_dict["cover_url"] = video_obj["cover"]
            self.download_count += 1
            self.mq.send_msg(video_dict)


if __name__ == "__main__":
    ZL = GGDCScheduling(
        log_type="recommend",
        crawler="ggdc",
        rule_dict={},
        our_uid="luojunhuihaoshuai",
        env="dev",
    )
    for i in range(5):
        ZL.get_videoList(page_id=i + 1)
        print(ZL.download_count)