zhufuzhonglaonianrenruyijixiang_scheduling.py 8.9 KB

# -*- coding: utf-8 -*-
# @Author: luojunhui
# @Time: 2023/10/13
import json
import os
import random
import sys
import time

import requests

# Make the project root importable before loading the shared "common" modules.
sys.path.append(os.getcwd())
from common.mq import MQ
from common.common import Common
from common.scheduling_db import MysqlHelper
from common.public import get_config_from_mysql, download_rule

# Bypass any system-level HTTP(S) proxy for outgoing requests.
proxies = {"http": None, "https": None}


def clean_title(strings):
    # Strip characters that are illegal in file names or break downstream parsing.
    return (
        strings.strip()
        .replace("\n", "")
        .replace("/", "")
        .replace("\r", "")
        .replace("#", "")
        .replace(".", "。")
        .replace("\\", "")
        .replace("&NBSP", "")
        .replace(":", "")
        .replace("*", "")
        .replace("?", "")
        .replace("？", "")  # full-width question mark
        .replace('"', "")
        .replace("<", "")
        .replace(">", "")
        .replace("|", "")
        .replace(" ", "")
        .replace("“", "")  # curly quotes
        .replace("”", "")
        .replace("'", "")
    )
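
# Example (illustrative): clean_title(" 祝福/大家 #健康. ") returns "祝福大家健康。"
# (outer whitespace stripped; "/", "#", and spaces removed; "." mapped to "。").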


class ZFZLNRRYJXScheduling:
    """Scheduled crawler for the 祝福中老年人如意吉祥 mini-program video feed."""

    def __init__(self, log_type, crawler, rule_dict, env, our_uid):
        self.platform = "祝福中老年人如意吉祥"
        self.log_type = log_type
        self.crawler = crawler
        self.rule_dict = rule_dict
        self.env = env
        self.our_uid = our_uid
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.download_count = 0

    def repeat_video(self, video_id):
        sql = f""" select * from crawler_video where platform in ("{self.crawler}","{self.platform}") and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(
            self.log_type, self.crawler, sql, self.env
        )
        return len(repeat_video)
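
    # repeat_video() returns a row count, so any non-zero value means this
    # out_video_id is already in crawler_video. The id is interpolated straight
    # into the SQL text; assuming MysqlHelper supports query parameters, a
    # placeholder query would be safer against malformed ids.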

    # Fetch one page of the video list from the mini-program API.
    def get_videoList(self, page_id):
        time.sleep(random.randint(5, 10))
        url = "https://api.newboqing.top/index.php?s=mobile/Video/getList&cid=1&page={}&api_version=4&appid=wx65490b687b7c9892&version=1.9.0&env_version=release&scene=1053".format(
            page_id
        )
        headers = {
            "Host": "api.newboqing.top",
            # "ik": "b326b5062b2f0e69046810717534cb09",
            "xweb_xhr": "1",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
            "token": "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE2OTcxNzY4NDYsIm5iZiI6MTY5NzE3Njg0NiwiZXhwIjoxNjk3MTg0MDQ2LCJkYXRhIjp7InVzZXJfaWQiOiIyNTA0MjkwNyJ9fQ.BVA0CZG2mHrsHBVWTvQVrlZkH0TWExdTRxHXp5hJyxk",
            "Content-Type": "application/json",
            "Accept": "*/*",
            "Referer": "https://servicewechat.com/wx65490b687b7c9892/9/page-frame.html",
            "Accept-Language": "en",
        }
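        # Note: the "token" header is a hard-coded JWT whose payload carries
        # exp=1697184046 (2023-10-13, two hours after iat), so it expires
        # quickly and must be refreshed by hand. Once stale, the API appears
        # to answer with msg "未登录" (not logged in), handled below.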
        # data = {
        #     "time": "1696991482000",
        #     "str_data": "A7ZHUDdb",
        #     "page": str(page_id),
        #     "limit": str(page_limit),
        #     "appid": "wx742465c143d1bd2b",
        #     "version": "1.4.2",
        #     "openid": "ogEOH5cHAMpi8qrWle_vjtaqT6zw",
        # }
        response = requests.post(url, headers=headers, proxies=proxies)
        if "data" not in response.text or response.status_code != 200:
            Common.logger(self.log_type, self.crawler).info(
                f"get_videoList:{response.text}\n"
            )
            Common.logging(
                self.log_type,
                self.crawler,
                self.env,
                f"get_videoList:{response.text}\n",
            )
            return
        elif len(response.json()["data"]["list"]) == 0:
            Common.logger(self.log_type, self.crawler).info("没有更多数据啦~\n")
            Common.logging(self.log_type, self.crawler, self.env, "没有更多数据啦~\n")
            return
        elif response.json()["msg"] == "未登录":
            Common.logger(self.log_type, self.crawler).info("token 过期啦~\n")
            Common.logging(self.log_type, self.crawler, self.env, "token 过期啦~\n")
            return
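        # All three early returns abort the whole page: an opaque/non-200
        # response, an empty list, or an expired token. An expired token will
        # make every later page fail the same way, so a caller that can
        # refresh credentials would ideally stop paging at that point.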
        else:
            data_list = response.json()["data"]["list"]
            for video_obj in data_list:
                try:
                    self.process_video_obj(video_obj)
                except Exception as e:
                    Common.logger(self.log_type, self.crawler).error(
                        f"抓取单条视频异常:{e}\n"
                    )
                    Common.logging(
                        self.log_type, self.crawler, self.env, f"抓取单条视频异常:{e}\n"
                    )

    def process_video_obj(self, video_obj):
        video_id = video_obj.get("id", 0)
        video_title = clean_title(video_obj.get("title", "no title"))
        video_time = video_obj.get("v_time", 0)
        # The publish time is not taken from the API; the crawl time is used.
        publish_time_stamp = int(time.time())
        publish_time_str = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
        )
        user_name = ""
        video_dict = {
            "video_title": video_title,
            "video_id": video_id,
            "duration": video_time,
            "play_cnt": video_obj.get("visited", 0),
            # Like/comment counts are not taken from the API and default to 0.
            "like_cnt": 0,
            "comment_cnt": 0,
            "share_cnt": video_obj.get("shared", 0),
            "user_name": user_name,
            "publish_time_stamp": publish_time_stamp,
            "publish_time_str": publish_time_str,
            "video_width": 0,
            "video_height": 0,
            "profile_id": 0,
            "profile_mid": 0,
            # "cover_url": "",
            "session": f"zhufuzhonglaonianrenruyijixiang-{int(time.time())}",
        }
        for k, v in video_dict.items():
            Common.logger(self.log_type, self.crawler).info(f"{k}:{v}")
        Common.logging(self.log_type, self.crawler, self.env, f"{video_dict}")
        # Skip invalid videos.
        if video_title == "" or video_dict["video_id"] == "":
            Common.logger(self.log_type, self.crawler).info("无效视频\n")
            Common.logging(self.log_type, self.crawler, self.env, "无效视频\n")
        # Apply the base crawl rules from rule_dict.
        elif (
            download_rule(
                log_type=self.log_type,
                crawler=self.crawler,
                video_dict=video_dict,
                rule_dict=self.rule_dict,
            )
            is False
        ):
            Common.logger(self.log_type, self.crawler).info("不满足抓取规则\n")
            Common.logging(self.log_type, self.crawler, self.env, "不满足抓取规则\n")
        # Drop titles that contain any configured filter word.
        elif any(
            str(word) in video_dict["video_title"]
            for word in get_config_from_mysql(
                log_type=self.log_type,
                source=self.crawler,
                env=self.env,
                text="filter",
                action="",
            )
        ):
            Common.logger(self.log_type, self.crawler).info("已中过滤词\n")
            Common.logging(self.log_type, self.crawler, self.env, "已中过滤词\n")
        elif self.repeat_video(video_dict["video_id"]) != 0:
            Common.logger(self.log_type, self.crawler).info("视频已下载\n")
            Common.logging(self.log_type, self.crawler, self.env, "视频已下载\n")
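        # Ordering note: the filter-word branch queries MySQL config for every
        # video, and the dedupe branch is another round trip; fetching the
        # filter list once per page and reusing it would cut per-video cost
        # (a possible optimization, not implemented here).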
        else:
            # out_video_id = md5(video_title.encode('utf8')).hexdigest()
            # out_user_id = md5(user_name.encode('utf8')).hexdigest()
            video_dict["out_user_id"] = video_dict["profile_id"]
            video_dict["platform"] = self.crawler
            video_dict["strategy"] = self.log_type
            video_dict["out_video_id"] = str(video_dict["video_id"])
            video_dict["width"] = video_dict["video_width"]
            video_dict["height"] = video_dict["video_height"]
            video_dict["crawler_rule"] = json.dumps(self.rule_dict)
            video_dict["user_id"] = self.our_uid
            video_dict["publish_time"] = video_dict["publish_time_str"]
            video_dict["video_url"] = video_obj["video_url"]
            video_dict["avatar_url"] = video_obj["end_cover"]
            video_dict["cover_url"] = video_obj["video_cover"]
            # print(json.dumps(video_dict, ensure_ascii=False, indent=4))
            self.download_count += 1
            self.mq.send_msg(video_dict)
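
    # send_msg() publishes the record to the topic_crawler_etl_<env> queue;
    # download_count therefore counts messages queued, not files confirmed
    # downloaded.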


if __name__ == "__main__":
    ZL = ZFZLNRRYJXScheduling(
        log_type="recommend",
        crawler="zfzlnrjxry",
        rule_dict={},
        our_uid="luojunhuihaoshuai",
        env="dev",
    )
    for i in range(4):
        ZL.get_videoList(page_id=i + 1)
    print(ZL.download_count)