zhonglaonianyule_recommend_scheduling.py

# -*- coding: utf-8 -*-
# @Author: luojunhui
# @Time: 2023/10/10
import json
import os
import random
import sys
import time
from hashlib import md5  # used only by the commented-out id hashing below

import requests

# Make the project root importable before pulling in the common package.
sys.path.append(os.getcwd())
from common.common import Common
from common.mq import MQ
from common.scheduling_db import MysqlHelper
from common.public import get_config_from_mysql, download_rule

proxies = {"http": None, "https": None}
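
# Crawler for the "中老年娱乐" (middle-aged/elderly entertainment) WeChat
# mini-program: fetches paginated video lists per category, filters items
# against the crawl rules and configured filter words, de-duplicates against
# the crawler_video table, resolves playable URLs, and pushes each accepted
# item onto the topic_crawler_etl_<env> MQ topic.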

def clean_title(strings):
    # Strip whitespace and drop characters that are unsafe in titles/filenames;
    # ASCII periods become full-width so a title cannot look like an extension.
    return (
        strings.strip()
        .replace("\n", "")
        .replace("/", "")
        .replace("\r", "")
        .replace("#", "")
        .replace(".", "。")
        .replace("\\", "")
        .replace("&NBSP", "")
        .replace(":", "")
        .replace("*", "")
        .replace("?", "")
        .replace("？", "")  # full-width question mark
        .replace('"', "")
        .replace("<", "")
        .replace(">", "")
        .replace("|", "")
        .replace(" ", "")
        .replace("＂", "")  # full-width double quote
        .replace("'", "")
    )
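
# Example: clean_title(" Fun/Clips? \n") returns "FunClips".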

class ZLNYLScheduling:
    def __init__(self, log_type, crawler, category, rule_dict, env):
        self.platform = "中老年娱乐"
        self.log_type = log_type
        self.crawler = crawler
        self.category = category
        self.rule_dict = rule_dict
        self.env = env
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.download_count = 0
    def repeat_video(self, video_id):
        # De-duplication check: a non-zero count means this out_video_id was
        # already ingested for this crawler or platform.
        sql = f""" select * from crawler_video where platform in ("{self.crawler}","{self.platform}") and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(
            self.log_type, self.crawler, sql, self.env
        )
        return len(repeat_video)
    # Fetch one page of the video list for this category.
    def get_videoList(self, page_id):
        url = "https://kkj.xinhuachuanmeijs.com/app/index.php?i=299&t=0&m=jyt_txvideo&v=1.0&from=wxapp&c=entry&a=wxapp&do=videolist&"
        headers = {
            "Host": "kkj.xinhuachuanmeijs.com",
            "accept": "*/*",
            "content-type": "application/x-www-form-urlencoded",
            "accept-language": "zh-cn",
            "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E217 MicroMessenger/6.8.0(0x16080000) NetType/WIFI Language/en Branch/Br_trunk MiniProgramEnv/Mac",
            "referer": "https://servicewechat.com/wx546222d9b2fe5fc0/3/page-frame.html",
            "Cookie": "PHPSESSID=ef4e78382296a0db2021ecd6e35c614f",
        }
        payload = "category={}&page={}&israndom=1&type=4&noauth=true".format(
            self.category, page_id
        )
        response = requests.post(url, headers=headers, data=payload)
        if "data" not in response.text or response.status_code != 200:
            Common.logger(self.log_type, self.crawler).info(
                f"get_videoList:{response.text}\n"
            )
            Common.logging(
                self.log_type,
                self.crawler,
                self.env,
                f"get_videoList:{response.text}\n",
            )
            return
        elif len(response.json()["data"]) == 0:
            Common.logger(self.log_type, self.crawler).info("No more data.\n")
            Common.logging(self.log_type, self.crawler, self.env, "No more data.\n")
            return
        else:
            data_list = response.json()["data"]
            for video_obj in data_list:
                try:
                    video_id = video_obj.get("vid", "")
                    video_title = clean_title(video_obj.get("vtitle", ""))
                    video_time = video_obj.get("v_time", 0)
                    publish_time_stamp = int(time.time())
                    publish_time_str = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
                    )
                    user_name = ""
                    video_dict = {
                        "video_title": video_title,
                        "video_id": video_id,
                        "duration": video_time,
                        "play_cnt": 0,
                        "like_cnt": 0,
                        "comment_cnt": 0,
                        "share_cnt": 0,
                        "user_name": user_name,
                        "publish_time_stamp": publish_time_stamp,
                        "publish_time_str": publish_time_str,
                        "video_width": 0,
                        "video_height": 0,
                        "profile_id": 0,
                        "profile_mid": 0,
                        # "cover_url": "",
                        "session": f"zhonglaonianyule-{int(time.time())}",
                    }
                    for k, v in video_dict.items():
                        Common.logger(self.log_type, self.crawler).info(f"{k}:{v}")
                    Common.logging(
                        self.log_type, self.crawler, self.env, f"{video_dict}"
                    )
                    # Skip videos with no title or id.
                    if video_title == "" or video_dict["video_id"] == "":
                        Common.logger(self.log_type, self.crawler).info("Invalid video.\n")
                        Common.logging(self.log_type, self.crawler, self.env, "Invalid video.\n")
                    # Apply the basic crawl rules.
                    elif not download_rule(
                        log_type=self.log_type,
                        crawler=self.crawler,
                        video_dict=video_dict,
                        rule_dict=self.rule_dict,
                    ):
                        Common.logger(self.log_type, self.crawler).info("Does not satisfy the crawl rules.\n")
                        Common.logging(
                            self.log_type, self.crawler, self.env, "Does not satisfy the crawl rules.\n"
                        )
                    # Drop titles containing any configured filter word.
                    elif any(
                        str(word) in video_dict["video_title"]
                        for word in get_config_from_mysql(
                            log_type=self.log_type,
                            source=self.crawler,
                            env=self.env,
                            text="filter",
                            action="",
                        )
                    ):
                        Common.logger(self.log_type, self.crawler).info("Title hit a filter word.\n")
                        Common.logging(self.log_type, self.crawler, self.env, "Title hit a filter word.\n")
                    elif self.repeat_video(video_dict["video_id"]) != 0:
                        Common.logger(self.log_type, self.crawler).info("Video already downloaded.\n")
                        Common.logging(self.log_type, self.crawler, self.env, "Video already downloaded.\n")
                    else:
                        # out_video_id = md5(video_title.encode('utf8')).hexdigest()
                        # out_user_id = md5(user_name.encode('utf8')).hexdigest()
                        video_dict["out_user_id"] = video_dict["profile_id"]
                        video_dict["platform"] = self.crawler
                        video_dict["strategy"] = self.log_type
                        video_dict["out_video_id"] = video_dict["video_id"]
                        video_dict["width"] = video_dict["video_width"]
                        video_dict["height"] = video_dict["video_height"]
                        video_dict["crawler_rule"] = json.dumps(self.rule_dict)
                        video_dict["user_id"] = "-1"
                        video_dict["publish_time"] = video_dict["publish_time_str"]
                        # Resolve the playable URL and cover, then push to the ETL queue.
                        d_obj = self.find_video_url(video_id)
                        video_dict["video_url"] = d_obj["url"]
                        video_dict["avatar_url"] = d_obj["cover"]
                        video_dict["cover_url"] = d_obj["cover"]
                        # print(json.dumps(video_dict, ensure_ascii=False, indent=4))
                        self.mq.send_msg(video_dict)
                except Exception as e:
                    Common.logger(self.log_type, self.crawler).error(f"Exception while crawling a single video: {e}\n")
                    Common.logging(
                        self.log_type, self.crawler, self.env, f"Exception while crawling a single video: {e}\n"
                    )
    # Resolve the playable video URL and cover image for a vid from videolist.
    def find_video_url(self, video_id):
        # NOTE: state and sign are hardcoded session values captured from the
        # mini-program; they may expire and need refreshing.
        url = "https://kkj.xinhuachuanmeijs.com/app/index.php?i=299&t=0&m=jyt_txvideo&v=1.0&from=wxapp&c=entry&a=wxapp&do=videoinfo&state=we7sid-f0008a08276fc324921185dc74427c56&sign=fa36387242169f01aa747a80d49c8670&vid={}&version=1.0.3".format(
            video_id
        )
        headers = {
            "Host": "kkj.xinhuachuanmeijs.com",
            "xweb_xhr": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
            "content-type": "application/x-www-form-urlencoded",
            "accept": "*/*",
            "sec-fetch-site": "cross-site",
            "sec-fetch-mode": "cors",
            "sec-fetch-dest": "empty",
            "referer": "https://servicewechat.com/wx546222d9b2fe5fc0/3/page-frame.html",
            "accept-language": "en",
        }
        response = requests.get(url, headers=headers).json()
        video_url = response["data"]["res"]
        video_cover = response["data"]["cover"]
        Common.logger(self.log_type, self.crawler).info(
            "Fetched video url for {}\n".format(response["data"]["vtitle"])
        )
        Common.logging(
            self.log_type,
            self.crawler,
            self.env,
            "Fetched video url for {}\n".format(response["data"]["vtitle"]),
        )
        # Random pause between requests to avoid hammering the API.
        time.sleep(random.randint(3, 5))
        self.download_count += 1
        return {"url": video_url, "cover": video_cover}

if __name__ == "__main__":
    ZL = ZLNYLScheduling(
        log_type="recommend", crawler="zlnyl", category=3615, rule_dict={}, env="dev"
    )
    for i in range(4):
        ZL.get_videoList(i + 1)
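    # Dev smoke run: pages are 1-indexed, so this walks the first four pages
    # of category 3615 with an empty rule_dict.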