# zhonglaonianyule_recommend_scheduling.py
# -*- coding: utf-8 -*-
# @Author: luojunhui
# @Time: 2023/10/10
import json
import os
import random
import sys
import time
import requests
from common.mq import MQ
# Make project-local "common" packages importable when run as a script.
sys.path.append(os.getcwd())
from common.common import Common
from common.scheduling_db import MysqlHelper
from common.public import get_config_from_mysql, download_rule

# Explicitly disable any system proxy for outbound HTTP requests.
proxies = {"http": None, "https": None}
  16. def clean_title(strings):
  17. return (
  18. strings.strip()
  19. .replace("\n", "")
  20. .replace("/", "")
  21. .replace("\r", "")
  22. .replace("#", "")
  23. .replace(".", "。")
  24. .replace("\\", "")
  25. .replace("&NBSP", "")
  26. .replace(":", "")
  27. .replace("*", "")
  28. .replace("?", "")
  29. .replace("?", "")
  30. .replace('"', "")
  31. .replace("<", "")
  32. .replace(">", "")
  33. .replace("|", "")
  34. .replace(" ", "")
  35. .replace('"', "")
  36. .replace("'", "")
  37. )
  38. class ZLNYLScheduling:
  39. def __init__(self, log_type, crawler, category, rule_dict, env):
  40. self.platform = "中老年娱乐"
  41. self.log_type = log_type
  42. self.crawler = crawler
  43. self.category = category
  44. self.rule_dict = rule_dict
  45. self.env = env
  46. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  47. def repeat_video(self, video_id):
  48. sql = f""" select * from crawler_video where platform in ("{self.crawler}","{self.platform}") and out_video_id="{video_id}"; """
  49. repeat_video = MysqlHelper.get_values(
  50. self.log_type, self.crawler, sql, self.env
  51. )
  52. return len(repeat_video)
  53. # 获取视频id_list
  54. def get_videoList(self, page_id):
  55. url = "https://kkj.xinhuachuanmeijs.com/app/index.php?i=299&t=0&m=jyt_txvideo&v=1.0&from=wxapp&c=entry&a=wxapp&do=videolist&"
  56. headers = {
  57. "Host": "kkj.xinhuachuanmeijs.com",
  58. "accept": "*/*",
  59. "content-type": "application/x-www-form-urlencoded",
  60. "accept-language": "zh-cn",
  61. "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E217 MicroMessenger/6.8.0(0x16080000) NetType/WIFI Language/en Branch/Br_trunk MiniProgramEnv/Mac",
  62. "referer": "https://servicewechat.com/wx546222d9b2fe5fc0/3/page-frame.html",
  63. "Cookie": "PHPSESSID=ef4e78382296a0db2021ecd6e35c614f",
  64. }
  65. payload = "category={}&page={}&israndom=1&type=4&noauth=true".format(
  66. self.category, page_id
  67. )
  68. response = requests.request("POST", url, headers=headers, data=payload)
  69. if "data" not in response.text or response.status_code != 200:
  70. Common.logger(self.log_type, self.crawler).info(
  71. f"get_videoList:{response.text}\n"
  72. )
  73. Common.logging(
  74. self.log_type,
  75. self.crawler,
  76. self.env,
  77. f"get_videoList:{response.text}\n",
  78. )
  79. return
  80. elif len(response.json()["data"]) == 0:
  81. Common.logger(self.log_type, self.crawler).info(f"没有更多数据啦~\n")
  82. Common.logging(self.log_type, self.crawler, self.env, f"没有更多数据啦~\n")
  83. return
  84. else:
  85. data_list = response.json()["data"]
  86. for video_obj in data_list:
  87. try:
  88. video_id = video_obj.get("vid", 0)
  89. video_title = clean_title(video_obj.get("vtitle", 0))
  90. video_time = video_obj.get("v_time", 0)
  91. publish_time_stamp = int(time.time())
  92. publish_time_str = time.strftime(
  93. "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
  94. )
  95. user_name = ""
  96. video_dict = {
  97. "video_title": video_title,
  98. "video_id": video_id,
  99. "duration": video_time,
  100. "play_cnt": 0,
  101. "like_cnt": 0,
  102. "comment_cnt": 0,
  103. "share_cnt": 0,
  104. "user_name": user_name,
  105. "publish_time_stamp": publish_time_stamp,
  106. "publish_time_str": publish_time_str,
  107. "video_width": 0,
  108. "video_height": 0,
  109. "profile_id": 0,
  110. "profile_mid": 0,
  111. "cover_url": "",
  112. "session": f"zhonglaonianyule-{int(time.time())}",
  113. }
  114. for k, v in video_dict.items():
  115. Common.logger(self.log_type, self.crawler).info(f"{k}:{v}")
  116. Common.logging(
  117. self.log_type, self.crawler, self.env, f"{video_dict}"
  118. )
  119. # 过滤无效视频
  120. if video_title == "" or video_dict["video_id"] == "":
  121. Common.logger(self.log_type, self.crawler).info("无效视频\n")
  122. Common.logging(self.log_type, self.crawler, self.env, "无效视频\n")
  123. # 抓取基础规则过滤
  124. elif (
  125. download_rule(
  126. log_type=self.log_type,
  127. crawler=self.crawler,
  128. video_dict=video_dict,
  129. rule_dict=self.rule_dict,
  130. )
  131. is False
  132. ):
  133. Common.logger(self.log_type, self.crawler).info("不满足抓取规则\n")
  134. Common.logging(
  135. self.log_type, self.crawler, self.env, "不满足抓取规则\n"
  136. )
  137. elif (
  138. any(
  139. str(word)
  140. if str(word) in video_dict["video_title"]
  141. else False
  142. for word in get_config_from_mysql(
  143. log_type=self.log_type,
  144. source=self.crawler,
  145. env=self.env,
  146. text="filter",
  147. action="",
  148. )
  149. )
  150. is True
  151. ):
  152. Common.logger(self.log_type, self.crawler).info("已中过滤词\n")
  153. Common.logging(self.log_type, self.crawler, self.env, "已中过滤词\n")
  154. elif self.repeat_video(video_dict["video_id"]) != 0:
  155. Common.logger(self.log_type, self.crawler).info("视频已下载\n")
  156. Common.logging(self.log_type, self.crawler, self.env, "视频已下载\n")
  157. else:
  158. video_dict["out_user_id"] = video_dict["profile_id"]
  159. video_dict["platform"] = self.crawler
  160. video_dict["strategy"] = self.log_type
  161. video_dict["out_video_id"] = video_dict["video_id"]
  162. video_dict["width"] = video_dict["video_width"]
  163. video_dict["height"] = video_dict["video_height"]
  164. video_dict["crawler_rule"] = json.dumps(self.rule_dict)
  165. video_dict["user_id"] = ""
  166. video_dict["publish_time"] = video_dict["publish_time_str"]
  167. d_obj = self.find_video_url(video_id)
  168. video_dict["video_url"] = d_obj["url"]
  169. video_dict["avatar_url"] = d_obj["cover"]
  170. # print(json.dumps(video_dict, ensure_ascii=False, indent=4))
  171. self.mq.send_msg(video_dict)
  172. except Exception as e:
  173. Common.logger(self.log_type, self.crawler).error(f"抓取单条视频异常:{e}\n")
  174. Common.logging(
  175. self.log_type, self.crawler, self.env, f"抓取单条视频异常:{e}\n"
  176. )
  177. def find_video_url(self, video_id):
  178. url = "https://kkj.xinhuachuanmeijs.com/app/index.php?i=299&t=0&m=jyt_txvideo&v=1.0&from=wxapp&c=entry&a=wxapp&do=videoinfo&state=we7sid-f0008a08276fc324921185dc74427c56&sign=fa36387242169f01aa747a80d49c8670&vid={}&version=1.0.3".format(
  179. video_id
  180. )
  181. headers = {
  182. "Host": "kkj.xinhuachuanmeijs.com",
  183. "xweb_xhr": "1",
  184. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
  185. "content-type": "application/x-www-form-urlencoded",
  186. "accept": "*/*",
  187. "sec-fetch-site": "cross-site",
  188. "sec-fetch-mode": "cors",
  189. "sec-fetch-dest": "empty",
  190. "referer": "https://servicewechat.com/wx546222d9b2fe5fc0/3/page-frame.html",
  191. "accept-language": "en",
  192. }
  193. response = requests.get(url, headers=headers).json()
  194. video_url = response["data"]["res"]
  195. video_cover = response["data"]["cover"]
  196. Common.logger(self.log_type, self.crawler).info(
  197. "{}成功抓取视频链接\n".format(response["data"]["vtitle"])
  198. )
  199. Common.logging(
  200. self.log_type,
  201. self.crawler,
  202. self.env,
  203. "{}成功抓取视频链接\n".format(response["data"]["vtitle"]),
  204. )
  205. time.sleep(random.randint(3, 5))
  206. return {"url": video_url, "cover": video_cover}
  207. if __name__ == "__main__":
  208. ZL = ZLNYLScheduling(
  209. log_type="recommend", crawler="zlnyl", category=3615, rule_dict={}, env="dev"
  210. )
  211. for i in range(4):
  212. ZL.get_videoList(i + 1)