jixiangzhufuweinichuandihaoyun_recommend.py

# -*- coding: utf-8 -*-
# @Author: zhangyong
# @Time: 2023/12/04
import json
import os
import random
import sys
import time

import requests

# Make sure the project root is importable before pulling in the common package.
sys.path.append(os.getcwd())
from common import get_redirect_url
from common.mq import MQ
from common.common import Common
from common.scheduling_db import MysqlHelper
from common.public import get_config_from_mysql, download_rule

# Explicitly disable system proxies for all outgoing requests.
proxies = {"http": None, "https": None}
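
# Remove whitespace and punctuation (both ASCII and full-width variants) that
# would break file names or downstream handling of the video title.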
def clean_title(strings):
    return (
        strings.strip()
        .replace("\n", "")
        .replace("/", "")
        .replace("\r", "")
        .replace("#", "")
        .replace(".", "。")
        .replace("\\", "")
        .replace("&NBSP", "")
        .replace(":", "")
        .replace("*", "")
        .replace("?", "")
        .replace("？", "")
        .replace('"', "")
        .replace("<", "")
        .replace(">", "")
        .replace("|", "")
        .replace(" ", "")
        .replace("“", "")
        .replace("'", "")
    )
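
# Recommend-feed crawler for the "吉祥祝福为你传递好运" WeChat mini-program:
# pages through the feed, filters each item against the configured rules,
# and pushes qualifying videos onto the ETL message queue.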
class Jxzfwncdhyspcheduling:
    def __init__(self, log_type, crawler, rule_dict, env, our_uid):
        self.platform = "吉祥祝福为你传递好运"
        self.log_type = log_type
        self.crawler = crawler
        self.rule_dict = rule_dict
        self.env = env
        self.our_uid = our_uid
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.download_count = 0
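
    # Return the number of rows already stored in crawler_video for this
    # out_video_id, so callers can skip videos that were collected before.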
    def repeat_video(self, video_id):
        sql = f""" select * from crawler_video where platform in ("{self.crawler}","{self.platform}") and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(
            self.log_type, self.crawler, sql, self.env
        )
        return len(repeat_video)

    # Page through the recommend feed and hand each item off to process_video_obj.
    def get_videoList(self):
        for i in range(1, 10):
            time.sleep(random.randint(5, 10))
            url = "https://api.xiahong.top/index.php?s=mobile/Home/getHomeList&page={}&appid=wx7457ce7bf3cdbdbf&version=1.9.1&env_version=release&scene=1008".format(i)
            headers = {
                'Host': 'api.xiahong.top',
                'Content-Type': 'application/json',
                'Accept-Language': 'zh-cn',
                'Accept': '*/*',
                'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E217 MicroMessenger/6.8.0(0x16080000) NetType/WIFI Language/en Branch/Br_trunk MiniProgramEnv/Mac',
                'Referer': 'https://servicewechat.com/wx49e7ec4c849fb4e2/2/page-frame.html',
                # NOTE: short-lived JWT captured from the mini-program; it must
                # be refreshed once it expires.
                'token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MDI0MzU3NjUsIm5iZiI6MTcwMjQzNTc2NSwiZXhwIjoxNzAyNDQyOTY1LCJkYXRhIjp7InVzZXJfaWQiOiIyNDQ3NzI2NzMifX0.yqz9rShAXYSGEonLtCu5h8geosw7dEO-2unxvmmugZE',
                'ik': 'b326b5062b2f0e69046810717534cb09'
            }
            response = requests.post(url, headers=headers, proxies=proxies, timeout=30)
            if response.status_code != 200 or "data" not in response.text:
                Common.logger(self.log_type, self.crawler).info(
                    f"get_videoList:{response.text}\n"
                )
                Common.logging(
                    self.log_type,
                    self.crawler,
                    self.env,
                    f"get_videoList:{response.text}\n",
                )
                return
            data_list = response.json()["data"]["list"]
            if len(data_list) == 0:
                Common.logger(self.log_type, self.crawler).info("没有更多数据啦~\n")
                Common.logging(self.log_type, self.crawler, self.env, "没有更多数据啦~\n")
                return
            for video_obj in data_list:
                try:
                    self.process_video_obj(video_obj)
                except Exception as e:
                    Common.logger(self.log_type, self.crawler).error(f"抓取单条视频异常:{e}\n")
                    Common.logging(
                        self.log_type, self.crawler, self.env, f"抓取单条视频异常:{e}\n"
                    )
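
    # Fetch the detail payload for a single video. Returns the "video_info"
    # dict, or None when the request fails or the payload is empty, so callers
    # must guard against a missing result.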
    def get_video_list(self, video_id):
        url = "https://api.xiahong.top/index.php?s=mobile/Video/getVideoInfo&vid={}&appid=wx7457ce7bf3cdbdbf&version=1.9.1&scene=1089".format(
            video_id)
        headers = {
            'Host': 'api.xiahong.top',
            'Content-Type': 'application/json',
            'Accept-Language': 'zh-cn',
            'Accept': '*/*',
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E217 MicroMessenger/6.8.0(0x16080000) NetType/WIFI Language/en Branch/Br_trunk MiniProgramEnv/Mac',
            'Referer': 'https://servicewechat.com/wx7457ce7bf3cdbdbf/2/page-frame.html',
            'token': 'eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpYXQiOjE3MDE4NDA2MzcsIm5iZiI6MTcwMTg0MDYzNywiZXhwIjoxNzAxODQ3ODM3LCJkYXRhIjp7InVzZXJfaWQiOjIzNjM5MTAxMH19.WYV52kw3Oi4RT-KAblVCFYXWO3RJAQH9x6hB2tWKMKc',
            'ik': 'b326b5062b2f0e69046810717534cb09'
        }
        response = requests.post(url, headers=headers, proxies=proxies, timeout=30)
        if response.status_code != 200 or "data" not in response.text:
            Common.logger(self.log_type, self.crawler).info(
                f"get_video_list:{response.text}\n"
            )
            Common.logging(
                self.log_type,
                self.crawler,
                self.env,
                f"get_video_list:{response.text}\n",
            )
            return None
        video_info = response.json()["data"]["video_info"]
        if len(video_info) == 0:
            Common.logger(self.log_type, self.crawler).info("详情页数据为空~\n")
            Common.logging(self.log_type, self.crawler, self.env, "详情页数据为空~\n")
            return None
        return video_info
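
    # Build the metadata dict for one feed item, log it, run it through the
    # filters (validity, download rules, filter words, duplicates), and
    # enqueue anything that survives.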
    def process_video_obj(self, video_obj):
        video_id = video_obj.get("id", 0)
        video_info = self.get_video_list(video_id)
        if not video_info:
            # Detail fetch failed or returned nothing; skip this item.
            return
        video_title = clean_title(video_info.get("title", "no title"))
        video_time = video_obj.get("v_time", 0)
        publish_time_stamp = int(time.time())
        publish_time_str = time.strftime(
            "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
        )
        user_name = ""
        video_dict = {
            "video_title": video_title,
            "video_id": video_id,
            "duration": video_time,
            "play_cnt": video_obj.get("visited", 0),
            "like_cnt": 0,
            "comment_cnt": 0,
            "share_cnt": video_obj.get("shared", 0),
            "user_name": user_name,
            "publish_time_stamp": publish_time_stamp,
            "publish_time_str": publish_time_str,
            "video_width": 0,
            "video_height": 0,
            "profile_id": 0,
            "profile_mid": 0,
            # "cover_url": "",
            "session": f"zhufuzanmenzhonglaonianzhishipin-{int(time.time())}",
        }
        for k, v in video_dict.items():
            Common.logger(self.log_type, self.crawler).info(f"{k}:{v}")
        Common.logging(
            self.log_type, self.crawler, self.env, f"{video_dict}"
        )
        # Drop items with no title or id.
        if video_title == "" or video_dict["video_id"] == "":
            Common.logger(self.log_type, self.crawler).info("无效视频\n")
            Common.logging(self.log_type, self.crawler, self.env, "无效视频\n")
        # Apply the basic download rules (duration, play count, etc.).
        elif (
            download_rule(
                log_type=self.log_type,
                crawler=self.crawler,
                video_dict=video_dict,
                rule_dict=self.rule_dict,
            )
            is False
        ):
            Common.logger(self.log_type, self.crawler).info("不满足抓取规则\n")
            Common.logging(
                self.log_type, self.crawler, self.env, "不满足抓取规则\n"
            )
        # Drop titles containing any configured filter word.
        elif any(
            str(word) in video_dict["video_title"]
            for word in get_config_from_mysql(
                log_type=self.log_type,
                source=self.crawler,
                env=self.env,
                text="filter",
                action="",
            )
        ):
            Common.logger(self.log_type, self.crawler).info("已中过滤词\n")
            Common.logging(self.log_type, self.crawler, self.env, "已中过滤词\n")
        elif self.repeat_video(video_dict["video_id"]) != 0:
            Common.logger(self.log_type, self.crawler).info("视频已下载\n")
            Common.logging(self.log_type, self.crawler, self.env, "视频已下载\n")
        else:
            # Resolve any CDN redirect to the final playable URL before enqueueing.
            video_url = get_redirect_url(video_info['video_url'])
            video_dict["out_user_id"] = video_dict["profile_id"]
            video_dict["platform"] = self.crawler
            video_dict["strategy"] = self.log_type
            video_dict["out_video_id"] = str(video_dict["video_id"])
            video_dict["width"] = video_dict["video_width"]
            video_dict["height"] = video_dict["video_height"]
            video_dict["crawler_rule"] = json.dumps(self.rule_dict)
            video_dict["user_id"] = self.our_uid
            video_dict["publish_time"] = video_dict["publish_time_str"]
            video_dict["video_url"] = video_url
            video_dict["avatar_url"] = video_obj['images']
            video_dict["cover_url"] = video_obj['images']
            self.download_count += 1
            self.mq.send_msg(video_dict)
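
# Ad-hoc entry point for running the recommend crawler locally against the
# dev environment.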
if __name__ == "__main__":
    ZL = Jxzfwncdhyspcheduling(
        log_type="recommend",
        crawler="Jxzfwncdhy",
        rule_dict={},
        our_uid="zhangyong",
        env="dev",
    )
    ZL.get_videoList()