# zhonglaonianyule_recommend_scheduling.py (9.6 KB)
  1. # -*- coding: utf-8 -*-
  2. # @Author: luojunhui
  3. # @Time: 2023/10/10
  4. import json
  5. import os
  6. import random
  7. import sys
  8. import uuid
  9. import time
  10. import requests
  11. from common.mq import MQ
  12. sys.path.append(os.getcwd())
  13. from common.common import Common
  14. from common import PiaoQuanPipeline, AliyunLogger
  15. from common.public import clean_title
  16. proxies = {"http": None, "https": None}
  17. class ZLNYLScheduling:
  18. def __init__(self, log_type, crawler, category, rule_dict, env, our_uid):
  19. self.platform = "中老年娱乐"
  20. self.log_type = log_type
  21. self.crawler = crawler
  22. self.category = category
  23. self.rule_dict = rule_dict
  24. self.env = env
  25. self.our_uid = our_uid
  26. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  27. self.download_count = 0
  28. # 获取视频id_list
  29. def get_videoList(self, page_id):
  30. url = "https://kkj.xinhuachuanmeijs.com/app/index.php?i=299&t=0&m=jyt_txvideo&v=1.0&from=wxapp&c=entry&a=wxapp&do=videolist&"
  31. headers = {
  32. "Host": "kkj.xinhuachuanmeijs.com",
  33. "accept": "*/*",
  34. "content-type": "application/x-www-form-urlencoded",
  35. "accept-language": "zh-cn",
  36. "user-agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E217 MicroMessenger/6.8.0(0x16080000) NetType/WIFI Language/en Branch/Br_trunk MiniProgramEnv/Mac",
  37. "referer": "https://servicewechat.com/wx546222d9b2fe5fc0/3/page-frame.html",
  38. "Cookie": "PHPSESSID=ef4e78382296a0db2021ecd6e35c614f",
  39. }
  40. payload = "category={}&page={}&israndom=1&type=4&noauth=true".format(
  41. self.category, page_id
  42. )
  43. response = requests.request("POST", url, headers=headers, data=payload)
  44. if "data" not in response.text or response.status_code != 200:
  45. Common.logger(self.log_type, self.crawler).info(
  46. f"get_videoList:{response.text}\n"
  47. )
  48. Common.logging(
  49. self.log_type,
  50. self.crawler,
  51. self.env,
  52. f"get_videoList:{response.text}\n",
  53. )
  54. AliyunLogger.logging(
  55. code="2000",
  56. platform=self.crawler,
  57. mode=self.log_type,
  58. env=self.env,
  59. message=f"get_videoList:{response.text}\n",
  60. )
  61. return
  62. elif len(response.json()["data"]) == 0:
  63. Common.logger(self.log_type, self.crawler).info(f"没有更多数据啦~\n")
  64. Common.logging(self.log_type, self.crawler, self.env, f"没有更多数据啦~\n")
  65. AliyunLogger.logging(
  66. code="2000",
  67. platform=self.crawler,
  68. mode=self.log_type,
  69. env=self.env,
  70. message=f"没有更多数据啦~\n"
  71. )
  72. return
  73. else:
  74. data_list = response.json()["data"]
  75. for video_obj in data_list:
  76. try:
  77. trace_id = self.crawler + str(uuid.uuid1())
  78. AliyunLogger.logging(
  79. code="1001",
  80. platform=self.crawler,
  81. mode=self.log_type,
  82. env=self.env,
  83. trace_id=trace_id,
  84. message="扫描到一条视频"
  85. )
  86. video_id = video_obj.get("vid", 0)
  87. video_title = clean_title(video_obj.get("vtitle", 0))
  88. video_time = video_obj.get("v_time", 0)
  89. publish_time_stamp = int(time.time())
  90. publish_time_str = time.strftime(
  91. "%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)
  92. )
  93. user_name = ""
  94. video_dict = {
  95. "video_title": video_title,
  96. "video_id": video_id,
  97. "duration": video_time,
  98. "play_cnt": 0,
  99. "like_cnt": 0,
  100. "comment_cnt": 0,
  101. "share_cnt": 0,
  102. "user_name": user_name,
  103. "publish_time_stamp": publish_time_stamp,
  104. "publish_time_str": publish_time_str,
  105. "update_time_stamp": int(time.time()),
  106. "video_width": 0,
  107. "video_height": 0,
  108. "profile_id": 0,
  109. "profile_mid": 0,
  110. # "cover_url": "",
  111. "session": f"zhonglaonianyule-{int(time.time())}",
  112. }
  113. for k, v in video_dict.items():
  114. Common.logger(self.log_type, self.crawler).info(f"{k}:{v}")
  115. Common.logging(
  116. self.log_type, self.crawler, self.env, f"{video_dict}"
  117. )
  118. video_dict["out_user_id"] = video_dict["profile_id"]
  119. video_dict["platform"] = self.crawler
  120. video_dict["strategy"] = self.log_type
  121. video_dict["out_video_id"] = video_dict["video_id"]
  122. video_dict["width"] = video_dict["video_width"]
  123. video_dict["height"] = video_dict["video_height"]
  124. video_dict["crawler_rule"] = json.dumps(self.rule_dict)
  125. video_dict["user_id"] = self.our_uid
  126. video_dict["publish_time"] = video_dict["publish_time_str"]
  127. d_obj = self.find_video_url(video_id)
  128. video_dict["video_url"] = d_obj["url"]
  129. video_dict["avatar_url"] = d_obj["cover"]
  130. video_dict["cover_url"] = d_obj["cover"]
  131. # 过滤无效视频
  132. if video_title == "" or video_dict["video_id"] == "":
  133. Common.logger(self.log_type, self.crawler).info("无效视频\n")
  134. Common.logging(self.log_type, self.crawler, self.env, "无效视频\n")
  135. AliyunLogger.logging(
  136. code="2005",
  137. platform=self.crawler,
  138. mode=self.log_type,
  139. env=self.env,
  140. trace_id=trace_id,
  141. message="无效视频"
  142. )
  143. continue
  144. pipeline = PiaoQuanPipeline(
  145. platform=self.crawler,
  146. mode=self.log_type,
  147. env=self.env,
  148. rule_dict=self.rule_dict,
  149. item=video_dict,
  150. trace_id=trace_id
  151. )
  152. if pipeline.process_item():
  153. self.mq.send_msg(video_dict)
  154. AliyunLogger.logging(
  155. code="1002",
  156. platform=self.crawler,
  157. mode=self.log_type,
  158. env=self.env,
  159. data=video_dict,
  160. trace_id=trace_id,
  161. message="成功发送至 ETL"
  162. )
  163. except Exception as e:
  164. Common.logger(self.log_type, self.crawler).error(f"抓取单条视频异常:{e}\n")
  165. Common.logging(
  166. self.log_type, self.crawler, self.env, f"抓取单条视频异常:{e}\n"
  167. )
  168. AliyunLogger.logging(
  169. code="3000",
  170. platform=self.crawler,
  171. mode=self.log_type,
  172. env=self.env,
  173. message=f"抓取单条视频异常:{e}\n"
  174. )
  175. def find_video_url(self, video_id):
  176. url = "https://kkj.xinhuachuanmeijs.com/app/index.php?i=299&t=0&m=jyt_txvideo&v=1.0&from=wxapp&c=entry&a=wxapp&do=videoinfo&state=we7sid-f0008a08276fc324921185dc74427c56&sign=fa36387242169f01aa747a80d49c8670&vid={}&version=1.0.3".format(
  177. video_id
  178. )
  179. headers = {
  180. "Host": "kkj.xinhuachuanmeijs.com",
  181. "xweb_xhr": "1",
  182. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF XWEB/30817",
  183. "content-type": "application/x-www-form-urlencoded",
  184. "accept": "*/*",
  185. "sec-fetch-site": "cross-site",
  186. "sec-fetch-mode": "cors",
  187. "sec-fetch-dest": "empty",
  188. "referer": "https://servicewechat.com/wx546222d9b2fe5fc0/3/page-frame.html",
  189. "accept-language": "en",
  190. }
  191. response = requests.get(url, headers=headers).json()
  192. video_url = response["data"]["res"]
  193. video_cover = response["data"]["cover"]
  194. Common.logger(self.log_type, self.crawler).info(
  195. "{}成功抓取视频链接\n".format(response["data"]["vtitle"])
  196. )
  197. Common.logging(
  198. self.log_type,
  199. self.crawler,
  200. self.env,
  201. "{}成功抓取视频链接\n".format(response["data"]["vtitle"]),
  202. )
  203. time.sleep(random.randint(3, 5))
  204. self.download_count += 1
  205. return {"url": video_url, "cover": video_cover}
  206. if __name__ == "__main__":
  207. ZL = ZLNYLScheduling(
  208. log_type="recommend", crawler="zlnyl", category=3615, rule_dict={}, env="dev"
  209. )
  210. for i in range(4):
  211. ZL.get_videoList(i + 1)