# File: youlegaoxiaoxiaoshipin_scheduling.py (6.7 KB)

  1. # -*- coding: utf-8 -*-
  2. # @Author: luojunhui
  3. # @Time: 2023/10/23
  4. import json
  5. import os
  6. import random
  7. import sys
  8. import time
  9. import requests
  10. from common.mq import MQ
  11. sys.path.append(os.getcwd())
  12. from common.common import Common
  13. from common.aliyun_log import AliyunLogger
  14. from common.pipeline import PiaoQuanPipeline
  15. def clean_title(strings):
  16. return (
  17. strings.strip()
  18. .replace("\n", "")
  19. .replace("/", "")
  20. .replace("\r", "")
  21. .replace("#", "")
  22. .replace(".", "。")
  23. .replace("\\", "")
  24. .replace("&NBSP", "")
  25. .replace(":", "")
  26. .replace("*", "")
  27. .replace("?", "")
  28. .replace("?", "")
  29. .replace('"', "")
  30. .replace("<", "")
  31. .replace(">", "")
  32. .replace("|", "")
  33. .replace(" ", "")
  34. .replace('"', "")
  35. .replace("'", "")
  36. )
  37. class YLGXXSPScheduling:
  38. def __init__(self, log_type, crawler, rule_dict, env, our_uid):
  39. self.platform = "youlegaoxiaoxiaoshipin"
  40. self.log_type = log_type
  41. self.crawler = crawler
  42. self.rule_dict = rule_dict
  43. self.env = env
  44. self.our_uid = our_uid
  45. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  46. self.download_count = 0
  47. # 获取视频id_list
  48. def get_videoList(self, page_id):
  49. # time.sleep(random.randint(5, 10))
  50. headers = {
  51. "Host": "cpu.baidu.com",
  52. "xweb_xhr": "1",
  53. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.4(0x13080410)XWEB/31009",
  54. "Accept": "*/*",
  55. "Sec-Fetch-Site": "cross-site",
  56. "Sec-Fetch-Mode": "cors",
  57. "Sec-Fetch-Dest": "empty",
  58. "Referer": "https://servicewechat.com/wx38382a240eab7214/4/page-frame.html",
  59. "Accept-Language": "en-US,en;q=0.9",
  60. }
  61. data = {
  62. "channelId": "1033",
  63. "needHybrid": "1",
  64. "pageNo": str(page_id),
  65. "pageSize": "10",
  66. }
  67. response = requests.post(
  68. "https://cpu.baidu.com/1033/a16a67fe",
  69. headers=headers,
  70. data=data
  71. )
  72. result = response.json()
  73. if "data" not in result or response.status_code != 200:
  74. # Common.logger(self.log_type, self.crawler).info(
  75. # f"get_videoList:{response.text}\n"
  76. # )
  77. # Common.logging(
  78. # self.log_type,
  79. # self.crawler,
  80. # self.env,
  81. # f"get_videoList:{response.text}\n",
  82. # )
  83. return
  84. elif len(result["data"]['result']) == 0:
  85. # Common.logger(self.log_type, self.crawler).info(f"没有更多数据啦~\n")
  86. # Common.logging(self.log_type, self.crawler, self.env, f"没有更多数据啦~\n")
  87. return
  88. else:
  89. data_list = result['data']["result"]
  90. for video_obj in data_list:
  91. print(1)
  92. AliyunLogger.logging(
  93. code="1001",
  94. platform=self.crawler,
  95. mode=self.log_type,
  96. env=self.env,
  97. data={},
  98. message="成功扫描到一条视频"
  99. )
  100. self.process_video_obj(video_obj)
  101. # try:
  102. # AliyunLogger.logging(
  103. # code="1001",
  104. # platform=self.crawler,
  105. # mode=self.log_type,
  106. # env=self.env,
  107. # data="",
  108. # message="成功扫描到一条视频"
  109. # )
  110. # self.process_video_obj(video_obj)
  111. # except Exception as e:
  112. # Common.logger(self.log_type, self.crawler).error(f"抓取单条视频异常:{e}\n")
  113. # Common.logging(
  114. # self.log_type, self.crawler, self.env, f"抓取单条视频异常:{e}\n"
  115. # )
  116. def process_video_obj(self, video_obj):
  117. video_id = video_obj.get("data", {}).get("id", 0)
  118. video_title = clean_title(video_obj.get("data", {}).get("title", "no title"))
  119. video_time = video_obj['data']['duration']
  120. publish_time_stamp = int(video_obj['data']['clusterTime'])
  121. publish_time_str = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp))
  122. user_name = video_obj['data']['source']
  123. video_dict = {
  124. "video_title": video_title,
  125. "video_id": video_id,
  126. "duration": video_time,
  127. "play_cnt": int(video_obj['data'].get("playbackCount", 0)),
  128. "like_cnt": int(video_obj.get("likeCount", 0)),
  129. "comment_cnt": int(video_obj.get("commentCounts", 0)),
  130. "share_cnt": 0,
  131. "user_name": user_name,
  132. "publish_time_stamp": publish_time_stamp,
  133. "publish_time_str": publish_time_str,
  134. "video_width": 0,
  135. "video_height": 0,
  136. "profile_id": 0,
  137. "profile_mid": 0,
  138. "session": f"youlegaoxiaoxiaoshipin-{int(time.time())}",
  139. }
  140. flag = PiaoQuanPipeline(
  141. platform=self.crawler,
  142. mode=self.log_type,
  143. rule_dict=self.rule_dict,
  144. env=self.env,
  145. item=video_dict
  146. )
  147. if flag:
  148. video_dict["out_user_id"] = video_obj['data'].get("ownerId", 0)
  149. video_dict["platform"] = self.crawler
  150. video_dict["strategy"] = self.log_type
  151. video_dict["out_video_id"] = str(video_dict["video_id"])
  152. video_dict["width"] = video_dict["video_width"]
  153. video_dict["height"] = video_dict["video_height"]
  154. video_dict["crawler_rule"] = json.dumps(self.rule_dict)
  155. video_dict["user_id"] = self.our_uid
  156. video_dict["publish_time"] = video_dict["publish_time_str"]
  157. video_dict["video_url"] = "http:" + video_obj['data']['url']
  158. video_dict["avatar_url"] = "http:" + video_obj['data']['avatar']
  159. video_dict["cover_url"] = "http:" + video_obj['data']['thumbUrl']
  160. print(json.dumps(video_dict, ensure_ascii=False, indent=4))
  161. self.download_count += 1
  162. # self.mq.send_msg(video_dict)
  163. if __name__ == "__main__":
  164. ZL = YLGXXSPScheduling(
  165. log_type="recommend",
  166. crawler="ylgxxsp",
  167. rule_dict={},
  168. our_uid="luojunhuihaoshuai",
  169. env="prod"
  170. )
  171. for i in range(5):
  172. ZL.get_videoList(page_id=i + 1)
  173. print(ZL.download_count)