zuihaodesongni.py 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. """
  2. @author: luojunhui
  3. """
  4. import os
  5. import sys
  6. import json
  7. import time
  8. import uuid
  9. import random
  10. import datetime
  11. import requests
  12. sys.path.append(os.getcwd())
  13. from application.items import VideoItem
  14. from application.pipeline import PiaoQuanPipeline
  15. from application.common.messageQueue import MQ
  16. from application.common.proxies import tunnel_proxies
  17. from application.common.log import AliyunLogger
  18. class ZuiHaoDeSongNi(object):
  19. """
  20. 最好的送你——推荐爬虫
  21. """
  22. def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
  23. self.platform = platform
  24. self.mode = mode
  25. self.rule_dict = rule_dict
  26. self.user_list = user_list
  27. self.env = env
  28. self.download_cnt = 0
  29. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  30. self.expire_flag = False
  31. self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)
  32. def process_video_obj(self, video_obj):
  33. """
  34. 处理每一个视频内容
  35. :return: None
  36. """
  37. trace_id = self.platform + str(uuid.uuid1())
  38. our_user = random.choice(self.user_list)
  39. publish_time_stamp = int(video_obj["update_time"])
  40. publish_time_str = datetime.datetime.fromtimestamp(publish_time_stamp).strftime(
  41. "%Y-%m-%d %H:%M:%S"
  42. )
  43. item = VideoItem()
  44. item.add_video_info("user_id", our_user["uid"])
  45. item.add_video_info("user_name", our_user["nick_name"])
  46. item.add_video_info("video_id", video_obj["nid"])
  47. item.add_video_info("video_title", video_obj["title"])
  48. item.add_video_info("publish_time_str", publish_time_str)
  49. item.add_video_info("publish_time_stamp", int(publish_time_stamp))
  50. item.add_video_info("video_url", video_obj["video_url"])
  51. item.add_video_info("cover_url", video_obj["video_cover"])
  52. item.add_video_info("out_video_id", video_obj["nid"])
  53. item.add_video_info("platform", self.platform)
  54. item.add_video_info("strategy", self.mode)
  55. item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
  56. mq_obj = item.produce_item()
  57. pipeline = PiaoQuanPipeline(
  58. platform=self.platform,
  59. mode=self.mode,
  60. rule_dict=self.rule_dict,
  61. env=self.env,
  62. item=mq_obj,
  63. trace_id=trace_id,
  64. )
  65. if pipeline.process_item():
  66. self.download_cnt += 1
  67. self.mq.send_msg(mq_obj)
  68. # 随机等待 5 分钟
  69. time.sleep(60 * random.randint(1, 5))
  70. self.aliyun_log.logging(
  71. code="1002",
  72. message="成功发送至 ETL",
  73. data=mq_obj,
  74. )
  75. if self.download_cnt >= int(
  76. self.rule_dict.get("videos_cnt", {}).get("min", 200)
  77. ):
  78. self.expire_flag = True
  79. def get_recommend_list(self, page_index):
  80. """
  81. 获取推荐页面的video_list
  82. :param page_index: 页码
  83. :return: None
  84. """
  85. if self.expire_flag:
  86. self.aliyun_log.logging(
  87. code="2000",
  88. message="本轮已经抓取到足够的数据,自动退出\t{}".format(self.download_cnt),
  89. )
  90. return
  91. headers = {
  92. 'Host': 'zhdsn.wentingyou.cn',
  93. "content-time": str(int(time.time() * 1000)),
  94. "cache-time": str(int(time.time() * 1000)),
  95. 'chatkey': 'wx00da988283a73cdf',
  96. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.6(0x13080610) XWEB/1156',
  97. 'content-type': 'application/x-www-form-urlencoded',
  98. 'visitorkey': '17096941221026589978',
  99. 'xweb_xhr': '1',
  100. 'vision': '1.1.0',
  101. 'token': '',
  102. 'accept': '*/*',
  103. 'sec-fetch-site': 'cross-site',
  104. 'sec-fetch-mode': 'cors',
  105. 'sec-fetch-dest': 'empty',
  106. 'referer': 'https://servicewechat.com/wx00da988283a73cdf/7/page-frame.html',
  107. 'accept-language': 'en-US,en;q=0.9'
  108. }
  109. po = {
  110. "cid": "",
  111. "page": page_index,
  112. "is_ads": 1,
  113. "model": random.choice(
  114. [
  115. "Windows",
  116. "Mac",
  117. "HuaWei",
  118. "Xiaomi",
  119. "Xiaomi2",
  120. "Yandex",
  121. "Google",
  122. "iphone",
  123. "oppo",
  124. ]
  125. ),
  126. "mini_version": "3.8.6",
  127. "ini_id": "17096941221026589978"
  128. }
  129. params = {"parameter": json.dumps(po)}
  130. url = "https://zhdsn.wentingyou.cn/index.php/v111/index/index"
  131. time.sleep(5)
  132. response = requests.request(
  133. "GET", url=url, headers=headers, params=params
  134. )
  135. data = response.json()
  136. for index, video_obj in enumerate(data["data"]["list"], 1):
  137. try:
  138. self.aliyun_log.logging(
  139. code="1001",
  140. message="扫描到一条视频",
  141. data=video_obj,
  142. )
  143. self.process_video_obj(video_obj)
  144. except Exception as e:
  145. self.aliyun_log.logging(
  146. code="3000",
  147. message="抓取第{}条的时候出现问题, 报错信息是{}".format(index, e),
  148. )
  149. def run(self):
  150. """
  151. 执行代码
  152. :return: None
  153. """
  154. for page in range(1, 40):
  155. if self.expire_flag:
  156. self.aliyun_log.logging(
  157. code="2000",
  158. message="本轮已经抓取到足够的数据,自动退出\t{}".format(self.download_cnt),
  159. )
  160. return
  161. else:
  162. # self.get_recommend_list(page_index=page)
  163. try:
  164. self.get_recommend_list(page_index=page)
  165. except Exception as e:
  166. self.aliyun_log.logging(
  167. code="3000",
  168. message="抓取第{}页时候出现错误, 报错信息是{}".format(page, e),
  169. )