meitiansongzhufu_recommend.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. import os
  2. import json
  3. import random
  4. import sys
  5. import time
  6. import uuid
  7. import requests
  8. sys.path.append(os.getcwd())
  9. from common.video_item import VideoItem
  10. from common import PiaoQuanPipeline, AliyunLogger, tunnel_proxies
  11. from common.mq import MQ
  12. def get_video_detail(vid, userad, cate):
  13. time.sleep(random.randint(5, 25))
  14. url = "https://gkvxwq2023.we-media.cn/app/index.php"
  15. payload = {
  16. "i": "1",
  17. "t": "0",
  18. "m": "jyt_txvideo",
  19. "v": "1.0.0",
  20. "from": "wxapp",
  21. "c": "entry",
  22. "a": "wxapp",
  23. "do": "videodetail",
  24. "vid": vid,
  25. "userad": userad,
  26. "cate": cate,
  27. }
  28. headers = {
  29. "Host": "gkvxwq2023.we-media.cn",
  30. "xweb_xhr": "1",
  31. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.5(0x13080510)XWEB/1100",
  32. "content-type": "application/x-www-form-urlencoded",
  33. "accept": "*/*",
  34. "referer": "https://servicewechat.com/wx49f06df06becc7fa/2/page-frame.html",
  35. "accept-language": "en-US,en;q=0.9",
  36. }
  37. response = requests.request("POST", url, headers=headers, data=payload)
  38. return response.json()
  39. class SongZhuFuRecommend(object):
  40. def __init__(self, platform, mode, rule_dict, user_list, env):
  41. self.platform = platform
  42. self.mode = mode
  43. self.rule_dict = rule_dict
  44. self.user_list = user_list
  45. self.env = env
  46. self.download_cnt = 0
  47. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  48. self.limit_flag = False
  49. def get_video_list(self):
  50. """
  51. 推荐流并没有页数,每次请求数据不同,设置每天的抓取视频数量为100-200条
  52. """
  53. base_url = "https://gkvxwq2023.we-media.cn/app/index.php"
  54. headers = {
  55. "Host": "gkvxwq2023.we-media.cn",
  56. "xweb_xhr": "1",
  57. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.5(0x13080510)XWEB/1100",
  58. "content-type": "application/x-www-form-urlencoded",
  59. "accept": "*/*",
  60. "referer": "https://servicewechat.com/wx49f06df06becc7fa/2/page-frame.html",
  61. "accept-language": "en-US,en;q=0.9",
  62. }
  63. params = {
  64. "i": "1",
  65. "t": "0",
  66. "m": "jyt_txvideo",
  67. "v": "1.0.0",
  68. "from": "wxapp",
  69. "c": "entry",
  70. "a": "wxapp",
  71. "do": "videotui",
  72. "city": "",
  73. "category": "494",
  74. }
  75. while True:
  76. time.sleep(random.randint(1, 10))
  77. try:
  78. if self.limit_flag:
  79. AliyunLogger.logging(
  80. code="2000",
  81. platform=self.platform,
  82. mode=self.mode,
  83. env=self.env,
  84. message="本轮已经抓取到足够的数据,自动退出\t{}".format(self.download_cnt),
  85. )
  86. return
  87. else:
  88. response = requests.get(
  89. url=base_url,
  90. headers=headers,
  91. params=params,
  92. proxies=tunnel_proxies(),
  93. )
  94. video_list = response.json()['data']
  95. if video_list:
  96. for index, video_obj in enumerate(video_list, 1):
  97. try:
  98. AliyunLogger.logging(
  99. code="1001",
  100. platform=self.platform,
  101. mode=self.mode,
  102. env=self.env,
  103. message="扫描到一条视频",
  104. data=video_obj,
  105. )
  106. self.process_video_obj(video_obj)
  107. except Exception as e:
  108. AliyunLogger.logging(
  109. code="3000",
  110. platform=self.platform,
  111. mode=self.mode,
  112. env=self.env,
  113. data=video_obj,
  114. message="抓取第{}条的时候出现问题, 报错信息是{}".format(index, e),
  115. )
  116. else:
  117. AliyunLogger.logging(
  118. code="2000",
  119. platform=self.platform,
  120. mode=self.mode,
  121. env=self.env,
  122. message="已经抓完了,自动退出"
  123. )
  124. return
  125. except Exception as e:
  126. AliyunLogger.logging(
  127. code="3000",
  128. platform=self.platform,
  129. mode=self.mode,
  130. env=self.env,
  131. message="抓取推荐页的时候出现错误, 报错信息是{}".format(e),
  132. )
  133. def process_video_obj(self, video_obj):
  134. trace_id = self.platform + str(uuid.uuid1())
  135. our_user = random.choice(self.user_list)
  136. item = VideoItem()
  137. item.add_video_info("user_id", our_user["uid"])
  138. item.add_video_info("user_name", our_user["nick_name"])
  139. item.add_video_info("video_id", video_obj["id"])
  140. item.add_video_info("video_title", video_obj["vtitle"])
  141. item.add_video_info("publish_time_stamp", int(video_obj['create_time']))
  142. vid = video_obj['vid']
  143. if vid.startswith("new"):
  144. userad = ""
  145. cate = video_obj['category']
  146. else:
  147. return
  148. detail_obj = get_video_detail(vid=vid, userad=userad, cate=cate)['data']
  149. item.add_video_info("video_url", detail_obj['vid'])
  150. item.add_video_info("cover_url", detail_obj["poster"])
  151. item.add_video_info("duration", detail_obj['vtime'])
  152. item.add_video_info("play_cnt", detail_obj['realview'])
  153. item.add_video_info("out_video_id", video_obj["id"])
  154. item.add_video_info("platform", self.platform)
  155. item.add_video_info("strategy", self.mode)
  156. item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
  157. mq_obj = item.produce_item()
  158. pipeline = PiaoQuanPipeline(
  159. platform=self.platform,
  160. mode=self.mode,
  161. rule_dict=self.rule_dict,
  162. env=self.env,
  163. item=mq_obj,
  164. trace_id=trace_id,
  165. )
  166. if pipeline.process_item():
  167. self.download_cnt += 1
  168. self.mq.send_msg(mq_obj)
  169. # print(mq_obj)
  170. AliyunLogger.logging(
  171. code="1002",
  172. platform=self.platform,
  173. mode=self.mode,
  174. env=self.env,
  175. message="成功发送至 ETL",
  176. data=mq_obj,
  177. )
  178. if self.download_cnt >= int(
  179. self.rule_dict.get("videos_cnt", {}).get("min", 200)
  180. ):
  181. self.limit_flag = True