meihaoxinghe_recommend.py 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166
  1. import os
  2. import json
  3. import random
  4. import sys
  5. import time
  6. import uuid
  7. import requests
  8. sys.path.append(os.getcwd())
  9. from common.video_item import VideoItem
  10. from common import PiaoQuanPipeline, AliyunLogger, tunnel_proxies
  11. from common.mq import MQ
  12. class MHXHspcheduling(object):
  13. def __init__(self, platform, mode, rule_dict, user_list, env):
  14. self.platform = platform
  15. self.mode = mode
  16. self.rule_dict = rule_dict
  17. self.user_list = user_list
  18. self.env = env
  19. self.download_cnt = 0
  20. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  21. self.limit_flag = False
  22. def video_list(self, video_id):
  23. url = "https://app.miguoyun.cn/app/index.php?i=959&t=0&m=jyt_txvideo&v=31.1.11&from=wxapp&c=entry&a=wxapp&do=videoinfo&vid={}&version=1.0.3".format(
  24. video_id)
  25. headers = {
  26. 'Host': 'app.miguoyun.cn',
  27. 'accept': '*/*',
  28. 'content-type': 'application/x-www-form-urlencoded',
  29. 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E217 MicroMessenger/6.8.0(0x16080000) NetType/WIFI Language/en Branch/Br_trunk MiniProgramEnv/Mac',
  30. 'referer': 'https://servicewechat.com/wx08c7ede18f448973/20/page-frame.html',
  31. 'accept-language': 'zh-cn',
  32. 'Cookie': '0a52___multiid=1; 0a52_logout=; PHPSESSID=d7eaef273b29b6e2a2b0130b0bfb92bc'
  33. }
  34. response = requests.post(url, headers=headers)
  35. data_list = response.json()["data"]
  36. return data_list
  37. def get_video_list(self):
  38. url = "https://app.miguoyun.cn/app/index.php?i=959&t=1&m=jyt_txvideo&v=31.1.11&from=wxapp&c=entry&a=wxapp&do=videolist&sign=bf3a8068467ce73c96a0409ae1136c4f"
  39. headers = {
  40. 'Host': 'app.miguoyun.cn',
  41. 'accept': '*/*',
  42. 'content-type': 'application/x-www-form-urlencoded',
  43. 'accept-language': 'zh-cn',
  44. 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E217 MicroMessenger/6.8.0(0x16080000) NetType/WIFI Language/en Branch/Br_trunk MiniProgramEnv/Mac',
  45. 'referer': 'https://servicewechat.com/wx08c7ede18f448973/20/page-frame.html',
  46. 'Cookie': '0a52___multiid=1; 0a52_logout=; PHPSESSID=d7eaef273b29b6e2a2b0130b0bfb92bc'
  47. }
  48. page_index = 1
  49. while True:
  50. time.sleep(random.randint(1, 10))
  51. try:
  52. if self.limit_flag:
  53. AliyunLogger.logging(
  54. code="2000",
  55. platform=self.platform,
  56. mode=self.mode,
  57. env=self.env,
  58. message="本轮已经抓取到足够的数据,自动退出\t{}".format(self.download_cnt),
  59. )
  60. return
  61. else:
  62. payload = "category=476&page=1&israndom=1&type=0&isview=&noauth=true"
  63. response = requests.post(url, headers=headers, data=payload)
  64. video_list = response.json()["data"]
  65. if video_list:
  66. for index, video_obj in enumerate(video_list, 1):
  67. try:
  68. video_id = video_obj.get("vid", 0)
  69. get_video_list = self.video_list(video_id)
  70. if get_video_list.get("vtitle"):
  71. AliyunLogger.logging(
  72. code="1001",
  73. platform=self.platform,
  74. mode=self.mode,
  75. env=self.env,
  76. message="扫描到一条视频",
  77. data=video_obj,
  78. )
  79. self.process_video_obj(video_obj, get_video_list)
  80. except Exception as e:
  81. AliyunLogger.logging(
  82. code="3000",
  83. platform=self.platform,
  84. mode=self.mode,
  85. env=self.env,
  86. data=video_obj,
  87. message="抓取第{}条的时候出现问题, 报错信息是{}".format(index, e),
  88. )
  89. page_index += 1
  90. else:
  91. AliyunLogger.logging(
  92. code="2000",
  93. platform=self.platform,
  94. mode=self.mode,
  95. env=self.env,
  96. message="已经抓完了,自动退出"
  97. )
  98. return
  99. except Exception as e:
  100. AliyunLogger.logging(
  101. code="3000",
  102. platform=self.platform,
  103. mode=self.mode,
  104. env=self.env,
  105. message="抓取第{}页时候出现错误, 报错信息是{}".format(page_index + 1, e),
  106. )
  107. def process_video_obj(self, video_obj, get_video_list):
  108. trace_id = self.platform + str(uuid.uuid1())
  109. our_user = random.choice(self.user_list)
  110. item = VideoItem()
  111. item.add_video_info("user_id", our_user["uid"])
  112. item.add_video_info("user_name", our_user["nick_name"])
  113. item.add_video_info("video_id", video_obj["vid"])
  114. item.add_video_info("video_title", video_obj["vtitle"])
  115. item.add_video_info("publish_time_stamp", int(time.time()))
  116. item.add_video_info("video_url", get_video_list["res"])
  117. item.add_video_info("cover_url", video_obj["poster"])
  118. item.add_video_info("out_video_id", video_obj["vid"])
  119. item.add_video_info("platform", self.platform)
  120. item.add_video_info("strategy", self.mode)
  121. item.add_video_info("session", "{}-{}".format(self.platform, int(time.time())))
  122. mq_obj = item.produce_item()
  123. pipeline = PiaoQuanPipeline(
  124. platform=self.platform,
  125. mode=self.mode,
  126. rule_dict=self.rule_dict,
  127. env=self.env,
  128. item=mq_obj,
  129. trace_id=trace_id,
  130. )
  131. if pipeline.process_item():
  132. self.download_cnt += 1
  133. self.mq.send_msg(mq_obj)
  134. AliyunLogger.logging(
  135. code="1002",
  136. platform=self.platform,
  137. mode=self.mode,
  138. env=self.env,
  139. message="成功发送至 ETL",
  140. data=mq_obj,
  141. )
  142. if self.download_cnt >= int(
  143. self.rule_dict.get("videos_cnt", {}).get("min", 200)
  144. ):
  145. self.limit_flag = True
  146. if __name__ == '__main__':
  147. S = MHXHspcheduling(
  148. platform="meihaoxinghe",
  149. mode="recommend",
  150. env="dev",
  151. rule_dict={},
  152. user_list=[{'nick_name': "Ivring", 'uid': "1997"}, {'nick_name': "paul", 'uid': "1998"}]
  153. )
  154. S.get_video_list()