xigua_author.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. """
  2. 西瓜视频——新规则抓取
  3. """
  4. import os
  5. import sys
  6. import uuid
  7. import time
  8. import random
  9. import string
  10. import asyncio
  11. import requests
  12. sys.path.append(os.getcwd())
  13. from application.common.messageQueue import MQ
  14. from application.common.proxies import tunnel_proxies
  15. from application.common.log import AliyunLogger
  16. async def create_signature():
  17. """
  18. 随机生成签名
  19. :return:
  20. """
  21. src_digits = string.digits # string_数字
  22. src_uppercase = string.ascii_uppercase # string_大写字母
  23. src_lowercase = string.ascii_lowercase # string_小写字母
  24. digits_num = random.randint(1, 6)
  25. uppercase_num = random.randint(1, 26 - digits_num - 1)
  26. lowercase_num = 26 - (digits_num + uppercase_num)
  27. password = (
  28. random.sample(src_digits, digits_num)
  29. + random.sample(src_uppercase, uppercase_num)
  30. + random.sample(src_lowercase, lowercase_num)
  31. )
  32. random.shuffle(password)
  33. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  34. new_password_start = new_password[0:18]
  35. new_password_end = new_password[-7:]
  36. if new_password[18] == "8":
  37. new_password = new_password_start + "w" + new_password_end
  38. elif new_password[18] == "9":
  39. new_password = new_password_start + "x" + new_password_end
  40. elif new_password[18] == "-":
  41. new_password = new_password_start + "y" + new_password_end
  42. elif new_password[18] == ".":
  43. new_password = new_password_start + "z" + new_password_end
  44. else:
  45. new_password = new_password_start + "y" + new_password_end
  46. return new_password
  47. class XiGuaAuthor(object):
  48. """
  49. 西瓜账号抓取object
  50. """
  51. def __init__(self, platform, mode, rule_dict, user_list, env):
  52. self.platform = platform
  53. self.mode = mode
  54. self.rule_dict = rule_dict
  55. self.user_list = user_list
  56. self.env = env
  57. self.download_cnt = 0
  58. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  59. self.expire_flag = False
  60. self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)
  61. async def process_author_list(self):
  62. """
  63. 抓取多个账号
  64. :return:
  65. """
  66. for user_account in self.user_list:
  67. await self.process_each_author(user_account)
  68. async def process_each_author(self, user_account):
  69. """
  70. 抓取单个账号的视频列表;
  71. :return:
  72. """
  73. off_set = 0
  74. signature = await create_signature()
  75. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  76. while True:
  77. params = {
  78. "to_user_id": str(
  79. user_account["link"].replace("https://www.ixigua.com/home/", "")
  80. ),
  81. "offset": str(off_set),
  82. "limit": "30",
  83. "maxBehotTime": "0",
  84. "order": "new",
  85. "isHome": "0",
  86. "_signature": signature,
  87. }
  88. headers = {
  89. "referer": f'https://www.ixigua.com/home/{user_account["link"].replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  90. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  91. }
  92. response = requests.get(
  93. url=url,
  94. headers=headers,
  95. params=params,
  96. proxies=tunnel_proxies(),
  97. timeout=5
  98. )
  99. off_set += 30
  100. if "data" not in response.text or response.status_code != 200:
  101. self.aliyun_log.logging(
  102. code="2000",
  103. message=f"get_videoList:{response.text}\n",
  104. )
  105. return
  106. elif not response.json()["data"]["videoList"]:
  107. self.aliyun_log.logging(
  108. code="2000",
  109. message=f"没有更多数据啦~\n",
  110. )
  111. return
  112. else:
  113. video_list = response.json()["data"]["videoList"]
  114. for video in video_list:
  115. try:
  116. self.aliyun_log.logging(
  117. code="1001",
  118. data=video,
  119. message="扫描到一条视频"
  120. )
  121. # 判断时间是否符合要求
  122. if self.date_flag(video, user_account):
  123. return
  124. else:
  125. self.process_video_obj(video, user_account)
  126. except Exception as e:
  127. self.aliyun_log.logging(
  128. code="3000",
  129. data=video,
  130. message="抓取单条视频异常, 报错原因是: {}".format(e)
  131. )
  132. def rule_maker(self, account):
  133. """
  134. 通过不同的账号生成不同的规则
  135. :param account: 输入的账号信息
  136. {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
  137. """
  138. flag = account.split("")
  139. if flag == "V1":
  140. rule_dict = {
  141. "play_cnt": {"min": 50000, "max": 0},
  142. 'period': {"min": 15, "max": 15},
  143. 'special': True
  144. }
  145. return rule_dict
  146. elif flag == "V2":
  147. rule_dict = {
  148. "play_cnt": {"min": 10000, "max": 0},
  149. 'period': {"min": 7, "max": 7},
  150. 'special': True
  151. }
  152. return rule_dict
  153. elif flag == "V3":
  154. rule_dict = {
  155. "play_cnt": {"min": 5000, "max": 0},
  156. 'period': {"min": 3, "max": 3},
  157. 'special': True
  158. }
  159. return rule_dict
  160. else:
  161. return self.rule_dict
  162. def date_flag(self, video, user_account):
  163. """
  164. 判断时间是否满足条件
  165. :param video: 视频信息
  166. :param user_account: 用户账号
  167. :return:
  168. """
  169. rule = self.rule_maker(user_account)