xigua_author.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. """
  2. 西瓜视频——新规则抓取
  3. """
  4. import os
  5. import sys
  6. import uuid
  7. import time
  8. import random
  9. import string
  10. import asyncio
  11. import requests
  12. sys.path.append(os.getcwd())
  13. from application.common.messageQueue import MQ
  14. from application.common.proxies import tunnel_proxies
  15. from application.common.log import AliyunLogger
  16. async def create_signature():
  17. """
  18. 随机生成签名
  19. :return:
  20. """
  21. src_digits = string.digits # string_数字
  22. src_uppercase = string.ascii_uppercase # string_大写字母
  23. src_lowercase = string.ascii_lowercase # string_小写字母
  24. digits_num = random.randint(1, 6)
  25. uppercase_num = random.randint(1, 26 - digits_num - 1)
  26. lowercase_num = 26 - (digits_num + uppercase_num)
  27. password = (
  28. random.sample(src_digits, digits_num)
  29. + random.sample(src_uppercase, uppercase_num)
  30. + random.sample(src_lowercase, lowercase_num)
  31. )
  32. random.shuffle(password)
  33. new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB"
  34. new_password_start = new_password[0:18]
  35. new_password_end = new_password[-7:]
  36. if new_password[18] == "8":
  37. new_password = new_password_start + "w" + new_password_end
  38. elif new_password[18] == "9":
  39. new_password = new_password_start + "x" + new_password_end
  40. elif new_password[18] == "-":
  41. new_password = new_password_start + "y" + new_password_end
  42. elif new_password[18] == ".":
  43. new_password = new_password_start + "z" + new_password_end
  44. else:
  45. new_password = new_password_start + "y" + new_password_end
  46. return new_password
  47. class XiGuaAuthor(object):
  48. """
  49. 西瓜账号抓取object
  50. """
  51. def __init__(self, platform, mode, rule_dict, user_list, env):
  52. self.platform = platform
  53. self.mode = mode
  54. self.rule_dict = rule_dict
  55. self.user_list = user_list
  56. self.env = env
  57. self.download_cnt = 0
  58. self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
  59. self.expire_flag = False
  60. self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)
  61. async def process_author_list(self):
  62. """
  63. 抓取多个账号
  64. :return:
  65. """
  66. for user_account in self.user_list:
  67. await self.process_each_author(user_account)
  68. async def process_each_author(self, user_account):
  69. """
  70. 抓取单个账号的视频列表;
  71. :return:
  72. """
  73. off_set = 0
  74. signature = await create_signature()
  75. url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
  76. while True:
  77. params = {
  78. "to_user_id": str(
  79. user_account["link"].replace("https://www.ixigua.com/home/", "")
  80. ),
  81. "offset": str(off_set),
  82. "limit": "30",
  83. "maxBehotTime": "0",
  84. "order": "new",
  85. "isHome": "0",
  86. "_signature": signature,
  87. }
  88. headers = {
  89. "referer": f'https://www.ixigua.com/home/{user_account["link"].replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
  90. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
  91. }
  92. response = requests.get(
  93. url=url,
  94. headers=headers,
  95. params=params,
  96. timeout=5
  97. )
  98. off_set += 30
  99. if "data" not in response.text or response.status_code != 200:
  100. self.aliyun_log.logging(
  101. code="2000",
  102. message=f"get_videoList:{response.text}\n",
  103. )
  104. return
  105. elif not response.json()["data"]["videoList"]:
  106. self.aliyun_log.logging(
  107. code="2000",
  108. message=f"没有更多数据啦~\n",
  109. )
  110. return
  111. else:
  112. video_list = response.json()["data"]["videoList"]
  113. for video in video_list:
  114. try:
  115. self.aliyun_log.logging(
  116. code="1001",
  117. data=video,
  118. message="扫描到一条视频"
  119. )
  120. # 判断时间是否符合要求
  121. if self.date_flag(video, user_account):
  122. return
  123. else:
  124. self.process_video_obj(video, user_account)
  125. except Exception as e:
  126. self.aliyun_log.logging(
  127. code="3000",
  128. data=video,
  129. message="抓取单条视频异常, 报错原因是: {}".format(e)
  130. )
  131. def rule_maker(self, account):
  132. """
  133. 通过不同的账号生成不同的规则
  134. :param account: 输入的账号信息
  135. {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}
  136. """
  137. flag = account.split("")
  138. if flag == "V1":
  139. rule_dict = {
  140. "play_cnt": {"min": 50000, "max": 0},
  141. 'period': {"min": 15, "max": 15},
  142. 'special': True
  143. }
  144. return rule_dict
  145. elif flag == "V2":
  146. rule_dict = {
  147. "play_cnt": {"min": 10000, "max": 0},
  148. 'period': {"min": 7, "max": 7},
  149. 'special': True
  150. }
  151. return rule_dict
  152. elif flag == "V3":
  153. rule_dict = {
  154. "play_cnt": {"min": 5000, "max": 0},
  155. 'period': {"min": 3, "max": 3},
  156. 'special': True
  157. }
  158. return rule_dict
  159. else:
  160. return self.rule_dict
  161. def date_flag(self, video, user_account):
  162. """
  163. 判断时间是否满足条件
  164. :param video: 视频信息
  165. :param user_account: 用户账号
  166. :return:
  167. """
  168. rule = self.rule_maker(user_account)