|
@@ -0,0 +1,187 @@
|
|
|
+"""
|
|
|
+西瓜视频——新规则抓取
|
|
|
+"""
|
|
|
+import os
|
|
|
+import sys
|
|
|
+import uuid
|
|
|
+import time
|
|
|
+import random
|
|
|
+import string
|
|
|
+import asyncio
|
|
|
+import requests
|
|
|
+
|
|
|
+sys.path.append(os.getcwd())
|
|
|
+
|
|
|
+from application.common.messageQueue import MQ
|
|
|
+from application.common.proxies import tunnel_proxies
|
|
|
+from application.common.log import AliyunLogger
|
|
|
+
|
|
|
+
|
|
|
async def create_signature():
    """
    Build a random 26-character ``_signature`` string.

    A pool of 26 distinct characters is drawn (1-6 digits, at least one
    upper-case letter, the remainder lower-case letters) and shuffled. Twelve
    characters from the middle of the shuffled pool are wrapped in the fixed
    prefix ``"AAAAAAAAAA"`` and the fixed suffix ``"AAAB"``; finally the
    character at index 18 is replaced by a marker letter.

    :return: the signature string (always 26 characters long)
    """
    total_len = 26
    n_digits = random.randint(1, 6)
    # Leave room for at least one lower-case character.
    n_upper = random.randint(1, total_len - n_digits - 1)
    n_lower = total_len - n_digits - n_upper

    # Draw order (digits, upper, lower) matters for reproducibility under a
    # fixed random seed.
    pool = []
    for alphabet, count in (
        (string.digits, n_digits),
        (string.ascii_uppercase, n_upper),
        (string.ascii_lowercase, n_lower),
    ):
        pool.extend(random.sample(alphabet, count))
    random.shuffle(pool)

    candidate = "AAAAAAAAAA" + "".join(pool[10:-4]) + "AAAB"

    # Marker substituted at index 18; any character not listed maps to "y".
    marker_by_char = {"8": "w", "9": "x", "-": "y", ".": "z"}
    marker = marker_by_char.get(candidate[18], "y")
    return candidate[:18] + marker + candidate[-7:]
|
|
|
+
|
|
|
+
|
|
|
class XiGuaAuthor(object):
    """
    Crawl the video lists of Xigua (ixigua.com) author accounts.

    For every account in ``user_list`` the author's video-list endpoint is
    paged through 30 entries at a time. Each video is logged, checked with
    :meth:`date_flag`, and then handed to ``process_video_obj``.

    NOTE(review): ``process_video_obj`` is called by
    :meth:`process_each_author` but is not defined in this class, and
    :meth:`date_flag` does not compare any dates yet. Until both exist,
    every scanned video ends up in the per-video error log (code "3000").
    """

    _HOME_PREFIX = "https://www.ixigua.com/home/"
    _LIST_URL = "https://www.ixigua.com/api/videov2/author/new_video_list?"
    _PAGE_SIZE = 30
    # (tier tag, minimum play count, period) for accounts with special rules.
    _TIER_RULES = (
        ("V1", 50000, 15),
        ("V2", 10000, 7),
        ("V3", 5000, 3),
    )

    def __init__(self, platform, mode, rule_dict, user_list, env):
        """
        :param platform: platform name, forwarded to the logger
        :param mode: crawl mode, forwarded to the logger
        :param rule_dict: default rule returned by :meth:`rule_maker` when an
            account carries no tier tag
        :param user_list: iterable of accounts; each must provide a ``"link"``
            entry holding the author's home page URL
        :param env: environment name, appended to the MQ topic name
        """
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.user_list = user_list
        self.env = env
        # Not read anywhere in the code visible in this file.
        self.download_cnt = 0
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        # Not read anywhere in the code visible in this file.
        self.expire_flag = False
        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)

    async def process_author_list(self):
        """
        Crawl every account in ``self.user_list``, one after another.

        :return: None
        """
        for user_account in self.user_list:
            await self.process_each_author(user_account)

    def _request_video_page(self, user_id, off_set, signature):
        """
        Perform the blocking GET for one page of an author's video list.

        :param user_id: author id taken from the account's home page link
        :param off_set: index of the first entry of the requested page
        :param signature: value sent as the ``_signature`` query parameter
        :return: the ``requests`` response object
        """
        params = {
            "to_user_id": user_id,
            "offset": str(off_set),
            "limit": str(self._PAGE_SIZE),
            "maxBehotTime": "0",
            "order": "new",
            "isHome": "0",
            "_signature": signature,
        }
        headers = {
            "referer": f"https://www.ixigua.com/home/{user_id}/video/?preActiveKey=hotsoon&list_entrance=userdetail",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
        }
        return requests.get(
            url=self._LIST_URL,
            headers=headers,
            params=params,
            proxies=tunnel_proxies(),
            timeout=5,
        )

    async def process_each_author(self, user_account):
        """
        Crawl the video list of a single account, page by page.

        Paging stops when a request fails, the response is not usable, the
        list is empty, or :meth:`date_flag` returns a truthy value.

        :param user_account: account entry; ``user_account["link"]`` must be
            the author's home page URL
        :return: None
        """
        user_id = str(user_account["link"].replace(self._HOME_PREFIX, ""))
        signature = await create_signature()
        loop = asyncio.get_event_loop()
        off_set = 0
        while True:
            try:
                # requests is blocking; run it in the default executor so the
                # event loop stays responsive while waiting on the network.
                response = await loop.run_in_executor(
                    None, self._request_video_page, user_id, off_set, signature
                )
            except requests.RequestException as e:
                # A network failure ends this account only, not the whole run.
                self.aliyun_log.logging(
                    code="2000",
                    message=f"get_videoList request failed: {e}\n",
                )
                return
            off_set += self._PAGE_SIZE
            if "data" not in response.text or response.status_code != 200:
                self.aliyun_log.logging(
                    code="2000",
                    message=f"get_videoList:{response.text}\n",
                )
                return
            try:
                # The substring check above does not guarantee valid JSON or
                # the expected data.videoList path, so parse defensively.
                video_list = response.json()["data"]["videoList"]
            except (ValueError, KeyError, TypeError):
                self.aliyun_log.logging(
                    code="2000",
                    message=f"get_videoList:{response.text}\n",
                )
                return
            if not video_list:
                self.aliyun_log.logging(
                    code="2000",
                    message="没有更多数据啦~\n",
                )
                return
            for video in video_list:
                try:
                    self.aliyun_log.logging(
                        code="1001",
                        data=video,
                        message="扫描到一条视频",
                    )
                    # Check whether the video's time meets the requirement;
                    # a truthy flag stops crawling this account.
                    if self.date_flag(video, user_account):
                        return
                    # NOTE(review): process_video_obj is not defined in this
                    # file; the resulting AttributeError is logged below.
                    self.process_video_obj(video, user_account)
                except Exception as e:
                    self.aliyun_log.logging(
                        code="3000",
                        data=video,
                        message="抓取单条视频异常, 报错原因是: {}".format(e),
                    )

    def rule_maker(self, account):
        """
        Build the crawl rule for an account from its tier tag.

        An account whose label ends in ``V1``, ``V2`` or ``V3`` gets a
        dedicated rule; any other account gets ``self.rule_dict``. Example
        rule: ``{'play_cnt': {'min': 100000, 'max': 0},
        'period': {'min': 5, 'max': 5}}``.

        :param account: account information; either the label string itself
            or an account dict
        :return: a new rule dict for tiered accounts, else ``self.rule_dict``
        """
        if isinstance(account, dict):
            # NOTE(review): the key that carries the tier label is not visible
            # in this file; "nick_name" is an assumption - confirm against
            # the schema of the entries in user_list.
            label = account.get("nick_name")
        else:
            label = account
        label = str(label).strip() if label else ""

        for tier, min_play_cnt, period in self._TIER_RULES:
            if label.endswith(tier):
                # Fresh dict per call so callers cannot mutate shared state.
                return {
                    "play_cnt": {"min": min_play_cnt, "max": 0},
                    "period": {"min": period, "max": period},
                    "special": True,
                }
        return self.rule_dict

    def date_flag(self, video, user_account):
        """
        Check a video's time against the account's rule (not implemented yet).

        A truthy result makes :meth:`process_each_author` stop crawling the
        account.

        :param video: video information
        :param user_account: user account
        :return: currently always ``False``
        """
        rule = self.rule_maker(user_account)
        # TODO: compare the video's time against ``rule``; the comparison was
        # never written, so no video is ever flagged.
        return False
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|