""" 西瓜视频——新规则抓取 """ import os import sys import uuid import time import random import string import asyncio import requests sys.path.append(os.getcwd()) from application.common.messageQueue import MQ from application.common.proxies import tunnel_proxies from application.common.log import AliyunLogger async def create_signature(): """ 随机生成签名 :return: """ src_digits = string.digits # string_数字 src_uppercase = string.ascii_uppercase # string_大写字母 src_lowercase = string.ascii_lowercase # string_小写字母 digits_num = random.randint(1, 6) uppercase_num = random.randint(1, 26 - digits_num - 1) lowercase_num = 26 - (digits_num + uppercase_num) password = ( random.sample(src_digits, digits_num) + random.sample(src_uppercase, uppercase_num) + random.sample(src_lowercase, lowercase_num) ) random.shuffle(password) new_password = "AAAAAAAAAA" + "".join(password)[10:-4] + "AAAB" new_password_start = new_password[0:18] new_password_end = new_password[-7:] if new_password[18] == "8": new_password = new_password_start + "w" + new_password_end elif new_password[18] == "9": new_password = new_password_start + "x" + new_password_end elif new_password[18] == "-": new_password = new_password_start + "y" + new_password_end elif new_password[18] == ".": new_password = new_password_start + "z" + new_password_end else: new_password = new_password_start + "y" + new_password_end return new_password class XiGuaAuthor(object): """ 西瓜账号抓取object """ def __init__(self, platform, mode, rule_dict, user_list, env): self.platform = platform self.mode = mode self.rule_dict = rule_dict self.user_list = user_list self.env = env self.download_cnt = 0 self.mq = MQ(topic_name="topic_crawler_etl_" + self.env) self.expire_flag = False self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode) async def process_author_list(self): """ 抓取多个账号 :return: """ for user_account in self.user_list: await self.process_each_author(user_account) async def process_each_author(self, user_account): """ 抓取单个账号的视频列表; :return: """ off_set = 0 signature = await create_signature() url = "https://www.ixigua.com/api/videov2/author/new_video_list?" while True: params = { "to_user_id": str( user_account["link"].replace("https://www.ixigua.com/home/", "") ), "offset": str(off_set), "limit": "30", "maxBehotTime": "0", "order": "new", "isHome": "0", "_signature": signature, } headers = { "referer": f'https://www.ixigua.com/home/{user_account["link"].replace("https://www.ixigua.com/home/", "")}/video/?preActiveKey=hotsoon&list_entrance=userdetail', "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41", } response = requests.get( url=url, headers=headers, params=params, timeout=5 ) off_set += 30 if "data" not in response.text or response.status_code != 200: self.aliyun_log.logging( code="2000", message=f"get_videoList:{response.text}\n", ) return elif not response.json()["data"]["videoList"]: self.aliyun_log.logging( code="2000", message=f"没有更多数据啦~\n", ) return else: video_list = response.json()["data"]["videoList"] for video in video_list: try: self.aliyun_log.logging( code="1001", data=video, message="扫描到一条视频" ) # 判断时间是否符合要求 if self.date_flag(video, user_account): return else: self.process_video_obj(video, user_account) except Exception as e: self.aliyun_log.logging( code="3000", data=video, message="抓取单条视频异常, 报错原因是: {}".format(e) ) def rule_maker(self, account): """ 通过不同的账号生成不同的规则 :param account: 输入的账号信息 {'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}} """ flag = account.split("") if flag == "V1": rule_dict = { "play_cnt": {"min": 50000, "max": 0}, 'period': {"min": 15, "max": 15}, 'special': True } return rule_dict elif flag == "V2": rule_dict = { "play_cnt": {"min": 10000, "max": 0}, 'period': {"min": 7, "max": 7}, 'special': True } return rule_dict elif flag == "V3": rule_dict = { "play_cnt": {"min": 5000, "max": 0}, 'period': {"min": 3, "max": 3}, 'special': True } return rule_dict else: return self.rule_dict def date_flag(self, video, user_account): """ 判断时间是否满足条件 :param video: 视频信息 :param user_account: 用户账号 :return: """ rule = self.rule_maker(user_account)