"""Xigua Video crawler: fetch the videos of author accounts under the new rule set."""
import os
import sys
import uuid
import time
import random
import re
import string
import asyncio

import requests

# The project packages are resolved relative to the working directory, so the
# path tweak has to run before the ``application`` imports below.
sys.path.append(os.getcwd())

from application.common.messageQueue import MQ  # noqa: E402
from application.common.proxies import tunnel_proxies  # noqa: E402
from application.common.log import AliyunLogger  # noqa: E402

# Replacement for the character at index 18 of a generated signature; any
# character not listed here is replaced by "y".
_SIGNATURE_MARKERS = {"8": "w", "9": "x", "-": "y", ".": "z"}

# Endpoint and page size used when listing an author's videos.
_VIDEO_LIST_URL = "https://www.ixigua.com/api/videov2/author/new_video_list?"
_HOME_PREFIX = "https://www.ixigua.com/home/"
_PAGE_SIZE = 30

# Tier tag -> (minimum play count, period); "max" of play_cnt is always 0 and
# the period uses the same value for "min" and "max".
_TIER_RULES = {
    "V1": (50000, 15),
    "V2": (10000, 7),
    "V3": (5000, 3),
}

# Splits an account label into alphanumeric tokens.
_TOKEN_SPLIT = re.compile(r"[^0-9A-Za-z]+")


async def create_signature():
    """Build a random 26-character value for the ``_signature`` query parameter.

    The result is ten ``A`` characters, twelve characters taken from a
    shuffled mix of digits, upper-case and lower-case letters, and the fixed
    tail ``AAAB``; the character at index 18 is then swapped for a marker
    letter.

    The function is a coroutine only so that callers can ``await`` it; it
    performs no I/O.

    :return: the generated signature string
    """
    digit_count = random.randint(1, 6)
    upper_count = random.randint(1, 26 - digit_count - 1)
    lower_count = 26 - (digit_count + upper_count)

    pool = (
        random.sample(string.digits, digit_count)
        + random.sample(string.ascii_uppercase, upper_count)
        + random.sample(string.ascii_lowercase, lower_count)
    )
    random.shuffle(pool)

    signature = "AAAAAAAAAA" + "".join(pool)[10:-4] + "AAAB"
    marker = _SIGNATURE_MARKERS.get(signature[18], "y")
    return signature[:18] + marker + signature[-7:]


class XiGuaAuthor:
    """Crawler that walks the video lists of Xigua author accounts."""

    def __init__(self, platform, mode, rule_dict, user_list, env):
        """Store the crawl configuration and create the MQ / logging clients.

        :param platform: platform name, forwarded to the logger
        :param mode: crawl mode, forwarded to the logger
        :param rule_dict: default filtering rule, returned by ``rule_maker``
            when an account has no tier-specific rule
        :param user_list: iterable of account records; each record is indexed
            with ``["link"]`` when its videos are fetched
        :param env: environment name, appended to the MQ topic name
        """
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.user_list = user_list
        self.env = env
        self.download_cnt = 0
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.expire_flag = False
        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)

    async def process_author_list(self):
        """Crawl every account in ``self.user_list``, one after another.

        :return: None
        """
        for user_account in self.user_list:
            await self.process_each_author(user_account)

    async def process_each_author(self, user_account):
        """Page through one account's video list and handle each video.

        Paging stops when the request fails, when the response is not a
        usable video list, when a page is empty, or when ``date_flag``
        returns a truthy value for a video.

        :param user_account: account record; ``user_account["link"]`` is the
            author's home-page URL
        :return: None
        """
        user_id = user_account["link"].replace(_HOME_PREFIX, "")
        signature = await create_signature()
        headers = {
            "referer": f"https://www.ixigua.com/home/{user_id}/video/?preActiveKey=hotsoon&list_entrance=userdetail",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
        }
        off_set = 0
        while True:
            params = {
                "to_user_id": str(user_id),
                "offset": str(off_set),
                "limit": str(_PAGE_SIZE),
                "maxBehotTime": "0",
                "order": "new",
                "isHome": "0",
                "_signature": signature,
            }
            # NOTE(review): requests.get blocks the event loop for up to the
            # timeout; acceptable while accounts are crawled sequentially.
            try:
                response = requests.get(
                    url=_VIDEO_LIST_URL,
                    headers=headers,
                    params=params,
                    timeout=5,
                )
            except requests.RequestException as exc:
                # A network failure is reported like any other failed fetch
                # instead of aborting the remaining accounts.
                self.aliyun_log.logging(
                    code="2000",
                    message=f"get_videoList:{exc}\n",
                )
                return
            off_set += _PAGE_SIZE

            if response.status_code != 200 or "data" not in response.text:
                self.aliyun_log.logging(
                    code="2000",
                    message=f"get_videoList:{response.text}\n",
                )
                return

            # Parse the body once; a body that merely contains the text
            # "data" may still be malformed or lack the expected keys.
            try:
                video_list = response.json()["data"]["videoList"]
            except (ValueError, KeyError, TypeError):
                self.aliyun_log.logging(
                    code="2000",
                    message=f"get_videoList:{response.text}\n",
                )
                return

            if not video_list:
                self.aliyun_log.logging(
                    code="2000",
                    message="没有更多数据啦~\n",
                )
                return

            for video in video_list:
                try:
                    self.aliyun_log.logging(
                        code="1001",
                        data=video,
                        message="扫描到一条视频",
                    )
                    # A truthy date_flag result ends the crawl for this account.
                    if self.date_flag(video, user_account):
                        return
                    self.process_video_obj(video, user_account)
                except Exception as e:
                    # One bad video must not stop the rest of the page.
                    self.aliyun_log.logging(
                        code="3000",
                        data=video,
                        message="抓取单条视频异常, 报错原因是: {}".format(e),
                    )

    @staticmethod
    def _account_tier(account):
        """Return the tier tag (``"V1"``/``"V2"``/``"V3"``) of *account*, or None."""
        if not isinstance(account, str):
            return None
        tokens = [token for token in _TOKEN_SPLIT.split(account) if token]
        # NOTE(review): the tag is assumed to be the leading or trailing
        # alphanumeric token of the label -- confirm against the account
        # naming convention.
        for candidate in tokens[:1] + tokens[-1:]:
            if candidate in _TIER_RULES:
                return candidate
        return None

    def rule_maker(self, account):
        """Pick the filtering rule that applies to an account.

        Accounts tagged ``V1``, ``V2`` or ``V3`` get a dedicated rule; every
        other input gets ``self.rule_dict``.  A rule has the shape
        ``{'play_cnt': {'min': 100000, 'max': 0}, 'period': {'min': 5, 'max': 5}}``
        (tier rules additionally carry ``'special': True``).

        The previous implementation called ``account.split("")``, which
        always raises ``ValueError`` (empty separator), so no rule was ever
        produced.

        NOTE(review): ``date_flag`` passes the whole account record (a
        mapping), for which no tier can be derived here, so such calls get
        the default rule -- confirm which field carries the tier tag.

        :param account: account label
        :return: a newly built tier rule dict, or ``self.rule_dict``
        """
        tier = self._account_tier(account)
        if tier is None:
            return self.rule_dict
        play_min, period = _TIER_RULES[tier]
        return {
            "play_cnt": {"min": play_min, "max": 0},
            "period": {"min": period, "max": period},
            "special": True,
        }

    def date_flag(self, video, user_account):
        """Decide whether the video's time satisfies the account's rule.

        :param video: video information
        :param user_account: account record the video belongs to
        :return: see the note below
        """
        # NOTE(review): nothing after this assignment is visible here; as
        # written the method returns None (falsy) -- confirm the remaining
        # logic before relying on it to stop paging.
        rule = self.rule_maker(user_account)