123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186 |
- """
- 西瓜视频——新规则抓取
- """
- import os
- import sys
- import uuid
- import time
- import random
- import string
- import asyncio
- import requests
- sys.path.append(os.getcwd())
- from application.common.messageQueue import MQ
- from application.common.proxies import tunnel_proxies
- from application.common.log import AliyunLogger
async def create_signature():
    """
    Build a random 26-character value for the ``_signature`` query parameter.

    The result always starts with ten ``"A"`` characters and ends with
    ``"AAAB"``. The twelve characters in between come from a shuffled pool
    of random digits and ASCII letters, except that the character at
    index 18 is replaced by a marker letter.

    :return: the generated signature string
    """
    # Draw how many characters of each class go into the 26-character pool.
    digit_count = random.randint(1, 6)
    upper_count = random.randint(1, 26 - digit_count - 1)
    lower_count = 26 - (digit_count + upper_count)

    pool = (
        random.sample(string.digits, digit_count)
        + random.sample(string.ascii_uppercase, upper_count)
        + random.sample(string.ascii_lowercase, lower_count)
    )
    random.shuffle(pool)

    # Fixed prefix/suffix around the middle slice of the shuffled pool.
    candidate = "AAAAAAAAAA" + "".join(pool)[10:-4] + "AAAB"

    # Marker letter chosen from the character currently at index 18;
    # any character not listed here maps to "y".
    marker_by_char = {"8": "w", "9": "x", "-": "y", ".": "z"}
    marker = marker_by_char.get(candidate[18], "y")

    return candidate[0:18] + marker + candidate[-7:]
- class XiGuaAuthor(object):
- """
- 西瓜账号抓取object
- """
def __init__(self, platform, mode, rule_dict, user_list, env):
    """
    Store the crawl configuration and create the queue and logger clients.

    :param platform: platform identifier, forwarded to the logger
    :param mode: crawl mode identifier, forwarded to the logger
    :param rule_dict: default rule set, returned by ``rule_maker`` as fallback
    :param user_list: accounts iterated by ``process_author_list``
    :param env: environment name, appended to the MQ topic name
    """
    # Caller-supplied configuration.
    self.platform, self.mode = platform, mode
    self.rule_dict, self.user_list = rule_dict, user_list
    self.env = env

    # Mutable crawl state, initialised here.
    self.download_cnt = 0

    # Queue client is created before the logger, as in the original order.
    self.mq = MQ(topic_name="topic_crawler_etl_" + env)
    self.expire_flag = False
    self.aliyun_log = AliyunLogger(platform=platform, mode=mode)
async def process_author_list(self):
    """
    Crawl every account in ``self.user_list``, one after another.

    Each account is awaited to completion before the next one starts.

    :return: None
    """
    for account in self.user_list:
        await self.process_each_author(account)
async def process_each_author(self, user_account):
    """
    Crawl the video list of a single account, page by page.

    Pages of 30 videos are requested until one of the following happens:
    the response is unusable (non-200 status, no ``data`` object, or a body
    that is not valid JSON), the page has no videos, or ``self.date_flag``
    returns a truthy value for a video.

    :param user_account: account record; its ``"link"`` value is the home
        page URL from which the user id is derived
    :return: None
    """
    # Everything derived from the account is constant across pages, so it
    # is computed once instead of on every loop iteration.
    user_id = user_account["link"].replace("https://www.ixigua.com/home/", "")
    signature = await create_signature()
    url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
    headers = {
        "referer": f"https://www.ixigua.com/home/{user_id}/video/?preActiveKey=hotsoon&list_entrance=userdetail",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41",
    }
    off_set = 0
    while True:
        params = {
            "to_user_id": str(user_id),
            "offset": str(off_set),
            "limit": "30",
            "maxBehotTime": "0",
            "order": "new",
            "isHome": "0",
            "_signature": signature,
        }
        # NOTE(review): requests.get is a blocking call inside a coroutine;
        # it stalls the event loop for up to the 5 s timeout. Left as is.
        response = requests.get(
            url=url,
            headers=headers,
            params=params,
            timeout=5,
        )
        off_set += 30

        # Parse the body exactly once. The substring check alone does not
        # prove the body is JSON, so a decode failure or a non-object
        # payload is treated like any other unusable response.
        data = None
        if response.status_code == 200 and "data" in response.text:
            try:
                data = response.json().get("data")
            except (ValueError, AttributeError):
                data = None
        if not isinstance(data, dict):
            self.aliyun_log.logging(
                code="2000",
                message=f"get_videoList:{response.text}\n",
            )
            return

        video_list = data.get("videoList")
        if not video_list:
            # Missing or empty list: nothing more to fetch for this account.
            self.aliyun_log.logging(
                code="2000",
                message="没有更多数据啦~\n",
            )
            return

        for video in video_list:
            try:
                self.aliyun_log.logging(
                    code="1001",
                    data=video,
                    message="扫描到一条视频"
                )
                # A truthy date check ends the crawl for this account;
                # otherwise the video is handed on for processing.
                if self.date_flag(video, user_account):
                    return
                self.process_video_obj(video, user_account)
            except Exception as e:
                # One bad video must not stop the rest of the page.
                self.aliyun_log.logging(
                    code="3000",
                    data=video,
                    message="抓取单条视频异常, 报错原因是: {}".format(e)
                )
def rule_maker(self, account):
    """
    Select the rule set that applies to an account tier.

    :param account: account information. When it is a string, its first
        whitespace-separated token is read as the tier tag
        (``"V1"``, ``"V2"`` or ``"V3"``).
    :return: a newly built rule dict for a known tier, e.g.
        {'play_cnt': {'min': 50000, 'max': 0}, 'period': {'min': 15, 'max': 15}, 'special': True};
        otherwise ``self.rule_dict`` itself (not a copy).
    """
    # Tier tag -> (minimum play count, period bound used for both min and max).
    tier_table = {
        "V1": (50000, 15),
        "V2": (10000, 7),
        "V3": (5000, 3),
    }

    # The earlier account.split("") always raised ValueError (empty
    # separator), and its list result could never equal a tag string, so
    # no tier branch was reachable. Take the leading token instead.
    # NOTE(review): callers in this file pass the account record (a dict);
    # which of its fields carries the tier tag is not visible here, so
    # non-string input uses the default rules -- confirm the intended field.
    tokens = account.split() if isinstance(account, str) else []
    flag = tokens[0] if tokens else None

    if flag not in tier_table:
        return self.rule_dict

    min_play, period = tier_table[flag]
    # Built fresh on every call so callers may mutate the result safely.
    return {
        "play_cnt": {"min": min_play, "max": 0},
        "period": {"min": period, "max": period},
        "special": True,
    }
- def date_flag(self, video, user_account):
- """
- 判断时间是否满足条件
- :param video: 视频信息
- :param user_account: 用户账号
- :return:
- """
- rule = self.rule_maker(user_account)
|