# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/5/26
"""Crawler for the videos published on Xigua Video author home pages.

For every configured author the crawler pages through the author's video
list, fetches the detail of each video, applies the download rules and the
title filter words, skips videos that are already stored, and pushes the
remaining ones to the ETL message queue.
"""
import base64
import json
import os
import random
import string
import sys
import time

import requests
import urllib3
from requests.adapters import HTTPAdapter

# Put the working directory on sys.path before the first ``common.*`` import
# (it used to be appended only after ``common.mq`` had already been imported).
sys.path.append(os.getcwd())
from common.mq import MQ
from common.userAgent import get_random_user_agent
from common.scheduling_db import MysqlHelper
from common.common import Common
from common.public import get_config_from_mysql, download_rule


class XiguaauthorScheduling:
    """Class-method collection implementing the Xigua author-page crawl."""

    platform = "西瓜视频"

    # Containers inside ``videoResource``, most preferred first. Only the
    # first container that is present is consulted.
    _RESOURCE_KEYS = ("dash_120fps", "dash", "normal")
    # Entries inside a container's ``video_list``, most preferred first.
    _QUALITY_KEYS = ("video_4", "video_3", "video_2", "video_1")

    @classmethod
    def _log_both(cls, log_type, crawler, env, message, level="info"):
        """Write *message* to the local logger and to ``Common.logging``."""
        getattr(Common.logger(log_type, crawler), level)(message)
        Common.logging(log_type, crawler, env, message)

    @classmethod
    def _session_get(cls, url, **kwargs):
        """GET *url* through the tunnel proxy with a short-lived session.

        The session retries failed connections up to three times and is
        closed before returning, so no connection pool is leaked. The body
        is fully downloaded (no streaming), so ``response.text`` and
        ``response.json()`` stay usable after the session is closed.

        :param url: endpoint to request.
        :param kwargs: forwarded to ``Session.get`` (headers, params, cookies).
        :return: the ``requests.Response``.
        """
        # Certificate verification is disabled for these requests, so the
        # resulting urllib3 warnings are silenced.
        urllib3.disable_warnings()
        with requests.Session() as session:
            session.mount('http://', HTTPAdapter(max_retries=3))
            session.mount('https://', HTTPAdapter(max_retries=3))
            return session.get(url=url, verify=False, proxies=Common.tunnel_proxies(),
                               timeout=5, **kwargs)

    @classmethod
    def random_signature(cls):
        """Build a random 26-character value for the ``_signature`` parameter.

        The value is ten ``A`` characters, twelve random alphanumerics and
        the suffix ``AAAB``; the character at index 18 is then replaced by
        ``w`` (if it was ``8``), ``x`` (if ``9``), ``z`` (if ``.``) or ``y``
        (anything else).

        :return: the signature string.
        """
        digits_num = random.randint(1, 6)
        uppercase_num = random.randint(1, 26 - digits_num - 1)
        lowercase_num = 26 - (digits_num + uppercase_num)
        password = (
            random.sample(string.digits, digits_num)
            + random.sample(string.ascii_uppercase, uppercase_num)
            + random.sample(string.ascii_lowercase, lowercase_num)
        )
        random.shuffle(password)
        new_password = 'AAAAAAAAAA' + ''.join(password)[10:-4] + 'AAAB'
        replacement = {'8': 'w', '9': 'x', '.': 'z'}.get(new_password[18], 'y')
        return new_password[:18] + replacement + new_password[-7:]

    @staticmethod
    def _decode_b64_url(encoded):
        """Decode a base64 URL string, restoring any stripped ``=`` padding.

        Base64 text must be a multiple of four characters long; the previous
        implementation padded on ``len % 3`` and could therefore fail with
        ``binascii.Error`` on unpadded input.
        """
        padded = encoded + '=' * (-len(encoded) % 4)
        return base64.b64decode(padded).decode('utf8')

    @classmethod
    def _select_streams(cls, resource):
        """Pick the ``(video_entry, audio_entry)`` pair from one container.

        The best available ``video_list`` entry is used for both video and
        audio; otherwise the last items of ``dynamic_video_list`` and
        ``dynamic_audio_list`` are used when both lists are non-empty.

        :return: a 2-tuple of dicts, or ``None`` when nothing usable exists.
        """
        video_list = resource.get("video_list") or {}
        for quality in cls._QUALITY_KEYS:
            if quality in video_list:
                return video_list[quality], video_list[quality]
        dynamic = resource.get("dynamic_video") or {}
        video_streams = dynamic.get("dynamic_video_list")
        audio_streams = dynamic.get("dynamic_audio_list")
        if video_streams and audio_streams:
            return video_streams[-1], audio_streams[-1]
        return None

    @classmethod
    def get_video_url(cls, video_info):
        """Extract the playable URLs and the frame size from *video_info*.

        :param video_info: the ``video`` dict of the detail API response.
        :return: dict with ``video_url``, ``audio_url``, ``video_width`` and
            ``video_height``; empty strings / zeros when no stream is found.
        :raises KeyError: if the chosen entry lacks ``backup_url_1``,
            ``vwidth`` or ``vheight``.
        """
        empty = {"video_url": '', "audio_url": '', "video_width": 0, "video_height": 0}
        resources = video_info.get('videoResource') or {}
        # Only the first container present is inspected; there is no
        # fall-through to a lower-priority container.
        resource = next((resources[key] for key in cls._RESOURCE_KEYS if key in resources), None)
        streams = cls._select_streams(resource) if resource else None
        if streams is None:
            return empty
        video_stream, audio_stream = streams
        return {
            "video_url": cls._decode_b64_url(video_stream['backup_url_1']),
            "audio_url": cls._decode_b64_url(audio_stream['backup_url_1']),
            "video_width": video_stream['vwidth'],
            "video_height": video_stream['vheight'],
        }

    @classmethod
    def get_comment_cnt(cls, item_id):
        """Return the comment count reported for *item_id*.

        :param item_id: id of the video.
        :return: ``total_number`` from the comment API, or ``0`` when the
            request is not answered with HTTP 200 or the field is missing.
        """
        url = "https://www.ixigua.com/tlb/comment/article/v5/tab_comments/?"
        params = {
            "tab_index": "0",
            "count": "10",
            "offset": "10",
            "group_id": str(item_id),
            "item_id": str(item_id),
            "aid": "1768",
            "msToken": "50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==",
            "X-Bogus": "DFSzswVOyGtANVeWtCLMqR/F6q9U",
            "_signature": cls.random_signature(),
        }
        # NOTE(review): the cookie and tokens below are hard-coded session
        # credentials; consider moving them to configuration.
        headers = {
            'authority': 'www.ixigua.com',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'cache-control': 'no-cache',
            'cookie': 'MONITOR_WEB_ID=67cb5099-a022-4ec3-bb8e-c4de6ba51dd0; passport_csrf_token=72b2574f3c99f8ba670e42df430218fd; passport_csrf_token_default=72b2574f3c99f8ba670e42df430218fd; sid_guard=c7472b508ea631823ba765a60cf8757f%7C1680867422%7C3024002%7CFri%2C+12-May-2023+11%3A37%3A04+GMT; uid_tt=c13f47d51767f616befe32fb3e9f485a; uid_tt_ss=c13f47d51767f616befe32fb3e9f485a; sid_tt=c7472b508ea631823ba765a60cf8757f; sessionid=c7472b508ea631823ba765a60cf8757f; sessionid_ss=c7472b508ea631823ba765a60cf8757f; sid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; ssid_ucp_v1=1.0.0-KGUzNWYxNmRkZGJiZjgxY2MzZWNkMTEzMTkwYjY1Yjg5OTY5NzVlNmMKFQiu3d-eqQIQ3oDAoQYYGCAMOAhACxoCaGwiIGM3NDcyYjUwOGVhNjMxODIzYmE3NjVhNjBjZjg3NTdm; odin_tt=b893608d4dde2e1e8df8cd5d97a0e2fbeafc4ca762ac72ebef6e6c97e2ed19859bb01d46b4190ddd6dd17d7f9678e1de; SEARCH_CARD_MODE=7168304743566296612_0; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=7Pux7s634-z8DYvCM20y7KigwH5u7Rh6D9C-RROpnT.aGMEcz6Vsxp.oai47wJqa4f86; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1683858689%7Ca5223fe1500578e01e138a0d71d6444692018296c4c24f5885af174a65873c95; ixigua-a-s=3; msToken=50-JJObWB07HfHs-BMJWT1eIDX3G-6lPSF_i-QwxBIXE9VVa-iN0jbEXR5pG2DKjXBmP299n6ZTuXzY-GAy968CCvouSAYIS4GzvGQT3pNlKNejr5G4-1g==; __ac_nonce=0645dcbf0005064517440; __ac_signature=_02B4Z6wo00f01FEGmAwAAIDBKchzCGqn-MBRJpyAAHAjieFC5GEg6gGiwz.I4PRrJl7f0GcixFrExKmgt6QI1i1S-dQyofPEj2ugWTCnmKUdJQv-wYuDofeKNe8VtMtZq2aKewyUGeKU-5Ud21; ixigua-a-s=3',
            'pragma': 'no-cache',
            'referer': f'https://www.ixigua.com/{item_id}?logTag=3c5aa86a8600b9ab8540',
            'sec-ch-ua': '"Microsoft Edge";v="113", "Chromium";v="113", "Not-A.Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'tt-anti-token': 'cBITBHvmYjEygzv-f9c78c1297722cf1f559c74b084e4525ce4900bdcf9e8588f20cc7c2e3234422',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35',
            'x-secsdk-csrf-token': '000100000001f8e733cf37f0cd255a51aea9a81ff7bc0c09490cfe41ad827c3c5c18ec809279175e4d9f5553d8a5'
        }
        response = cls._session_get(url, headers=headers, params=params)
        if response.status_code != 200:
            return 0
        # Parse the body once instead of once per check.
        payload = response.json()
        if not isinstance(payload, dict):
            return 0
        # ``or 0`` also covers an explicit null so callers can always int() it.
        return payload.get("total_number") or 0

    @classmethod
    def get_video_info(cls, log_type, crawler, item_id):
        """Fetch the detail of one video.

        :param log_type: logger channel, passed to ``Common.logger``.
        :param crawler: crawler name, passed to ``Common.logger``.
        :param item_id: id of the video to look up.
        :return: dict describing the video, or ``None`` when the API returns
            no usable data.
        """
        url = 'https://www.ixigua.com/api/mixVideo/information?'
        headers = {
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh-Hans;q=0.9",
            "user-agent": get_random_user_agent('pc'),
            "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
        }
        params = {
            'mixId': str(item_id),
            'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
                       'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
            'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
            '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
                          'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
        }
        cookies = {
            'ixigua-a-s': '1',
            'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
                       'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
            'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
                     '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
            'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
            'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
            '__ac_nonce': '06304878000964fdad287',
            '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
                              'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
            'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
            '_tea_utm_cache_1300': 'undefined',
            'support_avif': 'false',
            'support_webp': 'false',
            'xiguavideopcwebid': '7134967546256016900',
            'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
        }
        response = cls._session_get(url, headers=headers, params=params, cookies=cookies)
        payload = response.json() if response.status_code == 200 else {}
        data = payload.get('data') if isinstance(payload, dict) else None
        if not data:
            Common.logger(log_type, crawler).warning(
                f"get_video_info:{response.status_code}, {response.text}\n")
            return None

        video_info = data.get("gidInformation", {}).get("packerData", {}).get("video", {})
        if not video_info:
            return None

        # Resolve the streams and the publish time once; both feed several
        # fields of the result.
        streams = cls.get_video_url(video_info)
        publish_time_stamp = int(video_info.get("video_publish_time", 0))
        user_info = video_info.get("user_info", {})
        return {
            "video_title": video_info.get("title", ""),
            "video_id": video_info.get("videoResource", {}).get("vid", ""),
            "gid": str(item_id),
            "play_cnt": int(video_info.get("video_watch_count", 0)),
            "like_cnt": int(video_info.get("video_like_count", 0)),
            "comment_cnt": int(cls.get_comment_cnt(item_id)),
            "share_cnt": 0,
            "favorite_cnt": 0,
            "duration": int(video_info.get("video_duration", 0)),
            "video_width": int(streams["video_width"]),
            "video_height": int(streams["video_height"]),
            "publish_time_stamp": publish_time_stamp,
            "publish_time_str": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time_stamp)),
            "user_name": user_info.get("name", ""),
            "user_id": str(user_info.get("user_id", "")),
            "avatar_url": str(user_info.get("avatar_url", "")),
            "cover_url": video_info.get("poster_url", ""),
            "audio_url": streams["audio_url"],
            "video_url": streams["video_url"],
            "session": f"xigua-search-{int(time.time())}"
        }

    @classmethod
    def _hits_filter_word(cls, log_type, crawler, env, title):
        """Return True when *title* contains one of the configured filter words.

        Empty filter words never match.
        """
        filter_words = get_config_from_mysql(log_type=log_type, source=crawler, env=env,
                                             text="filter", action="")
        return any(str(word) and str(word) in title for word in filter_words)

    @classmethod
    def get_videoList(cls, log_type, crawler, user_dict, rule_dict, env):
        """Crawl one author's video list and enqueue every eligible video.

        Pages of 30 videos are requested until the API stops returning
        videos, answers with an error, or a video older than the rule's
        ``period.max`` (in days, default 1000) is met. A failure on a single
        video is logged and does not stop the crawl.

        :param log_type: logger channel; also stored as the video strategy.
        :param crawler: crawler name; also stored as the video platform.
        :param user_dict: author record; ``link`` and ``uid`` are read.
        :param rule_dict: crawl rules, serialised into every queued message.
        :param env: environment name, used for the topic and the lookups.
        """
        mq = MQ(topic_name="topic_crawler_etl_" + env)
        signature = cls.random_signature()
        author_id = user_dict["link"].replace("https://www.ixigua.com/home/", "")
        url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
        headers = {
            'referer': f'https://www.ixigua.com/home/{author_id}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
        }
        offset = 0
        while True:
            # msToken / X-Bogus are not sent to this endpoint.
            params = {
                'to_user_id': str(author_id),
                'offset': str(offset),
                'limit': '30',
                'maxBehotTime': '0',
                'order': 'new',
                'isHome': '0',
                '_signature': signature,
            }
            response = cls._session_get(url, headers=headers, params=params)
            offset += 30
            if response.status_code != 200 or 'data' not in response.text:
                cls._log_both(log_type, crawler, env,
                              f"get_videolist_response:{response.text}\n", level="warning")
                return
            payload = response.json()
            # A missing or null ``data`` / ``videoList`` means "no more videos".
            feeds = (payload.get("data") or {}).get("videoList")
            if not feeds:
                cls._log_both(log_type, crawler, env, f"没有更多数据啦~:{payload}\n", level="warning")
                return

            for feed in feeds:
                try:
                    item_id = feed.get("item_id", "")
                    if item_id == "":
                        cls._log_both(log_type, crawler, env, "无效视频\n")
                        continue
                    video_dict = cls.get_video_info(log_type, crawler, item_id)
                    if video_dict is None:
                        cls._log_both(log_type, crawler, env, "无效视频\n")
                        continue
                    for k, v in video_dict.items():
                        Common.logger(log_type, crawler).info(f"{k}:{v}")
                    Common.logging(log_type, crawler, env, f"{video_dict}")

                    max_days = int(rule_dict.get("period", {}).get("max", 1000))
                    age_days = int((int(time.time()) - int(video_dict["publish_time_stamp"])) / (3600 * 24))
                    if age_days > max_days:
                        # The list is requested newest-first, so the whole
                        # crawl for this author stops at the first old video.
                        cls._log_both(log_type, crawler, env, f'发布时间超过{max_days}天\n')
                        return

                    if download_rule(log_type=log_type, crawler=crawler,
                                     video_dict=video_dict, rule_dict=rule_dict) is False:
                        cls._log_both(log_type, crawler, env, "不满足抓取规则\n")
                        continue
                    if cls._hits_filter_word(log_type, crawler, env, video_dict["video_title"]):
                        cls._log_both(log_type, crawler, env, "已中过滤词\n")
                        continue
                    if cls.repeat_video(log_type, crawler, video_dict["video_id"], env) != 0:
                        cls._log_both(log_type, crawler, env, "视频已下载\n")
                        continue

                    video_dict["out_user_id"] = video_dict["user_id"]
                    video_dict["platform"] = crawler
                    video_dict["strategy"] = log_type
                    video_dict["out_video_id"] = video_dict["video_id"]
                    video_dict["width"] = video_dict["video_width"]
                    video_dict["height"] = video_dict["video_height"]
                    video_dict["crawler_rule"] = json.dumps(rule_dict)
                    video_dict["user_id"] = user_dict["uid"]
                    video_dict["publish_time"] = video_dict["publish_time_str"]
                    video_dict["strategy_type"] = log_type
                    mq.send_msg(video_dict)
                except Exception as e:
                    # Best effort: one broken video must not abort the page.
                    cls._log_both(log_type, crawler, env, f"抓取单条视频异常:{e}\n", level="error")

    @classmethod
    def repeat_video(cls, log_type, crawler, video_id, env):
        """Count the stored rows that already reference *video_id*.

        :return: number of ``crawler_video`` rows whose platform is either
            *crawler* or ``cls.platform`` and whose ``out_video_id`` matches.
        """
        # NOTE(review): the values are interpolated straight into the SQL text
        # and ``video_id`` originates from a remote API response; switch to a
        # parameterized query if MysqlHelper offers one.
        sql = f""" select * from crawler_video where platform in ("{crawler}","{cls.platform}") and out_video_id="{video_id}"; """
        repeat_video = MysqlHelper.get_values(log_type, crawler, sql, env)
        return len(repeat_video)

    @classmethod
    def get_author_videos(cls, log_type, crawler, user_list, rule_dict, env):
        """Crawl every author in *user_list*, isolating per-author failures.

        :param user_list: iterable of author records (``nick_name`` is read
            here; the rest is consumed by :meth:`get_videoList`).
        """
        for user_dict in user_list:
            try:
                cls._log_both(log_type, crawler, env, f"开始抓取 {user_dict['nick_name']} 用户主页视频\n")
                cls.get_videoList(log_type=log_type,
                                  crawler=crawler,
                                  user_dict=user_dict,
                                  rule_dict=rule_dict,
                                  env=env)
            except Exception as e:
                # Best effort: keep going with the next author.
                cls._log_both(log_type, crawler, env,
                              f"抓取{user_dict['nick_name']}视频时异常:{e}\n", level="error")


if __name__ == '__main__':
    print(XiguaauthorScheduling.repeat_video("follow", "xigua", "v0201ag10000ce3jcjbc77u8jsplpgrg", "dev"))