# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/2/17
import base64
import json
import os
import sys
import time
import requests
import urllib3
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.publish import Publish

# Passed to requests so that no HTTP/HTTPS proxy is used for the list request.
proxies = {"http": None, "https": None}


class Follow:
    """Crawler helpers for the videos published on an ixigua.com author page."""

    # Paging offset for the author's video list; advanced by 30 on every
    # get_videolist() call.
    offset = 0

    # chromedriver binaries keyed by the ``machine`` argument of get_signature().
    # 'aliyun' / 'aliyun_hk' are absent on purpose: they start Chrome without an
    # explicit Service.
    _CHROMEDRIVER_PATHS = {
        'macpro': '/Users/lieyunye/Downloads/chromedriver_v86/chromedriver',
        'macair': '/Users/piaoquan/Downloads/chromedriver',
    }
    _DEFAULT_CHROMEDRIVER_PATH = '/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'

    @staticmethod
    def download_rule(duration, width, height):
        """Download rule: decide whether a video qualifies for download.

        :param duration: video duration (anything ``int()`` accepts).
        :param width: video width (anything ``int()`` accepts).
        :param height: video height (anything ``int()`` accepts).
        :return: True when ``duration`` is at least 60 and either ``width`` or
            ``height`` is at least 720, otherwise False.
        """
        # Short-circuit order matches the nested checks it replaces: width and
        # height are only converted once the duration check has passed.
        return int(duration) >= 60 and (int(width) >= 720 or int(height) >= 720)

    @classmethod
    def filter_words(cls, log_type, crawler):
        """Filter-word list: read Feishu sheet 'KGB4Hc' and flatten it.

        :param log_type: forwarded to ``Feishu`` and ``Common.logger``.
        :param crawler: forwarded to ``Feishu`` and ``Common.logger``.
        :return: a flat list of every non-None cell, or None when reading the
            sheet raised (the error is logged).
        """
        try:
            filter_words_sheet = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
            return [word for row in filter_words_sheet for word in row if word is not None]
        except Exception as e:
            Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')

    @classmethod
    def get_user_info_from_feishu(cls, log_type, crawler):
        """Read the user table (Feishu sheet '5tlTYB') into a dict.

        Note: some user ids in the sheet are int and some are str, which is why
        both ids are stringified.

        :param log_type: forwarded to ``Feishu`` and ``Common.logger``.
        :param crawler: forwarded to ``Feishu`` and ``Common.logger``.
        :return: ``{user_name: "<out_id>,<our_id>"}`` built from columns 0, 1
            and 3 of every row after the first; rows with a None in any of the
            three columns are skipped. Returns None when an exception occurred
            (the error is logged).
        """
        try:
            user_sheet = Feishu.get_values_batch(log_type, crawler, '5tlTYB')
            user_dict = {}
            # The first row is skipped — presumably a header row; confirm
            # against the sheet.
            for row in user_sheet[1:]:
                user_name, out_id, our_id = row[0], row[1], row[3]
                if user_name is None or out_id is None or our_id is None:
                    continue
                user_dict[user_name] = f'{out_id},{our_id}'
            return user_dict
        except Exception as e:
            Common.logger(log_type, crawler).error(f'get_user_id_from_feishu异常:{e}\n')

    @classmethod
    def get_signature(cls, log_type, crawler, out_uid, machine):
        """Load an author's home page in headless Chrome and extract a signature.

        The signature is whatever follows ``x-signature=`` in the ``data-src``
        attribute of the second matching image on the page.

        :param log_type: forwarded to ``Common.logger``.
        :param crawler: forwarded to ``Common.logger``.
        :param out_uid: author id used to build the home-page URL.
        :param machine: selects the chromedriver binary ('aliyun', 'aliyun_hk',
            'macpro', 'macair'; anything else uses the default path).
        :return: the signature string, or None when anything failed (the error
            is logged).
        """
        driver = None
        try:
            # Enable performance logging. Work on a copy so the shared
            # DesiredCapabilities.CHROME dict is not mutated for other users.
            ca = DesiredCapabilities.CHROME.copy()
            ca["goog:loggingPrefs"] = {"performance": "ALL"}

            # Run without opening a browser window.
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
            chrome_options.add_argument("--no-sandbox")

            # Driver initialisation.
            driver_kwargs = {'desired_capabilities': ca, 'options': chrome_options}
            if machine not in ('aliyun', 'aliyun_hk'):
                driver_path = cls._CHROMEDRIVER_PATHS.get(machine, cls._DEFAULT_CHROMEDRIVER_PATH)
                driver_kwargs['service'] = Service(driver_path)
            driver = webdriver.Chrome(**driver_kwargs)

            driver.implicitly_wait(10)
            driver.get(f'https://www.ixigua.com/home/{out_uid}/')
            time.sleep(3)
            images = driver.find_elements(By.XPATH, '//img[@class="tt-img BU-MagicImage tt-img-loaded"]')
            data_src = images[1].get_attribute("data-src")
            return data_src.split("x-signature=")[-1]
        except Exception as e:
            Common.logger(log_type, crawler).error(f'get_signature异常:{e}\n')
        finally:
            # Always shut the browser down; previously every call leaked a
            # Chrome process.
            if driver is not None:
                driver.quit()

    @staticmethod
    def _decode_b64_url(encoded):
        """Decode a base64 string whose trailing '=' padding may be missing."""
        # Base64 text length must be a multiple of 4; restore what was stripped.
        padded = encoded + '=' * (-len(encoded) % 4)
        return base64.b64decode(padded).decode('utf8')

    @staticmethod
    def _select_streams(video_resource):
        """Return the last (video_entry, audio_entry) of the best non-empty source, or None.

        Sources are tried in this order: ``dash_120fps.video_list``,
        ``dash_120fps.dynamic_video``, ``dash.dynamic_video``,
        ``normal.video_list``. For the ``video_list`` sources the same entry is
        used for both video and audio.
        """
        dash_120fps = video_resource.get('dash_120fps') or {}
        dash_120fps_dynamic = dash_120fps.get('dynamic_video') or {}
        dash_dynamic = (video_resource.get('dash') or {}).get('dynamic_video') or {}
        normal = video_resource.get('normal') or {}
        candidates = (
            (dash_120fps.get('video_list'), dash_120fps.get('video_list')),
            (dash_120fps_dynamic.get('dynamic_video_list'), dash_120fps_dynamic.get('dynamic_audio_list')),
            (dash_dynamic.get('dynamic_video_list'), dash_dynamic.get('dynamic_audio_list')),
            (normal.get('video_list'), normal.get('video_list')),
        )
        for video_entries, audio_entries in candidates:
            if video_entries and audio_entries:
                return video_entries[-1], audio_entries[-1]
        return None

    @classmethod
    def get_video_url(cls, log_type, crawler, gid):
        """Fetch video details: resolve the download URLs and dimensions for ``gid``.

        :param log_type: forwarded to ``Common.logger``.
        :param crawler: forwarded to ``Common.logger``.
        :param gid: sent as the ``mixId`` query parameter.
        :return: a dict with keys ``video_url``, ``audio_url``, ``video_width``
            and ``video_height``. When the response carries no data, or no
            usable stream is found, the URLs are ``''`` and the dimensions 0.
        """
        url = 'https://www.ixigua.com/api/mixVideo/information?'
        headers = {
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh-Hans;q=0.9",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
            "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
        }
        # NOTE(review): the tokens and cookies below are hard-coded captures;
        # they presumably expire — confirm how they are refreshed.
        params = {
            'mixId': gid,
            'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
                       'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
            'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
            '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
                          'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
        }
        cookies = {
            'ixigua-a-s': '1',
            'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
                       'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
            'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
                     '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
            'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
            'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
            '__ac_nonce': '06304878000964fdad287',
            '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
                              'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
            'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
            '_tea_utm_cache_1300': 'undefined',
            'support_avif': 'false',
            'support_webp': 'false',
            'xiguavideopcwebid': '7134967546256016900',
            'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
        }
        urllib3.disable_warnings()
        response = requests.get(url=url, headers=headers, params=params, cookies=cookies, verify=False)
        payload = response.json()

        # Every exit path returns all four keys so the caller can index the
        # result unconditionally.
        video_url_dict = {"video_url": '', "audio_url": '', "video_width": 0, "video_height": 0}

        if 'data' not in payload or payload['data'] == '':
            Common.logger(log_type, crawler).warning('get_video_info: response: {}', response)
            return video_url_dict

        video_info = payload['data']['gidInformation']['packerData']['video']
        if 'videoResource' not in video_info:
            return video_url_dict

        streams = cls._select_streams(video_info['videoResource'])
        if streams is None:
            return video_url_dict

        video_entry, audio_entry = streams
        video_url_dict["video_url"] = cls._decode_b64_url(video_entry['backup_url_1'])
        video_url_dict["audio_url"] = cls._decode_b64_url(audio_entry['backup_url_1'])
        video_url_dict["video_width"] = video_entry['vwidth']
        video_url_dict["video_height"] = video_entry['vheight']
        return video_url_dict

    @classmethod
    def get_videolist(cls, log_type, crawler, out_uid, machine):
        """Fetch one page (30 entries) of an author's videos and print each one.

        Advances ``cls.offset`` by 30 after the request regardless of outcome.
        For every listed video the metadata is collected (missing fields become
        0), the download URLs are resolved through ``get_video_url`` and the
        resulting dict is printed key by key.

        :param log_type: forwarded to ``Common.logger`` and sibling methods.
        :param crawler: forwarded to ``Common.logger`` and sibling methods.
        :param out_uid: author id whose video list is requested.
        :param machine: forwarded to ``get_signature``.
        :return: None.
        """
        signature = cls.get_signature(log_type, crawler, out_uid, machine)
        url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
        params = {
            'to_user_id': str(out_uid),
            'offset': str(cls.offset),
            'limit': '30',
            'maxBehotTime': '0',
            'order': 'new',
            'isHome': '0',
            'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
            'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
            '_signature': signature,
        }
        headers = {
            'authority': 'www.ixigua.com',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'cache-control': 'no-cache',
            'cookie': f'MONITOR_WEB_ID=7168304743566296612; __ac_signature={signature}; ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; msToken=G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==; tt_scid=o4agqz7u9SKPwfBoPt6S82Cw0q.9KDtqmNe0JHxMqmpxNHQWq1BmrQdgVU6jEoX7ed99; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1676618894%7Cee5ad95378275f282f230a7ffa9947ae7eff40d0829c5a2568672a6dc90a1c96; ixigua-a-s=1',
            'pragma': 'no-cache',
            'referer': f'https://www.ixigua.com/home/{out_uid}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
            'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Microsoft Edge";v="110"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
            'x-secsdk-csrf-token': '00010000000119e3f9454d1dcbb288704cda1960f241e2d19bd21f2fd283520c3615a990ac5a17448bfbb902a249'
        }
        urllib3.disable_warnings()
        response = requests.get(url=url, headers=headers, params=params, proxies=proxies, verify=False)
        cls.offset += 30

        if response.status_code != 200:
            Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
            return
        if 'data' not in response.text:
            Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
            return
        payload = response.json()  # parse the body once and reuse it
        if 'videoList' not in payload["data"]:
            Common.logger(log_type, crawler).warning(f"get_videolist_response:{payload}\n")
            return

        for video in payload['data']['videoList']:
            detail = video.get('video_detail_info', {})
            user_info = video.get('user_info', {})

            # video_title (the former .replace('\/', '') was dropped: it could
            # never match once every '/' had already been removed)
            if 'title' in video:
                video_title = video['title'].strip().replace('手游', '') \
                    .replace('/', '').replace('\n', '')
            else:
                video_title = 0

            video_id = video.get('video_id', 0)
            gid = video.get('gid', 0)
            play_cnt = detail.get('video_watch_count', 0)
            comment_cnt = video.get('comment_count', 0)
            like_cnt = video.get('digg_count', 0)
            share_cnt = 0  # never read from the list response
            video_duration = video.get('video_duration', 0)
            publish_time = video.get('publish_time', 0)  # send time
            is_top = video.get('is_top', 0)
            user_name = user_info.get('name', 0)
            user_id = user_info.get('user_id', 0)
            avatar_url = user_info.get('avatar_url', 0)

            # cover_url: prefer the direct 'url', else the first 'url_list' entry
            if 'detail_video_large_image' not in detail:
                cover_url = 0
            else:
                large_image = detail['detail_video_large_image']
                if 'url' in large_image:
                    cover_url = large_image['url']
                else:
                    try:
                        cover_url = large_image['url_list'][0]['url']
                    except (KeyError, IndexError):
                        # No usable cover; fall back like every other missing field.
                        cover_url = 0

            Common.logger(log_type, crawler).info(
                f'send_time:{time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(publish_time))}')

            video_url_dict = cls.get_video_url(log_type, crawler, gid)
            video_url = video_url_dict["video_url"]
            audio_url = video_url_dict["audio_url"]
            video_width = video_url_dict["video_width"]
            video_height = video_url_dict["video_height"]

            video_dict = {'video_title': video_title,
                          'video_id': video_id,
                          'gid': gid,
                          'play_cnt': play_cnt,
                          'comment_cnt': comment_cnt,
                          'like_cnt': like_cnt,
                          'share_cnt': share_cnt,
                          'video_width': video_width,
                          'video_height': video_height,
                          'video_duration': video_duration,
                          'publish_time': publish_time,
                          'is_top': is_top,
                          'user_name': user_name,
                          'user_id': user_id,
                          'avatar_url': avatar_url,
                          'cover_url': cover_url,
                          'audio_url': audio_url,
                          'video_url': video_url}
            for k, v in video_dict.items():
                print(f"{k}:{v}")
            print("\n")


if __name__ == '__main__':
    # print(Follow.get_signature("follow", "xigua", "95420624045", "local"))
    Follow.get_videolist("follow", "xigua", "95420624045", "local")