# Server / piaoquan_crawler
# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/2/17
import base64
import json
import os
import sys
import time

import requests
import urllib3
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from seleniumwire import webdriver

# Put the current working directory on the import path before the project-local
# imports below (presumably the script is launched from the project root so that
# the ``common`` package resolves -- confirm against the deployment scripts).
sys.path.append(os.getcwd())
from common.common import Common
from common.feishu import Feishu
from common.publish import Publish
# Proxy mapping passed to requests.get() in Follow.get_videolist; both schemes are
# set to None (presumably to bypass any proxy configured in the environment).
proxies = {"http": None, "https": None}


class Follow:
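    # Paging offset for an author's home-page video list; get_videolist sends it as
    # the ``offset`` request parameter and advances it by 30 after each request.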
    offset = 0

    # 下载规则
    @staticmethod
    def download_rule(duration, width, height):
        if int(duration) >= 60:
            if int(width) >= 720 or int(height) >= 720:
                return True
            else:
                return False
        else:
            return False

    # Filter-word list
    @classmethod
    def filter_words(cls, log_type, crawler):
        """Load the filter words from the Feishu sheet ``KGB4Hc``.

        Every cell of every row that is not ``None`` is collected, row by row.

        :param log_type: forwarded to ``Feishu.get_values_batch`` and ``Common.logger``.
        :param crawler: forwarded to ``Feishu.get_values_batch`` and ``Common.logger``.
        :return: flat list of the non-``None`` cell values; ``None`` (implicitly)
            when an exception was caught and logged.
        """
        try:
            sheet_rows = Feishu.get_values_batch(log_type, crawler, 'KGB4Hc')
            return [cell for row in sheet_rows for cell in row if cell is not None]
        except Exception as e:
            Common.logger(log_type, crawler).error(f'filter_words异常:{e}\n')

    # Fetch user info as a dict. Note: user_id cells may be either int or str.
    @classmethod
    def get_user_info_from_feishu(cls, log_type, crawler):
        """Build a ``{user_name: "out_id,our_id"}`` mapping from the Feishu sheet ``5tlTYB``.

        The first row is skipped (presumably a header row). For every other row,
        column 0 is read as the user name, column 1 as ``out_id`` and column 3 as
        ``our_id``; rows where any of the three is ``None`` are ignored. Both ids
        are passed through ``str()`` because the sheet mixes int and str cells.

        :param log_type: forwarded to ``Feishu.get_values_batch`` and ``Common.logger``.
        :param crawler: forwarded to ``Feishu.get_values_batch`` and ``Common.logger``.
        :return: the mapping described above; ``None`` (implicitly) when an
            exception was caught and logged.
        """
        try:
            user_sheet = Feishu.get_values_batch(log_type, crawler, '5tlTYB')
            user_dict = {}
            for row in user_sheet[1:]:
                user_name, out_id, our_id = row[0], row[1], row[3]
                if user_name is None or out_id is None or our_id is None:
                    continue
                user_dict[user_name] = str(out_id) + ',' + str(our_id)
            return user_dict
        except Exception as e:
            # The log label now matches this method's real name so the failure
            # can be traced back to it.
            Common.logger(log_type, crawler).error(f'get_user_info_from_feishu异常:{e}\n')

    @classmethod
    def get_signature(cls, log_type, crawler, out_uid, machine):
        """Scrape the ``x-signature`` value from a Xigua author's home page.

        Starts a headless Chrome, opens ``https://www.ixigua.com/home/{out_uid}/``
        and reads the ``data-src`` attribute of the second ``<img>`` whose class is
        ``tt-img BU-MagicImage tt-img-loaded``. The result is the text after the
        last ``x-signature=`` in that attribute (the whole attribute when the
        marker is absent).

        :param log_type: forwarded to ``Common.logger``.
        :param crawler: forwarded to ``Common.logger``.
        :param out_uid: Xigua author id interpolated into the home-page URL.
        :param machine: selects the chromedriver binary. ``'aliyun'`` and
            ``'aliyun_hk'`` let Selenium locate it, ``'macpro'`` and ``'macair'``
            use fixed paths, and any other value uses the default local path.
        :return: the signature string, or ``None`` when any step failed (the
            error is logged, never raised).
        """
        chromedriver_paths = {
            'macpro': '/Users/lieyunye/Downloads/chromedriver_v86/chromedriver',
            'macair': '/Users/piaoquan/Downloads/chromedriver',
        }
        default_chromedriver = '/Users/wangkun/Downloads/chromedriver/chromedriver_v110/chromedriver'

        driver = None
        try:
            # Request performance logging. Work on a copy: DesiredCapabilities.CHROME
            # is a shared class-level dict and must not be mutated in place.
            ca = DesiredCapabilities.CHROME.copy()
            ca["goog:loggingPrefs"] = {"performance": "ALL"}

            # Run without opening a browser window.
            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36')
            chrome_options.add_argument("--no-sandbox")

            # Driver initialisation.
            if machine == 'aliyun' or machine == 'aliyun_hk':
                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options)
            else:
                driver_path = chromedriver_paths.get(machine, default_chromedriver)
                driver = webdriver.Chrome(desired_capabilities=ca, options=chrome_options,
                                          service=Service(driver_path))

            driver.implicitly_wait(10)
            driver.get(f'https://www.ixigua.com/home/{out_uid}/')
            time.sleep(3)
            images = driver.find_elements(By.XPATH, '//img[@class="tt-img BU-MagicImage tt-img-loaded"]')
            data_src = images[1].get_attribute("data-src")
            return data_src.split("x-signature=")[-1]
        except Exception as e:
            Common.logger(log_type, crawler).error(f'get_signature异常:{e}\n')
            return None
        finally:
            # Always shut the browser down; otherwise every call leaks a
            # Chrome/chromedriver process.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    Common.logger(log_type, crawler).warning(f'get_signature driver.quit异常:{e}\n')

    # Fetch video details
    @classmethod
    def get_video_url(cls, log_type, crawler, gid):
        """Resolve the playable video/audio URLs and the dimensions of one video.

        Calls ``https://www.ixigua.com/api/mixVideo/information`` for ``gid`` and
        inspects ``data.gidInformation.packerData.video.videoResource``. Resources
        are tried in the order ``dash_120fps``, ``dash``, ``normal`` and the last
        (``[-1]``) entry of the first usable stream list wins. The
        ``backup_url_1`` values are Base64 text and are decoded to UTF-8 URLs.

        Exceptions from the HTTP request, JSON parsing or unexpected payload
        shapes are not caught here and propagate to the caller.

        :param log_type: forwarded to ``Common.logger``.
        :param crawler: forwarded to ``Common.logger``.
        :param gid: video id sent as the ``mixId`` request parameter.
        :return: dict with the keys ``video_url``, ``audio_url``, ``video_width``
            and ``video_height``. When no usable stream exists the URLs are ``''``
            and the dimensions ``0``. Returns ``None`` (after logging a warning)
            when the response has no ``data`` payload.
        """

        def decode_url(encoded):
            """Base64-decode ``encoded`` after restoring stripped '=' padding."""
            # Base64 text must be a multiple of 4 characters long.
            padded = encoded + '=' * (-len(encoded) % 4)
            return base64.b64decode(padded).decode('utf8')

        def pick_streams(resource, layouts):
            """Return ``(video_item, audio_item)`` for one resource, or ``None``."""
            if not isinstance(resource, dict):
                return None
            for layout in layouts:
                if layout == 'video_list':
                    items = resource.get('video_list')
                    if items:
                        # video_list entries supply both URLs from the same item.
                        return items[-1], items[-1]
                else:
                    dynamic = resource.get('dynamic_video') or {}
                    video_items = dynamic.get('dynamic_video_list')
                    audio_items = dynamic.get('dynamic_audio_list')
                    if video_items and audio_items:
                        return video_items[-1], audio_items[-1]
            return None

        # Resource name -> stream layouts to try, in priority order.
        resource_priority = (
            ('dash_120fps', ('video_list', 'dynamic_video')),
            ('dash', ('dynamic_video',)),
            ('normal', ('video_list',)),
        )

        url = 'https://www.ixigua.com/api/mixVideo/information?'
        headers = {
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh-Hans;q=0.9",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15",
            "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
        }
        params = {
            'mixId': gid,
            'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC'
                       'NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
            'X-Bogus': 'DFSzswVupYTANCJOSBk0P53WxM-r',
            '_signature': '_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px'
                          'fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94',
        }
        cookies = {
            'ixigua-a-s': '1',
            'msToken': 'IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB'
                       'NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA',
            'ttwid': '1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7'
                     '6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8',
            'tt_scid': 'QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3',
            'MONITOR_WEB_ID': '0a49204a-7af5-4e96-95f0-f4bafb7450ad',
            '__ac_nonce': '06304878000964fdad287',
            '__ac_signature': '_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb'
                              'FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8',
            'ttcid': 'e56fabf6e85d4adf9e4d91902496a0e882',
            '_tea_utm_cache_1300': 'undefined',
            'support_avif': 'false',
            'support_webp': 'false',
            'xiguavideopcwebid': '7134967546256016900',
            'xiguavideopcwebid.sig': 'xxRww5R1VEMJN_dQepHorEu_eAc',
        }
        urllib3.disable_warnings()
        response = requests.get(url=url, headers=headers, params=params, cookies=cookies, verify=False)
        payload = response.json()  # parse the body once instead of on every access
        if 'data' not in payload or payload['data'] == '':
            # f-string, like the rest of the file, so the message does not depend
            # on the logger's own placeholder style.
            Common.logger(log_type, crawler).warning(f'get_video_info: response: {response}')
            return None

        video_info = payload['data']['gidInformation']['packerData']['video']
        # Defaults returned whenever no usable stream is found, so callers can
        # always index the four keys.
        video_url_dict = {"video_url": '', "audio_url": '', "video_width": 0, "video_height": 0}

        resources = video_info.get('videoResource') or {}
        for resource_name, layouts in resource_priority:
            streams = pick_streams(resources.get(resource_name), layouts)
            if streams is None:
                continue
            video_item, audio_item = streams
            # Video and audio URLs are padded and decoded independently.
            video_url_dict["video_url"] = decode_url(video_item['backup_url_1'])
            video_url_dict["audio_url"] = decode_url(audio_item['backup_url_1'])
            video_url_dict["video_width"] = video_item['vwidth']
            video_url_dict["video_height"] = video_item['vheight']
            break

        return video_url_dict

    @classmethod
    def get_videolist(cls, log_type, crawler, out_uid, machine):
        """Fetch one page of an author's videos and print the details of each.

        Obtains a signature through ``get_signature`` and requests up to 30 videos
        from ``/api/videov2/author/new_video_list`` starting at ``cls.offset``.
        For every returned video it resolves the stream URLs through
        ``get_video_url``, assembles a ``video_dict`` and prints it as
        ``key:value`` lines. Fields missing from the response default to ``0``;
        stream URLs that ``get_video_url`` could not provide default to ``''``.

        ``cls.offset`` is advanced by 30 after every request, including requests
        whose response is rejected.

        :param log_type: forwarded to ``Common.logger`` and the helper methods.
        :param crawler: forwarded to ``Common.logger`` and the helper methods.
        :param out_uid: Xigua author id.
        :param machine: forwarded to ``get_signature`` to choose the chromedriver.
        :return: ``None``; unusable responses are logged as warnings.
        """
        signature = cls.get_signature(log_type, crawler, out_uid, machine)
        url = "https://www.ixigua.com/api/videov2/author/new_video_list?"
        params = {
            'to_user_id': str(out_uid),
            'offset': str(cls.offset),
            'limit': '30',
            'maxBehotTime': '0',
            'order': 'new',
            'isHome': '0',
            'msToken': 'G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==',
            'X-Bogus': 'DFSzswVuEkUANjW9ShFTgR/F6qHt',
            '_signature': signature,
        }
        headers = {
            'authority': 'www.ixigua.com',
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'cache-control': 'no-cache',
            'cookie': f'MONITOR_WEB_ID=7168304743566296612; __ac_signature={signature}; ixigua-a-s=1; support_webp=true; support_avif=false; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; msToken=G0eRzNkw189a8TLaXjc6nTHVMQwh9XcxVAqTbGKi7iPJdQcLwS3-XRrJ3MZ7QBfqErpxp3EX1WtvWOIcZ3NIgr41hgcd-v64so_RRj3YCRw1UsKW8mIssNLlIMspsg==; tt_scid=o4agqz7u9SKPwfBoPt6S82Cw0q.9KDtqmNe0JHxMqmpxNHQWq1BmrQdgVU6jEoX7ed99; ttwid=1%7CHHtv2QqpSGuSu8r-zXF1QoWsvjmNi1SJrqOrZzg-UCY%7C1676618894%7Cee5ad95378275f282f230a7ffa9947ae7eff40d0829c5a2568672a6dc90a1c96; ixigua-a-s=1',
            'pragma': 'no-cache',
            'referer': f'https://www.ixigua.com/home/{out_uid}/video/?preActiveKey=hotsoon&list_entrance=userdetail',
            'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Microsoft Edge";v="110"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41',
            'x-secsdk-csrf-token': '00010000000119e3f9454d1dcbb288704cda1960f241e2d19bd21f2fd283520c3615a990ac5a17448bfbb902a249'
        }
        urllib3.disable_warnings()
        response = requests.get(url=url, headers=headers, params=params, proxies=proxies, verify=False)
        cls.offset += 30

        if response.status_code != 200 or 'data' not in response.text:
            Common.logger(log_type, crawler).warning(f"get_videolist_response:{response.text}\n")
            return

        payload = response.json()  # parse the body once
        data = payload.get("data") if isinstance(payload, dict) else None
        if not isinstance(data, dict) or 'videoList' not in data:
            Common.logger(log_type, crawler).warning(f"get_videolist_response:{payload}\n")
            return

        for video in data['videoList'] or []:
            detail_info = video.get('video_detail_info') or {}
            user_info = video.get('user_info') or {}

            # video_title: drop the '手游' tag, slashes and newlines. Once every
            # '/' is gone no backslash-slash pair can remain, so the original's
            # extra '\/' replacement (an invalid escape sequence) was a no-op.
            if 'title' in video:
                video_title = video['title'].strip().replace('手游', '') \
                    .replace('/', '').replace('\n', '')
            else:
                video_title = 0

            video_id = video.get('video_id', 0)
            gid = video.get('gid', 0)
            play_cnt = detail_info.get('video_watch_count', 0)
            comment_cnt = video.get('comment_count', 0)
            like_cnt = video.get('digg_count', 0)
            share_cnt = 0  # never read from the response
            video_duration = video.get('video_duration', 0)
            publish_time = video.get('publish_time', 0)
            is_top = video.get('is_top', 0)
            user_name = user_info.get('name', 0)
            user_id = user_info.get('user_id', 0)
            avatar_url = user_info.get('avatar_url', 0)

            # cover_url: prefer the direct 'url', else the first 'url_list' entry.
            large_image = detail_info.get('detail_video_large_image')
            if not large_image:
                cover_url = 0
            elif 'url' in large_image:
                cover_url = large_image['url']
            else:
                url_list = large_image.get('url_list') or []
                cover_url = url_list[0].get('url', 0) if url_list else 0

            Common.logger(log_type, crawler).info(
                f'send_time:{time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(publish_time))}')

            # get_video_url may return None or a dict without every key; fall
            # back to empty values instead of raising.
            video_url_dict = cls.get_video_url(log_type, crawler, gid) or {}
            video_url = video_url_dict.get("video_url", '')
            audio_url = video_url_dict.get("audio_url", '')
            video_width = video_url_dict.get("video_width", 0)
            video_height = video_url_dict.get("video_height", 0)

            video_dict = {'video_title': video_title,
                          'video_id': video_id,
                          'gid': gid,
                          'play_cnt': play_cnt,
                          'comment_cnt': comment_cnt,
                          'like_cnt': like_cnt,
                          'share_cnt': share_cnt,
                          'video_width': video_width,
                          'video_height': video_height,
                          'video_duration': video_duration,
                          'publish_time': publish_time,
                          'is_top': is_top,
                          'user_name': user_name,
                          'user_id': user_id,
                          'avatar_url': avatar_url,
                          'cover_url': cover_url,
                          'audio_url': audio_url,
                          'video_url': video_url}
            for k, v in video_dict.items():
                print(f"{k}:{v}")
            print("\n")


if __name__ == '__main__':
    # Manual run: list one page of videos for a single Xigua author. "local" is not
    # one of the named machines, so get_signature uses its default chromedriver path.
    Follow.get_videolist(log_type="follow", crawler="xigua", out_uid="95420624045", machine="local")