Server
/
title_with_video


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
							"""
@author: luojunhui
西瓜视频搜索爬虫
"""
import re
import json
import time
import random
import base64
import urllib.parse

import requests
from lxml import etree
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
from fake_useragent import FakeUserAgent


def byte_dance_cookie(item_id, timeout=5):
    """
    Register an anonymous visitor with ByteDance and return the cookie it issues.

    The caller in this module sends the returned value as the ``ttwid`` cookie
    when requesting a Xigua video page.

    NOTE: an identical ``byte_dance_cookie`` is defined a second time further
    down in this module; that later definition is the one bound at import time.

    :param item_id: id interpolated into the ``referer`` header
    :param timeout: seconds to wait on each HTTP request before giving up
    :return: value of the first cookie set by the registration response
    :raises IndexError: if the registration response sets no cookie
    :raises requests.RequestException: on network failure or timeout
    """
    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as sess:
        sess.headers.update({
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
        })

        # Fetch cookies: hit the SDK script first, then register for a ttwid.
        # Without an explicit timeout requests would wait forever on a stalled server.
        sess.get(
            'https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc',
            timeout=timeout,
        )
        data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
        r = sess.post(
            'https://ttwid.bytedance.com/ttwid/union/register/',
            data=data,
            timeout=timeout,
        )
        return r.cookies.values()[0]


def aes_decrypt(data: str, key: str) -> str:
    """
    Decrypt a Base64-wrapped AES-CBC payload.

    The encoded key is used as the cipher key and its first 16 bytes as the
    IV. The unpadded plaintext is itself Base64 and is decoded once more
    before being returned as text.

    :param data: Base64 text of the ciphertext
    :param key: cipher key as text
    :return: the decoded plaintext; note that, despite the ``-> str``
        annotation, ``None`` is returned (after printing the error) when any
        decoding or decryption step fails
    """
    key_bytes = key.encode()
    init_vector = key_bytes[:16]
    try:
        ciphertext = base64.b64decode(data.encode())
        decryptor = AES.new(key_bytes, AES.MODE_CBC, init_vector)
        padded = decryptor.decrypt(ciphertext)
        inner_b64 = unpad(padded, AES.block_size)
        plaintext = base64.b64decode(inner_b64).decode()
    except Exception as e:
        print("Incorrect decryption {}".format(e))
        return None
    return plaintext


def _undefined_to_null(raw):
    """Replace bare JavaScript ``undefined`` tokens with ``null``, leaving string literals untouched."""
    # The first alternative consumes whole double-quoted literals (escapes
    # included) so that an ``undefined`` inside a string is never rewritten.
    token = re.compile(r'"(?:\\.|[^"\\])*"|\bundefined\b')
    return token.sub(
        lambda m: 'null' if m.group(0) == 'undefined' else m.group(0),
        raw,
    )


def extract_video_url(text):
    """
    Extract and decrypt the playable video URL from a Xigua video page.

    The page embeds its state as an object literal inside the
    ``SSR_HYDRATED_DATA`` script tag. Only bare ``undefined`` tokens are
    rewritten before parsing: ``true``/``false``/``null`` are already valid
    JSON, and rewriting text inside string values could corrupt the Base64
    ``main_url``/``ptk`` fields that are decrypted afterwards.

    :param text: HTML source of the video page
    :return: result of ``aes_decrypt`` on the selected ``main_url``
    :raises IndexError: if the ``SSR_HYDRATED_DATA`` script tag is missing
    :raises KeyError: if the expected keys are absent from the embedded data
    :raises ValueError: if the embedded data cannot be parsed as JSON
    """
    page = etree.HTML(text)
    script_text = page.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
    # Keep only the outermost {...}; the script wraps it in other text.
    payload = script_text[script_text.find('{'):script_text.rfind('}') + 1]
    hydrated = json.loads(_undefined_to_null(payload))
    resource = hydrated["anyVideo"]["gidInformation"]["packerData"]["video"]["videoResource"]

    dash = resource.get('dash')
    if isinstance(dash, dict):
        ptk = dash["ptk"]
        encrypted_url = dash['dynamic_video']['main_url']
    else:
        # No usable dash entry (null/undefined/boolean): fall back to the
        # "normal" resource and its video_3 rendition.
        normal = resource['normal']
        ptk = normal['ptk']
        encrypted_url = normal['video_list']['video_3']['main_url']
    return aes_decrypt(data=encrypted_url, key=ptk)


def extract_info_by_re(text):
    """
    Pull video metadata out of a Xigua video page with regular expressions.

    :param text: HTML source of the video page
    :return: dict with keys ``title``, ``url``, ``video_id``, ``like_count``,
        ``cover_url``, ``play_count``, ``publish_time`` and ``duration``.
        ``url`` comes from ``extract_video_url``; ``title`` is ``""`` when no
        ``<title>`` tag is found; the remaining values are the raw matched
        strings.
    :raises AttributeError: if any field other than the title is not found
    """
    # Title: text of <title>, cut at the first " - " separator.
    title_content = ""
    title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
    if title_match:
        title_content = title_match.group(1).split(" - ")[0]
        try:
            # Repair a title whose UTF-8 bytes were decoded as Latin-1.
            title_content = bytes(title_content, "latin1").decode()
        except (UnicodeEncodeError, UnicodeDecodeError):
            # The text was not Latin-1 mojibake (e.g. it was already decoded
            # correctly), so keep the title as matched instead of raising.
            pass

    # video_id
    video_id = re.search(r'"vid":"(.*?)"', text).group(1)

    # like_count
    like_count = re.search(r'"video_like_count":(.*?),', text).group(1)

    # cover_url
    # NOTE(review): this reads the first "avatar_url" field — confirm that is
    # the intended source for the cover image.
    cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)

    # play count
    video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)

    # publish time
    publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)

    # duration: everything up to the next double quote, with commas removed
    duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")

    return {
        "title": title_content,
        "url": extract_video_url(text),
        "video_id": video_id,
        "like_count": like_count,
        "cover_url": cover_url,
        "play_count": video_watch_count,
        "publish_time": publish_time,
        "duration": duration,
    }


def byte_dance_cookie(item_id, timeout=5):
    """
    Register an anonymous visitor with ByteDance and return the cookie it issues.

    The caller in this module sends the returned value as the ``ttwid`` cookie
    when requesting a Xigua video page.

    NOTE: this redefines the identical ``byte_dance_cookie`` that appears
    earlier in this module; being the later definition, it is the one bound
    at import time.

    :param item_id: id interpolated into the ``referer`` header
    :param timeout: seconds to wait on each HTTP request before giving up
    :return: value of the first cookie set by the registration response
    :raises IndexError: if the registration response sets no cookie
    :raises requests.RequestException: on network failure or timeout
    """
    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as sess:
        sess.headers.update({
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
        })

        # Fetch cookies: hit the SDK script first, then register for a ttwid.
        # Without an explicit timeout requests would wait forever on a stalled server.
        sess.get(
            'https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc',
            timeout=timeout,
        )
        data = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
        r = sess.post(
            'https://ttwid.bytedance.com/ttwid/union/register/',
            data=data,
            timeout=timeout,
        )
        return r.cookies.values()[0]


def get_video_info(item_id):
    """
    Download one Xigua video page and flatten its metadata into a dict.

    A fresh ``ttwid`` cookie and a random user agent are used for the request;
    after the request the function sleeps 1-5 seconds (random) before parsing.

    :param item_id: id of the video page, appended to ``https://www.ixigua.com/``
    :return: dict of video fields; counters not extracted from the page
        (comments, shares, favorites, width, height) are fixed at 0.
        ``avatar_url`` is read from a ``user_info`` entry, which the
        ``extract_info_by_re`` result in this file does not contain, so it
        resolves to ``""``.
    """
    page_url = "https://www.ixigua.com/{}".format(item_id)
    ttwid = byte_dance_cookie(item_id)
    user_agent = FakeUserAgent().random
    request_headers = {
        "accept-encoding": "gzip, deflate",
        "accept-language": "zh-CN,zh-Hans;q=0.9",
        "cookie": "ttwid={}".format(ttwid),
        "user-agent": user_agent,
        "referer": "https://www.ixigua.com/{}/".format(item_id),
    }
    page = requests.get(url=page_url, headers=request_headers, timeout=5)
    # Random pause between page fetches.
    time.sleep(random.randint(1, 5))
    info = extract_info_by_re(page.text)

    # Convert the scraped strings in the same order the fields are emitted.
    title = info.get("title", "")
    video_id = info.get("video_id")
    gid = str(item_id)
    play_cnt = int(info.get("play_count", 0))
    like_cnt = int(info.get("like_count", 0))
    duration = int(info.get("duration", 0))
    publish_ts = int(info.get("publish_time", 0))
    publish_str = time.strftime(
        "%Y-%m-%d %H:%M:%S", time.localtime(int(info.get("publish_time", 0)))
    )
    avatar_url = str(info.get("user_info", {}).get("avatar_url", ""))
    # The page escapes "/" as \u002F inside its embedded data.
    cover_url = info.get("cover_url", "").replace("\\u002F", "/")
    video_url = info.get("url")

    return {
        "video_title": title,
        "video_id": video_id,
        "gid": gid,
        "play_cnt": play_cnt,
        "like_cnt": like_cnt,
        "comment_cnt": 0,
        "share_cnt": 0,
        "favorite_cnt": 0,
        "duration": duration,
        "video_width": 0,
        "video_height": 0,
        "publish_time_stamp": publish_ts,
        "publish_time_str": publish_str,
        "avatar_url": avatar_url,
        "cover_url": cover_url,
        "video_url": video_url,
        "session": f"xigua-author-{int(time.time())}",
    }


def xigua_search(keyword, max_results=5):
    """
    Search Xigua for a keyword and resolve the first hits to video details.

    Each hit is resolved independently on a best-effort basis: a hit whose
    details cannot be fetched or parsed is reported on stdout and skipped.

    :param keyword: search phrase (URL-quoted before use)
    :param max_results: how many of the leading search hits to resolve
    :return: list of ``["xigua", title, video_url, page_url]`` entries
    :raises requests.RequestException: if the search request itself fails
    """
    keyword = urllib.parse.quote(keyword)
    base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
        keyword
    )
    # NOTE(review): the cookie below embeds fixed session tokens (ttwid,
    # msToken, csrf_session_id); confirm they are meant to live in source.
    headers = {
        "authority": "www.ixigua.com",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
        "cache-control": "max-age=0",
        "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
        "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    }
    # Same timeout as the per-video request so a stalled server cannot hang the search.
    basic_response = requests.get(url=base_url, headers=headers, timeout=5)
    html = etree.HTML(basic_response.text)
    result = html.xpath(
        '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
    )
    res_list = []
    for page_id in result[:max_results]:
        # href looks like "/<id>?<query>": drop the leading slash and the query.
        doc_id = page_id[1:].split("?")[0]
        try:
            res = get_video_info(doc_id)
            temp = ["xigua", res['video_title'], res['video_url'], "https://www.ixigua.com/{}".format(doc_id)]
        except Exception as e:
            # Best effort per hit, but do not swallow KeyboardInterrupt/SystemExit
            # and do leave a trace of what was skipped.
            print("xigua_search: skipping {} ({})".format(doc_id, e))
            continue
        res_list.append(temp)
    return res_list