12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711 |
- """
- 西瓜视频搜索爬虫
- """
- import os
- import sys
- import json
- import time
- import uuid
- import random
- import base64
- import asyncio
- import aiohttp
- import urllib.parse
- import requests
- from lxml import etree
- sys.path.append(os.getcwd())
- from application.items import VideoItem
- from application.pipeline import PiaoQuanPipeline
- from application.common.messageQueue import MQ
- from application.common.proxies import tunnel_proxies
- from application.common.log import AliyunLogger
# Resource containers inside ``videoResource``, in the order they are probed.
_RESOURCE_PRIORITY = ("dash_120fps", "dash", "normal")

# Keys of ``video_list``, probed from ``video_4`` down to ``video_1``.
_VIDEO_QUALITY_KEYS = ("video_4", "video_3", "video_2", "video_1")


def _decode_base64_url(encoded):
    """Decode a base64-encoded URL, restoring missing ``=`` padding first."""
    # Base64 text must be a multiple of 4 characters long. The previous
    # implementation padded based on ``len % 3``, which is wrong and made
    # unpadded payloads fail with ``binascii.Error``.
    padded = encoded + "=" * (-len(encoded) % 4)
    return base64.b64decode(padded).decode("utf8")


def _select_streams(resource):
    """Return ``(video_entry, audio_entry)`` for one resource, or ``None``.

    Fixed entries in ``video_list`` are preferred (``video_4`` first); such an
    entry is used for both the video and the audio URL.  Otherwise the last
    elements of ``dynamic_video_list`` / ``dynamic_audio_list`` are used, but
    only when both lists are present and non-empty.
    """
    video_list = resource.get("video_list") or {}
    for quality_key in _VIDEO_QUALITY_KEYS:
        if quality_key in video_list:
            entry = video_list[quality_key]
            return entry, entry

    dynamic = resource.get("dynamic_video") or {}
    dynamic_videos = dynamic.get("dynamic_video_list")
    dynamic_audios = dynamic.get("dynamic_audio_list")
    if dynamic_videos and dynamic_audios:
        return dynamic_videos[-1], dynamic_audios[-1]
    return None


def get_video_url(video_info):
    """
    Extract the playable video/audio URLs and the video dimensions.

    :param video_info: dict describing one video; stream data is read from
        its ``videoResource`` entry.
    :return: dict with the keys ``video_url``, ``audio_url``, ``video_width``
        and ``video_height``.  The URLs are ``""`` and the dimensions ``0``
        when no usable stream is found.
    :raises KeyError: if the chosen stream entry lacks ``backup_url_1``,
        ``vwidth`` or ``vheight``.
    :raises binascii.Error: if a ``backup_url_1`` value is not valid base64.
    """
    video_url_dict = {
        "video_url": "",
        "audio_url": "",
        "video_width": 0,
        "video_height": 0,
    }
    resources = video_info.get("videoResource") or {}

    for resource_name in _RESOURCE_PRIORITY:
        if resource_name not in resources:
            continue
        # NOTE(review): only the first resource key that is present is
        # consulted, even when it holds no usable stream. This mirrors the
        # original behaviour; confirm whether falling through to the next
        # resource would be preferable.
        streams = _select_streams(resources[resource_name] or {})
        if streams is not None:
            video_entry, audio_entry = streams
            video_url_dict["video_url"] = _decode_base64_url(
                video_entry["backup_url_1"]
            )
            video_url_dict["audio_url"] = _decode_base64_url(
                audio_entry["backup_url_1"]
            )
            video_url_dict["video_width"] = video_entry["vwidth"]
            video_url_dict["video_height"] = video_entry["vheight"]
        break

    return video_url_dict
class XiGuaSearch(object):
    """
    Keyword search crawler for ixigua.com.

    ``search`` scrapes the search results page for video links and
    ``get_video_info`` fetches each video's metadata, builds a ``VideoItem``
    and prints the produced item as JSON.
    """

    def __init__(self, platform, mode, rule_dict, user_list, env="prod"):
        """
        :param platform: platform identifier, stored on every produced item
        :param mode: crawl strategy identifier, stored on every produced item
        :param rule_dict: stored as-is; not read by the methods of this class
        :param user_list: stored as-is; not read by the methods of this class
        :param env: suffix of the MQ topic name (``topic_crawler_etl_<env>``)
        """
        self.platform = platform
        self.mode = mode
        self.rule_dict = rule_dict
        self.user_list = user_list
        self.env = env
        self.download_cnt = 0
        self.mq = MQ(topic_name="topic_crawler_etl_" + self.env)
        self.expire_flag = False
        self.aliyun_log = AliyunLogger(platform=self.platform, mode=self.mode)

    async def search(self, keyword):
        """
        Search ixigua.com for ``keyword`` and process every video found.

        The result page is fetched with ``requests``; the ``href`` of each
        cover anchor is then handed to ``get_video_info`` and all lookups are
        awaited concurrently.

        :param keyword: search phrase (percent-encoded before use)
        :return: None
        """
        keyword = urllib.parse.quote(keyword)
        base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
            keyword
        )
        # NOTE(review): the cookie below embeds hard-coded session tokens;
        # they will presumably expire and should come from configuration.
        headers = {
            "authority": "www.ixigua.com",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
            "cache-control": "max-age=0",
            "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
            "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"macOS"',
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "none",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        }
        # ``requests`` is blocking; run it in the default executor so this
        # coroutine does not stall the event loop while the page downloads.
        loop = asyncio.get_event_loop()
        basic_response = await loop.run_in_executor(
            None, lambda: requests.get(url=base_url, headers=headers)
        )
        html = etree.HTML(basic_response.text)
        result = html.xpath(
            '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
        )
        print(result)
        async with aiohttp.ClientSession() as session:
            # The slice drops the first character and the last two characters
            # of each href (presumably the leading "/" plus a fixed suffix --
            # verify against the live markup).
            tasks = [self.get_video_info(session, page_id[1:-2]) for page_id in result]
            await asyncio.gather(*tasks)

    async def get_video_info(self, session, page_id):
        """
        Fetch one video's metadata and print the resulting item as JSON.

        :param session: aiohttp client session used for the request
        :param page_id: id of the video page, sent as the ``mixId`` parameter
        :return: None
        :raises KeyError: if the JSON response has no ``data`` entry
        """
        url = "https://www.ixigua.com/api/mixVideo/information?"
        headers = {
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh-Hans;q=0.9",
            "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
            "referer": "https://www.ixigua.com/7102614741050196520?logTag=0531c88ac04f38ab2c62",
        }
        # NOTE(review): tokens, signatures and cookies below are hard-coded.
        params = {
            "mixId": str(page_id),
            "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfC"
            "NVVIOBNjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
            "X-Bogus": "DFSzswVupYTANCJOSBk0P53WxM-r",
            "_signature": "_02B4Z6wo0000119LvEwAAIDCuktNZ0y5wkdfS7jAALThuOR8D9yWNZ.EmWHKV0WSn6Px"
            "fPsH9-BldyxVje0f49ryXgmn7Tzk-swEHNb15TiGqa6YF.cX0jW8Eds1TtJOIZyfc9s5emH7gdWN94",
        }
        cookies = {
            "ixigua-a-s": "1",
            "msToken": "IlG0wd0Pylyw9ghcYiB2YseUmTwrsrqqhXrbIcsSaTcLTJyVlbYJzk20zw3UO-CfrfCNVVIOB"
            "NjIl7vfBoxnVUwO9ZyzAI3umSKsT5-pef_RRfQCJwmA",
            "ttwid": "1%7C_yXQeHWwLZgCsgHClOwTCdYSOt_MjdOkgnPIkpi-Sr8%7C1661241238%7Cf57d0c5ef3f1d7"
            "6e049fccdca1ac54887c34d1f8731c8e51a49780ff0ceab9f8",
            "tt_scid": "QZ4l8KXDG0YAEaMCSbADdcybdKbUfG4BC6S4OBv9lpRS5VyqYLX2bIR8CTeZeGHR9ee3",
            "MONITOR_WEB_ID": "0a49204a-7af5-4e96-95f0-f4bafb7450ad",
            "__ac_nonce": "06304878000964fdad287",
            "__ac_signature": "_02B4Z6wo00f017Rcr3AAAIDCUVxeW1tOKEu0fKvAAI4cvoYzV-wBhq7B6D8k0no7lb"
            "FlvYoinmtK6UXjRIYPXnahUlFTvmWVtb77jsMkKAXzAEsLE56m36RlvL7ky.M3Xn52r9t1IEb7IR3ke8",
            "ttcid": "e56fabf6e85d4adf9e4d91902496a0e882",
            "_tea_utm_cache_1300": "undefined",
            "support_avif": "false",
            "support_webp": "false",
            "xiguavideopcwebid": "7134967546256016900",
            "xiguavideopcwebid.sig": "xxRww5R1VEMJN_dQepHorEu_eAc",
        }
        async with session.get(
            url, headers=headers, params=params, cookies=cookies
        ) as response:
            video_info = await response.json()
            video_info = (
                video_info["data"]
                .get("gidInformation", {})
                .get("packerData", {})
                .get("video", {})
            )

            # Look up values that feed several item fields only once.
            vid = video_info.get("videoResource", {}).get("vid", "")
            publish_time = int(video_info.get("video_publish_time", 0))
            user_info = video_info.get("user_info", {})
            # Decode the stream URLs once instead of once per field.
            url_info = get_video_url(video_info)

            item = VideoItem()
            item.add_video_info("video_title", video_info.get("title", ""))
            item.add_video_info("video_id", vid)
            item.add_video_info("play_cnt", int(video_info.get("video_watch_count", 0)))
            item.add_video_info("like_cnt", int(video_info.get("video_like_count", 0)))
            item.add_video_info("duration", int(video_info.get("video_duration", 0)))
            item.add_video_info("publish_time_stamp", publish_time)
            item.add_video_info(
                "publish_time_str",
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(publish_time)),
            )
            item.add_video_info("user_name", user_info.get("name", ""))
            item.add_video_info("user_id", str(user_info.get("user_id", "")))
            item.add_video_info("avatar_url", str(user_info.get("avatar_url", "")))
            item.add_video_info("cover_url", video_info.get("poster_url", ""))
            item.add_video_info("audio_url", url_info["audio_url"])
            item.add_video_info("video_url", url_info["video_url"])
            item.add_video_info("session", "xigua-search-{}".format(int(time.time())))
            item.add_video_info("out_video_id", vid)
            item.add_video_info("platform", self.platform)
            item.add_video_info("strategy", self.mode)
            mq_obj = item.produce_item()
            print(json.dumps(mq_obj, ensure_ascii=False, indent=4))
if __name__ == "__main__":
    # Manual smoke run: search for one keyword and print the produced items.
    # The constructor arguments are placeholder values.
    S = XiGuaSearch(platform=1, mode=2, rule_dict=3, user_list=1)
    # asyncio.run creates, runs and closes the event loop; calling
    # asyncio.get_event_loop() with no running loop is deprecated.
    asyncio.run(S.search("春节"))
|