"""
@author: luojunhui
西瓜视频搜索爬虫
"""
import re
import json
import time
import random
import base64
import urllib.parse
import requests
from lxml import etree
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad
from fake_useragent import FakeUserAgent
def byte_dance_cookie(item_id):
    """
    Register with the ByteDance ttwid endpoint and return the issued cookie value.

    A short-lived session requests the slardar SDK script and then posts a
    registration payload to ``ttwid.bytedance.com``; the value of the first
    cookie set on that response is returned.

    NOTE(review): this module contains a second definition of
    ``byte_dance_cookie`` further down; the later one is what callers get.

    :param item_id: id interpolated into the ``referer`` header.
    :return: value of the first cookie on the register response.
    :raises IndexError: if the register response set no cookie.
    :raises requests.RequestException: on network failure or timeout.
    """
    register_payload = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as sess:
        sess.headers.update({
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
        })
        # Request issued before registering; presumably primes the session
        # cookies (the response itself is not used).
        # Timeouts keep a stalled connection from blocking the crawler forever.
        sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc', timeout=5)
        resp = sess.post(
            'https://ttwid.bytedance.com/ttwid/union/register/',
            data=register_payload,
            timeout=5,
        )
    cookie_values = resp.cookies.values()
    if not cookie_values:
        # Same exception type as indexing an empty list, but with a clear cause.
        raise IndexError("ttwid register response did not set any cookie")
    return cookie_values[0]
def aes_decrypt(data: str, key: str) -> str:
    """
    Decrypt a base64-encoded AES-CBC payload from XiGua.

    The encoded key is used as the AES key and its first 16 bytes as the IV.
    After decryption and unpadding, the plaintext is base64-decoded once more
    and returned as text.

    :param data: base64 text of the ciphertext.
    :param key: AES key as text.
    :return: the decoded plaintext, or ``None`` when any step fails
        (the error is printed).
    """
    key_bytes = key.encode()
    init_vector = key_bytes[:16]
    try:
        ciphertext = base64.b64decode(data.encode())
        decryptor = AES.new(key_bytes, AES.MODE_CBC, init_vector)
        padded_plain = decryptor.decrypt(ciphertext)
        inner_b64 = unpad(padded_plain, AES.block_size)
        plain_text = base64.b64decode(inner_b64).decode()
    except Exception as e:
        # Best-effort: report the problem and signal failure with None.
        print("Incorrect decryption {}".format(e))
        return None
    else:
        return plain_text
def extract_video_url(text):
    """
    Extract and decrypt the video URL embedded in a XiGua video page.

    The page's ``SSR_HYDRATED_DATA`` script holds a JavaScript object literal.
    It is made JSON-parseable by turning bare ``undefined`` tokens into
    ``null``; string literals are left untouched so that encrypted URLs and
    keys are never altered. The encrypted ``main_url`` is then taken from the
    ``dash`` resource when present, otherwise from ``normal`` (``video_3``),
    and decrypted with the accompanying ``ptk`` key.

    :param text: HTML of the video page.
    :return: the decrypted video URL, or ``None`` if decryption fails.
    :raises IndexError: if the ``SSR_HYDRATED_DATA`` script tag is absent.
    :raises KeyError: if the expected keys are missing from the payload.
    :raises json.JSONDecodeError: if the payload still is not valid JSON.
    """
    page = etree.HTML(text)
    script_text = page.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
    # Keep only the outermost {...} span, dropping the JS assignment around it.
    raw_json = script_text[script_text.find('{'):script_text.rfind('}') + 1]

    def _undefined_to_null(match):
        # String literals are matched first and returned verbatim; anything
        # else this pattern matches is a bare `undefined` token.
        token = match.group(0)
        return token if token.startswith('"') else 'null'

    valid_json = re.sub(
        r'"[^"\\]*(?:\\.[^"\\]*)*"|\bundefined\b',
        _undefined_to_null,
        raw_json,
    )
    resource = json.loads(valid_json)["anyVideo"]["gidInformation"]["packerData"]["video"]["videoResource"]

    dash = resource.get('dash')
    if isinstance(dash, dict) and dash:
        ptk = dash["ptk"]
        encrypted_url = dash['dynamic_video']['main_url']
    else:
        # No usable dash resource (null / undefined / missing): use the
        # progressive stream instead.
        normal = resource['normal']
        ptk = normal['ptk']
        encrypted_url = normal['video_list']['video_3']['main_url']
    return aes_decrypt(data=encrypted_url, key=ptk)
def extract_info_by_re(text):
    """
    Extract video metadata from a XiGua video page using regular expressions.

    :param text: HTML of the video page.
    :return: dict with keys ``title``, ``url``, ``video_id``, ``like_count``,
        ``cover_url``, ``play_count``, ``publish_time`` and ``duration``.
        All values except ``url`` are the raw captured strings; ``url`` is
        whatever ``extract_video_url`` returns.
    :raises AttributeError: if any of the non-title patterns does not match.
    """
    # Title: content of the <title> element, cut at the first " - " separator.
    title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
    if title_match:
        title_content = title_match.group(1).split(" - ")[0]
        try:
            # Repair text that was decoded as latin1 although it was UTF-8.
            title_content = bytes(title_content, "latin1").decode()
        except (UnicodeEncodeError, UnicodeDecodeError):
            # The round-trip is not applicable (e.g. the title is already
            # proper Unicode); keep the title as captured.
            pass
    else:
        title_content = ""

    video_id = re.search(r'"vid":"(.*?)"', text).group(1)
    like_count = re.search(r'"video_like_count":(.*?),', text).group(1)
    # NOTE(review): the cover URL is read from the "avatar_url" field; confirm
    # that this is the intended source.
    cover_url = re.search(r'"avatar_url":"(.*?)"', text).group(1)
    video_watch_count = re.search(r'"video_watch_count":(.*?),', text).group(1)
    publish_time = re.search(r'"video_publish_time":"(.*?)"', text).group(1)
    # The capture runs up to the next double quote, so it includes the comma
    # that follows the number; strip it.
    duration = re.search(r'("video_duration":)(.*?)"', text).group(2).replace(",", "")

    return {
        "title": title_content,
        "url": extract_video_url(text),
        "video_id": video_id,
        "like_count": like_count,
        "cover_url": cover_url,
        "play_count": video_watch_count,
        "publish_time": publish_time,
        "duration": duration,
    }
def byte_dance_cookie(item_id):
    """
    Register with the ByteDance ttwid endpoint and return the issued cookie value.

    A short-lived session requests the slardar SDK script and then posts a
    registration payload to ``ttwid.bytedance.com``; the value of the first
    cookie set on that response is returned.

    NOTE(review): this is the second definition of ``byte_dance_cookie`` in
    this module; it replaces the earlier one at import time.

    :param item_id: id interpolated into the ``referer`` header.
    :return: value of the first cookie on the register response.
    :raises IndexError: if the register response set no cookie.
    :raises requests.RequestException: on network failure or timeout.
    """
    register_payload = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
    # The context manager releases the session's pooled connections on exit.
    with requests.Session() as sess:
        sess.headers.update({
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
        })
        # Request issued before registering; presumably primes the session
        # cookies (the response itself is not used).
        # Timeouts keep a stalled connection from blocking the crawler forever.
        sess.get('https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc', timeout=5)
        resp = sess.post(
            'https://ttwid.bytedance.com/ttwid/union/register/',
            data=register_payload,
            timeout=5,
        )
    cookie_values = resp.cookies.values()
    if not cookie_values:
        # Same exception type as indexing an empty list, but with a clear cause.
        raise IndexError("ttwid register response did not set any cookie")
    return cookie_values[0]
def get_video_info(item_id):
    """
    Download a XiGua video page and build its metadata record.

    The page is requested with a freshly obtained ``ttwid`` cookie and a
    random user agent, the function then sleeps 1-5 seconds, and the fields
    parsed by ``extract_info_by_re`` are converted into the output record.

    :param item_id: id of the video; used in the page URL and the referer.
    :return: dict describing the video (title, ids, counters, duration,
        publish time, cover and video URLs, session tag).
    """
    page_url = "https://www.ixigua.com/{}".format(item_id)
    ttwid_cookie = "ttwid={}".format(byte_dance_cookie(item_id))
    request_headers = {
        "accept-encoding": "gzip, deflate",
        "accept-language": "zh-CN,zh-Hans;q=0.9",
        "cookie": ttwid_cookie,
        "user-agent": FakeUserAgent().random,
        "referer": "https://www.ixigua.com/{}/".format(item_id),
    }
    page = requests.get(url=page_url, headers=request_headers, timeout=5)
    # Random pause after every page fetch.
    time.sleep(random.randint(1, 5))
    info = extract_info_by_re(page.text)

    record = {}
    record["video_title"] = info.get("title", "")
    record["video_id"] = info.get("video_id")
    record["gid"] = str(item_id)
    record["play_cnt"] = int(info.get("play_count", 0))
    record["like_cnt"] = int(info.get("like_count", 0))
    record["comment_cnt"] = 0
    record["share_cnt"] = 0
    record["favorite_cnt"] = 0
    record["duration"] = int(info.get("duration", 0))
    record["video_width"] = 0
    record["video_height"] = 0
    # Converted once and reused for both the numeric and the formatted field.
    publish_ts = int(info.get("publish_time", 0))
    record["publish_time_stamp"] = publish_ts
    record["publish_time_str"] = time.strftime(
        "%Y-%m-%d %H:%M:%S", time.localtime(publish_ts)
    )
    author = info.get("user_info", {})
    record["avatar_url"] = str(author.get("avatar_url", ""))
    # The page escapes "/" as \u002F inside its embedded JSON.
    record["cover_url"] = info.get("cover_url", "").replace("\\u002F", "/")
    record["video_url"] = info.get("url")
    record["session"] = f"xigua-author-{int(time.time())}"
    return record
def xigua_search(keyword):
    """
    Search XiGua for a keyword and resolve the first results to video URLs.

    The search page is fetched, the links of the horizontal feed cards are
    collected, and the first five are resolved through ``get_video_info``.
    A result that cannot be resolved is reported and skipped.

    :param keyword: search term (URL-quoted before being placed in the path).
    :return: list of ``["xigua", title, video_url, page_url]`` entries.
    :raises requests.RequestException: if the search page request fails or
        times out.
    """
    keyword = urllib.parse.quote(keyword)
    # NOTE(review): there is no "?" before "ab_name=..." in this URL; confirm
    # whether the query string is meant to be part of the path.
    base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
        keyword
    )
    # NOTE(review): the cookie below is a hard-coded captured session value;
    # consider loading it from configuration.
    headers = {
        "authority": "www.ixigua.com",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
        "cache-control": "max-age=0",
        "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
        "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    }
    # A timeout keeps a stalled connection from hanging the search forever.
    basic_response = requests.get(url=base_url, headers=headers, timeout=10)
    html = etree.HTML(basic_response.text)
    if html is None:
        # Nothing parseable came back, so there are no result links.
        return []
    result = html.xpath(
        '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
    )
    res_list = []
    for page_id in result[:5]:
        # href looks like "/<doc_id>?..."; drop the leading slash and the query.
        doc_id = page_id[1:].split("?")[0]
        try:
            res = get_video_info(doc_id)
            res_list.append([
                "xigua",
                res['video_title'],
                res['video_url'],
                "https://www.ixigua.com/{}".format(doc_id),
            ])
        except Exception as e:
            # Best-effort per result: report and move on. Catching Exception
            # (not a bare except) lets Ctrl-C / SystemExit still stop the run.
            print("xigua_search: skipping {}: {}".format(doc_id, e))
    return res_list