123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238 |
- """
- @author: luojunhui
- 西瓜视频搜索爬虫
- """
- import re
- import json
- import time
- import random
- import base64
- import urllib.parse
- import requests
- from lxml import etree
- from Crypto.Cipher import AES
- from Crypto.Util.Padding import unpad
- from fake_useragent import FakeUserAgent
def byte_dance_cookie(item_id, timeout=5):
    """
    Obtain a cookie value for Xigua Video from the ttwid register endpoint.

    NOTE: this file defines ``byte_dance_cookie`` twice with the same
    behaviour; the later definition is the one bound at import time.

    :param item_id: id interpolated into the ``referer`` header.
    :param timeout: seconds to wait for each HTTP request before giving up.
    :return: value of the first cookie set by the register response
        (the caller in this file sends it back as the ``ttwid`` cookie).
    :raises IndexError: if the register response sets no cookie.
    """
    register_payload = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
    # Use the session as a context manager so its connections are released.
    with requests.Session() as sess:
        sess.headers.update({
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
        })
        # Preliminary request on the same session; any cookies it sets are
        # carried over to the register call below.
        sess.get(
            'https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc',
            timeout=timeout,
        )
        r = sess.post(
            'https://ttwid.bytedance.com/ttwid/union/register/',
            data=register_payload,
            timeout=timeout,
        )
        cookie_values = r.cookies.values()
    if not cookie_values:
        # Same exception type as indexing an empty list, but with a message
        # that says what actually went wrong.
        raise IndexError("ttwid register response did not set any cookie")
    return cookie_values[0]
def aes_decrypt(data: str, key: str) -> str:
    """
    Decrypt a base64-wrapped, AES-CBC encrypted Xigua value.

    The key bytes are used as the AES key and their first 16 bytes as the IV.
    The decrypted, unpadded payload is itself base64 and is decoded once more
    before being returned as text.

    :param data: base64 text of the ciphertext.
    :param key: text whose encoded bytes are the AES key.
    :return: the decoded plain text; ``None`` if any step fails (the error is
        printed rather than raised).
    """
    key_bytes = key.encode()
    try:
        encrypted = base64.b64decode(data.encode())
        decryptor = AES.new(key_bytes, AES.MODE_CBC, key_bytes[:16])
        inner_b64 = unpad(decryptor.decrypt(encrypted), AES.block_size)
        plain_text = base64.b64decode(inner_b64).decode()
    except Exception as e:
        # Best effort: report the failure and signal it with None.
        print("Incorrect decryption {}".format(e))
        return None
    return plain_text
# Matches either a complete double-quoted string literal (kept verbatim) or a
# bare ``undefined`` token (the only value here that json.loads cannot parse).
_STRING_OR_UNDEFINED = re.compile(r'"(?:\\.|[^"\\])*"|\bundefined\b')


def _undefined_to_null(js_object_text):
    """Replace bare ``undefined`` tokens with ``null`` without touching string contents."""
    return _STRING_OR_UNDEFINED.sub(
        lambda m: m.group(0) if m.group(0).startswith('"') else 'null',
        js_object_text,
    )


def extract_video_url(text):
    """
    Extract and decrypt the video URL from a page's SSR_HYDRATED_DATA script.

    The object literal embedded in the script is parsed as JSON.
    ``null``/``true``/``false`` are valid JSON already, so only ``undefined``
    is rewritten, and only outside string literals. A blanket text replace
    would also alter those substrings inside string values such as the
    base64 ``main_url`` or ``ptk``, corrupting them.

    :param text: HTML source of the video page.
    :return: result of ``aes_decrypt`` on the selected ``main_url``
        (``None`` when decryption fails).
    :raises IndexError: if the page has no ``SSR_HYDRATED_DATA`` script text.
    :raises KeyError: if the expected keys are absent from the parsed data.
    """
    page = etree.HTML(text)
    script_text = page.xpath('//script[@id="SSR_HYDRATED_DATA"]/text()')[0]
    # Keep only the outermost {...} so any assignment prefix/suffix is dropped.
    raw_json = script_text[script_text.find('{'):script_text.rfind('}') + 1]
    hydrated = json.loads(_undefined_to_null(raw_json))
    video_resource = hydrated["anyVideo"]["gidInformation"]["packerData"]["video"]["videoResource"]

    dash = video_resource.get('dash')
    if isinstance(dash, dict):
        ptk = dash["ptk"]
        encrypted_url = dash['dynamic_video']['main_url']
    else:
        # No usable dash entry (null/undefined/other scalar): fall back to
        # the "normal" resource and its video_3 entry.
        normal = video_resource['normal']
        ptk = normal['ptk']
        encrypted_url = normal['video_list']['video_3']['main_url']
    return aes_decrypt(data=encrypted_url, key=ptk)
def _first_group(pattern, text, group=1):
    """Return the given group of the first match of *pattern* in *text*, or None if no match."""
    found = re.search(pattern, text)
    return found.group(group) if found else None


def extract_info_by_re(text):
    """
    Pull video metadata out of the page source with regular expressions.

    ``title`` and ``url`` are always present in the result. Every other key
    is included only when its pattern matched, so callers that read the
    result with ``.get(key, default)`` receive their default instead of this
    function raising ``AttributeError`` on a missing field.

    :param text: HTML source of the video page.
    :return: dict with keys ``title``, ``url`` and, when found, ``video_id``,
        ``like_count``, ``cover_url``, ``play_count``, ``publish_time``,
        ``duration`` (all values are the raw matched strings).
    """
    # Title: text before the first " - " inside <title>.
    title_content = ""
    title_match = re.search(r'<title[^>]*>(.*?)</title>', text)
    if title_match:
        title_content = title_match.group(1).split(" - ")[0]
        try:
            # Undo mojibake from a page that was decoded as latin1.
            title_content = bytes(title_content, "latin1").decode()
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Text was not latin1-mangled UTF-8; keep it unchanged.
            pass

    # The duration pattern captures up to the next quote, which drags in the
    # trailing comma; strip it.
    raw_duration = _first_group(r'("video_duration":)(.*?)"', text, group=2)
    optional_fields = {
        "video_id": _first_group(r'"vid":"(.*?)"', text),
        "like_count": _first_group(r'"video_like_count":(.*?),', text),
        # NOTE(review): cover_url is taken from the first "avatar_url" in the
        # page — confirm this is the intended field.
        "cover_url": _first_group(r'"avatar_url":"(.*?)"', text),
        "play_count": _first_group(r'"video_watch_count":(.*?),', text),
        "publish_time": _first_group(r'"video_publish_time":"(.*?)"', text),
        "duration": None if raw_duration is None else raw_duration.replace(",", ""),
    }

    info = {
        "title": title_content,
        "url": extract_video_url(text),
    }
    info.update(
        {name: value for name, value in optional_fields.items() if value is not None}
    )
    return info
def byte_dance_cookie(item_id, timeout=5):
    """
    Obtain a cookie value for Xigua Video from the ttwid register endpoint.

    NOTE: this is the second definition of ``byte_dance_cookie`` in this
    file; being the later one, it is the definition callers actually use.

    :param item_id: id interpolated into the ``referer`` header.
    :param timeout: seconds to wait for each HTTP request before giving up.
    :return: value of the first cookie set by the register response
        (the caller in this file sends it back as the ``ttwid`` cookie).
    :raises IndexError: if the register response sets no cookie.
    """
    register_payload = '{"region":"cn","aid":1768,"needFid":false,"service":"www.ixigua.com","migrate_info":{"ticket":"","source":"node"},"cbUrlProtocol":"https","union":true}'
    # Use the session as a context manager so its connections are released.
    with requests.Session() as sess:
        sess.headers.update({
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
            'referer': 'https://www.ixigua.com/home/{}/'.format(item_id),
        })
        # Preliminary request on the same session; any cookies it sets are
        # carried over to the register call below.
        sess.get(
            'https://i.snssdk.com/slardar/sdk.js?bid=xigua_video_web_pc',
            timeout=timeout,
        )
        r = sess.post(
            'https://ttwid.bytedance.com/ttwid/union/register/',
            data=register_payload,
            timeout=timeout,
        )
        cookie_values = r.cookies.values()
    if not cookie_values:
        # Same exception type as indexing an empty list, but with a message
        # that says what actually went wrong.
        raise IndexError("ttwid register response did not set any cookie")
    return cookie_values[0]
def get_video_info(item_id):
    """
    Download a Xigua video page and return its metadata as a flat dict.

    :param item_id: id of the video page under https://www.ixigua.com/.
    :return: dict of title, ids, counters, duration, publish time, cover
        and video URLs, plus a time-stamped ``session`` label.
    """
    ttwid_cookie = "ttwid={}".format(byte_dance_cookie(item_id))
    request_headers = {
        "accept-encoding": "gzip, deflate",
        "accept-language": "zh-CN,zh-Hans;q=0.9",
        "cookie": ttwid_cookie,
        "user-agent": FakeUserAgent().random,
        "referer": f"https://www.ixigua.com/{item_id}/",
    }
    response = requests.get(
        url=f"https://www.ixigua.com/{item_id}",
        headers=request_headers,
        # proxies=tunnel_proxies(),
        timeout=5,
    )
    # Random 1-5 second pause after each page fetch.
    time.sleep(random.randint(1, 5))
    info = extract_info_by_re(response.text)

    # Numeric fields are converted up front, in the order they appear below.
    play_cnt = int(info.get("play_count", 0))
    like_cnt = int(info.get("like_count", 0))
    duration = int(info.get("duration", 0))
    publish_ts = int(info.get("publish_time", 0))
    publish_str = time.strftime(
        "%Y-%m-%d %H:%M:%S",
        time.localtime(int(info.get("publish_time", 0))),
    )
    # extract_info_by_re, as defined in this file, returns no "user_info"
    # key, so this resolves to "" unless that changes.
    avatar_url = str(info.get("user_info", {}).get("avatar_url", ""))
    # The page escapes "/" as \u002F inside URLs; restore it.
    cover_url = info.get("cover_url", "").replace("\\u002F", "/")

    return {
        "video_title": info.get("title", ""),
        "video_id": info.get("video_id"),
        "gid": str(item_id),
        "play_cnt": play_cnt,
        "like_cnt": like_cnt,
        "comment_cnt": 0,
        "share_cnt": 0,
        "favorite_cnt": 0,
        "duration": duration,
        "video_width": 0,
        "video_height": 0,
        "publish_time_stamp": publish_ts,
        "publish_time_str": publish_str,
        "avatar_url": avatar_url,
        "cover_url": cover_url,
        "video_url": info.get("url"),
        "session": f"xigua-author-{int(time.time())}",
    }
def xigua_search(keyword, max_results=5, timeout=5):
    """
    Search Xigua Video for *keyword* and resolve the first few results.

    :param keyword: search phrase; it is URL-quoted before use.
    :param max_results: how many result links (in page order) to resolve.
    :param timeout: seconds to wait for the search page request.
    :return: list of ``["xigua", title, video_url, page_url]`` entries, one
        per result whose details could be fetched; results that fail are
        reported with ``print`` and skipped.
    """
    keyword = urllib.parse.quote(keyword)
    base_url = "https://www.ixigua.com/search/{}/ab_name=search&fss=input".format(
        keyword
    )
    # NOTE(review): the cookie below is a hard-coded browser session captured
    # at some point in time; it will go stale and should come from config.
    headers = {
        "authority": "www.ixigua.com",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh,en;q=0.9,zh-CN;q=0.8",
        "cache-control": "max-age=0",
        "cookie": "ixigua-a-s=1; support_webp=true; support_avif=true; csrf_session_id=a5355d954d3c63ed1ba35faada452b4d; tt_scid=Ur23fgYD2pMJOvi1BpILyfaobg8wA7IhGwmQx260ULRa8Dvjaxc5ZA63BUIP-6Vi473f; ttwid=1%7CNtTtSp4Iej-v0nWtepdZH3d3Ts6uGNMFzTN20ps1cdo%7C1708236945%7Cc1f301c64aa3bf69cdaa41f28856e2bb7b7eed16583f8c92d50cffa2d9944fc6; msToken=rr418opQf04vm8n9s8FAGdr1AoCUsvAOGKSDPbBEfwVS1sznxxZCvcZTI93qVz5uAXlX9yRwcKlNQZ4wMro2DmlHw5yWHAVeKr_SzgO1KtVVnjUMTUNEux_cq1-EIkI=",
        "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "none",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    }
    basic_response = requests.get(url=base_url, headers=headers, timeout=timeout)
    html = etree.HTML(basic_response.text)
    if html is None:
        # An empty body parses to None; there is nothing to search through.
        return []
    result = html.xpath(
        '//a[@class="HorizontalFeedCard__coverWrapper disableZoomAnimation"]/@href'
    )
    res_list = []
    for page_id in result[:max_results]:
        # href looks like "/<id>?<query>": drop the leading slash and query.
        doc_id = page_id[1:].split("?")[0]
        try:
            res = get_video_info(doc_id)
        except Exception as e:
            # Best effort per result: report and move on. Unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit propagate.
            print("xigua_search: skipping {}: {}".format(doc_id, e))
            continue
        res_list.append(
            ["xigua", res['video_title'], res['video_url'], "https://www.ixigua.com/{}".format(doc_id)]
        )
    return res_list
|