""" @author: luojunhui 好看视频搜索爬虫 """ import requests import urllib.parse import time import hashlib from fake_useragent import FakeUserAgent from applications.functions.common import MySQLServer def get_video_detail(video_id): """ 获取好看视频的视频链接 :param video_id: :return: """ url = "https://haokan.baidu.com/v" params = { 'vid': video_id, '_format': 'json', # 'hk_nonce': 'f47386e95fe657182aa3c1826d9a6b85', # 'hk_timestamp': '1715225386', # 'hk_sign': '4b219f5e3971e42b3e23dc2a209fc9d9', # 'hk_token': 'Dg8DdAVwdwNzDHcFcXF+D3gHBQA' } headers = { 'Accept': '*/*', 'cookie': "BIDUPSID='", 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': 'https://haokan.baidu.com', 'User-Agent': FakeUserAgent().chrome, } response = requests.request( "GET", url, headers=headers, params=params ).json() return response['data']['apiData']['curVideoMeta'] def hksp_search(key): """ 好看视频搜索爬虫 """ sensitive_words = MySQLServer().select_sensitive_words() def sensitive_flag(s_words, ori_title): """ :param ori_title: :param s_words: :return: """ for word in s_words: if word in ori_title: return False return True timestamp_seconds = time.time() timestamp_milliseconds = int(timestamp_seconds * 1000) url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video' # 定义请求的参数 strings = "{}_{}_{}_{}_{}".format(1, urllib.parse.quote(key), 10, timestamp_milliseconds, 1) sign = hashlib.md5(strings.encode()).hexdigest() params = { 'pn': 1, 'rn': 10, 'type': 'video', 'query': key, 'sign': sign, 'version': 1, 'timestamp': timestamp_milliseconds } # 定义请求头 headers = { 'authority': 'haokan.baidu.com', 'accept': '*/*', 'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8', 'cookie': "BIDUPSID=", 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', 'x-requested-with': 'xmlhttprequest', } # 发送GET请求 response = requests.get(url, headers=headers, params=params).json() try: data_list = response['data']['list'] L = [] for data in data_list: try: video_id = data['vid'] res = get_video_detail(video_id) if sensitive_flag(sensitive_words, ['title']) and int(res['duration']) <= 300: L.append(res) else: continue except Exception as e: print(e) pass return L except: return []