""" @author: luojunhui 好看视频搜索爬虫 """ import json import time import base64 import hashlib import requests import urllib.parse from uuid import uuid4 from fake_useragent import FakeUserAgent from applications.functions.common import sensitive_flag def tunnel_proxies(): """ 快代理 :return: """ # 隧道域名:端口号 tunnel = "l901.kdltps.com:15818" # 用户名密码方式 username = "t11983523373311" password = "mtuhdr2z" proxies = { "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}, "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel} } return proxies def get_video_detail(video_id): """ 获取好看视频的视频链接 :param video_id: :return: """ url = "https://haokan.baidu.com/v" params = { 'vid': video_id, '_format': 'json' } base_64_string = base64.b64encode(str(uuid4()).encode()).decode() headers = { 'Accept': '*/*', 'cookie': "BIDUPSID={}".format(base_64_string), 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8', 'Cache-Control': 'no-cache', 'Connection': 'keep-alive', 'Content-Type': 'application/x-www-form-urlencoded', 'Referer': 'https://haokan.baidu.com', 'User-Agent': FakeUserAgent().chrome, } response = requests.request( "GET", url, headers=headers, params=params, proxies=tunnel_proxies() ).json() time.sleep(2) return response['data']['apiData']['curVideoMeta'] def hksp_search(key, sensitive_words, trace_id): """ 好看视频搜索爬虫 """ timestamp_seconds = time.time() timestamp_milliseconds = int(timestamp_seconds * 1000) url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video' # 定义请求的参数 strings = "{}_{}_{}_{}_{}".format(1, urllib.parse.quote(key), 10, timestamp_milliseconds, 1) sign = hashlib.md5(strings.encode()).hexdigest() params = { 'pn': 1, 'rn': 10, 'type': 'video', 'query': key, 'sign': sign, 'version': 1, 'timestamp': timestamp_milliseconds } # 定义请求头 base_64_string = base64.b64encode(str(uuid4()).encode()).decode() headers = { 'authority': 'haokan.baidu.com', 'accept': '*/*', 'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8', 'cookie': "BIDUPSID={}".format(base_64_string), 'user-agent': FakeUserAgent().chrome, 'x-requested-with': 'xmlhttprequest', } # 发送GET请求 try: response = requests.get( url, headers=headers, params=params, proxies=tunnel_proxies(), timeout=120 ).json() data_list = response['data']['list'] L = [] for data in data_list: try: video_id = data['vid'] title = data['title'] duration = int(data['duration'].split(":")[0]) * 60 + int(data['duration'].split(":")[1]) if sensitive_flag(sensitive_words, title) and int(duration) <= 300: res = get_video_detail(video_id) L.append(res) return L else: continue except Exception as e: pass return L except Exception as e: print(e) return [] if __name__ == '__main__': res = hksp_search("90岁上海大爷征婚", sensitive_words=[], trace_id="testId") for item in res: print(json.dumps(item, ensure_ascii=False, indent=4))