123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132 |
"""
@author: luojunhui
Search crawler for Haokan Video (好看视频, haokan.baidu.com)
"""
- import json
- import time
- import base64
- import hashlib
- import requests
- import urllib.parse
- from uuid import uuid4
- from fake_useragent import FakeUserAgent
- from applications.functions.common import sensitive_flag
def tunnel_proxies(
    tunnel="l901.kdltps.com:15818",
    username="t11983523373311",
    password="mtuhdr2z",
):
    """
    Build the ``proxies`` mapping for the Kuaidaili tunnel proxy.

    Called without arguments it returns the same mapping as before; the
    parameters only exist so that callers can point at another tunnel or
    account without editing this function.

    :param tunnel: tunnel endpoint as ``host:port``
    :param username: proxy account user name
    :param password: proxy account password
    :return: dict with ``http`` and ``https`` keys, both set to the same
        ``http://user:password@host:port/`` proxy URL
    """
    # NOTE(review): the default credentials are committed in source; consider
    # supplying them from configuration instead.
    # Percent-encode the credentials so characters such as "@", ":" or "/"
    # cannot break the URL structure (a no-op for the default values).
    proxy_url = "http://{}:{}@{}/".format(
        urllib.parse.quote(username, safe=""),
        urllib.parse.quote(password, safe=""),
        tunnel,
    )
    return {"http": proxy_url, "https": proxy_url}
def get_video_detail(video_id):
    """
    Fetch the current-video metadata of one Haokan video (used to obtain the
    video link).

    Errors are not handled here: network/timeout errors from ``requests``,
    JSON decoding errors and ``KeyError`` for an unexpected response shape
    all propagate to the caller.

    :param video_id: Haokan video id, sent as the ``vid`` query parameter
    :return: the ``data -> apiData -> curVideoMeta`` object of the JSON response
    """
    url = "https://haokan.baidu.com/v"
    params = {
        'vid': video_id,
        '_format': 'json'
    }
    # A fresh random BIDUPSID cookie value is generated for every request.
    base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
    headers = {
        'Accept': '*/*',
        'cookie': "BIDUPSID={}".format(base_64_string),
        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Referer': 'https://haokan.baidu.com',
        'User-Agent': FakeUserAgent().chrome,
    }
    response = requests.request(
        "GET",
        url,
        headers=headers,
        params=params,
        proxies=tunnel_proxies(),
        # Without a timeout a stalled proxy/server would block forever;
        # use the same limit as hksp_search.
        timeout=120
    ).json()
    # Pause after each detail request (presumably to throttle the crawl rate).
    time.sleep(2)
    return response['data']['apiData']['curVideoMeta']
def _duration_to_seconds(duration):
    """Convert a ``SS``, ``MM:SS`` or ``HH:MM:SS`` string into seconds."""
    seconds = 0
    for part in duration.split(":"):
        seconds = seconds * 60 + int(part)
    return seconds


def hksp_search(key, sensitive_words, trace_id):
    """
    Search Haokan Video for ``key`` and fetch the detail of the first
    acceptable result.

    A result is acceptable when ``sensitive_flag(sensitive_words, title)`` is
    truthy and its duration is at most 300 seconds. The function returns as
    soon as one acceptable result has been fetched, so the returned list
    holds at most one item.

    :param key: search keyword
    :param sensitive_words: passed through to ``sensitive_flag`` together with
        each result title
    :param trace_id: caller-supplied id, only used to tag skipped-result messages
    :return: ``[video_meta]`` for the first acceptable result, or ``[]`` when
        nothing qualifies or the search request fails
    """
    timestamp_milliseconds = int(time.time() * 1000)
    url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video'
    # Request signature: md5 over "1_<url-quoted key>_10_<timestamp ms>_1".
    # NOTE(review): the constants presumably mirror pn / rn / version below —
    # confirm before changing any of them.
    strings = "{}_{}_{}_{}_{}".format(1, urllib.parse.quote(key), 10, timestamp_milliseconds, 1)
    sign = hashlib.md5(strings.encode()).hexdigest()
    params = {
        'pn': 1,
        'rn': 10,
        'type': 'video',
        'query': key,
        'sign': sign,
        'version': 1,
        'timestamp': timestamp_milliseconds
    }
    # A fresh random BIDUPSID cookie value is generated for every request.
    base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
    headers = {
        'authority': 'haokan.baidu.com',
        'accept': '*/*',
        'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8',
        'cookie': "BIDUPSID={}".format(base_64_string),
        'user-agent': FakeUserAgent().chrome,
        'x-requested-with': 'xmlhttprequest',
    }
    try:
        response = requests.get(
            url,
            headers=headers,
            params=params,
            proxies=tunnel_proxies(),
            timeout=120
        ).json()
        data_list = response['data']['list']
        for data in data_list:
            # Best effort per item: a malformed entry or a failed detail
            # request must not abort the scan of the remaining results.
            try:
                video_id = data['vid']
                title = data['title']
                # Parse every ":"-separated field so that "H:MM:SS" is not
                # mistaken for a short "MM:SS" clip.
                duration = _duration_to_seconds(data['duration'])
                # presumably sensitive_flag is truthy when the title is
                # acceptable — verify against applications.functions.common
                if sensitive_flag(sensitive_words, title) and duration <= 300:
                    # Stop at the first acceptable video.
                    return [get_video_detail(video_id)]
            except Exception as e:
                print("[{}] skip search result: {!r}".format(trace_id, e))
        return []
    except Exception as e:
        print(e)
        return []
if __name__ == '__main__':
    # Manual smoke test: run one search and pretty-print every returned item.
    for video_meta in hksp_search("90岁上海大爷征婚", sensitive_words=[], trace_id="testId"):
        print(json.dumps(video_meta, ensure_ascii=False, indent=4))
|