hksp_search.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. """
  2. @author: luojunhui
  3. 好看视频搜索爬虫
  4. """
  5. import requests
  6. import urllib.parse
  7. import time
  8. import hashlib
  9. from applications.functions.common import MySQLServer
  10. def tunnel_proxies():
  11. # 隧道域名:端口号
  12. tunnel = "q796.kdltps.com:15818"
  13. # 用户名密码方式
  14. username = "t17772369458618"
  15. password = "5zqcjkmy"
  16. proxies = {
  17. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
  18. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
  19. }
  20. return proxies
  21. def get_video_detail(video_id):
  22. """
  23. 获取好看视频的视频链接
  24. :param video_id:
  25. :return:
  26. """
  27. url = "https://haokan.baidu.com/v"
  28. params = {
  29. 'vid': video_id,
  30. '_format': 'json',
  31. # 'hk_nonce': 'f47386e95fe657182aa3c1826d9a6b85',
  32. # 'hk_timestamp': '1715225386',
  33. # 'hk_sign': '4b219f5e3971e42b3e23dc2a209fc9d9',
  34. # 'hk_token': 'Dg8DdAVwdwNzDHcFcXF+D3gHBQA'
  35. }
  36. headers = {
  37. 'Accept': '*/*',
  38. 'cookie': "BIDUPSID='",
  39. 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
  40. 'Cache-Control': 'no-cache',
  41. 'Connection': 'keep-alive',
  42. 'Content-Type': 'application/x-www-form-urlencoded',
  43. 'Referer': 'https://haokan.baidu.com',
  44. 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
  45. }
  46. response = requests.request(
  47. "GET",
  48. url,
  49. headers=headers,
  50. params=params,
  51. proxies=tunnel_proxies()
  52. ).json()
  53. return response['data']['apiData']['curVideoMeta']
  54. def hksp_search(key):
  55. """
  56. 好看视频搜索爬虫
  57. """
  58. sensitive_words = MySQLServer().select_sensitive_words()
  59. def sensitive_flag(s_words, ori_title):
  60. """
  61. :param ori_title:
  62. :param s_words:
  63. :return:
  64. """
  65. for word in s_words:
  66. if word in ori_title:
  67. return False
  68. return True
  69. timestamp_seconds = time.time()
  70. timestamp_milliseconds = int(timestamp_seconds * 1000)
  71. url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video'
  72. # 定义请求的参数
  73. strings = "{}_{}_{}_{}_{}".format(1, urllib.parse.quote(key), 10, timestamp_milliseconds, 1)
  74. sign = hashlib.md5(strings.encode()).hexdigest()
  75. params = {
  76. 'pn': 1,
  77. 'rn': 10,
  78. 'type': 'video',
  79. 'query': key,
  80. 'sign': sign,
  81. 'version': 1,
  82. 'timestamp': timestamp_milliseconds
  83. }
  84. # 定义请求头
  85. headers = {
  86. 'authority': 'haokan.baidu.com',
  87. 'accept': '*/*',
  88. 'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8',
  89. 'cookie': "BIDUPSID=",
  90. 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
  91. 'x-requested-with': 'xmlhttprequest',
  92. }
  93. # 发送GET请求
  94. response = requests.get(url, headers=headers, params=params).json()
  95. print(response)
  96. data_list = response['data']['list']
  97. L = []
  98. for data in data_list:
  99. try:
  100. video_id = data['vid']
  101. res = get_video_detail(video_id)
  102. if sensitive_flag(sensitive_words, ['title']) and int(res['duration']) <= 300:
  103. L.append(res)
  104. else:
  105. continue
  106. except:
  107. pass
  108. return L
  109. L = hksp_search("【头次】对拜登背后放冷枪,越南干得漂亮,俄罗斯立功,中国成赢家!")
  110. print(L)