hksp_search.py 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. """
  2. @author: luojunhui
  3. 好看视频搜索爬虫
  4. """
  5. import json
  6. import time
  7. import base64
  8. import hashlib
  9. import requests
  10. import urllib.parse
  11. from uuid import uuid4
  12. from fake_useragent import FakeUserAgent
  13. from applications.functions.common import MySQLServer
  14. def get_video_detail(video_id):
  15. """
  16. 获取好看视频的视频链接
  17. :param video_id:
  18. :return:
  19. """
  20. url = "https://haokan.baidu.com/v"
  21. params = {
  22. 'vid': video_id,
  23. '_format': 'json'
  24. }
  25. base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
  26. headers = {
  27. 'Accept': '*/*',
  28. 'cookie': "BIDUPSID={}".format(base_64_string),
  29. 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
  30. 'Cache-Control': 'no-cache',
  31. 'Connection': 'keep-alive',
  32. 'Content-Type': 'application/x-www-form-urlencoded',
  33. 'Referer': 'https://haokan.baidu.com',
  34. 'User-Agent': FakeUserAgent().chrome,
  35. }
  36. response = requests.request(
  37. "GET",
  38. url,
  39. headers=headers,
  40. params=params
  41. ).json()
  42. return response['data']['apiData']['curVideoMeta']
  43. def hksp_search(key):
  44. """
  45. 好看视频搜索爬虫
  46. """
  47. sensitive_words = MySQLServer().select_sensitive_words()
  48. def sensitive_flag(s_words, ori_title):
  49. """
  50. :param ori_title:
  51. :param s_words:
  52. :return:
  53. """
  54. for word in s_words:
  55. if word in ori_title:
  56. return False
  57. return True
  58. timestamp_seconds = time.time()
  59. timestamp_milliseconds = int(timestamp_seconds * 1000)
  60. url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video'
  61. # 定义请求的参数
  62. strings = "{}_{}_{}_{}_{}".format(1, urllib.parse.quote(key), 10, timestamp_milliseconds, 1)
  63. sign = hashlib.md5(strings.encode()).hexdigest()
  64. params = {
  65. 'pn': 1,
  66. 'rn': 10,
  67. 'type': 'video',
  68. 'query': key,
  69. 'sign': sign,
  70. 'version': 1,
  71. 'timestamp': timestamp_milliseconds
  72. }
  73. # 定义请求头
  74. base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
  75. headers = {
  76. 'authority': 'haokan.baidu.com',
  77. 'accept': '*/*',
  78. 'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8',
  79. 'cookie': "BIDUPSID={}".format(base_64_string),
  80. 'user-agent': FakeUserAgent().chrome,
  81. 'x-requested-with': 'xmlhttprequest',
  82. }
  83. # 发送GET请求
  84. response = requests.get(url, headers=headers, params=params).json()
  85. try:
  86. data_list = response['data']['list']
  87. L = []
  88. for data in data_list:
  89. try:
  90. video_id = data['vid']
  91. title = data['title']
  92. duration = int(data['duration'].split(":")[0]) * 60 + int(data['duration'].split(":")[1])
  93. if sensitive_flag(sensitive_words, title) and int(duration) <= 300:
  94. res = get_video_detail(video_id)
  95. L.append(res)
  96. else:
  97. continue
  98. except Exception as e:
  99. print(e)
  100. pass
  101. return L
  102. except:
  103. return []