baidu.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  1. """
  2. @author: luojunhui
  3. 好看视频搜索爬虫
  4. """
  5. import json
  6. import time
  7. import base64
  8. import hashlib
  9. import requests
  10. import urllib.parse
  11. from uuid import uuid4
  12. from fake_useragent import FakeUserAgent
  13. from applications.functions.common import sensitive_flag
  14. def tunnel_proxies():
  15. """
  16. 快代理
  17. :return:
  18. """
  19. # 隧道域名:端口号
  20. tunnel = "l901.kdltps.com:15818"
  21. # 用户名密码方式
  22. username = "t11983523373311"
  23. password = "mtuhdr2z"
  24. proxies = {
  25. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
  26. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
  27. }
  28. return proxies
  29. def get_video_detail(video_id):
  30. """
  31. 获取好看视频的视频链接
  32. :param video_id:
  33. :return:
  34. """
  35. url = "https://haokan.baidu.com/v"
  36. params = {
  37. 'vid': video_id,
  38. '_format': 'json'
  39. }
  40. base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
  41. headers = {
  42. 'Accept': '*/*',
  43. 'cookie': "BIDUPSID={}".format(base_64_string),
  44. 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
  45. 'Cache-Control': 'no-cache',
  46. 'Connection': 'keep-alive',
  47. 'Content-Type': 'application/x-www-form-urlencoded',
  48. 'Referer': 'https://haokan.baidu.com',
  49. 'User-Agent': FakeUserAgent().chrome,
  50. }
  51. response = requests.request(
  52. "GET",
  53. url,
  54. headers=headers,
  55. params=params,
  56. proxies=tunnel_proxies()
  57. ).json()
  58. time.sleep(2)
  59. return response['data']['apiData']['curVideoMeta']
  60. def hksp_search(key, sensitive_words, trace_id):
  61. """
  62. 好看视频搜索爬虫
  63. """
  64. timestamp_seconds = time.time()
  65. timestamp_milliseconds = int(timestamp_seconds * 1000)
  66. url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video'
  67. # 定义请求的参数
  68. strings = "{}_{}_{}_{}_{}".format(1, urllib.parse.quote(key), 10, timestamp_milliseconds, 1)
  69. sign = hashlib.md5(strings.encode()).hexdigest()
  70. params = {
  71. 'pn': 1,
  72. 'rn': 10,
  73. 'type': 'video',
  74. 'query': key,
  75. 'sign': sign,
  76. 'version': 1,
  77. 'timestamp': timestamp_milliseconds
  78. }
  79. # 定义请求头
  80. base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
  81. headers = {
  82. 'authority': 'haokan.baidu.com',
  83. 'accept': '*/*',
  84. 'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8',
  85. 'cookie': "BIDUPSID={}".format(base_64_string),
  86. 'user-agent': FakeUserAgent().chrome,
  87. 'x-requested-with': 'xmlhttprequest',
  88. }
  89. # 发送GET请求
  90. try:
  91. response = requests.get(
  92. url,
  93. headers=headers,
  94. params=params,
  95. proxies=tunnel_proxies(),
  96. timeout=120
  97. ).json()
  98. data_list = response['data']['list']
  99. L = []
  100. for data in data_list:
  101. try:
  102. video_id = data['vid']
  103. title = data['title']
  104. duration = int(data['duration'].split(":")[0]) * 60 + int(data['duration'].split(":")[1])
  105. if sensitive_flag(sensitive_words, title) and int(duration) <= 300:
  106. res = get_video_detail(video_id)
  107. L.append(res)
  108. return L
  109. else:
  110. continue
  111. except Exception as e:
  112. pass
  113. return L
  114. except Exception as e:
  115. print(e)
  116. return []
  117. if __name__ == '__main__':
  118. res = hksp_search("90岁上海大爷征婚", sensitive_words=[], trace_id="testId")
  119. for item in res:
  120. print(json.dumps(item, ensure_ascii=False, indent=4))