hksp_search.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. """
  2. @author: luojunhui
  3. 好看视频搜索爬虫
  4. """
  5. import json
  6. import time
  7. import base64
  8. import hashlib
  9. import requests
  10. import urllib.parse
  11. from uuid import uuid4
  12. from fake_useragent import FakeUserAgent
  13. from applications.functions.common import sensitive_flag
  14. from applications.log import logging
  15. from applications.const import server_const
  16. def tunnel_proxies():
  17. """
  18. 快代理
  19. :return:
  20. """
  21. # 隧道域名:端口号
  22. tunnel = "l901.kdltps.com:15818"
  23. # 用户名密码方式
  24. username = "t11983523373311"
  25. password = "mtuhdr2z"
  26. proxies = {
  27. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
  28. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
  29. }
  30. return proxies
  31. def get_video_detail(video_id):
  32. """
  33. 获取好看视频的视频链接
  34. :param video_id:
  35. :return:
  36. """
  37. url = "https://haokan.baidu.com/v"
  38. params = {
  39. 'vid': video_id,
  40. '_format': 'json'
  41. }
  42. base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
  43. headers = {
  44. 'Accept': '*/*',
  45. 'cookie': "BIDUPSID={}".format(base_64_string),
  46. 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
  47. 'Cache-Control': 'no-cache',
  48. 'Connection': 'keep-alive',
  49. 'Content-Type': 'application/x-www-form-urlencoded',
  50. 'Referer': 'https://haokan.baidu.com',
  51. 'User-Agent': FakeUserAgent().chrome,
  52. }
  53. response = requests.request(
  54. "GET",
  55. url,
  56. headers=headers,
  57. params=params,
  58. proxies=tunnel_proxies()
  59. ).json()
  60. time.sleep(2)
  61. return response['data']['apiData']['curVideoMeta']
  62. def hksp_search(key, sensitive_words, trace_id):
  63. """
  64. 好看视频搜索爬虫
  65. """
  66. timestamp_seconds = time.time()
  67. timestamp_milliseconds = int(timestamp_seconds * 1000)
  68. url = 'https://haokan.baidu.com/haokan/ui-search/pc/search/video'
  69. # 定义请求的参数
  70. strings = "{}_{}_{}_{}_{}".format(1, urllib.parse.quote(key), 10, timestamp_milliseconds, 1)
  71. sign = hashlib.md5(strings.encode()).hexdigest()
  72. params = {
  73. 'pn': 1,
  74. 'rn': 10,
  75. 'type': 'video',
  76. 'query': key,
  77. 'sign': sign,
  78. 'version': 1,
  79. 'timestamp': timestamp_milliseconds
  80. }
  81. # 定义请求头
  82. base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
  83. headers = {
  84. 'authority': 'haokan.baidu.com',
  85. 'accept': '*/*',
  86. 'accept-language': 'zh,en;q=0.9,zh-CN;q=0.8',
  87. 'cookie': "BIDUPSID={}".format(base_64_string),
  88. 'user-agent': FakeUserAgent().chrome,
  89. 'x-requested-with': 'xmlhttprequest',
  90. }
  91. # 发送GET请求
  92. try:
  93. response = requests.get(
  94. url,
  95. headers=headers,
  96. params=params,
  97. proxies=tunnel_proxies(),
  98. timeout=120
  99. ).json()
  100. data_list = response['data']['list']
  101. L = []
  102. logging(
  103. code="4002",
  104. info="百度搜索成功",
  105. trace_id=trace_id
  106. )
  107. for data in data_list:
  108. try:
  109. video_id = data['vid']
  110. title = data['title']
  111. duration = int(data['duration'].split(":")[0]) * 60 + int(data['duration'].split(":")[1])
  112. if sensitive_flag(sensitive_words, title) and int(duration) <= server_const.MAX_VIDEO_DURATION:
  113. res = get_video_detail(video_id)
  114. L.append(res)
  115. else:
  116. continue
  117. except Exception as e:
  118. pass
  119. logging(
  120. code="8001",
  121. info="百度搜索",
  122. data={
  123. "keys": key,
  124. "search_count": len(data_list),
  125. "useful_count": len(L)
  126. },
  127. trace_id=trace_id
  128. )
  129. return L
  130. except Exception as e:
  131. logging(
  132. code="4003",
  133. info="百度搜索失败",
  134. trace_id=trace_id,
  135. data={"error": str(e)}
  136. )
  137. return []