# spider.py — crawlers for haokan.baidu.com search results and video details.
  1. from __future__ import annotations
  2. import json
  3. import base64
  4. import hashlib
  5. import requests
  6. import urllib.parse
  7. from datetime import datetime
  8. from tenacity import retry
  9. from uuid import uuid4
  10. from fake_useragent import FakeUserAgent
  11. from applications import log
  12. from applications.utils import proxy, request_retry
# Shared tenacity retry policy for all haokan requests: up to 3 attempts,
# with a backoff delay bounded between 2s and 30s (built by the project
# helper `request_retry`; unpacked into `@retry(**retry_desc)` below).
retry_desc = request_retry(retry_times=3, min_retry_delay=2, max_retry_delay=30)
  14. @retry(**retry_desc)
  15. def haokan_search_videos(search_key: str) -> dict | None:
  16. """
  17. get haokan search videos
  18. :param search_key: search key
  19. :return: haokan search videos
  20. """
  21. timestamp_with_ms = datetime.now().timestamp()
  22. timestamp_ms = int(timestamp_with_ms * 1000)
  23. query_string = urllib.parse.quote(search_key)
  24. strings = "{}_{}_{}_{}_{}".format(1, query_string, 10, timestamp_ms, 1)
  25. sign = hashlib.md5(strings.encode()).hexdigest()
  26. url = f"https://haokan.baidu.com/haokan/ui-search/pc/search/video?pn=1&rn=10&type=video&query={query_string}&sign={sign}&version=1&timestamp={timestamp_ms}"
  27. base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
  28. headers = {
  29. "Accept": "*/*",
  30. "Accept-Language": "zh",
  31. "Connection": "keep-alive",
  32. "Referer": "https://haokan.baidu.com/web/search/page?query={}".format(
  33. query_string
  34. ),
  35. "User-Agent": FakeUserAgent().chrome,
  36. "Cookie": "BAIDUID={}".format(base_64_string),
  37. }
  38. try:
  39. response = requests.get(url, headers=headers, proxies=proxy(), timeout=120)
  40. response.raise_for_status()
  41. return response.json()
  42. except requests.exceptions.RequestException as e:
  43. log(
  44. task="haokan_crawler_videos",
  45. function="haokan_search_videos",
  46. message=f"API请求失败: {e}",
  47. data={"search_key": search_key},
  48. )
  49. except json.JSONDecodeError as e:
  50. log(
  51. task="haokan_crawler_videos",
  52. function="haokan_search_videos",
  53. message=f"响应解析失败: {e}",
  54. data={"search_key": search_key},
  55. )
  56. return None
  57. @retry(**retry_desc)
  58. def haokan_fetch_video_detail(video_id: str) -> dict | None:
  59. """
  60. get haokan video detail
  61. :param video_id: video id
  62. :return: haokan video detail
  63. """
  64. url = "https://haokan.baidu.com/v"
  65. params = {
  66. 'vid': video_id,
  67. '_format': 'json'
  68. }
  69. base_64_string = base64.b64encode(str(uuid4()).encode()).decode()
  70. headers = {
  71. 'Accept': '*/*',
  72. 'cookie': "BIDUPSID={}".format(base_64_string),
  73. 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
  74. 'Cache-Control': 'no-cache',
  75. 'Connection': 'keep-alive',
  76. 'Content-Type': 'application/x-www-form-urlencoded',
  77. 'Referer': 'https://haokan.baidu.com',
  78. 'User-Agent': FakeUserAgent().chrome,
  79. }
  80. try:
  81. response = requests.get(url, headers=headers, proxies=proxy(), params=params, timeout=120)
  82. response.raise_for_status()
  83. return response.json()
  84. except requests.exceptions.RequestException as e:
  85. log(
  86. task="haokan_crawler_videos",
  87. function="haokan_get_detail",
  88. message=f"API请求失败: {e}",
  89. data={"video_id": video_id},
  90. )
  91. except json.JSONDecodeError as e:
  92. log(
  93. task="haokan_crawler_videos",
  94. function="haokan_get_detail",
  95. message=f"响应解析失败: {e}",
  96. data={"video_id": video_id},
  97. )
  98. return None