baidu_spider.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. """
  2. @author: luojunhui
  3. """
  4. import base64
  5. import uuid
  6. import requests
  7. from fake_useragent import FakeUserAgent
  8. from applications.exception import SpiderError
  9. from applications import Functions
# Module-level Functions instance shared by the crawlers in this file; its
# proxy() method supplies the `proxies` argument for every outgoing request.
functions = Functions()
  11. def baidu_account_video_crawler(account_id, cursor=None):
  12. """
  13. baidu account video crawler
  14. :param account_id: 百度账号id
  15. :param cursor: 游标, 默认为None,表示从最新的开始爬取
  16. success requests:
  17. """
  18. cookie_str = uuid.uuid4().__str__().replace('-', '').upper()
  19. url = "https://haokan.baidu.com/web/author/listall?"
  20. params = {
  21. 'app_id': account_id,
  22. 'ctime': cursor,
  23. 'rn': 10,
  24. 'searchAfter': '',
  25. '_api': 1
  26. }
  27. headers = {
  28. 'Accept': '*/*',
  29. 'Accept-Language': 'zh,zh-CN;q=0.9',
  30. 'Connection': 'keep-alive',
  31. 'Referer': 'https://haokan.baidu.com/author/{}'.format(account_id),
  32. 'User-Agent': FakeUserAgent().chrome,
  33. 'x-requested-with': 'xmlhttprequest',
  34. 'Cookie': 'BAIDUID={}:FG=1; BAIDUID_BFESS={}:FG=1'.format(cookie_str, cookie_str)
  35. }
  36. try:
  37. response = requests.request("GET", url, headers=headers, params=params, proxies=functions.proxy())
  38. response_json = response.json()
  39. if response_json['errmsg'] == '成功':
  40. response_data = response_json['data']
  41. return response_data
  42. else:
  43. raise SpiderError(
  44. platform="baidu",
  45. spider="account_video_crawler",
  46. error=response_json['errmsg'],
  47. url=url
  48. )
  49. except Exception as e:
  50. raise SpiderError(
  51. platform="baidu",
  52. spider="account_video_crawler",
  53. error=str(e),
  54. url=url
  55. )
  56. def baidu_single_video_crawler(video_id):
  57. """
  58. baidu video crawler
  59. :param video_id: 视频id
  60. """
  61. url = "https://haokan.baidu.com/v"
  62. params = {
  63. 'vid': video_id,
  64. '_format': 'json'
  65. }
  66. base_64_string = base64.b64encode(str(uuid.uuid4()).encode()).decode()
  67. headers = {
  68. 'Accept': '*/*',
  69. 'cookie': "BIDUPSID={}".format(base_64_string),
  70. 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
  71. 'Cache-Control': 'no-cache',
  72. 'Connection': 'keep-alive',
  73. 'Content-Type': 'application/x-www-form-urlencoded',
  74. 'Referer': 'https://haokan.baidu.com',
  75. 'User-Agent': FakeUserAgent().chrome,
  76. }
  77. try:
  78. response = requests.request("GET", url, headers=headers, params=params, proxies=functions.proxy())
  79. response_json = response.json()
  80. return response_json['data']['apiData']['curVideoMeta']
  81. except Exception as e:
  82. raise SpiderError(
  83. platform="baidu",
  84. spider="single_video_crawler",
  85. error=str(e),
  86. url=url
  87. )