baidu_spider.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. """
  2. @author: luojunhui
  3. """
  4. import base64
  5. import uuid
  6. import requests
  7. from fake_useragent import FakeUserAgent
  8. from applications.exception import SpiderError
  9. from applications import Functions
  10. functions = Functions()
  11. def baidu_account_video_crawler(account_id, cursor=None):
  12. """
  13. baidu account video crawler
  14. :param account_id: 百度账号id
  15. :param cursor: 游标, 默认为None,表示从最新的开始爬取
  16. success requests:
  17. {
  18. "errno": 0,
  19. "errmsg": "成功",
  20. "data": {
  21. "response_count": 10,
  22. "has_more": 1,
  23. "ctime" : timestamp_ms plus one integer,
  24. "results": [
  25. {
  26. "tplName": "video",
  27. "type": "video",
  28. "content": {
  29. "vid": "6472901034127874496",
  30. "publish_time": "昨天",
  31. "title": "8年前妈妈囤黄金当彩礼,金价飙升后,我们全家乐开了花",
  32. "cover_src": "https://f7.baidu.com/it/u=1085139160,1164454909&fm=222&app=106&f=JPEG@s_0,w_660,h_370,q_80,f_auto",
  33. "cover_src_pc": "https://f7.baidu.com/it/u=1085139160,1164454909&fm=222&app=106&f=JPEG@s_0,w_660,h_370,q_80,f_auto",
  34. "thumbnails": "https://gimg0.baidu.com/gimg/src=h&refer=http%3A%2F%2Fwww.baidu.com&app=0&size=f339,225&n=0&g=0n&q=80?sec=0&t=f01af5f96ffb6d0d1904b33cbc2e136b",
  35. "duration": "03:15",
  36. "poster": "https://f7.baidu.com/it/u=1085139160,1164454909&fm=222&app=106&f=JPEG@s_0,w_660,h_370,q_80,f_auto",
  37. "playcnt": "1054",
  38. "playcntText": "1054次播放"
  39. }
  40. }...
  41. ]
  42. }
  43. }
  44. """
  45. cookie_str = uuid.uuid4().__str__().replace('-', '').upper()
  46. url = "https://haokan.baidu.com/web/author/listall?"
  47. params = {
  48. 'app_id': account_id,
  49. 'ctime': cursor,
  50. 'rn': 10,
  51. 'searchAfter': '',
  52. '_api': 1
  53. }
  54. headers = {
  55. 'Accept': '*/*',
  56. 'Accept-Language': 'zh,zh-CN;q=0.9',
  57. 'Connection': 'keep-alive',
  58. 'Referer': 'https://haokan.baidu.com/author/{}'.format(account_id),
  59. 'User-Agent': FakeUserAgent().chrome,
  60. 'x-requested-with': 'xmlhttprequest',
  61. 'Cookie': 'BAIDUID={}:FG=1; BAIDUID_BFESS={}:FG=1'.format(cookie_str, cookie_str)
  62. }
  63. try:
  64. response = requests.request("GET", url, headers=headers, params=params, proxies=functions.proxy())
  65. response_json = response.json()
  66. if response_json['errmsg'] == '成功':
  67. response_data = response_json['data']
  68. return response_data
  69. else:
  70. raise SpiderError(
  71. platform="baidu",
  72. spider="account_video_crawler",
  73. error=response_json['errmsg'],
  74. url=url
  75. )
  76. except Exception as e:
  77. raise SpiderError(
  78. platform="baidu",
  79. spider="account_video_crawler",
  80. error=str(e),
  81. url=url
  82. )
  83. def baidu_single_video_crawler(video_id):
  84. """
  85. baidu video crawler
  86. :param video_id: 视频id
  87. """
  88. url = "https://haokan.baidu.com/v"
  89. params = {
  90. 'vid': video_id,
  91. '_format': 'json'
  92. }
  93. base_64_string = base64.b64encode(str(uuid.uuid4()).encode()).decode()
  94. headers = {
  95. 'Accept': '*/*',
  96. 'cookie': "BIDUPSID={}".format(base_64_string),
  97. 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
  98. 'Cache-Control': 'no-cache',
  99. 'Connection': 'keep-alive',
  100. 'Content-Type': 'application/x-www-form-urlencoded',
  101. 'Referer': 'https://haokan.baidu.com',
  102. 'User-Agent': FakeUserAgent().chrome,
  103. }
  104. try:
  105. response = requests.request("GET", url, headers=headers, params=params, proxies=functions.proxy())
  106. response_json = response.json()
  107. return response_json['data']['apiData']['curVideoMeta']
  108. except Exception as e:
  109. raise SpiderError(
  110. platform="baidu",
  111. spider="single_video_crawler",
  112. error=str(e),
  113. url=url
  114. )