get_img.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. # -*- coding: utf-8 -*-
  2. # @Author: wangkun
  3. # @Time: 2023/9/11
  4. from urllib.parse import urlencode
  5. from urllib.request import urlretrieve
  6. import requests
  7. import time
  8. def getPage(offset):
  9. """获取网页信息"""
  10. data = {
  11. 'tn': 'resultjson_com',
  12. 'ipn': 'rj',
  13. 'ct': '201326592',
  14. 'is': '',
  15. 'fp': 'result',
  16. 'queryWord': '街拍',
  17. 'cl': '2',
  18. 'lm': '-1',
  19. 'ie': 'utf - 8',
  20. 'oe': 'utf - 8',
  21. 'adpicid': '',
  22. 'st': '-1',
  23. 'z': '',
  24. 'ic': '0',
  25. 'hd': '',
  26. 'latest': '',
  27. 'copyright': '',
  28. 'word': '街拍',
  29. 's': '',
  30. 'se': '',
  31. 'tab': '',
  32. 'width': '',
  33. 'height': '',
  34. 'face': '0',
  35. 'istype': '2',
  36. 'qc': '',
  37. 'nc': '1',
  38. 'fr': '',
  39. 'expermode': '',
  40. 'force': '',
  41. 'pn': offset,
  42. 'rn': '30',
  43. 'gsm': '1e',
  44. '1551789143500': '',
  45. }
  46. headers = {
  47. 'Accept': 'text/plain, */*; q=0.01',
  48. 'Accept-Encoding': 'deflate, br',
  49. 'Accept-Language': 'Accept-Language',
  50. 'Connection': 'keep-alive',
  51. 'Cookie': 'BDqhfp=%E8%A1%97%E6%8B%8D%26%260-10-1undefined%26%260%26%261; BIDUPSID=7CA5F033CA22949F5FB6110DBC5DC1EE; BAIDUID=6DDE5BAA44763FD6C7CA84401CB19F36:FG=1; indexPageSugList=%5B%22%E8%A1%97%E6%8B%8D%22%5D; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; uploadTime=1551768107224; userFrom=null; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; firstShowTip=1; cleanHistoryStatus=0',
  52. 'Host': 'image.baidu.com',
  53. 'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E8%A1%97%E6%8B%8D&oq=%E8%A1%97%E6%8B%8D&rsp=-1',
  54. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6735.400 QQBrowser/10.2.2328.400',
  55. 'X-Requested-With': 'XMLHttpRequest',
  56. }
  57. url = 'https://image.baidu.com/search/acjson?' + urlencode(data)
  58. try:
  59. res = requests.get(url, data=data, headers=headers)
  60. res.encoding = 'utf-8' # 网页信息编码
  61. if res.status_code == 200:
  62. return res.json()
  63. except requests.ConnectionError:
  64. return None
  65. def getImage(json):
  66. """解析网页数据并爬取所需的信息"""
  67. try:
  68. data = json.get('data')
  69. if data:
  70. for item in data:
  71. yield {
  72. 'image': item.get('hoverURL'),
  73. 'title': item.get('fromPageTitleEnc'),
  74. }
  75. except:
  76. return None
  77. def saveImage(item):
  78. """把获取的图片与标题封装并存储"""
  79. try:
  80. m = item.get('title')
  81. local_image = item.get('image') # 获取图片的url
  82. image_url = local_image
  83. urlretrieve(image_url, './pic/' + str(m) + '.jpg')
  84. # print('p'+str(m) + '.jpg')
  85. except:
  86. return None
  87. def main(offset):
  88. """调度爬取函数和存储"""
  89. json = getPage(offset)
  90. for item in getImage(json):
  91. print(item)
  92. saveImage(item)
  93. if __name__ == '__main__':
  94. for i in range(5): # 此处循环遍历五次是不可行的 每次data值中的gsm在变化
  95. main(offset=i * 30)
  96. time.sleep(1)