# -*- coding: utf-8 -*-
# @Author: wangkun
# @Time: 2023/9/11
import os
import time
from urllib.parse import urlencode
from urllib.request import urlretrieve

import requests


def getPage(offset):
    """Fetch one page of Baidu image-search results and return the parsed JSON."""
    data = {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': '201326592',
        'is': '',
        'fp': 'result',
        'queryWord': '街拍',
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'adpicid': '',
        'st': '-1',
        'z': '',
        'ic': '0',
        'hd': '',
        'latest': '',
        'copyright': '',
        'word': '街拍',
        's': '',
        'se': '',
        'tab': '',
        'width': '',
        'height': '',
        'face': '0',
        'istype': '2',
        'qc': '',
        'nc': '1',
        'fr': '',
        'expermode': '',
        'force': '',
        'pn': offset,
        'rn': '30',
        'gsm': '1e',
        '1551789143500': '',
    }
    headers = {
        'Accept': 'text/plain, */*; q=0.01',
        'Accept-Encoding': 'deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'BDqhfp=%E8%A1%97%E6%8B%8D%26%260-10-1undefined%26%260%26%261; BIDUPSID=7CA5F033CA22949F5FB6110DBC5DC1EE; BAIDUID=6DDE5BAA44763FD6C7CA84401CB19F36:FG=1; indexPageSugList=%5B%22%E8%A1%97%E6%8B%8D%22%5D; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; uploadTime=1551768107224; userFrom=null; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; firstShowTip=1; cleanHistoryStatus=0',
        'Host': 'image.baidu.com',
        'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E8%A1%97%E6%8B%8D&oq=%E8%A1%97%E6%8B%8D&rsp=-1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6735.400 QQBrowser/10.2.2328.400',
        'X-Requested-With': 'XMLHttpRequest',
    }
    # The query parameters are already encoded into the URL, so no request body is needed.
    url = 'https://image.baidu.com/search/acjson?' + urlencode(data)
    try:
        res = requests.get(url, headers=headers, timeout=10)
        res.encoding = 'utf-8'  # force UTF-8 decoding of the response body
        if res.status_code == 200:
            return res.json()
    except requests.ConnectionError:
        return None


def getImage(json):
    """Parse the result JSON and yield the image URL and title of each item."""
    if not json:
        return
    for item in json.get('data') or []:
        yield {
            'image': item.get('hoverURL'),
            'title': item.get('fromPageTitleEnc'),
        }


def saveImage(item):
    """Download one image and store it under ./pic, using its title as the filename."""
    try:
        title = item.get('title')
        image_url = item.get('image')  # URL of the image
        if not image_url:
            return
        os.makedirs('./pic', exist_ok=True)
        urlretrieve(image_url, './pic/' + str(title) + '.jpg')
        # print('p' + str(title) + '.jpg')
    except Exception:
        return None


def main(offset):
    """Fetch one page, then parse and save every image on it."""
    json = getPage(offset)
    if not json:
        return
    for item in getImage(json):
        print(item)
        saveImage(item)


if __name__ == '__main__':
    for i in range(5):
        # A fixed five-pass loop may not work here: the gsm value in data changes on every request.
        main(offset=i * 30)
        time.sleep(1)
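

# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original script): the loop in __main__ notes
# that fixed offsets may fail because gsm changes between requests. The helper
# below shows one way pagination *could* carry that token forward. It assumes,
# without verification, that (1) the acjson response JSON contains a 'gsm'
# field meant for the next request and (2) this trimmed-down parameter set is
# enough for the endpoint to answer; both are assumptions, not documented
# behaviour of the interface.
def crawlWithGsm(word='街拍', pages=5):
    """Paginate by feeding each response's 'gsm' token into the next request."""
    gsm = '1e'  # initial token, same as the hard-coded value in getPage()
    headers = {
        'User-Agent': 'Mozilla/5.0',
        'X-Requested-With': 'XMLHttpRequest',
    }
    for i in range(pages):
        params = {
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'queryWord': word,
            'word': word,
            'pn': i * 30,
            'rn': '30',
            'gsm': gsm,
        }
        try:
            res = requests.get('https://image.baidu.com/search/acjson',
                               params=params, headers=headers, timeout=10)
            page = res.json() if res.status_code == 200 else None
        except (requests.ConnectionError, ValueError):
            break
        if not page:
            break
        gsm = page.get('gsm', gsm)  # assumed field; keep the old token if absent
        for item in getImage(page):
            saveImage(item)
        time.sleep(1)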