"""
@author: luojunhui

Search Baidu images for a title through the lab.magiconch.com API, keep the
hits that point at Baijiahao (baijiahao.baidu.com) articles, and scrape each
article's title, body text and image URLs with headless Chrome.
"""
import functools
import logging
import os
import time
from typing import Any, Dict, Iterable, List, Tuple

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

logger = logging.getLogger(__name__)

# Seconds to wait for the image-search API; requests has no default timeout
# and would otherwise block forever on a stalled proxy or server.
REQUEST_TIMEOUT = 30

# Seconds to pause after each scroll so the page can extend itself.
SCROLL_PAUSE = 2


def tunnel_proxies() -> Dict[str, str]:
    """
    Build the ``proxies`` mapping for the Kuaidaili (快代理) tunnel proxy.

    The tunnel address and the username/password pair are read from the
    environment variables ``KDL_TUNNEL``, ``KDL_USERNAME`` and
    ``KDL_PASSWORD``; when a variable is unset the value that was previously
    hard-coded in this file is used, so existing callers keep working.

    :return: mapping with ``"http"`` and ``"https"`` keys, both pointing at
        the same ``http://user:password@host:port/`` proxy URL
    """
    # NOTE(security): the fallback credentials below are committed in source.
    # Prefer the environment variables and rotate/remove these defaults.
    tunnel = os.environ.get("KDL_TUNNEL", "l901.kdltps.com:15818")
    username = os.environ.get("KDL_USERNAME", "t11983523373311")
    password = os.environ.get("KDL_PASSWORD", "mtuhdr2z")

    # Both schemes go through the same HTTP proxy endpoint.
    proxy_url = f"http://{username}:{password}@{tunnel}/"
    return {"http": proxy_url, "https": proxy_url}


def _filter_bjh_urls(items: Iterable[Any]) -> List[str]:
    """Return the Baijiahao URLs found in *items*, cut at the first ``&``."""
    url_list = []
    for item in items:
        # Skip malformed entries instead of failing the whole search.
        if not isinstance(item, dict):
            continue
        item_url = item.get("url") or ""
        if "baijiahao.baidu.com" in item_url:
            # Keep only the part before the first "&" (drops trailing
            # query parameters).
            url_list.append(item_url.split("&")[0])
    return url_list


def bjh_url_list(search_title: str) -> List[str]:
    """
    Search images for *search_title* and return the Baijiahao article URLs.

    :param search_title: text sent as the ``text`` query parameter
    :return: list of URLs containing ``baijiahao.baidu.com``, each truncated
        at its first ``&``
    :raises requests.RequestException: on network failure, timeout or an
        HTTP error status
    :raises ValueError: if the response body is not JSON or not a JSON array
    """
    url = "https://lab.magiconch.com/api/baidu/images"
    params = {
        "text": search_title,
        "index": 0,
        "size": 60,
    }
    # The original also sent a hard-coded `if-none-match` ETag. requests keeps
    # no cache, so a matching ETag would yield a body-less 304 that cannot be
    # parsed as JSON; the conditional header is therefore not sent.
    headers = {
        'accept': '*/*',
        'accept-language': 'en,zh;q=0.9,zh-CN;q=0.8',
        'content-type': 'application/json',
        'cookie': 'Hm_lvt_f4e477c61adf5c145ce938a05611d5f0=1718784293; Hm_lpvt_f4e477c61adf5c145ce938a05611d5f0=1718784293',
        'priority': 'u=1, i',
        'referer': 'https://lab.magiconch.com/baidu-images/',
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    }
    response = requests.get(
        url,
        headers=headers,
        params=params,
        proxies=tunnel_proxies(),
        timeout=REQUEST_TIMEOUT,
    )
    # Fail loudly on 4xx/5xx rather than trying to parse an error page.
    response.raise_for_status()

    res = response.json()
    if not isinstance(res, list):
        raise ValueError(
            f"unexpected image-search payload: {type(res).__name__}"
        )
    return _filter_bjh_urls(res)


@functools.lru_cache(maxsize=1)
def _chrome_driver_path() -> str:
    """Resolve the chromedriver binary once per process and reuse the path."""
    return ChromeDriverManager().install()


def _scroll_to_bottom(driver, pause: float = SCROLL_PAUSE) -> None:
    """Scroll *driver*'s page down until its height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Jump to the current bottom of the page.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to append more content.
        time.sleep(pause)
        # Stop once a scroll no longer changes the document height.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def bjh_article(content_url: str) -> Tuple[str, str, List[str]]:
    """
    Scrape one Baijiahao article with headless Chrome.

    The URL, title, text and image list are also printed to stdout.

    :param content_url: address of the article page
    :return: ``(title, page_text, img_url_list)`` where ``page_text`` is the
        article's ``<p>`` texts joined by newlines
    :raises selenium.common.exceptions.WebDriverException: if the browser
        cannot be started or the title element is not found
    """
    print(content_url)

    # Headless-browser configuration.
    chrome_options = Options()
    for flag in (
        "--headless",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        chrome_options.add_argument(flag)

    service = Service(_chrome_driver_path())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        # Implicit wait governs how long element lookups poll before failing;
        # it does not wait for page load, so it is set before navigating.
        driver.implicitly_wait(5)
        driver.get(content_url)

        # Scroll to the bottom so content added on scroll is present.
        _scroll_to_bottom(driver)

        # NOTE(review): "sKHSJ" and "_1NCGf" look like generated class names
        # and may change when the site is redeployed - confirm periodically.
        title = driver.find_element(By.XPATH, '//div[@class="sKHSJ"]').text

        page_text_elements = driver.find_elements(
            By.XPATH, '//div[@data-testid="article"]//p'
        )
        page_text = '\n'.join(element.text for element in page_text_elements)

        img_elements = driver.find_elements(
            By.XPATH, '//div[@class="_1NCGf"]/img'
        )
        img_url_list = [
            element.get_attribute("src") for element in img_elements
        ]
    finally:
        # Always shut the browser down, even when scraping fails; otherwise
        # every failed article leaves a headless Chrome process behind.
        driver.quit()

    print(title)
    print(page_text)
    print(img_url_list)
    return title, page_text, img_url_list


def main() -> None:
    """Scrape every Baijiahao article found for the sample title."""
    logging.basicConfig(level=logging.INFO)
    title = "老祖宗说“寿有三不过,子孙福气多”!原来话里藏着大秘密!"
    for url in bjh_url_list(title):
        try:
            bjh_article(url)
        except Exception:
            # Best effort: one broken article must not stop the rest, but the
            # failure is recorded. KeyboardInterrupt/SystemExit still
            # propagate so the script can be stopped.
            logger.exception("Failed to scrape %s", url)


if __name__ == '__main__':
    main()