| 
					
				 | 
			
			
				@@ -0,0 +1,133 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+@author: luojunhui 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import time 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import requests 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium import webdriver 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.chrome.service import Service 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.chrome.options import Options 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from selenium.webdriver.common.by import By 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from webdriver_manager.chrome import ChromeDriverManager 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def tunnel_proxies(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    快代理 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    :return: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 隧道域名:端口号 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    tunnel = "l901.kdltps.com:15818" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 用户名密码方式 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    username = "t11983523373311" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    password = "mtuhdr2z" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    proxies = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return proxies 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def bjh_url_list(search_title): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    获取图片list 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    :return: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    url = "https://lab.magiconch.com/api/baidu/images" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    params = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "text": search_title, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "index": 0, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "size": 60 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    headers = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'accept': '*/*', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'accept-language': 'en,zh;q=0.9,zh-CN;q=0.8', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'content-type': 'application/json', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'cookie': 'Hm_lvt_f4e477c61adf5c145ce938a05611d5f0=1718784293; Hm_lpvt_f4e477c61adf5c145ce938a05611d5f0=1718784293', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'if-none-match': 'W/"5e03-9dK2z/6rD0/7aX0R6HraLuFnLjI"', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'priority': 'u=1, i', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'referer': 'https://lab.magiconch.com/baidu-images/', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'sec-ch-ua-mobile': '?0', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'sec-ch-ua-platform': '"macOS"', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'sec-fetch-dest': 'empty', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'sec-fetch-mode': 'cors', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'sec-fetch-site': 'same-origin', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    response = requests.request( 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "GET", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        url, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        headers=headers, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        params=params, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        proxies=tunnel_proxies() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    ) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    res = response.json() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    url_list = [] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for item in res: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        if "baijiahao.baidu.com" in item['url']: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            url_list.append(item['url'].split("&")[0]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return url_list 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def bjh_article(content_url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    百家号获取文章 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    :param content_url: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 配置无头浏览器模式 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(content_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    chrome_options = Options() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    chrome_options.add_argument("--headless") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    chrome_options.add_argument("--disable-gpu") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    chrome_options.add_argument("--no-sandbox") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    chrome_options.add_argument("--disable-dev-shm-usage") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 安装并设置Chrome驱动 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    service = Service(ChromeDriverManager().install()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver = webdriver.Chrome(service=service, options=chrome_options) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 打开指定网页 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver.get(content_url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 等待网页加载完毕 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver.implicitly_wait(5) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 模拟滚动页面 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    def scroll_page(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        # 获取页面高度 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        last_height = driver.execute_script("return document.body.scrollHeight") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        while True: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            # 向下滚动到页面底部 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            # 等待页面加载 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            time.sleep(2) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            # 计算新的页面高度并与上次页面高度进行比较 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            new_height = driver.execute_script("return document.body.scrollHeight") 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            if new_height == last_height: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+                break 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            last_height = new_height 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    scroll_page() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    title_element = driver.find_element(By.XPATH, '//div[@class="sKHSJ"]') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    title = title_element.text 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    page_text_elements = driver.find_elements(By.XPATH, '//div[@data-testid="article"]//p') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    page_text = '\n'.join([element.text for element in page_text_elements]) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    img_elements = driver.find_elements(By.XPATH, '//div[@class="_1NCGf"]/img') 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    img_url_list = [element.get_attribute("src") for element in img_elements] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 打印网页文本 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(title) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(page_text) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    print(img_url_list) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # 关闭浏览器 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    driver.quit() 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return title, page_text, img_url_list 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+if __name__ == '__main__': 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    title = "老祖宗说“寿有三不过,子孙福气多”!原来话里藏着大秘密!" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    url_list = bjh_url_list(title) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    # print(url_list) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    for url in url_list: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        try: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            bjh_article(url) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        except: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+            pass 
			 |