|
@@ -0,0 +1,133 @@
|
|
|
+"""
|
|
|
+@author: luojunhui
|
|
|
+"""
|
|
|
+import time
|
|
|
+import requests
|
|
|
+from selenium import webdriver
|
|
|
+from selenium.webdriver.chrome.service import Service
|
|
|
+from selenium.webdriver.chrome.options import Options
|
|
|
+from selenium.webdriver.common.by import By
|
|
|
+from webdriver_manager.chrome import ChromeDriverManager
|
|
|
+
|
|
|
+
|
|
|
def tunnel_proxies(
    tunnel="l901.kdltps.com:15818",
    username="t11983523373311",
    password="mtuhdr2z",
):
    """
    Build the ``proxies`` mapping for the KuaiDaiLi tunnel proxy.

    The same HTTP proxy URL is used for both the ``http`` and ``https``
    schemes.

    :param tunnel: tunnel endpoint in ``host:port`` form
    :param username: proxy account user name
    :param password: proxy account password
    :return: dict with ``"http"`` and ``"https"`` keys, both mapped to
        ``http://<username>:<password>@<tunnel>/``
    """
    # Function-scope import so the block does not depend on a file-level import.
    from urllib.parse import quote

    # NOTE(review): the default credentials are committed in source; consider
    # supplying them from configuration or the environment via the parameters.

    # Percent-encode the credentials so reserved characters such as "@", ":"
    # or "/" cannot corrupt the user-info part of the proxy URL.
    user_info = f"{quote(username, safe='')}:{quote(password, safe='')}"
    proxy_url = f"http://{user_info}@{tunnel}/"
    return {"http": proxy_url, "https": proxy_url}
|
|
|
+
|
|
|
+
|
|
|
def bjh_url_list(search_title, timeout=15):
    """
    Query the image-search API and collect the Baijiahao URLs it returns.

    Sends ``search_title`` to ``https://lab.magiconch.com/api/baidu/images``
    through the tunnel proxy, then keeps every result whose ``url`` contains
    ``baijiahao.baidu.com``, truncated at the first ``&``.

    :param search_title: text to search for
    :param timeout: seconds to wait for the HTTP request before giving up;
        without a timeout ``requests`` may block indefinitely
    :return: list of Baijiahao URLs (possibly empty)
    :raises requests.RequestException: on connection failure, timeout or a
        4xx/5xx response
    """
    url = "https://lab.magiconch.com/api/baidu/images"
    params = {
        "text": search_title,
        "index": 0,
        "size": 60,
    }
    # Headers mirror a captured browser request.
    # NOTE(review): 'if-none-match' makes this a conditional request; if the
    # server ever answers 304 the body will be empty and .json() will fail —
    # confirm whether this header is actually needed.
    headers = {
        'accept': '*/*',
        'accept-language': 'en,zh;q=0.9,zh-CN;q=0.8',
        'content-type': 'application/json',
        'cookie': 'Hm_lvt_f4e477c61adf5c145ce938a05611d5f0=1718784293; Hm_lpvt_f4e477c61adf5c145ce938a05611d5f0=1718784293',
        'if-none-match': 'W/"5e03-9dK2z/6rD0/7aX0R6HraLuFnLjI"',
        'priority': 'u=1, i',
        'referer': 'https://lab.magiconch.com/baidu-images/',
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }
    response = requests.request(
        "GET",
        url,
        headers=headers,
        params=params,
        proxies=tunnel_proxies(),
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    res = response.json()

    # Presumably the payload is a JSON array of objects with a "url" key;
    # entries that do not match that shape are skipped rather than crashing.
    url_list = []
    for item in res:
        item_url = item.get("url") if isinstance(item, dict) else None
        if isinstance(item_url, str) and "baijiahao.baidu.com" in item_url:
            # Keep only the part before the first "&".
            url_list.append(item_url.split("&")[0])
    return url_list
|
|
|
+
|
|
|
+
|
|
|
def bjh_article(content_url):
    """
    Open a Baijiahao article in headless Chrome and extract its content.

    The page is scrolled to the bottom repeatedly until its height stops
    growing, then the title, paragraph text and image URLs are read from
    the DOM and printed.

    :param content_url: URL of the article page to open
    :return: tuple ``(title, page_text, img_url_list)`` where ``page_text``
        is the article paragraphs joined by newlines and ``img_url_list``
        holds the ``src`` attribute of each matched image
    :raises Exception: any WebDriver error is propagated (for example when
        the title element cannot be located); the browser is always closed
        first
    """
    print(content_url)

    # Configure headless Chrome.
    chrome_options = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    # Install the matching Chrome driver and start the browser.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        # Open the requested page.
        driver.get(content_url)
        # Allow element lookups below to poll for up to 5 seconds.
        driver.implicitly_wait(5)

        # Scroll to the bottom until the page height no longer changes.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Give the page time to load more content.
            time.sleep(2)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        title = driver.find_element(By.XPATH, '//div[@class="sKHSJ"]').text
        page_text_elements = driver.find_elements(By.XPATH, '//div[@data-testid="article"]//p')
        page_text = '\n'.join(element.text for element in page_text_elements)
        img_elements = driver.find_elements(By.XPATH, '//div[@class="_1NCGf"]/img')
        img_url_list = [element.get_attribute("src") for element in img_elements]

        # Print the extracted content.
        print(title)
        print(page_text)
        print(img_url_list)
        return title, page_text, img_url_list
    finally:
        # Always shut the browser down, even when extraction fails, so a
        # failed page does not leave a Chrome process behind.
        driver.quit()
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Script entry point: search for one title and fetch every article found.
    title = "老祖宗说“寿有三不过,子孙福气多”!原来话里藏着大秘密!"
    url_list = bjh_url_list(title)
    for url in url_list:
        try:
            bjh_article(url)
        except Exception as exc:
            # Best-effort crawl: report the failing URL and move on to the
            # next one. Catching Exception (not a bare except) keeps Ctrl-C
            # and SystemExit working.
            print(f"failed to fetch {url}: {exc!r}")
|