123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- """
- @author: luojunhui
- """
- import time
- import requests
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.common.by import By
- from webdriver_manager.chrome import ChromeDriverManager
def tunnel_proxies(
    tunnel="l901.kdltps.com:15818",
    username="t11983523373311",
    password="mtuhdr2z",
):
    """
    Build the ``proxies`` mapping for the Kuaidaili (快代理) tunnel proxy.

    :param tunnel: tunnel endpoint as ``host:port``
    :param username: proxy account user name
    :param password: proxy account password
    :return: dict with ``"http"`` and ``"https"`` keys, both pointing at the
        same ``http://user:password@host:port/`` proxy URL, in the shape
        accepted by the ``proxies`` argument of ``requests``
    """
    # Local import so this function stays self-contained.
    from urllib.parse import quote

    # NOTE(review): the default credentials are committed to source control;
    # they should be supplied by the caller (e.g. from configuration) instead.

    # Percent-encode the user-info part so characters such as "@", ":" or "/"
    # in the credentials cannot corrupt the proxy URL. The default values
    # contain no such characters, so the default result is unchanged.
    proxy_url = "http://%(user)s:%(pwd)s@%(proxy)s/" % {
        "user": quote(username, safe=""),
        "pwd": quote(password, safe=""),
        "proxy": tunnel,
    }
    # The same HTTP tunnel endpoint is used for both schemes.
    return {"http": proxy_url, "https": proxy_url}
def bjh_url_list(search_title, timeout=10):
    """
    Query the lab.magiconch.com Baidu image-search API and collect the
    Baijiahao article links found in the results.

    :param search_title: text to search for
    :param timeout: seconds to wait for the HTTP request before giving up
    :return: list of result URLs containing ``baijiahao.baidu.com``, each
        truncated at its first ``&``; empty if the payload is not a list
    :raises requests.RequestException: on connection errors, timeouts or a
        4xx/5xx HTTP status
    """
    url = "https://lab.magiconch.com/api/baidu/images"
    params = {
        "text": search_title,
        "index": 0,
        "size": 60
    }
    # Browser-like headers copied from a Chrome session. The original
    # 'if-none-match' header is intentionally omitted: this client keeps no
    # cache, and a matching ETag would yield "304 Not Modified" with an empty
    # body, which cannot be parsed as JSON.
    headers = {
        'accept': '*/*',
        'accept-language': 'en,zh;q=0.9,zh-CN;q=0.8',
        'content-type': 'application/json',
        'cookie': 'Hm_lvt_f4e477c61adf5c145ce938a05611d5f0=1718784293; Hm_lpvt_f4e477c61adf5c145ce938a05611d5f0=1718784293',
        'priority': 'u=1, i',
        'referer': 'https://lab.magiconch.com/baidu-images/',
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }
    response = requests.request(
        "GET",
        url,
        headers=headers,
        params=params,
        proxies=tunnel_proxies(),
        # Without a timeout a stalled proxy/server would block forever.
        timeout=timeout,
    )
    # Surface HTTP errors explicitly instead of failing later in .json().
    response.raise_for_status()
    res = response.json()
    if not isinstance(res, list):
        # Unexpected payload shape (e.g. an error object): nothing to collect.
        return []

    url_list = []
    for item in res:
        # Skip entries that are not dicts or carry no usable 'url' value.
        item_url = item.get('url') if isinstance(item, dict) else None
        if item_url and "baijiahao.baidu.com" in item_url:
            # Keep only the part before the first '&'.
            url_list.append(item_url.split("&")[0])
    return url_list
def bjh_article(content_url):
    """
    Load a Baijiahao article in headless Chrome and extract its content.

    The page is scrolled to the bottom repeatedly until its height stops
    growing, then the title, paragraph text and image URLs are read from the
    DOM. The URL and the extracted values are also printed.

    :param content_url: URL of the article page to open
    :return: tuple ``(title, page_text, img_url_list)`` where ``page_text``
        is the article paragraphs joined by newlines and ``img_url_list``
        holds the ``src`` attribute of each matched image
    :raises selenium.common.exceptions.WebDriverException: if the page cannot
        be driven or the title element is not found
    """
    print(content_url)
    # Configure headless browser mode.
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    # Install (if needed) and set up the Chrome driver.
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    # Everything after the browser is started runs under try/finally so the
    # Chrome process is always shut down, even when an element lookup fails.
    try:
        # Implicit wait: element lookups poll for up to 5 seconds. This does
        # not wait for page load by itself; it only affects find_element(s).
        driver.implicitly_wait(5)
        # Open the requested page.
        driver.get(content_url)

        # Scroll to the bottom until the page height stops changing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Give the page time to load further content.
            time.sleep(2)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        title_element = driver.find_element(By.XPATH, '//div[@class="sKHSJ"]')
        title = title_element.text
        page_text_elements = driver.find_elements(By.XPATH, '//div[@data-testid="article"]//p')
        page_text = '\n'.join([element.text for element in page_text_elements])
        img_elements = driver.find_elements(By.XPATH, '//div[@class="_1NCGf"]/img')
        img_url_list = [element.get_attribute("src") for element in img_elements]

        # Print the extracted content.
        print(title)
        print(page_text)
        print(img_url_list)
        return title, page_text, img_url_list
    finally:
        # Close the browser.
        driver.quit()
if __name__ == '__main__':
    # Search for a sample title, then scrape every Baijiahao article found.
    title = "老祖宗说“寿有三不过,子孙福气多”!原来话里藏着大秘密!"
    url_list = bjh_url_list(title)
    # print(url_list)
    for url in url_list:
        try:
            bjh_article(url)
        except Exception as e:
            # Best effort: one failing article must not stop the run, but the
            # failure is reported instead of being silently discarded.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            print(f"failed to fetch article {url}: {e!r}")
|