""" @author: luojunhui """ import time import random import requests from lxml import etree from fake_useragent import FakeUserAgent from gne import GeneralNewsExtractor from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.options import Options from webdriver_manager.chrome import ChromeDriverManager def selenium_text(url): # 配置 Chrome 选项 chrome_options = Options() chrome_options.add_argument('--headless') # 无头模式 # chrome_options.add_argument('--disable-gpu') # chrome_options.add_argument('--no-sandbox') # chrome_options.add_argument('--disable-dev-shm-usage') # chrome_options.add_argument('--disable-blink-features=AutomationControlled') # chrome_options.add_argument( # f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36') # chrome_options.add_argument('--incognito') window_width = random.randint(800, 1200) window_height = random.randint(600, 800) # chrome_options.add_argument('--proxy-server=http://t17772369458618:5zqcjkmy@q796.kdltps.com:15818/') service = Service(ChromeDriverManager().install()) driver = webdriver.Chrome(service=service, options=chrome_options) driver.set_window_size(window_width, window_height) driver.get(url) page_text = driver.page_source driver.quit() return page_text def tunnel_proxies(): # 隧道域名:端口号 tunnel = "q796.kdltps.com:15818" # 用户名密码方式 username = "t17772369458618" password = "5zqcjkmy" proxies = { "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}, "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel} } return proxies def extract(url): """ ttt :param url: :return: """ html_text = selenium_text(url) extractor = GeneralNewsExtractor() result = extractor.extract(html_text) print(result) def sogou_wechat(keyword): """ :param keyword: :return: """ url = "https://weixin.sogou.com/weixin?type=2&query={}".format(keyword) print(url) headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Cookie': '', 'Referer': 'https://weixin.sogou.com/weixin', 'Upgrade-Insecure-Requests': '1', 'User-Agent': FakeUserAgent().chrome } response = requests.request("GET", url, headers=headers, proxies=tunnel_proxies()) e_tree = etree.HTML(response.text) xpath = r'//ul[@class="news-list"]/li/div/a/@href' url_list = e_tree.xpath(xpath) url_list = ["https://weixin.sogou.com/" + i for i in url_list] for url in url_list: print(url) extract(url) sogou_wechat("人民日报")