123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596 |
- """
- @author: luojunhui
- """
- import time
- import random
- import requests
- from lxml import etree
- from fake_useragent import FakeUserAgent
- from gne import GeneralNewsExtractor
- from selenium import webdriver
- from selenium.webdriver.chrome.service import Service
- from selenium.webdriver.chrome.options import Options
- from webdriver_manager.chrome import ChromeDriverManager
def selenium_text(url):
    """Load *url* in headless Chrome and return the page source.

    A new Chrome driver is started on every call, resized to a random
    window between 800x600 and 1200x800, and shut down before returning.

    :param url: address of the page to load.
    :return: ``driver.page_source`` as read right after ``driver.get(url)``
        returns.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # headless mode

    # Randomise the window size on each call.
    window_width = random.randint(800, 1200)
    window_height = random.randint(600, 800)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        driver.set_window_size(window_width, window_height)
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the browser down, even when loading the page raises,
        # so a failed fetch does not leave a Chrome process behind.
        driver.quit()
def tunnel_proxies():
    """Build the proxy mapping for the tunnel proxy.

    :return: dict with ``"http"`` and ``"https"`` keys, both mapped to the
        same ``http://user:password@host:port/`` proxy URL.
    """
    # NOTE(review): the credentials below are hard-coded in source —
    # consider loading them from configuration instead.
    host_and_port = "q796.kdltps.com:15818"  # tunnel domain:port
    user = "t17772369458618"  # username/password authentication
    secret = "5zqcjkmy"

    # Both schemes go through the same HTTP tunnel, so build the URL once.
    proxy_url = "http://{user}:{pwd}@{proxy}/".format(
        user=user, pwd=secret, proxy=host_and_port
    )
    return {scheme: proxy_url for scheme in ("http", "https")}
def extract(url):
    """Fetch *url* with ``selenium_text`` and run news extraction on the HTML.

    :param url: address of the page to fetch and extract.
    :return: whatever ``GeneralNewsExtractor().extract`` returns for the
        page's HTML; the same value is also printed.
    """
    html_text = selenium_text(url)
    result = GeneralNewsExtractor().extract(html_text)
    print(result)
    # Return the result as well as printing it so callers can use it;
    # existing callers that ignore the return value are unaffected.
    return result
def sogou_wechat(keyword, timeout=30):
    """Search weixin.sogou.com for *keyword* and run ``extract`` on each hit.

    The search page is requested through the proxies from
    ``tunnel_proxies``; every link found in the ``news-list`` result list is
    printed and then passed to ``extract``.

    :param keyword: search term; it is URL-encoded before being sent.
    :param timeout: seconds ``requests`` waits on the search request before
        raising a timeout error (previously the request could block forever).
    :return: None
    """
    from urllib.parse import urlencode, urljoin

    base_url = "https://weixin.sogou.com/"
    # Encode the query rather than pasting the keyword in verbatim, so
    # characters such as '&', '#' or spaces cannot corrupt the query string.
    url = base_url + "weixin?" + urlencode({"type": 2, "query": keyword})
    print(url)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': '',
        'Referer': 'https://weixin.sogou.com/weixin',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': FakeUserAgent().chrome,
    }
    response = requests.get(
        url, headers=headers, proxies=tunnel_proxies(), timeout=timeout
    )
    e_tree = etree.HTML(response.text)
    if e_tree is None:
        # No parse tree was produced, so there are no result links to follow.
        return
    hrefs = e_tree.xpath(r'//ul[@class="news-list"]/li/div/a/@href')
    for href in hrefs:
        # urljoin resolves root-relative hrefs without producing a double
        # slash and leaves already-absolute hrefs intact.
        article_url = urljoin(base_url, href)
        print(article_url)
        extract(article_url)
if __name__ == "__main__":
    # Run the demo search only when this file is executed as a script, so
    # importing the module does not start a network scrape.
    sogou_wechat("人民日报")
|