|
@@ -0,0 +1,96 @@
|
|
|
+"""
|
|
|
+@author: luojunhui
|
|
|
+"""
|
|
|
+import time
|
|
|
+import random
|
|
|
+import requests
|
|
|
+from lxml import etree
|
|
|
+from fake_useragent import FakeUserAgent
|
|
|
+from gne import GeneralNewsExtractor
|
|
|
+from selenium import webdriver
|
|
|
+from selenium.webdriver.chrome.service import Service
|
|
|
+from selenium.webdriver.chrome.options import Options
|
|
|
+from webdriver_manager.chrome import ChromeDriverManager
|
|
|
+
|
|
|
+
|
|
|
def selenium_text(url):
    """
    Load a page in headless Chrome and return its HTML source.

    A new Chrome session is started on every call, using the driver binary
    resolved by ``ChromeDriverManager().install()``. The session is always
    shut down before this function exits, including when sizing the window
    or navigating to ``url`` raises.

    :param url: address of the page to open.
    :return: ``driver.page_source`` as read right after ``driver.get(url)``.
    """
    # Configure Chrome options.
    chrome_options = Options()
    chrome_options.add_argument('--headless')  # headless mode

    # Each session gets a random window size in 800-1200 x 600-800.
    # NOTE(review): presumably to vary the browser fingerprint - confirm.
    window_width = random.randint(800, 1200)
    window_height = random.randint(600, 800)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    try:
        driver.set_window_size(window_width, window_height)
        driver.get(url)
        return driver.page_source
    finally:
        # Runs on both success and failure so no Chrome/chromedriver
        # process is left behind when navigation raises.
        driver.quit()
|
|
|
+
|
|
|
+
|
|
|
def tunnel_proxies(tunnel="q796.kdltps.com:15818",
                   username="t17772369458618",
                   password="5zqcjkmy"):
    """
    Build the ``proxies`` mapping passed to ``requests`` for a tunnel proxy.

    Called with no arguments it returns exactly the mapping this function
    has always returned; the parameters only allow a different tunnel or
    account to be supplied without editing the function.

    NOTE(review): the default credentials are still embedded in the source;
    consider loading them from configuration instead.

    :param tunnel: proxy endpoint as ``host:port``.
    :param username: proxy account name (username/password authentication).
    :param password: proxy account password.
    :return: dict with ``"http"`` and ``"https"`` keys, both pointing at the
        same ``http://user:password@host:port/`` proxy URL.
    """
    # Local import keeps the block self-contained.
    from urllib.parse import quote

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot be mistaken for URL delimiters. The defaults are purely
    # alphanumeric, so their encoded form is unchanged.
    proxy_url = "http://{user}:{pwd}@{proxy}/".format(
        user=quote(username, safe=""),
        pwd=quote(password, safe=""),
        proxy=tunnel,
    )
    # Both schemes are routed through the same HTTP proxy endpoint.
    return {"http": proxy_url, "https": proxy_url}
|
|
|
+
|
|
|
+
|
|
|
def extract(url):
    """
    Fetch a page through the browser and print what GNE extracts from it.

    :param url: address of the page to process.
    :return: None; the extraction result is only printed.
    """
    # Fetch the HTML via ``selenium_text`` before building the extractor,
    # then pass it to ``GeneralNewsExtractor``.
    page_html = selenium_text(url)
    article = GeneralNewsExtractor().extract(page_html)
    print(article)
|
|
|
+
|
|
|
+
|
|
|
def sogou_wechat(keyword):
    """
    Search Sogou's WeChat article index and run ``extract`` on every result.

    The search page is requested through the proxy returned by
    ``tunnel_proxies()``. Every link found under the ``news-list`` element
    is printed and then passed to ``extract``.

    :param keyword: search phrase; it is URL-encoded before being sent.
    :return: None; the search URL and each result URL are printed.
    :raises requests.exceptions.RequestException: if the search request
        fails or exceeds the timeout.
    """
    # Local import keeps the block self-contained.
    from urllib.parse import urlencode, urljoin

    base_url = "https://weixin.sogou.com/"
    # urlencode escapes '&', '#', spaces and non-ASCII text, so a keyword
    # can no longer truncate or inject into the query string.
    search_url = base_url + "weixin?" + urlencode({"type": 2, "query": keyword})
    print(search_url)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': '',
        'Referer': 'https://weixin.sogou.com/weixin',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': FakeUserAgent().chrome
    }

    # requests waits forever by default; the timeout stops a stalled proxy
    # or server from hanging the whole run.
    response = requests.request(
        "GET",
        search_url,
        headers=headers,
        proxies=tunnel_proxies(),
        timeout=30,
    )
    e_tree = etree.HTML(response.text)

    hrefs = e_tree.xpath(r'//ul[@class="news-list"]/li/div/a/@href')
    for href in hrefs:
        # urljoin resolves the href against the site root whether or not it
        # starts with '/', avoiding the '//' that plain concatenation yields.
        # NOTE(review): assumes the hrefs are site-relative - confirm.
        article_url = urljoin(base_url, href)
        print(article_url)
        extract(article_url)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Only start the scrape when the file is run as a script, so importing
    # the module does not trigger network requests and browser launches.
    sogou_wechat("人民日报")
|