wechatSogou.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. """
  2. @author: luojunhui
  3. """
  4. import time
  5. import random
  6. import requests
  7. from lxml import etree
  8. from fake_useragent import FakeUserAgent
  9. from gne import GeneralNewsExtractor
  10. from selenium import webdriver
  11. from selenium.webdriver.chrome.service import Service
  12. from selenium.webdriver.chrome.options import Options
  13. from webdriver_manager.chrome import ChromeDriverManager
  14. def selenium_text(url):
  15. # 配置 Chrome 选项
  16. chrome_options = Options()
  17. chrome_options.add_argument('--headless') # 无头模式
  18. # chrome_options.add_argument('--disable-gpu')
  19. # chrome_options.add_argument('--no-sandbox')
  20. # chrome_options.add_argument('--disable-dev-shm-usage')
  21. # chrome_options.add_argument('--disable-blink-features=AutomationControlled')
  22. # chrome_options.add_argument(
  23. # f'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
  24. # chrome_options.add_argument('--incognito')
  25. window_width = random.randint(800, 1200)
  26. window_height = random.randint(600, 800)
  27. # chrome_options.add_argument('--proxy-server=http://t17772369458618:5zqcjkmy@q796.kdltps.com:15818/')
  28. service = Service(ChromeDriverManager().install())
  29. driver = webdriver.Chrome(service=service, options=chrome_options)
  30. driver.set_window_size(window_width, window_height)
  31. driver.get(url)
  32. page_text = driver.page_source
  33. driver.quit()
  34. return page_text
  35. def tunnel_proxies():
  36. # 隧道域名:端口号
  37. tunnel = "q796.kdltps.com:15818"
  38. # 用户名密码方式
  39. username = "t17772369458618"
  40. password = "5zqcjkmy"
  41. proxies = {
  42. "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
  43. "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
  44. }
  45. return proxies
  46. def extract(url):
  47. """
  48. ttt
  49. :param url:
  50. :return:
  51. """
  52. html_text = selenium_text(url)
  53. extractor = GeneralNewsExtractor()
  54. result = extractor.extract(html_text)
  55. print(result)
  56. def sogou_wechat(keyword):
  57. """
  58. :param keyword:
  59. :return:
  60. """
  61. url = "https://weixin.sogou.com/weixin?type=2&query={}".format(keyword)
  62. print(url)
  63. headers = {
  64. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
  65. 'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8',
  66. 'Cache-Control': 'max-age=0',
  67. 'Connection': 'keep-alive',
  68. 'Cookie': '',
  69. 'Referer': 'https://weixin.sogou.com/weixin',
  70. 'Upgrade-Insecure-Requests': '1',
  71. 'User-Agent': FakeUserAgent().chrome
  72. }
  73. response = requests.request("GET", url, headers=headers, proxies=tunnel_proxies())
  74. e_tree = etree.HTML(response.text)
  75. xpath = r'//ul[@class="news-list"]/li/div/a/@href'
  76. url_list = e_tree.xpath(xpath)
  77. url_list = ["https://weixin.sogou.com/" + i for i in url_list]
  78. for url in url_list:
  79. print(url)
  80. extract(url)
# Module-level call: runs the search for the keyword "人民日报" (People's
# Daily) whenever this file is executed or imported.
sogou_wechat("人民日报")