| 
					
				 | 
			
			
				@@ -0,0 +1,96 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+@author: luojunhui 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+""" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import urllib.parse 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+import requests 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from fake_useragent import FakeUserAgent 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+from lxml import etree 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def tunnel_proxies(): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        快代理方法 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        :return: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    tunnel = "q796.kdltps.com:15818" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    username = "t17772369458618" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    password = "5zqcjkmy" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    proxies = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return proxies 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def search_article(title): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    通过标题搜索文章 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    :param title: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    :return: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    url = "https://so.toutiao.com/search" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    params = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "dvpf": "pc", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "source": "search_subtab_switch", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "keyword": title, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "page_num": 0, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "pd": "information", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "action_type": "search_subtab_switch", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "search_id": "", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "from": "news", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "cur_tab_title": "news" 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    headers = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'Accept-Language': 'en,zh;q=0.9,zh-CN;q=0.8', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'Cache-Control': 'max-age=0', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'Connection': 'keep-alive', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'Cookie': '__ac_nonce=06656db2c0044df592020; __ac_signature=_02B4Z6wo00f01f7SgiAAAIDA3giITC7VZDn-8oaAABnk69; msToken=2BrMcxLg3_PsS1iKcjuLpU1GS9iZsZ-51KSQTgUSRRLhGQqsQV3zKuJR49smQ7f8hQ8fahWtYCKC6TKJO3kR8022S-NsNfdHXu7X7mPM; ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1716968863%7C3459387c0d736a3410ba2c5dbdaeb61a7f85f161bbf6cb1a4f646c2a2edc9aa5;', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'Sec-Fetch-Dest': 'document', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'Sec-Fetch-Mode': 'navigate', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'Sec-Fetch-Site': 'none', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'Sec-Fetch-User': '?1', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'Upgrade-Insecure-Requests': '1', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'User-Agent': FakeUserAgent().random, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    response = requests.get(url=url, headers=headers, params=params) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    html = etree.HTML(response.text) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    xpath = '//div[@class="cs-view cs-view-block cs-card-content"]//a/@href' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    result_list = html.xpath(xpath) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    urls = [urllib.parse.unquote(i.replace("/search/jump?url=", "")) for i in result_list] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    article_urls = [url.replace("http://www.toutiao.com/a", "https://www.toutiao.com/article/") for url in urls] 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return article_urls 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+def parse_detail(url): 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    :param url: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    :return: 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    """ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    payload = {} 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    headers = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'accept-language': 'en', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'cookie': '__ac_signature=_02B4Z6wo00f01uL1Y5QAAIDDwi9p-RyULmbi1WcAAN8B5b; tt_webid=7371293454351697471; ttcid=a7499fc4f17243e1a6f1d47fc054799e16; _ga=GA1.1.1771235425.1716434457; s_v_web_id=verify_lwiorbpi_iEdehlbX_70wd_4ldg_Bctq_iI8TdJIzXVnI; csrftoken=ee756af695a449eeb73b5a3fc78978b2; _S_DPR=2.200000047683716; _S_IPAD=0; notRedShot=1; local_city_cache=%E5%8C%97%E4%BA%AC; gfkadpd=24,6457; _S_WIN_WH=1336_726; ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1717060308%7C4203558278ecaad696c5c5d3ff2bbab682c3ae0f89b876ab618b4646f04c81ef; tt_scid=GNVu.6vOxpT8JE14FSVyGcUl7IItTWcIP.qGQ.gRujo02xjV1BqV1tKznH7NlULMc251; msToken=dG4Y0x3XtlJGBx5WEqYilfjCRWJq629eYRg-hLWdfiTavnI5szx-h9KwUUAggGcG2i03AksGmCfQ034JCbnfWnTnhuguXU1yISZMr6YE; _ga_QEHZPBE5HH=GS1.1.1717057829.7.1.1717060394.0.0.0', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'priority': 'u=0, i', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'referer': url, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'upgrade-insecure-requests': '1', 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    response = requests.request("GET", url, headers=headers, data=payload, proxies=tunnel_proxies()) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    html = etree.HTML(response.text) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    text_xpath = '//div[@class="article-content"]//p/text()' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    img_xpath = '//div[@class="pgc-img"]/img/@src' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    title_xpath = '//div[@class="article-content"]/h1/text()' 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    result_list = html.xpath(text_xpath) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    img_list = html.xpath(img_xpath) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    title = html.xpath(title_xpath) 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    obj = { 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "article_url": url, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "text": "\n".join(result_list), 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "img_urls": img_list, 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "title": title 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    } 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+    return obj 
			 |