|
@@ -8,12 +8,15 @@ from typing import Dict
|
|
|
|
|
|
import requests
|
|
|
from tqdm import tqdm
|
|
|
+from pymysql.cursors import DictCursor
|
|
|
|
|
|
from applications import bot
|
|
|
from applications import log
|
|
|
+from applications import Functions
|
|
|
from applications.db import DatabaseConnector
|
|
|
from config import long_articles_config
|
|
|
|
|
|
+functions = Functions()
|
|
|
|
|
|
class ToutiaoRecommendCrawler(object):
|
|
|
"""
|
|
@@ -44,36 +47,36 @@ class ToutiaoRecommendCrawler(object):
|
|
|
获取历史推荐流文章
|
|
|
:return:
|
|
|
"""
|
|
|
- url = 'https://www.toutiao.com/api/pc/list/feed'
|
|
|
- params = {
|
|
|
- 'channel_id': '3189398965',
|
|
|
- 'min_behot_time': '0',
|
|
|
- 'offset': '0',
|
|
|
- 'refresh_count': '1',
|
|
|
- 'category': 'pc_profile_channel',
|
|
|
- 'client_extra_params': '{"short_video_item":"filter"}',
|
|
|
- 'aid': '24',
|
|
|
- 'app_name': 'toutiao_web',
|
|
|
- 'msToken': '_rhOZjdccInxERSE5rot9jsH_4FDZLRNYC9HVypTRtZ2IEb0wHQCtxjLEjXSoDM4oUNW2EbhPJqomKQvt8_Jg503jGiFSl2hmP3neRKfE9uBanlfhoD1yQ==',
|
|
|
- 'a_bogus': 'xyRh/mL6DkdNXfyI55QLfY3qV4P3YkLG0t9bMDhqTVfSty39HMPd9exEuvhvMy8jxs/gIegjy4hbY3/DrQAJMpyUHuXLUdQ2mymsKl5Q59gCs1feejuQnU4Nmkt-tec25JZ4EKi8o7/aSYuDl2Be-wnAP6ZCcHhMHjD8CpMpvn6lErm=',
|
|
|
- }
|
|
|
- headers = {
|
|
|
- 'Accept': 'application/json, text/plain, */*',
|
|
|
- 'Accept-Language': 'zh,zh-CN;q=0.9',
|
|
|
- 'Cookie': '__ac_signature=_02B4Z6wo00f01uL1Y5QAAIDDwi9p-RyULmbi1WcAAN8B5b; ttcid=a7499fc4f17243e1a6f1d47fc054799e16; _ga=GA1.1.1771235425.1716434457; csrftoken=ee756af695a449eeb73b5a3fc78978b2; _S_IPAD=0; notRedShot=1; tt_webid=7371293454351697471; s_v_web_id=verify_m559znfi_rrsyYOnm_GvTT_4r4T_8FNl_VV6uBCq85ctv; __feed_out_channel_key=history; _S_DPR=2.200000047683716; gfkadpd=24,6457; _S_WIN_WH=1554_860; ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1735808360%7Cc9062afa201717f3b20ed1ac1b940d22da7b5dd01e201c26a97350f99c2683ee; tt_scid=i1kVV8f6ncNAV18LRQKFsVz-XuURdfA2uP9P2oTf.IBYozNpDFa5qspcCeSQNAQ5df75; _ga_QEHZPBE5HH=GS1.1.1735807708.20.1.1735809077.0.0.0',
|
|
|
- 'Priority': 'u=1, i',
|
|
|
- 'Referer': 'https://www.toutiao.com/',
|
|
|
- 'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
|
|
|
- 'sec-ch-ua-mobile': '?0',
|
|
|
- 'sec-ch-ua-platform': '"macOS"',
|
|
|
- 'sec-fetch-dest': 'empty',
|
|
|
- 'sec-fetch-mode': 'cors',
|
|
|
- 'sec-fetch-site': 'same-origin',
|
|
|
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
|
|
|
- }
|
|
|
- response = requests.get(url, params=params, headers=headers)
|
|
|
+ select_sql = f"""
|
|
|
+ SELECT request_method, request_url, request_headers, post_data
|
|
|
+ FROM toutiao_request_params
|
|
|
+ WHERE category = 'history' and expire_flag = 0
|
|
|
+ ORDER BY id
|
|
|
+ LIMIT 1;
|
|
|
+ """
|
|
|
+ result = self.db_client.fetch(
|
|
|
+ query=select_sql,
|
|
|
+ cursor_type=DictCursor
|
|
|
+ )
|
|
|
+ if not result:
|
|
|
+ print("cookie没了报警")
|
|
|
+ return {}
|
|
|
+ cookie_obj = result[0]
|
|
|
+ response = requests.request(
|
|
|
+ method=cookie_obj['request_method'],
|
|
|
+ url=cookie_obj['request_url'],
|
|
|
+ headers=json.loads(cookie_obj['request_headers']),
|
|
|
+ proxies=functions.proxy()
|
|
|
+ )
|
|
|
return response.json()
|
|
|
|
|
|
def get_tech_recommendation(self) -> Dict:
    """
    Fetch articles from the tech recommendation feed.

    This is a placeholder: no request is issued yet. It returns an empty
    dict rather than ``None`` so that the declared ``Dict`` return type
    holds and callers can treat "no data" the same way as the empty-dict
    result returned by the history feed when no request parameters exist.
    :return: an empty dict until the tech feed is implemented
    """
    return {}
|
|
|
+
|
|
|
def insert_each_article(self, item: Dict) -> Dict:
|
|
|
"""
|
|
|
提取文章信息
|
|
@@ -86,7 +89,6 @@ class ToutiaoRecommendCrawler(object):
|
|
|
read_count = item['read_count']
|
|
|
title = item['title']
|
|
|
user_info = item['user_info']
|
|
|
- user_name = user_info.get('name')
|
|
|
user_id = user_info.get('user_id')
|
|
|
abstract = item['Abstract']
|
|
|
publish_time = item['publish_time']
|
|
@@ -138,26 +140,24 @@ class ToutiaoRecommendCrawler(object):
|
|
|
data=error_data,
|
|
|
status='fail'
|
|
|
)
|
|
|
-
|
|
|
-
|
|
|
-def main():
|
|
|
- """
|
|
|
- 主函数
|
|
|
- :return:
|
|
|
- """
|
|
|
- toutiao_recommend_crawler = ToutiaoRecommendCrawler()
|
|
|
- toutiao_recommend_crawler.init_database()
|
|
|
- for i in range(10):
|
|
|
- try:
|
|
|
- article_list = toutiao_recommend_crawler.get_history_recommendation()
|
|
|
- except Exception as e:
|
|
|
- error_data = {
|
|
|
- "error": str(e),
|
|
|
- "error_stack": traceback.format_exc()
|
|
|
- }
|
|
|
- print(error_data)
|
|
|
- continue
|
|
|
- toutiao_recommend_crawler.process_recommendation(article_list)
|
|
|
-
|
|
|
-
|
|
|
-main()
|
|
|
+ else:
|
|
|
+ print("视频文章跳过")
|
|
|
+ else:
|
|
|
+ print("无链接文章跳过")
|
|
|
+
|
|
|
def run(self) -> None:
    """
    Main entry point: crawl the history recommendation feed in ten rounds.

    Each round fetches one batch via ``get_history_recommendation``, hands
    it to ``process_recommendation`` and then pauses for three seconds.
    Any exception raised during a round is printed together with its
    traceback, and the loop continues with the next round without pausing.
    :return: None
    """
    total_rounds = 10
    pause_seconds = 3
    for _ in range(total_rounds):
        try:
            batch = self.get_history_recommendation()
            self.process_recommendation(batch)
            # Only reached when the round succeeded.
            time.sleep(pause_seconds)
        except Exception as err:
            # Best-effort crawl: report the failure and keep going.
            failure = dict(error=str(err), error_stack=traceback.format_exc())
            print(failure)
|