|
@@ -0,0 +1,162 @@
|
|
|
+"""
|
|
|
+@author: luojunhui
|
|
|
+"""
|
|
|
+import json
|
|
|
+import time
|
|
|
+import traceback
|
|
|
+from typing import Dict
|
|
|
+
|
|
|
+import requests
|
|
|
+from tqdm import tqdm
|
|
|
+
|
|
|
+from applications import bot
|
|
|
+from applications import log
|
|
|
+from applications.db import DatabaseConnector
|
|
|
+from config import long_articles_config
|
|
|
+
|
|
|
+
|
|
|
class ToutiaoRecommendCrawler(object):
    """
    Crawler for the Toutiao recommendation feed.

    Fetches one page of the PC feed API and stores every non-video article
    of that page in the ``crawler_meta_article`` table via ``self.db_client``.
    """

    # Seconds to wait for the feed API. Without a timeout ``requests.get``
    # may block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self) -> None:
        # Assigned by init_database(); stays None until a connection succeeds.
        self.db_client = None

    def init_database(self) -> None:
        """
        Create the database client and open its connection.

        On failure a notification is sent through ``bot`` and
        ``self.db_client`` is left as ``None`` so that a half-initialised
        client is never used later.

        :return: None
        """
        try:
            db_client = DatabaseConnector(db_config=long_articles_config)
            db_client.connect()
        except Exception as e:
            # Reset first so the state is consistent even if the notification fails.
            self.db_client = None
            bot(
                title="今日头条推荐流文章抓取任务数据库连接失败",
                detail={
                    "error": str(e),
                    "error_stack": traceback.format_exc()
                }
            )
        else:
            self.db_client = db_client

    def get_history_recommendation(self) -> Dict:
        """
        Request one page of the recommendation feed.

        :return: the decoded JSON body of the feed response
        :raises requests.RequestException: on network failure, timeout or a
            non-2xx HTTP status
        :raises ValueError: when the response body is not valid JSON
        """
        url = 'https://www.toutiao.com/api/pc/list/feed'
        # NOTE(review): msToken, a_bogus and the Cookie header below are
        # hard-coded session values; presumably they expire and should come
        # from configuration rather than source control -- confirm with owner.
        params = {
            'channel_id': '3189398965',
            'min_behot_time': '0',
            'offset': '0',
            'refresh_count': '1',
            'category': 'pc_profile_channel',
            'client_extra_params': '{"short_video_item":"filter"}',
            'aid': '24',
            'app_name': 'toutiao_web',
            'msToken': '_rhOZjdccInxERSE5rot9jsH_4FDZLRNYC9HVypTRtZ2IEb0wHQCtxjLEjXSoDM4oUNW2EbhPJqomKQvt8_Jg503jGiFSl2hmP3neRKfE9uBanlfhoD1yQ==',
            'a_bogus': 'xyRh/mL6DkdNXfyI55QLfY3qV4P3YkLG0t9bMDhqTVfSty39HMPd9exEuvhvMy8jxs/gIegjy4hbY3/DrQAJMpyUHuXLUdQ2mymsKl5Q59gCs1feejuQnU4Nmkt-tec25JZ4EKi8o7/aSYuDl2Be-wnAP6ZCcHhMHjD8CpMpvn6lErm=',
        }
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'Cookie': '__ac_signature=_02B4Z6wo00f01uL1Y5QAAIDDwi9p-RyULmbi1WcAAN8B5b; ttcid=a7499fc4f17243e1a6f1d47fc054799e16; _ga=GA1.1.1771235425.1716434457; csrftoken=ee756af695a449eeb73b5a3fc78978b2; _S_IPAD=0; notRedShot=1; tt_webid=7371293454351697471; s_v_web_id=verify_m559znfi_rrsyYOnm_GvTT_4r4T_8FNl_VV6uBCq85ctv; __feed_out_channel_key=history; _S_DPR=2.200000047683716; gfkadpd=24,6457; _S_WIN_WH=1554_860; ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1735808360%7Cc9062afa201717f3b20ed1ac1b940d22da7b5dd01e201c26a97350f99c2683ee; tt_scid=i1kVV8f6ncNAV18LRQKFsVz-XuURdfA2uP9P2oTf.IBYozNpDFa5qspcCeSQNAQ5df75; _ga_QEHZPBE5HH=GS1.1.1735807708.20.1.1735809077.0.0.0',
            'Priority': 'u=1, i',
            'Referer': 'https://www.toutiao.com/',
            'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        }
        response = requests.get(
            url,
            params=params,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT
        )
        # Surface HTTP errors here instead of handing an error body to the caller.
        response.raise_for_status()
        return response.json()

    def insert_each_article(self, item: Dict) -> None:
        """
        Insert one feed item into ``crawler_meta_article``.

        :param item: one entry of the feed's ``data`` list. The keys
            ``article_url``, ``like_count``, ``read_count``, ``title``,
            ``user_info``, ``Abstract`` and ``publish_time`` are required;
            ``item_id`` and ``user_info['user_id']`` are passed as ``None``
            when absent.
        :return: None
        :raises KeyError: when a required key is missing from ``item``
        :raises RuntimeError: when no database client is available
        """
        if self.db_client is None:
            raise RuntimeError(
                "database client is not initialized; call init_database() first"
            )

        item_id = item.get('item_id')
        article_url = item['article_url']
        like_count = item['like_count']
        read_count = item['read_count']
        title = item['title']
        user_id = item['user_info'].get('user_id')
        abstract = item['Abstract']
        publish_time = item['publish_time']

        # Values are bound through driver placeholders, never interpolated.
        insert_sql = """
            INSERT INTO crawler_meta_article
            (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, unique_index)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        self.db_client.save(
            query=insert_sql,
            params=(
                "toutiao",
                "recommend",
                "history",
                user_id,
                title,
                article_url,
                read_count,
                like_count,
                abstract,
                publish_time,
                int(time.time()),  # crawler_time: seconds since the epoch
                item_id
            )
        )

    def process_recommendation(self, recommendation) -> None:
        """
        Store every non-video article contained in a feed response.

        Items without an ``article_url`` or with a truthy ``has_video`` flag
        are skipped. A failure while inserting one item is logged and does
        not stop the remaining items.

        :param recommendation: decoded feed response; its ``data`` entry is
            expected to be a list of item dicts
        :return: None
        """
        data = recommendation.get('data') if isinstance(recommendation, dict) else None
        if not isinstance(data, list):
            # An error payload without a usable ``data`` list used to raise
            # KeyError here and abort the caller; report it and move on.
            log(
                task='toutiao_recommend',
                message='头条推荐流返回数据异常',
                data={"response": str(recommendation)[:500]},
                status='fail'
            )
            return
        if not data:
            return

        for item in tqdm(data):
            if not item.get('article_url') or item.get('has_video'):
                continue
            try:
                self.insert_each_article(item)
            except Exception as e:
                error_data = {
                    "error": str(e),
                    "error_stack": traceback.format_exc()
                }
                log(
                    task='toutiao_recommend',
                    message='头条推荐流文章插入失败',
                    data=error_data,
                    status='fail'
                )
|
|
|
+
|
|
|
+
|
|
|
def main(fetch_rounds: int = 10) -> None:
    """
    Run the Toutiao recommendation crawl.

    Connects to the database, then repeatedly fetches the feed and stores
    its articles. A failure in one round is logged and the next round still
    runs.

    :param fetch_rounds: number of feed requests to issue (default 10)
    :return: None
    """
    toutiao_recommend_crawler = ToutiaoRecommendCrawler()
    toutiao_recommend_crawler.init_database()
    for _ in range(fetch_rounds):
        try:
            article_list = toutiao_recommend_crawler.get_history_recommendation()
            toutiao_recommend_crawler.process_recommendation(article_list)
        except Exception as e:
            # Report through the same channel the crawler uses for insert
            # failures instead of printing, and keep going with the next round.
            log(
                task='toutiao_recommend',
                message='头条推荐流文章抓取失败',
                data={
                    "error": str(e),
                    "error_stack": traceback.format_exc()
                },
                status='fail'
            )
|
|
|
+
|
|
|
+
|
|
|
# Run the crawl only when executed as a script, so that importing this
# module (for example to reuse ToutiaoRecommendCrawler) does not trigger
# network requests and database writes.
if __name__ == "__main__":
    main()
|