123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162 |
- """
- @author: luojunhui
- """
- import json
- import time
- import traceback
- from typing import Dict
- import requests
- from tqdm import tqdm
- from applications import bot
- from applications import log
- from applications.db import DatabaseConnector
- from config import long_articles_config
class ToutiaoRecommendCrawler(object):
    """
    Crawler for the Toutiao (今日头条) recommendation feed.

    Fetches the PC feed endpoint and stores every non-video article it
    returns in the ``crawler_meta_article`` table.
    """

    def __init__(self) -> None:
        # Set by init_database(); stays None until a connection succeeds.
        self.db_client = None

    def init_database(self) -> None:
        """
        Create the database connector and open the connection.

        A failure is reported through ``bot`` instead of being raised, and
        ``self.db_client`` is left as ``None`` so callers can detect it.

        :return: None
        """
        try:
            self.db_client = DatabaseConnector(db_config=long_articles_config)
            self.db_client.connect()
        except Exception as e:
            # Do not keep a connector whose connect() failed.
            self.db_client = None
            bot(
                title="今日头条推荐流文章抓取任务数据库连接失败",
                detail={
                    "error": str(e),
                    "error_stack": traceback.format_exc()
                }
            )

    def get_history_recommendation(self, timeout: float = 10) -> Dict:
        """
        Fetch one page of the recommendation feed.

        :param timeout: seconds to wait for the server before giving up;
            without it a stalled connection would block the job forever.
        :return: the decoded JSON body of the response.
        :raises requests.RequestException: on network errors, timeouts or a
            non-success HTTP status.
        :raises ValueError: if the response body is not valid JSON.
        """
        url = 'https://www.toutiao.com/api/pc/list/feed'
        # NOTE(review): msToken, a_bogus and the Cookie header below are
        # hard-coded values, presumably captured from a browser session.
        # Confirm how long they stay valid and consider moving them to
        # configuration.
        params = {
            'channel_id': '3189398965',
            'min_behot_time': '0',
            'offset': '0',
            'refresh_count': '1',
            'category': 'pc_profile_channel',
            'client_extra_params': '{"short_video_item":"filter"}',
            'aid': '24',
            'app_name': 'toutiao_web',
            'msToken': '_rhOZjdccInxERSE5rot9jsH_4FDZLRNYC9HVypTRtZ2IEb0wHQCtxjLEjXSoDM4oUNW2EbhPJqomKQvt8_Jg503jGiFSl2hmP3neRKfE9uBanlfhoD1yQ==',
            'a_bogus': 'xyRh/mL6DkdNXfyI55QLfY3qV4P3YkLG0t9bMDhqTVfSty39HMPd9exEuvhvMy8jxs/gIegjy4hbY3/DrQAJMpyUHuXLUdQ2mymsKl5Q59gCs1feejuQnU4Nmkt-tec25JZ4EKi8o7/aSYuDl2Be-wnAP6ZCcHhMHjD8CpMpvn6lErm=',
        }
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh,zh-CN;q=0.9',
            'Cookie': '__ac_signature=_02B4Z6wo00f01uL1Y5QAAIDDwi9p-RyULmbi1WcAAN8B5b; ttcid=a7499fc4f17243e1a6f1d47fc054799e16; _ga=GA1.1.1771235425.1716434457; csrftoken=ee756af695a449eeb73b5a3fc78978b2; _S_IPAD=0; notRedShot=1; tt_webid=7371293454351697471; s_v_web_id=verify_m559znfi_rrsyYOnm_GvTT_4r4T_8FNl_VV6uBCq85ctv; __feed_out_channel_key=history; _S_DPR=2.200000047683716; gfkadpd=24,6457; _S_WIN_WH=1554_860; ttwid=1%7C9b5sTIuwZxZKt0wFsvE-2t5OoFxH_Q5VIpVNWEREbAo%7C1735808360%7Cc9062afa201717f3b20ed1ac1b940d22da7b5dd01e201c26a97350f99c2683ee; tt_scid=i1kVV8f6ncNAV18LRQKFsVz-XuURdfA2uP9P2oTf.IBYozNpDFa5qspcCeSQNAQ5df75; _ga_QEHZPBE5HH=GS1.1.1735807708.20.1.1735809077.0.0.0',
            'Priority': 'u=1, i',
            'Referer': 'https://www.toutiao.com/',
            'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
        }
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
        # Surface HTTP errors explicitly instead of trying to parse an error page.
        response.raise_for_status()
        return response.json()

    def insert_each_article(self, item: Dict) -> None:
        """
        Store one feed item in ``crawler_meta_article``.

        :param item: one entry of the feed's ``data`` list. ``article_url``,
            ``like_count``, ``read_count``, ``title``, ``user_info``,
            ``Abstract`` and ``publish_time`` are required; ``item_id`` is
            optional and is stored as the row's ``unique_index``.
        :return: None
        :raises KeyError: if a required key is missing from ``item``.
        """
        item_id = item.get('item_id')
        article_url = item['article_url']
        like_count = item['like_count']
        read_count = item['read_count']
        title = item['title']
        user_id = item['user_info'].get('user_id')
        abstract = item['Abstract']
        publish_time = item['publish_time']
        # Values are bound through placeholders, never interpolated into the SQL.
        insert_sql = """
            INSERT INTO crawler_meta_article
            (platform, mode, category, out_account_id, title, link, read_cnt, like_cnt, description, publish_time, crawler_time, unique_index)
            VALUES
            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        self.db_client.save(
            query=insert_sql,
            params=(
                "toutiao",
                "recommend",
                "history",
                user_id,
                title,
                article_url,
                read_count,
                like_count,
                abstract,
                publish_time,
                int(time.time()),  # crawler_time: current Unix timestamp
                item_id
            )
        )

    def process_recommendation(self, recommendation) -> None:
        """
        Store every non-video article of a feed response.

        Items without an ``article_url`` and items flagged ``has_video`` are
        skipped. A failure on one item is logged and does not stop the rest.

        :param recommendation: decoded feed response; its ``data`` entry is
            the list of items. A response without usable ``data`` is ignored.
        :return: None
        """
        articles = recommendation.get('data') if isinstance(recommendation, dict) else None
        if not articles:
            # Missing, null or empty payload: nothing to store.
            return
        for item in tqdm(articles):
            if not item.get('article_url') or item.get('has_video'):
                continue
            try:
                self.insert_each_article(item)
            except Exception as e:
                error_data = {
                    "error": str(e),
                    "error_stack": traceback.format_exc()
                }
                log(
                    task='toutiao_recommend',
                    message='头条推荐流文章插入失败',
                    data=error_data,
                    status='fail'
                )
def main():
    """
    Entry point: connect to the database, then fetch and store the
    recommendation feed ten times.

    :return: None
    """
    toutiao_recommend_crawler = ToutiaoRecommendCrawler()
    toutiao_recommend_crawler.init_database()
    if toutiao_recommend_crawler.db_client is None:
        # init_database() has already reported the failure; with no client
        # every insert would fail, so there is no point in fetching.
        return
    for _ in range(10):
        # Fetching and processing share one guard so that a single bad
        # response cannot abort the remaining rounds.
        try:
            article_list = toutiao_recommend_crawler.get_history_recommendation()
            toutiao_recommend_crawler.process_recommendation(article_list)
        except Exception as e:
            error_data = {
                "error": str(e),
                "error_stack": traceback.format_exc()
            }
            log(
                task='toutiao_recommend',
                message='头条推荐流文章抓取失败',
                data=error_data,
                status='fail'
            )


# Runs the crawl whenever this module is executed or imported. The call is
# left unguarded so existing ways of invoking the job keep working.
main()
|