from __future__ import annotations

import asyncio
import json
from typing import Dict, List, Tuple

from tqdm.asyncio import tqdm

from applications.crawler.tophub import get_hot_point_content


class CrawlerHotPointConst:
    """Constants shared by the hot-point crawler classes."""

    # Exclusive upper bound of the page range walked by ``crawl_hot_titles``.
    max_page_index = 40

    # Presumably the values stored in the ``useful`` column of
    # ``hot_point_titles`` (see ``update_useful_status``) -- TODO confirm.
    init_status = 0
    processing_status = 1
    useful_status = 2
    unuseful_status = 3
    failed_status = 99

    # ``expired_status`` is written to the ``status`` column by
    # ``set_as_expired``; ``not_expired_status`` is presumably its
    # counterpart -- TODO confirm.
    not_expired_status = 1
    expired_status = 2


class CrawlerHotPointMapper(CrawlerHotPointConst):
    """Database access helpers for the ``hot_point_titles`` table.

    Every method delegates to ``self.pool.async_save`` and returns whatever
    that call returns.
    """

    def __init__(self, pool, log_client, trace_id):
        """Store the collaborators used by the mapper.

        Args:
            pool: Object exposing an awaitable ``async_save`` method.
            log_client: Logging client; stored but not used in this module.
            trace_id: Trace identifier; stored but not used in this module.
        """
        self.pool = pool
        self.log_client = log_client
        self.trace_id = trace_id

    async def save_articles(self, articles: List[Tuple]):
        """Insert ``(title, platform, link)`` rows in one batch.

        Args:
            articles: Rows to insert, each a ``(title, platform, link)`` tuple.

        Returns:
            The result of ``self.pool.async_save``.
        """
        query = """
            INSERT INTO hot_point_titles
                (title, platform, link)
            VALUES
                (%s, %s, %s);
        """
        return await self.pool.async_save(query=query, params=articles, batch=True)

    async def update_useful_status(
        self, article_id: int, origin_status: int, new_status: int
    ):
        """Move an article's ``useful`` value from one status to another.

        The ``WHERE`` clause also matches on ``origin_status``, so the row is
        only changed when it still holds the expected current value.

        Args:
            article_id: Primary key of the row to update.
            origin_status: Value ``useful`` must currently hold.
            new_status: Value to write into ``useful``.

        Returns:
            The result of ``self.pool.async_save``.
        """
        query = """
            UPDATE hot_point_titles
            SET useful = %s
            WHERE id = %s AND useful = %s;
        """
        return await self.pool.async_save(
            query=query, params=(new_status, article_id, origin_status)
        )

    async def set_as_expired(self, article_id: int):
        """Mark an article as expired by writing ``expired_status`` to ``status``.

        Args:
            article_id: Primary key of the row to update.

        Returns:
            The result of ``self.pool.async_save``.
        """
        query = """
            UPDATE hot_point_titles
            SET status = %s
            WHERE id = %s;
        """
        return await self.pool.async_save(
            query=query, params=(self.expired_status, article_id)
        )


class CrawlerHotPointTask(CrawlerHotPointMapper):
    """Crawl hot-point titles page by page and persist them."""

    def __init__(self, pool, log_client, trace_id):
        """Forward all arguments to ``CrawlerHotPointMapper.__init__``."""
        super().__init__(pool, log_client, trace_id)

    @staticmethod
    def process_raw_data(response_data):
        """Flatten a crawler response into ``(title, platform, link)`` tuples.

        Args:
            response_data: Iterable of mappings, each with a ``"source"`` key
                and a ``"rankList"`` iterable of mappings that carry
                ``"title"`` and ``"link"``. ``None`` or an empty iterable is
                treated as "no data".

        Returns:
            A list of ``(title, platform, link)`` tuples in input order.

        Raises:
            KeyError: If an item or ranked article lacks an expected key.
        """
        # A missing or empty payload simply means there is nothing to store.
        if not response_data:
            return []

        articles = []
        for item in response_data:
            platform = item["source"]
            articles.extend(
                (article["title"], platform, article["link"])
                for article in item["rankList"]
            )
        return articles

    async def crawl_hot_titles(self):
        """Fetch each page of hot titles and save the extracted rows.

        A failure on one page is printed and the loop moves on to the next
        page, so a single bad page does not abort the whole crawl.
        """
        # NOTE(review): ``range`` excludes its upper bound, so pages
        # 1 .. max_page_index - 1 are fetched -- confirm this is intended.
        for page in tqdm(range(1, self.max_page_index)):
            try:
                raw_data = await get_hot_point_content(page_index=page)
                articles = self.process_raw_data(raw_data)
                # Skip the database round trip when the page yielded no rows.
                if articles:
                    await self.save_articles(articles)
            except Exception as e:
                # Best-effort per page: report the error and keep crawling.
                print(f"crawl_hot_titles error: {e}")

    async def classify_articles_by_llm(self):
        """Classify hot topics with an LLM by whether they suit elderly readers' interests.

        Not implemented yet.
        """
        pass