```python
from __future__ import annotations

from typing import List, Tuple

from tqdm.asyncio import tqdm

from applications.crawler.tophub import get_hot_point_content


class CrawlerHotPointConst:
    # upper bound (exclusive) for page iteration against the tophub source
    max_page_index = 40

    # classification states stored in the `useful` column
    init_status = 0
    processing_status = 1
    useful_status = 2
    unuseful_status = 3
    failed_status = 99

    # expiration states stored in the `status` column
    not_expired_status = 1
    expired_status = 2


class CrawlerHotPointMapper(CrawlerHotPointConst):
    def __init__(self, pool, log_client, trace_id):
        self.pool = pool
        self.log_client = log_client
        self.trace_id = trace_id

    async def save_articles(self, articles: List[Tuple]):
        """Insert titles and links."""
        query = """
            INSERT INTO hot_point_titles
                (title, platform, link)
            VALUES (%s, %s, %s);
        """
        return await self.pool.async_save(query=query, params=articles, batch=True)

    async def update_useful_status(
        self, article_id: int, origin_status: int, new_status: int
    ):
        """Update an article's `useful` status, guarded by its current value."""
        query = """
            UPDATE hot_point_titles
            SET useful = %s
            WHERE id = %s AND useful = %s;
        """
        return await self.pool.async_save(
            query=query, params=(new_status, article_id, origin_status)
        )

    async def set_as_expired(self, article_id: int):
        """Mark an article as expired."""
        query = """
            UPDATE hot_point_titles
            SET status = %s
            WHERE id = %s;
        """
        return await self.pool.async_save(
            query=query, params=(self.expired_status, article_id)
        )


class CrawlerHotPointTask(CrawlerHotPointMapper):
    def __init__(self, pool, log_client, trace_id):
        super().__init__(pool, log_client, trace_id)

    @staticmethod
    def process_raw_data(response_data):
        """Flatten the raw response into (title, platform, link) tuples."""
        articles = []
        for item in response_data:
            platform = item["source"]
            for article in item["rankList"]:
                title = article["title"]
                link = article["link"]
                articles.append((title, platform, link))
        return articles

    async def crawl_hot_titles(self):
        """Crawl hot-point titles page by page and persist them."""
        # iterates pages 1 .. max_page_index - 1
        for page in tqdm(range(1, self.max_page_index)):
            try:
                raw_data = await get_hot_point_content(page_index=page)
                articles = self.process_raw_data(raw_data)
                await self.save_articles(articles)
            except Exception as e:
                # errors are only printed for now; routing them through
                # log_client would make failures visible in production
                print(f"crawl_hot_titles error: {e}")

    async def classify_articles_by_llm(self):
        """Use an LLM to decide whether a hot-point event matches the interests of elderly readers."""
        pass
```
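
The classification step above is only a stub. Below is a minimal sketch of one way to fill it in, assuming an OpenAI-compatible async client, a `gpt-4o-mini` model name, and a hypothetical `async_fetch` read helper on the pool; none of these are part of the original module.

```python
from openai import AsyncOpenAI  # assumed dependency, not used by the original module

PROMPT = (
    "Answer only 'yes' or 'no': would the following trending topic interest "
    "elderly readers?\nTitle: {title}"
)


class CrawlerHotPointTaskWithLLM(CrawlerHotPointTask):
    async def classify_articles_by_llm(self):
        """Sketch: classify pending titles with an LLM and update their `useful` status."""
        client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment
        # Hypothetical read helper: the original mapper only defines write queries,
        # so `async_fetch` is assumed here to return (id, title) tuples.
        rows = await self.pool.async_fetch(
            query="SELECT id, title FROM hot_point_titles WHERE useful = %s;",
            params=(self.init_status,),
        )
        for article_id, title in rows:
            # claim the row first so concurrent workers skip it
            await self.update_useful_status(
                article_id, self.init_status, self.processing_status
            )
            try:
                resp = await client.chat.completions.create(
                    model="gpt-4o-mini",  # assumed model name
                    messages=[{"role": "user", "content": PROMPT.format(title=title)}],
                )
                answer = resp.choices[0].message.content.strip().lower()
                new_status = (
                    self.useful_status if answer.startswith("yes") else self.unuseful_status
                )
            except Exception:
                new_status = self.failed_status
            await self.update_useful_status(
                article_id, self.processing_status, new_status
            )
```

Moving a row to `processing_status` before calling the model, and only advancing it from that state afterwards, keeps two concurrent workers from classifying the same title twice.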
|
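
For reference, the payload shape that `process_raw_data` expects can be read off the parsing code; the field names below come from that code, while the concrete values are made up for illustration.

```python
# Sample of the structure returned by get_hot_point_content (values invented).
sample_response = [
    {
        "source": "weibo",
        "rankList": [
            {"title": "Example trending topic", "link": "https://example.com/1"},
            {"title": "Another topic", "link": "https://example.com/2"},
        ],
    }
]

articles = CrawlerHotPointTask.process_raw_data(sample_response)
# -> [("Example trending topic", "weibo", "https://example.com/1"),
#     ("Another topic", "weibo", "https://example.com/2")]
```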