@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+import asyncio
+import json
+from typing import Dict, List, Tuple
+
+from tqdm.asyncio import tqdm
+
+from applications.crawler.tophub import get_hot_point_content
+
+
+class CrawlerHotPointConst:
+    # Highest page index to crawl (pages are numbered from 1).
+    max_page_index = 40
+
+    # Lifecycle of a crawled title: init -> processing -> useful / unuseful,
+    # with failed as a catch-all error state.
+    init_status = 0
+    processing_status = 1
+    useful_status = 2
+    unuseful_status = 3
+    failed_status = 99
+
+    # Whether the hot point is still live or has already expired.
+    not_expired_status = 1
+    expired_status = 2
+
+
+class CrawlerHotPointMapper(CrawlerHotPointConst):
+    def __init__(self, pool, log_client, trace_id):
+        self.pool = pool
+        self.log_client = log_client
+        self.trace_id = trace_id
+
+    async def save_articles(self, articles: List[Tuple]):
+        """Insert titles and links in one batch."""
+        query = """
+            INSERT INTO hot_point_titles
+                (title, platform, link)
+            VALUES (%s, %s, %s);
+        """
+        return await self.pool.async_save(query=query, params=articles, batch=True)
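+        # Illustrative call (hypothetical data): one row per tuple, matching
+        # the three %s placeholders above:
+        #   await self.save_articles([("headline", "weibo", "https://example.com/a")])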
+
+    async def update_useful_status(
+        self, article_id: int, origin_status: int, new_status: int
+    ):
+        """
+        Update an article's useful flag, but only while it still holds origin_status.
+        """
+        query = """
+            UPDATE hot_point_titles
+            SET useful = %s
+            WHERE id = %s AND useful = %s;
+        """
+        return await self.pool.async_save(
+            query=query, params=(new_status, article_id, origin_status)
+        )
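+        # Illustrative compare-and-swap (hypothetical id): claim an article,
+        # guarded against concurrent updates by the WHERE clause above:
+        #   await self.update_useful_status(42, self.init_status, self.processing_status)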
+
+    async def set_as_expired(self, article_id: int):
+        """
+        Mark an article's hot point as expired.
+        """
+        query = """
+            UPDATE hot_point_titles
+            SET status = %s
+            WHERE id = %s;
+        """
+        return await self.pool.async_save(
+            query=query, params=(self.expired_status, article_id)
+        )
+
+
+class CrawlerHotPointTask(CrawlerHotPointMapper):
+    def __init__(self, pool, log_client, trace_id):
+        super().__init__(pool, log_client, trace_id)
+
+    @staticmethod
+    def process_raw_data(response_data):
+        """
+        Flatten the raw response into (title, platform, link) tuples.
+        """
+        articles = []
+        for item in response_data:
+            platform = item["source"]
+            for article in item["rankList"]:
+                title = article["title"]
+                link = article["link"]
+                articles.append((title, platform, link))
+        return articles
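+        # Assumed response shape, inferred from the key accesses above:
+        #   [{"source": "weibo", "rankList": [{"title": "...", "link": "..."}, ...]}, ...]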
+
+    async def crawl_hot_titles(self):
+        """
+        Crawl hot-point titles page by page and persist them.
+        """
+        # range() is half-open, so +1 keeps max_page_index itself in the crawl.
+        for page in tqdm(range(1, self.max_page_index + 1)):
+            try:
+                raw_data = await get_hot_point_content(page_index=page)
+                articles = self.process_raw_data(raw_data)
+                await self.save_articles(articles)
+            except Exception as e:
+                # Failures are isolated per page so later pages still run.
+                print(f"crawl_hot_titles error: {e}")
+
+    async def classify_articles_by_llm(self):
+        """
+        Use an LLM to decide whether a hot-point event matches the interests
+        of elderly readers.
+        """
+        pass