# crawler_hot_point.py
  1. from __future__ import annotations
  2. import asyncio
  3. import json
  4. from typing import Dict, List, Tuple
  5. from tqdm.asyncio import tqdm
  6. from applications.crawler.tophub import get_hot_point_content
class CrawlerHotPointConst:
    """Shared constants for the hot-point crawler and its DB mapper."""

    # Upper bound for the crawl loop; note crawl_hot_titles iterates
    # range(1, max_page_index), so pages 1..39 are fetched.
    max_page_index = 40

    # Article classification states — presumably stored in the `useful`
    # column written by update_useful_status; confirm against the schema.
    init_status = 0
    processing_status = 1
    useful_status = 2
    unuseful_status = 3
    failed_status = 99

    # Expiry states for the `status` column (set_as_expired writes
    # expired_status into `status`).
    not_expired_status = 1
    expired_status = 2
  16. class CrawlerHotPointMapper(CrawlerHotPointConst):
  17. def __init__(self, pool, log_client, trace_id):
  18. self.pool = pool
  19. self.log_client = log_client
  20. self.trace_id = trace_id
  21. async def save_articles(self, articles: List[Tuple]):
  22. """插入标题 && Link"""
  23. query = """
  24. INSERT INTO hot_point_titles
  25. (title, platform, link)
  26. VALUES (%s, %s, %s);
  27. """
  28. return await self.pool.async_save(query=query, params=articles, batch=True)
  29. async def update_useful_status(
  30. self, article_id: int, origin_status: int, new_status: int
  31. ):
  32. """
  33. 更新文章状态
  34. """
  35. query = """
  36. UPDATE hot_point_titles
  37. SET useful = %s
  38. WHERE id = %s AND useful = %s;
  39. """
  40. return await self.pool.async_save(
  41. query=query, params=(new_status, article_id, origin_status)
  42. )
  43. async def set_as_expired(self, article_id: int):
  44. """
  45. 设置文章为过期
  46. """
  47. query = """
  48. UPDATE hot_point_titles
  49. SET status = %s
  50. WHERE id = %s;
  51. """
  52. return await self.pool.async_save(
  53. query=query, params=(self.expired_status, article_id)
  54. )
  55. class CrawlerHotPointTask(CrawlerHotPointMapper):
  56. def __init__(self, pool, log_client, trace_id):
  57. super().__init__(pool, log_client, trace_id)
  58. @staticmethod
  59. def process_raw_data(response_data):
  60. """
  61. 处理原始数据
  62. """
  63. articles = []
  64. for item in response_data:
  65. platform = item["source"]
  66. for article in item["rankList"]:
  67. title = article["title"]
  68. link = article["link"]
  69. articles.append((title, platform, link))
  70. return articles
  71. async def crawl_hot_titles(self):
  72. """
  73. 爬取热点标题
  74. """
  75. for page in tqdm(range(1, self.max_page_index)):
  76. try:
  77. raw_data = await get_hot_point_content(page_index=page)
  78. articles = self.process_raw_data(raw_data)
  79. await self.save_articles(articles)
  80. except Exception as e:
  81. print(f"crawl_hot_titles error: {e}")
  82. async def classify_articles_by_llm(self):
  83. """
  84. 用大模型进行分类,判断热点事件是否符合老年人的兴趣爱好
  85. """
  86. pass