2 viikkoa sitten · 91a752bfb6
--- a/applications/tasks/crawler_tasks/crawler_hot_point.py
+++ b/applications/tasks/crawler_tasks/crawler_hot_point.py
@@ -16,23 +16,48 @@ class CrawlerHotPointConst:
 
				     INIT_STATUS = 0
			
 
				     PROCESSING_STATUS = 1
			
 
				     USEFUL_STATUS = 2
			
 
				-    UNUSEFUL_STATUS = 3
			
 
				+    NOT_USEFUL_STATUS = 3
			
 
				     FAILED_STATUS = 99
			
 
				 
			
 
				     NOT_EXPIRED_STATUS = 1
			
 
				     EXPIRED_STATUS = 2
			
 
				 
			
 
				     # batch
			
 
				-    PROCESS_TITLE_BATCH_SIZE = 300
			
 
				+    PROCESS_TITLE_BATCH_SIZE = 500
			
 
				 
			
 
				     # ignore platforms
			
 
				     IGNORE_PLATFORMS = {
			
 
				         "中国日报", "每日珠宝杂志", "iBag包包", "ZAKER", "NASA 🌍", "wikiHow 中文",
			
 
				-        "China Daily", "微信 ‧ 游戏", "Yahoo News"
			
 
				+        "China Daily", "微信 ‧ 游戏", "Yahoo News", "北京天文馆"
			
 
				     }
			
 
				 
			
 
				-
			
 
				-class CrawlerHotPointMapper(CrawlerHotPointConst):
			
 
				+class CrawlerHotPointBase(CrawlerHotPointConst):
			
 
				+    CLASSIFY_PROMPT = """
			
 
				+    你是一个内容分析助手，专门从热榜标题中识别出55岁以上老年人可能喜欢或关注的银发内容。
			
 
				+    银发内容通常涉及健康、养老、退休生活、老年疾病、社会保障、代际关系、奇闻趣事、名人故事、社会事件等主题。
			
 
				+    不要出现政治，当代国家领导人等敏感事件。
			
 
				+    1. **任务描述**：
			
 
				+        扫描所有标题，筛选出与银发内容高度相关时效性新闻信息。相关性判断基于标题是否直接或间接提及老年人相关话题，或可能吸引55岁以上人群的兴趣。返回适合的 id。
			
 
				+        如果遇到敏感人物，正常过滤。请注意，一定要是新闻性事件, 请严格判断标题是否适合老年群体。
			
 
				+    4. **输出格式**：输出结果为 JSON，只需要返回适合老年人话题的 id, 结构为
			
 
				+        {
			
 
				+            "IDS": [1, 2, 3, ...]
			
 
				+        }
			
 
				+    现在， 请处理我输入的标题 && id, please think step by step.
			
 
				+    """
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def format_input_articles(fetch_response: List[Dict]) -> str:
			
 
				+        """
			
 
				+        格式化输入文章为字符串，每个文章占一行，格式为：id, title
			
 
				+        """
			
 
				+        output_string = ""
			
 
				+        for item in fetch_response:
			
 
				+            output_string += f"{item['id']}, {item['title']}\n"
			
 
				+        return output_string
			
 
				+
			
 
				+
			
 
				+class CrawlerHotPointMapper(CrawlerHotPointBase):
			
 
				     def __init__(self, pool, log_client, trace_id):
			
 
				         self.pool = pool
			
 
				         self.log_client = log_client
			
@@ -62,6 +87,28 @@ class CrawlerHotPointMapper(CrawlerHotPointConst):
 
				             query=query, params=(new_status, article_id, origin_status)
			
 
				         )
			
 
				 
			
 
    async def set_as_processing(self, title_ids: List[int]) -> int:
        """
        Mark the given titles as being processed.

        Sets ``useful`` to ``PROCESSING_STATUS`` for every row of
        ``hot_point_titles`` whose id is in ``title_ids``.

        :param title_ids: ids of the rows to update.
        :return: the value returned by ``self.pool.async_save`` (presumably
            the affected row count -- confirm against the pool
            implementation); ``0`` when ``title_ids`` is empty, in which case
            the database is not touched.
        """
        if not title_ids:
            # An empty tuple cannot be expanded into a valid "IN (...)" list,
            # so skip the statement instead of sending broken SQL.
            return 0

        # NOTE(review): the update does not filter on the current status, so
        # it overwrites whatever value the rows hold -- confirm that is the
        # intended behaviour for the caller's "acquire lock" step.
        query = """
            UPDATE hot_point_titles
            SET useful = %s
            WHERE id IN %s;"""
        return await self.pool.async_save(
            query=query, params=(self.PROCESSING_STATUS, tuple(title_ids))
        )
			
 
				+
			
 
    async def set_as_failed(self, title_ids: List[int]) -> int:
        """
        Mark the given titles as failed.

        Sets ``useful`` to ``FAILED_STATUS`` for every row of
        ``hot_point_titles`` whose id is in ``title_ids``.

        :param title_ids: ids of the rows to update.
        :return: the value returned by ``self.pool.async_save`` (presumably
            the affected row count -- confirm against the pool
            implementation); ``0`` when ``title_ids`` is empty, in which case
            the database is not touched.
        """
        if not title_ids:
            # An empty tuple cannot be expanded into a valid "IN (...)" list,
            # so skip the statement instead of sending broken SQL.
            return 0

        query = """
            UPDATE hot_point_titles
            SET useful = %s
            WHERE id IN %s;
        """
        return await self.pool.async_save(
            query=query, params=(self.FAILED_STATUS, tuple(title_ids))
        )
			
 
				+
			
 
				     async def set_as_expired(self, article_id: int) -> int:
			
 
				         """
			
 
				         设置文章为过期
			
@@ -88,20 +135,6 @@ class CrawlerHotPointMapper(CrawlerHotPointConst):
 
				 
			
 
				 
			
 
				 class CrawlerHotPointTask(CrawlerHotPointMapper):
			
 
				-    CLASSIFY_PROMPT = """
			
 
				-你是一个内容分析助手，专门从热榜标题中识别出55岁以上老年人可能喜欢或关注的银发内容。
			
 
				-银发内容通常涉及健康、养老、退休生活、老年疾病、社会保障、代际关系、奇闻趣事、名人故事、社会事件等主题。
			
 
				-
			
 
				-1. **任务描述**：
			
 
				-    扫描所有标题，筛选出与银发内容高度相关时效性新闻信息。相关性判断基于标题是否直接或间接提及老年人相关话题，或可能吸引55岁以上人群的兴趣。返回适合的 id。
			
 
				-    如果遇到敏感人物，正常过滤
			
 
				-    请注意，一定要是新闻性事件
			
 
				-4. **输出格式**：输出结果为 JSON，只需要返回适合老年人话题的 id, 结构为
			
 
				-    {
			
 
				-        "IDS": [1, 2, 3, ...]
			
 
				-    }
			
 
				-现在， 请处理我输入的标题 && id
			
 
				-"""
			
 
				 
			
 
				     def __init__(self, pool, log_client, trace_id):
			
 
				         super().__init__(pool, log_client, trace_id)
			
@@ -138,9 +171,21 @@ class CrawlerHotPointTask(CrawlerHotPointMapper):
 
				         用大模型进行分类，判断热点事件是否符合老年人的兴趣爱好
			
 
				         """
			
 
				         infos = await self.fetch_init_articles()
			
 
				-        prompt = f"{self.CLASSIFY_PROMPT}\n{infos}"
			
 
				-        print(prompt)
			
 
				+        # acquire lock
			
 
				+        title_ids = [item["id"] for item in infos]
			
 
				+        await self.set_as_processing(title_ids)
			
 
				+        prompt = f"{self.CLASSIFY_PROMPT}\n{self.format_input_articles(infos)}"
			
 
				         response = fetch_deepseek_completion(
			
 
				-            prompt=prompt, model="default", output_type="json"
			
 
				+            prompt=prompt, model="DeepSeek-R1", output_type="json"
			
 
				         )
			
 
				-        print(response)
			
 
				+        if not response:
			
 
				+            w = await self.set_as_failed([item["id"] for item in infos])
			
 
				+            print(w)
			
 
				+            return
			
 
				+        ids = set(response.get("IDS", []))
			
 
				+        for item in tqdm(infos):
			
 
				+            id_ = item["id"]
			
 
				+            if id_ in ids:
			
 
				+                await self.update_useful_status(id_, self.PROCESSING_STATUS, self.USEFUL_STATUS)
			
 
				+            else:
			
 
				+                await self.update_useful_status(id_, self.PROCESSING_STATUS, self.NOT_USEFUL_STATUS)