|
|
@@ -16,23 +16,48 @@ class CrawlerHotPointConst:
|
|
|
INIT_STATUS = 0
|
|
|
PROCESSING_STATUS = 1
|
|
|
USEFUL_STATUS = 2
|
|
|
- UNUSEFUL_STATUS = 3
|
|
|
+ NOT_USEFUL_STATUS = 3
|
|
|
FAILED_STATUS = 99
|
|
|
|
|
|
NOT_EXPIRED_STATUS = 1
|
|
|
EXPIRED_STATUS = 2
|
|
|
|
|
|
# batch
|
|
|
- PROCESS_TITLE_BATCH_SIZE = 300
|
|
|
+ PROCESS_TITLE_BATCH_SIZE = 500
|
|
|
|
|
|
# ignore platforms
|
|
|
IGNORE_PLATFORMS = {
|
|
|
"中国日报", "每日珠宝杂志", "iBag包包", "ZAKER", "NASA 🌍", "wikiHow 中文",
|
|
|
- "China Daily", "微信 ‧ 游戏", "Yahoo News"
|
|
|
+ "China Daily", "微信 ‧ 游戏", "Yahoo News", "北京天文馆"
|
|
|
}
|
|
|
|
|
|
-
|
|
|
-class CrawlerHotPointMapper(CrawlerHotPointConst):
|
|
|
+class CrawlerHotPointBase(CrawlerHotPointConst):
|
|
|
+ CLASSIFY_PROMPT = """
|
|
|
+ 你是一个内容分析助手,专门从热榜标题中识别出55岁以上老年人可能喜欢或关注的银发内容。
|
|
|
+ 银发内容通常涉及健康、养老、退休生活、老年疾病、社会保障、代际关系、奇闻趣事、名人故事、社会事件等主题。
|
|
|
+ 不要出现政治,当代国家领导人等敏感事件。
|
|
|
+ 1. **任务描述**:
|
|
|
+ 扫描所有标题,筛选出与银发内容高度相关时效性新闻信息。相关性判断基于标题是否直接或间接提及老年人相关话题,或可能吸引55岁以上人群的兴趣。返回适合的 id。
|
|
|
+ 如果遇到敏感人物,正常过滤。请注意,一定要是新闻性事件, 请严格判断标题是否适合老年群体。
|
|
|
+ 4. **输出格式**:输出结果为 JSON,只需要返回适合老年人话题的 id, 结构为
|
|
|
+ {
|
|
|
+ "IDS": [1, 2, 3, ...]
|
|
|
+ }
|
|
|
+ 现在, 请处理我输入的标题 && id, please think step by step.
|
|
|
+ """
|
|
|
+
|
|
|
@staticmethod
def format_input_articles(fetch_response: List[Dict]) -> str:
    """Render fetched articles as text, one article per line.

    Each line has the form ``<id>, <title>`` and is terminated by a newline.

    Args:
        fetch_response: Rows to render; every row must provide the ``id``
            and ``title`` keys.

    Returns:
        The concatenated lines, or an empty string when there are no rows.

    Raises:
        KeyError: If a row lacks ``id`` or ``title``.
    """
    # A single join avoids re-copying the accumulated string that repeated
    # ``+=`` in a loop can incur.
    return "".join(
        f"{item['id']}, {item['title']}\n" for item in fetch_response
    )
|
|
|
+
|
|
|
+
|
|
|
+class CrawlerHotPointMapper(CrawlerHotPointBase):
|
|
|
def __init__(self, pool, log_client, trace_id):
|
|
|
self.pool = pool
|
|
|
self.log_client = log_client
|
|
|
@@ -62,6 +87,28 @@ class CrawlerHotPointMapper(CrawlerHotPointConst):
|
|
|
query=query, params=(new_status, article_id, origin_status)
|
|
|
)
|
|
|
|
|
|
async def set_as_processing(self, title_ids: List[int]) -> int:
    """Mark the given titles as being processed.

    Sets ``useful`` to ``PROCESSING_STATUS`` for every row of
    ``hot_point_titles`` whose ``id`` is in ``title_ids``.

    Args:
        title_ids: Ids of the rows to update.

    Returns:
        Whatever ``self.pool.async_save`` returns (presumably the affected
        row count -- confirm against the pool implementation), or ``0``
        when ``title_ids`` is empty.
    """
    ids = tuple(title_ids)
    if not ids:
        # Nothing to update. An empty tuple would typically be rendered as
        # ``IN ()``, which is not valid SQL, so skip the round trip.
        return 0
    # NOTE(review): the UPDATE is not conditioned on the current status, so
    # rows already claimed elsewhere would be overwritten -- confirm that
    # this is intended.
    query = """
        UPDATE hot_point_titles
        SET useful = %s
        WHERE id IN %s;"""
    return await self.pool.async_save(
        query=query, params=(self.PROCESSING_STATUS, ids)
    )
|
|
|
+
|
|
|
async def set_as_failed(self, title_ids: List[int]) -> int:
    """Mark the given titles as failed.

    Sets ``useful`` to ``FAILED_STATUS`` for every row of
    ``hot_point_titles`` whose ``id`` is in ``title_ids``.

    Args:
        title_ids: Ids of the rows to update.

    Returns:
        Whatever ``self.pool.async_save`` returns (presumably the affected
        row count -- confirm against the pool implementation), or ``0``
        when ``title_ids`` is empty.
    """
    ids = tuple(title_ids)
    if not ids:
        # Nothing to update. An empty tuple would typically be rendered as
        # ``IN ()``, which is not valid SQL, so skip the round trip.
        return 0
    query = """
        UPDATE hot_point_titles
        SET useful = %s
        WHERE id IN %s;
    """
    return await self.pool.async_save(
        query=query, params=(self.FAILED_STATUS, ids)
    )
|
|
|
+
|
|
|
async def set_as_expired(self, article_id: int) -> int:
|
|
|
"""
|
|
|
设置文章为过期
|
|
|
@@ -88,20 +135,6 @@ class CrawlerHotPointMapper(CrawlerHotPointConst):
|
|
|
|
|
|
|
|
|
class CrawlerHotPointTask(CrawlerHotPointMapper):
|
|
|
- CLASSIFY_PROMPT = """
|
|
|
-你是一个内容分析助手,专门从热榜标题中识别出55岁以上老年人可能喜欢或关注的银发内容。
|
|
|
-银发内容通常涉及健康、养老、退休生活、老年疾病、社会保障、代际关系、奇闻趣事、名人故事、社会事件等主题。
|
|
|
-
|
|
|
-1. **任务描述**:
|
|
|
- 扫描所有标题,筛选出与银发内容高度相关时效性新闻信息。相关性判断基于标题是否直接或间接提及老年人相关话题,或可能吸引55岁以上人群的兴趣。返回适合的 id。
|
|
|
- 如果遇到敏感人物,正常过滤
|
|
|
- 请注意,一定要是新闻性事件
|
|
|
-4. **输出格式**:输出结果为 JSON,只需要返回适合老年人话题的 id, 结构为
|
|
|
- {
|
|
|
- "IDS": [1, 2, 3, ...]
|
|
|
- }
|
|
|
-现在, 请处理我输入的标题 && id
|
|
|
-"""
|
|
|
|
|
|
def __init__(self, pool, log_client, trace_id):
|
|
|
super().__init__(pool, log_client, trace_id)
|
|
|
@@ -138,9 +171,21 @@ class CrawlerHotPointTask(CrawlerHotPointMapper):
|
|
|
用大模型进行分类,判断热点事件是否符合老年人的兴趣爱好
|
|
|
"""
|
|
|
infos = await self.fetch_init_articles()
|
|
|
- prompt = f"{self.CLASSIFY_PROMPT}\n{infos}"
|
|
|
- print(prompt)
|
|
|
+ # acquire lock
|
|
|
+ title_ids = [item["id"] for item in infos]
|
|
|
+ await self.set_as_processing(title_ids)
|
|
|
+ prompt = f"{self.CLASSIFY_PROMPT}\n{self.format_input_articles(infos)}"
|
|
|
response = fetch_deepseek_completion(
|
|
|
- prompt=prompt, model="default", output_type="json"
|
|
|
+ prompt=prompt, model="DeepSeek-R1", output_type="json"
|
|
|
)
|
|
|
- print(response)
|
|
|
+ if not response:
|
|
|
+ w = await self.set_as_failed([item["id"] for item in infos])
|
|
|
+ print(w)
|
|
|
+ return
|
|
|
+ ids = set(response.get("IDS", []))
|
|
|
+ for item in tqdm(infos):
|
|
|
+ id_ = item["id"]
|
|
|
+ if id_ in ids:
|
|
|
+ await self.update_useful_status(id_, self.PROCESSING_STATUS, self.USEFUL_STATUS)
|
|
|
+ else:
|
|
|
+ await self.update_useful_status(id_, self.PROCESSING_STATUS, self.NOT_USEFUL_STATUS)
|