Procházet zdrojové kódy

Merge branch 'feature/luojunhui/2025-11-21-update-crawler' of Server/LongArticleTaskServer into master

luojunhui před 3 měsíci
rodič
revize
4177d93fce

+ 80 - 2
applications/tasks/cold_start_tasks/article_pool/article_pool_cold_start_const.py

@@ -14,7 +14,85 @@ class ArticlePoolColdStartConst:
     # READ_THRESHOLD = 5000
     READ_THRESHOLD = 1000
 
-    TITLE_LENGTH_LIMIT = 15
-    TITLE_LENGTH_MAX = 50
+    TITLE_LENGTH_LIMIT = 12
+    TITLE_LENGTH_MAX = 40
 
     DEFAULT_CRAWLER_METHODS = ["1030-手动挑号", "account_association"]
+
+    CATEGORY_CONFIG_MAP = {
+        "知识科普": {
+            "num": 100,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "军事历史": {
+            "num": 50,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "家长里短": {
+            "num": 100,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "社会法治": {
+            "num": 100,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "奇闻趣事": {
+            "num": 150,
+            "read_threshold": 2000,
+            "read_times_threshold": 1.4,
+        },
+        "名人八卦": {
+            "num": 200,
+            "read_threshold": 3000,
+            "read_times_threshold": 1.4,
+        },
+        "健康养生": {
+            "num": 100,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "情感故事": {
+            "num": 200,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "国家大事": {
+            "num": 200,
+            "read_threshold": 3000,
+            "read_times_threshold": 1.3,
+        },
+        "现代人物": {
+            "num": 100,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "怀旧时光": {
+            "num": 100,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "政治新闻": {
+            "num": 100,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "历史人物": {
+            "num": 100,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "社会现象": {
+            "num": 200,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+        "财经科技": {
+            "num": 100,
+            "read_threshold": 1000,
+            "read_times_threshold": 1.3,
+        },
+    }

+ 4 - 2
applications/tasks/cold_start_tasks/article_pool/article_pool_cold_start_strategy.py

@@ -68,8 +68,10 @@ class ArticlePoolColdStartStrategy(ArticlePoolColdStartConst):
                         "weixin",
                         self.TITLE_NOT_SENSITIVE,
                         self.INIT_STATUS,
-                        self.READ_TIMES_THRESHOLD,
-                        self.READ_THRESHOLD,
+                        # self.READ_TIMES_THRESHOLD,
+                        self.CATEGORY_CONFIG_MAP.get(category, {}).get("read_times_threshold", self.READ_TIMES_THRESHOLD),  # {} fallback: unknown category uses the class-level default instead of raising AttributeError
+                        # self.READ_THRESHOLD,
+                        self.CATEGORY_CONFIG_MAP.get(category, {}).get("read_threshold", self.READ_THRESHOLD),  # {} fallback: unknown category uses the class-level default instead of raising AttributeError
                         self.INIT_STATUS,
                     ),
                 )

+ 11 - 9
applications/tasks/cold_start_tasks/article_pool/article_pool_filter_strategy.py

@@ -18,11 +18,14 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
         """微信过滤漏斗"""
         total_length: int = dataframe.shape[0]
 
-        # 通过标题长度过滤
-        filter_df = dataframe[
-            (dataframe["title"].str.len() <= self.TITLE_LENGTH_MAX)
-            & (dataframe["title"].str.len() >= self.TITLE_LENGTH_LIMIT)
-        ]
+        # 通过标题去重 && 长度过滤
+        dedup_df = dataframe.drop_duplicates(subset="title")
+
+        # 再做标题长度过滤
+        filter_df = dedup_df[
+            (dedup_df["title"].str.len() <= self.TITLE_LENGTH_MAX)
+            & (dedup_df["title"].str.len() >= self.TITLE_LENGTH_LIMIT)
+            ]
         length_level1 = filter_df.shape[0]
 
         # 通过敏感词过滤
@@ -31,14 +34,13 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
             "太极",
             "节",
             "早上好",
+            "下午好",
+            "晚上好",
             "赖清德",
             "普京",
-            "俄",
             "南海",
             "台海",
             "解放军",
-            "蔡英文",
-            "中国",
         ]
         # 构建正则表达式,使用 | 连接表示“或”的关系
         pattern = "|".join(sensitive_keywords)
@@ -50,7 +52,7 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
         # 第4层通过相关性分数过滤
         filter_df = filter_df[filter_df["score"] > self.SIMILARITY_SCORE_THRESHOLD]
         length_level4 = filter_df.shape[0]
-        daily_article_num = 120
+        daily_article_num = self.CATEGORY_CONFIG_MAP.get(category, {}).get("num", 120)  # {} fallback: unknown category keeps the previous default of 120 instead of raising AttributeError
         match strategy:
             case "strategy_v1":
                 await feishu_robot.bot(