před 5 měsíci · 4177d93fce
--- a/applications/tasks/cold_start_tasks/article_pool/article_pool_cold_start_const.py
+++ b/applications/tasks/cold_start_tasks/article_pool/article_pool_cold_start_const.py
@@ -14,7 +14,85 @@ class ArticlePoolColdStartConst:
 
				     # READ_THRESHOLD = 5000
			
 
				     READ_THRESHOLD = 1000
			
 
				 
			
 
				-    TITLE_LENGTH_LIMIT = 15
			
 
				-    TITLE_LENGTH_MAX = 50
			
 
				+    TITLE_LENGTH_LIMIT = 12
			
 
				+    TITLE_LENGTH_MAX = 40
			
 
				 
			
 
				     DEFAULT_CRAWLER_METHODS = ["1030-手动挑号", "account_association"]
			
 
				+
			
 
				+    CATEGORY_CONFIG_MAP = {
			
 
				+        "知识科普": {
			
 
				+            "num": 100,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "军事历史": {
			
 
				+            "num": 50,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "家长里短": {
			
 
				+            "num": 100,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "社会法治": {
			
 
				+            "num": 100,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "奇闻趣事": {
			
 
				+            "num": 150,
			
 
				+            "read_threshold": 2000,
			
 
				+            "read_times_threshold": 1.4,
			
 
				+        },
			
 
				+        "名人八卦": {
			
 
				+            "num": 200,
			
 
				+            "read_threshold": 3000,
			
 
				+            "read_times_threshold": 1.4,
			
 
				+        },
			
 
				+        "健康养生": {
			
 
				+            "num": 100,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "情感故事": {
			
 
				+            "num": 200,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "国家大事": {
			
 
				+            "num": 200,
			
 
				+            "read_threshold": 3000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "现代人物": {
			
 
				+            "num": 100,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "怀旧时光": {
			
 
				+            "num": 100,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "政治新闻": {
			
 
				+            "num": 100,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "历史人物": {
			
 
				+            "num": 100,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "社会现象": {
			
 
				+            "num": 200,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+        "财经科技": {
			
 
				+            "num": 100,
			
 
				+            "read_threshold": 1000,
			
 
				+            "read_times_threshold": 1.3,
			
 
				+        },
			
 
				+    }
			
--- a/applications/tasks/cold_start_tasks/article_pool/article_pool_cold_start_strategy.py
+++ b/applications/tasks/cold_start_tasks/article_pool/article_pool_cold_start_strategy.py
@@ -68,8 +68,10 @@ class ArticlePoolColdStartStrategy(ArticlePoolColdStartConst):
 
				                         "weixin",
			
 
				                         self.TITLE_NOT_SENSITIVE,
			
 
				                         self.INIT_STATUS,
			
 
				-                        self.READ_TIMES_THRESHOLD,
			
 
				-                        self.READ_THRESHOLD,
			
 
				+                        # Per-category override; the {} default keeps the chained .get() valid for unknown categories.
			
 
				+                        self.CATEGORY_CONFIG_MAP.get(category, {}).get("read_times_threshold", self.READ_TIMES_THRESHOLD),
			
 
				+                        # Per-category override; the {} default keeps the chained .get() valid for unknown categories.
			
 
				+                        self.CATEGORY_CONFIG_MAP.get(category, {}).get("read_threshold", self.READ_THRESHOLD),
			
 
				                         self.INIT_STATUS,
			
 
				                     ),
			
 
				                 )
			
--- a/applications/tasks/cold_start_tasks/article_pool/article_pool_filter_strategy.py
+++ b/applications/tasks/cold_start_tasks/article_pool/article_pool_filter_strategy.py
@@ -18,11 +18,14 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
 
				         """微信过滤漏斗"""
			
 
				         total_length: int = dataframe.shape[0]
			
 
				 
			
 
				-        # 通过标题长度过滤
			
 
				-        filter_df = dataframe[
			
 
				-            (dataframe["title"].str.len() <= self.TITLE_LENGTH_MAX)
			
 
				-            & (dataframe["title"].str.len() >= self.TITLE_LENGTH_LIMIT)
			
 
				-        ]
			
 
				+        # De-duplicate rows by title first
			
 
				+        dedup_df = dataframe.drop_duplicates(subset="title")
			
 
				+
			
 
				+        # 再做标题长度过滤
			
 
				+        filter_df = dedup_df[
			
 
				+            (dedup_df["title"].str.len() <= self.TITLE_LENGTH_MAX)
			
 
				+            & (dedup_df["title"].str.len() >= self.TITLE_LENGTH_LIMIT)
			
 
				+            ]
			
 
				         length_level1 = filter_df.shape[0]
			
 
				 
			
 
				         # 通过敏感词过滤
			
@@ -31,14 +34,13 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
 
				             "太极",
			
 
				             "节",
			
 
				             "早上好",
			
 
				+            "下午好",
			
 
				+            "晚上好",
			
 
				             "赖清德",
			
 
				             "普京",
			
 
				-            "俄",
			
 
				             "南海",
			
 
				             "台海",
			
 
				             "解放军",
			
 
				-            "蔡英文",
			
 
				-            "中国",
			
 
				         ]
			
 
				         # 构建正则表达式，使用 | 连接表示“或”的关系
			
 
				         pattern = "|".join(sensitive_keywords)
			
@@ -50,7 +52,7 @@ class ArticlePoolFilterStrategy(ArticlePoolColdStartConst):
 
				         # 第4层通过相关性分数过滤
			
 
				         filter_df = filter_df[filter_df["score"] > self.SIMILARITY_SCORE_THRESHOLD]
			
 
				         length_level4 = filter_df.shape[0]
			
 
				-        daily_article_num = 120
			
 
				+        daily_article_num = self.CATEGORY_CONFIG_MAP.get(category, {}).get("num", 120)  # {} default: unknown category falls back to 120 instead of raising AttributeError
			
 
				         match strategy:
			
 
				             case "strategy_v1":
			
 
				                 await feishu_robot.bot(