Bläddra i källkod

Update publishCategoryArticles: clean up code and add LLM sensitivity filter

StrayWarrior 2 månader sedan
förälder
incheckning
c7967e8a57
1 ändrad fil med 45 tillägg och 35 borttagningar
  1. 45 35
      coldStartTasks/publish/publishCategoryArticles.py

+ 45 - 35
coldStartTasks/publish/publishCategoryArticles.py

@@ -34,6 +34,7 @@ class CategoryColdStartTask(object):
         self.READ_THRESHOLD = self.category_cold_start_threshold.get("READ_THRESHOLD", 5000)
         self.READ_THRESHOLD = self.category_cold_start_threshold.get("READ_THRESHOLD", 5000)
         self.READ_TIMES_THRESHOLD = self.category_cold_start_threshold.get("READ_TIMES_THRESHOLD", 1.3)
         self.READ_TIMES_THRESHOLD = self.category_cold_start_threshold.get("READ_TIMES_THRESHOLD", 1.3)
         self.LIMIT_TITLE_LENGTH = self.category_cold_start_threshold.get("LIMIT_TITLE_LENGTH", 15)
         self.LIMIT_TITLE_LENGTH = self.category_cold_start_threshold.get("LIMIT_TITLE_LENGTH", 15)
+        self.TITLE_LENGTH_MAX = self.category_cold_start_threshold.get("TITLE_LENGTH_MAX", 50)
         log(
         log(
             task="category_publish_task",
             task="category_publish_task",
             function="__init__",
             function="__init__",
@@ -81,7 +82,7 @@ class CategoryColdStartTask(object):
         """
         """
         sql = f"""
         sql = f"""
         SELECT 
         SELECT 
-            article_id, out_account_id, article_index, title, link, read_cnt, status
+            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity
         FROM
         FROM
             crawler_meta_article
             crawler_meta_article
         WHERE 
         WHERE 
@@ -98,7 +99,7 @@ class CategoryColdStartTask(object):
             }
             }
         )
         )
         article_df = DataFrame(article_list,
         article_df = DataFrame(article_list,
-                               columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status'])
+                               columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status', 'llm_sensitivity'])
         return article_df
         return article_df
 
 
     def change_article_status(self, category):
     def change_article_status(self, category):
@@ -162,47 +163,53 @@ class CategoryColdStartTask(object):
         articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
         articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
         total_length = articles_df.shape[0]
         total_length = articles_df.shape[0]
         # 第0层过滤已经发布的文章
         # 第0层过滤已经发布的文章
-        zero_level_funnel_df = articles_df[articles_df['status'] == self.INIT_STATUS]
-        zero_level_funnel_length = zero_level_funnel_df.shape[0]
+        filter_df = articles_df[articles_df['status'] == self.INIT_STATUS]
+        length_level0 = filter_df.shape[0]
 
 
         # 第一层漏斗通过阅读均值倍数过滤
         # 第一层漏斗通过阅读均值倍数过滤
-        first_level_funnel_df = zero_level_funnel_df[zero_level_funnel_df['read_times'] >= self.READ_TIMES_THRESHOLD]
-        first_level_funnel_length = first_level_funnel_df.shape[0]
+        filter_df = filter_df[filter_df['read_times'] >= self.READ_TIMES_THRESHOLD]
+        length_level1 = filter_df.shape[0]
 
 
         # 第二层漏斗通过阅读量过滤
         # 第二层漏斗通过阅读量过滤
-        second_level_funnel_df = first_level_funnel_df[
-            first_level_funnel_df['read_cnt'] >= self.READ_THRESHOLD
+        filter_df = filter_df[
+            filter_df['read_cnt'] >= self.READ_THRESHOLD
             ]
             ]
-        second_level_funnel_length = second_level_funnel_df.shape[0]
+        length_level2 = filter_df.shape[0]
 
 
         # 第三层漏斗通过标题长度过滤
         # 第三层漏斗通过标题长度过滤
-        third_level_funnel_df = second_level_funnel_df[
-            second_level_funnel_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH
+        filter_df = filter_df[
+            (filter_df['title'].str.len() >= self.LIMIT_TITLE_LENGTH)
+            & (filter_df['title'].str.len() <= self.TITLE_LENGTH_MAX)
             ]
             ]
-        third_level_funnel_length = third_level_funnel_df.shape[0]
+        length_level3 = filter_df.shape[0]
 
 
-        # 最后一层通过敏感词过滤
-        filter_df = third_level_funnel_df[
-            (~third_level_funnel_df['title'].str.contains('农历'))
-            & (~third_level_funnel_df['title'].str.contains('太极'))
-            & (~third_level_funnel_df['title'].str.contains('节'))
-            & (~third_level_funnel_df['title'].str.contains('早上好'))
-            & (~third_level_funnel_df['title'].str.contains('赖清德'))
-            & (~third_level_funnel_df['title'].str.contains('普京'))
-            & (~third_level_funnel_df['title'].str.contains('俄'))
-            & (~third_level_funnel_df['title'].str.contains('南海'))
-            & (~third_level_funnel_df['title'].str.contains('台海'))
-            & (~third_level_funnel_df['title'].str.contains('解放军'))
-            & (~third_level_funnel_df['title'].str.contains('蔡英文'))
-            & (~third_level_funnel_df['title'].str.contains('中国'))
+        # 第四层通过敏感词过滤
+        filter_df = filter_df[
+            (~filter_df['title'].str.contains('农历'))
+            & (~filter_df['title'].str.contains('太极'))
+            & (~filter_df['title'].str.contains('节'))
+            & (~filter_df['title'].str.contains('早上好'))
+            & (~filter_df['title'].str.contains('赖清德'))
+            & (~filter_df['title'].str.contains('普京'))
+            & (~filter_df['title'].str.contains('俄'))
+            & (~filter_df['title'].str.contains('南海'))
+            & (~filter_df['title'].str.contains('台海'))
+            & (~filter_df['title'].str.contains('解放军'))
+            & (~filter_df['title'].str.contains('蔡英文'))
+            & (~filter_df['title'].str.contains('中国'))
             ]
             ]
-        final_length = filter_df.shape[0]
+        length_level4 = filter_df.shape[0]
+        # 第五层通过LLM敏感度过滤
+        filter_df = filter_df[
+            ~(filter_df['llm_sensitivity'] > 0)
+        ]
+        length_level5 = filter_df.shape[0]
         log(
         log(
             task="category_publish_task",
             task="category_publish_task",
             function="publish_filter_articles",
             function="publish_filter_articles",
             message="过滤后文章总数",
             message="过滤后文章总数",
             data={
             data={
-                "total_articles": final_length,
+                "total_articles": length_level5,
                 "category": category
                 "category": category
             }
             }
         )
         )
@@ -210,16 +217,19 @@ class CategoryColdStartTask(object):
             title="冷启任务发布通知",
             title="冷启任务发布通知",
             detail={
             detail={
                 "总文章数量": total_length,
                 "总文章数量": total_length,
-                "通过已经发布状态过滤": "过滤数量: {}    剩余数量: {}".format(total_length - zero_level_funnel_length,
-                                                                              zero_level_funnel_length),
+                "通过已经发布状态过滤": "过滤数量: {}    剩余数量: {}".format(
+                    total_length - length_level0, length_level0),
                 "通过阅读均值倍数过滤": "过滤数量: {}    剩余数量: {}".format(
                 "通过阅读均值倍数过滤": "过滤数量: {}    剩余数量: {}".format(
-                    zero_level_funnel_length - first_level_funnel_length, first_level_funnel_length),
+                    length_level0 - length_level1, length_level1),
                 "通过阅读量过滤": "过滤数量: {}    剩余数量: {}".format(
                 "通过阅读量过滤": "过滤数量: {}    剩余数量: {}".format(
-                    first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
+                    length_level1 - length_level2, length_level2),
                 "通过标题长度过滤": "过滤数量: {}    剩余数量: {}".format(
                 "通过标题长度过滤": "过滤数量: {}    剩余数量: {}".format(
-                    second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
-                "通过敏感词过滤": "过滤数量: {}    剩余数量: {}".format(third_level_funnel_length - final_length,
-                                                                        final_length),
+                    length_level2 - length_level3, length_level3),
+                "通过敏感词过滤": "过滤数量: {}    剩余数量: {}".format(
+                    length_level3 - length_level4, length_level4),
+                "通过LLM敏感度过滤": "过滤数量: {}    剩余数量: {}".format(
+                    length_level4 - length_level5, length_level5
+                ),
                 "品类": category,
                 "品类": category,
                 "阅读均值倍数阈值": self.READ_TIMES_THRESHOLD,
                 "阅读均值倍数阈值": self.READ_TIMES_THRESHOLD,
                 "阅读量阈值": self.READ_THRESHOLD,
                 "阅读量阈值": self.READ_THRESHOLD,