luojunhui 2 miesiące temu
rodzic
commit
7f808d9f13

+ 2 - 2
account_cold_start_daily.py

@@ -8,7 +8,7 @@ from applications import longArticlesMySQL, bot
 from coldStartTasks.crawler.weixinCategoryCrawler import weixinCategory
 from coldStartTasks.publish.publishCategoryArticles import CategoryColdStartTask
 
-DEFAULT_CATEGORY_LIST = ['account_association']
+DEFAULT_CATEGORY_LIST = ['1030-手动挑号']
 
 
 class AccountColdStartDailyTask(object):
@@ -109,7 +109,7 @@ def main(category_list=None, article_source=None):
         if article_source == 'weixin':
             task.crawler_task(category_list=category_list)
 
-        # task.publish_task(category_list=category_list, article_source=article_source)
+        task.publish_task(category_list=category_list, article_source=article_source)
 
 
 if __name__ == '__main__':

+ 13 - 4
coldStartTasks/publish/publishCategoryArticles.py

@@ -82,11 +82,12 @@ class CategoryColdStartTask(object):
         """
         sql = f"""
         SELECT 
-            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity
+            article_id, out_account_id, article_index, title, link, read_cnt, status, llm_sensitivity, score
         FROM
             crawler_meta_article
         WHERE 
-            category = "{category}" and platform = "{article_source}";
+            category = "{category}" and platform = "{article_source}"
+        ORDER BY score DESC;
         """
         article_list = self.db_client.select(sql)
         log(
@@ -99,7 +100,7 @@ class CategoryColdStartTask(object):
             }
         )
         article_df = DataFrame(article_list,
-                               columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status', 'llm_sensitivity'])
+                               columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status', 'llm_sensitivity', 'score'])
         return article_df
 
     def change_article_status(self, category):
@@ -206,6 +207,11 @@ class CategoryColdStartTask(object):
             ~(filter_df['llm_sensitivity'] > 0)
         ]
         length_level5 = filter_df.shape[0]
+
+        # Sixth filter layer: keep only articles whose relevance score is above 0.4
+        filter_df = filter_df[filter_df['score'] > 0.4]
+        length_level6 = filter_df.shape[0]
+
         log(
             task="category_publish_task",
             function="publish_filter_articles",
@@ -232,6 +238,9 @@ class CategoryColdStartTask(object):
                 "通过LLM敏感度过滤": "过滤数量: {}    剩余数量: {}".format(
                     length_level4 - length_level5, length_level5
                 ),
+                "通过相关性分数过滤": "过滤数量: {}    剩余数量: {}".format(
+                    length_level5 - length_level6, length_level6
+                ),
                 "品类": category,
                 "阅读均值倍数阈值": self.READ_TIMES_THRESHOLD,
                 "阅读量阈值": self.READ_THRESHOLD,
@@ -239,7 +248,7 @@ class CategoryColdStartTask(object):
             },
             mention=False
         )
-        return filter_df
+        return filter_df[:1000]
 
     def filter_toutiao_articles(self, articles_df, category):
         """