Browse Source

Merge branch '2025-05-16-bug-fix' of luojunhui/LongArticlesJob into master

luojunhui cách đây 5 tháng
mục cha
commit
cc11cb8026
2 tập tin đã thay đổi với 39 bổ sung và 23 xóa
  1. 2 3
      account_cold_start_daily.py
  2. 37 20
      cold_start/publish/publishCategoryArticles.py

+ 2 - 3
account_cold_start_daily.py

@@ -109,7 +109,7 @@ class AccountColdStartDailyTask(object):
             )
 
 
-def main(date_str, method_list=None, article_source=None):
+def main(method_list=None, article_source=None):
     """
     main job, use crontab to do job daily
     :return:
@@ -135,12 +135,11 @@ if __name__ == '__main__':
 
     # 执行头条发布
     main(
-        date_str=run_date,
         method_list=['history', 'tech', 'finance', 'entertainment'],
         article_source='toutiao'
     )
     # 执行微信抓取发布
-    main(date_str=run_date)
+    main()
 
     # 执行抓取
     crawler_task(

+ 37 - 20
cold_start/publish/publishCategoryArticles.py

@@ -83,21 +83,40 @@ class CategoryColdStartTask(object):
         """
         从长文 meta 库中获取冷启文章
         :return:
-
         """
-        sql = f"""
-            select 
-                article_id, title, link,  llm_sensitivity, score, category_by_ai
-            from crawler_meta_article t1 
-            join crawler_meta_article_accounts_read_avg t2 on t1.out_account_id = t2.gh_id and t1.article_index = t2.position
-            where category = '{category}' 
-                and platform = '{article_source}' 
-                and title_sensitivity = {TITLE_NOT_SENSITIVE} 
-                and t1.status = {self.INIT_STATUS}
-                and t1.read_cnt / t2.read_avg >= {self.READ_TIMES_THRESHOLD}
-                and t1.read_cnt >= {self.READ_THRESHOLD}
-            ORDER BY score DESC;
-            """
+        match article_source:
+            case 'weixin':
+                sql = f"""
+                    select 
+                        article_id, title, link,  llm_sensitivity, score, category_by_ai
+                    from crawler_meta_article t1 
+                    join crawler_meta_article_accounts_read_avg t2 on t1.out_account_id = t2.gh_id and t1.article_index = t2.position
+                    where category = '{category}' 
+                        and platform = '{article_source}' 
+                        and title_sensitivity = {TITLE_NOT_SENSITIVE} 
+                        and t1.status = {self.INIT_STATUS}
+                        and t1.read_cnt / t2.read_avg >= {self.READ_TIMES_THRESHOLD}
+                        and t1.read_cnt >= {self.READ_THRESHOLD}
+                    ORDER BY score DESC;
+                    """
+            case 'toutiao':
+                sql = f"""
+                    select article_id, title, link,  llm_sensitivity, score, category_by_ai
+                    from crawler_meta_article
+                    where category = '{category}' 
+                        and platform = '{article_source}'
+                        and status = {self.INIT_STATUS}
+                    """
+            case _:
+                log(
+                    task="category_publish_task",
+                    function="get_articles_from_meta_table",
+                    message="不支持的文章来源",
+                    data={
+                        "article_source": article_source
+                    }
+                )
+                return None
         article_list = self.db_client.select(sql)
         log(
             task="category_publish_task",
@@ -256,20 +275,18 @@ class CategoryColdStartTask(object):
         头条文章过滤漏斗
         """
         total_length = articles_df.shape[0]
-        # 第一层漏斗通过状态过滤
-        zero_level_funnel_df = articles_df[articles_df['status'] == self.INIT_STATUS]
-        zero_level_funnel_length = zero_level_funnel_df.shape[0]
+        # # 第一层漏斗通过状态过滤
+        # zero_level_funnel_df = articles_df[articles_df['status'] == self.INIT_STATUS]
+        # zero_level_funnel_length = zero_level_funnel_df.shape[0]
         bot(
             title="账号冷启动---头条推荐流发布",
             detail={
                 "category": category,
                 "总文章数量": total_length,
-                "通过已经发布状态过滤": "过滤数量: {}    剩余数量: {}".format(total_length - zero_level_funnel_length,
-                                                                              zero_level_funnel_length),
             },
             mention=False
         )
-        return zero_level_funnel_df
+        return articles_df
 
     def update_article_sensitive_status(self, article_id, status):
         """