Forráskód Böngészése

publishCategoryArticles.py

计算阅读均值倍数使用该品类账号全部文章
luojunhui 8 hónapja
szülő
commit
c825bca4ae
1 módosított fájl, 10 hozzáadás és 5 törlés
  1. +10 -5
      coldStartTasks/publish/publishCategoryArticles.py

+ 10 - 5
coldStartTasks/publish/publishCategoryArticles.py

@@ -49,11 +49,11 @@ class CategoryColdStartTask(object):
         """
         """
         sql = f"""
         sql = f"""
         SELECT 
         SELECT 
-            article_id, out_account_id, article_index, title, link, read_cnt
+            article_id, out_account_id, article_index, title, link, read_cnt, status
         FROM
         FROM
             crawler_meta_article
             crawler_meta_article
         WHERE 
         WHERE 
-            category = "{category}" and status = '{self.INIT_STATUS}';
+            category = "{category}";
         """
         """
         article_list = self.db_client.select(sql)
         article_list = self.db_client.select(sql)
         log(
         log(
@@ -65,7 +65,7 @@ class CategoryColdStartTask(object):
                 "category": category
                 "category": category
             }
             }
         )
         )
-        article_df = DataFrame(article_list, columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt'])
+        article_df = DataFrame(article_list, columns=['article_id', 'gh_id', 'position', 'title', 'link', 'read_cnt', 'status'])
         return article_df
         return article_df
 
 
     def change_article_status(self, category):
     def change_article_status(self, category):
@@ -131,8 +131,12 @@ class CategoryColdStartTask(object):
         articles_df['average_read'] = articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
         articles_df['average_read'] = articles_df.groupby(['gh_id', 'position'])['read_cnt'].transform('mean')
         articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
         articles_df['read_times'] = articles_df['read_cnt'] / articles_df['average_read']
         total_length = articles_df.shape[0]
         total_length = articles_df.shape[0]
+        # 第0层过滤已经发布的文章
+        zero_level_funnel_df = articles_df[articles_df['status'] == self.INIT_STATUS]
+        zero_level_funnel_length = zero_level_funnel_df.shape[0]
+
         # 第一层漏斗通过阅读均值倍数过滤
         # 第一层漏斗通过阅读均值倍数过滤
-        first_level_funnel_df = articles_df[articles_df['read_times'] >= self.READ_TIMES_THRESHOLD]
+        first_level_funnel_df = zero_level_funnel_df[zero_level_funnel_df['read_times'] >= self.READ_TIMES_THRESHOLD]
         first_level_funnel_length = first_level_funnel_df.shape[0]
         first_level_funnel_length = first_level_funnel_df.shape[0]
 
 
         # 第二层漏斗通过阅读量过滤
         # 第二层漏斗通过阅读量过滤
@@ -177,7 +181,8 @@ class CategoryColdStartTask(object):
             title="冷启任务发布通知",
             title="冷启任务发布通知",
             detail={
             detail={
                 "总文章数量": total_length,
                 "总文章数量": total_length,
-                "通过阅读均值倍数过滤": "过滤数量: {}    剩余数量: {}".format(total_length - first_level_funnel_length, first_level_funnel_length),
+                "通过已经发布状态过滤": "过滤数量: {}    剩余数量: {}".format(total_length - zero_level_funnel_length, zero_level_funnel_length),
+                "通过阅读均值倍数过滤": "过滤数量: {}    剩余数量: {}".format(zero_level_funnel_length - first_level_funnel_length, first_level_funnel_length),
                 "通过阅读量过滤": "过滤数量: {}    剩余数量: {}".format(first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
                 "通过阅读量过滤": "过滤数量: {}    剩余数量: {}".format(first_level_funnel_length - second_level_funnel_length, second_level_funnel_length),
                 "通过标题长度过滤": "过滤数量: {}    剩余数量: {}".format(second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
                 "通过标题长度过滤": "过滤数量: {}    剩余数量: {}".format(second_level_funnel_length - third_level_funnel_length, third_level_funnel_length),
                 "通过敏感词过滤":  "过滤数量: {}    剩余数量: {}".format(third_level_funnel_length - final_length, final_length),
                 "通过敏感词过滤":  "过滤数量: {}    剩余数量: {}".format(third_level_funnel_length - final_length, final_length),