Explorar el Código

article_exit_with_title.py

冷启动文章晋级&&退场管理
luojunhui hace 8 meses
padre
commit
dfa0f8de35
Se ha modificado 1 fichero con 11 adiciones y 21 borrados
  1. 11 21
      flow_pool/article_exit_with_title.py

+ 11 - 21
flow_pool/article_exit_with_title.py

@@ -68,34 +68,22 @@ class ArticleExitWithTitle(object):
             )
         return True
 
-    def get_discovery_published_articles(self) -> pd.DataFrame:
+    def bad_article_manager(self, read_times_on_avg_threshold, discovery_times_threshold) -> list[str]:
         """
+        找出质量很差的文章标题,将该标题设置为退场状态
         :return:
         """
         sql = f"""
             SELECT
-                title, max(read_rate), count(1) as title_count
+                title, max(read_rate) as max_rate, count(1) as title_count
             FROM
                 datastat_sort_strategy
             WHERE position > 2 and fans > 10000
-            GROUP BY title;
+            GROUP BY title
+            HAVING title_count >= {discovery_times_threshold} and max_rate < {read_times_on_avg_threshold};
         """
-        articles = self.pq_client.select(sql)
-        article_df = pd.DataFrame(articles, columns=['title', 'max_read_times_on_avg', 'articles_count'])
-        return article_df
-
-    def bad_article_manager(self, read_times_on_avg_threshold, discovery_times_threshold) -> list[str]:
-        """
-        找出质量很差的文章标题,将该标题设置为退场状态
-        :return:
-        """
-        discovery_published_articles_df = self.get_discovery_published_articles()
-        target_bad_dataframe = discovery_published_articles_df[
-            (discovery_published_articles_df['max_read_times_on_avg'] < read_times_on_avg_threshold)
-            & (discovery_published_articles_df['articles_count'] < discovery_times_threshold)
-            ]
-        target_bad_title_list = target_bad_dataframe['title'].tolist()
-        return target_bad_title_list
+        articles = self.lam_client.select(sql)
+        return [i[0] for i in articles]
 
     def record_title_list(self, title_list, status) -> int:
         """
@@ -161,7 +149,7 @@ def main():
     UP_LEVEL_STATUS = 1
     ARTICLE_EXIT_STATUS = -1
     READ_TIMES_ON_AVG_THRESHOLD = 0.5
-    DISCOVERY_TIMES_THRESHOLD = 3
+    DISCOVERY_TIMES_THRESHOLD = 10
 
     article_title_manager = ArticleExitWithTitle()
     article_title_manager.init_database()
@@ -169,6 +157,7 @@ def main():
     # 处理晋级标题
     up_level_title = get_level_up_articles()
     up_level_success_count = article_title_manager.record_title_list(title_list=up_level_title, status=UP_LEVEL_STATUS)
+    # up_level_success_count = 0
 
     # 处理退场标题
     exit_article_list = article_title_manager.bad_article_manager(
@@ -191,4 +180,5 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
+