Browse Source

Merge branch '2024-11-12-luojunhui-article-exit-v2' of luojunhui/LongArticlesJob into master

luojunhui 7 months ago
parent
commit
b09c99a662
1 changed file with 39 additions and 5 deletions
  1. 39 5
      flow_pool/exit_article_with_title.py

+ 39 - 5
flow_pool/exit_article_with_title.py

@@ -3,7 +3,7 @@
 """
 import traceback
 
-import pandas as pd
+from datetime import datetime, timedelta
 
 from applications import PQMySQL, longArticlesMySQL, bot, log
 from applications.aiditApi import get_generated_article_list
@@ -84,6 +84,25 @@ class ArticleTitleStatusManager(object):
         articles = self.lam_client.select(sql)
         return [i[0] for i in articles]
 
+    def get_bad_articles_v2(self, publish_date_threshold, discovery_times_threshold) -> list[str]:
+        """
+        Return titles whose earliest record predates a cutoff and that were recorded at least a given number of times.
+        :param publish_date_threshold: exclusive cutoff for min(date_str); main() passes a '%Y%m%d' string, inserted unquoted
+        :param discovery_times_threshold: inclusive minimum number of rows (position > 2, fans > 10000) per title
+        :return: list of matching titles (first column of each result row)
+        """
+        sql = f"""
+            SELECT
+                title, count(1) as title_count, min(date_str) as min_date
+            FROM
+                datastat_sort_strategy
+            WHERE position > 2 and fans > 10000
+            GROUP BY title
+            HAVING title_count >= {discovery_times_threshold} and min_date < {publish_date_threshold};
+        """
+        articles = self.lam_client.select(sql)
+        return [i[0] for i in articles]
+
     def save_titles(self, title_list, status) -> int:
         """
         修改标题状态
@@ -149,6 +168,9 @@ def main():
     ARTICLE_EXIT_STATUS = -1
     READ_TIMES_ON_AVG_THRESHOLD = 0.5
     DISCOVERY_TIMES_THRESHOLD = 10
+    PUBLISH_TIMES_THRESHOLD = 5
+    DAYS_THRESHOLD = 30
+    FIRST_PUBLISH_DATE_THRESHOLD = (datetime.now() - timedelta(days=DAYS_THRESHOLD)).strftime('%Y%m%d')
 
     article_title_manager = ArticleTitleStatusManager()
     article_title_manager.init_database()
@@ -160,7 +182,7 @@ def main():
         status=UP_LEVEL_STATUS
     )
 
-    # 处理退场标题
+    # 处理退场标题V1
     exit_article_list = article_title_manager.get_bad_articles(
         read_times_on_avg_threshold=READ_TIMES_ON_AVG_THRESHOLD,
         discovery_times_threshold=DISCOVERY_TIMES_THRESHOLD
@@ -169,13 +191,25 @@ def main():
         title_list=exit_article_list,
         status=ARTICLE_EXIT_STATUS)
 
+    # 处理退场标题v2
+    exit_article_list_v2 = article_title_manager.get_bad_articles_v2(
+        publish_date_threshold=FIRST_PUBLISH_DATE_THRESHOLD,
+        discovery_times_threshold=PUBLISH_TIMES_THRESHOLD
+    )
+    exit_success_count_v2 = article_title_manager.save_titles(
+        title_list=exit_article_list_v2,
+        status=ARTICLE_EXIT_STATUS)
+
     bot(
         title="冷启动文章晋级/退场完成",
         detail={
             "晋级文章数量": up_level_success_count,
-            "退场文章数量": exit_success_count,
-            "阅读均值倍数阈值": READ_TIMES_ON_AVG_THRESHOLD,
-            "探索次数阈值": DISCOVERY_TIMES_THRESHOLD
+            "策略1:退场文章数量": exit_success_count,
+            "策略2:退场文章数量": exit_success_count_v2,
+            "策略1:阅读均值倍数阈值": READ_TIMES_ON_AVG_THRESHOLD,
+            "策略1:探索次数阈值": DISCOVERY_TIMES_THRESHOLD,
+            "策略2:发布次数阈值": PUBLISH_TIMES_THRESHOLD,
+            "策略2:发布天数阈值": DAYS_THRESHOLD
         },
         mention=False
     )