ソースを参照

Merge branch '2025-04-21-luojunhui-read-avg-ci-upper' of luojunhui/LongArticlesJob into master

luojunhui 6 ヶ月 前
コミット
a04a7efc91
2 ファイル変更40 行追加4 行削除
  1. 6 0
      applications/const/__init__.py
  2. 34 4
      updateAccountV3.py

+ 6 - 0
applications/const/__init__.py

@@ -147,6 +147,12 @@ class UpdateAccountReadAvgTaskConst:
     USING_STATUS = 1
     NOT_USING_STATUS = 0
 
+    # statistics period: number of most recent read_avg records used when computing the confidence interval
+    STAT_PERIOD = 30
+
+    # default upper quantile of the t-distribution: 0.975 is the upper bound of a two-sided 95% confidence interval
+    DEFAULT_UPPER_QUANTILE = 0.975
+
 
 class WeixinVideoCrawlerConst:
     """

+ 34 - 4
updateAccountV3.py

@@ -4,6 +4,8 @@
 import json
 import time
 
+import numpy as np
+from scipy import stats
 from tqdm import tqdm
 from datetime import datetime, timedelta
 from argparse import ArgumentParser
@@ -60,6 +62,28 @@ class UpdateAccountInfoVersion3(object):
             account_read_rate_dict[key] = item['read_rate_avg']
         return account_read_rate_dict
 
+    def cal_read_avg_ci(self, gh_id, position):
+        """
+        计算阅读均值的置信区间
+        """
+        fetch_query = f"""
+            select read_avg
+            from {read_avg_table}
+            where gh_id = %s and position = %s 
+            order by update_time desc limit {const.STAT_PERIOD};
+        """
+        fetch_response_list = self.piaoquan_crawler_db_client.fetch(
+            query=fetch_query, params=(gh_id, position), cursor_type=DictCursor
+        )
+        read_avg_list = [i["read_avg"] for i in fetch_response_list]
+        n = len(read_avg_list)
+        mean = np.mean(read_avg_list)
+        std = np.std(read_avg_list, ddof=1)
+        se = std / np.sqrt(n)
+        t = stats.t.ppf(const.DEFAULT_UPPER_QUANTILE, df=n - 1)
+        upper_t = mean + t * se
+        return upper_t
+
     def do_task_list(self, dt):
         """
         do it
@@ -95,12 +119,16 @@ class UpdateAccountInfoVersion3(object):
                         # cal read avg
                         read_avg = fans * read_rate_avg
 
+                        # cal read avg ci upper
+                        read_avg_ci_upper = self.cal_read_avg_ci(gh_id, index)
+
                         # insert into database
                         insert_sql = f"""
                             insert into {read_avg_table}
-                            (gh_id, position, update_time, account_name, fans, read_avg, like_avg, status, account_type, account_mode, account_source, account_status, business_type, read_rate_avg)
+                            (gh_id, position, update_time, account_name, fans, read_avg, like_avg, status, account_type,
+                             account_mode, account_source, account_status, business_type, read_rate_avg, read_avg_ci_upper)
                             values
-                            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+                            (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
                         """
                         try:
                             self.piaoquan_crawler_db_client.save(
@@ -119,13 +147,14 @@ class UpdateAccountInfoVersion3(object):
                                     account['account_source'],
                                     account['status'],
                                     business_type,
-                                    read_rate_avg
+                                    read_rate_avg,
+                                    read_avg_ci_upper
                                 )
                             )
                         except Exception as e:
                             update_sql = f"""
                                 update {read_avg_table}
-                                set fans = %s, read_avg = %s, read_rate_avg = %s
+                                set fans = %s, read_avg = %s, read_rate_avg = %s, read_avg_ci_upper = %s
                                 where gh_id = %s and position = %s and update_time = %s
                             """
                             try:
@@ -135,6 +164,7 @@ class UpdateAccountInfoVersion3(object):
                                         fans,
                                         read_avg,
                                         read_rate_avg,
+                                        read_avg_ci_upper,
                                         account['gh_id'],
                                         index,
                                         dt