Bläddra i källkod

Update category model: better print for residuals

StrayWarrior 4 månader sedan
förälder
incheckning
27b84316b7
2 ändrade filer med 29 tillägg och 19 borttagningar
  1. + 12 - 8
      run_category_model_v1.py
  2. + 17 - 11
      src/long_articles/category_models.py

+ 12 - 8
run_category_model_v1.py

@@ -66,7 +66,7 @@ def compare_version(db_manager, dt_version, new_version, account_id_map):
     # new record
     all_gh_ids = set(list(new_version.keys()) + list(last_version.keys()))
     for gh_id in all_gh_ids:
-        account_name = account_id_map[gh_id]
+        account_name = account_id_map.get(gh_id, None)
         if gh_id not in last_version:
             print(f"new account {account_name}: {new_version[gh_id]}")
         elif gh_id not in new_version:
@@ -86,6 +86,8 @@ def main():
     parser = ArgumentParser()
     parser.add_argument('-n', '--dry-run', action='store_true', help='do not update database')
     parser.add_argument('--run-at', help='dt, also for version')
+    parser.add_argument('--print-matrix', action='store_true')
+    parser.add_argument('--print-residual', action='store_true')
     args = parser.parse_args()
 
     run_date = datetime.today()
@@ -101,8 +103,8 @@ def main():
     cate_model = CategoryRegressionV1()
     df = cate_model.preprocess_data(raw_df)
 
-    if args.dry_run and False:
-        cate_model.build(df)
+    if args.dry_run and args.print_matrix:
+        cate_model.build_and_print_matrix(df)
         return
 
     create_timestamp = int(time.time())
@@ -116,14 +118,15 @@ def main():
         .set_index('gh_id')['account_name'].to_dict()
 
     account_negative_cates = {k: [] for k in account_ids}
+    P_VALUE_THRESHOLD = 0.15
     for account_id in account_ids:
         sub_df = df[df['gh_id'] == account_id]
         account_name = account_id_map[account_id]
         sample_count = len(sub_df)
         if sample_count < 5:
             continue
-        print_error = False
-        params, t_stats, p_values = cate_model.run_ols_linear_regression(sub_df, print_error)
+        params, t_stats, p_values = cate_model.run_ols_linear_regression(
+            sub_df, args.print_residual, P_VALUE_THRESHOLD)
         current_record = {}
         current_record['dt'] = dt_version
         current_record['gh_id'] = account_id
@@ -132,12 +135,13 @@ def main():
         for name, param, p_value in zip(param_names, params, p_values):
             cate_name = param_to_category_map.get(name, None)
             # 用于排序的品类相关性
-            if abs(param) > 0.1 and p_value < 0.1 and cate_name is not None:
+            if abs(param) > 0.1 and p_value < P_VALUE_THRESHOLD and cate_name is not None:
+                scale_factor = min(0.1 / p_value, 1)
                 print(f"{account_id} {account_name} {cate_name} {param:.3f} {p_value:.3f}")
-                truncate_param = round(max(min(param, 0.25), -0.3), 6)
+                truncate_param = round(max(min(param, 0.25), -0.3) * scale_factor, 6)
                 current_record['category_map'][cate_name] = truncate_param
             # 用于冷启文章分配的负向品类
-            if param < -0.1 and cate_name is not None and p_value < 0.3:
+            if param < -0.1 and cate_name is not None and p_value < P_VALUE_THRESHOLD:
                 account_negative_cates[account_id].append(cate_name)
                 # print((account_name, cate_name, param, p_value))
         if not current_record['category_map']:

+ 17 - 11
src/long_articles/category_models.py

@@ -32,11 +32,13 @@ class CategoryRegressionV1:
             df[colname] = df['category'] == cate
             df[colname] = df[colname].astype(int)
 
-        df['ClassY'] = df['read_avg_rate'] > 1
+        df['read_avg_rate'] = df['read_avg_rate'].clip(upper=1.3)
+        df['days_decrease'] = df['first_pub_interval'] * (-0.2 / 120)
+        # df['ClassY'] = df['read_avg_rate'] > 1
         df['RegressionY'] = df['read_avg_rate']
         return df
 
-    def build_and_print(self, df, account_name):
+    def _build_and_print_by_account(self, df, account_name):
         if account_name is not None:
             sub_df = df[df['account_name'] == account_name]
         else:
@@ -50,18 +52,19 @@ class CategoryRegressionV1:
             row += f'\t{param:.3f}\t{p_value:.3f}'
         print(row)
 
-    def build(self, df):
+    def build_and_print_matrix(self, df):
         p_value_column_names = '\t'.join([name + "\tp-" + name for name in
                                           ['bias'] + self.features])
         print('account\tsamples\t{}'.format(p_value_column_names))
-        self.build_and_print(df, None)
+        self._build_and_print_by_account(df, None)
         for account_name in df['account_name'].unique():
-            self.build_and_print(df, account_name)
+            self._build_and_print_by_account(df, account_name)
 
     def get_param_names(self):
         return ['bias'] + self.features
 
-    def run_ols_linear_regression(self, df, print_residual=False):
+    def run_ols_linear_regression(self, df, print_residual=False,
+                                  print_p_value_threshold=0.1):
         X = df[self.features]  # 特征列
         y = df['RegressionY']  # 目标变量
         X = sm.add_constant(X)
@@ -79,6 +82,7 @@ class CategoryRegressionV1:
             new_x = df[['title', 'category']].copy()
             new_x['residual'] = residuals
             new_x['y'] = y
+            select_idx = []
             for index, row in new_x.iterrows():
                 param_name = category_name_map.get(row['category'], None)
                 if not param_name:
@@ -86,11 +90,13 @@ class CategoryRegressionV1:
                 param_index = self.features.index(param_name) + 1
                 param = params.iloc[param_index]
                 p_value = p_values.iloc[param_index]
-                if p_value < 0.1:
-                    print(f"{row['y']:.3f}\t{row['residual']:.3f}\t{row['category']}\t{param:.2f}\t{row['title']}")
-            r_min = residuals.min()
-            r_max = residuals.max()
-            r_avg = residuals.mean()
+                if p_value < print_p_value_threshold:
+                    print(f"{row['y']:.3f}\t{row['residual']:.3f}\t{row['category']}\t{param:.2f}\t{row['title'][0:30]}")
+                    select_idx.append(index)
+            has_category_residuals = residuals.loc[select_idx]
+            r_min = has_category_residuals.min()
+            r_max = has_category_residuals.max()
+            r_avg = has_category_residuals.mean()
             print(f"residuals min: {r_min:.3f}, max: {r_max:.3f}, mean: {r_avg:.3f}")
 
         return params, t_stats, p_values