6 months ago · 27b84316b7
--- a/run_category_model_v1.py
+++ b/run_category_model_v1.py
@@ -66,7 +66,7 @@ def compare_version(db_manager, dt_version, new_version, account_id_map):
 
				     # new record
			
 
				     all_gh_ids = set(list(new_version.keys()) + list(last_version.keys()))
			
 
				     for gh_id in all_gh_ids:
			
 
				-        account_name = account_id_map[gh_id]
			
 
				+        account_name = account_id_map.get(gh_id, None)
			
 
				         if gh_id not in last_version:
			
 
				             print(f"new account {account_name}: {new_version[gh_id]}")
			
 
				         elif gh_id not in new_version:
			
@@ -86,6 +86,8 @@ def main():
 
				     parser = ArgumentParser()
			
 
				     parser.add_argument('-n', '--dry-run', action='store_true', help='do not update database')
			
 
				     parser.add_argument('--run-at', help='dt, also for version')
			
 
				+    parser.add_argument('--print-matrix', action='store_true')
			
 
				+    parser.add_argument('--print-residual', action='store_true')
			
 
				     args = parser.parse_args()
			
 
				 
			
 
				     run_date = datetime.today()
			
@@ -101,8 +103,8 @@ def main():
 
				     cate_model = CategoryRegressionV1()
			
 
				     df = cate_model.preprocess_data(raw_df)
			
 
				 
			
 
				-    if args.dry_run and False:
			
 
				-        cate_model.build(df)
			
 
				+    if args.dry_run and args.print_matrix:
			
 
				+        cate_model.build_and_print_matrix(df)
			
 
				         return
			
 
				 
			
 
				     create_timestamp = int(time.time())
			
@@ -116,14 +118,15 @@ def main():
 
				         .set_index('gh_id')['account_name'].to_dict()
			
 
				 
			
 
				     account_negative_cates = {k: [] for k in account_ids}
			
 
				+    P_VALUE_THRESHOLD = 0.15
			
 
				     for account_id in account_ids:
			
 
				         sub_df = df[df['gh_id'] == account_id]
			
 
				         account_name = account_id_map[account_id]
			
 
				         sample_count = len(sub_df)
			
 
				         if sample_count < 5:
			
 
				             continue
			
 
				-        print_error = False
			
 
				-        params, t_stats, p_values = cate_model.run_ols_linear_regression(sub_df, print_error)
			
 
				+        params, t_stats, p_values = cate_model.run_ols_linear_regression(
			
 
				+            sub_df, args.print_residual, P_VALUE_THRESHOLD)
			
 
				         current_record = {}
			
 
				         current_record['dt'] = dt_version
			
 
				         current_record['gh_id'] = account_id
			
@@ -132,12 +135,13 @@ def main():
 
				         for name, param, p_value in zip(param_names, params, p_values):
			
 
				             cate_name = param_to_category_map.get(name, None)
			
 
				             # 用于排序的品类相关性
			
 
				-            if abs(param) > 0.1 and p_value < 0.1 and cate_name is not None:
			
 
				+            if abs(param) > 0.1 and p_value < P_VALUE_THRESHOLD and cate_name is not None:
			
 
				+                scale_factor = min(0.1 / p_value, 1)
			
 
				                 print(f"{account_id} {account_name} {cate_name} {param:.3f} {p_value:.3f}")
			
 
				-                truncate_param = round(max(min(param, 0.25), -0.3), 6)
			
 
				+                truncate_param = round(max(min(param, 0.25), -0.3) * scale_factor, 6)
			
 
				                 current_record['category_map'][cate_name] = truncate_param
			
 
				             # 用于冷启文章分配的负向品类
			
 
				-            if param < -0.1 and cate_name is not None and p_value < 0.3:
			
 
				+            if param < -0.1 and cate_name is not None and p_value < P_VALUE_THRESHOLD:
			
 
				                 account_negative_cates[account_id].append(cate_name)
			
 
				                 # print((account_name, cate_name, param, p_value))
			
 
				         if not current_record['category_map']:
			
--- a/src/long_articles/category_models.py
+++ b/src/long_articles/category_models.py
@@ -32,11 +32,13 @@ class CategoryRegressionV1:
 
				             df[colname] = df['category'] == cate
			
 
				             df[colname] = df[colname].astype(int)
			
 
				 
			
 
				-        df['ClassY'] = df['read_avg_rate'] > 1
			
 
				+        df['read_avg_rate'] = df['read_avg_rate'].clip(upper=1.3)
			
 
				+        df['days_decrease'] = df['first_pub_interval'] * (-0.2 / 120)
			
 
				+        # df['ClassY'] = df['read_avg_rate'] > 1
			
 
				         df['RegressionY'] = df['read_avg_rate']
			
 
				         return df
			
 
				 
			
 
				-    def build_and_print(self, df, account_name):
			
 
				+    def _build_and_print_by_account(self, df, account_name):
			
 
				         if account_name is not None:
			
 
				             sub_df = df[df['account_name'] == account_name]
			
 
				         else:
			
@@ -50,18 +52,19 @@ class CategoryRegressionV1:
 
				             row += f'\t{param:.3f}\t{p_value:.3f}'
			
 
				         print(row)
			
 
				 
			
 
				-    def build(self, df):
			
 
				+    def build_and_print_matrix(self, df):
			
 
				         p_value_column_names = '\t'.join([name + "\tp-" + name for name in
			
 
				                                           ['bias'] + self.features])
			
 
				         print('account\tsamples\t{}'.format(p_value_column_names))
			
 
				-        self.build_and_print(df, None)
			
 
				+        self._build_and_print_by_account(df, None)
			
 
				         for account_name in df['account_name'].unique():
			
 
				-            self.build_and_print(df, account_name)
			
 
				+            self._build_and_print_by_account(df, account_name)
			
 
				 
			
 
				     def get_param_names(self):
			
 
				         return ['bias'] + self.features
			
 
				 
			
 
				-    def run_ols_linear_regression(self, df, print_residual=False):
			
 
				+    def run_ols_linear_regression(self, df, print_residual=False,
			
 
				+                                  print_p_value_threshold=0.1):
			
 
				         X = df[self.features]  # 特征列
			
 
				         y = df['RegressionY']  # 目标变量
			
 
				         X = sm.add_constant(X)
			
@@ -79,6 +82,7 @@ class CategoryRegressionV1:
 
				             new_x = df[['title', 'category']].copy()
			
 
				             new_x['residual'] = residuals
			
 
				             new_x['y'] = y
			
 
				+            select_idx = []
			
 
				             for index, row in new_x.iterrows():
			
 
				                 param_name = category_name_map.get(row['category'], None)
			
 
				                 if not param_name:
			
@@ -86,11 +90,13 @@ class CategoryRegressionV1:
 
				                 param_index = self.features.index(param_name) + 1
			
 
				                 param = params.iloc[param_index]
			
 
				                 p_value = p_values.iloc[param_index]
			
 
				-                if p_value < 0.1:
			
 
				-                    print(f"{row['y']:.3f}\t{row['residual']:.3f}\t{row['category']}\t{param:.2f}\t{row['title']}")
			
 
				-            r_min = residuals.min()
			
 
				-            r_max = residuals.max()
			
 
				-            r_avg = residuals.mean()
			
 
				+                if p_value < print_p_value_threshold:
			
 
				+                    print(f"{row['y']:.3f}\t{row['residual']:.3f}\t{row['category']}\t{param:.2f}\t{row['title'][0:30]}")
			
 
				+                    select_idx.append(index)
			
 
				+            has_category_residuals = residuals.loc[select_idx]
			
 
				+            r_min = has_category_residuals.min()
			
 
				+            r_max = has_category_residuals.max()
			
 
				+            r_avg = has_category_residuals.mean()
			
 
				             print(f"residuals min: {r_min:.3f}, max: {r_max:.3f}, mean: {r_avg:.3f}")
			
 
				 
			
 
				         return params, t_stats, p_values