Browse Source

Update category_models preprocess: debias second-position read rate

StrayWarrior 4 months ago
parent
commit
af9e3f8632
2 changed files with 9 additions and 1 deletion
  1. + 2 - 1
      run_category_model_v1.py
  2. + 7 - 0
      src/long_articles/category_models.py

+ 2 - 1
run_category_model_v1.py

@@ -28,7 +28,7 @@ NIGHT_ACCOUNTS = ('gh_12523d39d809','gh_df4a630c04db','gh_f67df16f4670','gh_ca44
 def prepare_raw_data(dt_begin, dt_end):
     data_fields = ['dt', 'gh_id', 'account_name', 'title', 'similarity',
                    'view_count_rate', 'category', 'read_avg',
-                   'read_avg_rate', 'first_pub_interval']
+                   'read_avg_rate', 'first_pub_interval', '`index`']
     fields_str = ','.join(data_fields)
     db_manager = MySQLManager(Config().MYSQL_LONG_ARTICLES)
     night_accounts_condition = str(NIGHT_ACCOUNTS)
@@ -42,6 +42,7 @@ def prepare_raw_data(dt_begin, dt_end):
         """
     rows = db_manager.select(sql)
     df = pd.DataFrame(rows, columns=data_fields)
+    df.rename(columns={'`index`': 'index'}, inplace=True)
     df = df.drop_duplicates(['dt', 'gh_id', 'title'])
     return df
 

+ 7 - 0
src/long_articles/category_models.py

@@ -32,6 +32,13 @@ class CategoryRegressionV1:
             df[colname] = df['category'] == cate
             df[colname] = df[colname].astype(int)
 
+        # 次条阅读量校正
+        df_idx1 = df[df['index'] == 1][['dt', 'gh_id', 'read_avg_rate']]
+        df = df.merge(df_idx1, how='left', on=['dt', 'gh_id'], suffixes=['', '1'])
+        debias_selection = df.query('index != 1 and read_avg_rate1 < 0.7 and read_avg_rate < 0.7')
+        # print(debias_selection[['account_name', 'read_avg_rate', 'read_avg_rate1']])
+        df = df.drop(debias_selection.index)
+
         df['read_avg_rate'] = df['read_avg_rate'].clip(upper=1.3)
         df['days_decrease'] = df['first_pub_interval'] * (-0.2 / 120)
         # df['ClassY'] = df['read_avg_rate'] > 1