Browse Source

Update category_models: better clip function

StrayWarrior 5 months ago
parent
commit
1eab92358f
1 changed file with 5 additions and 2 deletions
  1. 5 2
      src/long_articles/category_models.py

+ 5 - 2
src/long_articles/category_models.py

@@ -15,6 +15,7 @@ from sklearn.linear_model import LogisticRegression, LinearRegression
 from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
 from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
 from sklearn.metrics import mean_squared_error, r2_score
 from sklearn.metrics import mean_squared_error, r2_score
 import statsmodels.api as sm
 import statsmodels.api as sm
+import numpy as np
 from .consts import category_name_map, reverse_category_name_map
 from .consts import category_name_map, reverse_category_name_map
 
 
 class CategoryRegressionV1:
 class CategoryRegressionV1:
@@ -41,8 +42,10 @@ class CategoryRegressionV1:
         # print(debias_selection[['account_name', 'read_avg_rate', 'read_avg_rate1']])
         # print(debias_selection[['account_name', 'read_avg_rate', 'read_avg_rate1']])
         df = df.drop(debias_selection.index)
         df = df.drop(debias_selection.index)
 
 
-        df['read_avg_rate'] = df['read_avg_rate'].clip(upper=1.4)
-        df['view_count_rate'] = df['view_count_rate'].clip(upper=1.3)
+        def clip_func(x):
+            return x if x < 1.4 else 0.7 * np.log(x) + 1.165
+        df['read_avg_rate'] = df['read_avg_rate'].apply(clip_func)
+        df['view_count_rate'] = df['view_count_rate'].apply(clip_func)
         df['days_decrease'] = df['first_pub_interval'] * (-0.2 / 120)
         df['days_decrease'] = df['first_pub_interval'] * (-0.2 / 120)
         # df['ClassY'] = df['read_avg_rate'] > 1
         # df['ClassY'] = df['read_avg_rate'] > 1
         df['RegressionY'] = df['read_avg_rate']
         df['RegressionY'] = df['read_avg_rate']