category_models.py 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. #! /usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # vim:fenc=utf-8
  4. #
  5. # Copyright © 2024 StrayWarrior <i@straywarrior.com>
  6. """
  7. Models for long article categories.
  8. """
  9. import pandas as pd
  10. from sklearn.model_selection import train_test_split
  11. from sklearn.linear_model import LogisticRegression, LinearRegression
  12. from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
  13. from sklearn.metrics import mean_squared_error, r2_score
  14. import statsmodels.api as sm
  15. from .consts import category_name_map, reverse_category_name_map
  16. class CategoryRegressionV1:
  17. def __init__(self):
  18. self.features = [
  19. 'CateOddities', 'CateFamily', 'CateHeartwarm',
  20. 'CateHistory', 'CateHealth', 'CateLifeKnowledge', 'CateGossip',
  21. 'CatePolitics', 'CateMilitary', 'CateMovie', 'CateSociety',
  22. 'view_count_rate'
  23. ]
  24. def preprocess_data(self, df):
  25. for cate in category_name_map:
  26. colname = category_name_map[cate]
  27. df[colname] = df['category'] == cate
  28. df[colname] = df[colname].astype(int)
  29. df['ClassY'] = df['read_avg_rate'] > 1
  30. df['RegressionY'] = df['read_avg_rate']
  31. return df
  32. def build_and_print(self, df, account_name):
  33. if account_name is not None:
  34. sub_df = df[df['account_name'] == account_name]
  35. else:
  36. sub_df = df
  37. if len(sub_df) < 5:
  38. return
  39. sample_count = len(sub_df)
  40. params, t_stats, p_values = self.run_ols_linear_regression(sub_df)
  41. row = f'{account_name}\t{sample_count}'
  42. for param, p_value in zip(params, p_values):
  43. row += f'\t{param:.3f}\t{p_value:.3f}'
  44. print(row)
  45. def build(self, df):
  46. p_value_column_names = '\t'.join([name + "\tp-" + name for name in
  47. ['bias'] + self.features])
  48. print('account\tsamples\t{}'.format(p_value_column_names))
  49. self.build_and_print(df, None)
  50. for account_name in df['account_name'].unique():
  51. self.build_and_print(df, account_name)
  52. def get_param_names(self):
  53. return ['bias'] + self.features
  54. def run_ols_linear_regression(self, df, print_residual=False):
  55. X = df[self.features] # 特征列
  56. y = df['RegressionY'] # 目标变量
  57. X = sm.add_constant(X)
  58. model = sm.OLS(y, X).fit()
  59. params = model.params
  60. t_stats = model.tvalues
  61. p_values = model.pvalues
  62. conf_int = model.conf_int()
  63. if print_residual:
  64. predict_y = model.predict(X)
  65. residuals = y - predict_y
  66. new_x = df[['title', 'category']].copy()
  67. new_x['residual'] = residuals
  68. new_x['y'] = y
  69. for index, row in new_x.iterrows():
  70. param_name = category_name_map.get(row['category'], None)
  71. if not param_name:
  72. continue
  73. param_index = self.features.index(param_name) + 1
  74. param = params.iloc[param_index]
  75. p_value = p_values.iloc[param_index]
  76. if p_value < 0.1:
  77. print(f"{row['y']:.3f}\t{row['residual']:.3f}\t{row['category']}\t{param:.2f}\t{row['title']}")
  78. r_min = residuals.min()
  79. r_max = residuals.max()
  80. r_avg = residuals.mean()
  81. print(f"residuals min: {r_min:.3f}, max: {r_max:.3f}, mean: {r_avg:.3f}")
  82. return params, t_stats, p_values
  83. class CategoryLR:
  84. def __init__(self):
  85. self.features = [
  86. 'CateOddities', 'CateFamily', 'CateHeartwarm',
  87. 'CateHistory', 'CateHealth', 'CateLifeKnowledge', 'CateGossip',
  88. 'CatePolitics', 'CateMilitary', 'CateMovie', 'CateSociety',
  89. 'view_count_rate', 'bias'
  90. ]
  91. def preprocess_data(self, df):
  92. for cate in category_name_map:
  93. colname = category_name_map[cate]
  94. df[colname] = df['category'] == cate
  95. df[colname] = df[colname].astype(int)
  96. df['ClassY'] = df['read_avg_rate'] > 1
  97. df['bias'] = 1.0
  98. return df
  99. def build_and_print(self, df, account_name):
  100. if account_name is not None:
  101. sub_df = df[df['account_name'] == account_name]
  102. else:
  103. sub_df = df
  104. sample_count = len(sub_df)
  105. positive_count = len(sub_df.query('ClassY == 1'))
  106. if sample_count < 10 or positive_count * (sample_count - positive_count) == 0:
  107. return
  108. sample_count = len(sub_df)
  109. params, t_stats, p_values = self.run_logistic_regression(sub_df)
  110. row = f'{account_name}\t{sample_count}'
  111. for param, p_value in zip(params, p_values):
  112. row += f'\t{param:.3f}'
  113. print(row)
  114. def build(self, df):
  115. p_value_column_names = '\t'.join(self.features)
  116. print('account\tsamples\t{}'.format(p_value_column_names))
  117. # self.build_and_print(df, None)
  118. for account_name in df['account_name'].unique():
  119. self.build_and_print(df, account_name)
  120. def get_param_names(self):
  121. return ['bias'] + self.features
  122. def run_logistic_regression(self, df):
  123. X = df[self.features] # 特征列
  124. y = df['ClassY'] # 目标变量
  125. # 将数据集分为训练集和测试集
  126. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  127. # 创建线性回归模型
  128. logreg = LogisticRegression()
  129. # 训练模型
  130. logreg.fit(X_train, y_train)
  131. # 预测测试集
  132. y_pred = logreg.predict(X_test)
  133. # 评估模型性能
  134. accuracy = accuracy_score(y_test, y_pred)
  135. conf_matrix = confusion_matrix(y_test, y_pred)
  136. class_report = classification_report(y_test, y_pred)
  137. # print(f"Accuracy: {accuracy}")
  138. # print(f"Confusion Matrix: \n{conf_matrix}")
  139. # print(f"Classification Report: \n{class_report}")
  140. return logreg.coef_[0], None, [0] * len(logreg.coef_[0])
  141. def main():
  142. df = pd.read_excel('20241101_read_rate_samples.xlsx') # 如果数据来自CSV文件
  143. df['read_avg'] = df['阅读均值']
  144. df['read_avg_rate'] = df['阅读倍数']
  145. df['dt'] = df['日期']
  146. df['similarity'] = df['Similarity']
  147. filter_condition = 'read_avg > 500 ' \
  148. 'and read_avg_rate > 0 and read_avg_rate < 3 ' \
  149. 'and dt > 20240914 and similarity > 0'
  150. df = df.query(filter_condition).copy()
  151. m_cate = CategoryRegressionV1()
  152. df = m_cate.preprocess_data(df)
  153. m_cate.build(df)
  154. if __name__ == '__main__':
  155. main()