# models.py (4.8 KB)
  1. # encoding: utf-8
  2. from __future__ import annotations
  3. import numpy as np
  4. import statsmodels.api as sm
  5. from pandas import DataFrame
  6. from app.core.config import GlobalConfigSettings
  7. config = GlobalConfigSettings()
  8. CATEGORY_FEATURES = config.category.features
  9. CATEGORY_MAP = config.category.category_map
  10. class CategoryRegression:
  11. """品类回归模型"""
  12. def __init__(self, features=None, category_map=None):
  13. self.features = features or CATEGORY_FEATURES
  14. self.category_map = category_map or CATEGORY_MAP
  15. @staticmethod
  16. def clip_func(x):
  17. """
  18. 阅读率均值倍数调整
  19. """
  20. return x if x < 1.4 else 0.7 * np.log(x) + 1.165
  21. def preprocess_data(self, raw_dataframe: DataFrame) -> DataFrame:
  22. """预处理数据"""
  23. for category in self.category_map:
  24. colname = self.category_map[category]
  25. raw_dataframe[colname] = raw_dataframe["category"] == category
  26. raw_dataframe[colname] = raw_dataframe[colname].astype(int)
  27. # 次条阅读量校正
  28. df_idx1 = raw_dataframe[raw_dataframe["index"] == 1][
  29. ["dt", "gh_id", "read_avg_rate"]
  30. ]
  31. merged_dataframe = raw_dataframe.merge(
  32. df_idx1, how="left", on=["dt", "gh_id"], suffixes=("", "1")
  33. )
  34. debias_selection = merged_dataframe.query(
  35. "index != 1 and read_avg_rate1 < 0.7 and read_avg_rate < 0.7"
  36. )
  37. output_dataframe = merged_dataframe.drop(debias_selection.index)
  38. output_dataframe["read_avg_rate"] = output_dataframe["read_avg_rate"].apply(
  39. self.clip_func
  40. )
  41. output_dataframe["view_count_rate"] = output_dataframe["view_count_rate"].apply(
  42. self.clip_func
  43. )
  44. output_dataframe["days_decrease"] = output_dataframe["first_pub_interval"] * (
  45. -0.2 / 120
  46. )
  47. output_dataframe["RegressionY"] = output_dataframe["read_avg_rate"]
  48. return output_dataframe
  49. def _build_and_print_by_account(
  50. self, raw_dataframe: DataFrame, account_name: str | None
  51. ) -> None:
  52. if account_name:
  53. sub_df = raw_dataframe[raw_dataframe["account_name"] == account_name]
  54. else:
  55. sub_df = raw_dataframe
  56. if len(sub_df) < 5:
  57. return
  58. sample_count = len(sub_df)
  59. params, t_stats, p_values = self.run_ols_linear_regression(sub_df)
  60. row = f"{account_name}\t{sample_count}"
  61. for param, p_value in zip(params, p_values):
  62. row += f"\t{param:.3f}\t{p_value:.3f}"
  63. print(row)
  64. def build_and_print_matrix(self, raw_dataframe: DataFrame) -> None:
  65. p_value_column_names = "\t".join(
  66. [name + "\tp-" + name for name in ["bias"] + self.features]
  67. )
  68. print("account\tsamples\t{}".format(p_value_column_names))
  69. # self._build_and_print_by_account(raw_dataframe, None)
  70. for account_name in raw_dataframe["account_name"].unique():
  71. self._build_and_print_by_account(raw_dataframe, account_name)
  72. def get_param_names(self):
  73. return ["bias"] + self.features
  74. def run_ols_linear_regression(
  75. self,
  76. raw_dataframe: DataFrame.series,
  77. print_residual: bool = False,
  78. print_p_value_threshold: float = 0.1,
  79. ):
  80. X = raw_dataframe[self.features] # 特征列
  81. y = raw_dataframe["RegressionY"] # 目标变量
  82. X = sm.add_constant(X, has_constant="add")
  83. model = sm.OLS(y, X).fit()
  84. params = model.params
  85. t_stats = model.tvalues
  86. p_values = model.pvalues
  87. conf_int = model.conf_int()
  88. if print_residual:
  89. predict_y = model.predict(X)
  90. residuals = y - predict_y
  91. new_x = raw_dataframe[["title", "category"]].copy()
  92. new_x["residual"] = residuals
  93. new_x["y"] = y
  94. select_idx = []
  95. for index, row in new_x.iterrows():
  96. param_name = self.category_map.get(row["category"], None)
  97. if not param_name:
  98. continue
  99. param_index = self.features.index(param_name) + 1
  100. param = params.iloc[param_index]
  101. p_value = p_values.iloc[param_index]
  102. if p_value < print_p_value_threshold:
  103. print(
  104. f"{row['y']:.3f}\t{row['residual']:.3f}\t{row['category']}\t{param:.2f}\t{row['title'][0:30]}"
  105. )
  106. select_idx.append(index)
  107. has_category_residuals = residuals.loc[select_idx]
  108. r_min = has_category_residuals.min()
  109. r_max = has_category_residuals.max()
  110. r_avg = has_category_residuals.mean()
  111. print(f"residuals min: {r_min:.3f}, max: {r_max:.3f}, mean: {r_avg:.3f}")
  112. return params, t_stats, p_values