main_userupload.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221
  1. """
  2. 针对用户类型数据单独训练模型
  3. """
  4. import os
  5. import sys
  6. import json
  7. import optuna
  8. import numpy as np
  9. from sklearn.preprocessing import LabelEncoder
  10. sys.path.append(os.getcwd())
  11. import pandas as pd
  12. import lightgbm as lgb
  13. from scipy.stats import randint as sp_randint
  14. from scipy.stats import uniform as sp_uniform
  15. from sklearn.model_selection import RandomizedSearchCV, train_test_split
  16. from sklearn.metrics import roc_auc_score, accuracy_score
  17. class LightGBM(object):
  18. """
  19. LightGBM model for classification
  20. """
  21. def __init__(self, flag, dt):
  22. self.label_encoder = LabelEncoder()
  23. self.my_c = [
  24. "channel",
  25. "user_fans",
  26. "user_view_30",
  27. "user_share_30",
  28. "user_return_30",
  29. "user_rov",
  30. "user_str",
  31. "user_return_videos_30",
  32. "user_return_videos_3",
  33. "user_return_3",
  34. "user_view_3",
  35. "user_share_3",
  36. "address",
  37. "tag1",
  38. "tag2",
  39. "tag3"
  40. ]
  41. self.str_columns = ["channel", "address", "tag1", "tag2", "tag3"]
  42. self.float_columns = [
  43. "user_fans",
  44. "user_view_30",
  45. "user_share_30",
  46. "user_return_30",
  47. "user_rov",
  48. "user_str",
  49. "user_return_videos_30",
  50. "user_return_videos_3",
  51. "user_return_3",
  52. "user_share_3",
  53. "user_view_3"
  54. ]
  55. self.split_c = 0.7
  56. self.yc = 0.8
  57. self.model = "models/lightgbm_0402_user.bin"
  58. self.flag = flag
  59. self.dt = dt
  60. def read_data(self, path):
  61. """
  62. Read data from local
  63. :return:
  64. """
  65. df = pd.read_json(path)
  66. df = df.dropna(subset=['label'])
  67. labels = df['label']
  68. temp = sorted(labels)
  69. yc = temp[int(len(temp) * 0.8)]
  70. print("阈值", yc)
  71. labels = [0 if i < yc else 1 for i in labels]
  72. features = df.drop("label", axis=1)
  73. for key in self.float_columns:
  74. features[key] = pd.to_numeric(features[key], errors="coerce")
  75. for key in self.str_columns:
  76. features[key] = self.label_encoder.fit_transform(features[key])
  77. return features, labels
  78. def best_params(self):
  79. path = "data/train_data/spider_train_20240402"
  80. X, y = self.read_data(path)
  81. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  82. lgbm = lgb.LGBMClassifier(objective='binary')
  83. # 设置搜索的参数范围
  84. param_dist = {
  85. 'num_leaves': sp_randint(20, 40),
  86. 'learning_rate': sp_uniform(0.001, 0.1),
  87. 'feature_fraction': sp_uniform(0.5, 0.9),
  88. 'bagging_fraction': sp_uniform(0.5, 0.9),
  89. 'bagging_freq': sp_randint(1, 10),
  90. 'min_child_samples': sp_randint(5, 100),
  91. }
  92. # 定义 RandomizedSearchCV
  93. rsearch = RandomizedSearchCV(
  94. estimator=lgbm,
  95. param_distributions=param_dist,
  96. n_iter=100,
  97. cv=3,
  98. scoring='roc_auc',
  99. random_state=42, verbose=2
  100. )
  101. # 开始搜索
  102. rsearch.fit(X_train, y_train)
  103. # 打印最佳参数和对应的AUC得分
  104. print("Best parameters found: ", rsearch.best_params_)
  105. print("Best AUC found: ", rsearch.best_score_)
  106. # 使用最佳参数在测试集上的表现
  107. best_model = rsearch.best_estimator_
  108. y_pred = best_model.predict_proba(X_test)[:, 1]
  109. auc = roc_auc_score(y_test, y_pred)
  110. print("AUC on test set: ", auc)
  111. def train_model(self):
  112. """
  113. Load dataset
  114. :return:
  115. """
  116. path = "data/train_data/spider_train_20240402"
  117. x, y = self.read_data(path)
  118. train_size = int(len(x) * self.split_c)
  119. X_train, X_test = x[:train_size], x[train_size:]
  120. Y_train, Y_test = y[:train_size], y[train_size:]
  121. train_data = lgb.Dataset(
  122. X_train,
  123. label=Y_train,
  124. categorical_feature=self.str_columns,
  125. )
  126. test_data = lgb.Dataset(X_test, label=Y_test, reference=train_data)
  127. params = {
  128. 'bagging_fraction': 0.7938866919252519,
  129. 'bagging_freq': 7,
  130. 'feature_fraction': 0.9687508340232414,
  131. 'learning_rate': 0.09711720243493492,
  132. 'min_child_samples': 89,
  133. 'num_leaves': 35,
  134. 'num_threads': 16
  135. }
  136. # 训练模型
  137. num_round = 100
  138. print("开始训练......")
  139. bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
  140. bst.save_model(self.model)
  141. print("模型训练完成✅")
  142. def evaluate_model(self):
  143. """
  144. 评估模型性能
  145. :return:
  146. """
  147. fw = open("result/summary_{}.txt".format(dt), "a+", encoding="utf-8")
  148. path = 'data/predict_data/predict_{}.json'.format(dt)
  149. x, y = self.read_data(path)
  150. bst = lgb.Booster(model_file=self.model)
  151. y_pred = bst.predict(x, num_iteration=bst.best_iteration)
  152. temp = sorted(list(y_pred))
  153. yuzhi = temp[int(len(temp) * 0.7) - 1]
  154. y_pred_binary = [0 if i <= yuzhi else 1 for i in list(y_pred)]
  155. # 转换为二进制输出
  156. score_list = []
  157. for index, item in enumerate(list(y_pred)):
  158. real_label = y[index]
  159. score = item
  160. prid_label = y_pred_binary[index]
  161. print(real_label, "\t", prid_label, "\t", score)
  162. fw.write("{}\t{}\t{}\n".format(real_label, prid_label, score))
  163. score_list.append(score)
  164. print("预测样本总量: {}".format(len(score_list)))
  165. data_series = pd.Series(score_list)
  166. print("统计 score 信息")
  167. print(data_series.describe())
  168. # 评估模型
  169. accuracy = accuracy_score(y, y_pred_binary)
  170. print(f"Accuracy: {accuracy}")
  171. fw.close()
  172. def feature_importance(self):
  173. """
  174. Get the importance of each feature
  175. :return:
  176. """
  177. lgb_model = lgb.Booster(model_file=self.model)
  178. importance = lgb_model.feature_importance(importance_type='split')
  179. feature_name = lgb_model.feature_name()
  180. feature_importance = sorted(zip(feature_name, importance), key=lambda x: x[1], reverse=True)
  181. # 打印特征重要性
  182. for name, imp in feature_importance:
  183. print(name, imp)
  184. if __name__ == "__main__":
  185. i = int(input("输入 1 训练, 输入 2 预测:\n"))
  186. if i == 1:
  187. f = "train"
  188. dt = "whole"
  189. L = LightGBM(flag=f, dt=dt)
  190. L.train_model()
  191. elif i == 2:
  192. f = "predict"
  193. dt = int(input("输入日期, 20240316-21:\n"))
  194. L = LightGBM(flag=f, dt=dt)
  195. L.evaluate_model()
  196. L.feature_importance()
  197. elif i == 3:
  198. L = LightGBM("train", "whole")
  199. L.best_params()