# main.py
"""
Uses only title (tag) information for training and prediction.
"""
  4. import os
  5. import sys
  6. from sklearn.preprocessing import LabelEncoder
  7. sys.path.append(os.getcwd())
  8. import pandas as pd
  9. import lightgbm as lgb
  10. from scipy.stats import randint as sp_randint
  11. from scipy.stats import uniform as sp_uniform
  12. from sklearn.model_selection import RandomizedSearchCV, train_test_split
  13. from sklearn.metrics import roc_auc_score, accuracy_score
  14. class LightGBM(object):
  15. """
  16. LightGBM model for classification
  17. """
  18. def __init__(self, flag, dt):
  19. self.label_encoder = LabelEncoder()
  20. self.my_c = [
  21. "tag1",
  22. "tag2",
  23. "tag3",
  24. "tag4"
  25. ]
  26. self.str_columns = ["tag1", "tag2"]
  27. self.split_c = 0.75
  28. self.yc = 0.8
  29. self.model = "models/lightgbm_0408_all_tags.bin"
  30. self.flag = flag
  31. self.dt = dt
  32. def read_data(self, path, yc=None):
  33. """
  34. Read data from local
  35. :return:
  36. """
  37. df = pd.read_json(path)
  38. df = df.dropna(subset=['label']) # 把 label 为空的删掉
  39. df = df.dropna(subset=['tag1', 'tag2'], how="all") # 把 tag 为空的数据也删掉
  40. labels = df['label']
  41. features = df.drop(['label', 'tag3', 'tag4'], axis=1)
  42. for key in self.str_columns:
  43. features[key] = self.label_encoder.fit_transform(features[key])
  44. return features, labels, df
  45. def best_params(self):
  46. """
  47. find best params for lightgbm
  48. """
  49. path = "data/train_data/all_train_20240408.json"
  50. X, y, ori_df = self.read_data(path)
  51. print(len(list(y)))
  52. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  53. lgb_ = lgb.LGBMClassifier(objective='binary')
  54. # 设置搜索的参数范围
  55. param_dist = {
  56. 'num_leaves': sp_randint(20, 40),
  57. 'learning_rate': sp_uniform(0.001, 0.1),
  58. 'feature_fraction': sp_uniform(0.5, 0.9),
  59. 'bagging_fraction': sp_uniform(0.5, 0.9),
  60. 'bagging_freq': sp_randint(1, 10),
  61. 'min_child_samples': sp_randint(5, 100),
  62. }
  63. # 定义 RandomizedSearchCV
  64. rsearch = RandomizedSearchCV(
  65. estimator=lgb_,
  66. param_distributions=param_dist,
  67. n_iter=100,
  68. cv=3,
  69. scoring='roc_auc',
  70. random_state=42, verbose=2
  71. )
  72. # 开始搜索
  73. rsearch.fit(X_train, y_train)
  74. # 打印最佳参数和对应的AUC得分
  75. print("Best parameters found: ", rsearch.best_params_)
  76. print("Best AUC found: ", rsearch.best_score_)
  77. # 使用最佳参数在测试集上的表现
  78. best_model = rsearch.best_estimator_
  79. y_pred = best_model.predict_proba(X_test)[:, 1]
  80. auc = roc_auc_score(y_test, y_pred)
  81. print("AUC on test set: ", auc)
  82. def train_model(self):
  83. """
  84. Load dataset
  85. :return:
  86. """
  87. path = "data/train_data/all_train_20240408.json"
  88. x, y, ori_df = self.read_data(path)
  89. train_size = int(len(x) * self.split_c)
  90. X_train, X_test = x[:train_size], x[train_size:]
  91. Y_train, Y_test = y[:train_size], y[train_size:]
  92. train_data = lgb.Dataset(
  93. X_train,
  94. label=Y_train,
  95. categorical_feature=["tag1", "tag2"],
  96. )
  97. test_data = lgb.Dataset(X_test, label=Y_test, reference=train_data)
  98. params = {
  99. 'bagging_fraction': 0.9323330736797192,
  100. 'bagging_freq': 1,
  101. 'feature_fraction': 0.8390650729441467,
  102. 'learning_rate': 0.07595782999760721,
  103. 'min_child_samples': 93,
  104. 'num_leaves': 36,
  105. 'num_threads': 16
  106. }
  107. # 训练模型
  108. num_round = 100
  109. print("开始训练......")
  110. bst = lgb.train(params, train_data, num_round, valid_sets=[test_data])
  111. bst.save_model(self.model)
  112. print("模型训练完成✅")
  113. def evaluate_model(self):
  114. """
  115. 评估模型性能
  116. :return:
  117. """
  118. fw = open("result/summary_{}.txt".format(dt), "a+", encoding="utf-8")
  119. path = 'data/predict_data/all_predict_{}.json'.format(dt)
  120. x, y, ori_df = self.read_data(path, yc=6)
  121. true_label_df = pd.DataFrame(list(y), columns=['ture_label'])
  122. bst = lgb.Booster(model_file=self.model)
  123. y_pred = bst.predict(x, num_iteration=bst.best_iteration)
  124. pred_score_df = pd.DataFrame(list(y_pred), columns=['pred_score'])
  125. temp = sorted(list(y_pred))
  126. yuzhi = temp[int(len(temp) * 0.5) - 1]
  127. y_pred_binary = [0 if i <= yuzhi else 1 for i in list(y_pred)]
  128. pred_label_df = pd.DataFrame(list(y_pred_binary), columns=['pred_label'])
  129. score_list = []
  130. for index, item in enumerate(list(y_pred)):
  131. real_label = y[index]
  132. score = item
  133. prid_label = y_pred_binary[index]
  134. fw.write("{}\t{}\t{}\n".format(real_label, prid_label, score))
  135. score_list.append(score)
  136. print("预测样本总量: {}".format(len(score_list)))
  137. data_series = pd.Series(score_list)
  138. print("统计 score 信息")
  139. print(data_series.describe())
  140. # 评估模型
  141. accuracy = accuracy_score(y, y_pred_binary)
  142. print(f"Accuracy: {accuracy}")
  143. fw.close()
  144. # 水平合并
  145. df_concatenated = pd.concat([ori_df, true_label_df, pred_score_df, pred_label_df], axis=1)
  146. # for key in self.str_columns:
  147. # df_concatenated[key] = [self.label_mapping[key][i] for i in df_concatenated[key]]
  148. df_concatenated.to_excel("data/predict_data/spider_predict_result_{}.xlsx".format(dt), index=False)
  149. def feature_importance(self):
  150. """
  151. Get the importance of each feature
  152. :return:
  153. """
  154. lgb_model = lgb.Booster(model_file=self.model)
  155. importance = lgb_model.feature_importance(importance_type='split')
  156. feature_name = lgb_model.feature_name()
  157. feature_importance = sorted(zip(feature_name, importance), key=lambda x: x[1], reverse=True)
  158. # 打印特征重要性
  159. for name, imp in feature_importance:
  160. print(name, imp)
# Handy shell commands kept for reference:
#   cat summary_20240326.txt | awk -F "\t" '{print $1" "$3}' | /root/AUC/AUC/AUC
#   ossutil64 cp /root/luojunhui/alg/data/predict_data/spider_predict_result_20240330.xlsx oss://art-pubbucket/0temp/
  165. if __name__ == "__main__":
  166. i = int(input("输入 1 训练, 输入 2 预测:\n"))
  167. if i == 1:
  168. f = "train"
  169. dt = "whole"
  170. L = LightGBM(flag=f, dt=dt)
  171. L.train_model()
  172. elif i == 2:
  173. f = "predict"
  174. dt = int(input("输入日期, 20240316-21:\n"))
  175. L = LightGBM(flag=f, dt=dt)
  176. L.evaluate_model()
  177. L.feature_importance()
  178. elif i == 3:
  179. L = LightGBM("train", "whole")
  180. L.best_params()