|
@@ -1,59 +1,67 @@
|
|
|
import pandas as pd
|
|
|
import datetime
|
|
|
+import time
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
from xgboost.sklearn import XGBClassifier
|
|
|
from sklearn import metrics
|
|
|
|
|
|
|
|
|
def xgboost_train(dt=None):
    """Train a binary XGBoost classifier on the day's train/test CSVs.

    Reads ``./data/train_test_data/train_{dt}.csv`` and ``test_{dt}.csv``
    (the last column is assumed to be the binary label -- TODO confirm with
    the data-generation step), fits an ``XGBClassifier``, saves the model to
    ``./data/ad_xgb.model``, and prints accuracy / AUC / recall / F1 /
    precision on the held-out test split.

    Args:
        dt: Date string in '%Y%m%d' form used in the CSV file names.
            Defaults to today's date, matching the original behavior, so
            existing no-argument callers are unaffected.
    """
    if dt is None:
        dt = datetime.datetime.today().strftime('%Y%m%d')

    # 1. Load data. Train and test splits are pre-built on disk per day.
    train_data = pd.read_csv(f'./data/train_test_data/train_{dt}.csv')
    print(train_data.shape)
    test_data = pd.read_csv(f'./data/train_test_data/test_{dt}.csv')
    print(test_data.shape)

    # 2. Split features / label by column position (label is the last column).
    data_columns = train_data.columns.values.tolist()
    x_train = train_data[data_columns[:-1]]
    y_train = train_data[data_columns[-1]]
    x_test = test_data[data_columns[:-1]]
    y_test = test_data[data_columns[-1]]
    print(f"x_train_shape: {x_train.shape}")
    print(f"x_test_shape: {x_test.shape}")

    # 3. Model training, evaluating on both splits each boosting round.
    xgb_model = XGBClassifier(
        objective='binary:logistic',
        learning_rate=0.3,
        max_depth=5,
        eval_metric=['mae', 'auc'],
    )
    xgb_model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)])

    # 4. Persist the trained booster for downstream serving.
    xgb_model.save_model('./data/ad_xgb.model')

    # 5. Predict on the test split.
    y_test_pre = xgb_model.predict(x_test)
    # Positive-class probabilities: ROC-AUC must be computed on scores, not
    # on hard 0/1 predictions, otherwise the AUC is badly underestimated.
    y_test_proba = xgb_model.predict_proba(x_test)[:, 1]

    # 6. Evaluation metrics on the test split.
    test_accuracy = metrics.accuracy_score(y_test, y_test_pre)
    print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
    test_auc = metrics.roc_auc_score(y_test, y_test_proba)
    print("auc: %.2f%%" % (test_auc * 100.0))
    test_recall = metrics.recall_score(y_test, y_test_pre)
    print("recall:%.2f%%" % (test_recall * 100.0))
    test_f1 = metrics.f1_score(y_test, y_test_pre)
    print("f1:%.2f%%" % (test_f1 * 100.0))
    test_precision = metrics.precision_score(y_test, y_test_pre)
    print("precision:%.2f%%" % (test_precision * 100.0))
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Entry point: run the training pipeline and report wall-clock duration.
    started_at = time.time()
    xgboost_train()
    print(f"{time.time() - started_at}s")
|