"""Grid-search an XGBoost binary classifier and report train/test metrics.

The script reads the train/test CSV files stamped with today's date, uses the
last column as the label and every other column as a feature, tunes the
classifier with 5-fold cross-validated ROC AUC, and prints accuracy, AUC,
recall, F1 and precision for both splits.
"""
import datetime

import pandas as pd
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier


def _score_split(model, features, labels):
    """Predict one data split and compute its classification metrics.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        features: Feature matrix of the split.
        labels: True binary labels of the split.

    Returns:
        A ``(predictions, scores)`` tuple, where ``predictions`` holds the hard
        class labels and ``scores`` maps ``accuracy``, ``auc``, ``recall``,
        ``f1`` and ``precision`` to floats.
    """
    predictions = model.predict(features)
    # ROC AUC measures ranking quality, so it must be fed the positive-class
    # probability; hard 0/1 labels collapse the curve to a single threshold.
    positive_proba = model.predict_proba(features)[:, 1]
    scores = {
        'accuracy': metrics.accuracy_score(labels, predictions),
        'auc': metrics.roc_auc_score(labels, positive_proba),
        'recall': metrics.recall_score(labels, predictions),
        'f1': metrics.f1_score(labels, predictions),
        'precision': metrics.precision_score(labels, predictions),
    }
    return predictions, scores


def _format_scores(split_name, scores):
    """Render the metrics of one split as a single report line.

    Args:
        split_name: Label placed at the start of the line, e.g. ``"Train"``.
        scores: Metric dict as produced by ``_score_split``.

    Returns:
        The report line; every metric except AUC is shown as a percentage,
        all rounded to two decimals.
    """
    return (
        f"{split_name} Accuracy: {scores['accuracy'] * 100.0:.2f}%, "
        f"auc: {scores['auc']:.2f}, "
        f"recall: {scores['recall'] * 100.0:.2f}%, "
        f"f1: {scores['f1'] * 100.0:.2f}%, "
        f"precision: {scores['precision'] * 100.0:.2f}%"
    )


now_date = datetime.datetime.today()
dt = now_date.strftime('%Y%m%d')

# 1. Load the data
train_data = pd.read_csv(f'./data/train_test_data/train_{dt}.csv')
print(train_data.shape)
test_data = pd.read_csv(f'./data/train_test_data/test_{dt}.csv')
print(test_data.shape)

# 2. Split into features (x) and label (y); the label is the last column.
# The test set is indexed with the training column list so that both splits
# share the same feature order.
data_columns = train_data.columns.values.tolist()
x_train = train_data[data_columns[:-1]]
y_train = train_data[data_columns[-1]]
x_test = test_data[data_columns[:-1]]
y_test = test_data[data_columns[-1]]
print(f"x_train_shape: {x_train.shape}")
print(f"x_test_shape: {x_test.shape}")

# 3. Hyper-parameter search space
parameters = {
    'max_depth': range(3, 10, 2),  # maximum tree depth
    'learning_rate': [0.1, 0.2, 0.3, 0.4],  # step size of each boosting update
    'n_estimators': range(50, 500, 50),  # number of boosting rounds (trees)
    'min_child_weight': range(1, 6, 2),  # minimum weight required in a leaf
}

# NOTE: the former ``iid=False`` argument is intentionally omitted. It was
# deprecated in scikit-learn 0.22 and removed in 0.24, where passing it raises
# a TypeError; from 0.22 onward fold scores are averaged unweighted anyway.
grid = GridSearchCV(
    estimator=XGBClassifier(
        # Step size used when updating weights each round; smaller values
        # train more slowly.
        learning_rate=0.1,
        # Total number of boosting rounds, i.e. number of trees.
        n_estimators=50,
        # Tree depth; larger values overfit more easily, smaller values
        # underfit more easily.
        max_depth=3,
        # Minimum leaf weight; larger values underfit more easily, smaller
        # values overfit more easily.
        min_child_weight=5,
        objective='binary:logistic',
        eval_metric=['error', 'auc'],
    ),
    param_grid=parameters,
    scoring='roc_auc',
    n_jobs=4,
    refit=True,
    cv=5,
)
grid.fit(x_train, y_train)

print(f"grid.best_params_: {grid.best_params_}")
print(f"grid.best_score_: {grid.best_score_}")

# With refit=True the best estimator has already been retrained on the whole
# training set, so no additional fit() call is needed here.
clf = grid.best_estimator_
print(f"grid.best_estimator_: {grid.best_estimator_}")

# 4. Evaluate on both splits
y_train_pred, train_scores = _score_split(clf, x_train, y_train)
print(_format_scores("Train", train_scores))

y_test_pred, test_scores = _score_split(clf, x_test, y_test)
print(_format_scores("Test", test_scores))