# ad_xgb_model_params_search.py
  1. import datetime
  2. import pandas as pd
  3. from xgboost.sklearn import XGBClassifier
  4. from sklearn.model_selection import GridSearchCV
  5. from sklearn import metrics
  6. now_date = datetime.datetime.today()
  7. dt = datetime.datetime.strftime(now_date, '%Y%m%d')
  8. # 1. 读取数据
  9. train_data = pd.read_csv(f'./data/train_test_data/train_{dt}.csv')
  10. print(train_data.shape)
  11. test_data = pd.read_csv(f'./data/train_test_data/test_{dt}.csv')
  12. print(test_data.shape)
  13. # 2. 划分x和y
  14. data_columns = train_data.columns.values.tolist()
  15. x_train = train_data[data_columns[:-1]]
  16. y_train = train_data[data_columns[-1]]
  17. x_test = test_data[data_columns[:-1]]
  18. y_test = test_data[data_columns[-1]]
  19. print(f"x_train_shape: {x_train.shape}")
  20. print(f"x_test_shape: {x_test.shape}")
  21. parameters = {
  22. 'max_depth': range(3, 10, 2), # 树的最大深度
  23. 'learning_rate': [i/10 for i in range(1, 5)], # 学习率
  24. 'n_estimators': range(50, 500, 50), # 最大迭代次数
  25. 'min_child_weight': range(1, 6, 2), # 叶子结点的最小权重
  26. }
  27. grid = GridSearchCV(
  28. estimator=XGBClassifier(
  29. learning_rate=0.1, # 学习率,控制每次迭代更新权重时的步长,默认0.3;调参:值越小,训练越慢;典型值为0.01-0.2
  30. n_estimators=50, # 总共迭代的次数,即决策树的个数
  31. max_depth=3, # 树的深度,默认值为6,典型值3-10;调参:值越大,越容易过拟合;值越小,越容易欠拟合
  32. min_child_weight=5, # 叶子节点最小权重;默认值为1;调参:值越大,越容易欠拟合;值越小,越容易过拟合
  33. objective='binary:logistic',
  34. eval_metric=['error', 'auc']
  35. ),
  36. param_grid=parameters,
  37. scoring='roc_auc',
  38. n_jobs=4,
  39. iid=False,
  40. cv=5)
  41. grid.fit(x_train, y_train)
  42. print(f"grid.best_params_: {grid.best_params_}")
  43. print(f"grid.best_score_: {grid.best_score_}")
  44. clf = grid.best_estimator_
  45. print(f"grid.best_estimator_: {grid.best_estimator_}")
  46. clf.fit(x_train, y_train)
  47. y_train_pred = clf.predict(x_train)
  48. train_accuracy = metrics.accuracy_score(y_train, y_train_pred)
  49. train_auc = metrics.roc_auc_score(y_train, y_train_pred)
  50. train_recall = metrics.recall_score(y_train, y_train_pred)
  51. train_f1 = metrics.f1_score(y_train, y_train_pred)
  52. train_precision = metrics.precision_score(y_train, y_train_pred)
  53. print(f"Train Accuracy: {train_accuracy * 100.0}.2f%%, "
  54. f"auc: {train_auc}.2f, "
  55. f"recall: {train_recall * 100.0}.2f%%, "
  56. f"f1: {train_f1 * 100.0}.2f%%, "
  57. f"precision: {train_precision * 100.0}.2f%%")
  58. y_test_pred = clf.predict(x_test)
  59. test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
  60. test_auc = metrics.roc_auc_score(y_test, y_test_pred)
  61. test_recall = metrics.recall_score(y_test, y_test_pred)
  62. test_f1 = metrics.f1_score(y_test, y_test_pred)
  63. test_precision = metrics.precision_score(y_test, y_test_pred)
  64. print(f"Test Accuracy: {test_accuracy * 100.0}.2f%%, "
  65. f"auc: {test_auc}.2f, "
  66. f"recall: {test_recall * 100.0}.2f%%, "
  67. f"f1: {test_f1 * 100.0}.2f%%, "
  68. f"precision: {test_precision * 100.0}.2f%%")