# ad_xgboost_train.py (1.8 KB)

  1. import pandas as pd
  2. import datetime
  3. from sklearn.model_selection import train_test_split
  4. from xgboost.sklearn import XGBClassifier
  5. from sklearn import metrics
  6. now_date = datetime.datetime.today()
  7. dt = datetime.datetime.strftime(now_date, '%Y%m%d')
  8. # 1. 读取数据
  9. # data = pd.read_csv(f'./data/train_test_data/train_test_{dt}.csv')
  10. # print(data.shape)
  11. train_data = pd.read_csv(f'./data/train_test_data/train_{dt}.csv')
  12. print(train_data.shape)
  13. test_data = pd.read_csv(f'./data/train_test_data/test_{dt}.csv')
  14. print(test_data.shape)
  15. # 2. 划分x和y
  16. # data_columns = data.columns.values.tolist()
  17. # x = data[data_columns[:-1]]
  18. # y = data[data_columns[-1]]
  19. # print(f"x_shape: {x.shape}, y_shape: {y.shape}")
  20. data_columns = train_data.columns.values.tolist()
  21. x_train = train_data[data_columns[:-1]]
  22. y_train = train_data[data_columns[-1]]
  23. x_test = test_data[data_columns[:-1]]
  24. y_test = test_data[data_columns[-1]]
  25. # 3. 训练集和测试集分割
  26. # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
  27. print(f"x_train_shape: {x_train.shape}")
  28. print(f"x_test_shape: {x_test.shape}")
  29. # 4. 模型训练
  30. xgb_model = XGBClassifier(
  31. objective='binary:logistic',
  32. learning_rate=0.3,
  33. max_depth=5,
  34. eval_metric=['error', 'logloss', 'auc']
  35. )
  36. xgb_model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_test, y_test)])
  37. # 5. 模型保存
  38. xgb_model.save_model('./data/ad_xgb.model')
  39. # 6. 测试集预测
  40. y_test_pre = xgb_model.predict(x_test)
  41. test_df = x_test.copy()
  42. test_df['y'] = y_test
  43. test_df['y_pre'] = y_test_pre
  44. test_df.to_csv('./data/test_pre.csv', index=False)
  45. # 7. 模型效果验证
  46. test_accuracy = metrics.accuracy_score(y_test, y_test_pre)
  47. print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
  48. test_auc = metrics.roc_auc_score(y_test, y_test_pre)
  49. print("auc: %.2f%%" % (test_auc * 100.0))