# algorithm/rov-offline
							import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier
from sklearn import metrics


# Date stamp (YYYYMMDD) for today; it selects which dated CSV file is loaded.
now_date = datetime.datetime.today()
dt = now_date.strftime('%Y%m%d')

# 1. Load the data
data = pd.read_csv(f'./data/train_test_data/train_test_{dt}.csv')
print(data.shape)

# 2. Split into features (x) and label (y): every column except the last
#    is a feature, the last column is the label.
data_columns = data.columns.values.tolist()
feature_columns = data_columns[:-1]
label_column = data_columns[-1]
x = data[feature_columns]
y = data[label_column]
print(f"x_shape: {x.shape}, y_shape: {y.shape}")

# 3. Train/test split: 30% held out, fixed seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)
print(f"x_train_shape: {x_train.shape}")
print(f"x_test_shape: {x_test.shape}")
# 4. Train the model
# Hyper-parameters are collected in one place and unpacked into the classifier.
xgb_params = {
    'objective': 'binary:logistic',
    'learning_rate': 0.3,
    'max_depth': 10,
    'eval_metric': ['error', 'logloss', 'auc'],
}
xgb_model = XGBClassifier(**xgb_params)

# Both the training split and the held-out split are passed as evaluation sets.
eval_sets = [(x_train, y_train), (x_test, y_test)]
xgb_model.fit(x_train, y_train, eval_set=eval_sets)

# 5. Save the trained model
xgb_model.save_model('./data/ad_xgb.model')
# 6. Predict on the test set
# Hard class labels: used for accuracy and for the exported CSV.
y_test_pre = xgb_model.predict(x_test)
# Positive-class scores (second column of predict_proba): ROC AUC is a ranking
# metric and must be computed from continuous scores, not from hard labels.
y_test_proba = xgb_model.predict_proba(x_test)[:, 1]

# Export the test features together with the true label and the predicted label.
test_df = x_test.copy()
test_df['y'] = y_test
test_df['y_pre'] = y_test_pre
test_df.to_csv('./data/test_pre.csv', index=False)

# 7. Evaluate the model
test_accuracy = metrics.accuracy_score(y_test, y_test_pre)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))
# AUC from the predicted probabilities; scoring the hard labels instead
# collapses the ROC curve to a single threshold and misreports the value.
test_auc = metrics.roc_auc_score(y_test, y_test_proba)
print("auc: %.2f%%" % (test_auc * 100.0))