@@ -16,6 +16,12 @@ import lightgbm as lgb
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split, StratifiedKFold
+from sklearn.datasets import load_breast_cancer
+from sklearn.metrics import roc_auc_score
+from bayes_opt import BayesianOptimization
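+# bayes_opt is provided by the "bayesian-optimization" package (pip install bayesian-optimization)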
+


 class LightGBM(object):
     """
@@ -67,40 +73,48 @@ class LightGBM(object):
             features[key] = self.label_encoder.fit_transform(features[key])
         return features, labels

-    def bays_params(self, trial):
-        """
-        Bayesian parameters for
-        :return: best parameters
-        """
-        # Define the search space
-        param = {
-            'objective': 'binary',
-            'metric': 'binary_logloss',
-            'verbosity': -1,
-            'boosting_type': 'gbdt',
-            'num_leaves': trial.suggest_int('num_leaves', 20, 40),
-            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
-            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
-            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
-            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
-            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
-            "num_threads": 16,  # number of threads
-        }
+    def bays(self):
+        # Create the LightGBM dataset; don't specify categorical_feature here, since we are using a toy dataset
         x, y = self.read_data()
-        train_size = int(len(x) * self.split_c)
+        train_size = int(len(x) * 0.9)
         X_train, X_test = x[:train_size], x[train_size:]
         Y_train, Y_test = y[:train_size], y[train_size:]
-        train_data = lgb.Dataset(
-            X_train,
-            label=Y_train,
-            categorical_feature=["channel", "mode", "out_user_id", "tag1", "tag2", "tag3"],
-        )
-        test_data = lgb.Dataset(X_test, label=Y_test, reference=train_data)
-        gbm = lgb.train(param, train_data, num_boost_round=100, valid_sets=[test_data])
-        preds = gbm.predict(X_test)
-        pred_labels = np.rint(preds)
-        accuracy = accuracy_score(Y_test, pred_labels)
-        return accuracy
+        train_data = lgb.Dataset(X_train, label=Y_train)
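+        # Objective for BayesianOptimization: the optimizer samples continuous values
+        # from param_bounds, so integer hyperparameters are cast back via int() below.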
+        def lgbm_eval(num_leaves, learning_rate, feature_fraction, bagging_fraction, bagging_freq, min_child_samples):
+            params = {
+                'objective': 'binary',
+                'metric': 'auc',
+                'verbose': -1,
+                'num_leaves': int(num_leaves),
+                'learning_rate': learning_rate,
+                'feature_fraction': feature_fraction,
+                'bagging_fraction': bagging_fraction,
+                'bagging_freq': int(bagging_freq),
+                'min_child_samples': int(min_child_samples),
+            }
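+            # NOTE: assumes the LightGBM 3.x cv() API; on 4.x, early stopping moves to
+            # callbacks=[lgb.early_stopping(10)] and the result key becomes 'valid auc-mean'.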
+            cv_result = lgb.cv(params, train_data, nfold=5, seed=42, stratified=True,
+                               metrics=['auc'], early_stopping_rounds=10)
+            return max(cv_result['auc-mean'])
+
+        param_bounds = {
+            'num_leaves': (20, 40),
+            'learning_rate': (1e-4, 1e-2),
+            'feature_fraction': (0.5, 0.8),
+            'bagging_fraction': (0.5, 0.8),
+            'bagging_freq': (1, 10),
+            'min_child_samples': (20, 100),
+        }
+
+        optimizer = BayesianOptimization(f=lgbm_eval, pbounds=param_bounds, random_state=42)
+        optimizer.maximize(init_points=5, n_iter=25)
+
+        print("Best Parameters:", optimizer.max['params'])
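+        # Return the tuned parameters so callers can retrain a final model with them
+        return optimizer.max['params']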

     def train_model(self):
         """
@@ -127,7 +141,6 @@ class LightGBM(object):
             'num_threads': 16
         }

-
         # Train the model
         num_round = 100
         print("Start training......")
@@ -194,19 +207,21 @@ class LightGBM(object):


 if __name__ == "__main__":
-    i = int(input("Enter 1 to train, 2 to predict:\n"))
-    if i == 1:
-        f = "train"
-        dt = "whole"
-        L = LightGBM(flag=f, dt=dt)
-        L.train_model()
-    elif i == 2:
-        f = "predict"
-        dt = int(input("Enter a date, 16-21:\n"))
-        L = LightGBM(flag=f, dt=dt)
-        L.evaluate_model()
-        L.feature_importance()
-    # L = LightGBM("train", "whole")
+    # i = int(input("Enter 1 to train, 2 to predict:\n"))
+    # if i == 1:
+    #     f = "train"
+    #     dt = "whole"
+    #     L = LightGBM(flag=f, dt=dt)
+    #     L.train_model()
+    # elif i == 2:
+    #     f = "predict"
+    #     dt = int(input("Enter a date, 16-21:\n"))
+    #     L = LightGBM(flag=f, dt=dt)
+    #     L.evaluate_model()
+    #     L.feature_importance()
+    L = LightGBM("train", "whole")
+    L.bays()
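+    # best_params = L.bays()  # optionally keep the returned parameters for a final fit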
     # study = optuna.create_study(direction='maximize')
     # study.optimize(L.bays_params, n_trials=100)
     # print('Number of finished trials:', len(study.trials))