
generate label for mysql

罗俊辉 1 year ago
parent commit e09e6dd213
1 changed file with 52 additions and 45 deletions

+ 52 - 45
main_spider.py

@@ -16,6 +16,11 @@ import lightgbm as lgb
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics import accuracy_score
 
+from sklearn.model_selection import train_test_split, StratifiedKFold  # currently unused
+from sklearn.datasets import load_breast_cancer  # currently unused (toy-example leftover)
+from sklearn.metrics import roc_auc_score  # currently unused; lgb.cv reports AUC itself
+from bayes_opt import BayesianOptimization
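+# NOTE: bayes_opt is provided by the third-party "bayesian-optimization"
+# package (pip install bayesian-optimization).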
+
 
 class LightGBM(object):
     """
@@ -67,40 +72,42 @@ class LightGBM(object):
             features[key] = self.label_encoder.fit_transform(features[key])
         return features, labels
 
-    def bays_params(self, trial):
-        """
-        Bayesian parameters for
-        :return: best parameters
-        """
-        # Define the search space
-        param = {
-            'objective': 'binary',
-            'metric': 'binary_logloss',
-            'verbosity': -1,
-            'boosting_type': 'gbdt',
-            'num_leaves': trial.suggest_int('num_leaves', 20, 40),
-            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
-            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
-            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
-            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
-            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
-            "num_threads": 16,  # 线程数量
-        }
+    def bays(self):
+        """
+        Tune hyper-parameters with Bayesian optimization (bayes_opt + lgb.cv).
+        """
+        # Load the data and build the LightGBM Dataset; categorical_feature
+        # is intentionally not specified here
         x, y = self.read_data()
-        train_size = int(len(x) * self.split_c)
+        train_size = int(len(x) * 0.9)  # 90% train / 10% holdout split
         X_train, X_test = x[:train_size], x[train_size:]
         Y_train, Y_test = y[:train_size], y[train_size:]
-        train_data = lgb.Dataset(
-            X_train,
-            label=Y_train,
-            categorical_feature=["channel", "mode", "out_user_id", "tag1", "tag2", "tag3"],
-        )
-        test_data = lgb.Dataset(X_test, label=Y_test, reference=train_data)
-        gbm = lgb.train(param, train_data, num_boost_round=100, valid_sets=[test_data])
-        preds = gbm.predict(X_test)
-        pred_labels = np.rint(preds)
-        accuracy = accuracy_score(Y_test, pred_labels)
-        return accuracy
+        train_data = lgb.Dataset(X_train, label=Y_train)
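+
+        # Objective for BayesianOptimization: returns the 5-fold CV AUC,
+        # which the optimizer maximizes. bayes_opt samples continuous values,
+        # so integer-valued hyper-parameters are cast back to int.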
+        def lgbm_eval(num_leaves, learning_rate, feature_fraction, bagging_fraction, bagging_freq, min_child_samples):
+            params = {
+                'objective': 'binary',
+                'metric': 'auc',
+                'verbose': -1,
+                'num_leaves': int(num_leaves),
+                'learning_rate': learning_rate,
+                'feature_fraction': feature_fraction,
+                'bagging_fraction': bagging_fraction,
+                'bagging_freq': int(bagging_freq),
+                'min_child_samples': int(min_child_samples),
+            }
+            # NOTE: early_stopping_rounds matches the LightGBM 3.x API; on
+            # LightGBM >= 4.0 use callbacks=[lgb.early_stopping(10)] and read
+            # 'valid auc-mean' instead of 'auc-mean'.
+            cv_result = lgb.cv(params, train_data, nfold=5, seed=42, stratified=True, metrics=['auc'],
+                               early_stopping_rounds=10)
+            return max(cv_result['auc-mean'])
+
+        param_bounds = {
+            'num_leaves': (20, 40),
+            'learning_rate': (1e-4, 1e-2),
+            'feature_fraction': (0.5, 0.8),
+            'bagging_fraction': (0.5, 0.8),
+            'bagging_freq': (1, 10),
+            'min_child_samples': (20, 100),
+        }
+
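+        # BayesianOptimization fits a Gaussian-process surrogate over these
+        # bounds: init_points random probes seed it, then n_iter guided
+        # steps follow (UCB acquisition by default).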
+        optimizer = BayesianOptimization(f=lgbm_eval, pbounds=param_bounds, random_state=42)
+        optimizer.maximize(init_points=5, n_iter=25)
+
+        print("Best Parameters:", optimizer.max['params'])
 
     def train_model(self):
         """
@@ -127,7 +134,6 @@ class LightGBM(object):
             'num_threads': 16
         }
 
-
         # Train the model
         num_round = 100
         print("Start training......")
@@ -194,19 +200,20 @@ class LightGBM(object):
 
 
 if __name__ == "__main__":
-    i = int(input("Enter 1 to train, 2 to predict:\n"))
-    if i == 1:
-        f = "train"
-        dt = "whole"
-        L = LightGBM(flag=f, dt=dt)
-        L.train_model()
-    elif i == 2:
-        f = "predict"
-        dt = int(input("Enter a date, 16-21:\n"))
-        L = LightGBM(flag=f, dt=dt)
-        L.evaluate_model()
-        L.feature_importance()
-    # L = LightGBM("train", "whole")
+    # i = int(input("Enter 1 to train, 2 to predict:\n"))
+    # if i == 1:
+    #     f = "train"
+    #     dt = "whole"
+    #     L = LightGBM(flag=f, dt=dt)
+    #     L.train_model()
+    # elif i == 2:
+    #     f = "predict"
+    #     dt = int(input("Enter a date, 16-21:\n"))
+    #     L = LightGBM(flag=f, dt=dt)
+    #     L.evaluate_model()
+    #     L.feature_importance()
+    L = LightGBM("train", "whole")
+    L.bays()
     # study = optuna.create_study(direction='maximize')
     # study.optimize(L.bays_params, n_trials=100)
     # print('Number of finished trials:', len(study.trials))