Browse Source

处理爬虫模型

罗俊辉 1 year ago
parent
commit
14a80bd8f7
1 changed file with 21 additions and 23 deletions
  1. 21 23
      main_spider.py

+ 21 - 23
main_spider.py

@@ -26,7 +26,6 @@ class LightGBM(object):
         self.label_encoder = LabelEncoder()
         self.my_c = [
             "channel",
-            "fans",
             "view_count_user_30days",
             "share_count_user_30days",
             "return_count_user_30days",
@@ -41,9 +40,8 @@ class LightGBM(object):
             "tag2",
             "tag3"
         ]
-        self.str_columns = ["uid", "type", "channel", "mode", "out_user_id", "tag1", "tag2", "tag3"]
+        self.str_columns = ["channel", "mode", "out_user_id", "tag1", "tag2", "tag3"]
         self.float_columns = [
-            "fans",
             "view_count_user_30days",
             "share_count_user_30days",
             "return_count_user_30days",
@@ -52,11 +50,10 @@ class LightGBM(object):
             "out_play_cnt",
             "out_like_cnt",
             "out_share_cnt",
-            "out_collection_cnt",
         ]
         self.split_c = 0.999
         self.yc = 0.8
-        self.model = "lightgbm_0326.bin"
+        self.model = "lightgbm_0326_spider.bin"
         self.flag = flag
         self.dt = dt
 
@@ -84,7 +81,7 @@ class LightGBM(object):
         train_data = lgb.Dataset(
             X_train,
             label=Y_train,
-            categorical_feature=["uid", "type", "channel", "mode", "out_user_id", "tag1", "tag2", "tag3"],
+            categorical_feature=["channel", "mode", "out_user_id", "tag1", "tag2", "tag3"],
         )
         test_data = lgb.Dataset(X_test, label=Y_test, reference=train_data)
         gbm = lgb.train(param, train_data, num_boost_round=100, valid_sets=[test_data])
@@ -98,7 +95,7 @@ class LightGBM(object):
         Generate data for feature engineering
         :return:
         """
-        with open("data/produce_data/x_data_total_return_{}_{}.json".format(self.flag, self.dt)) as f1:
+        with open("data/produce_data/x_data_total_return_{}_{}_spider.json".format(self.flag, self.dt)) as f1:
             x_list = json.loads(f1.read())
         index_t = int(len(x_list) * self.split_c)
         X_train = pd.DataFrame(x_list[:index_t], columns=self.my_c)
@@ -118,7 +115,7 @@ class LightGBM(object):
         Generate data for label
         :return:
         """
-        with open("data/produce_data/y_data_total_return_{}_{}.json".format(self.flag, self.dt)) as f2:
+        with open("data/produce_data/y_data_total_return_{}_{}_spider.json".format(self.flag, self.dt)) as f2:
             y_list = json.loads(f2.read())
         index_t = int(len(y_list) * self.split_c)
         temp = sorted(y_list)
@@ -219,21 +216,22 @@ class LightGBM(object):
 
 
 if __name__ == "__main__":
-    i = int(input("输入 1 训练, 输入 2 预测:\n"))
-    if i == 1:
-        f = "train"
-        dt = "whole"
-        L = LightGBM(flag=f, dt=dt)
-        L.train_model()
-    elif i == 2:
-        f = "predict"
-        dt = int(input("输入日期, 16-21:\n"))
-        L = LightGBM(flag=f, dt=dt)
-        L.evaluate_model()
-    # study = optuna.create_study(direction='maximize')
-    # study.optimize(L.bays_params, n_trials=100)
-    # print('Number of finished trials:', len(study.trials))
-    # print('Best trial:', study.best_trial.params)
+    # i = int(input("输入 1 训练, 输入 2 预测:\n"))
+    # if i == 1:
+    #     f = "train"
+    #     dt = "whole"
+    #     L = LightGBM(flag=f, dt=dt)
+    #     L.train_model()
+    # elif i == 2:
+    #     f = "predict"
+    #     dt = int(input("输入日期, 16-21:\n"))
+    #     L = LightGBM(flag=f, dt=dt)
+    #     L.evaluate_model()
+    L = LightGBM("train", "whole")
+    study = optuna.create_study(direction='maximize')
+    study.optimize(L.bays_params, n_trials=100)
+    print('Number of finished trials:', len(study.trials))
+    print('Best trial:', study.best_trial.params)
     # L.train_model()
     # L.evaluate_model()
     # L.feature_importance()