|
@@ -47,10 +47,25 @@ class LightGBM(object):
|
|
|
]
|
|
|
self.split_c = 0.7
|
|
|
self.yc = 0.8
|
|
|
- self.model = "lightgbm_0327_spider_v2.bin"
|
|
|
+ self.model = "models/lightgbm_0401_spider.bin"
|
|
|
self.flag = flag
|
|
|
self.dt = dt
|
|
|
|
|
|
def read_data(self, path="data/train_data_20240401.json"):
    """
    Read the training set from a local JSON file and prepare it for training.

    The file is parsed with ``pd.read_json``. The ``label`` column becomes
    the target; every other column is returned as a feature. Columns named
    in ``self.float_columns`` are converted with ``pd.to_numeric`` (values
    that cannot be parsed become NaN) and columns named in
    ``self.str_columns`` are passed through ``self.label_encoder``.

    :param path: location of the JSON file; defaults to the snapshot that
        used to be hard-coded here, so existing callers are unaffected.
    :return: ``(features, labels)`` - the feature DataFrame and the
        ``label`` column.
    :raises KeyError: if the file does not contain a ``label`` column.
    """
    df = pd.read_json(path)
    if "label" not in df.columns:
        # Same exception type a plain df['label'] would raise, but with a
        # message that points at the offending file.
        raise KeyError("'label' column not found in {}".format(path))

    labels = df["label"]
    features = df.drop(columns="label")

    for key in self.float_columns:
        features[key] = pd.to_numeric(features[key], errors="coerce")

    # NOTE(review): one encoder object is re-fitted for every column. If it
    # is a scikit-learn LabelEncoder (presumably - confirm in __init__), it
    # only remembers the classes of the last column after this loop.
    for key in self.str_columns:
        features[key] = self.label_encoder.fit_transform(features[key])

    return features, labels
|
|
|
+
|
|
|
def bays_params(self, trial):
|
|
|
"""
|
|
|
Bayesian parameters for
|
|
@@ -62,7 +77,7 @@ class LightGBM(object):
|
|
|
'metric': 'binary_logloss',
|
|
|
'verbosity': -1,
|
|
|
'boosting_type': 'gbdt',
|
|
|
- 'num_leaves': trial.suggest_int('num_leaves', 20, 40),
|
|
|
+ 'num_leaves': trial.suggest_int('num_leaves', 10, 40),
|
|
|
'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
|
|
|
'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
|
|
|
'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
|
|
@@ -70,8 +85,10 @@ class LightGBM(object):
|
|
|
'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
|
|
|
"num_threads": 16, # 线程数量
|
|
|
}
|
|
|
- X_train, X_test = self.generate_x_data()
|
|
|
- Y_train, Y_test = self.generate_y_data()
|
|
|
+ x, y = self.read_data()
|
|
|
+ train_size = int(len(x) * self.split_c)
|
|
|
+ X_train, X_test = x[:train_size], x[train_size:]
|
|
|
+ Y_train, Y_test = y[:train_size], y[train_size:]
|
|
|
train_data = lgb.Dataset(
|
|
|
X_train,
|
|
|
label=Y_train,
|
|
@@ -84,42 +101,6 @@ class LightGBM(object):
|
|
|
accuracy = accuracy_score(Y_test, pred_labels)
|
|
|
return accuracy
|
|
|
|
|
|
def generate_x_data(self):
    """
    Load the feature rows for ``self.flag`` / ``self.dt`` and split them
    into a training frame and a test frame.

    The rows are read from
    ``data/produce_data/x_data_total_return_<flag>_<dt>_spider.json`` and
    given the column names in ``self.my_c``. Columns in
    ``self.str_columns`` are passed through ``self.label_encoder`` and
    columns in ``self.float_columns`` are converted with ``pd.to_numeric``
    (unparseable values become NaN). The first ``split_c`` fraction of the
    rows is the training part; the remainder is the test part.

    The encoding is done once on the full frame *before* splitting.
    Fitting the encoder separately on each split can assign different
    integer codes to the same category in train and test.

    :return: ``(X_train, X_test)`` - two DataFrames, each with a fresh
        0-based index.
    """
    path = "data/produce_data/x_data_total_return_{}_{}_spider.json".format(self.flag, self.dt)
    with open(path, encoding="utf-8") as f1:
        x_list = json.load(f1)

    index_t = int(len(x_list) * self.split_c)
    frame = pd.DataFrame(x_list, columns=self.my_c)

    # Encode on the whole frame so both splits share one category mapping.
    for key in self.str_columns:
        frame[key] = self.label_encoder.fit_transform(frame[key])
    for key in self.float_columns:
        frame[key] = pd.to_numeric(frame[key], errors="coerce")

    # reset_index keeps the 0-based index each split had when it was built
    # from its own list slice.
    X_train = frame.iloc[:index_t].reset_index(drop=True)
    X_test = frame.iloc[index_t:].reset_index(drop=True)
    return X_train, X_test
|
|
|
-
|
|
|
def generate_y_data(self):
    """
    Load the raw target values for ``self.flag`` / ``self.dt``, binarise
    them against a quantile threshold and split them into train and test.

    The values are read from
    ``data/produce_data/y_data_total_return_<flag>_<dt>_spider.json``.
    The threshold is the element at position ``int(len * self.yc) - 1`` of
    the sorted values. Values less than or equal to it become 0 and values
    above it become 1. The first ``split_c`` fraction of the labels is the
    training part; the remainder is the test part.

    :return: ``(y_train, y_test)`` as NumPy arrays of 0/1 labels.
    :raises ValueError: if the file contains no values, because no
        threshold can be derived from an empty sample.
    """
    path = "data/produce_data/y_data_total_return_{}_{}_spider.json".format(self.flag, self.dt)
    with open(path, encoding="utf-8") as f2:
        y_list = json.load(f2)

    if not y_list:
        raise ValueError("no target values found in {}".format(path))

    index_t = int(len(y_list) * self.split_c)

    # Clamp at 0: for a very small sample int(len * yc) can be 0, and the
    # resulting index -1 would silently select the maximum instead.
    cut = max(int(len(y_list) * self.yc) - 1, 0)
    yuzhi = sorted(y_list)[cut]
    print("阈值是: {}".format(yuzhi))

    binary = [0 if value <= yuzhi else 1 for value in y_list]
    y_train = np.array(binary[:index_t])
    y_test = np.array(binary[index_t:])
    return y_train, y_test
|
|
|
-
|
|
|
def train_model(self):
|
|
|
"""
|
|
|
Load dataset
|
|
@@ -210,20 +191,20 @@ class LightGBM(object):
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run an Optuna hyper-parameter search that
    # maximises the value returned by LightGBM.bays_params.
    #
    # The earlier interactive entry point (1 = train via train_model(),
    # 2 = predict via evaluate_model() + feature_importance()) is disabled
    # while the search is the active mode. It is described here instead of
    # being kept as commented-out code.
    tuner = LightGBM("train", "whole")
    study = optuna.create_study(direction='maximize')
    study.optimize(tuner.bays_params, n_trials=100)
    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)
|