"""Train and evaluate a tuned LightGBM regressor on the ROV training data.

The previous revision carried its grid-search history as commented-out code;
it is condensed here so the tuning record survives without the clutter.
All searches used GridSearchCV(scoring='r2', cv=5) on LGBMRegressor with
learning_rate=0.1, n_estimators=1000 unless noted:

    lgb.cv baseline              -> best n_estimators=1000, cv mape=0.1578
    max_depth / num_leaves       -> {'max_depth': 3, 'num_leaves': 5}          r2=0.9264
    max_bin / min_data_in_leaf   -> {'max_bin': 125, 'min_data_in_leaf': 21}   r2=0.7918
    fractions / bagging_freq     -> {'bagging_fraction': 0.6, 'bagging_freq': 0,
                                     'feature_fraction': 0.9}                  r2=0.9004
    lambda_l1 / lambda_l2        -> {'lambda_l1': 0.7, 'lambda_l2': 0.9}       r2=0.7538
    min_split_gain               -> {'min_split_gain': 0.0}                    r2=0.8904

    Hold-out results of earlier fits:
        tuned, lr=0.01, n_estimators=7000 -> mape=0.7408, r2=0.9069
        default LGBMRegressor             -> mape=0.2117, r2=0.8940
"""
import pandas as pd  # noqa: F401 -- needed if the archived lgb.cv run is restored
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV  # noqa: F401 -- for re-running the searches above
from sklearn.metrics import mean_absolute_percentage_error, r2_score

from rov_train import process_data
from config import set_config


def main():
    """Load the training data, fit the tuned model, print hold-out mape/r2."""
    config_, env = set_config()
    train_filename = config_.TRAIN_DATA_FILENAME
    # process_data returns (features, target, videos, fea); only X/Y are used here.
    X, Y, videos, fea = process_data(filename=train_filename)

    # random_state pins the split so the printed metrics are reproducible
    # (the original split was unseeded, so every run scored a different split).
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.33, random_state=0
    )

    # Tuned model, lr/n_estimators chosen between the two earlier runs above.
    # Canonical sklearn-API parameter names are used instead of LightGBM's
    # native aliases (subsample == bagging_fraction, subsample_freq ==
    # bagging_freq, colsample_bytree == feature_fraction, reg_alpha/reg_lambda
    # == lambda_l1/lambda_l2, min_child_samples == min_data_in_leaf) -- same
    # values and behavior, without the alias warnings.
    # NOTE(review): subsample_freq=0 disables bagging entirely, which makes
    # subsample=0.6 inert -- confirm whether bagging was meant to be enabled.
    model = lgb.LGBMRegressor(
        boosting_type='gbdt',
        objective='regression',
        metric='mape',
        learning_rate=0.08,
        n_estimators=3000,
        max_depth=3,
        num_leaves=5,
        max_bin=125,
        min_child_samples=21,
        subsample=0.6,
        subsample_freq=0,
        colsample_bytree=0.9,
        reg_alpha=0.7,
        reg_lambda=0.9,
        min_split_gain=0,
    )
    model.fit(x_train, y_train)

    y_pre = model.predict(x_test)
    print('mape: ', mean_absolute_percentage_error(y_test, y_pre))
    print('r2: ', r2_score(y_test, y_pre))


if __name__ == '__main__':
    main()