# params_search.py — LightGBM hyper-parameter search for the ROV regression model.
  1. import pandas as pd
  2. import lightgbm as lgb
  3. from sklearn.model_selection import train_test_split
  4. from sklearn.model_selection import GridSearchCV
  5. from sklearn.metrics import mean_absolute_percentage_error, r2_score
  6. from rov_train import process_data
  7. from config import set_config
  8. config_, env = set_config()
  9. train_filename = config_.TRAIN_DATA_FILENAME
  10. X, Y, videos, fea = process_data(filename=train_filename)
  11. x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)
  12. # params = {
  13. # 'boosting_type': 'gbdt',
  14. # 'objective': 'regression',
  15. # 'metric': 'mape',
  16. # 'nthread': 4,
  17. # 'learning_rate': 0.1,
  18. # 'num_leaves': 30,
  19. # 'max_depth': 5,
  20. # 'subsample': 0.8,
  21. # 'colsample_bytree': 0.8,
  22. # }
  23. # data_train = lgb.Dataset(x_train, y_train)
  24. # cv_results = lgb.cv(
  25. # params,
  26. # data_train,
  27. # num_boost_round=1000,
  28. # nfold=5,
  29. # stratified=False,
  30. # shuffle=True,
  31. # metrics='mape',
  32. # early_stopping_rounds=50,
  33. # seed=0
  34. # )
  35. # print('cv_results: ', cv_results)
  36. # print('best n_estimators: ', len(cv_results['mape-mean']))
  37. # print('best cv score: ', pd.Series(cv_results['mape-mean']).min())
  38. # best n_estimators: 1000
  39. # best cv score: 0.1578205729958749
  40. # params_test1 = {'max_depth': range(3, 8, 1), 'num_leaves': range(5, 100, 10)}
  41. # gsearch1 = GridSearchCV(
  42. # estimator=lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metrics='mape', learning_rate=0.1,
  43. # n_estimators=1000, max_depth=6, bagging_fraction=0.8, feature_fraction=0.8),
  44. # param_grid=params_test1, scoring='r2', cv=5, n_jobs=-1)
  45. # gsearch1.fit(x_train, y_train)
  46. # print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)
  47. # {'max_depth': 3, 'num_leaves': 5} 0.9263654009553877
  48. # params_test2 = {'max_bin': range(5, 256, 10), 'min_data_in_leaf': range(1, 102, 10)}
  49. #
  50. # gsearch2 = GridSearchCV(
  51. # estimator=lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metrics='mape', learning_rate=0.1,
  52. # n_estimators=1000, max_depth=3, num_leaves=5, bagging_fraction=0.8,
  53. # feature_fraction=0.8),
  54. # param_grid=params_test2, scoring='r2', cv=5, n_jobs=-1)
  55. # gsearch2.fit(x_train, y_train)
  56. # print(gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_)
  57. # {'max_bin': 125, 'min_data_in_leaf': 21} 0.7918455469932647
  58. # params_test3 = {'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
  59. # 'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
  60. # 'bagging_freq': range(0, 81, 10)
  61. # }
  62. #
  63. # gsearch3 = GridSearchCV(
  64. # estimator=lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metrics='mape', learning_rate=0.1,
  65. # n_estimators=1000, max_depth=3, num_leaves=5, max_bin=125, min_data_in_leaf=21),
  66. # param_grid=params_test3, scoring='r2', cv=5, n_jobs=-1)
  67. # gsearch3.fit(x_train, y_train)
  68. # print(gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_)
  69. # {'bagging_fraction': 0.6, 'bagging_freq': 0, 'feature_fraction': 0.9} 0.9003939890191465
  70. # params_test4 = {'lambda_l1': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
  71. # 'lambda_l2': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
  72. # }
  73. #
  74. # gsearch4 = GridSearchCV(
  75. # estimator=lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metrics='mape', learning_rate=0.1,
  76. # n_estimators=1000, max_depth=3, num_leaves=5, max_bin=125, min_data_in_leaf=21,
  77. # bagging_fraction=0.6, bagging_freq=0, feature_fraction=0.9),
  78. # param_grid=params_test4, scoring='r2', cv=5, n_jobs=-1)
  79. # gsearch4.fit(x_train, y_train)
  80. # print(gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_)
  81. # {'lambda_l1': 0.7, 'lambda_l2': 0.9} 0.7538021411131314
  82. # params_test5 = {'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
  83. # gsearch5 = GridSearchCV(
  84. # estimator=lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metrics='mape', learning_rate=0.1,
  85. # n_estimators=1000, max_depth=3, num_leaves=5, max_bin=125, min_data_in_leaf=21,
  86. # bagging_fraction=0.6, bagging_freq=0, feature_fraction=0.9,
  87. # lambda_l1=0.7, lambda_l2=0.9),
  88. # param_grid=params_test5, scoring='r2', cv=5, n_jobs=-1)
  89. # gsearch5.fit(x_train, y_train)
  90. # print(gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_)
  91. #
  92. # {'min_split_gain': 0.0} 0.8903744931054016
  93. # model = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metrics='mape', learning_rate=0.01,
  94. # n_estimators=7000, max_depth=3, num_leaves=5, max_bin=125, min_data_in_leaf=21,
  95. # bagging_fraction=0.6, bagging_freq=0, feature_fraction=0.9,
  96. # lambda_l1=0.7, lambda_l2=0.9, min_split_gain=0)
  97. # model.fit(x_train, y_train)
  98. # y_pre = model.predict(x_test)
  99. # print('mape: ', mean_absolute_percentage_error(y_test, y_pre))
  100. # print('r2: ', r2_score(y_test, y_pre))
  101. # mape: 0.7407816559793686
  102. # r2: 0.9069208480190998
  103. # model = lgb.LGBMRegressor()
  104. # model.fit(x_train, y_train)
  105. # y_pre = model.predict(x_test)
  106. # print('mape: ', mean_absolute_percentage_error(y_test, y_pre))
  107. # print('r2: ', r2_score(y_test, y_pre))
  108. # mape: 0.211698833035155
  109. # r2: 0.893979193478978
  110. model = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metrics='mape', learning_rate=0.08,
  111. n_estimators=3000, max_depth=3, num_leaves=5, max_bin=125, min_data_in_leaf=21,
  112. bagging_fraction=0.6, bagging_freq=0, feature_fraction=0.9,
  113. lambda_l1=0.7, lambda_l2=0.9, min_split_gain=0)
  114. model.fit(x_train, y_train)
  115. y_pre = model.predict(x_test)
  116. print('mape: ', mean_absolute_percentage_error(y_test, y_pre))
  117. print('r2: ', r2_score(y_test, y_pre))