rov_train.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123
  1. import lightgbm as lgb
  2. from sklearn.model_selection import train_test_split
  3. from sklearn.metrics import mean_absolute_error, r2_score
  4. from config import set_config
  5. from utils import read_from_pickle
  6. from log import Log
  # Module-level configuration object; the script entry point reads
  # TRAIN_DATA_FILENAME from it.
  7. config_ = set_config()
  # Module-level logger; train() uses it to report the split shapes.
  8. log_ = Log()
  9. def process_data(filename):
  10. """
  11. 数据清洗、预处理
  12. :param filename: type-DataFrame
  13. :return: x, y, video_ids, features
  14. """
  15. # 获取数据
  16. data = read_from_pickle(filename)
  17. # 获取y,并将 y <= 0 的值更新为1
  18. data['futre7dayreturn'].loc[data['futre7dayreturn'] <= 0] = 1
  19. y = data['futre7dayreturn']
  20. # 获取视频id列
  21. video_ids = data['videoid']
  22. # 获取x
  23. drop_columns = ['videoid', 'dt', 'futre7dayreturn', 'videotags', 'words_without_tags']
  24. x = data.drop(columns=drop_columns)
  25. # 计算后一天的回流比前一天的回流差值
  26. x['stage_four_return_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
  27. x['stage_three_return_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
  28. x['stage_two_return_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
  29. # 计算后一天回流比前一天回流的增长率
  30. x['stage_four_return_ratio'] = x['stage_four_return_added'] / x['stage_four_retrn']
  31. x['stage_three_return_ratio'] = x['stage_three_return_added'] / x['stage_three_retrn']
  32. x['stage_two_return_ratio'] = x['stage_two_return_added'] / x['stage_two_retrn']
  33. # 缺失值填充为0
  34. x.fillna(0)
  35. # 获取当前所使用的特征列表
  36. features = list(x)
  37. return x, y, video_ids, features
  38. def train(x, y):
  39. """
  40. 训练模型
  41. :param x:
  42. :param y:
  43. :return:
  44. """
  45. # 训练集、测试集分割
  46. x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
  47. log_.info('x_train shape: {}, y_train shape: {}'.format(x_train.shape, y_train.shape))
  48. log_.info('x_test shape: {}, y_test shape: {}'.format(x_test.shape, y_test.shape))
  49. # 训练参数设置
  50. params = {
  51. "objective": "regression",
  52. "reg_sqrt": True,
  53. "metric": "mape",
  54. "max_depth": -1,
  55. "num_leaves": 50,
  56. "learning_rate": 0.1,
  57. "bagging_fraction": 0.7,
  58. "feature_fraction": 0.7,
  59. "bagging_freq": 8,
  60. "bagging_seed": 2018,
  61. "lambda_l1": 0.11,
  62. "boosting": "dart",
  63. "nthread": 4,
  64. "verbosity": -1
  65. }
  66. # 初始化数据集
  67. train_set = lgb.Dataset(data=x_train, label=y_train)
  68. test_set = lgb.Dataset(data=x_test, label=y_test)
  69. # 模型训练
  70. evals_result = {}
  71. model = lgb.train(params=params, train_set=train_set, num_boost_round=5000,
  72. valid_sets=[test_set], early_stopping_rounds=100,
  73. verbose_eval=100, evals_result=evals_result)
  74. # 测试集预测
  75. pre_test_y = model.predict(data=x_test, num_iteration=model.best_iteration)
  76. y_test = y_test.values
  77. err_mae = mean_absolute_error(y_test, pre_test_y)
  78. r2 = r2_score(y_test, pre_test_y)
  79. print(err_mae, r2)
  80. if __name__ == '__main__':
  81. # dt_test = '20211007'
  82. # project_test = 'usercdm'
  83. # table_test = 'rov_feature_add_v1'
  84. # res = get_rov_feature_table(dt_test, table_test)
  85. # res = get_data_with_date(date=dt_test, delta_days=2, project=project_test, table=table_test)
  86. # print(res.shape)
  87. # write_to_pickle(res, 'test.pickle')
  88. # data = read_from_pickle('test.pickle')
  89. # if data is not None:
  90. # print(data.shape, type(data))
  91. # print(list(data))
  92. # print(data[data['futre7dayreturn']<0])
  93. # else:
  94. # print(data)
  95. train_filename = config_.TRAIN_DATA_FILENAME
  96. x, y, videos, fea = process_data(filename=train_filename)
  97. print(x.shape, y.shape)
  98. print(len(fea), fea)
  99. train(x, y)