rov_train.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. import os
  2. import lightgbm as lgb
  3. import pandas as pd
  4. from sklearn.model_selection import train_test_split
  5. from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
  6. from config import set_config
  7. from utils import read_from_pickle, write_to_pickle
  8. from log import Log
# Module-level singletons created at import time:
# project configuration (used for file paths / filenames) and the project logger.
config_ = set_config()
log_ = Log()
  11. def process_data(filename):
  12. """
  13. 数据清洗、预处理
  14. :param filename: type-DataFrame
  15. :return: x, y, video_ids, features
  16. """
  17. # 获取数据
  18. data = read_from_pickle(filename)
  19. # 获取y,并将 y <= 0 的值更新为1
  20. data['futre7dayreturn'].loc[data['futre7dayreturn'] <= 0] = 1
  21. y = data['futre7dayreturn']
  22. # 获取视频id列
  23. video_ids = data['videoid']
  24. # 获取x
  25. drop_columns = ['videoid', 'dt', 'futre7dayreturn', 'videotags', 'words_without_tags']
  26. x = data.drop(columns=drop_columns)
  27. # 计算后一天的回流比前一天的回流差值
  28. x['stage_four_return_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
  29. x['stage_three_return_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
  30. x['stage_two_return_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
  31. # 计算后一天回流比前一天回流的增长率
  32. x['stage_four_return_ratio'] = x['stage_four_return_added'] / x['stage_four_retrn']
  33. x['stage_three_return_ratio'] = x['stage_three_return_added'] / x['stage_three_retrn']
  34. x['stage_two_return_ratio'] = x['stage_two_return_added'] / x['stage_two_retrn']
  35. # 缺失值填充为0
  36. x.fillna(0)
  37. # 获取当前所使用的特征列表
  38. features = list(x)
  39. return x, y, video_ids, features
  40. def train(x, y, features):
  41. """
  42. 训练模型
  43. :param x: X
  44. :param y: Y
  45. :param features: 特征列表
  46. :return: None
  47. """
  48. # 训练集、测试集分割
  49. x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
  50. log_.info('x_train shape: {}, y_train shape: {}'.format(x_train.shape, y_train.shape))
  51. log_.info('x_test shape: {}, y_test shape: {}'.format(x_test.shape, y_test.shape))
  52. # 训练参数设置
  53. params = {
  54. "objective": "regression",
  55. "reg_sqrt": True,
  56. "metric": "mape",
  57. "max_depth": -1,
  58. "num_leaves": 50,
  59. "learning_rate": 0.1,
  60. "bagging_fraction": 0.7,
  61. "feature_fraction": 0.7,
  62. "bagging_freq": 8,
  63. "bagging_seed": 2018,
  64. "lambda_l1": 0.11,
  65. "boosting": "dart",
  66. "nthread": 4,
  67. "verbosity": -1
  68. }
  69. # 初始化数据集
  70. train_set = lgb.Dataset(data=x_train, label=y_train)
  71. test_set = lgb.Dataset(data=x_test, label=y_test)
  72. # 模型训练
  73. evals_result = {}
  74. model = lgb.train(params=params, train_set=train_set, num_boost_round=5000,
  75. valid_sets=[test_set], early_stopping_rounds=100,
  76. verbose_eval=100, evals_result=evals_result)
  77. # 将模型特征重要度存入csv
  78. feature_importance_data = {'feature': features, 'feature_importance': model.feature_importance()}
  79. feature_importance_filename = 'model_feature_importance.csv'
  80. pack_result_to_csv(filename=feature_importance_filename, sort_columns=['feature_importance'],
  81. ascending=False, **feature_importance_data)
  82. # 测试集预测
  83. pre_y_test = model.predict(data=x_test, num_iteration=model.best_iteration)
  84. y_test = y_test.values
  85. err_mae = mean_absolute_error(y_test, pre_y_test)
  86. err_mape = mean_absolute_percentage_error(y_test, pre_y_test)
  87. r2 = r2_score(y_test, pre_y_test)
  88. # 将测试集结果存入csv
  89. test_data = {'pre_y_test': pre_y_test, 'y_test': y_test}
  90. test_result_filename = 'test_result.csv'
  91. pack_result_to_csv(filename=test_result_filename, sort_columns=['pre_y_test'], ascending=False, **test_data)
  92. print(err_mae, err_mape, r2)
  93. # 保存模型
  94. write_to_pickle(data=model, filename=config_.MODEL_FILENAME)
  95. def pack_result_to_csv(filename, sort_columns=None, filepath=config_.DATA_DIR_PATH, ascending=True, **data):
  96. """
  97. 打包数据并存入csv
  98. :param filename: csv文件名
  99. :param sort_columns: 指定排序列名列名,type-list, 默认为None
  100. :param filepath: csv文件存放路径,默认为config_.DATA_DIR_PATH
  101. :param ascending: 是否按指定列的数组升序排列,默认为True,即升序排列
  102. :param data: 数据
  103. :return: None
  104. """
  105. if not os.path.exists(filepath):
  106. os.makedirs(filepath)
  107. file = os.path.join(filepath, filename)
  108. df = pd.DataFrame(data=data)
  109. if sort_columns:
  110. df = df.sort_values(by=sort_columns, ascending=ascending)
  111. df.to_csv(file, index=False)
  112. def predict():
  113. """预测"""
  114. # 读取预测数据并进行清洗
  115. x, y, video_ids, _ = process_data(config_.PREDICT_DATA_FILENAME)
  116. # 获取训练好的模型
  117. model = read_from_pickle(filename=config_.MODEL_FILENAME)
  118. # 预测
  119. y_ = model.predict(x)
  120. # 打包预测结果存入csv
  121. predict_data = {'y_': y_, 'y': y, 'video_ids': video_ids}
  122. predict_result_filename = 'predict.csv'
  123. pack_result_to_csv(filename=predict_result_filename, sort_columns=['y_'], ascending=False, **predict_data)
  124. if __name__ == '__main__':
  125. train_filename = config_.TRAIN_DATA_FILENAME
  126. X, Y, videos, fea = process_data(filename=train_filename)
  127. print(X.shape, Y.shape)
  128. print(len(fea), fea)
  129. train(X, Y, features=fea)
  130. predict()