import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

from config import set_config
from utils import read_from_pickle
from log import Log

config_ = set_config()
log_ = Log()


def process_data(filename):
    """
    Load the training data and build the feature matrix and target.

    :param filename: value passed to ``read_from_pickle``; the loaded object
        is used as a pandas DataFrame.
    :return: x, y, video_ids, features -- the feature DataFrame, the target
        column ``futre7dayreturn``, the ``videoid`` column, and the list of
        feature column names in ``x``.
    """
    # Load the data.
    data = read_from_pickle(filename)

    # Target: every value <= 0 is overwritten with 1.
    # A single .loc call is used instead of chained indexing
    # (data[col].loc[mask] = ...), which may write to a temporary copy and
    # leave `data` unchanged.
    target_col = 'futre7dayreturn'
    data.loc[data[target_col] <= 0, target_col] = 1
    y = data[target_col]

    # Video id column.
    video_ids = data['videoid']

    # Features: everything except identifiers, the target and text columns.
    drop_columns = ['videoid', 'dt', 'futre7dayreturn', 'videotags', 'words_without_tags']
    x = data.drop(columns=drop_columns)

    # (later stage, earlier stage) pairs, in the order the derived columns
    # are appended to x.
    stage_pairs = [('four', 'three'), ('three', 'two'), ('two', 'one')]

    # Difference between each stage's return and the previous stage's return.
    for later, earlier in stage_pairs:
        x['stage_{}_return_added'.format(later)] = (
            x['stage_{}_retrn'.format(later)] - x['stage_{}_retrn'.format(earlier)]
        )

    # Growth ratio: the difference divided by the later stage's return.
    # NOTE(review): a zero denominator with a non-zero difference yields
    # +/-inf, which fillna does not touch -- confirm whether that is intended.
    for later, _ in stage_pairs:
        x['stage_{}_return_ratio'.format(later)] = (
            x['stage_{}_return_added'.format(later)] / x['stage_{}_retrn'.format(later)]
        )

    # Fill missing values with 0. fillna returns a new DataFrame, so the
    # result must be assigned back; a bare x.fillna(0) has no effect.
    x = x.fillna(0)

    # Names of the features currently in use.
    features = list(x.columns)
    return x, y, video_ids, features


def train(x, y):
    """
    Train a LightGBM regression model and print its hold-out metrics.

    The data is split 67% / 33% into train and test sets; the test set is
    also used as the validation set during training. The mean absolute error
    and R^2 on the test set are printed; nothing is returned.

    :param x: feature DataFrame
    :param y: target Series
    :return: None
    """
    # Train / test split.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
    log_.info('x_train shape: {}, y_train shape: {}'.format(x_train.shape, y_train.shape))
    log_.info('x_test shape: {}, y_test shape: {}'.format(x_test.shape, y_test.shape))

    # Training parameters.
    params = {
        "objective": "regression",
        "reg_sqrt": True,
        "metric": "mape",
        "max_depth": -1,
        "num_leaves": 50,
        "learning_rate": 0.1,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 8,
        "bagging_seed": 2018,
        "lambda_l1": 0.11,
        "boosting": "dart",
        "nthread": 4,
        "verbosity": -1
    }

    # Build the LightGBM datasets.
    train_set = lgb.Dataset(data=x_train, label=y_train)
    test_set = lgb.Dataset(data=x_test, label=y_test)

    # Train the model.
    # NOTE(review): the early_stopping_rounds / verbose_eval / evals_result
    # keyword arguments are believed to have been removed from lgb.train in
    # LightGBM 4.0 in favour of callbacks -- confirm the pinned version.
    # NOTE(review): early stopping is reportedly unavailable with
    # boosting="dart" -- confirm it has the intended effect here.
    evals_result = {}
    model = lgb.train(params=params, train_set=train_set, num_boost_round=5000,
                      valid_sets=[test_set], early_stopping_rounds=100,
                      verbose_eval=100, evals_result=evals_result)

    # Predict on the test set and report the metrics.
    pre_test_y = model.predict(data=x_test, num_iteration=model.best_iteration)
    y_test = y_test.values

    err_mae = mean_absolute_error(y_test, pre_test_y)
    r2 = r2_score(y_test, pre_test_y)
    print(err_mae, r2)


if __name__ == '__main__':
    # Ad-hoc data-fetching / inspection snippet, kept commented out:
    # dt_test = '20211007'
    # project_test = 'usercdm'
    # table_test = 'rov_feature_add_v1'
    # res = get_rov_feature_table(dt_test, table_test)
    # res = get_data_with_date(date=dt_test, delta_days=2, project=project_test, table=table_test)
    # print(res.shape)
    # write_to_pickle(res, 'test.pickle')

    # data = read_from_pickle('test.pickle')
    # if data is not None:
    #     print(data.shape, type(data))
    #     print(list(data))
    #     print(data[data['futre7dayreturn']<0])
    # else:
    #     print(data)

    train_filename = config_.TRAIN_DATA_FILENAME
    x, y, videos, fea = process_data(filename=train_filename)
    print(x.shape, y.shape)
    print(len(fea), fea)
    train(x, y)