|
@@ -54,6 +54,40 @@ def process_data(filename):
|
|
|
return x, y, video_ids, features
|
|
|
|
|
|
|
|
|
def process_predict_data(filename):
    """
    Clean and preprocess the data used for prediction.

    :param filename: name of the pickle file handed to ``read_from_pickle``;
        the loaded object is used as a pandas DataFrame
    :return: (x, video_ids) - the feature DataFrame and the list of video
        ids, where ``video_ids[i]`` is the id of row ``i`` of ``x``
    """
    data = read_from_pickle(filename)

    # filter_video_status is called with integer ids; its result is converted
    # back to strings so it can be matched against the 'videoid' column.
    # NOTE(review): presumably it returns the subset of ids that are still
    # valid for recommendation - confirm against its definition.
    video_id_list = [int(video_id) for video_id in data['videoid']]
    filtered_videos = [str(item) for item in filter_video_status(video_ids=video_id_list)]
    data = data.loc[data['videoid'].isin(filtered_videos)]

    # Take the ids from the filtered frame itself rather than returning
    # filtered_videos: the caller pairs ids with predictions by position, so
    # the ids must follow the row order (and row count) of x exactly.
    video_ids = data['videoid'].tolist()

    # Columns that are identifiers, labels or raw text, not model features.
    drop_columns = ['videoid', 'dt', 'futre7dayreturn', 'videotags', 'words_without_tags']
    x = data.drop(columns=drop_columns)

    # Increment of the return count between consecutive stages.
    x['stage_four_return_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
    x['stage_three_return_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
    x['stage_two_return_added'] = x['stage_two_retrn'] - x['stage_one_retrn']

    # Share of each stage's return count that was added during that stage.
    x['stage_four_return_ratio'] = x['stage_four_return_added'] / x['stage_four_retrn']
    x['stage_three_return_ratio'] = x['stage_three_return_added'] / x['stage_three_retrn']
    x['stage_two_return_ratio'] = x['stage_two_return_added'] / x['stage_two_retrn']

    # fillna returns a new DataFrame; the result has to be assigned back,
    # otherwise the NaNs (e.g. from 0 / 0 in the ratios above) stay in x.
    x = x.fillna(0)

    return x, video_ids
|
|
|
+
|
|
|
+
|
|
|
def train(x, y, features):
|
|
|
"""
|
|
|
训练模型
|
|
@@ -160,7 +194,7 @@ def pack_list_result_to_csv(filename, data, columns=None, sort_columns=None, fil
|
|
|
def predict():
|
|
|
"""预测"""
|
|
|
|
|
|
- x, y, video_ids, _ = process_data(config_.PREDICT_DATA_FILENAME)
|
|
|
+ x, video_ids = process_predict_data(config_.PREDICT_DATA_FILENAME)
|
|
|
log_.info('predict data shape: x={}'.format(x.shape))
|
|
|
|
|
|
model = read_from_pickle(filename=config_.MODEL_FILENAME)
|
|
@@ -175,7 +209,7 @@ def predict():
|
|
|
|
|
|
predict_data = []
|
|
|
for i, video_id in enumerate(video_ids):
|
|
|
- data = {'video_id': video_id, 'normal_y_': normal_y_[i], 'y_': y_[i], 'y': y[i]}
|
|
|
+ data = {'video_id': video_id, 'normal_y_': normal_y_[i], 'y_': y_[i]}
|
|
|
predict_data.append(data)
|
|
|
predict_data_sorted = sorted(predict_data, key=lambda temp: temp['normal_y_'], reverse=True)
|
|
|
|
|
@@ -197,7 +231,7 @@ def predict():
|
|
|
predict_result_filename = 'predict.csv'
|
|
|
pack_list_result_to_csv(filename=predict_result_filename,
|
|
|
data=predict_result,
|
|
|
- columns=['video_id', 'rov_score', 'normal_y_', 'y_', 'y'],
|
|
|
+ columns=['video_id', 'rov_score', 'normal_y_', 'y_'],
|
|
|
sort_columns=['rov_score'],
|
|
|
ascending=False)
|
|
|
|