liqian 3 rokov pred
rodič
commit
117eafe1cc
1 zmenil súbory, kde vykonal 37 pridanie a 3 odobranie
  1. 37 3
      rov_train.py

+ 37 - 3
rov_train.py

@@ -54,6 +54,40 @@ def process_data(filename):
     return x, y, video_ids, features
 
 
+def process_predict_data(filename):
+    """
+    Clean and preprocess the prediction data (features only, no label is returned).
+    :param filename: pickle file handed to read_from_pickle; the loaded object is used as a DataFrame
+    :return: x (feature DataFrame), filtered_videos (list of str ids returned by the status filter)
+    """
+    # Load the data
+    data = read_from_pickle(filename)
+
+    # Video id column
+    video_ids = data['videoid']
+    # Video status filtering: ids are cast to int for the filter and back to str for isin()
+    video_id_list = [int(video_id) for video_id in video_ids]
+    filtered_videos = [str(item) for item in filter_video_status(video_ids=video_id_list)]
+    data = data.loc[data['videoid'].isin(filtered_videos)]
+
+    # Build x: every column except the ones listed below
+    drop_columns = ['videoid', 'dt', 'futre7dayreturn', 'videotags', 'words_without_tags']
+    x = data.drop(columns=drop_columns)
+    # Difference between each stage's return and the previous stage's return
+    x['stage_four_return_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
+    x['stage_three_return_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
+    x['stage_two_return_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
+    # Growth ratio: difference over the minuend stage; a zero denominator gives NaN (0/0) or inf (n/0)
+    x['stage_four_return_ratio'] = x['stage_four_return_added'] / x['stage_four_retrn']
+    x['stage_three_return_ratio'] = x['stage_three_return_added'] / x['stage_three_retrn']
+    x['stage_two_return_ratio'] = x['stage_two_return_added'] / x['stage_two_retrn']
+
+    # Fill missing values with 0; fillna returns a new frame, so assign it back (inf is left as is)
+    x = x.fillna(0)
+    # NOTE(review): filtered_videos follows filter_video_status' order, not x's row order - confirm
+    return x, filtered_videos
+
+
 def train(x, y, features):
     """
     训练模型
@@ -160,7 +194,7 @@ def pack_list_result_to_csv(filename, data, columns=None, sort_columns=None, fil
 def predict():
     """预测"""
     # 读取预测数据并进行清洗
-    x, y, video_ids, _ = process_data(config_.PREDICT_DATA_FILENAME)
+    x, video_ids = process_predict_data(config_.PREDICT_DATA_FILENAME)
     log_.info('predict data shape: x={}'.format(x.shape))
     # 获取训练好的模型
     model = read_from_pickle(filename=config_.MODEL_FILENAME)
@@ -175,7 +209,7 @@ def predict():
     # 按照normal_y_降序排序
     predict_data = []
     for i, video_id in enumerate(video_ids):
-        data = {'video_id': video_id, 'normal_y_': normal_y_[i], 'y_': y_[i], 'y': y[i]}
+        data = {'video_id': video_id, 'normal_y_': normal_y_[i], 'y_': y_[i]}
         predict_data.append(data)
     predict_data_sorted = sorted(predict_data, key=lambda temp: temp['normal_y_'], reverse=True)
 
@@ -197,7 +231,7 @@ def predict():
     predict_result_filename = 'predict.csv'
     pack_list_result_to_csv(filename=predict_result_filename,
                             data=predict_result,
-                            columns=['video_id', 'rov_score', 'normal_y_', 'y_', 'y'],
+                            columns=['video_id', 'rov_score', 'normal_y_', 'y_'],
                             sort_columns=['rov_score'],
                             ascending=False)