@@ -1,12 +1,16 @@
 import os
+import time
+
 import lightgbm as lgb
 import pandas as pd
 
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
+
 from config import set_config
-from utils import read_from_pickle, write_to_pickle
+from utils import read_from_pickle, write_to_pickle, data_normalization, request_post
 from log import Log
+from db_helper import RedisHelper
 
 config_ = set_config()
 log_ = Log()
@@ -98,7 +102,6 @@ def train(x, y, features):
     pre_y_test = model.predict(data=x_test, num_iteration=model.best_iteration)
     y_test = y_test.values
 
-    err_mae = mean_absolute_error(y_test, pre_y_test)
     err_mape = mean_absolute_percentage_error(y_test, pre_y_test)
     r2 = r2_score(y_test, pre_y_test)
 
@@ -107,7 +110,7 @@ def train(x, y, features):
     test_result_filename = 'test_result.csv'
     pack_result_to_csv(filename=test_result_filename, sort_columns=['pre_y_test'], ascending=False, **test_data)
 
-    print(err_mae, err_mape, r2)
+    log_.info('err_mape={}, r2={}'.format(err_mape, r2))
 
     # Save the model
     write_to_pickle(data=model, filename=config_.MODEL_FILENAME)
@@ -136,20 +139,50 @@ def predict():
     """Prediction"""
     # Read the prediction data and clean it
     x, y, video_ids, _ = process_data(config_.PREDICT_DATA_FILENAME)
+    log_.info('predict data shape: x={}'.format(x.shape))
     # Load the trained model
     model = read_from_pickle(filename=config_.MODEL_FILENAME)
     # Predict
     y_ = model.predict(x)
+    log_.info('predict finished!')
+    # Normalize the results to [0, 100]
+    normal_y_ = data_normalization(list(y_))
+    log_.info('normalization finished!')
     # Pack the prediction results into a csv
-    predict_data = {'y_': y_, 'y': y, 'video_ids': video_ids}
+    predict_data = {'normal_y_': normal_y_, 'y_': y_, 'y': y, 'video_ids': video_ids}
     predict_result_filename = 'predict.csv'
-    pack_result_to_csv(filename=predict_result_filename, sort_columns=['y_'], ascending=False, **predict_data)
+    pack_result_to_csv(filename=predict_result_filename, sort_columns=['normal_y_'], ascending=False, **predict_data)
+    # Upload to redis
+    redis_data = {}
+    json_data = []
+    for i in range(len(video_ids)):
+        redis_data[video_ids[i]] = normal_y_[i]
+        json_data.append({'videoId': video_ids[i], 'rovScore': normal_y_[i]})
+    key_name = config_.RECALL_KEY_NAME_PREFIX + time.strftime('%Y%m%d')
+    redis_helper = RedisHelper()
+    redis_helper.add_data_with_zset(key_name=key_name, data=redis_data)
+    log_.info('data to redis finished!')
+    # Notify the backend to update its data
+    result = request_post(request_url=config_.NOTIFY_BACKEND_UPDATE_ROV_SCORE_URL, request_data={'videos': json_data})
+    if result['code'] == 0:
+        log_.info('notify backend success!')
+    else:
+        log_.error('notify backend fail!')
 
 
 if __name__ == '__main__':
+    log_.info('rov model train start...')
+    train_start = time.time()
     train_filename = config_.TRAIN_DATA_FILENAME
     X, Y, videos, fea = process_data(filename=train_filename)
-    print(X.shape, Y.shape)
-    print(len(fea), fea)
+    log_.info('X_shape = {}, Y_shape = {}'.format(X.shape, Y.shape))
     train(X, Y, features=fea)
+    train_end = time.time()
+    log_.info('rov model train end, execute time = {}ms'.format((train_end - train_start) * 1000))
+
+    log_.info('rov model predict start...')
+    predict_start = time.time()
     predict()
+    predict_end = time.time()
+    log_.info('rov model predict end, execute time = {}ms'.format((predict_end - predict_start) * 1000))
+
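
Note on the normalization step: the patch relies on a data_normalization helper imported from utils to map the raw model scores onto [0, 100] before they are written to csv and pushed to Redis. That helper is not part of this diff; the sketch below is a minimal min-max version written only to illustrate the assumed behavior, not the project's actual implementation.

# Hypothetical sketch of utils.data_normalization (assumption: simple min-max
# scaling of a list of scores into [0, 100]; the real helper may differ).
def data_normalization(scores):
    min_score, max_score = min(scores), max(scores)
    if max_score == min_score:
        # Degenerate case: all scores identical, return the midpoint for each.
        return [50.0 for _ in scores]
    return [(s - min_score) / (max_score - min_score) * 100 for s in scores]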
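
The Redis upload and the backend notification go through two other project helpers, RedisHelper.add_data_with_zset and request_post, whose implementations also live outside this diff. Assuming they are thin wrappers around redis-py and requests, they might look roughly like the sketch below; the connection settings, expiry default, and fallback return value are assumptions for illustration only.

import redis
import requests

class RedisHelper:
    """Hypothetical wrapper around redis-py; host/port/db are placeholder values."""
    def __init__(self, host='127.0.0.1', port=6379, db=0):
        self.client = redis.Redis(host=host, port=port, db=db)

    def add_data_with_zset(self, key_name, data, expire_time=24 * 3600):
        # data is assumed to be a {member: score} mapping, which is what
        # redis-py's zadd expects; the expiry is an illustrative default.
        self.client.zadd(key_name, data)
        self.client.expire(key_name, expire_time)

def request_post(request_url, request_data):
    # Hypothetical POST helper; assumes the backend replies with JSON that
    # contains a 'code' field, as checked in predict() above.
    response = requests.post(url=request_url, json=request_data, timeout=10)
    if response.status_code == 200:
        return response.json()
    return {'code': response.status_code, 'msg': 'request failed'}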
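
For reference, the loop in predict() builds two parallel views of the same scores: redis_data maps each video id to its normalized score for the sorted set, and json_data is the list posted to the backend under the 'videos' key. With two hypothetical video ids the payloads would look like this (ids and scores are made up):

# Example payload shapes only; ids and scores are illustrative.
redis_data = {12345: 87.6, 67890: 42.1}
json_data = [
    {'videoId': 12345, 'rovScore': 87.6},
    {'videoId': 67890, 'rovScore': 42.1},
]
# request_post sends {'videos': json_data} to config_.NOTIFY_BACKEND_UPDATE_ROV_SCORE_URL.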