# rov_train.py
  1. import os
  2. import random
  3. import time
  4. import lightgbm as lgb
  5. import pandas as pd
  6. from sklearn.model_selection import train_test_split
  7. from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error
  8. from my_config import set_config
  9. from my_utils import read_from_pickle, write_to_pickle, data_normalization, \
  10. request_post, filter_video_status, update_video_w_h_rate, filter_video_status_app, filter_shield_video
  11. from log import Log
  12. from db_helper import RedisHelper, MysqlHelper
  13. config_, env = set_config()
  14. log_ = Log()
  15. def process_data(filename):
  16. """
  17. 数据清洗、预处理
  18. :param filename: type-DataFrame
  19. :return: x, y, video_ids, features
  20. """
  21. # 获取数据
  22. data = read_from_pickle(filename)
  23. # 获取y,并将 y <= 0 的值更新为1
  24. data['futre7dayreturn'].loc[data['futre7dayreturn'] <= 0] = 1
  25. y = data['futre7dayreturn']
  26. # 获取视频id列
  27. video_ids = data['videoid']
  28. # 获取x
  29. drop_columns = ['videoid', 'dt', 'futre7dayreturn', 'videotags', 'words_without_tags']
  30. x = data.drop(columns=drop_columns)
  31. # 计算后一天的回流比前一天的回流差值
  32. x['stage_four_return_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
  33. x['stage_three_return_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
  34. x['stage_two_return_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
  35. # 计算后一天回流比前一天回流的增长率
  36. x['stage_four_return_ratio'] = x['stage_four_return_added'] / x['stage_four_retrn']
  37. x['stage_three_return_ratio'] = x['stage_three_return_added'] / x['stage_three_retrn']
  38. x['stage_two_return_ratio'] = x['stage_two_return_added'] / x['stage_two_retrn']
  39. # 缺失值填充为0
  40. x.fillna(0, inplace=True)
  41. # 获取当前所使用的特征列表
  42. features = list(x)
  43. return x, y, video_ids, features
  44. def process_predict_data(filename):
  45. """
  46. 预测数据清洗、预处理
  47. :param filename: type-DataFrame
  48. :return: x, y, video_ids, features
  49. """
  50. # 获取数据
  51. data = read_from_pickle(filename)
  52. # 获取视频id列
  53. video_ids = data['videoid']
  54. # 视频状态过滤
  55. video_id_list = [int(video_id) for video_id in video_ids]
  56. filtered_videos = [str(item) for item in filter_video_status(video_ids=video_id_list)]
  57. data = data.loc[data['videoid'].isin(filtered_videos)]
  58. video_id_final = data['videoid']
  59. # 获取x
  60. drop_columns = ['videoid', 'dt', 'futre7dayreturn', 'videotags', 'words_without_tags']
  61. x = data.drop(columns=drop_columns)
  62. # 计算后一天的回流比前一天的回流差值
  63. x['stage_four_return_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
  64. x['stage_three_return_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
  65. x['stage_two_return_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
  66. # 计算后一天回流比前一天回流的增长率
  67. x['stage_four_return_ratio'] = x['stage_four_return_added'] / x['stage_four_retrn']
  68. x['stage_three_return_ratio'] = x['stage_three_return_added'] / x['stage_three_retrn']
  69. x['stage_two_return_ratio'] = x['stage_two_return_added'] / x['stage_two_retrn']
  70. # 缺失值填充为0
  71. x.fillna(0, inplace=True)
  72. return x, video_id_final
  73. def train(x, y, features):
  74. """
  75. 训练模型
  76. :param x: X
  77. :param y: Y
  78. :param features: 特征列表
  79. :return: None
  80. """
  81. # 训练集、测试集分割
  82. x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
  83. log_.info('x_train shape: {}, y_train shape: {}'.format(x_train.shape, y_train.shape))
  84. log_.info('x_test shape: {}, y_test shape: {}'.format(x_test.shape, y_test.shape))
  85. # 训练参数设置
  86. params = {
  87. "objective": "regression",
  88. "reg_sqrt": True,
  89. "metric": "mape",
  90. "max_depth": -1,
  91. "num_leaves": 50,
  92. "learning_rate": 0.1,
  93. "bagging_fraction": 0.7,
  94. "feature_fraction": 0.7,
  95. "bagging_freq": 8,
  96. "bagging_seed": 2018,
  97. "lambda_l1": 0.11,
  98. "boosting": "dart",
  99. "nthread": 4,
  100. "verbosity": -1
  101. }
  102. # 初始化数据集
  103. train_set = lgb.Dataset(data=x_train, label=y_train)
  104. test_set = lgb.Dataset(data=x_test, label=y_test)
  105. # 模型训练
  106. evals_result = {}
  107. model = lgb.train(params=params, train_set=train_set, num_boost_round=5000,
  108. valid_sets=[test_set], early_stopping_rounds=100,
  109. verbose_eval=100, evals_result=evals_result)
  110. # 将模型特征重要度存入csv
  111. feature_importance_data = {'feature': features, 'feature_importance': model.feature_importance()}
  112. feature_importance_filename = 'model_feature_importance.csv'
  113. pack_result_to_csv(filename=feature_importance_filename, sort_columns=['feature_importance'],
  114. ascending=False, **feature_importance_data)
  115. # 测试集预测
  116. pre_y_test = model.predict(data=x_test, num_iteration=model.best_iteration)
  117. y_test = y_test.values
  118. err_mape = mean_absolute_percentage_error(y_test, pre_y_test)
  119. r2 = r2_score(y_test, pre_y_test)
  120. # 将测试集结果存入csv
  121. test_data = {'pre_y_test': pre_y_test, 'y_test': y_test}
  122. test_result_filename = 'test_result.csv'
  123. pack_result_to_csv(filename=test_result_filename, sort_columns=['pre_y_test'], ascending=False, **test_data)
  124. log_.info('err_mape={}, r2={}'.format(err_mape, r2))
  125. # 保存模型
  126. write_to_pickle(data=model, filename=config_.MODEL_FILENAME)
  127. def pack_result_to_csv(filename, sort_columns=None, filepath=config_.DATA_DIR_PATH, ascending=True, **data):
  128. """
  129. 打包数据并存入csv
  130. :param filename: csv文件名
  131. :param sort_columns: 指定排序列名列名,type-list, 默认为None
  132. :param filepath: csv文件存放路径,默认为config_.DATA_DIR_PATH
  133. :param ascending: 是否按指定列的数组升序排列,默认为True,即升序排列
  134. :param data: 数据, type-dict
  135. :return: None
  136. """
  137. if not os.path.exists(filepath):
  138. os.makedirs(filepath)
  139. file = os.path.join(filepath, filename)
  140. df = pd.DataFrame(data=data)
  141. if sort_columns:
  142. df = df.sort_values(by=sort_columns, ascending=ascending)
  143. df.to_csv(file, index=False)
  144. def pack_list_result_to_csv(filename, data, columns=None, sort_columns=None, filepath=config_.DATA_DIR_PATH, ascending=True):
  145. """
  146. 打包数据并存入csv, 数据为字典列表
  147. :param filename: csv文件名
  148. :param data: 数据,type-list [{}, {},...]
  149. :param columns: 列名顺序
  150. :param sort_columns: 指定排序列名列名,type-list, 默认为None
  151. :param filepath: csv文件存放路径,默认为config_.DATA_DIR_PATH
  152. :param ascending: 是否按指定列的数组升序排列,默认为True,即升序排列
  153. :return: None
  154. """
  155. if not os.path.exists(filepath):
  156. os.makedirs(filepath)
  157. file = os.path.join(filepath, filename)
  158. df = pd.DataFrame(data=data)
  159. if sort_columns:
  160. df = df.sort_values(by=sort_columns, ascending=ascending)
  161. df.to_csv(file, index=False, columns=columns)
  162. def predict():
  163. """预测"""
  164. # 读取预测数据并进行清洗
  165. x, video_ids = process_predict_data(config_.PREDICT_DATA_FILENAME)
  166. log_.info('predict data shape: x={}'.format(x.shape))
  167. # 获取训练好的模型
  168. model = read_from_pickle(filename=config_.MODEL_FILENAME)
  169. # 预测
  170. y_ = model.predict(x)
  171. log_.info('predict finished!')
  172. # 将结果进行归一化到[0, 100]
  173. normal_y_ = data_normalization(list(y_))
  174. log_.info('normalization finished!')
  175. # 按照normal_y_降序排序
  176. predict_data = []
  177. for i, video_id in enumerate(video_ids):
  178. data = {'video_id': video_id, 'normal_y_': normal_y_[i], 'y_': y_[i]}
  179. predict_data.append(data)
  180. predict_data_sorted = sorted(predict_data, key=lambda temp: temp['normal_y_'], reverse=True)
  181. # 按照排序,从100以固定差值做等差递减,以该值作为rovScore
  182. predict_result = []
  183. redis_data = {}
  184. json_data = []
  185. video_id_list = []
  186. for j, item in enumerate(predict_data_sorted):
  187. video_id = int(item['video_id'])
  188. rov_score = 100 - j * config_.ROV_SCORE_D
  189. item['rov_score'] = rov_score
  190. predict_result.append(item)
  191. redis_data[video_id] = rov_score
  192. json_data.append({'videoId': video_id, 'rovScore': rov_score})
  193. video_id_list.append(video_id)
  194. # 打包预测结果存入csv
  195. predict_result_filename = 'predict.csv'
  196. pack_list_result_to_csv(filename=predict_result_filename,
  197. data=predict_result,
  198. columns=['video_id', 'rov_score', 'normal_y_', 'y_'],
  199. sort_columns=['rov_score'],
  200. ascending=False)
  201. # 过滤
  202. applet_status_filtered_videos = filter_video_status(video_ids=video_id_list)
  203. log_.info('applet_status_filtered_videos count = {}'.format(len(applet_status_filtered_videos)))
  204. # 屏蔽视频过滤
  205. applet_filtered_videos = filter_shield_video(video_ids=applet_status_filtered_videos,
  206. shield_key_name_list=config_.SHIELD_CONFIG.get('-1'))
  207. log_.info('applet_filtered_videos count = {}'.format(len(applet_filtered_videos)))
  208. # 获取视频对应分数
  209. applet_redis_data = {}
  210. for video_id in applet_filtered_videos:
  211. applet_redis_data[video_id] = redis_data.get(video_id)
  212. # 上传redis
  213. key_name = config_.RECALL_KEY_NAME_PREFIX + time.strftime('%Y%m%d')
  214. redis_helper = RedisHelper()
  215. redis_helper.add_data_with_zset(key_name=key_name, data=applet_redis_data, expire_time=2*24*3600)
  216. log_.info('applet data to redis finished!')
  217. # 清空修改ROV的视频数据
  218. redis_helper.del_keys(key_name=config_.UPDATE_ROV_KEY_NAME)
  219. # 通知后端更新数据
  220. log_.info('json_data count = {}'.format(len(json_data)))
  221. result = request_post(request_url=config_.NOTIFY_BACKEND_UPDATE_ROV_SCORE_URL, request_data={'videos': json_data})
  222. if result['code'] == 0:
  223. log_.info('notify backend success!')
  224. else:
  225. log_.error('notify backend fail!')
  226. # ##### 下线
  227. # # 更新视频的宽高比数据
  228. # if video_id_list:
  229. # update_video_w_h_rate(video_ids=video_id_list,
  230. # key_name=config_.W_H_RATE_UP_1_VIDEO_LIST_KEY_NAME['rov_recall'])
  231. # log_.info('update video w_h_rate to redis finished!')
  232. # ####### app应用数据更新
  233. # 过滤
  234. app_status_filtered_videos = filter_video_status_app(video_ids=video_id_list)
  235. log_.info('app_status_filtered_videos count = {}'.format(len(app_status_filtered_videos)))
  236. # 屏蔽视频过滤
  237. app_filtered_videos = filter_shield_video(video_ids=app_status_filtered_videos,
  238. shield_key_name_list=config_.SHIELD_CONFIG.get('-1'))
  239. log_.info('app_filtered_videos count = {}'.format(len(app_filtered_videos)))
  240. # 获取视频对应分数
  241. app_redis_data = {}
  242. for video_id in app_filtered_videos:
  243. app_redis_data[video_id] = redis_data.get(video_id)
  244. # 上传Redis
  245. redis_helper = RedisHelper()
  246. app_key_name = config_.RECALL_KEY_NAME_PREFIX_APP + time.strftime('%Y%m%d')
  247. redis_helper.add_data_with_zset(key_name=app_key_name, data=app_redis_data, expire_time=2*24*3600)
  248. log_.info('app data to redis finished!')
  249. # 清空修改ROV的视频数据
  250. redis_helper.del_keys(key_name=config_.UPDATE_ROV_KEY_NAME_APP)
  251. def predict_test():
  252. """测试环境数据生成"""
  253. # 获取测试环境中最近发布的40000条视频
  254. sql = "SELECT id FROM wx_video ORDER BY id DESC LIMIT 40000;"
  255. mysql_helper = MysqlHelper(mysql_info=config_.MYSQL_INFO)
  256. data = mysql_helper.get_data(sql=sql)
  257. video_ids = [video[0] for video in data]
  258. # 视频状态过滤
  259. applet_status_filtered_videos = filter_video_status(video_ids=video_ids)
  260. log_.info('applet_status_filtered_videos count = {}'.format(len(applet_status_filtered_videos)))
  261. # 屏蔽视频过滤
  262. applet_filtered_videos = filter_shield_video(video_ids=applet_status_filtered_videos,
  263. shield_key_name_list=config_.SHIELD_CONFIG.get('-1'))
  264. log_.info('applet_filtered_videos count = {}'.format(len(applet_filtered_videos)))
  265. # 随机生成 0-100 数作为分数
  266. redis_data = {}
  267. json_data = []
  268. for video_id in applet_filtered_videos:
  269. score = random.uniform(0, 100)
  270. redis_data[video_id] = score
  271. json_data.append({'videoId': video_id, 'rovScore': score})
  272. log_.info('json_data count = {}'.format(len(json_data)))
  273. # 上传Redis
  274. redis_helper = RedisHelper()
  275. key_name = config_.RECALL_KEY_NAME_PREFIX + time.strftime('%Y%m%d')
  276. redis_helper.add_data_with_zset(key_name=key_name, data=redis_data, expire_time=2*24*3600)
  277. log_.info('test data to redis finished!')
  278. # 清空修改ROV的视频数据
  279. redis_helper.del_keys(key_name=config_.UPDATE_ROV_KEY_NAME)
  280. # 通知后端更新数据
  281. result = request_post(request_url=config_.NOTIFY_BACKEND_UPDATE_ROV_SCORE_URL, request_data={'videos': json_data})
  282. if result['code'] == 0:
  283. log_.info('notify backend success!')
  284. else:
  285. log_.error('notify backend fail!')
  286. # ##### 下线
  287. # # 更新视频的宽高比数据
  288. # if filtered_videos:
  289. # update_video_w_h_rate(video_ids=filtered_videos,
  290. # key_name=config_.W_H_RATE_UP_1_VIDEO_LIST_KEY_NAME['rov_recall'])
  291. # log_.info('update video w_h_rate to redis finished!')
  292. # ####### app应用数据更新
  293. # 过滤
  294. app_status_filtered_videos = filter_video_status_app(video_ids=applet_filtered_videos)
  295. log_.info('app_status_filtered_videos count = {}'.format(len(app_status_filtered_videos)))
  296. # 屏蔽视频过滤
  297. app_filtered_videos = filter_shield_video(video_ids=app_status_filtered_videos,
  298. shield_key_name_list=config_.SHIELD_CONFIG.get('-1'))
  299. log_.info('app_filtered_videos count = {}'.format(len(app_filtered_videos)))
  300. # 获取视频对应分数
  301. app_redis_data = {}
  302. for video_id in app_filtered_videos:
  303. app_redis_data[video_id] = redis_data.get(video_id)
  304. # 上传Redis
  305. redis_helper = RedisHelper()
  306. app_key_name = config_.RECALL_KEY_NAME_PREFIX_APP + time.strftime('%Y%m%d')
  307. redis_helper.add_data_with_zset(key_name=app_key_name, data=app_redis_data, expire_time=2*24*3600)
  308. log_.info('app test data to redis finished!')
  309. # 清空修改ROV的视频数据
  310. redis_helper.del_keys(key_name=config_.UPDATE_ROV_KEY_NAME_APP)
  311. # ####### appType: [18, 19] 应用数据更新
  312. # for app_type in [config_.APP_TYPE['LAO_HAO_KAN_VIDEO'], config_.APP_TYPE['ZUI_JING_QI']]:
  313. # log_.info(f"app_type = {app_type}")
  314. # videos_temp = random.sample(filtered_videos, 300)
  315. # redis_data_temp = {}
  316. # csv_data_temp = []
  317. # for video_id in videos_temp:
  318. # score = random.uniform(0, 100)
  319. # redis_data_temp[video_id] = score
  320. # csv_data_temp.append({'video_id': video_id, 'rov_score': score})
  321. # # 打包预测结果存入csv
  322. # predict_result_filename = f'predict_{app_type}.csv'
  323. # pack_list_result_to_csv(filename=predict_result_filename,
  324. # data=csv_data_temp,
  325. # columns=['video_id', 'rov_score'],
  326. # sort_columns=['rov_score'],
  327. # ascending=False)
  328. #
  329. # # 上传redis
  330. # key_name = f"{config_.RECALL_KEY_NAME_PREFIX_APP_TYPE}{app_type}.{time.strftime('%Y%m%d')}"
  331. # redis_helper = RedisHelper()
  332. # redis_helper.add_data_with_zset(key_name=key_name, data=redis_data_temp)
  333. # log_.info('data to redis finished!')
  334. def predict_18_19():
  335. """预测 app_type:[18, 19]"""
  336. for app_type in [config_.APP_TYPE['LAO_HAO_KAN_VIDEO'], config_.APP_TYPE['ZUI_JING_QI']]:
  337. log_.info(f"app_type = {app_type}")
  338. # 读取预测数据并进行清洗
  339. predict_data_filename = config_.PREDICT_DATA_FILENAME_18_19[str(app_type)]
  340. x, video_ids = process_predict_data(predict_data_filename)
  341. log_.info('predict data shape: x = {}'.format(x.shape))
  342. # 获取训练好的模型
  343. model = read_from_pickle(filename=config_.MODEL_FILENAME)
  344. # 预测
  345. y_ = model.predict(x)
  346. log_.info('predict finished!')
  347. # 将结果进行归一化到[0, 100]
  348. normal_y_ = data_normalization(list(y_))
  349. log_.info('normalization finished!')
  350. # 按照normal_y_降序排序
  351. predict_data = []
  352. for i, video_id in enumerate(video_ids):
  353. data = {'video_id': video_id, 'normal_y_': normal_y_[i], 'y_': y_[i]}
  354. predict_data.append(data)
  355. predict_data_sorted = sorted(predict_data, key=lambda temp: temp['normal_y_'], reverse=True)
  356. # 按照排序,从100以固定差值做等差递减,以该值作为rovScore
  357. predict_result = []
  358. redis_data = {}
  359. json_data = []
  360. video_id_list = []
  361. for j, item in enumerate(predict_data_sorted):
  362. video_id = int(item['video_id'])
  363. rov_score = 100 - j * config_.ROV_SCORE_D
  364. item['rov_score'] = rov_score
  365. predict_result.append(item)
  366. redis_data[video_id] = rov_score
  367. json_data.append({'videoId': video_id, 'rovScore': rov_score})
  368. video_id_list.append(video_id)
  369. # 打包预测结果存入csv
  370. predict_result_filename = f'predict_{app_type}.csv'
  371. pack_list_result_to_csv(filename=predict_result_filename,
  372. data=predict_result,
  373. columns=['video_id', 'rov_score', 'normal_y_', 'y_'],
  374. sort_columns=['rov_score'],
  375. ascending=False)
  376. # 上传redis
  377. key_name = f"{config_.RECALL_KEY_NAME_PREFIX_APP_TYPE}{app_type}.{time.strftime('%Y%m%d')}"
  378. redis_helper = RedisHelper()
  379. redis_helper.add_data_with_zset(key_name=key_name, data=redis_data)
  380. log_.info('data to redis finished!')
  381. if __name__ == '__main__':
  382. log_.info('rov model train start...')
  383. train_start = time.time()
  384. train_filename = config_.TRAIN_DATA_FILENAME
  385. X, Y, videos, fea = process_data(filename=train_filename)
  386. log_.info('X_shape = {}, Y_sahpe = {}'.format(X.shape, Y.shape))
  387. train(X, Y, features=fea)
  388. train_end = time.time()
  389. log_.info('rov model train end, execute time = {}ms'.format((train_end - train_start)*1000))
  390. log_.info('rov model predict start...')
  391. predict_start = time.time()
  392. if env in ['dev', 'test']:
  393. predict_test()
  394. elif env in ['pre', 'pro']:
  395. predict()
  396. # predict_18_19()
  397. else:
  398. log_.error('env error')
  399. predict_end = time.time()
  400. log_.info('rov model predict end, execute time = {}ms'.format((predict_end - predict_start)*1000))