# rov_train.py
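"""Train and evaluate a LightGBM regression model for video ROV scoring.

The pipeline below pulls daily feature partitions from ODPS, builds a sparse
design matrix (value/category features, videoid one-hot, tag and words
tf-idf blocks), runs 4-fold stratified cross-validation, and prints
RMSE/MSE/MAPE/R2 for the out-of-fold and predict sets.
"""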
import warnings
warnings.filterwarnings("ignore")
import os
import gc
import math
import time
import pickle
import datetime
from datetime import datetime as dt

import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pylab as plt
from scipy import sparse
from scipy.sparse import hstack
from sklearn import metrics
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from odps import ODPS
from odps.df import DataFrame as odpsdf

import process_feature
import process_tag
def getRovfeaturetable(dt, table):
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray
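# Usage sketch (table name taken from process_train_predict_data below):
#   one_day = getRovfeaturetable('20200305', 'rov_feature_add_v1')
# returns that day's partition as a pandas DataFrame.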
def getdatasample(date, max_range, table):
    # Pull max_range consecutive daily partitions, walking backwards from date.
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    print(datelist)
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    testdata = pd.concat(testlist)
    testdata.reset_index(inplace=True)
    testdata = testdata.drop(axis=1, columns='index')
    return testdata
def select_recent_video(df):
    """Rank each video's rows by date, most recent first, and keep only the latest day."""
    df['dt'] = df['dt'].astype(int)
    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
    df = df[df['rk'] == 1]
    return df
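# Example (hypothetical rows): a video seen on dt 20200301 and 20200303 gets
# rk 1 for 20200303 and rk 2 for 20200301, so only the 20200303 row is kept.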
def basic_cal(df):
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    df['weighted_retrn_log'] = np.log1p(df['weighted_retrn'])
    # Label 1 when the return count is above the threshold (here, positive);
    # rows with no share, or a share that brought zero returns, get label 0.
    df['return_back'] = (df['weighted_retrn'] > 0).astype(int)
    return df
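# Worked example of the label transform: weighted_retrn = 0 gives log label
# 0.0 and return_back 0; weighted_retrn = 99 gives ln(100) ≈ 4.605 and
# return_back 1.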
def today_view_category(predict_data):
    # Bucket today's exposure by rank (seven bands) and add the band mean of
    # day1viewcount as a category feature, 'todyviewcount'.
    rank_bands = [(10000, None), (3000, 10000), (1000, 3000), (300, 1000),
                  (100, 300), (30, 100), (0, 30)]
    rank = predict_data['day1viewcount_rank']
    for low, high in rank_bands:
        mask = (rank > low) if high is None else ((rank > low) & (rank <= high))
        predict_data.loc[mask, 'todyviewcount'] = predict_data.loc[mask, 'day1viewcount'].mean()
    return predict_data
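# For instance, every row whose day1viewcount_rank falls in (0, 30] receives
# the same 'todyviewcount': the mean day1viewcount of that top band.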
def dataprepare(df_pre):
    # Feed the features in directly, without crossed features.
    # Missing values are zero-filled.
    df_pre = df_pre.fillna(0)
    df_new_feature = df_pre[process_feature.features]
    df_target = df_pre['weighted_retrn_log']
    df_new_feature = pd.concat([df_new_feature, df_pre[process_feature.cate_feat],
                                df_pre[process_feature.one_hot_feature]], axis=1)
    return df_new_feature, df_target
def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
                      values_lenth, video_id_lenth, tag_length, word_length):
    # Collapse the per-column importances of the one-hot blocks (videoid, tags,
    # words) into one row per block, for each fold.
    Feature_Data = pd.DataFrame()
    for fold, df in enumerate((fold1_df, fold2_df, fold3_df, fold4_df), start=1):
        values_df = df.iloc[0:values_lenth, :]
        videoid_end = values_lenth + video_id_lenth
        tag_end = videoid_end + tag_length
        word_end = tag_end + word_length
        videoid_importance = df.iloc[values_lenth:videoid_end, :]['importance'].sum()
        videoid_df = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_importance, 'fold': fold}])
        tag_importance = df.iloc[videoid_end:tag_end, :]['importance'].sum()
        tag_df = pd.DataFrame([{'Feature': 'tags', 'importance': tag_importance, 'fold': fold}])
        words_importance = df.iloc[tag_end:word_end, :]['importance'].sum()
        words_df = pd.DataFrame([{'Feature': 'words', 'importance': words_importance, 'fold': fold}])
        Feature_Data = pd.concat([Feature_Data, values_df, videoid_df, tag_df, words_df])
    return Feature_Data
def MAPE(true, pred):
    # Mean absolute percentage error, computed over non-zero targets only.
    true = np.array(true)
    sum_ = 0
    count = 0
    for i in range(len(true)):
        if true[i] != 0:
            sum_ += np.abs(true[i] - pred[i]) / true[i]
            count += 1
    return sum_ / count
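# Quick sanity check with made-up numbers (zero targets are skipped):
#   MAPE([2.0, 0.0, 4.0], [1.0, 5.0, 5.0]) == (0.5 + 0.25) / 2 == 0.375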
def process_train_predict_data():
    now_date = datetime.date.today()
    # day = datetime.datetime.strftime(now_date, '%Y%m%d')
    diff_1 = datetime.timedelta(days=1)
    diff_7 = datetime.timedelta(days=7)
    predict_dt = now_date - diff_1
    predict_day = predict_dt.strftime('%Y%m%d')
    train_dt = now_date - diff_7
    train_day = train_dt.strftime('%Y%m%d')
    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
    # TODO: checkpoint the raw pulls locally so reruns can skip the ODPS fetch.
    import _pickle as cPickle
    with open('train_data.pickle', 'wb') as output_file:
        cPickle.dump(train_data, output_file)
    with open('predict_data.pickle', 'wb') as output_file:
        cPickle.dump(predict_data, output_file)
    with open(r"train_data.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open(r"predict_data.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)
    train_data = basic_cal(train_data)
    predict_data = basic_cal(predict_data)
    predict_data = select_recent_video(predict_data)
    predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
    predict_data = predict_data.drop(axis=1, columns='rk')
    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    train_data = train_data.fillna(0)
    predict_data = predict_data.fillna(0)
    train_data = process_feature.cal_feature(train_data)
    predict_data = process_feature.cal_feature(predict_data)
    predict_data = today_view_category(predict_data)
    predict_data['videoid'] = predict_data['videoid'].astype('int')
    df_new_feature, df_target = dataprepare(train_data)
    df_new_feature_predict, df_target_predict = dataprepare(predict_data)
    # Dense value/category block, sliced by column range.
    df_new_feature_part_one = sparse.csr_matrix(
        np.array(pd.DataFrame(df_new_feature).loc[:, 'day1playcount':'videocategory555']))
    df_new_feature_predict_part_one = sparse.csr_matrix(
        np.array(pd.DataFrame(df_new_feature_predict).loc[:, 'day1playcount':'videocategory555']))
    print('value features generated successfully')
    # One-hot encode videoid over the union of train and predict ids.
    train_videoid = pd.DataFrame(df_new_feature).loc[:, 'videoid']
    predict_videoid = pd.DataFrame(df_new_feature_predict).loc[:, 'videoid']
    train_videoid_list = train_videoid.to_numpy().reshape(-1, 1).tolist()
    predict_videoid_list = predict_videoid.to_numpy().reshape(-1, 1).tolist()
    allvideo_raw = list(set(np.array(pd.concat([train_videoid, predict_videoid])).tolist()))
    allvideo = np.array(allvideo_raw).reshape(-1, 1).tolist()
    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
    train_videoid = mlb_model_videoid.transform(train_videoid_list)
    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
    print('videoid features generated successfully')
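    # Sketch of the encoding above with made-up ids: fitting
    # MultiLabelBinarizer(sparse_output=True) on [[3], [7], [9]] and then
    # transforming [[7]] yields the sparse row (0, 1, 0).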
    # tag one-hot
    tags, train_tag, predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
    # tag tf-idf weights
    tag_dict = process_tag.get_tag_tfidf('20200305', 'video_tag_tf_idf')
    print('length tag_dict:', len(tag_dict))
    # sparse tf-idf matrix for tags
    tag_corpus = tags.tolist()  # corpus
    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus, tag_dict)
    tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
    tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)
    print('tag tfidf features generated successfully')
    print('tag dimension:', len(tag_tfidf_list))
    # words (excluding tags) one-hot; pass the frames, mirroring the 'tag' call above
    words, train_words, test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
    # words tf-idf weights
    words_dict = process_tag.get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
    print('length words_dict:', len(words_dict))
    # sparse tf-idf matrix for words
    words_corpus = words.tolist()  # corpus
    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus, words_dict)
    words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
    words_feature_train = train_words.multiply(words_tf_idf_matrix)
    words_feature_test = test_words.multiply(words_tf_idf_matrix)
    print('words tfidf features generated successfully')
    print('words dimension:', len(words_tfidf_list))
    df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
    df_new_feature_predict = hstack([df_new_feature_predict_part_one, predict_videoid, tag_feature_test, words_feature_test])
    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
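# Column layout of the stacked matrices returned above, left to right:
#   [ value/category block | videoid one-hot | tag tf-idf | words tf-idf ]
# featureImportance() slices the importance vector in exactly this order
# (values_lenth, video_id_lenth, tag_length, word_length), so the hstack
# order must not change.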
def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
    # Dense target arrays
    df_target_predict = sparse.csr_matrix(pd.DataFrame(df_target_predict).values).toarray()
    df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()
    param = {'num_leaves': 18,
             'min_data_in_leaf': 60,
             'objective': 'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 30,
             "boosting": "gbdt",
             "feature_fraction": 0.8,
             "bagging_freq": 1,
             "bagging_fraction": 0.8,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1,
             "nthread": 4,
             # 'max_bin': 512,
             "random_state": 4590}
    # Folds are stratified on the binary return_back label even though the
    # model regresses on the log return target.
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
    oof = np.zeros(df_new_feature.shape[0])
    predictions = np.zeros(len(df_target_predict))
    feature_importance_df = pd.DataFrame()
    # values_lenth = len(process_feature.features + process_feature.cate_feat)
    # video_id_lenth = len(mlb_model_videoid.classes_)
    # tag_length = len(tag_tfidf_list)
    # word_length = len(words_tfidf_list)
    change_view = pd.DataFrame(df_new_feature_predict.toarray())
    change_view = change_view.sort_index()
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
        print("folds {}".format(fold_))
        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])
        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
                        early_stopping_rounds=200)
        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
        predictions += clf.predict(df_new_feature_predict.tocsr(), num_iteration=clf.best_iteration) / folds.n_splits
        fold_importance_df = pd.DataFrame()
        # column = process_feature.features + process_feature.cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
        # fold_importance_df["Feature"] = np.array(column)
        # fold_importance_df["importance"] = clf.feature_importance()
        # fold_importance_df["fold"] = fold_ + 1
        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
    # fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
    # fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
    # fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
    # feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length)
    print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
    print('oof_mse:', mean_squared_error(df_target, oof))
    print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
    print('test_mse:', mean_squared_error(df_target_predict, predictions))
    print('oof_mape:', MAPE(df_target, oof))
    print('test_mape:', MAPE(df_target_predict, predictions))
    print('verification r2:', r2_score(df_target, oof))
    print('test r2:', r2_score(df_target_predict, predictions))
    # The scored table is only printed here; persisting it is left to the caller.
    sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
    sub_df_['score'] = predictions
    print('regre ranking shape', sub_df_.shape)
if __name__ == '__main__':
    (train_data, predict_data, df_target, df_target_predict,
     df_new_feature, df_new_feature_predict) = process_train_predict_data()
    do_train(train_data, predict_data, df_target, df_target_predict,
             df_new_feature, df_new_feature_predict)