# rov_train.py
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import r2_score
import os
import pandas as pd
import gc
import math
import numpy as np
import time
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import SGDClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import metrics
import pickle
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pylab as plt
from odps import ODPS
from odps.df import DataFrame as odpsdf
from datetime import datetime as dt
import datetime
from scipy import sparse
from scipy.sparse import hstack
import process_feature
import process_tag
def getRovfeaturetable(dt, table):
    """Read one day's partition (dt=YYYYMMDD) of an ODPS table into a DataFrame.

    Note: the `dt` parameter shadows the module-level `datetime` alias inside
    this function; here it is just the partition date string.
    """
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finished')
    return featureArray
def getdatasample(date, max_range, table):
    """Fetch `max_range` daily partitions, counting back from `date`, and concat them."""
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    testdata = pd.concat(testlist)
    testdata.reset_index(inplace=True)
    testdata = testdata.drop(axis=1, columns='index')
    return testdata
def select_recent_video(df):
    """Rank each video's rows by dt (descending) and keep only the most recent day."""
    df['dt'] = df['dt'].astype(int)
    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
    df = df[df['rk'] == 1]
    return df
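# For reference, a minimal equivalent of select_recent_video using a sort plus
# drop_duplicates instead of groupby-rank. Not called anywhere; ties on dt may
# be broken in a different order than method='first'.
def select_recent_video_alt(df):
    df = df.copy()
    df['dt'] = df['dt'].astype(int)
    # First row per videoid after a descending dt sort == most recent day.
    return df.sort_values('dt', ascending=False).drop_duplicates(subset='videoid', keep='first')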
def basic_cal(df):
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    df['weighted_retrn_log'] = df.apply(lambda x: np.log(x['weighted_retrn'] + 1), axis=1)
    ## Label 1 when the return count is above the threshold (> 0 here); videos
    ## with no shares, or with shares but zero returns, are labeled 0.
    df['return_back'] = df.apply(lambda x: 1 if x['weighted_retrn'] > 0 else 0, axis=1)
    return df
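# A vectorized sketch of basic_cal, for reference only: np.log1p and a boolean
# cast replace the two row-wise apply calls and should produce identical columns.
def basic_cal_vectorized(df):
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    df['weighted_retrn_log'] = np.log1p(df['weighted_retrn'])  # log(x + 1), elementwise
    df['return_back'] = (df['weighted_retrn'] > 0).astype(int)
    return df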
def today_view_category(predict_data):
    ### Bucket videos into seven bands by day-1 view-count rank, then set
    ### `todyviewcount` to the mean day-1 view count of each band.
    data_test1_view1 = predict_data.loc[predict_data['day1viewcount_rank'] > 10000]['day1viewcount'].mean()
    data_test1_view2 = predict_data.loc[(predict_data['day1viewcount_rank'] > 3000) & (predict_data['day1viewcount_rank'] <= 10000)]['day1viewcount'].mean()
    data_test1_view3 = predict_data.loc[(predict_data['day1viewcount_rank'] > 1000) & (predict_data['day1viewcount_rank'] <= 3000)]['day1viewcount'].mean()
    data_test1_view4 = predict_data.loc[(predict_data['day1viewcount_rank'] > 300) & (predict_data['day1viewcount_rank'] <= 1000)]['day1viewcount'].mean()
    data_test1_view5 = predict_data.loc[(predict_data['day1viewcount_rank'] > 100) & (predict_data['day1viewcount_rank'] <= 300)]['day1viewcount'].mean()
    data_test1_view6 = predict_data.loc[(predict_data['day1viewcount_rank'] > 30) & (predict_data['day1viewcount_rank'] <= 100)]['day1viewcount'].mean()
    data_test1_view7 = predict_data.loc[(predict_data['day1viewcount_rank'] > 0) & (predict_data['day1viewcount_rank'] <= 30)]['day1viewcount'].mean()
    predict_data.loc[predict_data['day1viewcount_rank'] > 10000, 'todyviewcount'] = data_test1_view1
    predict_data.loc[(predict_data['day1viewcount_rank'] > 3000) & (predict_data['day1viewcount_rank'] <= 10000), 'todyviewcount'] = data_test1_view2
    predict_data.loc[(predict_data['day1viewcount_rank'] > 1000) & (predict_data['day1viewcount_rank'] <= 3000), 'todyviewcount'] = data_test1_view3
    predict_data.loc[(predict_data['day1viewcount_rank'] > 300) & (predict_data['day1viewcount_rank'] <= 1000), 'todyviewcount'] = data_test1_view4
    predict_data.loc[(predict_data['day1viewcount_rank'] > 100) & (predict_data['day1viewcount_rank'] <= 300), 'todyviewcount'] = data_test1_view5
    predict_data.loc[(predict_data['day1viewcount_rank'] > 30) & (predict_data['day1viewcount_rank'] <= 100), 'todyviewcount'] = data_test1_view6
    predict_data.loc[(predict_data['day1viewcount_rank'] > 0) & (predict_data['day1viewcount_rank'] <= 30), 'todyviewcount'] = data_test1_view7
    return predict_data
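# A more compact sketch of the same banding using pd.cut plus a groupby
# transform; it should match the explicit .loc chains above, since pd.cut's
# right-closed bins reproduce the (lo, hi] rank bands and ranks <= 0 fall
# outside every band here too. For reference only; not called anywhere.
def today_view_category_cut(predict_data):
    bands = pd.cut(predict_data['day1viewcount_rank'],
                   bins=[0, 30, 100, 300, 1000, 3000, 10000, np.inf])
    # Mean day1viewcount within each rank band, broadcast back to each row.
    predict_data['todyviewcount'] = (predict_data.groupby(bands)['day1viewcount']
                                     .transform('mean'))
    return predict_data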
def dataprepare(df_pre):
    # Feed the features in directly; no crossed features.
    # Zero-fill missing values.
    df_pre = df_pre.fillna(0)
    #df_new_feature = df_pre[process_feature.features]
    df_new_feature = df_pre[process_feature.filter_recent_features()]
    print(df_new_feature.shape)
    df_target = df_pre['weighted_retrn_log']
    df_new_feature = pd.concat([df_new_feature, df_pre[process_feature.cate_feat], df_pre[process_feature.one_hot_feature]], axis=1)
    return df_new_feature, df_target
def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length):
    """Collapse the one-hot videoid/tag/word blocks of each fold's importance table into single summed rows."""
    Feature_Data = pd.DataFrame()
    for df in (fold1_df, fold2_df, fold3_df, fold4_df):
        fold1_df1 = df.iloc[0:values_lenth, :]
        videoid_fold1_importance = df.iloc[values_lenth:values_lenth + video_id_lenth, :]['importance'].sum()
        fold1_df2 = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_fold1_importance, 'fold': 1}])
        tag_fold1_importance = df.iloc[values_lenth + video_id_lenth:values_lenth + video_id_lenth + tag_length, :]['importance'].sum()
        fold1_df3 = pd.DataFrame([{'Feature': 'tags', 'importance': tag_fold1_importance, 'fold': 1}])
        words_fold1_importance = df.iloc[values_lenth + video_id_lenth + tag_length:values_lenth + video_id_lenth + tag_length + word_length, :]['importance'].sum()
        fold1_df4 = pd.DataFrame([{'Feature': 'words', 'importance': words_fold1_importance, 'fold': 1}])
        Feature_Data = pd.concat([Feature_Data, fold1_df1, fold1_df2, fold1_df3, fold1_df4])
    return Feature_Data
def MAPE(true, pred):
    """Mean absolute percentage error over the nonzero entries of `true`."""
    true = np.array(true)
    sum_ = 0
    count = 0
    for i in range(len(true)):
        if true[i] != 0:
            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
            count = count + 1
        else:
            continue
    return sum_ / count
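# A vectorized equivalent of MAPE above, for reference; it masks out zero
# targets before averaging instead of looping, and ravels inputs so 2-D
# (n, 1) targets and 1-D predictions compare elementwise.
def mape_vectorized(true, pred):
    true = np.asarray(true, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    mask = true != 0
    return np.mean(np.abs(true[mask] - pred[mask]) / true[mask])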
def process_train_predict_data():
    now_date = datetime.date.today()
    # day = datetime.datetime.strftime(now_date, '%Y%m%d')
    diff_1 = datetime.timedelta(days=1)
    diff_7 = datetime.timedelta(days=7)
    predict_dt = now_date - diff_1
    predict_day = datetime.datetime.strftime(predict_dt, '%Y%m%d')
    train_dt = now_date - diff_7
    train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
    # read data from Aliyun (MaxCompute)
    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
    # pickle for test
    import _pickle as cPickle
    with open('train_data.pickle', 'wb') as output_file:
        cPickle.dump(train_data, output_file)
    with open('predict_data.pickle', 'wb') as output_file:
        cPickle.dump(predict_data, output_file)
    '''
    with open(r"train_data.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open(r"predict_data.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)
    '''
    # end pickle
    train_data = basic_cal(train_data)
    predict_data = basic_cal(predict_data)
    predict_data = select_recent_video(predict_data)
    predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
    predict_data = predict_data.drop(axis=1, columns='rk')
    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    train_data = train_data.fillna(0)
    predict_data = predict_data.fillna(0)
    train_data = process_feature.cal_feature(train_data)
    predict_data = process_feature.cal_feature(predict_data)
    predict_data = today_view_category(predict_data)
    predict_data['videoid'] = predict_data['videoid'].astype('int')
    df_new_feature, df_target = dataprepare(train_data)
    df_new_feature_predict, df_target_predict = dataprepare(predict_data)
    print(df_target_predict)
    df_new_feature_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature).loc[:, 'day1playcount':'videocategory555']))
    df_new_feature_predict_part_one = sparse.csr_matrix(np.array(pd.DataFrame(df_new_feature_predict).loc[:, 'day1playcount':'videocategory555']))
    print('value features generated successfully')
    train_videoid = pd.DataFrame(df_new_feature).loc[:, 'videoid']
    predict_videoid = pd.DataFrame(df_new_feature_predict).loc[:, 'videoid']
    train_videoid_list = pd.DataFrame(df_new_feature).loc[:, 'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature).loc[:, 'videoid']), 1).tolist()
    predict_videoid_list = pd.DataFrame(df_new_feature_predict).loc[:, 'videoid'].to_numpy().reshape(len(pd.DataFrame(df_new_feature_predict).loc[:, 'videoid']), 1).tolist()
    allvideo_raw = list(set(np.array(pd.concat([train_videoid, predict_videoid])).tolist()))
    allvideo = np.array(allvideo_raw).reshape(len(allvideo_raw), 1).tolist()
    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
    train_videoid = mlb_model_videoid.transform(train_videoid_list)
    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
    print('videoid features generated successfully')
    # Build the tag one-hot matrices
    tags, train_tag, predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
    # Fetch the tag TF-IDF weights
    tag_dict = process_tag.get_tag_tfidf('20200305', 'video_tag_tf_idf')
    print('length tag_dict:', len(tag_dict))
    # Build the tag TF-IDF sparse matrix
    tag_corpus = tags.tolist()  # corpus
    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus, tag_dict)
    tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
    tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)
    print('tag tfidf features generated successfully')
    print('tag dimension:', len(tag_tfidf_list))
    # Build the word (non-tag) one-hot matrices
    words, train_words, test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
    # Fetch the word TF-IDF weights
    words_dict = process_tag.get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
    print('length words_dict:', len(words_dict))
    # Build the word TF-IDF sparse matrix
    words_corpus = words.tolist()  # corpus
    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus, words_dict)
    words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
    words_feature_train = train_words.multiply(words_tf_idf_matrix)
    words_feature_test = test_words.multiply(words_tf_idf_matrix)
    print('words tfidf features generated successfully')
    print('words dimension:', len(words_tfidf_list))
    df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
    df_new_feature_predict = hstack([df_new_feature_predict_part_one, predict_videoid, tag_feature_test, words_feature_test])
    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
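# A minimal toy sketch of the final feature assembly above: hstack glues the
# dense value block, the videoid one-hot block, and the TF-IDF-weighted
# tag/word blocks together column-wise into one sparse design matrix. For
# reference only; not called anywhere.
def _demo_feature_hstack():
    value_block = sparse.csr_matrix(np.array([[0.5, 1.0], [0.2, 3.0]]))  # 2 rows x 2 value features
    videoid_block = sparse.csr_matrix(np.array([[1, 0], [0, 1]]))        # 2 rows x 2 one-hot ids
    combined = hstack([value_block, videoid_block])
    print(combined.shape)  # (2, 4): rows preserved, feature blocks concatenated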
def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
    # target
    df_target_predict = sparse.csr_matrix(pd.DataFrame(df_target_predict).values).toarray()
    df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()
    param = {'num_leaves': 18,
             'min_data_in_leaf': 60,
             'objective': 'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 30,
             "boosting": "gbdt",
             "feature_fraction": 0.8,
             "bagging_freq": 1,
             "bagging_fraction": 0.8,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1,
             "nthread": 4,
             # 'max_bin': 512,
             "random_state": 4590}
    # Stratify the regression folds on the binary return_back label so every
    # fold keeps the same share of videos with at least one return.
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
    #oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
    oof = np.zeros(len(df_target))
    predictions = np.zeros(len(df_target_predict))
    feature_importance_df = pd.DataFrame()
    # values_lenth = len(process_feature.features + process_feature.cate_feat)
    # video_id_lenth = len(mlb_model_videoid.classes_)
    # tag_length = len(tag_tfidf_list)
    # word_length = len(words_tfidf_list)
    change_view = pd.DataFrame(pd.DataFrame(df_new_feature_predict.toarray()))
    change_view = change_view.sort_index()
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
        print("folds {}".format(fold_))
        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])
        num_round = 10000
        # verbose_eval / early_stopping_rounds is the pre-4.0 LightGBM train API;
        # newer versions expect the equivalent callbacks instead.
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
                        early_stopping_rounds=200)
        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
        predictions += clf.predict(df_new_feature_predict, num_iteration=clf.best_iteration) / folds.n_splits
        fold_importance_df = pd.DataFrame()
        # column = process_feature.features + process_feature.cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
        # fold_importance_df["Feature"] = np.array(column)
        # fold_importance_df["importance"] = clf.feature_importance()
        # fold_importance_df["fold"] = fold_ + 1
        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
    # fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
    # fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
    # fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
    # feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length)
    print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
    print('oof_mse:', mean_squared_error(df_target, oof))
    print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
    print('test_mse:', mean_squared_error(df_target_predict, predictions))
    print('oof_mape:', MAPE(df_target, oof))
    print('test_mape:', MAPE(df_target_predict, predictions))
    print('verification r2:', r2_score(df_target, oof))
    print('test r2:', r2_score(df_target_predict, predictions))
    sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
    sub_df_['score'] = predictions
    print('regression ranking shape', sub_df_.shape)
    sub_df_.to_csv('result.csv')
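# Note: the model is trained on log(weighted_retrn + 1), so the `score` column
# written above is in log space. If a raw 7-day-return estimate were needed,
# np.expm1 would invert the transform; a hedged sketch, not used by the pipeline:
def _scores_to_returns(scores):
    return np.expm1(np.asarray(scores))  # inverse of log(x + 1)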
if __name__ == '__main__':
    train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
    do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict)