# rov_train.py
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import r2_score
import os
import pandas as pd
import gc
import math
import numpy as np
import time
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import SGDClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
import pickle
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pylab as plt
from odps import ODPS
from odps.df import DataFrame as odpsdf
from datetime import datetime as dt
import datetime

# Date bookkeeping: the test partition is yesterday; the training window is
# anchored 7 days back (leaving time for the 7-day return label to mature).
now_date = datetime.date.today()
# day = datetime.datetime.strftime(now_date, '%Y%m%d')
diff_1 = datetime.timedelta(days=1)
diff_7 = datetime.timedelta(days=7)
input_dt = now_date - diff_1
input_day = datetime.datetime.strftime(input_dt, '%Y%m%d')
now_day = datetime.datetime.strftime(now_date, '%Y%m%d')
train_dt = now_date - diff_7
train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
def getRovfeaturetable(dt, table):
    """Read one dt partition of an ODPS table into a DataFrame.

    `featurename` (the list of columns to pull) is assumed to be defined
    at module level elsewhere in this project.
    """
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray
def getdatasample(date, max_range, table):
    """Concatenate the last `max_range` daily partitions of `table`,
    counting back from `date` (YYYYMMDD)."""
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    print(datelist)
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    testdata = pd.concat(testlist)
    testdata = testdata.reset_index(drop=True)
    return testdata
traindata = getdatasample(train_day, 30, 'rov_feature_add_v1')
data_test_ori_rk = getdatasample(input_day, 1, 'rov_predict_table_add_v1')
def select_recent_video(df):
    """Rank each video's rows by date and keep only the most recent day."""
    df['dt'] = df['dt'].astype(int)
    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
    df = df[df['rk'] == 1]
    return df
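# Illustration (hypothetical data): given rows for videoid=1 with
# dt = 20200301 / 20200302 / 20200303, rank(ascending=0) assigns rk=1 to
# dt=20200303, so only the newest snapshot of each video survives the filter.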
data_test_ori = select_recent_video(data_test_ori_rk)
# Only the input_day partition carries a real observation; zero out the rest.
# ('futre7dayreturn' is the column name as spelled in the source table.)
data_test_ori.loc[data_test_ori['dt'] != int(input_day), 'futre7dayreturn'] = 0
data_test_ori = data_test_ori.drop(axis=1, columns='rk')
traindata.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
data_test_ori.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
def basic_cal(df):
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    df['weighted_retrn_log'] = df.apply(lambda x: np.log(x['weighted_retrn'] + 1), axis=1)
    # Label is 1 when the return count is positive; videos that were never
    # shared, or were shared but brought zero returns, are labeled 0.
    df['return_back'] = df.apply(lambda x: 1 if x['weighted_retrn'] > 0 else 0, axis=1)
    return df
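# The two apply() calls above run row by row; an equivalent vectorized form
# (a behavior-preserving sketch, not the original author's code) would be:
#   df['weighted_retrn_log'] = np.log1p(df['weighted_retrn'])
#   df['return_back'] = (df['weighted_retrn'] > 0).astype(int)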
data_train = basic_cal(traindata)
data_test = basic_cal(data_test_ori)
def today_view_category(df):
    # Bucket day-1 view counts by exposure rank and use each bucket's mean
    # view count as the categorical 'todyviewcount' feature (column name as
    # spelled in the downstream tables). The original spelled out the seven
    # buckets by hand and read the global `data_test` instead of `df`.
    rank_bins = [(10000, float('inf')), (3000, 10000), (1000, 3000), (300, 1000),
                 (100, 300), (30, 100), (0, 30)]
    for low, high in rank_bins:
        mask = (df['day1viewcount_rank'] > low) & (df['day1viewcount_rank'] <= high)
        df.loc[mask, 'todyviewcount'] = df.loc[mask, 'day1viewcount'].mean()
    return df
data_test = today_view_category(data_test)
def dataprepare(df_pre):
    # Feed the features in directly, without crossed features.
    # `features`, `cate_feat` and `one_hot_feature` are column-name lists
    # assumed to be defined at module level elsewhere in this project.
    df_pre = df_pre.fillna(0)  # zero-fill missing values
    df_new_feature = df_pre[features]
    df_target = df_pre['weighted_retrn_log']
    df_new_feature = pd.concat([df_new_feature, df_pre[cate_feat], df_pre[one_hot_feature]], axis=1)
    return df_new_feature, df_target
data_test['videoid'] = data_test['videoid'].astype('int')
data_train = data_train[data_train['weighted_retrn'] > 0]
print(data_train.shape, 'train shape')
# `recall_video_stage_one` (the stage-one candidate set) is assumed to be
# produced earlier in the pipeline.
data_test = pd.merge(data_test, recall_video_stage_one, on=['videoid'], how='inner')
print('score>0.5 video_count:', data_test.shape)
df_new_feature, df_target = dataprepare(data_train)
df_new_feature_test, df_target_test = dataprepare(data_test)
# Numeric value features: slice the contiguous value columns into sparse matrices.
from scipy import sparse
df_new_feature_part_one = sparse.csr_matrix(np.array(df_new_feature.loc[:, 'day1playcount':'videocategory555']))
df_new_feature_test_part_one = sparse.csr_matrix(np.array(df_new_feature_test.loc[:, 'day1playcount':'videocategory555']))
print('value feature generate successfully')

# One-hot encode videoid over the union of train and test ids.
train_videoid = df_new_feature.loc[:, 'videoid']
test_videoid = df_new_feature_test.loc[:, 'videoid']
train_videoid_list = train_videoid.to_numpy().reshape(-1, 1).tolist()
test_videoid_list = test_videoid.to_numpy().reshape(-1, 1).tolist()
allvideo_raw = list(set(np.array(pd.concat([train_videoid, test_videoid])).tolist()))
allvideo = np.array(allvideo_raw).reshape(-1, 1).tolist()
from sklearn.preprocessing import MultiLabelBinarizer
mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
train_videoid = mlb_model_videoid.transform(train_videoid_list)
test_videoid = mlb_model_videoid.transform(test_videoid_list)
print('videoid feature generate successfully')
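# Illustration (hypothetical ids): after fit([[1], [2], [3]]),
# transform([[2]]) yields a 1x3 sparse indicator row with a single 1 in the
# column for id 2, i.e. each videoid becomes its own one-hot column.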
# Tag one-hot features. tag_preprocessing / get_tag_tfidf / ttfidf_list_generation
# are assumed to be defined elsewhere in this project.
tags, train_tag, test_tag = tag_preprocessing('tag')
# Tag tf-idf lookup table.
tag_dict = get_tag_tfidf('20200305', 'video_tag_tf_idf')
print('length tag_dict:', len(tag_dict))
# Sparse tf-idf-weighted tag matrix.
tag_corpus = tags.tolist()  # corpus
tag_tfidf_list = ttfidf_list_generation(tag_corpus, tag_dict)
tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
tag_feature_test = test_tag.multiply(tag_tf_idf_matrix)
print('tag tfidf feature generate successfully')
print('tag dimension:', len(tag_tfidf_list))
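# A minimal sketch of what ttfidf_list_generation is assumed to do, judging
# from how its output is used here (one weight per vocabulary entry, broadcast
# over the one-hot rows via multiply). Hypothetical, for reference only:
#   def ttfidf_list_generation(corpus, tfidf_dict):
#       # corpus: vocabulary entries; tfidf_dict: token -> tf-idf weight
#       return [tfidf_dict.get(token, 0.0) for token in corpus]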
# Word (non-tag) one-hot features.
words, train_words, test_words = tag_preprocessing('words_no_tag')
# Words tf-idf lookup table.
words_dict = get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
print('length words_dict:', len(words_dict))
# Sparse tf-idf-weighted words matrix.
words_corpus = words.tolist()  # corpus
words_tfidf_list = ttfidf_list_generation(words_corpus, words_dict)
words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
words_feature_train = train_words.multiply(words_tf_idf_matrix)
words_feature_test = test_words.multiply(words_tf_idf_matrix)
print('words tfidf feature generate successfully')
print('words dimension:', len(words_tfidf_list))
def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
                      values_length, video_id_length, tag_length, word_length):
    """Keep per-column importances for the value features, and collapse the
    videoid / tag / word one-hot blocks into one summed row per fold."""
    Feature_Data = pd.DataFrame()
    for fold_num, df in enumerate((fold1_df, fold2_df, fold3_df, fold4_df), start=1):
        value_rows = df.iloc[0:values_length, :]
        videoid_importance = df.iloc[values_length:values_length + video_id_length, :]['importance'].sum()
        videoid_row = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_importance, 'fold': fold_num}])
        tag_start = values_length + video_id_length
        tag_importance = df.iloc[tag_start:tag_start + tag_length, :]['importance'].sum()
        tag_row = pd.DataFrame([{'Feature': 'tags', 'importance': tag_importance, 'fold': fold_num}])
        word_start = tag_start + tag_length
        word_importance = df.iloc[word_start:word_start + word_length, :]['importance'].sum()
        word_row = pd.DataFrame([{'Feature': 'words', 'importance': word_importance, 'fold': fold_num}])
        Feature_Data = pd.concat([Feature_Data, value_rows, videoid_row, tag_row, word_row])
    return Feature_Data
def MAPE(true, pred):
    """Mean absolute percentage error, skipping zero-valued targets."""
    true = np.array(true)
    sum_ = 0
    count = 0
    for i in range(len(true)):
        if true[i] != 0:
            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
            count = count + 1
        else:
            continue
    return sum_ / count
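# A vectorized equivalent of MAPE (a sketch, not the original author's code;
# assumes `true` and `pred` are equal-length array-likes):
def mape_vectorized(true, pred):
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    mask = true != 0  # skip zero targets, as MAPE above does
    return np.mean(np.abs(true[mask] - pred[mask]) / true[mask])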
def do_train():
    from scipy.sparse import hstack
    # These assignments rebind module-level names; without the global
    # declaration, reading df_target / df_target_test on the right-hand side
    # below would raise UnboundLocalError.
    global df_new_feature, df_new_feature_test, df_target, df_target_test
    # Stack value, videoid one-hot, tag and word tf-idf blocks into one sparse matrix.
    df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
    df_new_feature_test = hstack([df_new_feature_test_part_one, test_videoid, tag_feature_test, words_feature_test])
    # Targets as dense arrays (the original round-tripped through csr_matrix).
    df_target_test = pd.DataFrame(df_target_test).values
    df_target = pd.DataFrame(df_target).values
    param = {'num_leaves': 18,
             'min_data_in_leaf': 60,
             'objective': 'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             'min_child_samples': 30,
             'boosting': 'gbdt',
             'feature_fraction': 0.8,
             'bagging_freq': 1,
             'bagging_fraction': 0.8,
             'bagging_seed': 11,
             'metric': 'rmse',
             'lambda_l1': 0.1,
             'verbosity': -1,
             'nthread': 4,
             # 'max_bin': 512,
             'random_state': 4590}
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
    oof = np.zeros(df_new_feature.shape[0])
    predictions = np.zeros(len(df_target_test))
    feature_importance_df = pd.DataFrame()
    values_length = len(features + cate_feat)
    video_id_length = len(mlb_model_videoid.classes_)
    tag_length = len(tag_tfidf_list)
    word_length = len(words_tfidf_list)
    change_view = pd.DataFrame(df_new_feature_test.toarray())
    change_view = change_view.sort_index()
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, data_train['return_back'].values)):
        print("folds {}".format(fold_))
        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])
        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
                        early_stopping_rounds=200)
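        # NOTE: verbose_eval / early_stopping_rounds are the pre-4.0 LightGBM
        # train() keywords. On LightGBM >= 4.0 the equivalent call (untested
        # here) would pass callbacks instead:
        #   clf = lgb.train(param, trn_data, num_round,
        #                   valid_sets=[trn_data, val_data],
        #                   callbacks=[lgb.early_stopping(200),
        #                              lgb.log_evaluation(100)])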
        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
        predictions += clf.predict(df_new_feature_test.tocsr(), num_iteration=clf.best_iteration) / folds.n_splits
        fold_importance_df = pd.DataFrame()
        column = features + cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
        fold_importance_df["Feature"] = np.array(column)
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # Collapse the one-hot blocks into per-group importance rows.
    fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
    fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
    fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
    fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
    feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
                                              values_length, video_id_length, tag_length, word_length)
    print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
    print('oof_mse:', mean_squared_error(df_target, oof))
    print('test_rmse:', np.sqrt(mean_squared_error(df_target_test, predictions)))
    print('test_mse:', mean_squared_error(df_target_test, predictions))
    print('oof_mape:', MAPE(df_target, oof))
    print('test_mape:', MAPE(df_target_test, predictions))
    print('verification r2:', r2_score(df_target, oof))
    print('test r2:', r2_score(df_target_test, predictions))
    sub_df_ = pd.DataFrame({"videoid": data_test["videoid"].values})
    sub_df_['score'] = predictions
    print('regre ranking shape', sub_df_.shape)