# rov_train2.py

import warnings
warnings.filterwarnings("ignore")

import datetime
from datetime import datetime as dt

import numpy as np
import pandas as pd
import lightgbm as lgb
from odps import ODPS
from scipy import sparse
from scipy.sparse import hstack
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer

import process_feature
import process_tag

def getRovfeaturetable(dt, table):
    # NB: the `dt` parameter shadows the module-level `datetime` alias inside
    # this function; it is the partition date string, e.g. '20200305'.
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray
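
# Typical call: getRovfeaturetable('20200305', 'rov_feature_add_v1') reads one
# daily partition (dt=20200305) of the ODPS table into a pandas DataFrame.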

def getdatasample(date, max_range, table):
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    testdata = pd.concat(testlist)
    testdata.reset_index(inplace=True)
    testdata = testdata.drop(axis=1, columns='index')
    return testdata
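
# Example: getdatasample('20200305', 30, 'rov_feature_add_v1') stacks the 30
# daily partitions 20200305, 20200304, ..., 20200205 into one DataFrame.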

def select_recent_video(df):
    """Rank each video's rows by date (descending) and keep only the most recent day."""
    df['dt'] = df['dt'].astype(int)
    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
    df = df[df['rk'] == 1]
    return df
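
# Example: with rows (videoid=1, dt=20200304) and (videoid=1, dt=20200305),
# rank(ascending=0) assigns rk=2 and rk=1 respectively, so only the
# dt=20200305 snapshot survives the rk == 1 filter.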

def basic_cal(df):
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    df['weighted_retrn_log'] = np.log1p(df['weighted_retrn'])
    df['return_back'] = (df['weighted_retrn'] > 0).astype(int)
    return df
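
# weighted_retrn_log (log1p of the 7-day return count) is the regression
# target; the binary return_back flag is only used to stratify the CV folds
# in do_train.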

def dataprepare(df_pre):
    # Feed the raw features straight in; no crossed features are added.
    df_pre = df_pre.fillna(0)
    #df_new_feature = df_pre[process_feature.features]
    df_new_feature = df_pre[process_feature.filter_recent_features()]
    df_target = df_pre['weighted_retrn_log']
    df_new_feature = pd.concat([df_new_feature, df_pre[process_feature.cate_feat],
                                df_pre[process_feature.one_hot_feature]], axis=1)
    return df_new_feature, df_target
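
# dataprepare returns a wide frame of numeric, categorical and one-hot columns
# plus the log target; the sparse videoid/tag/word blocks are appended later in
# process_train_predict_data.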

def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df,
                      values_lenth, video_id_lenth, tag_length, word_length):
    # Collapse the per-column importances of the one-hot/tf-idf blocks into a
    # single row each for 'videoid', 'tags' and 'words'.
    Feature_Data = pd.DataFrame()
    for df in (fold1_df, fold2_df, fold3_df, fold4_df):
        fold1_df1 = df.iloc[0:values_lenth, :]
        videoid_fold1_importance = df.iloc[values_lenth:values_lenth + video_id_lenth, :]['importance'].sum()
        fold1_df2 = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_fold1_importance, 'fold': 1}])
        tag_fold1_importance = df.iloc[values_lenth + video_id_lenth:values_lenth + video_id_lenth + tag_length, :]['importance'].sum()
        fold1_df3 = pd.DataFrame([{'Feature': 'tags', 'importance': tag_fold1_importance, 'fold': 1}])
        words_fold1_importance = df.iloc[values_lenth + video_id_lenth + tag_length:values_lenth + video_id_lenth + tag_length + word_length, :]['importance'].sum()
        fold1_df4 = pd.DataFrame([{'Feature': 'words', 'importance': words_fold1_importance, 'fold': 1}])
        Feature_Data = pd.concat([Feature_Data, fold1_df1, fold1_df2, fold1_df3, fold1_df4])
    return Feature_Data
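
# The slice offsets mirror the column layout of the stacked design matrix:
# [numeric/categorical values | videoid one-hot | tag tf-idf | word tf-idf].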

def MAPE(true, pred):
    """Mean absolute percentage error, skipping samples whose true value is 0."""
    true = np.array(true)
    sum_ = 0
    count = 0
    for i in range(len(true)):
        if true[i] != 0:
            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
            count = count + 1
        else:
            continue
    return sum_ / count
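
# Worked example: MAPE([2, 0, 4], [1, 5, 5]) skips the zero target and
# returns (|2-1|/2 + |4-5|/4) / 2 = 0.375.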

def process_train_predict_data():
    now_date = datetime.date.today()
    # day = datetime.datetime.strftime(now_date, '%Y%m%d')
    diff_1 = datetime.timedelta(days=1)
    diff_7 = datetime.timedelta(days=7)
    predict_dt = now_date - diff_1
    predict_day = predict_dt.strftime('%Y%m%d')
    train_dt = now_date - diff_7
    train_day = train_dt.strftime('%Y%m%d')
    # read data from ali
    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
    # pickle for test
    import _pickle as cPickle
    with open('train_data.pickle', 'wb') as output_file:
        cPickle.dump(train_data, output_file)
    with open('predict_data.pickle', 'wb') as output_file:
        cPickle.dump(predict_data, output_file)
    '''
    with open(r"train_data.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open(r"predict_data.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)
    '''
    # end pickle
    train_data = basic_cal(train_data)
    predict_data = basic_cal(predict_data)
    predict_data = select_recent_video(predict_data)
    #predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
    predict_data = predict_data.drop(axis=1, columns='rk')
    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    train_data = train_data.fillna(0)
    predict_data = predict_data.fillna(0)
    train_data = process_feature.cal_feature(train_data)
    predict_data = process_feature.cal_feature(predict_data)
    predict_data['videoid'] = predict_data['videoid'].astype('int')
    df_new_feature, df_target = dataprepare(train_data)
    df_new_feature_predict, df_target_predict = dataprepare(predict_data)
    # Numeric block: every column from day1playcount through videocategory555.
    df_new_feature_part_one = sparse.csr_matrix(
        np.array(df_new_feature.loc[:, 'day1playcount':'videocategory555']))
    df_new_feature_predict_part_one = sparse.csr_matrix(
        np.array(df_new_feature_predict.loc[:, 'day1playcount':'videocategory555']))
    print('value feature generate successfully')
    train_videoid = df_new_feature.loc[:, 'videoid']
    predict_videoid = df_new_feature_predict.loc[:, 'videoid']
    train_videoid_list = train_videoid.to_numpy().reshape(-1, 1).tolist()
    predict_videoid_list = predict_videoid.to_numpy().reshape(-1, 1).tolist()
    allvideo_raw = list(set(np.array(pd.concat([train_videoid, predict_videoid])).tolist()))
    allvideo = np.array(allvideo_raw).reshape(-1, 1).tolist()
    mlb_model_videoid = MultiLabelBinarizer(sparse_output=True).fit(allvideo)
    train_videoid = mlb_model_videoid.transform(train_videoid_list)
    predict_videoid = mlb_model_videoid.transform(predict_videoid_list)
    print('videoid feature generate successfully')
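
    # Note: MultiLabelBinarizer is fit on the union of train and predict ids,
    # so both matrices share one column per distinct videoid.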
    # tag one-hot
    tags, train_tag, predict_tag = process_tag.tag_preprocessing('tag', df_new_feature, df_new_feature_predict)
    # tag tf-idf weights
    tag_dict = process_tag.get_tag_tfidf('20200305', 'video_tag_tf_idf')
    print('length tag_dict:', len(tag_dict))
    # sparse tf-idf tag matrix
    tag_corpus = tags.tolist()  # corpus
    tag_tfidf_list = process_tag.ttfidf_list_generation(tag_corpus, tag_dict)
    tag_tf_idf_matrix = sparse.csr_matrix(np.array(tag_tfidf_list))
    tag_feature_train = train_tag.multiply(tag_tf_idf_matrix)
    tag_feature_test = predict_tag.multiply(tag_tf_idf_matrix)
    print('tag tfidf feature generate successfully')
    print('tag dimension:', len(tag_tfidf_list))
    # words (excluding tags) one-hot
    words, train_words, test_words = process_tag.tag_preprocessing('words_no_tag', df_new_feature, df_new_feature_predict)
    # words tf-idf weights
    words_dict = process_tag.get_tag_tfidf('20200305', 'video_words_without_tags_tfidf')
    print('length words_dict:', len(words_dict))
    # sparse tf-idf words matrix
    words_corpus = words.tolist()  # corpus
    words_tfidf_list = process_tag.ttfidf_list_generation(words_corpus, words_dict)
    words_tf_idf_matrix = sparse.csr_matrix(np.array(words_tfidf_list))
    words_feature_train = train_words.multiply(words_tf_idf_matrix)
    words_feature_test = test_words.multiply(words_tf_idf_matrix)
    print('words tfidf feature generate successfully')
    print('words dimension:', len(words_tfidf_list))
    df_new_feature = hstack([df_new_feature_part_one, train_videoid, tag_feature_train, words_feature_train])
    df_new_feature_predict = hstack([df_new_feature_predict_part_one, predict_videoid, tag_feature_test, words_feature_test])
    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
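
# After hstack the design matrix columns are, in order:
# [numeric values | videoid one-hot | tag tf-idf | word tf-idf];
# the commented-out featureImportance call in do_train assumes this layout.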

def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
    # targets as dense (n, 1) arrays
    df_target_predict = sparse.csr_matrix(pd.DataFrame(df_target_predict).values).toarray()
    df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()
    param = {'num_leaves': 18,
             'min_data_in_leaf': 60,
             'objective': 'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 30,
             "boosting": "gbdt",
             "feature_fraction": 0.8,
             "bagging_freq": 1,
             "bagging_fraction": 0.8,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1,
             "nthread": 4,
             # 'max_bin': 512,
             "random_state": 4590}
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=4590)
    #oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
    oof = np.zeros(len(df_target))
    predictions = np.zeros(len(df_target_predict))
    feature_importance_df = pd.DataFrame()
    # values_lenth = len(process_feature.features + process_feature.cate_feat)
    # video_id_lenth = len(mlb_model_videoid.classes_)
    # tag_length = len(tag_tfidf_list)
    # word_length = len(words_tfidf_list)
    change_view = pd.DataFrame(df_new_feature_predict.toarray())
    change_view = change_view.sort_index()
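
    # StratifiedKFold needs discrete classes, so the folds are stratified on
    # the binary return_back flag rather than on the continuous target.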
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature, train_data['return_back'].values)):
        print("folds {}".format(fold_))
        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])
        num_round = 10000
        # NB: pre-4.0 LightGBM API; newer versions take early stopping and
        # logging as callbacks instead of keyword arguments.
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
                        early_stopping_rounds=200)
        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
        predictions += clf.predict(df_new_feature_predict.tocsr(), num_iteration=clf.best_iteration) / folds.n_splits
        fold_importance_df = pd.DataFrame()
        # column = process_feature.features + process_feature.cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
        # fold_importance_df["Feature"] = np.array(column)
        # fold_importance_df["importance"] = clf.feature_importance()
        # fold_importance_df["fold"] = fold_ + 1
        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
    # fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
    # fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
    # fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
    # feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length)
    print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
    print('oof_mse:', mean_squared_error(df_target, oof))
    print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
    print('test_mse:', mean_squared_error(df_target_predict, predictions))
    print('oof_mape:', MAPE(df_target, oof))
    print('test_mape:', MAPE(df_target_predict, predictions))
    print('verification r2:', r2_score(df_target, oof))
    print('test r2:', r2_score(df_target_predict, predictions))
    sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
    sub_df_['score'] = predictions
    print('regre ranking shape', sub_df_.shape)
    sub_df_.to_csv('result.csv')


if __name__ == '__main__':
    train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
    do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict)