# rov_train2.py

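"""Train a LightGBM regression model for ROV (return-of-video) scoring.

As implemented below: daily feature partitions are pulled from ODPS/MaxCompute,
the 7-day return is log-transformed into the regression target, a LightGBM model
is trained with 4-fold cross-validation, and per-video scores for the most recent
day are written to result.csv.
"""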
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import r2_score
import os
import pandas as pd
import gc
import math
import numpy as np
import time
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import SGDClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold  # the target below is continuous, so plain KFold replaces StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import metrics
import pickle
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pylab as plt
from odps import ODPS
from odps.df import DataFrame as odpsdf
from datetime import datetime as dt
import datetime
from scipy import sparse
from scipy.sparse import hstack
import process_feature
import process_tag
def getRovfeaturetable(dt, table):
    """Read one day's partition of `table` from ODPS into a pandas DataFrame."""
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000,
                read_timeout=500000, pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            # the partition column comes from the argument, everything else from the record
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray
def getdatasample(date, max_range, table):
    """Fetch `max_range` daily partitions ending at `date` and concatenate them."""
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    testdata = pd.concat(testlist)
    testdata.reset_index(inplace=True)
    testdata = testdata.drop(axis=1, columns='index')
    return testdata
def select_recent_video(df):
    """Rank each video's rows by date and keep only the most recent day."""
    df['dt'] = df['dt'].astype(int)
    df['rk'] = df['dt'].groupby(df['videoid']).rank(ascending=0, method='first')
    df = df[df['rk'] == 1]
    return df
def basic_cal(df):
    # log(1 + 7-day return) is the regression target
    df['weighted_retrn'] = df['futre7dayreturn'].astype('int')
    df['weighted_retrn_log'] = np.log(df['weighted_retrn'] + 1)
    return df
def dataprepare(df_pre):
    # Feed the features in directly, without crossed features.
    df_pre = df_pre.fillna(0)
    # df_new_feature = df_pre[process_feature.features]
    df_new_feature = df_pre[process_feature.filter_recent_features()]
    df_target = df_pre['weighted_retrn_log']
    df_new_feature = pd.concat([df_new_feature, df_pre[process_feature.cate_feat],
                                df_pre[process_feature.one_hot_feature]], axis=1)
    return df_new_feature, df_target
def featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length):
    """Collapse the one-hot videoid/tag/word importance blocks of each fold into single rows."""
    Feature_Data = pd.DataFrame()
    for df in (fold1_df, fold2_df, fold3_df, fold4_df):
        fold1_df1 = df.iloc[0:values_lenth, :]
        videoid_fold1_importance = df.iloc[values_lenth:values_lenth + video_id_lenth, :]['importance'].sum()
        fold1_df2 = pd.DataFrame([{'Feature': 'videoid', 'importance': videoid_fold1_importance, 'fold': 1}])
        tag_fold1_importance = df.iloc[values_lenth + video_id_lenth:values_lenth + video_id_lenth + tag_length, :]['importance'].sum()
        fold1_df3 = pd.DataFrame([{'Feature': 'tags', 'importance': tag_fold1_importance, 'fold': 1}])
        words_fold1_importance = df.iloc[values_lenth + video_id_lenth + tag_length:values_lenth + video_id_lenth + tag_length + word_length, :]['importance'].sum()
        fold1_df4 = pd.DataFrame([{'Feature': 'words', 'importance': words_fold1_importance, 'fold': 1}])
        Feature_Data = pd.concat([Feature_Data, fold1_df1, fold1_df2, fold1_df3, fold1_df4])
    return Feature_Data
def MAPE(true, pred):
    """Mean absolute percentage error over the non-zero entries of `true`."""
    true = np.array(true)
    sum_ = 0
    count = 0
    for i in range(len(true)):
        if true[i] != 0:
            sum_ = sum_ + np.abs(true[i] - pred[i]) / true[i]
            count = count + 1
        else:
            continue
    return sum_ / count
def process_train_predict_data():
    now_date = datetime.date.today()
    # day = datetime.datetime.strftime(now_date, '%Y%m%d')
    diff_1 = datetime.timedelta(days=1)
    diff_7 = datetime.timedelta(days=7)
    # predict on yesterday's partition, train on the 30 daily partitions ending a week before today
    predict_dt = now_date - diff_1
    predict_day = datetime.datetime.strftime(predict_dt, '%Y%m%d')
    train_dt = now_date - diff_7
    train_day = datetime.datetime.strftime(train_dt, '%Y%m%d')
    # read data from ODPS
    train_data = getdatasample(train_day, 30, 'rov_feature_add_v1')
    predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')
    # pickle block kept for local testing: uncomment to cache / reload the fetched tables
    import _pickle as cPickle
    # with open('train_data.pickle', 'wb') as output_file:
    #     cPickle.dump(train_data, output_file)
    # with open('predict_data.pickle', 'wb') as output_file:
    #     cPickle.dump(predict_data, output_file)
    # with open(r"train_data.pickle", "rb") as input_file:
    #     train_data = cPickle.load(input_file)
    # with open(r"predict_data.pickle", "rb") as input_file:
    #     predict_data = cPickle.load(input_file)
    # end pickle
    train_data = basic_cal(train_data)
    predict_data = basic_cal(predict_data)
    predict_data = select_recent_video(predict_data)
    # predict_data.loc[predict_data['dt'] != int(predict_day), 'futre7dayreturn'] = 0
    predict_data = predict_data.drop(axis=1, columns='rk')
    train_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    predict_data.drop_duplicates(subset=['videoid', 'dt'], keep='first', inplace=True)
    train_data = train_data.fillna(0)
    predict_data = predict_data.fillna(0)
    train_data = process_feature.cal_feature(train_data)
    predict_data = process_feature.cal_feature(predict_data)
    predict_data['videoid'] = predict_data['videoid'].astype('int')
    df_new_feature, df_target = dataprepare(train_data)
    df_new_feature_predict, df_target_predict = dataprepare(predict_data)
    # per-row videoid lists (not used further below)
    train_videoid = df_new_feature.loc[:, 'videoid']
    predict_videoid = df_new_feature_predict.loc[:, 'videoid']
    train_videoid_list = train_videoid.to_numpy().reshape(-1, 1).tolist()
    predict_videoid_list = predict_videoid.to_numpy().reshape(-1, 1).tolist()
    return train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict
def do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict):
    # targets: densify to (n, 1) arrays via a sparse round-trip
    df_target_predict = sparse.csr_matrix(pd.DataFrame(df_target_predict).values).toarray()
    df_target = sparse.csr_matrix(pd.DataFrame(df_target).values).toarray()
    # features: convert to CSR so the .tocsr()/.toarray() calls below work
    # (assumes every column kept by dataprepare is numeric after process_feature.cal_feature)
    df_new_feature = sparse.csr_matrix(pd.DataFrame(df_new_feature).values)
    df_new_feature_predict = sparse.csr_matrix(pd.DataFrame(df_new_feature_predict).values)
    param = {'num_leaves': 18,
             'min_data_in_leaf': 60,
             'objective': 'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 30,
             "boosting": "gbdt",
             "feature_fraction": 0.8,
             "bagging_freq": 1,
             "bagging_fraction": 0.8,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1,
             "nthread": 4,
             # 'max_bin': 512,
             "random_state": 4590}
    # plain KFold: StratifiedKFold cannot split on the continuous log-return target
    folds = KFold(n_splits=4, shuffle=True, random_state=4590)
    # oof = np.zeros(len(pd.DataFrame(df_new_feature.toarray())))
    oof = np.zeros(len(df_target))
    predictions = np.zeros(len(df_target_predict))
    feature_importance_df = pd.DataFrame()
    change_view = pd.DataFrame(df_new_feature_predict.toarray())
    change_view = change_view.sort_index()
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_new_feature)):
        print("folds {}".format(fold_))
        trn_data = lgb.Dataset(df_new_feature.tocsr()[trn_idx, :], label=pd.DataFrame(df_target).iloc[trn_idx])
        val_data = lgb.Dataset(df_new_feature.tocsr()[val_idx, :], label=pd.DataFrame(df_target).iloc[val_idx])
        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=100,
                        early_stopping_rounds=200)
        oof[val_idx] = clf.predict(df_new_feature.tocsr()[val_idx, :], num_iteration=clf.best_iteration)
        predictions += clf.predict(df_new_feature_predict, num_iteration=clf.best_iteration) / folds.n_splits
        fold_importance_df = pd.DataFrame()
        # column = process_feature.features + process_feature.cate_feat + mlb_model_videoid.classes_.tolist() + tag_corpus + words_corpus
        # fold_importance_df["Feature"] = np.array(column)
        # fold_importance_df["importance"] = clf.feature_importance()
        # fold_importance_df["fold"] = fold_ + 1
        # feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # fold1_df = feature_importance_df.loc[feature_importance_df['fold'] == 1]
    # fold2_df = feature_importance_df.loc[feature_importance_df['fold'] == 2]
    # fold3_df = feature_importance_df.loc[feature_importance_df['fold'] == 3]
    # fold4_df = feature_importance_df.loc[feature_importance_df['fold'] == 4]
    # feature_importance_df = featureImportance(fold1_df, fold2_df, fold3_df, fold4_df, values_lenth, video_id_lenth, tag_length, word_length)
    print('oof_rmse:', np.sqrt(mean_squared_error(df_target, oof)))
    print('oof_mse:', mean_squared_error(df_target, oof))
    print('test_rmse:', np.sqrt(mean_squared_error(df_target_predict, predictions)))
    print('test_mse:', mean_squared_error(df_target_predict, predictions))
    print('oof_mape:', MAPE(df_target, oof))
    # print('test_mape:', MAPE(df_target_predict, predictions))
    print('verification r2:', r2_score(df_target, oof))
    # print('test r2:', r2_score(df_target_predict, predictions))
    sub_df_ = pd.DataFrame({"videoid": predict_data["videoid"].values})
    sub_df_['score'] = predictions
    print('regre ranking shape', sub_df_.shape)
    sub_df_.to_csv('result.csv')
if __name__ == '__main__':
    train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict = process_train_predict_data()
    do_train(train_data, predict_data, df_target, df_target_predict, df_new_feature, df_new_feature_predict)
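# Running this end-to-end assumes: the in-house process_feature / process_tag modules are
# importable, the ODPS project above is reachable, and LightGBM is older than 4.0
# (lgb.train is called with verbose_eval / early_stopping_rounds, which later releases
# replaced with log_evaluation / early_stopping callbacks). Scores are written to result.csv.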