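"""rov_train_new.py

Train a LightGBM regressor that predicts a video's 7-day return count
('futre7dayreturn') from daily feature tables stored in ODPS/MaxCompute.
The script loads pickled feature frames, cleans them, trains the model
(optionally via FLAML auto-tuning), and writes model.pickle plus CSV
reports (importance.csv, val.csv, pred.csv).
"""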
import warnings
warnings.filterwarnings("ignore")
import os
import datetime
from datetime import datetime as dt

import numpy as np
import pandas as pd
import lightgbm as lgb
import _pickle as cPickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from odps import ODPS

import process_feature

def getRovfeaturetable(dt, table):
    # Credentials are read from the environment rather than hard-coded;
    # ODPS_ACCESS_ID / ODPS_ACCESS_KEY are placeholder variable names.
    odps = ODPS(os.environ.get('ODPS_ACCESS_ID'), os.environ.get('ODPS_ACCESS_KEY'), 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    # Read one day's partition, keeping only the columns listed in process_feature.
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray

def getdatasample(date, max_range, table):
    # Pull max_range consecutive daily partitions ending at `date` and stack them.
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    data = pd.concat(testlist)
    data.reset_index(inplace=True)
    data = data.drop(axis=1, columns='index')
    return data

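# Example of pulling a week of training data (the table name is hypothetical):
# train_data = getdatasample('20220501', 7, 'rov_feature_table')
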
def clean_data(df):
    # Floor the 7-day-return target at 1 so downstream metrics (e.g. MAPE) stay defined.
    df.loc[df['futre7dayreturn'] <= 0, 'futre7dayreturn'] = 1
    y = df['futre7dayreturn']
    df_vids = df['videoid']
    # Drop string columns and the label itself from the feature matrix.
    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
    x = x.drop(['futre7dayreturn'], axis=1)
    # Stage-over-stage return deltas and ratios.
    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn']) / x['stage_four_retrn']
    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn']) / x['stage_three_retrn']
    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn']) / x['stage_two_retrn']
    features = list(x)
    # Drop 30-day / 60-day aggregate features.
    drop_features = [f for f in features if (f.find('day30') != -1 or f.find('day60') != -1)]
    x = x.drop(drop_features, axis=1)
    # Division by a zero stage count yields inf/NaN; zero both out before casting.
    x = x.replace([np.inf, -np.inf], np.nan).fillna(0)
    x = x.astype('float64')
    x = x.clip(0, 2000000)  # clip() returns a copy; the original code discarded it
    features = [f for f in features if f not in drop_features]
    return x, y, df_vids, features

def feature_selection(X, y):
    # Note: LogisticRegression is a classifier; for this regression target a
    # sparse linear model such as Lasso is the usual estimator (see the sketch below).
    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
    return selector

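# Sketch of a regression-appropriate alternative (not wired into the pipeline;
# the function name and the alpha default are illustrative):
def feature_selection_lasso(X, y, alpha=1.0):
    from sklearn.linear_model import Lasso
    # Lasso drives weak coefficients to exactly zero, so SelectFromModel
    # keeps only the features with non-zero weight.
    return SelectFromModel(estimator=Lasso(alpha=alpha)).fit(X, y)
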
def auto_train(X_train, y_train):
    from flaml import AutoML
    automl = AutoML()
    automl_settings = {
        "time_budget": 10,  # in seconds
        "metric": 'r2',
        "task": 'regression',
        "log_file_name": "test/auto.log",
        "estimator_list": ["lgbm"]
    }
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    # Note: this predicts on the training set, so r2 is an in-sample score.
    pred_test_y = automl.predict(X_train)
    y_test = y_train.values
    r2 = r2_score(y_test, pred_test_y)
    print('r2', r2)
    pack_result(pred_test_y, y_test, [], 'autoval.csv')

def train(x, y, features):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
    # Optional feature selection, disabled by default:
    # selector = feature_selection(X_train, y_train)
    # X_train = selector.transform(X_train)
    # X_test = selector.transform(X_test)
    # features = [f for f, keep in zip(features, selector.get_support()) if keep]
    print(len(X_train), len(X_test))
    params = {
        "objective": "regression",
        "reg_sqrt": True,
        "metric": "mape",
        "max_depth": 6,
        "num_leaves": 30,
        "learning_rate": 0.05,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 8,
        "bagging_seed": 2018,
        "lambda_l1": 0.1,
        "boosting": "gbdt",
        "nthread": 4,
        "verbosity": -1
    }
    lgtrain = lgb.Dataset(X_train, label=y_train)
    lgval = lgb.Dataset(X_test, label=y_test)
    evals_result = {}
    # early_stopping_rounds / verbose_eval require LightGBM < 4.0;
    # a callback-based equivalent is sketched after this function.
    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval],
                      early_stopping_rounds=200, verbose_eval=20,
                      evals_result=evals_result)
    pack_result(model.feature_importance(), features, [], 'importance.csv')
    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
    y_test = y_test.values
    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
    r2 = r2_score(y_test, pred_test_y)
    print('err_mape', err_mape)
    print('r2', r2)
    pack_result(pred_test_y, y_test, [], 'val.csv')
    return pred_test_y, model, evals_result

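# For LightGBM >= 4.0 the early_stopping_rounds / verbose_eval / evals_result
# keyword arguments were removed in favour of callbacks; an equivalent call
# would look like this (a sketch, not exercised by this script):
#
#   model = lgb.train(params, lgtrain, num_boost_round=10000, valid_sets=[lgval],
#                     callbacks=[lgb.early_stopping(stopping_rounds=200),
#                                lgb.log_evaluation(period=20),
#                                lgb.record_evaluation(evals_result)])
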
def pack_result(y_, y, vid, fp):
    # Write predictions (and optionally video ids) sorted by predicted score.
    y_ = np.asarray(y_).reshape(-1)  # the original reshape discarded its result
    df = pd.DataFrame(data=y_, columns=['score'])
    if len(vid) > 0:
        df['vid'] = vid
    df['y'] = y
    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)

if __name__ == '__main__':
    with open(r"train_data_x.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open(r"predict_data_x.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)
    x, y, _, features = clean_data(train_data)
    # Train (use auto_train(x, y) instead for the FLAML baseline).
    _, model, _ = train(x, y, features)
    with open('model.pickle', 'wb') as output_file:
        cPickle.dump(model, output_file)
    # To reuse a saved model instead of retraining:
    # with open(r"model.pickle", "rb") as input_file:
    #     model = cPickle.load(input_file)
    x, y, vid, _ = clean_data(predict_data)
    y_ = model.predict(x, num_iteration=model.best_iteration)
    pack_result(y_, y, vid, 'pred.csv')