rov_train_regression_auto.py

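"""Train a ROV (return-count) regression model on daily ODPS feature partitions,
either with a fixed-parameter LightGBM run (train) or with a FLAML AutoML search
restricted to LightGBM (auto_train)."""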
import warnings
warnings.filterwarnings("ignore")

import os
import gc
import math
import time
import pickle
import datetime
from datetime import datetime as dt
import _pickle as cPickle

import numpy as np
import pandas as pd
import lightgbm as lgb
import seaborn as sns
import matplotlib.pylab as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from odps import ODPS
from odps.df import DataFrame as odpsdf

import process_feature
def getRovfeaturetable(dt, table):
    # NB: the parameter name `dt` shadows the module-level `datetime` alias
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)
    # read one daily partition of the feature table, keeping only the columns
    # listed in process_feature.featurename
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray
def getdatasample(date, max_range, table):
    # fetch max_range consecutive daily partitions, walking backwards from `date`
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    data = pd.concat(testlist)
    data = data.reset_index(drop=True)
    return data
def clean_data(df):
    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
    # clamp non-positive targets to 1; assigning through df.loc avoids pandas'
    # chained-assignment pitfall ("futre" spelling follows the upstream schema)
    df.loc[df['futre7dayreturn'] <= 0, 'futre7dayreturn'] = 1
    y = df['futre7dayreturn']
    df_vids = df['videoid']
    # drop string columns
    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
    # drop future (label-leaking) columns
    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
    x = x.drop(['futre7dayreturn'], axis=1)
    # stage-to-stage return deltas and ratios
    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn']) / x['stage_four_retrn']
    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn']) / x['stage_three_retrn']
    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn']) / x['stage_two_retrn']
    features = list(x)
    # drop 30-day / 60-day aggregate features
    drop_features = [f for f in features if (f.find('day30') != -1 or f.find('day60') != -1)]
    x = x.drop(drop_features, axis=1)
    # fillna returns a new frame, so the result must be assigned back
    x = x.fillna(0)
    x = x.astype('float64')
    x = x.clip(1, 2000000)
    #features = [f for f in features if f not in drop_features]
    features = list(x)
    return x, y, df_vids, features
def std_data(df, features):
    # min-max scale each feature into roughly [0, 1); the +1 in the denominator
    # guards against division by zero when a column is constant
    for f in features:
        if df[f].max() > 1:
            df[f] = (df[f] - df[f].min()) / (df[f].max() - df[f].min() + 1)
    return df
def feature_selection(X, y):
    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
    return selector
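# NOTE: LogisticRegression is a classifier; fitting it on the continuous target
# used elsewhere in this script raises a ValueError ("Unknown label type") in
# scikit-learn. A regression-appropriate variant (a sketch, not in the original)
# would swap in an L1-penalized regressor:
#   from sklearn.linear_model import Lasso
#   selector = SelectFromModel(estimator=Lasso(alpha=0.1)).fit(X, y)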
def auto_train(X_train, y_train):
    from flaml import AutoML
    automl = AutoML()
    automl_settings = {
        "time_budget": 8000,  # in seconds
        "metric": 'mae',
        "task": 'regression',
        "log_file_name": "auto.log",
        "estimator_list": ["lgbm"]
    }
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    # note: predictions are made on the training data itself, so the scores
    # below measure fit, not generalization
    pred_test_y = automl.predict(X_train)
    y_test = y_train.values
    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
    r2 = r2_score(y_test, pred_test_y)
    print('err_mape', err_mape)
    print('r2', r2)
    pack_result(pred_test_y, y_test, [], 'autoval.csv')
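# A held-out variant (a sketch, not in the original): splitting before fitting
# gives an honest generalization estimate, e.g.
#   X_tr, X_val, y_tr, y_val = train_test_split(x, y, test_size=0.33)
#   automl = AutoML(); automl.fit(X_train=X_tr, y_train=y_tr, **automl_settings)
#   print(mean_absolute_percentage_error(y_val, automl.predict(X_val)))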
def train(x, y, features):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
    '''
    selector = feature_selection(X_train, y_train)
    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)
    selected_features = []
    _supported = selector.get_support()
    for i in range(0, len(_supported)):
        if _supported[i]:
            selected_features.append(features[i])
    features = selected_features
    '''
    print(len(X_train), len(X_test))
    params = {
        "objective": "regression",
        "reg_sqrt": True,
        "metric": "mape",
        "max_depth": -1,
        "num_leaves": 50,
        "learning_rate": 0.1,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 8,
        "bagging_seed": 2018,
        "lambda_l1": 0.11,
        "boosting": "gbdt",
        "nthread": 4,
        "verbosity": -1
    }
    lgtrain = lgb.Dataset(X_train, label=y_train)
    lgval = lgb.Dataset(X_test, label=y_test)
    evals_result = {}
    # early_stopping_rounds / verbose_eval / evals_result are keyword arguments
    # in LightGBM 3.x; LightGBM >= 4 moved them to callbacks
    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval],
                      early_stopping_rounds=200, verbose_eval=20,
                      evals_result=evals_result)
    pack_result(model.feature_importance(), features, [], 'importance.csv')
    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
    y_test = y_test.values
    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
    r2 = r2_score(y_test, pred_test_y)
    print('err_mape', err_mape)
    print('r2', r2)
    pack_result(pred_test_y, y_test, [], 'val.csv')
    return pred_test_y, model, evals_result
def pack_result(y_, y, vid, fp):
    #y_ = y_.astype(int)
    # flatten scores to 1-D so they fit a single 'score' column
    y_ = np.asarray(y_).reshape(-1)
    df = pd.DataFrame(data=y_, columns=['score'])
    if len(vid) > 0:
        df['vid'] = vid
    df['y'] = y
    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)
if __name__ == '__main__':
    with open(r"train_data_x.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open(r"predict_data_x.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)
    x, y, _, features = clean_data(train_data)
    #x = std_data(x, features)
    #print(x.describe())
    # auto train
    auto_train(x, y)
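    # predict_data is loaded above but unused in this entry point; a sketch of
    # the scoring path (an assumption, not part of the original flow):
    #   px, py, pvids, _ = clean_data(predict_data)
    #   pred, model, _ = train(x, y, features)
    #   pack_result(model.predict(px, num_iteration=model.best_iteration), py, pvids, 'pred.csv')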