# rov_train_classify.py
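"""Train a LightGBM multiclass model that buckets a video's future 7-day
return count into 8 ordinal levels, then score a prediction set.

Feature data comes either from ODPS/MaxCompute partitions (getdatasample)
or from the pickled frames loaded in __main__."""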

import warnings
warnings.filterwarnings("ignore")
import os
import datetime
from datetime import datetime as dt
import _pickle as cPickle

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import top_k_accuracy_score, r2_score  # r2_score is used in auto_train()
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from odps import ODPS

import process_feature
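# Pull one day's feature partition (dt=YYYYMMDD) from an ODPS/MaxCompute table
# and materialise it as a pandas DataFrame, keeping only the columns listed in
# process_feature.featurename.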
def getRovfeaturetable(dt, table):
    # Credentials are read from the environment rather than hard-coded;
    # the variable names below are placeholders for this project's own.
    odps = ODPS(os.environ['ODPS_ACCESS_ID'], os.environ['ODPS_ACCESS_KEY'], 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray
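# Build a training sample by concatenating the feature partitions of the
# `max_range` days ending at `date`, walking backwards one day at a time.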
def getdatasample(date, max_range, table):
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    data = pd.concat(testlist)
    data = data.reset_index(drop=True)
    return data
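# Map the continuous 7-day return count onto 8 ordinal buckets:
#   >1,000,000 -> 7   >500,000 -> 6   >100,000 -> 5   >50,000 -> 4
#   >10,000    -> 3   >5,000   -> 2   >1,000    -> 1   otherwise -> 0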
def discrete_y(y):
    y = float(y)
    if y > 1000000:
        return 7
    elif y > 500000:
        return 6
    elif y > 100000:
        return 5
    elif y > 50000:
        return 4
    elif y > 10000:
        return 3
    elif y > 5000:
        return 2
    elif y > 1000:
        return 1
    else:
        return 0
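# Split a raw feature frame into (X, y, video ids, feature names):
# discretise the label, drop string/id columns, derive stage-over-stage
# return deltas and ratios, and drop the day30/day60 columns.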
def clean_data(df):
    df['futre7dayreturn'] = df['futre7dayreturn'].apply(discrete_y)
    y = df['futre7dayreturn']
    print(y)
    df_vids = df['videoid']
    # Drop string/id columns and the label itself.
    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
    x = x.drop(['futre7dayreturn'], axis=1)
    # Stage-over-stage return deltas and their ratios.
    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn']) / x['stage_four_retrn']
    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn']) / x['stage_three_retrn']
    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn']) / x['stage_two_retrn']
    features = list(x)
    drop_features = [f for f in features if ('day30' in f or 'day60' in f)]
    x = x.drop(drop_features, axis=1)
    x = x.fillna(0)
    x = x.astype('float64')
    # clip() returns a copy, so assign the result back; this also caps the
    # inf values the ratio features produce on zero denominators.
    x = x.clip(0, 2000000)
    features = [f for f in features if f not in drop_features]
    return x, y, df_vids, features
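# Model-based feature screening via SelectFromModel on a LogisticRegression
# fit; only referenced by the commented-out block inside train().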
def feature_selection(X, y):
    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
    return selector
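# Optional FLAML baseline: fit an AutoML LGBM regressor under a small time
# budget and report in-sample r2. Not called by default (see __main__).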
def auto_train(X_train, y_train):
    from flaml import AutoML
    automl = AutoML()
    automl_settings = {
        "time_budget": 10,  # seconds
        "metric": 'r2',
        "task": 'regression',
        "log_file_name": "test/auto.log",
        "estimator_list": ["lgbm"]
    }
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    # Note: this evaluates on the training data itself, not a hold-out set.
    pred_test_y = automl.predict(X_train)
    y_test = y_train.values
    r2 = r2_score(y_test, pred_test_y)
    print('r2', r2)
    pack_result(pred_test_y, y_test, [], 'autoval.csv')
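# Train the multiclass LightGBM model on a stratified 67/33 split, write the
# feature importances and validation predictions to CSV, and return the
# predictions, the booster, and the recorded eval metrics.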
def train(x, y, features):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify=y)
    '''
    selector = feature_selection(X_train, y_train)
    X_train = selector.transform(X_train)
    X_test = selector.transform(X_test)
    selected_features = []
    _supported = selector.get_support()
    for i in range(0, len(_supported)):
        if _supported[i]:
            selected_features.append(features[i])
    features = selected_features
    '''
    print(len(X_train), len(X_test))
    params = {
        "objective": "multiclass",
        "num_classes": 8,
        "max_depth": 6,
        "num_leaves": 30,
        "learning_rate": 0.05,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 8,
        "bagging_seed": 2018,
        "lambda_l1": 0.1,
        "boosting": "gbdt",
        "nthread": 4,
        "verbosity": -1
    }
    lgtrain = lgb.Dataset(X_train, label=y_train)
    lgval = lgb.Dataset(X_test, label=y_test)
    evals_result = {}
    # The callback API replaces the deprecated early_stopping_rounds /
    # verbose_eval / evals_result keyword arguments of lgb.train().
    model = lgb.train(params, lgtrain, num_boost_round=10000, valid_sets=[lgval],
                      callbacks=[lgb.early_stopping(stopping_rounds=200),
                                 lgb.log_evaluation(period=20),
                                 lgb.record_evaluation(evals_result)])
    pack_result(model.feature_importance(), features, [], 'importance.csv')
    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
    # predict() returns per-class probabilities; passing the class labels keeps
    # the score well defined even if a class is absent from y_test.
    top1 = top_k_accuracy_score(y_test, pred_test_y, k=1, labels=list(range(8)))
    print('top_k_accuracy_score', top1)
    pack_result(pred_test_y, y_test, [], 'val.csv')
    return pred_test_y, model, evals_result
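# Dump predictions to CSV sorted by descending score; `vid` is optional and
# only written when non-empty.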
def pack_result(y_, y, vid, fp):
    y_ = np.asarray(y_)
    if y_.ndim > 1:
        # Multiclass probability rows are collapsed to the predicted class
        # index so they fit a single 'score' column and sort by bucket level.
        y_ = y_.argmax(axis=1)
    df = pd.DataFrame(data=y_, columns=['score'])
    if len(vid) > 0:
        df['vid'] = vid
    df['y'] = y
    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)
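# Entry point: load the pickled train/predict frames, fit and persist the
# model, then score the prediction set and write pred.csv.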
if __name__ == '__main__':
    with open("train_data_x.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open("predict_data_x.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)
    x, y, _, features = clean_data(train_data)
    # auto_train(x, y)
    _, model, _ = train(x, y, features)
    with open('model.pickle', 'wb') as output_file:
        cPickle.dump(model, output_file)
    '''
    with open("model.pickle", "rb") as input_file:
        model = cPickle.load(input_file)
    '''
    x, y, vid, _ = clean_data(predict_data)
    y_ = model.predict(x, num_iteration=model.best_iteration)
    pack_result(y_, y, vid, 'pred.csv')