rov_train_new.py

import warnings
warnings.filterwarnings("ignore")

import datetime
from datetime import datetime as dt

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from odps import ODPS

import process_feature
import _pickle as cPickle
def getRovfeaturetable(dt_str, table):
    """Read one dt=<dt_str> partition of an ODPS feature table into a DataFrame."""
    # NOTE: credentials are hardcoded here; consider loading them from the environment.
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)
    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt_str):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt_str
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt_str, table, 'feature table finish')
    return featureArray
def getdatasample(date, max_range, table):
    """Concatenate the feature partitions for `date` and the max_range - 1 days before it."""
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime("%Y%m%d"))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    data = pd.concat(testlist)
    data.reset_index(drop=True, inplace=True)
    return data
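
# Usage sketch (hypothetical date and table name, for illustration only):
#
#   df = getdatasample('20220601', 7, 'some_rov_feature_table')
#
# would read the partitions dt=20220601 back through dt=20220526 and
# concatenate them into one DataFrame with a fresh 0..n-1 index.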
def clean_data(df):
    """Split a raw feature DataFrame into (X, y, video ids, feature names)."""
    #y = df['futre7dayreturn'].apply(lambda x: np.log(x + 1))
    y = df['futre7dayreturn']
    df_vids = df['videoid']
    # drop string columns
    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
    # drop future (label) columns
    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
    x = x.drop(['futre7dayreturn'], axis=1)
    features = list(x)
    # drop the 30-day / 60-day aggregate features
    drop_features = [f for f in features if 'day30' in f or 'day60' in f]
    x = x.drop(drop_features, axis=1)
    features = [f for f in features if f not in drop_features]
    return x, y, df_vids, features
def train(x, y, features):
    """Train a LightGBM regressor; dump feature importances and validation scores to CSV."""
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
    params = {
        "objective": "regression",
        "metric": "mape",
        "max_depth": 5,
        "num_leaves": 30,
        "learning_rate": 0.1,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 5,
        "bagging_seed": 2018,
        "lambda_l1": 0.1,
        "boosting": "gbdt",
        "nthread": 4,
        "verbosity": -1,
    }
    lgtrain = lgb.Dataset(X_train, label=y_train)
    lgval = lgb.Dataset(X_test, label=y_test)
    evals_result = {}
    # early_stopping_rounds / verbose_eval / evals_result are the pre-4.0 LightGBM API
    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval],
                      early_stopping_rounds=100, verbose_eval=20,
                      evals_result=evals_result)
    pack_result(model.feature_importance(), features, [], 'importance.csv')
    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
    y_test = y_test.values
    #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
    r2 = r2_score(y_test, pred_test_y)
    #print('err_mape', err_mape)
    print('r2', r2)
    pack_result(pred_test_y, y_test, [], 'val.csv')
    return pred_test_y, model, evals_result
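
# The lgb.train call above uses the pre-4.0 LightGBM keyword API. If this script
# is run against lightgbm >= 4.0 (an assumption about your install), those
# keywords were removed and must be passed as callbacks instead; an equivalent
# sketch:
#
#   model = lgb.train(
#       params, lgtrain, num_boost_round=10000, valid_sets=[lgval],
#       callbacks=[
#           lgb.early_stopping(stopping_rounds=100),
#           lgb.log_evaluation(period=20),
#           lgb.record_evaluation(evals_result),
#       ],
#   )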
def pack_result(y_, y, vid, fp):
    """Write predictions (and optionally video ids) to a CSV sorted by descending score."""
    #y_ = y_.astype(int)
    y_ = np.asarray(y_).reshape(-1)  # flatten to 1-D so it maps to a single column
    df = pd.DataFrame(data=y_, columns=['score'])
    if len(vid) > 0:
        df['vid'] = vid
    df['y'] = y
    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)
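
# Illustrative call (made-up values): writes demo.csv sorted by descending
# score, with columns score, vid, y:
#
#   pack_result(np.array([0.2, 0.9]), [1, 3], ['v001', 'v002'], 'demo.csv')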
if __name__ == '__main__':
    with open("train_data.pickle", "rb") as input_file:
        train_data = cPickle.load(input_file)
    with open("predict_data.pickle", "rb") as input_file:
        predict_data = cPickle.load(input_file)

    # train
    x, y, _, features = clean_data(train_data)
    _, model, _ = train(x, y, features)
    with open('model.pickle', 'wb') as output_file:
        cPickle.dump(model, output_file)
    '''
    with open("model.pickle", "rb") as input_file:
        model = cPickle.load(input_file)
    '''

    # predict
    x, y, vid, _ = clean_data(predict_data)
    y_ = model.predict(x, num_iteration=model.best_iteration)
    pack_result(y_, y, vid, 'pred.csv')