|  | @@ -10,10 +10,10 @@ import time
 | 
											
												
													
														|  |  import lightgbm as lgb
 |  |  import lightgbm as lgb
 | 
											
												
													
														|  |  from sklearn.model_selection import train_test_split
 |  |  from sklearn.model_selection import train_test_split
 | 
											
												
													
														|  |  from sklearn.model_selection import StratifiedKFold
 |  |  from sklearn.model_selection import StratifiedKFold
 | 
											
												
													
														|  | -from sklearn.metrics import mean_absolute_percentage_error, r2_score
 |  | 
 | 
											
												
													
														|  | 
 |  | +from sklearn.metrics import r2_score
 | 
											
												
													
														|  |  from sklearn import metrics
 |  |  from sklearn import metrics
 | 
											
												
													
														|  |  import pickle
 |  |  import pickle
 | 
											
												
													
														|  | -from sklearn.metrics import mean_squared_error
 |  | 
 | 
											
												
													
														|  | 
 |  | +from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
 | 
											
												
													
														|  |  import seaborn as sns
 |  |  import seaborn as sns
 | 
											
												
													
														|  |  import matplotlib.pylab as plt
 |  |  import matplotlib.pylab as plt
 | 
											
												
													
														|  |  from odps import ODPS
 |  |  from odps import ODPS
 | 
											
										
											
												
													
														|  | @@ -22,7 +22,8 @@ from datetime import datetime as dt
 | 
											
												
													
														|  |  import datetime
 |  |  import datetime
 | 
											
												
													
														|  |  import process_feature
 |  |  import process_feature
 | 
											
												
													
														|  |  import _pickle as cPickle
 |  |  import _pickle as cPickle
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | 
 |  | +from sklearn.feature_selection import SelectFromModel
 | 
											
												
													
														|  | 
 |  | +from sklearn.linear_model import LogisticRegression
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  def getRovfeaturetable(dt, table):
 |  |  def getRovfeaturetable(dt, table):
 | 
											
												
													
														|  |      odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
 |  |      odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
 | 
											
										
											
												
													
														|  | @@ -60,6 +61,7 @@ def getdatasample(date, max_range, table):
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  def clean_data(df):
 |  |  def clean_data(df):
 | 
											
												
													
														|  |      #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
 |  |      #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
 | 
											
												
													
														|  | 
 |  | +    df['futre7dayreturn'].loc[df['futre7dayreturn']<=0] = 1
 | 
											
												
													
														|  |      y = df['futre7dayreturn']
 |  |      y = df['futre7dayreturn']
 | 
											
												
													
														|  |      df_vids = df['videoid']
 |  |      df_vids = df['videoid']
 | 
											
												
													
														|  |      #drop string
 |  |      #drop string
 | 
											
										
											
												
													
														|  | @@ -68,24 +70,81 @@ def clean_data(df):
 | 
											
												
													
														|  |      #drop future
 |  |      #drop future
 | 
											
												
													
														|  |      #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
 |  |      #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
 | 
											
												
													
														|  |      x = x.drop(['futre7dayreturn'], axis=1)
 |  |      x = x.drop(['futre7dayreturn'], axis=1)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
 | 
											
												
													
														|  | 
 |  | +    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
 | 
											
												
													
														|  | 
 |  | +    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn'])/x['stage_four_retrn']
 | 
											
												
													
														|  | 
 |  | +    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn'])/x['stage_three_retrn']
 | 
											
												
													
														|  | 
 |  | +    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn'])/x['stage_two_retrn']
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  |      features = list(x)
 |  |      features = list(x)
 | 
											
												
													
														|  |      drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
 |  |      drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
 | 
											
												
													
														|  |      x = x.drop(drop_features, axis=1)
 |  |      x = x.drop(drop_features, axis=1)
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | 
 |  | +    x = x.fillna(0)
 | 
											
												
													
														|  | 
 |  | +    x = x.astype('float64')
 | 
											
												
													
														|  | 
 |  | +    #x.fillna(0)
 | 
											
												
													
														|  | 
 |  | +    x.clip(0,2000000)
 | 
											
												
													
														|  | 
 |  | + 
 | 
											
												
													
														|  |      features = [f for f in features if f not in drop_features]
 |  |      features = [f for f in features if f not in drop_features]
 | 
											
												
													
														|  |      return x, y , df_vids, features
 |  |      return x, y , df_vids, features
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | 
 |  | +def feature_selection(X, y):
 | 
											
												
													
														|  | 
 |  | +    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
 | 
											
												
													
														|  | 
 |  | +    return selector
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def auto_train(X_train, y_train):
 | 
											
												
													
														|  | 
 |  | +    from flaml import  AutoML
 | 
											
												
													
														|  | 
 |  | +    automl = AutoML()
 | 
											
												
													
														|  | 
 |  | +    automl_settings = {
 | 
											
												
													
														|  | 
 |  | +    "time_budget": 10,  # in seconds
 | 
											
												
													
														|  | 
 |  | +    "metric": 'r2',
 | 
											
												
													
														|  | 
 |  | +    "task": 'regression',
 | 
											
												
													
														|  | 
 |  | +    "log_file_name": "test/auto.log",
 | 
											
												
													
														|  | 
 |  | +    "estimator_list": ["lgbm"]
 | 
											
												
													
														|  | 
 |  | +    }
 | 
											
												
													
														|  | 
 |  | +    automl.fit(X_train=X_train, y_train=y_train,
 | 
											
												
													
														|  | 
 |  | +           **automl_settings) 
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    pred_test_y = automl.predict(X_train)
 | 
											
												
													
														|  | 
 |  | +    y_test = y_train.values
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
 | 
											
												
													
														|  | 
 |  | +    r2 = r2_score(y_test, pred_test_y)
 | 
											
												
													
														|  | 
 |  | +    #print('err_mape', err_mape)
 | 
											
												
													
														|  | 
 |  | +    print('r2', r2)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    pack_result(pred_test_y, y_test,[],'autoval.csv')
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  |  def train(x,y,features):
 |  |  def train(x,y,features):
 | 
											
												
													
														|  | -    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
 |  | 
 | 
											
												
													
														|  | 
 |  | +    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    '''
 | 
											
												
													
														|  | 
 |  | +    selector = feature_selection(X_train, y_train) 
 | 
											
												
													
														|  | 
 |  | +    X_train = selector.transform(X_train)
 | 
											
												
													
														|  | 
 |  | +    X_test = selector.transform(X_test)
 | 
											
												
													
														|  | 
 |  | +    selected_features = []
 | 
											
												
													
														|  | 
 |  | +    _supported = selector.get_support()
 | 
											
												
													
														|  | 
 |  | +    for i in range(0, len(_supported)):
 | 
											
												
													
														|  | 
 |  | +        if _supported[i]:
 | 
											
												
													
														|  | 
 |  | +            selected_features.append(features[i])
 | 
											
												
													
														|  | 
 |  | +    features = selected_features 
 | 
											
												
													
														|  | 
 |  | +    '''
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    print(len(X_train), len(X_test))
 | 
											
												
													
														|  |      params = {
 |  |      params = {
 | 
											
												
													
														|  |          "objective": "regression",
 |  |          "objective": "regression",
 | 
											
												
													
														|  | 
 |  | +        "reg_sqrt":True,
 | 
											
												
													
														|  |          "metric": "mape",
 |  |          "metric": "mape",
 | 
											
												
													
														|  | -        "max_depth": 5,
 |  | 
 | 
											
												
													
														|  | 
 |  | +        "max_depth": 6,
 | 
											
												
													
														|  |          "num_leaves": 30,
 |  |          "num_leaves": 30,
 | 
											
												
													
														|  | -        "learning_rate": 0.1,
 |  | 
 | 
											
												
													
														|  | 
 |  | +        "learning_rate": 0.05,
 | 
											
												
													
														|  |          "bagging_fraction": 0.7,
 |  |          "bagging_fraction": 0.7,
 | 
											
												
													
														|  |          "feature_fraction": 0.7,
 |  |          "feature_fraction": 0.7,
 | 
											
												
													
														|  | -        "bagging_freq": 5,
 |  | 
 | 
											
												
													
														|  | 
 |  | +        "bagging_freq": 8,
 | 
											
												
													
														|  |          "bagging_seed": 2018,
 |  |          "bagging_seed": 2018,
 | 
											
												
													
														|  |          "lambda_l1": 0.1,
 |  |          "lambda_l1": 0.1,
 | 
											
												
													
														|  |          "boosting": "gbdt",
 |  |          "boosting": "gbdt",
 | 
											
										
											
												
													
														|  | @@ -95,7 +154,7 @@ def train(x,y,features):
 | 
											
												
													
														|  |      lgtrain = lgb.Dataset(X_train, label=y_train)
 |  |      lgtrain = lgb.Dataset(X_train, label=y_train)
 | 
											
												
													
														|  |      lgval = lgb.Dataset(X_test, label=y_test)
 |  |      lgval = lgb.Dataset(X_test, label=y_test)
 | 
											
												
													
														|  |      evals_result = {}
 |  |      evals_result = {}
 | 
											
												
													
														|  | -    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
 |  | 
 | 
											
												
													
														|  | 
 |  | +    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=200, verbose_eval=20,
 | 
											
												
													
														|  |                        evals_result=evals_result)
 |  |                        evals_result=evals_result)
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |      pack_result(model.feature_importance(), features, [], 'importance.csv')
 |  |      pack_result(model.feature_importance(), features, [], 'importance.csv')
 | 
											
										
											
												
													
														|  | @@ -103,9 +162,9 @@ def train(x,y,features):
 | 
											
												
													
														|  |      pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
 |  |      pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
 | 
											
												
													
														|  |      y_test = y_test.values
 |  |      y_test = y_test.values
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
 |  | 
 | 
											
												
													
														|  | 
 |  | +    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
 | 
											
												
													
														|  |      r2 = r2_score(y_test, pred_test_y)
 |  |      r2 = r2_score(y_test, pred_test_y)
 | 
											
												
													
														|  | -    #print('err_mape', err_mape)
 |  | 
 | 
											
												
													
														|  | 
 |  | +    print('err_mape', err_mape)
 | 
											
												
													
														|  |      print('r2', r2)
 |  |      print('r2', r2)
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |      pack_result(pred_test_y, y_test,[],'val.csv')
 |  |      pack_result(pred_test_y, y_test,[],'val.csv')
 | 
											
										
											
												
													
														|  | @@ -125,13 +184,17 @@ def pack_result(y_, y, vid, fp):
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |      
 |  |      
 | 
											
												
													
														|  |  if __name__ == '__main__':
 |  |  if __name__ == '__main__':
 | 
											
												
													
														|  | -    with open(r"train_data.pickle", "rb") as input_file:
 |  | 
 | 
											
												
													
														|  | 
 |  | +    with open(r"train_data_x.pickle", "rb") as input_file:
 | 
											
												
													
														|  |          train_data = cPickle.load(input_file)    
 |  |          train_data = cPickle.load(input_file)    
 | 
											
												
													
														|  | -    with open(r"predict_data.pickle", "rb") as input_file:
 |  | 
 | 
											
												
													
														|  | 
 |  | +    with open(r"predict_data_x.pickle", "rb") as input_file:
 | 
											
												
													
														|  |          predict_data = cPickle.load(input_file)   
 |  |          predict_data = cPickle.load(input_file)   
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    #train
 |  | 
 | 
											
												
													
														|  |      x,y,_,features = clean_data(train_data)
 |  |      x,y,_,features = clean_data(train_data)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    #auto train
 | 
											
												
													
														|  | 
 |  | +    #auto_train(x,y)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    #train
 | 
											
												
													
														|  |      _, model, _ = train(x, y, features)
 |  |      _, model, _ = train(x, y, features)
 | 
											
												
													
														|  |      with open('model.pickle','wb') as output_file:
 |  |      with open('model.pickle','wb') as output_file:
 | 
											
												
													
														|  |          cPickle.dump(model, output_file)
 |  |          cPickle.dump(model, output_file)
 |