|
@@ -10,10 +10,10 @@ import time
|
|
import lightgbm as lgb
|
|
import lightgbm as lgb
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.model_selection import StratifiedKFold
|
|
from sklearn.model_selection import StratifiedKFold
|
|
-from sklearn.metrics import mean_absolute_percentage_error, r2_score
|
|
|
|
|
|
+from sklearn.metrics import r2_score
|
|
from sklearn import metrics
|
|
from sklearn import metrics
|
|
import pickle
|
|
import pickle
|
|
-from sklearn.metrics import mean_squared_error
|
|
|
|
|
|
+from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
|
|
import seaborn as sns
|
|
import seaborn as sns
|
|
import matplotlib.pylab as plt
|
|
import matplotlib.pylab as plt
|
|
from odps import ODPS
|
|
from odps import ODPS
|
|
@@ -22,7 +22,8 @@ from datetime import datetime as dt
|
|
import datetime
|
|
import datetime
|
|
import process_feature
|
|
import process_feature
|
|
import _pickle as cPickle
|
|
import _pickle as cPickle
|
|
-
|
|
|
|
|
|
+from sklearn.feature_selection import SelectFromModel
|
|
|
|
+from sklearn.linear_model import LogisticRegression
|
|
|
|
|
|
def getRovfeaturetable(dt, table):
|
|
def getRovfeaturetable(dt, table):
|
|
odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
|
|
odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
|
|
@@ -60,6 +61,7 @@ def getdatasample(date, max_range, table):
|
|
|
|
|
|
def clean_data(df):
|
|
def clean_data(df):
|
|
#y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
|
|
#y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
|
|
|
|
+ df['futre7dayreturn'].loc[df['futre7dayreturn']<=0] = 1
|
|
y = df['futre7dayreturn']
|
|
y = df['futre7dayreturn']
|
|
df_vids = df['videoid']
|
|
df_vids = df['videoid']
|
|
#drop string
|
|
#drop string
|
|
@@ -68,24 +70,81 @@ def clean_data(df):
|
|
#drop future
|
|
#drop future
|
|
#x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
|
|
#x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
|
|
x = x.drop(['futre7dayreturn'], axis=1)
|
|
x = x.drop(['futre7dayreturn'], axis=1)
|
|
|
|
+
|
|
|
|
+ x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
|
|
|
|
+ x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
|
|
|
|
+ x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
|
|
|
|
+
|
|
|
|
+ x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn'])/x['stage_four_retrn']
|
|
|
|
+ x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn'])/x['stage_three_retrn']
|
|
|
|
+ x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn'])/x['stage_two_retrn']
|
|
|
|
+
|
|
features = list(x)
|
|
features = list(x)
|
|
drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
|
|
drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
|
|
x = x.drop(drop_features, axis=1)
|
|
x = x.drop(drop_features, axis=1)
|
|
-
|
|
|
|
|
|
+ x = x.fillna(0)
|
|
|
|
+ x = x.astype('float64')
|
|
|
|
+ #x.fillna(0)
|
|
|
|
+ x.clip(0,2000000)
|
|
|
|
+
|
|
features = [f for f in features if f not in drop_features]
|
|
features = [f for f in features if f not in drop_features]
|
|
return x, y , df_vids, features
|
|
return x, y , df_vids, features
|
|
|
|
|
|
|
|
def feature_selection(X, y, estimator=None):
    """Fit a model-based feature selector on the given training data.

    Args:
        X: Feature matrix the selector is fitted on.
        y: Target values aligned with the rows of ``X``.
        estimator: Optional unfitted estimator handed to ``SelectFromModel``.
            When omitted, ``LogisticRegression()`` is used, which is what this
            function always used before the parameter existed.

    Returns:
        The fitted ``SelectFromModel`` instance.
    """
    if estimator is None:
        # NOTE(review): LogisticRegression is a classifier, while train() and
        # auto_train() in this file treat the same target as a regression
        # problem. Pass a regressor here if the target is continuous --
        # confirm against the callers before re-enabling this step.
        estimator = LogisticRegression()
    selector = SelectFromModel(estimator=estimator)
    selector.fit(X, y)
    return selector
|
|
|
|
+
|
|
|
|
def auto_train(X_train, y_train, time_budget=10):
    """Run a FLAML AutoML search limited to LightGBM and report its fit.

    The search optimises r2 on a regression task and logs to
    ``test/auto.log``. After fitting, the model predicts on the same rows it
    was trained on, so the printed r2 is an in-sample score, not a validation
    score. Predictions and targets are then handed to ``pack_result`` together
    with the file name ``'autoval.csv'``.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets; a pandas Series or any array-like.
        time_budget: Search budget in seconds passed to FLAML. Defaults to
            10, the value that used to be hard-coded.

    Returns:
        The fitted ``AutoML`` object, so callers can reuse or persist the
        model that was found (previously it was discarded).
    """
    # Imported lazily so the rest of the module works without flaml installed.
    from flaml import AutoML

    automl = AutoML()
    automl_settings = {
        "time_budget": time_budget,  # in seconds
        "metric": 'r2',
        "task": 'regression',
        "log_file_name": "test/auto.log",
        "estimator_list": ["lgbm"],
    }
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)

    # Scored on the training rows themselves: there is no hold-out set here.
    pred_train_y = automl.predict(X_train)
    # Series expose .values; plain arrays are used as they are.
    y_true = getattr(y_train, "values", y_train)

    r2 = r2_score(y_true, pred_train_y)
    print('r2', r2)

    pack_result(pred_train_y, y_true, [], 'autoval.csv')
    return automl
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
def train(x,y,features):
|
|
def train(x,y,features):
|
|
- X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
|
|
|
|
|
|
+ X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
|
|
|
|
+
|
|
|
|
+ '''
|
|
|
|
+ selector = feature_selection(X_train, y_train)
|
|
|
|
+ X_train = selector.transform(X_train)
|
|
|
|
+ X_test = selector.transform(X_test)
|
|
|
|
+ selected_features = []
|
|
|
|
+ _supported = selector.get_support()
|
|
|
|
+ for i in range(0, len(_supported)):
|
|
|
|
+ if _supported[i]:
|
|
|
|
+ selected_features.append(features[i])
|
|
|
|
+ features = selected_features
|
|
|
|
+ '''
|
|
|
|
+
|
|
|
|
+ print(len(X_train), len(X_test))
|
|
params = {
|
|
params = {
|
|
"objective": "regression",
|
|
"objective": "regression",
|
|
|
|
+ "reg_sqrt":True,
|
|
"metric": "mape",
|
|
"metric": "mape",
|
|
- "max_depth": 5,
|
|
|
|
|
|
+ "max_depth": 6,
|
|
"num_leaves": 30,
|
|
"num_leaves": 30,
|
|
- "learning_rate": 0.1,
|
|
|
|
|
|
+ "learning_rate": 0.05,
|
|
"bagging_fraction": 0.7,
|
|
"bagging_fraction": 0.7,
|
|
"feature_fraction": 0.7,
|
|
"feature_fraction": 0.7,
|
|
- "bagging_freq": 5,
|
|
|
|
|
|
+ "bagging_freq": 8,
|
|
"bagging_seed": 2018,
|
|
"bagging_seed": 2018,
|
|
"lambda_l1": 0.1,
|
|
"lambda_l1": 0.1,
|
|
"boosting": "gbdt",
|
|
"boosting": "gbdt",
|
|
@@ -95,7 +154,7 @@ def train(x,y,features):
|
|
lgtrain = lgb.Dataset(X_train, label=y_train)
|
|
lgtrain = lgb.Dataset(X_train, label=y_train)
|
|
lgval = lgb.Dataset(X_test, label=y_test)
|
|
lgval = lgb.Dataset(X_test, label=y_test)
|
|
evals_result = {}
|
|
evals_result = {}
|
|
- model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
|
|
|
|
|
|
+ model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=200, verbose_eval=20,
|
|
evals_result=evals_result)
|
|
evals_result=evals_result)
|
|
|
|
|
|
pack_result(model.feature_importance(), features, [], 'importance.csv')
|
|
pack_result(model.feature_importance(), features, [], 'importance.csv')
|
|
@@ -103,9 +162,9 @@ def train(x,y,features):
|
|
pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
|
|
pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
|
|
y_test = y_test.values
|
|
y_test = y_test.values
|
|
|
|
|
|
- #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
|
|
|
|
|
|
+ err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
|
|
r2 = r2_score(y_test, pred_test_y)
|
|
r2 = r2_score(y_test, pred_test_y)
|
|
- #print('err_mape', err_mape)
|
|
|
|
|
|
+ print('err_mape', err_mape)
|
|
print('r2', r2)
|
|
print('r2', r2)
|
|
|
|
|
|
pack_result(pred_test_y, y_test,[],'val.csv')
|
|
pack_result(pred_test_y, y_test,[],'val.csv')
|
|
@@ -125,13 +184,17 @@ def pack_result(y_, y, vid, fp):
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|
|
- with open(r"train_data.pickle", "rb") as input_file:
|
|
|
|
|
|
+ with open(r"train_data_x.pickle", "rb") as input_file:
|
|
train_data = cPickle.load(input_file)
|
|
train_data = cPickle.load(input_file)
|
|
- with open(r"predict_data.pickle", "rb") as input_file:
|
|
|
|
|
|
+ with open(r"predict_data_x.pickle", "rb") as input_file:
|
|
predict_data = cPickle.load(input_file)
|
|
predict_data = cPickle.load(input_file)
|
|
|
|
|
|
- #train
|
|
|
|
x,y,_,features = clean_data(train_data)
|
|
x,y,_,features = clean_data(train_data)
|
|
|
|
+
|
|
|
|
+ #auto train
|
|
|
|
+ #auto_train(x,y)
|
|
|
|
+
|
|
|
|
+ #train
|
|
_, model, _ = train(x, y, features)
|
|
_, model, _ = train(x, y, features)
|
|
with open('model.pickle','wb') as output_file:
|
|
with open('model.pickle','wb') as output_file:
|
|
cPickle.dump(model, output_file)
|
|
cPickle.dump(model, output_file)
|