@@ -23,6 +23,7 @@ import datetime
 import process_feature
 import _pickle as cPickle
 
+
 def getRovfeaturetable(dt, table):
     odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                 endpoint='http://service.cn.maxcompute.aliyun.com/api', connect_timeout=3000, \
@@ -56,26 +57,39 @@ def getdatasample(date, max_range, table):
     data = data.drop(axis=1, columns='index')
     return data
 
+
 def clean_data(df):
+    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
     y = df['futre7dayreturn']
     df_vids = df['videoid']
     #drop string
-    x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'])
+    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
+    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
     #drop future
-    x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'])
-    return x, y , df_vids
+    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
+    x = x.drop(['futre7dayreturn'], axis=1)
+    features = list(x)
+    drop_features = [f for f in features if (f.find('day30') != -1 or f.find('day60') != -1)]
+    x = x.drop(drop_features, axis=1)
+
+    features = [f for f in features if f not in drop_features]
+    return x, y, df_vids, features
 
-def train(x,y):
+def train(x, y, features):
     X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
     params = {
         "objective": "regression",
-        "metric": "rmse",
+        "metric": "mape",
+        "max_depth": 5,
         "num_leaves": 30,
         "learning_rate": 0.1,
         "bagging_fraction": 0.7,
         "feature_fraction": 0.7,
-        "bagging_frequency": 5,
+        "bagging_freq": 5,
         "bagging_seed": 2018,
+        "lambda_l1": 0.1,
+        "boosting": "gbdt",
+        "nthread": 4,
         "verbosity": -1
     }
     lgtrain = lgb.Dataset(X_train, label=y_train)
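A note on the parameter fix in this hunk: bagging_freq is the key LightGBM actually documents (alias: subsample_freq), while "bagging_frequency" is not a recognized parameter, so before this change the 0.7 row subsample was never taking effect; LightGBM only bags when bagging_fraction < 1.0 and bagging_freq > 0 are both set. Below is a minimal self-contained sketch of the corrected configuration, with random data standing in for the real feature table:

import numpy as np
import lightgbm as lgb

# Corrected settings from the hunk above; the renamed "bagging_freq" key is
# what activates the 70% row subsample declared by "bagging_fraction".
params = {
    "objective": "regression",
    "metric": "mape",
    "max_depth": 5,
    "num_leaves": 30,
    "learning_rate": 0.1,
    "bagging_fraction": 0.7,   # sample 70% of rows...
    "bagging_freq": 5,         # ...re-drawn every 5 iterations
    "feature_fraction": 0.7,
    "bagging_seed": 2018,
    "lambda_l1": 0.1,
    "boosting": "gbdt",
    "nthread": 4,
    "verbosity": -1,
}

rng = np.random.RandomState(2018)
X, y = rng.rand(500, 20), rng.rand(500)   # stand-in for the ODPS feature table
booster = lgb.train(params, lgb.Dataset(X, label=y), num_boost_round=20)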
@@ -84,13 +98,31 @@ def train(x,y)
     model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=100, verbose_eval=20,
                       evals_result=evals_result)
 
+    pack_result(model.feature_importance(), features, [], 'importance.csv')
+
     pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
-    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
+    y_test = y_test.values
+
+    #err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
     r2 = r2_score(y_test, pred_test_y)
-    print('err_mape', err_mape)
+    #print('err_mape', err_mape)
     print('r2', r2)
 
+    pack_result(pred_test_y, y_test, [], 'val.csv')
+
     return pred_test_y, model, evals_result
+
+
+def pack_result(y_, y, vid, fp):
+    #y_ = y_.astype(int)
+    y_ = y_.reshape(len(y_), 1)
+    df = pd.DataFrame(data=y_, columns=['score'])
+    if len(vid) > 0:
+        df['vid'] = vid
+    df['y'] = y
+    df = df.sort_values(by=['score'], ascending=False)
+    df.to_csv(fp, index=False)
+
 
 if __name__ == '__main__':
     with open(r"train_data.pickle", "rb") as input_file:
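The new pack_result helper is reused for three different dumps: in importance.csv the 'y' column holds feature names next to their importance scores, while in val.csv and pred.csv it holds ground-truth targets. A small toy illustration of that reuse (the feature names and values below are invented for the example; the real calls pass model outputs):

import numpy as np
import pandas as pd

def pack_result(y_, y, vid, fp):
    # Mirrors the helper added above: a 'score' column, an optional 'vid'
    # column, and a 'y' column, sorted by score in descending order.
    y_ = np.asarray(y_).reshape(len(y_), 1)
    df = pd.DataFrame(data=y_, columns=['score'])
    if len(vid) > 0:
        df['vid'] = vid
    df['y'] = y
    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)

# importance.csv: scores are importance values, 'y' carries feature names.
pack_result(np.array([12, 3, 7]), ['viewcount', 'returncount', 'sharecount'], [], 'importance.csv')
# pred.csv: scores are predictions, 'y' the actuals, 'vid' the video ids.
pack_result(np.array([4.2, 1.1]), np.array([5.0, 1.0]), ['v001', 'v002'], 'pred.csv')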
@@ -99,5 +131,15 @@ if __name__ == '__main__':
         predict_data = cPickle.load(input_file)
 
     #train
-    x,y,_ = clean_data(train_data)
-    train(x, y)
+    x, y, _, features = clean_data(train_data)
+    _, model, _ = train(x, y, features)
+    with open('model.pickle', 'wb') as output_file:
+        cPickle.dump(model, output_file)
+    '''
+    with open(r"model.pickle", "rb") as input_file:
+        model = cPickle.load(input_file)
+    '''
+    x, y, vid, _ = clean_data(predict_data)
+    y_ = model.predict(x, num_iteration=model.best_iteration)
+
+    pack_result(y_, y, vid, 'pred.csv')
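Finally, on the dropped error metric: the sklearn-style mean_absolute_percentage_error call is commented out in favor of LightGBM's built-in "mape" validation metric, so only r2 is still printed by hand. For reference, a sketch of the quantity the removed helper computed; the zero-target guard is our own addition, not something the original code did:

import numpy as np

def mape(y_true, y_pred):
    # Mean absolute percentage error, skipping zero targets to avoid
    # division by zero (plausible here, since future returns can be zero).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

print(mape([100.0, 50.0, 10.0], [90.0, 60.0, 12.0]))   # ~0.1667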