@@ -0,0 +1,222 @@
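+"""Offline ROV model training.
+
+Helper functions pull daily feature partitions from ODPS; the __main__
+block trains a LightGBM multiclass model (8 return-count buckets) from
+pre-pickled feature frames and writes predictions to CSV.
+"""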
+import warnings
+
+warnings.filterwarnings("ignore")
+
+import os
+import datetime
+from datetime import datetime as dt
+
+import numpy as np
+import pandas as pd
+import lightgbm as lgb
+import _pickle as cPickle
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import top_k_accuracy_score
+from sklearn.feature_selection import SelectFromModel
+from sklearn.linear_model import LogisticRegression
+from odps import ODPS
+
+import process_feature
+
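+# Read one dt=<yyyymmdd> partition of an ODPS feature table into a pandas
+# DataFrame, keeping only the columns listed in process_feature.featurename.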
+def getRovfeaturetable(dt, table):
+    # ODPS credentials are read from the environment instead of being
+    # hard-coded; set ODPS_ACCESS_ID / ODPS_ACCESS_KEY before running.
+    odps = ODPS(os.environ['ODPS_ACCESS_ID'], os.environ['ODPS_ACCESS_KEY'], 'usercdm',
+                endpoint='http://service.cn.maxcompute.aliyun.com/api',
+                connect_timeout=3000, read_timeout=500000,
+                pool_maxsize=1000, pool_connections=1000)
+
+    featureArray = []
+    for record in odps.read_table(table, partition='dt=%s' % dt):
+        valueFeature = {}
+        for i in process_feature.featurename:
+            if i == 'dt':
+                valueFeature[i] = dt
+            else:
+                valueFeature[i] = record[i]
+        featureArray.append(valueFeature)
+    featureArray = pd.DataFrame(featureArray)
+    print(dt, table, 'feature table finish')
+    return featureArray
+
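+# Concatenate the feature partitions for the `max_range` days up to and
+# including `date` (format %Y%m%d) into one DataFrame, e.g. (hypothetical
+# table name) getdatasample('20220101', 7, 'rov_feature_table').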
+def getdatasample(date, max_range, table):
+    new_date = dt.strptime(date, '%Y%m%d')
+    datelist = []
+    testlist = []
+    for i in range(0, max_range):
+        delta = datetime.timedelta(days=i)
+        tar_dt = new_date - delta
+        datelist.append(tar_dt.strftime("%Y%m%d"))
+    for tm in datelist:
+        testlist.append(getRovfeaturetable(tm, table))
+    data = pd.concat(testlist)
+    data.reset_index(drop=True, inplace=True)
+    return data
+
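+# Bucket the raw 7-day return count into 8 ordinal classes:
+#   0: <=1000, 1: (1000, 5000], 2: (5000, 10000], 3: (10000, 50000],
+#   4: (50000, 100000], 5: (100000, 500000], 6: (500000, 1000000], 7: >1000000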
+def discrete_y(y):
+    y = float(y)
+    if y > 1000000:
+        return 7
+    elif y > 500000:
+        return 6
+    elif y > 100000:
+        return 5
+    elif y > 50000:
+        return 4
+    elif y > 10000:
+        return 3
+    elif y > 5000:
+        return 2
+    elif y > 1000:
+        return 1
+    else:
+        return 0
+
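+# Build the design matrix: bucket the label with discrete_y, drop id/text
+# columns and the label itself, add per-stage return deltas and ratios, and
+# drop all day30/day60 features.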
+def clean_data(df):
+    df['futre7dayreturn'] = df['futre7dayreturn'].apply(discrete_y)
+    y = df['futre7dayreturn']
+    print(y)
+    df_vids = df['videoid']
+    # drop string columns and the label column
+    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
+    x = x.drop(['futre7dayreturn'], axis=1)
+
+    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
+    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
+    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
+
+    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn']) / x['stage_four_retrn']
+    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn']) / x['stage_three_retrn']
+    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn']) / x['stage_two_retrn']
+
+    features = list(x)
+    drop_features = [f for f in features if 'day30' in f or 'day60' in f]
+    x = x.drop(drop_features, axis=1)
+    # the ratio features divide by possibly-zero counts, so replace the
+    # resulting inf values along with NaN before casting and clipping
+    x = x.replace([np.inf, -np.inf], np.nan)
+    x = x.fillna(0)
+    x = x.astype('float64')
+    x = x.clip(0, 2000000)
+
+    features = [f for f in features if f not in drop_features]
+    return x, y, df_vids, features
+
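+# Model-based feature selection via SelectFromModel; only used by the
+# commented-out block in train().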
+def feature_selection(X, y):
+    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
+    return selector
+
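+# Optional FLAML baseline: fit an AutoML lgbm regressor under a small time
+# budget and report in-sample r2 (disabled in __main__).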
+def auto_train(X_train, y_train):
+    from flaml import AutoML
+    from sklearn.metrics import r2_score
+
+    automl = AutoML()
+    automl_settings = {
+        "time_budget": 10,  # in seconds
+        "metric": 'r2',
+        "task": 'regression',
+        "log_file_name": "test/auto.log",
+        "estimator_list": ["lgbm"]
+    }
+    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
+
+    # in-sample evaluation: predictions are made on the training data
+    pred_train_y = automl.predict(X_train)
+    r2 = r2_score(y_train.values, pred_train_y)
+    print('r2', r2)
+
+    pack_result(pred_train_y, y_train.values, [], 'autoval.csv')
+
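+# Train the LightGBM multiclass model (8 return buckets) on a stratified
+# 67/33 split; write feature importances and validation predictions to CSV.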
+def train(x, y, features):
+    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, stratify=y)
+
+    '''
+    selector = feature_selection(X_train, y_train)
+    X_train = selector.transform(X_train)
+    X_test = selector.transform(X_test)
+    selected_features = []
+    _supported = selector.get_support()
+    for i in range(0, len(_supported)):
+        if _supported[i]:
+            selected_features.append(features[i])
+    features = selected_features
+    '''
+
+    print(len(X_train), len(X_test))
+    params = {
+        "objective": "multiclass",
+        "num_classes": 8,
+        "max_depth": 6,
+        "num_leaves": 30,
+        "learning_rate": 0.05,
+        "bagging_fraction": 0.7,
+        "feature_fraction": 0.7,
+        "bagging_freq": 8,
+        "bagging_seed": 2018,
+        "lambda_l1": 0.1,
+        "boosting": "gbdt",
+        "nthread": 4,
+        "verbosity": -1
+    }
+    lgtrain = lgb.Dataset(X_train, label=y_train)
+    lgval = lgb.Dataset(X_test, label=y_test)
+    evals_result = {}
+    # the early_stopping_rounds/verbose_eval keyword form assumes lightgbm < 4;
+    # lightgbm >= 4 expects lgb.early_stopping / lgb.log_evaluation callbacks
+    model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval],
+                      early_stopping_rounds=200, verbose_eval=20,
+                      evals_result=evals_result)
+
+    pack_result(model.feature_importance(), features, [], 'importance.csv')
+
+    # predict() returns an (n_samples, 8) matrix of class probabilities
+    pred_test_y = model.predict(X_test, num_iteration=model.best_iteration)
+
+    topk_acc = top_k_accuracy_score(y_test, pred_test_y, k=1)
+    print('top_k_accuracy_score', topk_acc)
+
+    pack_result(pred_test_y, y_test, [], 'val.csv')
+
+    return pred_test_y, model, evals_result
+
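+# Dump predictions to CSV sorted by score, optionally tagging rows with
+# their video ids.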
+def pack_result(y_, y, vid, fp):
+    y_ = np.asarray(y_)
+    if y_.ndim > 1:
+        # reduce a multiclass probability matrix to the argmax class id
+        y_ = y_.argmax(axis=1)
+    df = pd.DataFrame(data=y_, columns=['score'])
+    if len(vid) > 0:
+        df['vid'] = vid
+    # use raw values so a pandas Series with a shuffled index (e.g. the
+    # test split) aligns positionally instead of by label
+    df['y'] = np.asarray(y)
+    df = df.sort_values(by=['score'], ascending=False)
+    df.to_csv(fp, index=False)
+
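+# Entry point: load pre-pickled train/predict frames, train, persist the
+# model, and score the prediction set.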
+if __name__ == '__main__':
+    with open(r"train_data_x.pickle", "rb") as input_file:
+        train_data = cPickle.load(input_file)
+    with open(r"predict_data_x.pickle", "rb") as input_file:
+        predict_data = cPickle.load(input_file)
+
+    x, y, _, features = clean_data(train_data)
+
+    # auto train
+    #auto_train(x, y)
+
+    # train
+    _, model, _ = train(x, y, features)
+    with open('model.pickle', 'wb') as output_file:
+        cPickle.dump(model, output_file)
+    '''
+    with open(r"model.pickle", "rb") as input_file:
+        model = cPickle.load(input_file)
+    '''
+    x, y, vid, _ = clean_data(predict_data)
+    y_ = model.predict(x, num_iteration=model.best_iteration)
+
+    pack_result(y_, y, vid, 'pred.csv')