import warnings
warnings.filterwarnings("ignore")

import datetime
from datetime import datetime as dt
import _pickle as cPickle

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from odps import ODPS

import process_feature


def getRovfeaturetable(dt, table):
    """Read one dt partition of an ODPS feature table into a DataFrame."""
    odps = ODPS('LTAI4FtW5ZzxMvdw35aNkmcp', '0VKnydcaHK3ITjylbgUsLubX6rwiwc', 'usercdm',
                endpoint='http://service.cn.maxcompute.aliyun.com/api',
                connect_timeout=3000, read_timeout=500000,
                pool_maxsize=1000, pool_connections=1000)

    featureArray = []
    for record in odps.read_table(table, partition='dt=%s' % dt):
        valueFeature = {}
        for i in process_feature.featurename:
            if i == 'dt':
                valueFeature[i] = dt
            else:
                valueFeature[i] = record[i]
        featureArray.append(valueFeature)
    featureArray = pd.DataFrame(featureArray)
    print(dt, table, 'feature table finish')
    return featureArray


def getdatasample(date, max_range, table):
    """Concatenate the feature partitions for the max_range days ending at date."""
    new_date = dt.strptime(date, '%Y%m%d')
    datelist = []
    testlist = []
    for i in range(0, max_range):
        delta = datetime.timedelta(days=i)
        tar_dt = new_date - delta
        datelist.append(tar_dt.strftime('%Y%m%d'))
    for tm in datelist:
        testlist.append(getRovfeaturetable(tm, table))
    data = pd.concat(testlist)
    data.reset_index(inplace=True)
    data = data.drop(axis=1, columns='index')
    return data


def clean_data(df):
    """Split the raw table into features x, target y, video ids, and feature names."""
    # y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
    # Clamp non-positive targets to 1 so MAPE and the ratio features stay defined
    # (the original chained .loc assignment only hit a copy on recent pandas).
    df.loc[df['futre7dayreturn'] <= 0, 'futre7dayreturn'] = 1
    y = df['futre7dayreturn']
    df_vids = df['videoid']
    # drop string columns
    # x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr',
    #              'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
    # drop future columns
    # x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
    x = x.drop(['futre7dayreturn'], axis=1)

    # Stage-over-stage return deltas and ratios (zero denominators produce inf,
    # which the clip below caps).
    x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
    x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
    x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
    x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn']) / x['stage_four_retrn']
    x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn']) / x['stage_three_retrn']
    x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn']) / x['stage_two_retrn']

    features = list(x)
    drop_features = [f for f in features if f.find('day30') != -1 or f.find('day60') != -1]
    x = x.drop(drop_features, axis=1)
    x = x.fillna(0)  # fillna returns a copy; the original discarded the result
    x = x.astype('float64')
    x = x.clip(1, 2000000)
    # features = [f for f in features if f not in drop_features]
    features = list(x)
    return x, y, df_vids, features


def std_data(df, features):
    """Min-max scale columns whose maximum exceeds 1.

    The original denominator was (df[f] - df[f].max() + 1), an elementwise
    expression that cannot be the intended column range; this scales by the
    range instead.
    """
    for f in features:
        if df[f].max() > 1:
            df[f] = (df[f] - df[f].min()) / (df[f].max() - df[f].min() + 1)
    return df


def feature_selection(X, y):
    # NOTE: LogisticRegression expects discrete class labels and will raise on
    # the continuous return target; see the regression-based sketch further down.
    selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
    return selector


def auto_train(X_train, y_train):
    from flaml import AutoML
    automl = AutoML()
    automl_settings = {
        "time_budget": 8000,  # in seconds
        "metric": 'mae',
        "task": 'regression',
        "log_file_name": "auto.log",
        "estimator_list": ["lgbm"]
    }
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    # In-sample evaluation: predictions are scored on the training data itself.
    pred_test_y = automl.predict(X_train)
    y_test = y_train.values
    err_mape = mean_absolute_percentage_error(y_test, pred_test_y)
    r2 = r2_score(y_test, pred_test_y)
    print('err_mape', err_mape)
    print('r2', r2)
    pack_result(pred_test_y, y_test, [], 'autoval.csv')
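

# Hedged sketch (not part of the original pipeline): auto_train() above scores
# the AutoML model on its own training data, which tends to overstate accuracy.
# An out-of-sample evaluation under the same FLAML settings could look like
# this; auto_train_holdout is a hypothetical helper, not called from __main__.
def auto_train_holdout(x, y, time_budget=600):
    from flaml import AutoML
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)
    automl = AutoML()
    automl.fit(X_train=X_train, y_train=y_train, task='regression',
               metric='mae', estimator_list=['lgbm'], time_budget=time_budget)
    pred = automl.predict(X_test)
    print('holdout err_mape', mean_absolute_percentage_error(y_test, pred))
    print('holdout r2', r2_score(y_test, pred))
    return automl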
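

# Hedged sketch: since the target is continuous, SelectFromModel can be paired
# with a regressor that exposes feature_importances_ instead of
# LogisticRegression. select_features_regression is a hypothetical alternative
# to feature_selection(), shown for illustration only.
def select_features_regression(X, y):
    from lightgbm import LGBMRegressor
    selector = SelectFromModel(estimator=LGBMRegressor(n_estimators=100)).fit(X, y)
    return selector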
"task": 'regression', "log_file_name": "auto.log", "estimator_list": ["lgbm"] } automl.fit(X_train=X_train, y_train=y_train, **automl_settings) pred_test_y = automl.predict(X_train) y_test = y_train.values err_mape = mean_absolute_percentage_error(y_test, pred_test_y) r2 = r2_score(y_test, pred_test_y) print('err_mape', err_mape) print('r2', r2) pack_result(pred_test_y, y_test,[],'autoval.csv') def train(x,y,features): X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33) ''' selector = feature_selection(X_train, y_train) X_train = selector.transform(X_train) X_test = selector.transform(X_test) selected_features = [] _supported = selector.get_support() for i in range(0, len(_supported)): if _supported[i]: selected_features.append(features[i]) features = selected_features ''' print(len(X_train), len(X_test)) params = { "objective": "regression", "reg_sqrt":True, "metric": "mape", "max_depth": -1, "num_leaves": 50, "learning_rate": 0.1, "bagging_fraction": 0.7, "feature_fraction": 0.7, "bagging_freq": 8, "bagging_seed": 2018, "lambda_l1": 0.11, "boosting": "gbdt", "nthread": 4, "verbosity": -1 } lgtrain = lgb.Dataset(X_train, label=y_train) lgval = lgb.Dataset(X_test, label=y_test) evals_result = {} model = lgb.train(params, lgtrain, 10000, valid_sets=[lgval], early_stopping_rounds=200, verbose_eval=20, evals_result=evals_result) pack_result(model.feature_importance(), features, [], 'importance.csv') pred_test_y = model.predict(X_test, num_iteration=model.best_iteration) y_test = y_test.values err_mape = mean_absolute_percentage_error(y_test, pred_test_y) r2 = r2_score(y_test, pred_test_y) print('err_mape', err_mape) print('r2', r2) pack_result(pred_test_y, y_test,[],'val.csv') return pred_test_y, model, evals_result def pack_result(y_, y, vid, fp): #y_ = y_.astype(int) y_.reshape(len(y_),1) df = pd.DataFrame(data=y_, columns=['score']) if len(vid) >0: df['vid'] = vid df['y'] = y df = df.sort_values(by=['score'], ascending=False) df.to_csv(fp, index=False) if __name__ == '__main__': with open(r"train_data_x.pickle", "rb") as input_file: train_data = cPickle.load(input_file) with open(r"predict_data_x.pickle", "rb") as input_file: predict_data = cPickle.load(input_file) x,y,_,features = clean_data(train_data) #x = std_data(x, features) #print(x.describe()) #auto train auto_train(x,y)