import _pickle as cPickle

import pandas as pd

# Name of the target column (spelling matches the column name in the data).
_TARGET_COLUMN = 'futre7dayreturn'

# Identifier / text / date columns removed before building the feature matrix.
_NON_FEATURE_COLUMNS = ['videoid', 'videotags', 'words_without_tags', 'dt']

# (later stage, earlier stage) pairs used for the derived delta/ratio features,
# listed in the order the derived columns are appended.
_STAGE_PAIRS = [
    ('stage_four', 'stage_three'),
    ('stage_three', 'stage_two'),
    ('stage_two', 'stage_one'),
]

# Every feature value is clipped into this closed interval.
_CLIP_LOWER = 0
_CLIP_UPPER = 2000000


def clean_data(df: pd.DataFrame):
    """Split a raw frame into a numeric feature matrix, target and video ids.

    Steps, in order:

    1. Target values ``<= 0`` in ``futre7dayreturn`` are replaced by ``1``.
       This is written back into ``df`` itself (the caller's frame is
       modified in place). NaN targets are left untouched.
    2. ``videoid``, ``videotags``, ``words_without_tags``, ``dt`` and the
       target column are dropped from the features.
    3. For each consecutive stage pair, ``<stage>_retrn_added`` (later minus
       earlier) and ``<stage>_retrn_ratio`` (that difference divided by the
       later stage) are appended.
    4. Every column whose name contains ``day30`` or ``day60`` is dropped.
    5. NaN is filled with 0, the frame is cast to ``float64`` and clipped to
       ``[0, 2000000]``.

    Args:
        df: Frame containing the target column, the four dropped non-feature
            columns and ``stage_one_retrn`` .. ``stage_four_retrn``. All
            remaining columns must be convertible to ``float64``.

    Returns:
        A 4-tuple ``(x, y, df_vids, features)``: the cleaned feature frame,
        the target series, the ``videoid`` series and the list of feature
        column names (same order as ``x.columns``).

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a remaining column cannot be cast to ``float64``.
    """
    # Single .loc call instead of the chained ``df[col].loc[mask] = 1``:
    # the chained form is unreliable (SettingWithCopyWarning on older pandas,
    # a silent no-op under Copy-on-Write).
    df.loc[df[_TARGET_COLUMN] <= 0, _TARGET_COLUMN] = 1
    y = df[_TARGET_COLUMN]
    df_vids = df['videoid']

    x = df.drop(_NON_FEATURE_COLUMNS, axis=1)
    x = x.drop([_TARGET_COLUMN], axis=1)

    # All "added" columns first, then all "ratio" columns, so the resulting
    # column order is stable for downstream consumers.
    for later, earlier in _STAGE_PAIRS:
        x[later + '_retrn_added'] = x[later + '_retrn'] - x[earlier + '_retrn']
    for later, earlier in _STAGE_PAIRS:
        # A zero denominator yields NaN (0/0) or +/-inf here; NaN becomes 0
        # via fillna below, and +/-inf is bounded by the final clip.
        x[later + '_retrn_ratio'] = (
            (x[later + '_retrn'] - x[earlier + '_retrn']) / x[later + '_retrn']
        )

    drop_features = [
        name for name in x.columns if 'day30' in name or 'day60' in name
    ]
    x = x.drop(drop_features, axis=1)

    x = x.fillna(0)
    x = x.astype('float64')
    x = x.clip(_CLIP_LOWER, _CLIP_UPPER)

    features = list(x.columns)
    return x, y, df_vids, features


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(security): unpickling can execute arbitrary code. Only load files
    # that come from a trusted source.
    with open(path, "rb") as input_file:
        return cPickle.load(input_file)


def main():
    """Clean the train and predict pickles and print feature summaries."""
    train_data = _load_pickle(r"train_data_x.pickle")
    predict_data = _load_pickle(r"predict_data_x.pickle")

    train_x, _, _, _ = clean_data(train_data)
    predict_x, _, _, _ = clean_data(predict_data)

    print(train_x.describe())
    print(predict_x.describe())


if __name__ == '__main__':
    main()