"""Clean feature frames, fit a model on them and write ranked predictions to CSV."""

import pickle

import numpy as np
import pandas as pd
from paddle.io import Dataset

# Column names are reproduced exactly as they appear in the data
# (including the 'futre' spelling); do not "correct" them here.
_TARGET_COLUMN = 'futre7dayreturn'
_ID_COLUMN = 'videoid'
# Identifier, text and date columns that are never fed to the model.
_NON_FEATURE_COLUMNS = ['videoid', 'videotags', 'words_without_tags', 'dt']
# Any feature whose name contains one of these markers is dropped.
_EXCLUDED_NAME_MARKERS = ('day30', 'day60')


def clean_data(df):
    """Split a raw frame into model inputs, target, video ids and feature names.

    Args:
        df: pandas DataFrame containing at least the columns 'videoid',
            'videotags', 'words_without_tags', 'dt' and 'futre7dayreturn'.

    Returns:
        A tuple ``(x, y, df_vids, features)`` where ``x`` is ``df`` without
        the non-feature columns, the target column and every column whose
        name contains 'day30' or 'day60'; ``y`` is the 'futre7dayreturn'
        column; ``df_vids`` is the 'videoid' column; ``features`` is the list
        of column names remaining in ``x``, in column order.

    Raises:
        KeyError: if one of the required columns is missing from ``df``.
    """
    # NOTE: an earlier variant applied log(y + 1) to the target; it is
    # disabled and the raw value is used.
    y = df[_TARGET_COLUMN]
    df_vids = df[_ID_COLUMN]

    x = df.drop(_NON_FEATURE_COLUMNS + [_TARGET_COLUMN], axis=1)

    excluded = [
        name for name in x.columns
        if any(marker in name for marker in _EXCLUDED_NAME_MARKERS)
    ]
    x = x.drop(excluded, axis=1)

    # The surviving columns are exactly the feature list.
    features = list(x.columns)
    return x, y, df_vids, features


def pack_result(y_, y, vid, fp):
    """Write predictions to ``fp`` as CSV, sorted by descending score.

    Args:
        y_: predicted scores, one per row (1-D, or 2-D with a single column).
        y: true target values, same length as ``y_``; only written when
            ``vid`` is non-empty.
        vid: video ids, same length as ``y_``. When ``None`` or empty, only
            the 'score' column is written.
        fp: path or file object accepted by ``DataFrame.to_csv``.

    Raises:
        ValueError: if ``y_`` cannot be viewed as a single column, or if
            ``vid`` / ``y`` do not have one entry per score.
    """
    # ndarray.reshape returns a new array, so the result must be kept.
    scores = np.asarray(y_).reshape(len(y_), 1)
    df = pd.DataFrame(data=scores, columns=['score'])

    if vid is not None and len(vid) > 0:
        # Assign by position: a pandas Series would be aligned on its index
        # labels, which yields NaN/misplaced rows whenever the input frame
        # does not carry a default 0..n-1 index.
        df['vid'] = np.asarray(vid)
        df['y'] = np.asarray(y)

    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)


class RovDataset(Dataset):
    """Dataset type that also exposes the module's helper functions.

    NOTE(review): no ``__getitem__`` / ``__len__`` is defined in this file,
    so the class is not usable as a data source as-is — confirm whether the
    implementation lives elsewhere.
    """

    # Static aliases so the helpers can be reached either as module-level
    # functions or through the class.
    clean_data = staticmethod(clean_data)
    pack_result = staticmethod(pack_result)


def main():
    """Train on ``train_data.pickle``, save the model, score ``predict_data.pickle``."""
    # SECURITY: unpickling executes arbitrary code embedded in the file.
    # Only load pickles that were produced by a trusted source.
    with open(r"train_data.pickle", "rb") as input_file:
        train_data = pickle.load(input_file)
    with open(r"predict_data.pickle", "rb") as input_file:
        predict_data = pickle.load(input_file)

    # Train.
    x, y, _, features = clean_data(train_data)
    # NOTE(review): `train` is not defined or imported in this file; it must
    # be provided elsewhere — confirm its origin.
    _, model, _ = train(x, y, features)  # noqa: F821
    with open('model.pickle', 'wb') as output_file:
        pickle.dump(model, output_file)

    # To reuse a previously saved model instead of retraining:
    #     with open(r"model.pickle", "rb") as input_file:
    #         model = pickle.load(input_file)

    # Predict and export.
    x, y, vid, _ = clean_data(predict_data)
    y_ = model.predict(x, num_iteration=model.best_iteration)
    pack_result(y_, y, vid, 'pred.csv')


if __name__ == '__main__':
    main()