import pandas as pd
import numpy as np
from paddle.io import Dataset

class RovDataset(Dataset):
    """Placeholder subclass of ``paddle.io.Dataset`` with no members of its own.

    The docstring serves as the class body so that the module parses; a class
    header followed directly by a top-level ``def`` is a syntax error.
    """
    # TODO(review): a usable dataset presumably needs ``__getitem__`` and
    # ``__len__`` -- confirm the intended behaviour before relying on this.

def clean_data(df):
    """Split a raw frame into feature matrix, target, video ids and names.

    Args:
        df: DataFrame that contains at least the columns ``videoid``,
            ``videotags``, ``words_without_tags``, ``dt`` and
            ``futre7dayreturn``. It is not modified.

    Returns:
        A 4-tuple ``(x, y, df_vids, features)``:
        * ``x`` -- ``df`` without the five columns listed above and without
          any column whose name contains ``day30`` or ``day60``.
        * ``y`` -- the ``futre7dayreturn`` column.
        * ``df_vids`` -- the ``videoid`` column.
        * ``features`` -- list of the column names kept in ``x``.

    Raises:
        KeyError: if any of the required columns is missing.
    """
    target = df['futre7dayreturn']
    video_ids = df['videoid']

    # Identifier / text / date columns and the target itself are not features.
    non_feature_columns = [
        'videoid', 'videotags', 'words_without_tags', 'dt',
        'futre7dayreturn',
    ]
    candidates = df.drop(columns=non_feature_columns)

    # Partition the remaining columns by whether their name mentions one of
    # the excluded window tags.
    excluded_tags = ('day30', 'day60')
    kept_names = []
    excluded_names = []
    for name in candidates.columns:
        if any(name.find(tag) != -1 for tag in excluded_tags):
            excluded_names.append(name)
        else:
            kept_names.append(name)

    feature_frame = candidates.drop(columns=excluded_names)
    return feature_frame, target, video_ids, kept_names


def _strip_index(values):
    """Return a Series' raw values so column assignment is positional."""
    return values.to_numpy() if isinstance(values, pd.Series) else values


def pack_result(y_, y, vid, fp):
    """Write scores with their labels (and ids) to CSV, best score first.

    Args:
        y_: Scores, one per row; a 1-D sequence/array or an ``(n, 1)`` array.
        y: Values stored in the ``y`` column, in the same row order as ``y_``.
        vid: Ids stored in the ``vid`` column, in the same row order as
            ``y_``. The column is omitted when ``vid`` is empty.
        fp: Path or writable text buffer handed to ``DataFrame.to_csv``.

    Raises:
        ValueError: if ``y_`` cannot be viewed as a single column.
    """
    # ndarray.reshape returns a new array, so the result must be kept; the
    # bare call in the earlier version had no effect.
    scores = np.asarray(y_).reshape(len(y_), 1)
    df = pd.DataFrame(data=scores, columns=['score'])

    # Assigning a Series aligns on index labels. ``df`` has a fresh 0..n-1
    # index, so a Series carrying any other index would produce NaN or
    # misaligned rows. Strip the index to pair rows by position instead.
    if len(vid) > 0:
        df['vid'] = _strip_index(vid)
    df['y'] = _strip_index(y)

    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)
    
if __name__ == '__main__':
    # Script entry point: train on train_data.pickle, save the model, then
    # score predict_data.pickle and write the ranked result to pred.csv.

    # ``cPickle`` exists only on Python 2 and was never imported in this
    # file; the standard ``pickle`` module offers the same load/dump API.
    import pickle

    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(r"train_data.pickle", "rb") as input_file:
        train_data = pickle.load(input_file)
    with open(r"predict_data.pickle", "rb") as input_file:
        predict_data = pickle.load(input_file)

    # Train.
    # NOTE(review): ``train`` is neither defined nor imported in the visible
    # part of this file -- confirm where it is supposed to come from.
    x, y, _, features = clean_data(train_data)
    _, model, _ = train(x, y, features)
    with open('model.pickle', 'wb') as output_file:
        pickle.dump(model, output_file)
    # To reuse a previously saved model instead of retraining, replace the
    # training step above with ``pickle.load`` on 'model.pickle'.

    # Predict.
    x, y, vid, _ = clean_data(predict_data)
    y_ = model.predict(x, num_iteration=model.best_iteration)

    pack_result(y_, y, vid, 'pred.csv')