|  | @@ -1,26 +1,66 @@
 | 
											
												
													
														|  |  import pandas as pd
 |  |  import pandas as pd
 | 
											
												
													
														|  |  import numpy as np
 |  |  import numpy as np
 | 
											
												
													
														|  | 
 |  | +import _pickle as cPickle
 | 
											
												
													
														|  |  from paddle.io import Dataset
 |  |  from paddle.io import Dataset
 | 
											
												
													
														|  | 
 |  | +from paddle import paddle
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  class RovDataset(Dataset):
 |  |  class RovDataset(Dataset):
 | 
											
												
													
														|  | 
 |  | +    def __init__(self, path):
 | 
											
												
													
														|  | 
 |  | +        super(RovDataset, self).__init__()
 | 
											
												
													
														|  | 
 |  | +        self.path = path
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -def clean_data(df):
 |  | 
 | 
											
												
													
														|  | -    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
 |  | 
 | 
											
												
													
														|  | -    y = df['futre7dayreturn']
 |  | 
 | 
											
												
													
														|  | -    df_vids = df['videoid']
 |  | 
 | 
											
												
													
														|  | -    #drop string
 |  | 
 | 
											
												
													
														|  | -    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
 |  | 
 | 
											
												
													
														|  | -    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
 |  | 
 | 
											
												
													
														|  | -    #drop future
 |  | 
 | 
											
												
													
														|  | -    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
 |  | 
 | 
											
												
													
														|  | -    x = x.drop(['futre7dayreturn'], axis=1)
 |  | 
 | 
											
												
													
														|  | -    features = list(x)
 |  | 
 | 
											
												
													
														|  | -    drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
 |  | 
 | 
											
												
													
														|  | -    x = x.drop(drop_features, axis=1)
 |  | 
 | 
											
												
													
														|  | 
 |  | +    def _parse_dataset(self):
 | 
											
												
													
														|  | 
 |  | +        self.data = []
 | 
											
												
													
														|  | 
 |  | +        self.labels = []
 | 
											
												
													
														|  | 
 |  | +        with open(self.path, "rb") as input_file:
 | 
											
												
													
														|  | 
 |  | +            df = cPickle.load(input_file)
 | 
											
												
													
														|  | 
 |  | +            y = df['futre7dayreturn']
 | 
											
												
													
														|  | 
 |  | +            df_vids = df['videoid']
 | 
											
												
													
														|  | 
 |  | +            x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
 | 
											
												
													
														|  | 
 |  | +            x = x.drop(['futre7dayreturn'], axis=1)
 | 
											
												
													
														|  | 
 |  | +            features = list(x)
 | 
											
												
													
														|  | 
 |  | +            drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
 | 
											
												
													
														|  | 
 |  | +            x = x.drop(drop_features, axis=1)
 | 
											
												
													
														|  | 
 |  | +            x = x.apply(lambda x: (x - np.min())/ (np.max(x) - np.min(x)))
 | 
											
												
													
														|  | 
 |  | +            #features = [f for f in features if f not in drop_features]
 | 
											
												
													
														|  | 
 |  | +            self.data = x
 | 
											
												
													
														|  | 
 |  | +            self.labels = y
 | 
											
												
													
														|  | 
 |  | +            #return x, y , df_vids, features
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | -    features = [f for f in features if f not in drop_features]
 |  | 
 | 
											
												
													
														|  | -    return x, y , df_vids, features
 |  | 
 | 
											
												
													
														|  | 
 |  | +    def __getitem__(self, idx):
 | 
											
												
													
														|  | 
 |  | +        data, label = self.data.iloc[idx], self.labels.iloc[idx]
 | 
											
												
													
														|  | 
 |  | +        return data.astype('float21'), label.astype('float32')
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  | 
 |  | +    def __len__(self):
 | 
											
												
													
														|  | 
 |  | +        return len(self.labels)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +def train():
 | 
											
												
													
														|  | 
 |  | +    feature_dim = 0
 | 
											
												
													
														|  | 
 |  | +    result_dim = 1
 | 
											
												
													
														|  | 
 |  | +    batch_size = 100
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    train_dataset = RovDataset("train_data.pickle")
 | 
											
												
													
														|  | 
 |  | +    test_dataset = RovDataset("predict_data.pickle")
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    linear=paddle.nn.Sequential(
 | 
											
												
													
														|  | 
 |  | +            paddle.nn.Linear(feature_dim, 4096),
 | 
											
												
													
														|  | 
 |  | +            paddle.nn.ReLU(),
 | 
											
												
													
														|  | 
 |  | +            paddle.nn.Linear(4096, 1024),
 | 
											
												
													
														|  | 
 |  | +            paddle.nn.ReLU(),
 | 
											
												
													
														|  | 
 |  | +            paddle.nn.Dropout(0.2),
 | 
											
												
													
														|  | 
 |  | +            paddle.nn.Linear(1024,19),
 | 
											
												
													
														|  | 
 |  | +            paddle.nn.ReLU(),
 | 
											
												
													
														|  | 
 |  | +            paddle.nn.Linear(19,1)
 | 
											
												
													
														|  | 
 |  | +            )
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    model=paddle.Model(linear)
 | 
											
												
													
														|  | 
 |  | +    model.prepare(paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()),
 | 
											
												
													
														|  | 
 |  | +                paddle.nn.MSELoss())
 | 
											
												
													
														|  | 
 |  | +    model.fit(train_dataset, epochs=3, batch_size=batch_size, verbose=1)
 | 
											
												
													
														|  | 
 |  | +
 | 
											
												
													
														|  | 
 |  | +    model.evaluate(test_dataset,batch_size=batch_size,verbose=1)
 | 
											
												
													
														|  | 
 |  | +    
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |  def pack_result(y_, y, vid, fp):
 |  |  def pack_result(y_, y, vid, fp):
 | 
											
												
													
														|  |      #y_ = y_.astype(int)
 |  |      #y_ = y_.astype(int)
 | 
											
										
											
												
													
														|  | @@ -34,21 +74,6 @@ def pack_result(y_, y, vid, fp):
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |      
 |  |      
 | 
											
												
													
														|  |  if __name__ == '__main__':
 |  |  if __name__ == '__main__':
 | 
											
												
													
														|  | -    with open(r"train_data.pickle", "rb") as input_file:
 |  | 
 | 
											
												
													
														|  | -        train_data = cPickle.load(input_file)    
 |  | 
 | 
											
												
													
														|  | -    with open(r"predict_data.pickle", "rb") as input_file:
 |  | 
 | 
											
												
													
														|  | -        predict_data = cPickle.load(input_file)   
 |  | 
 | 
											
												
													
														|  |  
 |  |  
 | 
											
												
													
														|  |      #train
 |  |      #train
 | 
											
												
													
														|  | -    x,y,_,features = clean_data(train_data)
 |  | 
 | 
											
												
													
														|  | -    _, model, _ = train(x, y, features)
 |  | 
 | 
											
												
													
														|  | -    with open('model.pickle','wb') as output_file:
 |  | 
 | 
											
												
													
														|  | -        cPickle.dump(model, output_file)
 |  | 
 | 
											
												
													
														|  | -    '''
 |  | 
 | 
											
												
													
														|  | -    with open(r"model.pickle", "rb") as input_file:
 |  | 
 | 
											
												
													
														|  | -        model = cPickle.load(input_file)    
 |  | 
 | 
											
												
													
														|  | -    ''' 
 |  | 
 | 
											
												
													
														|  | -    x,y,vid,_ = clean_data(predict_data)
 |  | 
 | 
											
												
													
														|  | -    y_ = model.predict(x, num_iteration=model.best_iteration)
 |  | 
 | 
											
												
													
														|  | -
 |  | 
 | 
											
												
													
														|  | -    pack_result(y_, y, vid, 'pred.csv')
 |  | 
 | 
											
												
													
														|  | 
 |  | +    pass
 |