import _pickle as cPickle

import numpy as np
import pandas as pd
from paddle.io import Dataset
# NOTE(review): `import paddle` is the conventional spelling; the original
# form is kept here -- confirm that it resolves in the installed version.
from paddle import paddle


class RovDataset(Dataset):
    """Dataset backed by a pickled pandas DataFrame.

    The pickle at ``path`` is loaded once at construction time. The
    ``futre7dayreturn`` column is used as the label; the identifier/text
    columns and every column whose name contains ``day30`` or ``day60`` are
    dropped; the remaining columns are NaN-filled with 0 and min-max scaled
    column by column.

    After construction the instance exposes ``path``, ``data`` (scaled
    feature frame), ``labels`` (label series) and ``features`` (list of the
    retained column names).
    """

    def __init__(self, path):
        super().__init__()
        self.path = path
        _, _, _, self.features = self._parse_dataset()

    def _parse_dataset(self):
        """Load the pickle and build the feature frame and label series.

        Stores the results on ``self.data`` / ``self.labels`` as a side
        effect.

        Returns:
            A tuple ``(x, y, df_vids, features)``: scaled feature frame,
            label series, the ``videoid`` column, and the list of retained
            feature column names.
        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(self.path, "rb") as input_file:
            df = cPickle.load(input_file)

        y = df['futre7dayreturn']
        df_vids = df['videoid']
        x = df.drop(
            ['videoid', 'videotags', 'words_without_tags', 'dt',
             'futre7dayreturn'],
            axis=1,
        )

        drop_features = [f for f in x if 'day30' in f or 'day60' in f]
        x = x.drop(drop_features, axis=1)
        features = list(x)

        # fillna returns a new frame; the original discarded the result, so
        # NaNs were never actually replaced.
        x = x.fillna(0)

        # Per-column min-max scaling. A constant column has a zero range,
        # which would turn the whole column into NaN (0 / 0); dividing by 1
        # instead maps such a column to all zeros.
        col_min = x.min()
        col_range = (x.max() - col_min).replace(0, 1)
        x = (x - col_min) / col_range

        self.data = x
        self.labels = y
        return x, y, df_vids, features

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as float32 arrays.

        Errors (e.g. an out-of-range ``idx``) propagate to the caller rather
        than being printed and replaced by an implicit ``None``.
        """
        # NOTE(review): the label is a 0-d array while the network in
        # train() ends in Linear(19, 1); confirm the loss handles the
        # resulting batch shapes as intended.
        data = np.array(self.data.iloc[idx])
        label = np.array(self.labels.iloc[idx])
        return data.astype('float32'), label.astype('float32')

    def __len__(self):
        """Return the number of labelled rows."""
        return len(self.labels)


def train():
    """Fit an MLP regressor on ``train_data.pickle`` and evaluate it.

    The input width of the network is taken from the number of feature
    columns in the training dataset. The model is trained for 3 epochs with
    Adam and MSE loss, then evaluated on ``predict_data.pickle``.
    """
    result_dim = 1
    batch_size = 100

    train_dataset = RovDataset("train_data.pickle")
    # NOTE(review): the evaluation set is scaled with its own min/max, not
    # the training set's -- confirm this is intended.
    test_dataset = RovDataset("predict_data.pickle")
    feature_dim = len(train_dataset.features)

    linear = paddle.nn.Sequential(
        paddle.nn.Linear(feature_dim, 4096),
        paddle.nn.ReLU(),
        paddle.nn.Linear(4096, 1024),
        paddle.nn.ReLU(),
        paddle.nn.Dropout(0.2),
        paddle.nn.Linear(1024, 19),
        paddle.nn.ReLU(),
        paddle.nn.Linear(19, result_dim),
    )

    model = paddle.Model(linear)
    model.prepare(
        paddle.optimizer.Adam(learning_rate=0.001,
                              parameters=model.parameters()),
        paddle.nn.MSELoss(),
    )
    model.fit(train_dataset, epochs=3, batch_size=batch_size, verbose=1)
    model.evaluate(test_dataset, batch_size=batch_size, verbose=1)


def pack_result(y_, y, vid, fp):
    """Write predictions and ground truth to CSV, best score first.

    Args:
        y_: predicted scores, one per row.
        y: ground-truth values, written to the ``y`` column.
        vid: identifiers; written to a ``vid`` column only when non-empty.
        fp: destination passed straight to ``DataFrame.to_csv``.
    """
    # reshape returns a new array; the original call discarded its result.
    scores = np.asarray(y_).reshape(len(y_), 1)
    df = pd.DataFrame(data=scores, columns=['score'])
    if len(vid) > 0:
        df['vid'] = vid
    df['y'] = y
    df = df.sort_values(by=['score'], ascending=False)
    df.to_csv(fp, index=False)


if __name__ == '__main__':
    train()