@@ -1,26 +1,66 @@
import pandas as pd
import numpy as np
+import _pickle as cPickle
from paddle.io import Dataset
+import paddle

class RovDataset(Dataset):
+    def __init__(self, path):
+        super(RovDataset, self).__init__()
+        self.path = path
+        # Load and preprocess the pickled DataFrame once, so __getitem__/__len__ have data to serve.
+        self._parse_dataset()

-def clean_data(df):
-    #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
-    y = df['futre7dayreturn']
-    df_vids = df['videoid']
-    #drop string
-    #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
-    x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
-    #drop future
-    #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
-    x = x.drop(['futre7dayreturn'], axis=1)
-    features = list(x)
-    drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
-    x = x.drop(drop_features, axis=1)
+    def _parse_dataset(self):
+        self.data = []
+        self.labels = []
+        with open(self.path, "rb") as input_file:
+            df = cPickle.load(input_file)
+        # Target is the 7-day return column; keep the video ids around for reference.
+        y = df['futre7dayreturn']
+        df_vids = df['videoid']
+        # Drop string columns and the label itself from the feature matrix.
+        x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
+        x = x.drop(['futre7dayreturn'], axis=1)
+        features = list(x)
+        drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
+        x = x.drop(drop_features, axis=1)
+        # Min-max scale every remaining feature column to [0, 1].
+        x = x.apply(lambda col: (col - np.min(col)) / (np.max(col) - np.min(col)))
+        #features = [f for f in features if f not in drop_features]
+        self.data = x
+        self.labels = y
+        #return x, y , df_vids, features

-    features = [f for f in features if f not in drop_features]
-    return x, y , df_vids, features
+    def __getitem__(self, idx):
+        data, label = self.data.iloc[idx], self.labels.iloc[idx]
+        # Return numpy float32 so the DataLoader can batch features and labels.
+        return data.to_numpy().astype('float32'), np.array([label], dtype='float32')
+
+    def __len__(self):
+        return len(self.labels)
+
+
+def train():
+    result_dim = 1
+    batch_size = 100
+
+    train_dataset = RovDataset("train_data.pickle")
+    test_dataset = RovDataset("predict_data.pickle")
+    # Width of the input layer is the number of preprocessed feature columns.
+    feature_dim = train_dataset.data.shape[1]
+
+    linear = paddle.nn.Sequential(
+        paddle.nn.Linear(feature_dim, 4096),
+        paddle.nn.ReLU(),
+        paddle.nn.Linear(4096, 1024),
+        paddle.nn.ReLU(),
+        paddle.nn.Dropout(0.2),
+        paddle.nn.Linear(1024, 19),
+        paddle.nn.ReLU(),
+        paddle.nn.Linear(19, result_dim)
+    )
+
+    model = paddle.Model(linear)
+    model.prepare(paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()),
+                  paddle.nn.MSELoss())
+    model.fit(train_dataset, epochs=3, batch_size=batch_size, verbose=1)
+
+    model.evaluate(test_dataset, batch_size=batch_size, verbose=1)
+

def pack_result(y_, y, vid, fp):
    #y_ = y_.astype(int)
@@ -34,21 +74,6 @@ def pack_result(y_, y, vid, fp):


if __name__ == '__main__':
-    with open(r"train_data.pickle", "rb") as input_file:
-        train_data = cPickle.load(input_file)
-    with open(r"predict_data.pickle", "rb") as input_file:
-        predict_data = cPickle.load(input_file)

    #train
-    x,y,_,features = clean_data(train_data)
-    _, model, _ = train(x, y, features)
-    with open('model.pickle','wb') as output_file:
-        cPickle.dump(model, output_file)
-    '''
-    with open(r"model.pickle", "rb") as input_file:
-        model = cPickle.load(input_file)
-    '''
-    x,y,vid,_ = clean_data(predict_data)
-    y_ = model.predict(x, num_iteration=model.best_iteration)
-
-    pack_result(y_, y, vid, 'pred.csv')
+    pass