rov_train_paddle.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. import pandas as pd
  2. import numpy as np
  3. import _pickle as cPickle
  4. from paddle.io import Dataset
  5. from paddle import paddle
  6. class RovDataset(Dataset):
  7. def __init__(self, path):
  8. super(RovDataset, self).__init__()
  9. self.path = path
  10. _,_,_,self.features = self._parse_dataset()
  11. def _parse_dataset(self):
  12. self.data = []
  13. self.labels = []
  14. with open(self.path, "rb") as input_file:
  15. df = cPickle.load(input_file)
  16. y = df['futre7dayreturn']
  17. df_vids = df['videoid']
  18. x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
  19. x = x.drop(['futre7dayreturn'], axis=1)
  20. features = list(x)
  21. drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
  22. x = x.drop(drop_features, axis=1)
  23. x.fillna(0)
  24. x = x.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)) )
  25. features = [f for f in features if f not in drop_features]
  26. self.data = x
  27. self.labels = y
  28. return x, y , df_vids, features
  29. def __getitem__(self, idx):
  30. try:
  31. data, label = np.array(self.data.iloc[idx]), np.array(self.labels.iloc[idx])
  32. return data.astype('float32'), label.astype('float32')
  33. except Exception as e:
  34. print(e)
  35. def __len__(self):
  36. return len(self.labels)
  37. def train():
  38. feature_dim = 0
  39. result_dim = 1
  40. batch_size = 100
  41. train_dataset = RovDataset("train_data.pickle")
  42. test_dataset = RovDataset("predict_data.pickle")
  43. feature_dim = len(train_dataset.features)
  44. linear=paddle.nn.Sequential(
  45. paddle.nn.Linear(feature_dim, 4096),
  46. paddle.nn.ReLU(),
  47. paddle.nn.Linear(4096, 1024),
  48. paddle.nn.ReLU(),
  49. paddle.nn.Dropout(0.2),
  50. paddle.nn.Linear(1024,19),
  51. paddle.nn.ReLU(),
  52. paddle.nn.Linear(19,result_dim)
  53. )
  54. model=paddle.Model(linear)
  55. model.prepare(paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()),
  56. paddle.nn.MSELoss())
  57. model.fit(train_dataset, epochs=3, batch_size=batch_size, verbose=1)
  58. model.evaluate(test_dataset,batch_size=batch_size,verbose=1)
  59. def pack_result(y_, y, vid, fp):
  60. #y_ = y_.astype(int)
  61. y_.reshape(len(y_),1)
  62. df = pd.DataFrame(data=y_, columns=['score'])
  63. if len(vid) >0:
  64. df['vid'] = vid
  65. df['y'] = y
  66. df = df.sort_values(by=['score'], ascending=False)
  67. df.to_csv(fp, index=False)
  68. if __name__ == '__main__':
  69. train()