# data_desc.py (1.9 KB)
  1. import _pickle as cPickle
  2. import pandas as pd
  3. def clean_data(df):
  4. #y = df['futre7dayreturn'].apply(lambda x: np.log(df['futre7dayreturn']+1))
  5. df['futre7dayreturn'].loc[df['futre7dayreturn']<=0] = 1
  6. y = df['futre7dayreturn']
  7. df_vids = df['videoid']
  8. #drop string
  9. #x = df.drop(['videoid', 'videotags', 'videotitle', 'videodescr', 'videodistribute_title', 'videoallwords', 'words_without_tags'], axis=1)
  10. x = df.drop(['videoid', 'videotags', 'words_without_tags', 'dt'], axis=1)
  11. #drop future
  12. #x = df.drop(['futr5viewcount', 'futr5returncount', 'futre7dayreturn'], axis=1)
  13. x = x.drop(['futre7dayreturn'], axis=1)
  14. x['stage_four_retrn_added'] = x['stage_four_retrn'] - x['stage_three_retrn']
  15. x['stage_three_retrn_added'] = x['stage_three_retrn'] - x['stage_two_retrn']
  16. x['stage_two_retrn_added'] = x['stage_two_retrn'] - x['stage_one_retrn']
  17. x['stage_four_retrn_ratio'] = (x['stage_four_retrn'] - x['stage_three_retrn'])/x['stage_four_retrn']
  18. x['stage_three_retrn_ratio'] = (x['stage_three_retrn'] - x['stage_two_retrn'])/x['stage_three_retrn']
  19. x['stage_two_retrn_ratio'] = (x['stage_two_retrn'] - x['stage_one_retrn'])/x['stage_two_retrn']
  20. features = list(x)
  21. drop_features = [f for f in features if (f.find('day30')!=-1 or f.find('day60')!=-1)]
  22. x = x.drop(drop_features, axis=1)
  23. x = x.fillna(0)
  24. x = x.astype('float64')
  25. x = x.clip(0,2000000)
  26. features = [f for f in features if f not in drop_features]
  27. return x, y , df_vids, features
  28. if __name__ == '__main__':
  29. with open(r"train_data_x.pickle", "rb") as input_file:
  30. train_data = cPickle.load(input_file)
  31. with open(r"predict_data_x.pickle", "rb") as input_file:
  32. predict_data = cPickle.load(input_file)
  33. X,Y,_,_ = clean_data(train_data)
  34. x,y,_,_ = clean_data(predict_data)
  35. print(X.describe())
  36. print(x.describe())
  37. #Y.describe()