# get_data.py (2.3 KB)
  import datetime
  import os
  import pickle
  from datetime import datetime as dt

  import pandas as pd
  from odps import ODPS

  import process_feature
  def getRovfeaturetable(dt, table):
      """Load one ``dt`` partition of an ODPS table into a pandas DataFrame.

      Args:
          dt: Partition value. It is formatted into the partition spec
              ``dt=<dt>`` and written unchanged into the ``dt`` field of every
              row. Inside this function the name shadows the module-level
              ``dt`` alias for ``datetime.datetime``.
          table: Name of the ODPS table to read.

      Returns:
          pandas.DataFrame built from one dict per record read, keyed by the
          names in ``process_feature.featurename``.
      """
      # SECURITY NOTE(review): the fallback literals below are credentials
      # committed to source control. They are kept only so existing deployments
      # keep working; rotate them and drop the fallbacks once the environment
      # variables are set everywhere this runs.
      access_id = (os.environ.get('ALIBABA_CLOUD_ACCESS_KEY_ID')
                   or 'LTAI4FtW5ZzxMvdw35aNkmcp')
      access_key = (os.environ.get('ALIBABA_CLOUD_ACCESS_KEY_SECRET')
                    or '0VKnydcaHK3ITjylbgUsLubX6rwiwc')
      odps = ODPS(
          access_id,
          access_key,
          'usercdm',
          endpoint='http://service.cn.maxcompute.aliyun.com/api',
          connect_timeout=3000,
          read_timeout=500000,
          pool_maxsize=1000,
          pool_connections=1000,
      )

      feature_names = process_feature.featurename
      rows = []
      for record in odps.read_table(table, partition='dt=%s' % dt):
          # The 'dt' feature is taken from the requested partition value rather
          # than looked up on the record.
          rows.append({
              name: (dt if name == 'dt' else record[name])
              for name in feature_names
          })
      featureArray = pd.DataFrame(rows)
      print(dt, table, 'feature table finish')
      return featureArray
  def getdatasample(date, max_range, table):
      """Concatenate the feature tables of ``max_range`` consecutive days.

      Args:
          date: Most recent day to load, formatted as ``%Y%m%d``.
          max_range: Number of days to load, counting backwards from ``date``
              (``date`` itself is included).
          table: Table name passed through to ``getRovfeaturetable``.

      Returns:
          pandas.DataFrame holding the rows of every loaded day, newest day
          first, with a fresh 0..n-1 row index.

      Raises:
          ValueError: if ``date`` does not match ``%Y%m%d``, or if
              ``max_range`` is not positive (``pd.concat`` rejects an empty
              list of frames).
      """
      newest = dt.strptime(date, '%Y%m%d')
      datelist = [
          (newest - datetime.timedelta(days=offset)).strftime("%Y%m%d")
          for offset in range(max_range)
      ]
      print(datelist)

      frames = [getRovfeaturetable(day, table) for day in datelist]
      # ignore_index=True renumbers the rows directly. The previous
      # reset_index()/drop('index') round trip failed whenever a feature column
      # was itself named 'index'.
      return pd.concat(frames, ignore_index=True)
  def process_train_predict_data():
      """Fetch the training and prediction samples and pickle them to disk.

      The prediction sample is the single daily partition of
      ``rov_predict_table_add_v1`` for yesterday. The training sample is 30
      daily partitions of ``rov_feature_add_v1`` ending 7 days before today.
      The frames are written to ``train_data_all.pickle`` and
      ``predict_data_all.pickle``, paths relative to the working directory.
      """
      predict_lag_days = 1
      train_lag_days = 7
      train_window_days = 30

      today = datetime.date.today()
      predict_day = (
          today - datetime.timedelta(days=predict_lag_days)
      ).strftime('%Y%m%d')
      train_day = (
          today - datetime.timedelta(days=train_lag_days)
      ).strftime('%Y%m%d')

      # Read data from ODPS (Aliyun).
      train_data = getdatasample(
          train_day, train_window_days, 'rov_feature_add_v1')
      predict_data = getdatasample(predict_day, 1, 'rov_predict_table_add_v1')

      # Pickle for test. The public ``pickle`` module is used instead of the
      # private ``_pickle`` accelerator, which it already delegates to when
      # available.
      outputs = (
          ('train_data_all.pickle', train_data),
          ('predict_data_all.pickle', predict_data),
      )
      for path, frame in outputs:
          with open(path, 'wb') as output_file:
              pickle.dump(frame, output_file)
  # Script entry point: run the export only when this file is executed
  # directly, not when it is imported.
  if __name__ == '__main__':
      process_train_predict_data()