rov_train.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071
  1. import pandas as pd
  2. import datetime
  3. import pickle
  4. import process_feature
  5. from odps import ODPS
  6. from datetime import datetime as dt
  7. def get_rov_feature_table(date, table):
  8. """
  9. 从DataWorks表中获取对应的特征值
  10. :param date: 日期 type-string '%Y%m%d'
  11. :param table: 表名 type-string
  12. :return: feature_array type-DataFrame
  13. """
  14. odps = ODPS(
  15. access_id='LTAI4FtW5ZzxMvdw35aNkmcp',
  16. secret_access_key='0VKnydcaHK3ITjylbgUsLubX6rwiwc',
  17. project='usercdm',
  18. endpoint='http://service.cn.maxcompute.aliyun.com/api',
  19. connect_timeout=3000,
  20. read_timeout=500000,
  21. pool_maxsize=1000,
  22. pool_connections=1000
  23. )
  24. feature_value_list = []
  25. for record in odps.read_table(name=table, partition='dt=%s' % date):
  26. feature_value = {}
  27. for feature_name in process_feature.features:
  28. if feature_name == 'dt':
  29. feature_value[feature_name] = date
  30. else:
  31. feature_value[feature_name] = record[feature_name]
  32. feature_value_list.append(feature_value)
  33. feature_array = pd.DataFrame(feature_value_list)
  34. print(date, table, 'feature table finish')
  35. return feature_array
  36. def get_data_with_date(date, delta_days, table):
  37. """
  38. 获取某一时间范围的特征数据
  39. :param date: 标准日期,delta基准,type-string,'%Y%m%d'
  40. :param delta_days: 日期范围(天),type-int,「 >0: date前,<0: date后 」
  41. :param table: DataWorks表名,type-string
  42. :return: data,type-DataFrame
  43. """
  44. base_date = dt.strptime(date, '%Y%m%d')
  45. data_list = []
  46. for days in range(0, delta_days):
  47. delta = datetime.timedelta(days=days)
  48. delta_date = base_date - delta
  49. # 获取特征数据
  50. delta_data = get_rov_feature_table(date=delta_date.strftime('%Y%m%d'), table=table)
  51. data_list.append(delta_data)
  52. data = pd.concat(data_list)
  53. # 重新进行索引
  54. data.reset_index(inplace=True)
  55. # 删除index列
  56. data = data.drop(columns=['index'])
  57. return data
  58. if __name__ == '__main__':
  59. dt_test = '20211006'
  60. table_test = 'rov_feature_add_v1'
  61. # res = get_rov_feature_table(dt_test, table_test)
  62. res = get_data_with_date(date=dt_test, delta_days=3, table=table_test)
  63. print(res.shape)