data_monitor.py 3.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
# Monitor the distribution of the training data.
  2. import numpy as np
  3. import pandas as pd
  4. import datetime
  5. from config import set_config
  6. from rov_train import process_data, process_predict_data
# Load the project configuration once at import time. main() reads the
# train/predict data filenames from config_; env is not referenced in the
# code shown here.
config_, env = set_config()
  8. def get_feature_distribution(feature_name, feature_data):
  9. statistical_results = {'feature_name': feature_name}
  10. feature_data = np.array(feature_data)
  11. feature_data_sorted = sorted(feature_data)
  12. length = len(feature_data_sorted)
  13. count_0 = len([item for item in feature_data_sorted if item == 0])
  14. print('data_count = {}, count_0 = {}, rate_0 = {}'.format(length, count_0, count_0/length))
  15. statistical_results['data_count'] = length
  16. statistical_results['0_count'] = count_0
  17. statistical_results['0_rate'] = count_0/length
  18. # 整体数据分布
  19. for percentile in [0.25, 0.5, 0.75, 1]:
  20. data_count = int(length * percentile)
  21. data = feature_data_sorted[:data_count + 1]
  22. data_mean = np.mean(data)
  23. data_var = np.var(data)
  24. data_std = np.std(data)
  25. # print('percentile = {}, data_count = {}, mean = {}, var = {}, std = {}'.format(
  26. # percentile, data_count, data_mean, data_var, data_std))
  27. statistical_results['mean_{}'.format(percentile)] = data_mean
  28. statistical_results['var_{}'.format(percentile)] = data_var
  29. statistical_results['std_{}'.format(percentile)] = data_std
  30. # 非零数据分布
  31. data_non_zero = [item for item in feature_data_sorted if item != 0]
  32. for percentile in [0.25, 0.5, 0.75, 1]:
  33. data_count = int(len(data_non_zero) * percentile)
  34. data = data_non_zero[:data_count + 1]
  35. data_mean = np.mean(data)
  36. data_var = np.var(data)
  37. dat_std = np.std(data)
  38. # print('percentile = {}, data_count = {}, mean = {}, var = {}, std = {}'.format(
  39. # percentile, data_count, data_mean, data_var, dat_std))
  40. statistical_results['non_zero_mean_{}'.format(percentile)] = data_mean
  41. statistical_results['non_zero_var_{}'.format(percentile)] = data_var
  42. statistical_results['non_zero_std_{}'.format(percentile)] = data_std
  43. return statistical_results
  44. def all_feature_distribution(data, file):
  45. res = []
  46. columns = [
  47. 'feature_name', 'data_count', '0_count', '0_rate',
  48. 'mean_0.25', 'mean_0.5', 'mean_0.75', 'mean_1',
  49. 'var_0.25', 'var_0.5', 'var_0.75', 'var_1',
  50. 'std_0.25', 'std_0.5', 'std_0.75', 'std_1',
  51. 'non_zero_mean_0.25', 'non_zero_mean_0.5', 'non_zero_mean_0.75', 'non_zero_mean_1',
  52. 'non_zero_var_0.25', 'non_zero_var_0.5', 'non_zero_var_0.75', 'non_zero_var_1',
  53. 'non_zero_std_0.25', 'non_zero_std_0.5', 'non_zero_std_0.75', 'non_zero_std_1'
  54. ]
  55. feature_importance = pd.read_csv('data/model_feature_importance.csv')
  56. feature_name_list = list(feature_importance['feature'])
  57. for feature_name in feature_name_list:
  58. print(feature_name)
  59. feature_data = data[feature_name]
  60. statistical_results = get_feature_distribution(feature_name=feature_name, feature_data=feature_data)
  61. res.append(statistical_results)
  62. df = pd.DataFrame(res, columns=columns)
  63. df.to_csv(file)
  64. def main():
  65. now_date = datetime.datetime.strftime(datetime.datetime.today(), '%Y%m%d')
  66. # now_date = '20220119'
  67. # 训练数据
  68. print('train data monitor...')
  69. train_data_file = 'data/train_data_monitor_{}.csv'.format(now_date)
  70. train_filename = config_.TRAIN_DATA_FILENAME
  71. train_x, train_y, videos, fea = process_data(filename=train_filename)
  72. all_feature_distribution(train_x, file=train_data_file)
  73. # 预测数据
  74. print('predict data monitor...')
  75. predict_data_file = 'data/predict_data_monitor_{}.csv'.format(now_date)
  76. predict_filename = config_.PREDICT_DATA_FILENAME
  77. predict_x, video_ids = process_predict_data(filename=predict_filename)
  78. all_feature_distribution(predict_x, file=predict_data_file)
  79. if __name__ == '__main__':
  80. main()