# check_data.py (1.6 KB)
  1. import pandas as pd
  2. old_date_train = f"/Users/zhao/Downloads/20241010.csv"
  3. new_date_train = f"/Users/zhao/Desktop/Code/Python/model_monitor/XGB/data/all/20241010_train.csv"
  4. # 读取两个 CSV 文件
  5. old_df = pd.read_csv(old_date_train)
  6. new_df = pd.read_csv(new_date_train)
  7. if old_df.shape[0] != new_df.shape[0]:
  8. print(f"新老训练数据集长度不一样 新数据集: {new_df.shape[0]}, 老数据集: {old_df.shape[0]}")
  9. old_df_col = old_df.columns
  10. new_df_col = new_df.columns
  11. if len(old_df_col) != len(new_df_col):
  12. print(f"两个文件列数不一样 新文件: {new_df_col}, 老文件: {old_df_col}")
  13. for col in old_df_col:
  14. if col not in new_df_col:
  15. print(f"列 {col} 在老文件存在,新文件不存在")
  16. for col in new_df_col:
  17. if col not in old_df_col:
  18. print(f"列 {col} 在新文件存在,老文件不存在")
  19. old_df.set_index("vid", inplace=True)
  20. new_df.set_index("vid", inplace=True)
  21. old_dict = old_df.to_dict(orient="index")
  22. new_dict = new_df.to_dict(orient="index")
  23. for e in new_dict:
  24. if e not in old_dict:
  25. print(f"vid {e} 在新文件中存在,在老文件中不存在")
  26. new_row = new_dict[e]
  27. old_row = old_dict[e]
  28. for col in new_df_col:
  29. if col not in old_row:
  30. print(f"vid {e} 的列 {col} 在老文件中不存在")
  31. continue
  32. if col in new_row:
  33. print(f"vid {e} 的列 {col} 在新文件中不存在")
  34. continue
  35. if old_row[col] != new_row[col]:
  36. print(f"vid {e} 列 {col} 的值在新老文件不一样, 新文件的值: {new_row[col]}, 老文件的值: {old_row[col]}")