model_predict_analyse.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. import argparse
  2. import gzip
  3. import sys
  4. import ad_monitor_util
  5. import pandas as pd
  6. from hdfs import InsecureClient
  # Module-level WebHDFS client used by read_predict() below; it talks to the
  # endpoint given here and identifies itself as the "spark" user.
  client = InsecureClient(
      "http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870",
      user="spark",
  )
  def read_predict(hdfs_path: str) -> list:
      """Load prediction records from every gzip file in an HDFS directory.

      Each entry listed under ``hdfs_path`` is read through the module-level
      ``client``, gunzipped, decoded as UTF-8 and split on newlines. A line is
      used only when it has exactly four tab-separated fields:

      * field 0 becomes ``label`` (kept as a string);
      * field 2 is stripped of ``[`` / ``]`` and split on commas, and its
        second element becomes ``score`` (converted to float);
      * field 3 is split on ``_`` and its first piece becomes ``cid``.

      Field 1 is not used. Lines with any other field count are skipped.

      Args:
          hdfs_path: Directory to list. A trailing "/" is optional.

      Returns:
          A list of dicts with the keys "cid", "label" and "score".

      Raises:
          IndexError: A four-field line whose third field has no comma.
          ValueError: A four-field line whose score is not a valid float.
      """
      # The original concatenated directory and file name directly, which
      # produced a wrong path whenever the caller omitted the trailing slash.
      # An empty path is left untouched so the file names stay relative.
      if hdfs_path and not hdfs_path.endswith("/"):
          base_path = hdfs_path + "/"
      else:
          base_path = hdfs_path

      result = []
      for file_name in client.list(hdfs_path):
          with client.read(base_path + file_name) as reader:
              with gzip.GzipFile(fileobj=reader, mode="rb") as gz_file:
                  text = gz_file.read().decode("utf-8")
          for line in text.split("\n"):
              fields = line.split("\t")
              if len(fields) != 4:
                  continue
              scores = fields[2].replace("[", "").replace("]", "").split(",")
              result.append({
                  "cid": fields[3].split("_")[0],
                  "label": fields[0],
                  "score": float(scores[1]),
              })
      return result
  def _top_cids_by_count(records: list, top_n: int) -> pd.DataFrame:
      """Return the ``top_n`` most frequent cids with count and mean score."""
      frame = pd.DataFrame(records)
      per_cid = frame.groupby("cid").agg(
          count=("cid", "size"),
          # Mean score per cid, rounded to six decimals.
          average_value=("score", lambda scores: round(scores.mean(), 6)),
      )
      return per_cid.nlargest(top_n, "count")


  def _main(model1_predict_path: str, model2_predict_path: str, top_n: int = 10):
      """Print the mean absolute gap between two models' per-cid mean scores.

      Both prediction directories are loaded with ``read_predict``. For each
      model the ``top_n`` most frequent cids are selected together with their
      mean score. The two selections are merged on ``cid`` with pandas'
      default join, so only cids present in both selections are compared.
      The absolute difference of the mean scores is averaged over those cids
      and printed, rounded to six decimals. If the selections share no cid,
      the printed value is ``nan``.

      Args:
          model1_predict_path: HDFS directory with the first model's output.
          model2_predict_path: HDFS directory with the second model's output.
          top_n: How many of the most frequent cids to keep per model.

      Raises:
          KeyError: Either path yields no records, since the resulting
              DataFrame then has no "cid" column to group by.
      """
      # Load both result sets first, as the original did, before any analysis.
      model1_result = read_predict(model1_predict_path)
      model2_result = read_predict(model2_predict_path)

      most_common_cid1 = _top_cids_by_count(model1_result, top_n)
      most_common_cid2 = _top_cids_by_count(model2_result, top_n)

      # Merge the two frames, matching rows on 'cid'.
      merged = pd.merge(most_common_cid1, most_common_cid2, on='cid', suffixes=('_m1', '_m2'))
      # Absolute difference of the per-cid mean scores, six decimals.
      merged['score_diff'] = (merged['average_value_m1'] - merged['average_value_m2']).abs().round(6)
      # Average of those differences, six decimals.
      mean_abs_diff = round(merged['score_diff'].mean(), 6)
      print(mean_abs_diff)
  if __name__ == '__main__':
      # Command-line entry point: expects exactly two prediction paths after -p.
      parser = argparse.ArgumentParser(description="model_predict_analyse.py")
      parser.add_argument("-p", "--predict_path_list", nargs='*',
                          help="模型评估结果保存路径,第一个为老模型评估结果,第二个为新模型评估结果")
      args = parser.parse_args()
      # When -p is omitted argparse leaves the value as None; treat that as an
      # empty list so the count check below reports it instead of len(None)
      # raising TypeError.
      predict_path_list = args.predict_path_list or []
      # Validate the arguments; the exit status stays 1 as before, but the
      # user now sees why the script stopped.
      if len(predict_path_list) != 2:
          parser.print_usage(sys.stderr)
          sys.exit("error: -p/--predict_path_list needs exactly 2 paths, got %d"
                   % len(predict_path_list))
      _main(predict_path_list[0], predict_path_list[1])