# model_predict_analyse.py (2.2 KB)
  1. import argparse
  2. import gzip
  3. import sys
  4. import pandas as pd
  5. from hdfs import InsecureClient
  6. client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")
  7. def read_predict(hdfs_path: str) -> list:
  8. result = []
  9. for file in client.list(hdfs_path):
  10. with client.read(hdfs_path + file) as reader:
  11. with gzip.GzipFile(fileobj=reader, mode="rb") as gz_file:
  12. for line in gz_file.read().decode("utf-8").split("\n"):
  13. split = line.split("\t")
  14. if len(split) != 4:
  15. continue
  16. cid = split[3].split("_")[0]
  17. label = split[0]
  18. score = float(split[2].replace("[", "").replace("]", "").split(",")[1])
  19. result.append({
  20. "cid": cid,
  21. "label": label,
  22. "score": score
  23. })
  24. return result
  25. def _main(model1_predict_path: str, model2_predict_path: str):
  26. # 设置 pandas 显示选项
  27. pd.set_option('display.max_rows', None) # 显示所有行
  28. pd.set_option('display.max_columns', None) # 显示所有列
  29. model1_result = read_predict(model1_predict_path)
  30. model2_result = read_predict(model2_predict_path)
  31. m1 = pd.DataFrame(model1_result)
  32. g1 = m1.groupby("cid").agg(count=('cid', 'size'), average_value=('score', 'mean'))
  33. # 获取出现次数最多的十个 cid
  34. most_common_cid1 = g1.nlargest(1000, 'count')
  35. print(most_common_cid1)
  36. m2 = pd.DataFrame(model2_result)
  37. g2 = m2.groupby("cid").agg(count=('cid', 'size'), average_value=('score', 'mean'))
  38. # 获取出现次数最多的十个 cid
  39. most_common_cid2 = g2.nlargest(1000, 'count')
  40. print(most_common_cid2)
  41. if __name__ == '__main__':
  42. parser = argparse.ArgumentParser(description="model_predict_analyse.py")
  43. parser.add_argument("-p", "--predict_path_list", nargs='*', help="config file path")
  44. args = parser.parse_args()
  45. predict_path_list = args.predict_path_list
  46. # 判断参数是否正常
  47. if len(predict_path_list) != 2:
  48. sys.exit(1)
  49. _main(predict_path_list[0], predict_path_list[1])