model_predict_analyse.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. import gzip
  2. import pandas as pd
  3. from hdfs import InsecureClient
  4. client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")
  5. def read_predict(hdfs_path: str) -> list:
  6. result = []
  7. for file in client.list(hdfs_path):
  8. with client.read(hdfs_path + file) as reader:
  9. with gzip.GzipFile(fileobj=reader, mode="rb") as gz_file:
  10. for line in gz_file.read().decode("utf-8").split("\n"):
  11. split = line.split("\t")
  12. if len(split) != 4:
  13. continue
  14. cid = split[3].split("_")[0]
  15. label = split[0]
  16. score = split[2].replace("[", "").replace("]", "").split(",")[1]
  17. result.append({
  18. "cid": cid,
  19. "label": label,
  20. "score": score
  21. })
  22. return result
  23. def _main():
  24. model1_result = read_predict("/dw/recommend/model/34_ad_predict_data/20241004_351_0927_1003_1000/")
  25. model2_result = read_predict("/dw/recommend/model/34_ad_predict_data/20241004_351_0927_1003_1000/")
  26. m1 = pd.DataFrame(model1_result)
  27. g1 = m1.groupby("cid").agg(count=('cid', 'size'), average_value=('score', 'mean'))
  28. # 获取出现次数最多的十个 cid
  29. most_common_cid1 = g1.nlargest(10, 'count')
  30. print(most_common_cid1)
  31. m2 = pd.DataFrame(model2_result)
  32. g2 = m2.groupby("cid").agg(count=('cid', 'size'), average_value=('score', 'mean'))
  33. # 获取出现次数最多的十个 cid
  34. most_common_cid2 = g2.nlargest(10, 'count')
  35. print(most_common_cid2)
  36. if __name__ == '__main__':
  37. # parser = argparse.ArgumentParser(description="model_predict_analyse.py")
  38. # parser.add_argument("-p", "--predict_path_list", type=list, help="config file path")
  39. # args = parser.parse_args()
  40. #
  41. # predict_path_list = args.predict_path_list
  42. # # 判断参数是否正常
  43. # if len(predict_path_list) != 2:
  44. # sys.exit(1)
  45. _main()