import gzip
import posixpath

import pandas as pd
from hdfs import InsecureClient

client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")


def read_predict(hdfs_path: str) -> list:
    """Read every gzip-compressed prediction file under an HDFS directory.

    Each file is decoded as UTF-8 text with one tab-separated record per
    line. Lines that do not have exactly four fields are skipped.

    Args:
        hdfs_path: HDFS directory to list. A trailing slash is optional.

    Returns:
        A list of dicts with string values under the keys ``"cid"``
        (field 3 up to its first underscore), ``"label"`` (field 0) and
        ``"score"`` (second comma-separated element of field 2 once the
        square brackets are removed).

    Raises:
        IndexError: If field 2 of a four-field line holds fewer than two
            comma-separated elements.
    """
    result = []
    for file_name in client.list(hdfs_path):
        # posixpath.join keeps the path valid whether or not hdfs_path
        # ends with "/"; plain concatenation required the trailing slash.
        file_path = posixpath.join(hdfs_path, file_name)
        with client.read(file_path) as reader:
            with gzip.GzipFile(fileobj=reader, mode="rb") as gz_file:
                # Stream line by line instead of decompressing and decoding
                # the whole file into memory at once.
                for raw_line in gz_file:
                    fields = raw_line.decode("utf-8").rstrip("\n").split("\t")
                    if len(fields) != 4:
                        continue

                    cid = fields[3].split("_")[0]
                    label = fields[0]
                    score = fields[2].replace("[", "").replace("]", "").split(",")[1]

                    result.append({
                        "cid": cid,
                        "label": label,
                        "score": score,
                    })

    return result


def _top_cids_by_count(records: list, limit: int = 10) -> pd.DataFrame:
    """Return the ``limit`` most frequent cids with their mean score.

    Args:
        records: Dicts as produced by ``read_predict``.
        limit: Number of cids to keep.

    Returns:
        A DataFrame indexed by ``cid`` with the columns ``count`` (number of
        records for the cid) and ``average_value`` (mean of ``score``).
    """
    # Naming the columns explicitly keeps groupby("cid") working when
    # ``records`` is empty (a DataFrame built from [] has no columns).
    frame = pd.DataFrame(records, columns=["cid", "label", "score"])
    # Scores are parsed as strings; "mean" over a string column either
    # raises or produces a wrong value, so cast to float first.
    frame["score"] = frame["score"].astype(float)
    grouped = frame.groupby("cid").agg(count=('cid', 'size'), average_value=('score', 'mean'))
    return grouped.nlargest(limit, 'count')


def _main():
    """Print the ten most frequent cids and their mean score for two models."""
    # NOTE(review): both models currently read the same HDFS directory, so
    # the two printed tables are identical - confirm the intended path for
    # the second model.
    model1_result = read_predict("/dw/recommend/model/34_ad_predict_data/20241004_351_0927_1003_1000/")
    model2_result = read_predict("/dw/recommend/model/34_ad_predict_data/20241004_351_0927_1003_1000/")

    # Get the ten cids that occur most often for the first model.
    most_common_cid1 = _top_cids_by_count(model1_result)
    print(most_common_cid1)

    # Get the ten cids that occur most often for the second model.
    most_common_cid2 = _top_cids_by_count(model2_result)
    print(most_common_cid2)


if __name__ == '__main__':
    # TODO: accept the two prediction paths on the command line (for example
    # argparse with nargs=2) and exit with an error unless exactly two are
    # given, instead of hard-coding them in _main().
    _main()