# algorithm/recommend-emr-dataprocess
							import argparse
import gzip
import sys

import pandas as pd
from hdfs import InsecureClient

# HDFS client shared by read_predict() for listing and reading files.
# The cluster URL and the user name are hard-coded here.
client = InsecureClient("http://master-1-1.c-7f31a3eea195cb73.cn-hangzhou.emr.aliyuncs.com:9870", user="spark")


def read_predict(hdfs_path: str) -> list:
    """Load prediction records from every gzip file listed under ``hdfs_path``.

    Each entry returned by ``client.list`` is read through the module-level
    ``client`` and decompressed as gzip. Lines are tab-separated; a line is
    used only when it has exactly four fields:

    * field 0 is kept verbatim as ``label``;
    * field 2 is a bracketed, comma-separated list whose second element is
      parsed as the float ``score``;
    * field 3 yields ``cid``, the text before its first underscore.

    Field 1 is ignored, and lines with any other field count are skipped.

    Args:
        hdfs_path: HDFS directory holding the prediction files. A trailing
            ``/`` is optional.

    Returns:
        A list of ``{"cid": str, "label": str, "score": float}`` dicts, in
        listing order and then line order.

    Raises:
        IndexError: If the score field of a four-field line has fewer than
            two comma-separated elements.
        ValueError: If the selected score element is not a valid float.
    """
    # Normalise once so file names can be appended directly; callers are not
    # required to pass a path that already ends with "/".
    base_path = hdfs_path if hdfs_path.endswith("/") else hdfs_path + "/"

    result = []
    for file_name in client.list(hdfs_path):
        with client.read(base_path + file_name) as reader, \
                gzip.GzipFile(fileobj=reader, mode="rb") as gz_file:
            # Stream line by line instead of decompressing, decoding and
            # splitting the whole file in memory at once.
            for raw_line in gz_file:
                fields = raw_line.decode("utf-8").rstrip("\n").split("\t")
                if len(fields) != 4:
                    continue

                # Score field looks like "[a,b]"; the second element is used.
                score_items = fields[2].replace("[", "").replace("]", "").split(",")
                result.append({
                    "cid": fields[3].split("_")[0],
                    "label": fields[0],
                    "score": float(score_items[1]),
                })

    return result


def _top_cid_stats(records: list, top_n: int = 10) -> pd.DataFrame:
    """Return record count and mean score (6 decimals) for the ``top_n`` most frequent cids."""
    frame = pd.DataFrame(records)
    grouped = frame.groupby("cid").agg(
        count=('cid', 'size'),
        average_value=('score', lambda x: round(x.mean(), 6)),
    )
    return grouped.nlargest(top_n, 'count')


def _main(model1_predict_path: str, model2_predict_path: str):
    """Print the mean absolute gap in per-cid average score between two models.

    For each model the prediction records are loaded with ``read_predict``,
    grouped by ``cid``, and reduced to the ten cids with the most records.
    The two top-ten tables are inner-joined on ``cid``; for every cid present
    in both, the absolute difference of the average scores is taken, and the
    mean of those differences (rounded to six decimals) is printed.  When the
    two top-ten sets share no cid the printed value is ``nan``.

    Args:
        model1_predict_path: HDFS directory with the first model's predictions.
        model2_predict_path: HDFS directory with the second model's predictions.

    Raises:
        ValueError: If either directory yields no prediction records.
    """
    if not model1_predict_path.endswith("/"):
        model1_predict_path += "/"

    if not model2_predict_path.endswith("/"):
        model2_predict_path += "/"

    model1_result = read_predict(model1_predict_path)
    model2_result = read_predict(model2_predict_path)

    # An empty record list would build a column-less DataFrame and fail later
    # with an opaque KeyError on "cid"; report the real problem instead.
    for path, records in ((model1_predict_path, model1_result),
                          (model2_predict_path, model2_result)):
        if not records:
            raise ValueError(f"no prediction records found under {path}")

    # Ten most frequent cids of each model, with their average score.
    most_common_cid1 = _top_cid_stats(model1_result)
    most_common_cid2 = _top_cid_stats(model2_result)

    # Join the two tables on 'cid'; only cids present in both are kept.
    merged = pd.merge(most_common_cid1, most_common_cid2, on='cid', suffixes=('_m1', '_m2'))

    # Absolute difference of the average scores, kept to six decimals.
    merged['score_diff'] = (merged['average_value_m1'] - merged['average_value_m2']).abs().round(6)

    # Mean of the differences, kept to six decimals.
    mean_abs_diff = round(merged['score_diff'].mean(), 6)
    print(mean_abs_diff)


if __name__ == '__main__':
    # Command-line entry point: expects exactly two paths after -p, the first
    # for the old model's evaluation output and the second for the new model's.
    parser = argparse.ArgumentParser(description="model_predict_analyse.py")
    parser.add_argument("-p", "--predict_path_list", nargs='*',
                        help="模型评估结果保存路径，第一个为老模型评估结果，第二个为新模型评估结果")
    args = parser.parse_args()

    # When -p is omitted entirely argparse leaves the attribute as None;
    # treat that the same as an empty list so the length check below applies.
    predict_path_list = args.predict_path_list or []

    # Validate the arguments; keep exit status 1 but say why we are exiting.
    if len(predict_path_list) != 2:
        parser.print_usage(sys.stderr)
        sys.exit("error: -p/--predict_path_list requires exactly two paths")
    _main(predict_path_list[0], predict_path_list[1])