|  | @@ -18,7 +18,7 @@ def read_predict(hdfs_path: str) -> list:
 | 
	
		
			
				|  |  |                      if len(split) != 4:
 | 
	
		
			
				|  |  |                          continue
 | 
	
		
			
				|  |  |                      cid = split[3].split("_")[0]
 | 
	
		
			
				|  |  | -                    label = split[0]
 | 
	
		
			
				|  |  | +                    label = int(split[0])
 | 
	
		
			
				|  |  |                      score = float(split[2].replace("[", "").replace("]", "").split(",")[1])
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |                      result.append({
 | 
	
	
		
			
				|  | @@ -30,7 +30,7 @@ def read_predict(hdfs_path: str) -> list:
 | 
	
		
			
				|  |  |      return result
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -def _main(model1_predict_path: str, model2_predict_path: str):
 | 
	
		
			
				|  |  | +def _main(model1_predict_path: str, model2_predict_path: str, file: str):
 | 
	
		
			
				|  |  |      if not model1_predict_path.endswith("/"):
 | 
	
		
			
				|  |  |          model1_predict_path += "/"
 | 
	
		
			
				|  |  |  
 | 
	
	
		
			
				|  | @@ -45,34 +45,47 @@ def _main(model1_predict_path: str, model2_predict_path: str):
 | 
	
		
			
				|  |  |      model2_result = read_predict(model2_predict_path)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      m1 = pd.DataFrame(model1_result)
 | 
	
		
			
				|  |  | -    g1 = m1.groupby("cid").agg(count=('cid', 'size'), average_value=('score', lambda x: round(x.mean(), 6)))
 | 
	
		
			
				|  |  | -    # 获取出现次数最多的十个 cid
 | 
	
		
			
				|  |  | -    most_common_cid1 = g1.nlargest(10, 'count')
 | 
	
		
			
				|  |  | +    g1 = m1.groupby("cid").agg(
 | 
	
		
			
				|  |  | +        view=('cid', 'size'),
 | 
	
		
			
				|  |  | +        conv=('label', 'sum'),
 | 
	
		
			
				|  |  | +        old_score_avg=('score', lambda x: round(x.mean(), 6))
 | 
	
		
			
				|  |  | +    ).reset_index()
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    g1['true'] = g1['conv'] / g1['view']
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      m2 = pd.DataFrame(model2_result)
 | 
	
		
			
				|  |  | -    g2 = m2.groupby("cid").agg(count=('cid', 'size'), average_value=('score', lambda x: round(x.mean(), 6)))
 | 
	
		
			
				|  |  | -    # 获取出现次数最多的十个 cid
 | 
	
		
			
				|  |  | -    most_common_cid2 = g2.nlargest(10, 'count')
 | 
	
		
			
				|  |  | +    g2 = m2.groupby("cid").agg(
 | 
	
		
			
				|  |  | +        new_score_avg=('score', lambda x: round(x.mean(), 6))
 | 
	
		
			
				|  |  | +    )
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    merged = pd.merge(g1, g2, on='cid', how='left')
 | 
	
		
			
				|  |  | +    merged.fillna(0, inplace=True)
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    merged["abs((new-true)/true)"] = abs(
 | 
	
		
			
				|  |  | +        (merged['new_score_avg'] - merged['true']) / merged['true']
 | 
	
		
			
				|  |  | +    ).mask(merged['true'] == 0, 0)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    # 合并两个 DataFrame,按 'cid' 匹配
 | 
	
		
			
				|  |  | -    merged = pd.merge(most_common_cid1, most_common_cid2, on='cid', suffixes=('_m1', '_m2'))
 | 
	
		
			
				|  |  | +    merged["abs((old-true)/true)"] = abs(
 | 
	
		
			
				|  |  | +        (merged['old_score_avg'] - merged['true']) / merged['true']
 | 
	
		
			
				|  |  | +    ).mask(merged['true'] == 0, 0)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    # 计算 'average_value' 的差值绝对值,并保留六位小数
 | 
	
		
			
				|  |  | -    merged['score_diff'] = (merged['average_value_m1'] - merged['average_value_m2']).abs().round(6)
 | 
	
		
			
				|  |  | +    merged = merged[['cid', 'view', "conv", "true", "old_score_avg", "new_score_avg",
 | 
	
		
			
				|  |  | +                     "abs((old-true)/true)", "abs((new-true)/true)"]]
 | 
	
		
			
				|  |  | +    merged = merged.sort_values(by=['view'], ascending=False)
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  | -    # 计算差值的平均值,并保留六位小数
 | 
	
		
			
				|  |  | -    mean_abs_diff = round(merged['score_diff'].mean(), 6)
 | 
	
		
			
				|  |  | -    print(mean_abs_diff)
 | 
	
		
			
				|  |  | +    with open(file, "w") as writer:
 | 
	
		
			
				|  |  | +        writer.write(merged.to_string(index=False))
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |  if __name__ == '__main__':
 | 
	
		
			
				|  |  |      parser = argparse.ArgumentParser(description="model_predict_analyse.py")
 | 
	
		
			
				|  |  |      parser.add_argument("-p", "--predict_path_list", nargs='*',
 | 
	
		
			
				|  |  |                          help="模型评估结果保存路径,第一个为老模型评估结果,第二个为新模型评估结果")
 | 
	
		
			
				|  |  | +    parser.add_argument("-f", "--file", help="最后计算结果的保存路径")
 | 
	
		
			
				|  |  |      args = parser.parse_args()
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      predict_path_list = args.predict_path_list
 | 
	
		
			
				|  |  |      # 判断参数是否正常
 | 
	
		
			
				|  |  |      if len(predict_path_list) != 2:
 | 
	
		
			
				|  |  |          sys.exit(1)
 | 
	
		
			
				|  |  | -    _main(predict_path_list[0], predict_path_list[1])
 | 
	
		
			
				|  |  | +    _main(predict_path_list[0], predict_path_list[1], args.file)
 |