Prechádzať zdrojové kódy

feat:添加评估结果分析脚本

zhaohaipeng 1 rok pred
rodič
commit
235126269c
2 zmenil súbory, kde vykonal 89 pridanie a 21 odobranie
  1. 60 5
      ad/01_ad_model_update.sh
  2. 29 16
      ad/model_predict_analyse.py

+ 60 - 5
ad/01_ad_model_update.sh

@@ -45,7 +45,13 @@ model_local_path=/root/zhaohp/XGB
 start_time=$(date +%s)
 # 线上模型在HDFS中的路径
 online_model_path=`cat ${model_path_file}`
-
+# 评测结果保存路径,后续需要根据此文件评估是否要更新模型
+predict_analyse_file_path=""
+# 保存模型评估的分析结果
+old_incr_rate_avg=0
+new_incr_rate_avg=0
+old_incr_rate_list=""
+new_incr_rate_list=""
 
 # 校验命令的退出码
 check_run_status() {
@@ -100,6 +106,7 @@ init() {
   predict_date_path=${BUCKET_FEATURE_PATH}/${today_early_1}
   new_model_predict_result_path=${PREDICT_RESULT_SAVE_PATH}/${today_early_1}_351_1000_${train_first_day: -4}_${train_last_day: -4}
   online_model_predict_result_path=${PREDICT_RESULT_SAVE_PATH}/${today_early_1}_351_1000_${online_model_path: -9}
+  predict_analyse_file_path=${model_local_path}/predict_analyse_file/${today_early_1}_351_1000_analyse.txt
 
   echo "init param train_data_path: ${train_data_path}"
   echo "init param predict_date_path: ${predict_date_path}"
@@ -111,6 +118,7 @@ init() {
   echo "init param model_name: ${model_name}"
   echo "init param model_local_path: ${model_local_path}"
   echo "init param model_oss_path: ${MODEL_OSS_PATH}"
+  echo "init param predict_analyse_file_path: ${predict_analyse_file_path}"
 
   echo "当前Python环境安装的Python版本: $(python --version)"
   echo "当前Python环境安装的三方包: $(python -m pip list)"
@@ -183,6 +191,48 @@ xgb_train() {
   check_run_status $return_code $step_start_time "XGB模型训练任务"
 }
 
+#######################################
+# Average the relative prediction error over the first rows of the analysis
+# file written by model_predict_analyse.py.
+# At most 10 data rows are used; the header row (the line containing "cid")
+# is skipped. Field 7 is abs((old-true)/true), field 8 is abs((new-true)/true).
+# Globals:
+#   predict_analyse_file_path, step_start_time (read)
+#   old_incr_rate_list, new_incr_rate_list (written; ';'-joined per-row values)
+#   old_incr_rate_avg, new_incr_rate_avg (written; mean of the rows used)
+# Returns:
+#   0 on success, 1 when no usable data row was found.
+#######################################
+calc_model_predict() {
+  local count=0
+  local max_line=10
+  local old_total_diff=0
+  local new_total_diff=0
+  local return_code
+  local line
+  local -a numbers
+
+  # Reset the lists so a repeated call never appends to stale values.
+  old_incr_rate_list=""
+  new_incr_rate_list=""
+
+  # Check the row limit before reading so no extra line is consumed, and keep
+  # a final line that has no trailing newline (the Python writer emits none).
+  while [[ ${count} -lt ${max_line} ]] && { read -r line || [[ -n "${line}" ]]; }; do
+
+    # Skip the header row: it is the line that contains "cid".
+    if [[ "${line}" == *"cid"* ]]; then
+      continue
+    fi
+
+    read -r -a numbers <<< "${line}"
+
+    # Ignore blank or truncated rows instead of feeding empty operands to bc.
+    if [[ ${#numbers[@]} -lt 8 ]]; then
+      continue
+    fi
+
+    if [[ ${count} -eq 0 ]]; then
+      old_incr_rate_list="${numbers[6]}"
+      new_incr_rate_list="${numbers[7]}"
+    else
+      old_incr_rate_list="${old_incr_rate_list};${numbers[6]}"
+      new_incr_rate_list="${new_incr_rate_list};${numbers[7]}"
+    fi
+
+    old_total_diff=$( echo "${old_total_diff} + ${numbers[6]}" | bc -l )
+    new_total_diff=$( echo "${new_total_diff} + ${numbers[7]}" | bc -l )
+
+    count=$((count + 1))
+
+  done < "${predict_analyse_file_path}"
+
+  # A missing/unreadable file or a file without data rows leaves count at 0;
+  # dividing by it would silently produce empty averages.
+  if [[ ${count} -eq 0 ]]; then
+    check_run_status 1 $step_start_time "计算Top10差异"
+    # Reached only if check_run_status returns instead of exiting.
+    return 1
+  fi
+
+  old_incr_rate_avg=$( echo "scale=6; ${old_total_diff} / ${count}" | bc -l )
+  return_code=$?
+  check_run_status $return_code $step_start_time "计算Top10差异"
+
+  new_incr_rate_avg=$( echo "scale=6; ${new_total_diff} / ${count}" | bc -l )
+  return_code=$?
+  check_run_status $return_code $step_start_time "计算Top10差异"
+
+  return 0
+}
+
 model_predict() {
 
   # 线上模型评估最新的数据
@@ -207,10 +257,15 @@ model_predict() {
   local return_code=$?
   check_run_status $return_code $step_start_time "线上模型评估${predict_date_path: -8}的数据"
 
-  local mean_abs_diff=$(python ${sh_path}/model_predict_analyse.py -p ${online_model_predict_result_path} ${new_model_predict_result_path})
-  if (( $(echo "${mean_abs_diff} > 0.000400" | bc -l ) ));then
-    check_run_status 1 $step_start_time "线上模型评估${predict_date_path: -8}的数据,绝对误差大于0.000400,请检查"
-    echo "线上模型评估${predict_date_path: -8}的数据,绝对误差大于0.000400,请检查"
+  # Analyse the results. The script writes its table to the file given by -f
+  # and prints nothing, so pass the path and test the exit status, not stdout.
+  local python_return_code
+  mkdir -p "$(dirname "${predict_analyse_file_path}")"
+  python ${sh_path}/model_predict_analyse.py -p ${online_model_predict_result_path} ${new_model_predict_result_path} -f "${predict_analyse_file_path}"
+  python_return_code=$?
+  check_run_status $python_return_code $step_start_time "线上模型评估${predict_date_path: -8}的数据"
+
+  # Stop here if the averages could not be computed; the threshold test below
+  # would otherwise run against the initial value 0 and always pass.
+  calc_model_predict || exit 1
+
+  # Compare the new model's average relative error, not the ';'-joined list.
+  if (( $(echo "${new_incr_rate_avg} > 0.100000" | bc -l ) ));then 
+    check_run_status 1 $step_start_time "线上模型评估${predict_date_path: -8}的数据,绝对误差大于0.1,请检查"
+    echo "线上模型评估${predict_date_path: -8}的数据,绝对误差大于0.1,请检查"
     exit 1
   fi 
 }

+ 29 - 16
ad/model_predict_analyse.py

@@ -18,7 +18,7 @@ def read_predict(hdfs_path: str) -> list:
                     if len(split) != 4:
                         continue
                     cid = split[3].split("_")[0]
-                    label = split[0]
+                    label = int(split[0])
                     score = float(split[2].replace("[", "").replace("]", "").split(",")[1])
 
                     result.append({
@@ -30,7 +30,7 @@ def read_predict(hdfs_path: str) -> list:
     return result
 
 
-def _main(model1_predict_path: str, model2_predict_path: str):
+def _main(model1_predict_path: str, model2_predict_path: str, file: str):
     if not model1_predict_path.endswith("/"):
         model1_predict_path += "/"
 
@@ -45,34 +45,47 @@ def _main(model1_predict_path: str, model2_predict_path: str):
     model2_result = read_predict(model2_predict_path)
 
     m1 = pd.DataFrame(model1_result)
-    g1 = m1.groupby("cid").agg(count=('cid', 'size'), average_value=('score', lambda x: round(x.mean(), 6)))
-    # 获取出现次数最多的十个 cid
-    most_common_cid1 = g1.nlargest(10, 'count')
+    g1 = m1.groupby("cid").agg(
+        view=('cid', 'size'),
+        conv=('label', 'sum'),
+        old_score_avg=('score', lambda x: round(x.mean(), 6))
+    ).reset_index()
+
+    g1['true'] = g1['conv'] / g1['view']
 
     m2 = pd.DataFrame(model2_result)
-    g2 = m2.groupby("cid").agg(count=('cid', 'size'), average_value=('score', lambda x: round(x.mean(), 6)))
-    # 获取出现次数最多的十个 cid
-    most_common_cid2 = g2.nlargest(10, 'count')
+    g2 = m2.groupby("cid").agg(
+        new_score_avg=('score', lambda x: round(x.mean(), 6))
+    )
+
+    merged = pd.merge(g1, g2, on='cid', how='left')
+    merged.fillna(0, inplace=True)
+
+    merged["abs((new-true)/true)"] = abs(
+        (merged['new_score_avg'] - merged['true']) / merged['true']
+    ).mask(merged['true'] == 0, 0)
 
-    # 合并两个 DataFrame,按 'cid' 匹配
-    merged = pd.merge(most_common_cid1, most_common_cid2, on='cid', suffixes=('_m1', '_m2'))
+    merged["abs((old-true)/true)"] = abs(
+        (merged['old_score_avg'] - merged['true']) / merged['true']
+    ).mask(merged['true'] == 0, 0)
 
-    # 计算 'average_value' 的差值绝对值,并保留六位小数
-    merged['score_diff'] = (merged['average_value_m1'] - merged['average_value_m2']).abs().round(6)
+    merged = merged[['cid', 'view', "conv", "true", "old_score_avg", "new_score_avg",
+                     "abs((old-true)/true)", "abs((new-true)/true)"]]
+    merged = merged.sort_values(by=['view'], ascending=False)
 
-    # 计算差值的平均值,并保留六位小数
-    mean_abs_diff = round(merged['score_diff'].mean(), 6)
-    print(mean_abs_diff)
+    with open(file, "w") as writer:
+        writer.write(merged.to_string(index=False))
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="model_predict_analyse.py")
     parser.add_argument("-p", "--predict_path_list", nargs='*',
                         help="模型评估结果保存路径,第一个为老模型评估结果,第二个为新模型评估结果")
+    parser.add_argument("-f", "--file", help="最后计算结果的保存路径")
     args = parser.parse_args()
 
     predict_path_list = args.predict_path_list
     # 判断参数是否正常
     if len(predict_path_list) != 2:
         sys.exit(1)
-    _main(predict_path_list[0], predict_path_list[1])
+    # _main opens args.file for writing only after both prediction sets have
+    # been read and aggregated; reject a missing -f/--file up front so the run
+    # fails immediately, with the same exit status as the path-count check.
+    if not args.file:
+        print("missing required -f/--file output path", file=sys.stderr)
+        sys.exit(1)
+    _main(predict_path_list[0], predict_path_list[1], args.file)