hai 8 meses · 235126269c
--- a/ad/01_ad_model_update.sh
+++ b/ad/01_ad_model_update.sh
@@ -45,7 +45,13 @@ model_local_path=/root/zhaohp/XGB
 
				 start_time=$(date +%s)
			
 
				 # 线上模型在HDFS中的路径
			
 
				 online_model_path=`cat ${model_path_file}`
			
 
				-
			
 
				+# 评测结果保存路径，后续需要根据此文件评估是否要更新模型
			
 
				+predict_analyse_file_path=""
			
 
				+# 保存模型评估的分析结果
			
 
				+old_incr_rate_avg=0
			
 
				+new_incr_rate_avg=0
			
 
				+old_incr_rate_list=""
			
 
				+new_incr_rate_list=""
			
 
				 
			
 
				 # 校验命令的退出码
			
 
				 check_run_status() {
			
@@ -100,6 +106,7 @@ init() {
 
				   predict_date_path=${BUCKET_FEATURE_PATH}/${today_early_1}
			
 
				   new_model_predict_result_path=${PREDICT_RESULT_SAVE_PATH}/${today_early_1}_351_1000_${train_first_day: -4}_${train_last_day: -4}
			
 
				   online_model_predict_result_path=${PREDICT_RESULT_SAVE_PATH}/${today_early_1}_351_1000_${online_model_path: -9}
			
 
				+  predict_analyse_file_path=${model_local_path}/predict_analyse_file/${today_early_1}_351_1000_analyse.txt
			
 
				 
			
 
				   echo "init param train_data_path: ${train_data_path}"
			
 
				   echo "init param predict_date_path: ${predict_date_path}"
			
@@ -111,6 +118,7 @@ init() {
 
				   echo "init param model_name: ${model_name}"
			
 
				   echo "init param model_local_path: ${model_local_path}"
			
 
				   echo "init param model_oss_path: ${MODEL_OSS_PATH}"
			
 
				+  echo "init param predict_analyse_file_path: ${predict_analyse_file_path}"
			
 
				 
			
 
				   echo "当前Python环境安装的Python版本: $(python --version)"
			
 
				   echo "当前Python环境安装的三方包: $(python -m pip list)"
			
@@ -183,6 +191,48 @@ xgb_train() {
 
				   check_run_status $return_code $step_start_time "XGB模型训练任务"
			
 
				 }
			
 
				 
			
 
				+calc_model_predict() {
			
 
				+  local count=0
			
 
				+  local max_line=10
			
 
				+  local old_total_diff=0
			
 
				+  local new_total_diff=0
			
 
				+  while read -r line && [ ${count} -lt ${max_line} ]; do
			
 
				+
			
 
				+      # 使用 ! 取反判断，只有当行中不包含 "cid" 时才执行继续的逻辑
			
 
				+      if [[ "${line}" == *"cid"* ]]; then
			
 
				+          continue
			
 
				+      fi
			
 
				+
			
 
				+      read -a numbers <<< "${line}"
			
 
				+
			
 
				+      if [[ -z ${old_diff_abs} ]];then
			
 
				+          old_incr_rate_list="${numbers[6]}"
			
 
				+          new_incr_rate_list="${numbers[7]}"
			
 
				+      else
			
 
				+          old_incr_rate_list="${old_incr_rate_list};${numbers[6]}"
			
 
				+          new_incr_rate_list="${new_incr_rate_list};${numbers[7]}"
			
 
				+      fi 
			
 
				+
			
 
				+      old_total_diff=$( echo "${old_total_diff} + ${numbers[6]}" | bc -l )
			
 
				+      new_total_diff=$( echo "${new_total_diff} + ${numbers[7]}" | bc -l )
			
 
				+
			
 
				+      count=$((${count} + 1))
			
 
				+
			
 
				+  done < "${predict_analyse_file_path}"
			
 
				+
			
 
				+  local return_code=$?
			
 
				+  check_run_status $return_code $step_start_time "计算Top10差异"
			
 
				+
			
 
				+  old_incr_rate_avg=$( echo "scale=6; ${old_total_diff} / ${count}" | bc -l )
			
 
				+  return_code=$?
			
 
				+  check_run_status $return_code $step_start_time "计算Top10差异"
			
 
				+
			
 
				+
			
 
				+  new_incr_rate_list=$( echo "scale=6; ${new_total_diff} / ${count}" | bc l )
			
 
				+  return_code=$?
			
 
				+  check_run_status $return_code $step_start_time "计算Top10差异"
			
 
				+}
			
 
				+
			
 
				 model_predict() {
			
 
				 
			
 
				   # 线上模型评估最新的数据
			
@@ -207,10 +257,15 @@ model_predict() {
 
				   local return_code=$?
			
 
				   check_run_status $return_code $step_start_time "线上模型评估${predict_date_path: -8}的数据"
			
 
				 
			
 
				-  local mean_abs_diff=$(python ${sh_path}/model_predict_analyse.py -p ${online_model_predict_result_path} ${new_model_predict_result_path})
			
 
				-  if (( $(echo "${mean_abs_diff} > 0.000400" | bc -l ) ));then
			
 
				-    check_run_status 1 $step_start_time "线上模型评估${predict_date_path: -8}的数据，绝对误差大于0.000400，请检查"
			
 
				-    echo "线上模型评估${predict_date_path: -8}的数据，绝对误差大于0.000400，请检查"
			
 
				+  # 结果分析
			
 
				+  local python_return_code=$(python ${sh_path}/model_predict_analyse.py -p ${online_model_predict_result_path} ${new_model_predict_result_path})
			
 
				+  check_run_status $python_return_code $step_start_time "线上模型评估${predict_date_path: -8}的数据"
			
 
				+
			
 
				+  calc_model_predict
			
 
				+
			
 
				+  if (( $(echo "${new_incr_rate_list} > 0.100000" | bc -l ) ));then 
			
 
				+    check_run_status 1 $step_start_time "线上模型评估${predict_date_path: -8}的数据，绝对误差大于0.1，请检查"
			
 
				+    echo "线上模型评估${predict_date_path: -8}的数据，绝对误差大于0.1，请检查"
			
 
				     exit 1
			
 
				   fi 
			
 
				 }
			
--- a/ad/model_predict_analyse.py
+++ b/ad/model_predict_analyse.py
@@ -18,7 +18,7 @@ def read_predict(hdfs_path: str) -> list:
 
				                     if len(split) != 4:
			
 
				                         continue
			
 
				                     cid = split[3].split("_")[0]
			
 
				-                    label = split[0]
			
 
				+                    label = int(split[0])
			
 
				                     score = float(split[2].replace("[", "").replace("]", "").split(",")[1])
			
 
				 
			
 
				                     result.append({
			
@@ -30,7 +30,7 @@ def read_predict(hdfs_path: str) -> list:
 
				     return result
			
 
				 
			
 
				 
			
 
				-def _main(model1_predict_path: str, model2_predict_path: str):
			
 
				+def _main(model1_predict_path: str, model2_predict_path: str, file: str):
			
 
				     if not model1_predict_path.endswith("/"):
			
 
				         model1_predict_path += "/"
			
 
				 
			
@@ -45,34 +45,47 @@ def _main(model1_predict_path: str, model2_predict_path: str):
 
				     model2_result = read_predict(model2_predict_path)
			
 
				 
			
 
				     m1 = pd.DataFrame(model1_result)
			
 
				-    g1 = m1.groupby("cid").agg(count=('cid', 'size'), average_value=('score', lambda x: round(x.mean(), 6)))
			
 
				-    # 获取出现次数最多的十个 cid
			
 
				-    most_common_cid1 = g1.nlargest(10, 'count')
			
 
				+    g1 = m1.groupby("cid").agg(
			
 
				+        view=('cid', 'size'),
			
 
				+        conv=('label', 'sum'),
			
 
				+        old_score_avg=('score', lambda x: round(x.mean(), 6))
			
 
				+    ).reset_index()
			
 
				+
			
 
				+    g1['true'] = g1['conv'] / g1['view']
			
 
				 
			
 
				     m2 = pd.DataFrame(model2_result)
			
 
				-    g2 = m2.groupby("cid").agg(count=('cid', 'size'), average_value=('score', lambda x: round(x.mean(), 6)))
			
 
				-    # 获取出现次数最多的十个 cid
			
 
				-    most_common_cid2 = g2.nlargest(10, 'count')
			
 
				+    g2 = m2.groupby("cid").agg(
			
 
				+        new_score_avg=('score', lambda x: round(x.mean(), 6))
			
 
				+    )
			
 
				+
			
 
				+    merged = pd.merge(g1, g2, on='cid', how='left')
			
 
				+    merged.fillna(0, inplace=True)
			
 
				+
			
 
				+    merged["abs((new-true)/true)"] = abs(
			
 
				+        (merged['new_score_avg'] - merged['true']) / merged['true']
			
 
				+    ).mask(merged['true'] == 0, 0)
			
 
				 
			
 
				-    # 合并两个 DataFrame，按 'cid' 匹配
			
 
				-    merged = pd.merge(most_common_cid1, most_common_cid2, on='cid', suffixes=('_m1', '_m2'))
			
 
				+    merged["abs((old-true)/true)"] = abs(
			
 
				+        (merged['old_score_avg'] - merged['true']) / merged['true']
			
 
				+    ).mask(merged['true'] == 0, 0)
			
 
				 
			
 
				-    # 计算 'average_value' 的差值绝对值，并保留六位小数
			
 
				-    merged['score_diff'] = (merged['average_value_m1'] - merged['average_value_m2']).abs().round(6)
			
 
				+    merged = merged[['cid', 'view', "conv", "true", "old_score_avg", "new_score_avg",
			
 
				+                     "abs((old-true)/true)", "abs((new-true)/true)"]]
			
 
				+    merged = merged.sort_values(by=['view'], ascending=False)
			
 
				 
			
 
				-    # 计算差值的平均值，并保留六位小数
			
 
				-    mean_abs_diff = round(merged['score_diff'].mean(), 6)
			
 
				-    print(mean_abs_diff)
			
 
				+    with open(file, "w") as writer:
			
 
				+        writer.write(merged.to_string(index=False))
			
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				     parser = argparse.ArgumentParser(description="model_predict_analyse.py")
			
 
				     parser.add_argument("-p", "--predict_path_list", nargs='*',
			
 
				                         help="模型评估结果保存路径，第一个为老模型评估结果，第二个为新模型评估结果")
			
 
				+    parser.add_argument("-f", "--file", help="最后计算结果的保存路径")
			
 
				     args = parser.parse_args()
			
 
				 
			
 
				     predict_path_list = args.predict_path_list
			
 
				     # 判断参数是否正常
			
 
				     if len(predict_path_list) != 2:
			
 
				         sys.exit(1)
			
 
				-    _main(predict_path_list[0], predict_path_list[1])
			
 
				+    _main(predict_path_list[0], predict_path_list[1], args.file)