Просмотр исходного кода

feat(指标分析): rosn 口径对齐回流 + 新增 MAPE/样本数/方差

- rosn COPC 和预测均值限定 is_return_noself=1,与 rosn_real 对齐
- 新增 rosn MAPE(相对误差)
- 新增 str_samples/rosn_samples/rovn_samples 样本数
- 02 文件额外包含方差字段

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 1 месяц назад
Родитель
Сommit
9adaefdc38

+ 265 - 0
tasks/指标分析/01_实验组xTop20视频_vs对照组.sql

@@ -0,0 +1,265 @@
+-- 预处理:解析 scoresmap + page 分类
+-- v4: 新增 top20 vid 分组 + GROUPING SETS + 曝光占比
+-- v5: 新增相对对照组的变化率字段
+-- v6: 新增 rosn_ori(未校准原始分)对比校准后的 rosn_pred
+WITH t_raw AS
+(
+    -- One dt partition of exposure samples for apptype 0/4 and buckets ab0..ab9
+    -- that carry a scoresMap entry inside extend_alg.
+    SELECT  *
+            -- Strip every backslash from the extracted scoresMap text so that later
+            -- GET_JSON_OBJECT calls can read it (presumably an escaped nested JSON string - confirm).
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            -- Coarse page bucket; the next CTE keeps only the "推荐" bucket.
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- This allow-list already excludes every other bucket (including "ab100"),
+    -- so no additional NOT IN filter is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Filter: keep recommendation pages only
+,t_filtered AS
+(
+    SELECT  r.*
+    FROM    t_raw r
+    WHERE   r.page_type = "推荐"
+)
+-- Feature extraction and dimension mapping
+,t_base AS
+(
+    SELECT
+        dt,
+        apptype,
+        -- Collapse raw buckets into named experiment arms. Only apptype "4" is split;
+        -- every other row falls into "其他".
+        CASE
+            WHEN apptype = "4" THEN
+                CASE
+                    WHEN abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+                END
+            ELSE "其他"
+        END AS abcode,
+        page_type AS page,
+        mid,
+        vid,
+        is_share,
+        share_cnt,
+        is_return_1,
+        is_return_n,
+        is_return_noself,
+        return_1_uv,
+        return_n_uv,
+        return_n_uv_noself,
+        new_exposure_cnt,
+        flowpool,
+        scoresmap,
+        -- Scores read from the cleaned scoresMap JSON
+        CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred,
+        -- Calibrated rosn score: 1.22 * raw^1.15 applied to NorXGBScore
+        1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred,
+        -- Raw (uncalibrated) NorXGBScore
+        CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS rosn_ori,
+        CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat,
+        GET_JSON_OBJECT(v1_feature,'$.title') AS vid_title
+    FROM t_filtered
+)
+,t_valid AS
+(
+    -- Keep rows whose str_pred and rosn_pred are both non-NULL.
+    -- NOTE(review): rosn_stat is not checked here, so it may still be NULL downstream.
+    SELECT  b.*
+    FROM    t_base b
+    WHERE   b.str_pred IS NOT NULL
+    AND     b.rosn_pred IS NOT NULL
+)
+-- Rank vids by exposure count within each (dt, apptype, abcode)
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            -- vid is a tie-breaker: without it, vids with equal exposure counts are ordered
+            -- arbitrarily and the rank cut-off can pick different vids on each run.
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC, vid) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+,t_top5_vid AS
+(
+    -- Despite the "top5" name, this keeps the 20 best-ranked vids per (dt, apptype, abcode).
+    SELECT  r.dt,
+            r.apptype,
+            r.abcode,
+            r.vid,
+            r.vid_rank
+    FROM    t_vid_rank r
+    WHERE   r.vid_rank <= 20
+)
+-- Tag each sample with its top-20 vid; the tag columns are NULL for vids outside the top 20
+,t_with_top5 AS
+(
+    SELECT  s.*,
+            CASE WHEN tv.vid IS NOT NULL THEN s.vid END AS top5_vid,
+            CASE WHEN tv.vid IS NOT NULL THEN s.vid_title END AS top5_vid_title,
+            tv.vid_rank AS top5_vid_rank
+    FROM    t_valid s
+    LEFT JOIN t_top5_vid tv
+        ON  s.dt = tv.dt
+        AND s.apptype = tv.apptype
+        AND s.abcode = tv.abcode
+        AND s.vid = tv.vid
+)
+-- Aggregate metrics per experiment arm and per arm x top-20 vid
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            -- top5_vid is NULL on the per-arm total row, which is labelled 'all'
+            ,COALESCE(CAST(top5_vid AS STRING), 'all') AS vid
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_rank) END AS vid_rank
+            -- COPC = observed mean / predicted mean; the rosn variants only use rows with is_return_noself = 1
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN rosn_pred END), 0), 4) AS rosn_copc
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN rosn_ori END), 0), 4) AS rosn_ori_copc
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN rosn_stat END), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_ori), 0), 4) AS rovn_ori_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions vs. observed values
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN rosn_pred END),6) AS rosn_pred
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN rosn_ori END),6) AS rosn_ori
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN rosn_stat END),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_ori), 6) AS rovn_ori
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Mean absolute error: str (all rows)
+            ,round(AVG(ABS(str_pred - is_return_noself)),6) AS str_mae
+            -- Mean absolute error: rosn (only rows with is_return_noself = 1)
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_pred - return_n_uv_noself) END),6) AS rosn_pred_mae
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_ori - return_n_uv_noself) END),6) AS rosn_ori_mae
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_stat - return_n_uv_noself) END),6) AS rosn_stat_mae
+            -- Mean absolute error: rovn (all rows)
+            ,round(AVG(ABS(str_pred * rosn_pred - return_n_uv_noself)),6) AS rovn_pred_mae
+            ,round(AVG(ABS(str_pred * rosn_ori - return_n_uv_noself)),6) AS rovn_ori_mae
+            ,round(AVG(ABS(str_pred * rosn_stat - return_n_uv_noself)),6) AS rovn_stat_mae
+            -- MAPE: relative error (rosn only, rows with is_return_noself = 1).
+            -- NULLIF drops rows whose observed value is 0 from the average instead of dividing by zero.
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_pred - return_n_uv_noself) / NULLIF(return_n_uv_noself, 0) END),6) AS rosn_pred_mape
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_ori - return_n_uv_noself) / NULLIF(return_n_uv_noself, 0) END),6) AS rosn_ori_mape
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_stat - return_n_uv_noself) / NULLIF(return_n_uv_noself, 0) END),6) AS rosn_stat_mape
+            -- Sample counts behind each metric family
+            ,COUNT(1) AS str_samples
+            ,SUM(CASE WHEN is_return_noself = 1 THEN 1 ELSE 0 END) AS rosn_samples
+            ,COUNT(1) AS rovn_samples
+            -- Business metrics. A zero denominator becomes NULL via NULLIF and is then reported as 0 by COALESCE.
+            ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            -- Raw counts
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top5
+    GROUP BY dt, apptype, abcode, top5_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top5_vid)
+    )
+    -- Keep the per-arm total row and the top-20 vid rows; drop the detail group
+    -- that lumps together all samples outside the top 20 (top5_vid IS NULL).
+    HAVING  top5_vid IS NOT NULL OR GROUPING(top5_vid) = 1
+)
+-- Attach control-group ("对照组") baselines of the same dt / apptype / vid to every row
+,t_with_baseline AS
+(
+    SELECT
+        *,
+        -- Exposure share: exp divided by the exp of the arm's 'all' row
+        round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct,
+        -- Control-group baselines: business metrics
+        MAX(CASE WHEN abcode = '对照组' THEN exp_per_dau END) OVER (PARTITION BY dt, apptype, vid) AS exp_per_dau_base,
+        MAX(CASE WHEN abcode = '对照组' THEN str_one END) OVER (PARTITION BY dt, apptype, vid) AS str_one_base,
+        MAX(CASE WHEN abcode = '对照组' THEN ros_one END) OVER (PARTITION BY dt, apptype, vid) AS ros_one_base,
+        MAX(CASE WHEN abcode = '对照组' THEN str END) OVER (PARTITION BY dt, apptype, vid) AS str_base,
+        MAX(CASE WHEN abcode = '对照组' THEN ros END) OVER (PARTITION BY dt, apptype, vid) AS ros_base,
+        MAX(CASE WHEN abcode = '对照组' THEN str_plus END) OVER (PARTITION BY dt, apptype, vid) AS str_plus_base,
+        MAX(CASE WHEN abcode = '对照组' THEN ros_minus END) OVER (PARTITION BY dt, apptype, vid) AS ros_minus_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rovn END) OVER (PARTITION BY dt, apptype, vid) AS rovn_base,
+        MAX(CASE WHEN abcode = '对照组' THEN vovh24 END) OVER (PARTITION BY dt, apptype, vid) AS vovh24_base,
+        -- Control-group baselines: COPC metrics
+        MAX(CASE WHEN abcode = '对照组' THEN str_copc END) OVER (PARTITION BY dt, apptype, vid) AS str_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rosn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rosn_ori_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_ori_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rovn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rovn_ori_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_ori_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rovn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_stat_copc_base,
+        -- Control-group baselines: observed values
+        MAX(CASE WHEN abcode = '对照组' THEN str_real END) OVER (PARTITION BY dt, apptype, vid) AS str_real_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rosn_real END) OVER (PARTITION BY dt, apptype, vid) AS rosn_real_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rovn_real END) OVER (PARTITION BY dt, apptype, vid) AS rovn_real_base,
+        -- Control-group baselines: counts
+        MAX(CASE WHEN abcode = '对照组' THEN dau END) OVER (PARTITION BY dt, apptype, vid) AS dau_base,
+        MAX(CASE WHEN abcode = '对照组' THEN exp END) OVER (PARTITION BY dt, apptype, vid) AS exp_base,
+        MAX(CASE WHEN abcode = '对照组' THEN is_share END) OVER (PARTITION BY dt, apptype, vid) AS is_share_base,
+        MAX(CASE WHEN abcode = '对照组' THEN share_cnt END) OVER (PARTITION BY dt, apptype, vid) AS share_cnt_base,
+        MAX(CASE WHEN abcode = '对照组' THEN is_return_1 END) OVER (PARTITION BY dt, apptype, vid) AS is_return_1_base,
+        MAX(CASE WHEN abcode = '对照组' THEN return_n_uv END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_base,
+        MAX(CASE WHEN abcode = '对照组' THEN viewh24 END) OVER (PARTITION BY dt, apptype, vid) AS viewh24_base,
+        MAX(CASE WHEN abcode = '对照组' THEN return_n_uv_noself END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_noself_base
+    FROM t_agg
+)
+-- Final output: aggregated metrics plus change rates relative to the control group
+SELECT
+    dt,
+    apptype,
+    abcode,
+    vid,
+    vid_title,
+    vid_rank,
+    exp_pct,
+    round((dau - dau_base) / NULLIF(dau_base, 0), 4) AS dau_chg,
+    round((exp - exp_base) / NULLIF(exp_base, 0), 4) AS exp_chg,
+    -- COPC
+    str_copc, rosn_copc, rosn_ori_copc, rosn_stat_copc, rovn_copc, rovn_ori_copc, rovn_stat_copc,
+    -- Model predictions vs. observed values
+    str_real, str_pred, rosn_real, rosn_pred, rosn_ori, rosn_stat,
+    rovn_real, rovn_pred, rovn_ori, rovn_stat,
+    str_mae, rosn_pred_mae, rosn_ori_mae, rosn_stat_mae, rovn_pred_mae, rovn_ori_mae, rovn_stat_mae,
+    rosn_pred_mape, rosn_ori_mape, rosn_stat_mape,
+    str_samples, rosn_samples, rovn_samples,
+    -- Business metrics
+    exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24,
+    -- Counts
+    dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself,
+    -- ========== Change rates: (value - control baseline) / control baseline ==========
+    -- Business-metric change rates
+    round((exp_per_dau - exp_per_dau_base) / NULLIF(exp_per_dau_base, 0), 4) AS exp_per_dau_chg,
+    round((str_one - str_one_base) / NULLIF(str_one_base, 0), 4) AS str_one_chg,
+    round((ros_one - ros_one_base) / NULLIF(ros_one_base, 0), 4) AS ros_one_chg,
+    round((str - str_base) / NULLIF(str_base, 0), 4) AS str_chg,
+    round((ros - ros_base) / NULLIF(ros_base, 0), 4) AS ros_chg,
+    round((str_plus - str_plus_base) / NULLIF(str_plus_base, 0), 4) AS str_plus_chg,
+    round((ros_minus - ros_minus_base) / NULLIF(ros_minus_base, 0), 4) AS ros_minus_chg,
+    round((rovn - rovn_base) / NULLIF(rovn_base, 0), 4) AS rovn_chg,
+    round((vovh24 - vovh24_base) / NULLIF(vovh24_base, 0), 4) AS vovh24_chg,
+    -- COPC change rates
+    round((str_copc - str_copc_base) / NULLIF(str_copc_base, 0), 4) AS str_copc_chg,
+    round((rosn_copc - rosn_copc_base) / NULLIF(rosn_copc_base, 0), 4) AS rosn_copc_chg,
+    round((rosn_ori_copc - rosn_ori_copc_base) / NULLIF(rosn_ori_copc_base, 0), 4) AS rosn_ori_copc_chg,
+    round((rosn_stat_copc - rosn_stat_copc_base) / NULLIF(rosn_stat_copc_base, 0), 4) AS rosn_stat_copc_chg,
+    round((rovn_copc - rovn_copc_base) / NULLIF(rovn_copc_base, 0), 4) AS rovn_copc_chg,
+    round((rovn_ori_copc - rovn_ori_copc_base) / NULLIF(rovn_ori_copc_base, 0), 4) AS rovn_ori_copc_chg,
+    round((rovn_stat_copc - rovn_stat_copc_base) / NULLIF(rovn_stat_copc_base, 0), 4) AS rovn_stat_copc_chg,
+    -- Observed-value change rates
+    round((str_real - str_real_base) / NULLIF(str_real_base, 0), 4) AS str_real_chg,
+    round((rosn_real - rosn_real_base) / NULLIF(rosn_real_base, 0), 4) AS rosn_real_chg,
+    round((rovn_real - rovn_real_base) / NULLIF(rovn_real_base, 0), 4) AS rovn_real_chg,
+    -- Count change rates
+    round((is_share - is_share_base) / NULLIF(is_share_base, 0), 4) AS is_share_chg,
+    round((share_cnt - share_cnt_base) / NULLIF(share_cnt_base, 0), 4) AS share_cnt_chg,
+    round((is_return_1 - is_return_1_base) / NULLIF(is_return_1_base, 0), 4) AS is_return_1_chg,
+    round((return_n_uv - return_n_uv_base) / NULLIF(return_n_uv_base, 0), 4) AS return_n_uv_chg,
+    round((viewh24 - viewh24_base) / NULLIF(viewh24_base, 0), 4) AS viewh24_chg,
+    round((return_n_uv_noself - return_n_uv_noself_base) / NULLIF(return_n_uv_noself_base, 0), 4) AS return_n_uv_noself_chg
+FROM t_with_baseline
+-- NOTE(review): some engines require a LIMIT together with ORDER BY unless a session flag is set - confirm run settings.
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 274 - 0
tasks/指标分析/02_实验组xTop20视频_vs对照组_误差分析.sql

@@ -0,0 +1,274 @@
+-- 预处理:解析 scoresmap + page 分类
+-- v4: 新增 top20 vid 分组 + GROUPING SETS + 曝光占比
+-- v5: 新增相对对照组的变化率字段
+-- v6: 新增 rosn_ori(未校准原始分)对比校准后的 rosn_pred
+WITH t_raw AS
+(
+    -- One dt partition of exposure samples for apptype 0/4 and buckets ab0..ab9
+    -- that carry a scoresMap entry inside extend_alg.
+    SELECT  *
+            -- Strip every backslash from the extracted scoresMap text so that later
+            -- GET_JSON_OBJECT calls can read it (presumably an escaped nested JSON string - confirm).
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            -- Coarse page bucket; the next CTE keeps only the "推荐" bucket.
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- This allow-list already excludes every other bucket (including "ab100"),
+    -- so no additional NOT IN filter is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Filter: keep recommendation pages only
+,t_filtered AS
+(
+    SELECT  r.*
+    FROM    t_raw r
+    WHERE   r.page_type = "推荐"
+)
+-- Feature extraction and dimension mapping
+,t_base AS
+(
+    SELECT
+        dt,
+        apptype,
+        -- Collapse raw buckets into named experiment arms. Only apptype "4" is split;
+        -- every other row falls into "其他".
+        CASE
+            WHEN apptype = "4" THEN
+                CASE
+                    WHEN abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+                END
+            ELSE "其他"
+        END AS abcode,
+        page_type AS page,
+        mid,
+        vid,
+        is_share,
+        share_cnt,
+        is_return_1,
+        is_return_n,
+        is_return_noself,
+        return_1_uv,
+        return_n_uv,
+        return_n_uv_noself,
+        new_exposure_cnt,
+        flowpool,
+        scoresmap,
+        -- Scores read from the cleaned scoresMap JSON
+        CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred,
+        -- Calibrated rosn score: 1.22 * raw^1.15 applied to NorXGBScore
+        1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred,
+        -- Raw (uncalibrated) NorXGBScore
+        CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS rosn_ori,
+        CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat,
+        GET_JSON_OBJECT(v1_feature,'$.title') AS vid_title
+    FROM t_filtered
+)
+,t_valid AS
+(
+    -- Keep rows whose str_pred and rosn_pred are both non-NULL.
+    -- NOTE(review): rosn_stat is not checked here, so it may still be NULL downstream.
+    SELECT  b.*
+    FROM    t_base b
+    WHERE   b.str_pred IS NOT NULL
+    AND     b.rosn_pred IS NOT NULL
+)
+-- Rank vids by exposure count within each (dt, apptype, abcode)
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            -- vid is a tie-breaker: without it, vids with equal exposure counts are ordered
+            -- arbitrarily and the rank cut-off can pick different vids on each run.
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC, vid) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+,t_top5_vid AS
+(
+    -- Despite the "top5" name, this keeps the 20 best-ranked vids per (dt, apptype, abcode).
+    SELECT  r.dt,
+            r.apptype,
+            r.abcode,
+            r.vid,
+            r.vid_rank
+    FROM    t_vid_rank r
+    WHERE   r.vid_rank <= 20
+)
+-- Tag each sample with its top-20 vid; the tag columns are NULL for vids outside the top 20
+,t_with_top5 AS
+(
+    SELECT  s.*,
+            CASE WHEN tv.vid IS NOT NULL THEN s.vid END AS top5_vid,
+            CASE WHEN tv.vid IS NOT NULL THEN s.vid_title END AS top5_vid_title,
+            tv.vid_rank AS top5_vid_rank
+    FROM    t_valid s
+    LEFT JOIN t_top5_vid tv
+        ON  s.dt = tv.dt
+        AND s.apptype = tv.apptype
+        AND s.abcode = tv.abcode
+        AND s.vid = tv.vid
+)
+-- Aggregate metrics (including error variance) per experiment arm and per arm x top-20 vid
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            -- top5_vid is NULL on the per-arm total row, which is labelled 'all'
+            ,COALESCE(CAST(top5_vid AS STRING), 'all') AS vid
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_rank) END AS vid_rank
+            -- COPC = observed mean / predicted mean; the rosn variants only use rows with is_return_noself = 1
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN rosn_pred END), 0), 4) AS rosn_copc
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN rosn_ori END), 0), 4) AS rosn_ori_copc
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN rosn_stat END), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_ori), 0), 4) AS rovn_ori_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions vs. observed values
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN rosn_pred END),6) AS rosn_pred
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN rosn_ori END),6) AS rosn_ori
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN rosn_stat END),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_ori), 6) AS rovn_ori
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Mean absolute error: str (all rows)
+            ,round(AVG(ABS(str_pred - is_return_noself)),6) AS str_mae
+            -- Mean absolute error: rosn (only rows with is_return_noself = 1)
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_pred - return_n_uv_noself) END),6) AS rosn_pred_mae
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_ori - return_n_uv_noself) END),6) AS rosn_ori_mae
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_stat - return_n_uv_noself) END),6) AS rosn_stat_mae
+            -- Mean absolute error: rovn (all rows)
+            ,round(AVG(ABS(str_pred * rosn_pred - return_n_uv_noself)),6) AS rovn_pred_mae
+            ,round(AVG(ABS(str_pred * rosn_ori - return_n_uv_noself)),6) AS rovn_ori_mae
+            ,round(AVG(ABS(str_pred * rosn_stat - return_n_uv_noself)),6) AS rovn_stat_mae
+            -- MAPE: relative error (rosn only, rows with is_return_noself = 1).
+            -- NULLIF drops rows whose observed value is 0 from the average instead of dividing by zero.
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_pred - return_n_uv_noself) / NULLIF(return_n_uv_noself, 0) END),6) AS rosn_pred_mape
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_ori - return_n_uv_noself) / NULLIF(return_n_uv_noself, 0) END),6) AS rosn_ori_mape
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_stat - return_n_uv_noself) / NULLIF(return_n_uv_noself, 0) END),6) AS rosn_stat_mape
+            -- Variance of the signed error: large = predictions swing high and low, small = a stable offset
+            ,round(VARIANCE(str_pred - is_return_noself),6) AS str_var
+            ,round(VARIANCE(CASE WHEN is_return_noself = 1 THEN rosn_pred - return_n_uv_noself END),6) AS rosn_pred_var
+            ,round(VARIANCE(CASE WHEN is_return_noself = 1 THEN rosn_ori - return_n_uv_noself END),6) AS rosn_ori_var
+            ,round(VARIANCE(CASE WHEN is_return_noself = 1 THEN rosn_stat - return_n_uv_noself END),6) AS rosn_stat_var
+            ,round(VARIANCE(str_pred * rosn_pred - return_n_uv_noself),6) AS rovn_pred_var
+            ,round(VARIANCE(str_pred * rosn_ori - return_n_uv_noself),6) AS rovn_ori_var
+            ,round(VARIANCE(str_pred * rosn_stat - return_n_uv_noself),6) AS rovn_stat_var
+            -- Sample counts behind each metric family
+            ,COUNT(1) AS str_samples
+            ,SUM(CASE WHEN is_return_noself = 1 THEN 1 ELSE 0 END) AS rosn_samples
+            ,COUNT(1) AS rovn_samples
+            -- Business metrics. A zero denominator becomes NULL via NULLIF and is then reported as 0 by COALESCE.
+            ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            -- Raw counts
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top5
+    GROUP BY dt, apptype, abcode, top5_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top5_vid)
+    )
+    -- Keep the per-arm total row and the top-20 vid rows; drop the detail group
+    -- that lumps together all samples outside the top 20 (top5_vid IS NULL).
+    HAVING  top5_vid IS NOT NULL OR GROUPING(top5_vid) = 1
+)
+-- Attach control-group ("对照组") baselines of the same dt / apptype / vid to every row
+,t_with_baseline AS
+(
+    SELECT
+        *,
+        -- Exposure share: exp divided by the exp of the arm's 'all' row
+        round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct,
+        -- Control-group baselines: business metrics
+        MAX(CASE WHEN abcode = '对照组' THEN exp_per_dau END) OVER (PARTITION BY dt, apptype, vid) AS exp_per_dau_base,
+        MAX(CASE WHEN abcode = '对照组' THEN str_one END) OVER (PARTITION BY dt, apptype, vid) AS str_one_base,
+        MAX(CASE WHEN abcode = '对照组' THEN ros_one END) OVER (PARTITION BY dt, apptype, vid) AS ros_one_base,
+        MAX(CASE WHEN abcode = '对照组' THEN str END) OVER (PARTITION BY dt, apptype, vid) AS str_base,
+        MAX(CASE WHEN abcode = '对照组' THEN ros END) OVER (PARTITION BY dt, apptype, vid) AS ros_base,
+        MAX(CASE WHEN abcode = '对照组' THEN str_plus END) OVER (PARTITION BY dt, apptype, vid) AS str_plus_base,
+        MAX(CASE WHEN abcode = '对照组' THEN ros_minus END) OVER (PARTITION BY dt, apptype, vid) AS ros_minus_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rovn END) OVER (PARTITION BY dt, apptype, vid) AS rovn_base,
+        MAX(CASE WHEN abcode = '对照组' THEN vovh24 END) OVER (PARTITION BY dt, apptype, vid) AS vovh24_base,
+        -- Control-group baselines: COPC metrics
+        MAX(CASE WHEN abcode = '对照组' THEN str_copc END) OVER (PARTITION BY dt, apptype, vid) AS str_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rosn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rosn_ori_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_ori_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rovn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rovn_ori_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_ori_copc_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rovn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_stat_copc_base,
+        -- Control-group baselines: observed values
+        MAX(CASE WHEN abcode = '对照组' THEN str_real END) OVER (PARTITION BY dt, apptype, vid) AS str_real_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rosn_real END) OVER (PARTITION BY dt, apptype, vid) AS rosn_real_base,
+        MAX(CASE WHEN abcode = '对照组' THEN rovn_real END) OVER (PARTITION BY dt, apptype, vid) AS rovn_real_base,
+        -- Control-group baselines: counts
+        MAX(CASE WHEN abcode = '对照组' THEN dau END) OVER (PARTITION BY dt, apptype, vid) AS dau_base,
+        MAX(CASE WHEN abcode = '对照组' THEN exp END) OVER (PARTITION BY dt, apptype, vid) AS exp_base,
+        MAX(CASE WHEN abcode = '对照组' THEN is_share END) OVER (PARTITION BY dt, apptype, vid) AS is_share_base,
+        MAX(CASE WHEN abcode = '对照组' THEN share_cnt END) OVER (PARTITION BY dt, apptype, vid) AS share_cnt_base,
+        MAX(CASE WHEN abcode = '对照组' THEN is_return_1 END) OVER (PARTITION BY dt, apptype, vid) AS is_return_1_base,
+        MAX(CASE WHEN abcode = '对照组' THEN return_n_uv END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_base,
+        MAX(CASE WHEN abcode = '对照组' THEN viewh24 END) OVER (PARTITION BY dt, apptype, vid) AS viewh24_base,
+        MAX(CASE WHEN abcode = '对照组' THEN return_n_uv_noself END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_noself_base
+    FROM t_agg
+)
+-- Final output: aggregated metrics (with error variance) plus change rates relative to the control group
+SELECT
+    dt,
+    apptype,
+    abcode,
+    vid,
+    vid_title,
+    vid_rank,
+    exp_pct,
+    round((dau - dau_base) / NULLIF(dau_base, 0), 4) AS dau_chg,
+    round((exp - exp_base) / NULLIF(exp_base, 0), 4) AS exp_chg,
+    -- COPC
+    str_copc, rosn_copc, rosn_ori_copc, rosn_stat_copc, rovn_copc, rovn_ori_copc, rovn_stat_copc,
+    -- Model predictions vs. observed values
+    str_real, str_pred, rosn_real, rosn_pred, rosn_ori, rosn_stat,
+    rovn_real, rovn_pred, rovn_ori, rovn_stat,
+    str_mae, rosn_pred_mae, rosn_ori_mae, rosn_stat_mae, rovn_pred_mae, rovn_ori_mae, rovn_stat_mae,
+    str_var, rosn_pred_var, rosn_ori_var, rosn_stat_var, rovn_pred_var, rovn_ori_var, rovn_stat_var,
+    rosn_pred_mape, rosn_ori_mape, rosn_stat_mape,
+    str_samples, rosn_samples, rovn_samples,
+    -- Business metrics
+    exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24,
+    -- Counts
+    dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself,
+    -- ========== Change rates: (value - control baseline) / control baseline ==========
+    -- Business-metric change rates
+    round((exp_per_dau - exp_per_dau_base) / NULLIF(exp_per_dau_base, 0), 4) AS exp_per_dau_chg,
+    round((str_one - str_one_base) / NULLIF(str_one_base, 0), 4) AS str_one_chg,
+    round((ros_one - ros_one_base) / NULLIF(ros_one_base, 0), 4) AS ros_one_chg,
+    round((str - str_base) / NULLIF(str_base, 0), 4) AS str_chg,
+    round((ros - ros_base) / NULLIF(ros_base, 0), 4) AS ros_chg,
+    round((str_plus - str_plus_base) / NULLIF(str_plus_base, 0), 4) AS str_plus_chg,
+    round((ros_minus - ros_minus_base) / NULLIF(ros_minus_base, 0), 4) AS ros_minus_chg,
+    round((rovn - rovn_base) / NULLIF(rovn_base, 0), 4) AS rovn_chg,
+    round((vovh24 - vovh24_base) / NULLIF(vovh24_base, 0), 4) AS vovh24_chg,
+    -- COPC change rates
+    round((str_copc - str_copc_base) / NULLIF(str_copc_base, 0), 4) AS str_copc_chg,
+    round((rosn_copc - rosn_copc_base) / NULLIF(rosn_copc_base, 0), 4) AS rosn_copc_chg,
+    round((rosn_ori_copc - rosn_ori_copc_base) / NULLIF(rosn_ori_copc_base, 0), 4) AS rosn_ori_copc_chg,
+    round((rosn_stat_copc - rosn_stat_copc_base) / NULLIF(rosn_stat_copc_base, 0), 4) AS rosn_stat_copc_chg,
+    round((rovn_copc - rovn_copc_base) / NULLIF(rovn_copc_base, 0), 4) AS rovn_copc_chg,
+    round((rovn_ori_copc - rovn_ori_copc_base) / NULLIF(rovn_ori_copc_base, 0), 4) AS rovn_ori_copc_chg,
+    round((rovn_stat_copc - rovn_stat_copc_base) / NULLIF(rovn_stat_copc_base, 0), 4) AS rovn_stat_copc_chg,
+    -- Observed-value change rates
+    round((str_real - str_real_base) / NULLIF(str_real_base, 0), 4) AS str_real_chg,
+    round((rosn_real - rosn_real_base) / NULLIF(rosn_real_base, 0), 4) AS rosn_real_chg,
+    round((rovn_real - rovn_real_base) / NULLIF(rovn_real_base, 0), 4) AS rovn_real_chg,
+    -- Count change rates
+    round((is_share - is_share_base) / NULLIF(is_share_base, 0), 4) AS is_share_chg,
+    round((share_cnt - share_cnt_base) / NULLIF(share_cnt_base, 0), 4) AS share_cnt_chg,
+    round((is_return_1 - is_return_1_base) / NULLIF(is_return_1_base, 0), 4) AS is_return_1_chg,
+    round((return_n_uv - return_n_uv_base) / NULLIF(return_n_uv_base, 0), 4) AS return_n_uv_chg,
+    round((viewh24 - viewh24_base) / NULLIF(viewh24_base, 0), 4) AS viewh24_chg,
+    round((return_n_uv_noself - return_n_uv_noself_base) / NULLIF(return_n_uv_noself_base, 0), 4) AS return_n_uv_noself_chg
+FROM t_with_baseline
+-- NOTE(review): some engines require a LIMIT together with ORDER BY unless a session flag is set - confirm run settings.
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 14 - 0
tasks/指标分析/verify_avg_null.sql

@@ -0,0 +1,14 @@
+-- Check whether AVG skips NULL inputs.
+-- The CASE turns val 3 and 4 into NULL. If NULLs are skipped, avg_case_null = 1.5 ((1+2)/2, cnt_case = 2);
+-- if NULLs were counted as rows it would be 0.75 ((1+2)/4, cnt_all = 4). avg_all is 2.5 either way.
+SELECT
+    AVG(nums.val) AS avg_all,
+    AVG(CASE WHEN nums.val <= 2 THEN nums.val END) AS avg_case_null,
+    COUNT(1) AS cnt_all,
+    COUNT(CASE WHEN nums.val <= 2 THEN nums.val END) AS cnt_case
+FROM (
+    SELECT 1 AS val
+    UNION ALL
+    SELECT 2
+    UNION ALL
+    SELECT 3
+    UNION ALL
+    SELECT 4
+) nums
+;