Explorar o código

feat: 新增全量 SQL 分析文件

包含表洞察、rosn分析、rosn校准、线上实验、头部视频模型指标、
低vov高曝光分析等多个任务的 SQL 查询文件。

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui hai 1 mes
pai
achega
00ffb1dd30
Modificáronse 59 ficheiros con 6636 adicións e 0 borrados
  1. 773 0
      tables/loghubods/loghubods.dwd_recsys_alg_exposure_base_20250108.sql
  2. 27 0
      tasks/00_表的洞察/loghubods.dwd_recsys_alg_sample_all_20250212/query.sql
  3. 17 0
      tasks/00_表的洞察/loghubods.dwd_recsys_alg_sample_all_20250212/query_all_hh.sql
  4. 7 0
      tasks/00_表的洞察/loghubods.opengid_base_data/01_基本数据.sql
  5. 7 0
      tasks/00_表的洞察/loghubods.unionid_base_action_data/01_基本数据.sql
  6. 9 0
      tasks/00_表的洞察/loghubods.user_share_log/00_分区范围.sql
  7. 4 0
      tasks/00_表的洞察/loghubods.user_share_log/01_基本数据.sql
  8. 11 0
      tasks/00_表的洞察/loghubods.user_share_log/02_topic_group.sql
  9. 23 0
      tasks/00_表的洞察/loghubods.user_share_log/03_topic_depth_group.sql
  10. 11 0
      tasks/00_表的洞察/loghubods.user_share_log/04_数据.sql
  11. 10 0
      tasks/00_表的洞察/loghubods.user_share_log/05_图数据.sql
  12. 11 0
      tasks/00_表的洞察/loghubods.user_share_log/05_数据.sql
  13. 4 0
      tasks/00_表的洞察/videoods.dim_user/01_基本数据.sql
  14. 4 0
      tasks/00_表的洞察/videoods.wx_video/01_基本数据.sql
  15. 165 0
      tasks/承接/rosn分析/01_实验组xTop10一级品类.sql
  16. 249 0
      tasks/承接/rosn分析/02_实验组xTop20视频_vs对照组_v2.sql
  17. 483 0
      tasks/承接/rosn分析/03_实验组xTop20视频_vs对照组_metrics.sql
  18. 254 0
      tasks/承接/rosn分析/04_实验组xTop20视频_vs对照组_mae_chg.sql
  19. 256 0
      tasks/承接/rosn分析/05_实验组xTop20视频_vs对照组_vor.sql
  20. 178 0
      tasks/承接/rosn分析/06_rosn_gcindex.sql
  21. 233 0
      tasks/承接/rosn分析/07_rosn_gcindex.sql
  22. 229 0
      tasks/承接/rosn分析/07v2_rosn_gcindex.sql
  23. 387 0
      tasks/承接/rosn分析/08_实验组xTop20视频_vs对照组_gauc.sql
  24. 46 0
      tasks/承接/rosn分析/debug_dau_scoresmap字段.sql
  25. 133 0
      tasks/承接/rosn分析/debug_dau_str校准.sql
  26. 27 0
      tasks/承接/rosn分析/debug_dau_str校准_多天.sql
  27. 67 0
      tasks/承接/rosn校准/01_原始校准数据.sql
  28. 90 0
      tasks/承接/rosn校准/02_分组校准数据.sql
  29. 76 0
      tasks/承接/rosn校准/03_label分桶校准数据.sql
  30. 210 0
      tasks/承接/头部视频模型指标分析/query.sql
  31. 96 0
      tasks/承接/线上实验/01_线上实验+模型预测_曝光+特征表 copy.sql
  32. 110 0
      tasks/承接/线上实验/01_线上实验+模型预测_曝光+特征表_v2.sql
  33. 113 0
      tasks/承接/线上实验/01_线上实验+模型预测_曝光+特征表_v3.sql
  34. 166 0
      tasks/承接/线上实验/01_线上实验+模型预测_曝光+特征表_v4.sql
  35. 165 0
      tasks/承接/线上实验/01_线上实验+模型预测_曝光+特征表_v5.sql
  36. 83 0
      tasks/承接/线上实验/01_线上实验_曝光+特征表.sql
  37. 83 0
      tasks/承接/线上实验/01_线上实验_曝光表.sql
  38. 113 0
      tasks/承接/线上实验/02_模型预测误差_仅回流样本.sql
  39. 82 0
      tasks/承接/线上实验/03_模型预测分桶验证.sql
  40. 118 0
      tasks/承接/线上实验/04_裂变率预测对比.sql
  41. 78 0
      tasks/承接/线上实验/05_str分桶ros诊断.sql
  42. 87 0
      tasks/承接/线上实验/06_str分桶ros诊断_仅有预测值.sql
  43. 96 0
      tasks/承接/线上实验/06a_str_pred分桶诊断.sql
  44. 126 0
      tasks/承接/线上实验/06a_str_pred分桶诊断_full.sql
  45. 99 0
      tasks/承接/线上实验/06c_ros_pred分桶诊断.sql
  46. 124 0
      tasks/承接/线上实验/06c_ros_pred分桶诊断_full.sql
  47. 116 0
      tasks/承接/线上实验/07_预测值覆盖率分析.sql
  48. 17 0
      tmp/低vov高曝光分析/step1_验证现象.sql
  49. 20 0
      tmp/低vov高曝光分析/step2_影响面.sql
  50. 54 0
      tmp/低vov高曝光分析/step3_原因分析.sql
  51. 51 0
      tmp/低vov高曝光分析/step3b_整体偏差.sql
  52. 39 0
      tmp/低vov高曝光分析/step5_时间趋势.sql
  53. 15 0
      tmp/低vov高曝光分析/step7_check_exp.sql
  54. 25 0
      tmp/低vov高曝光分析/step7_头部vov趋势.sql
  55. 41 0
      tmp/低vov高曝光分析/step7_头部视频时间趋势.sql
  56. 18 0
      tmp/低vov高曝光分析/step8_月度对比.sql
  57. 66 0
      tmp/低vov高曝光分析/v2_step2_统一口径.sql
  58. 74 0
      tmp/低vov高曝光分析/v3_扩展特征.sql
  59. 360 0
      tmp_sql/头部视频.sql

+ 773 - 0
tables/loghubods/loghubods.dwd_recsys_alg_exposure_base_20250108.sql

@@ -0,0 +1,773 @@
+--@exclude_input=loghubods.video_action_log_flow_new
+--@exclude_input=loghubods.user_share_log_flow
+--*********************
+-- alg_recsys_rank_labelmatch_20250108
+--*********************
+--drop table loghubods.dwd_recsys_alg_exposure_base_20250108;
+CREATE TABLE IF NOT EXISTS loghubods.dwd_recsys_alg_exposure_base_20250108 -- target of the INSERT OVERWRITE below; every column, including the counters, is declared STRING
+(
+    apptype                    STRING
+    ,uid                       STRING
+    ,mid                       STRING
+    ,vid                       STRING
+    ,sessionid                 STRING
+    ,subsessionid              STRING
+    ,pagesource                STRING
+    ,page                      STRING
+    ,recommendlogvo            STRING COMMENT '推荐算法的返回结果日志存在这个字段中'
+    ,abcode                    STRING COMMENT '推荐算法的ab分组:ab0'
+    ,recommendpagetype         STRING COMMENT '用于区分pagesource相同时某些场景的。三种回流头部;两种下滑-沉浸页下滑和feed下滑。 -pages/user-videos-share-recommend-detail 是沉浸页。'
+    ,recomtraceid              STRING COMMENT '在后端调取推荐服务之前生成。前端降级会空;后端也可能为空。'
+    ,headvideoid               STRING
+    ,rootsourceid              STRING COMMENT '区分touliu等流量,咨询产品。'
+    ,hotsencetype              STRING
+    ,flowpool                  STRING COMMENT '非流量池,是空字符串。没有null值。'
+    ,level                     STRING COMMENT '非流量池,是null。'
+    ,clientip                  STRING
+    ,machineinfo_brand         STRING
+    ,machineinfo_model         STRING
+    ,machineinfo_system        STRING
+    ,machineinfo_wechatversion STRING
+    ,machineinfo_sdkversion    STRING
+    ,province                  STRING
+    ,city                      STRING
+    ,ts                        STRING
+    ,is_share                  STRING
+    ,share_cnt                 STRING
+    ,is_return_1               STRING
+    ,return_1_pv               STRING
+    ,return_1_uv               STRING
+    ,return_1_mids             STRING
+    ,is_return_n               STRING
+    ,return_n_pv               STRING
+    ,return_n_uv               STRING
+    ,return_n_mids             STRING
+    ,is_return_noself          STRING
+    ,return_1_uv_noself        STRING
+    ,return_1_mids_noself      STRING
+    ,is_return_n_noself        STRING
+    ,return_n_uv_noself        STRING
+    ,return_n_mids_noself      STRING
+    ,new_exposure_cnt          STRING
+    ,extend                    STRING
+)
+PARTITIONED BY 
+(
+    dt                         STRING COMMENT '日期:20240105'
+    ,hh                        STRING COMMENT '小时:04'
+)
+STORED AS ALIORC
+TBLPROPERTIES ('comment' = '推荐算法-labelmatch表-20250108更新最新版')
+LIFECYCLE 3650 -- NOTE(review): MaxCompute documents LIFECYCLE in days (about 10 years here) - confirm this retention is intended
+;
+
+SET hive.exec.dynamic.partition = true
+;
+-- The flag above and the one below switch on dynamic partitioning in nonstrict mode for the PARTITION (dt,hh) insert that follows. NOTE(review): confirm the engine honours hive.* flags.
+SET hive.exec.dynamic.partition.mode = nonstrict
+;
+-- Mapper input split size for the job below. NOTE(review): unit is presumably MB - confirm against the MaxCompute flag reference.
+SET odps.stage.mapper.split.size = 1024
+;
+
+INSERT OVERWRITE TABLE loghubods.dwd_recsys_alg_exposure_base_20250108 PARTITION (dt,hh)
+WITH t_return AS -- return ("click") events on pagesource ending in -pages/user-videos-share, de-duplicated and given a synthetic id
+(
+    SELECT  *
+            ,CONCAT(dthh,":",shareid,":",vid,":",dthh_id) AS id
+    FROM    (
+                SELECT  CONCAT(year,month,day,hour) AS dthh
+                        ,apptype
+                        ,machinecode AS mid
+                        ,clickobjectid AS vid
+                        ,sessionid
+                        ,subsessionid -- NOTE: this is the subsessionid of the return itself; it is reset on every return click, so it can be used to find the exposures that follow the return.
+                        ,shareid
+                        ,rootshareid
+                        ,CAST(clienttimestamp / 1000 AS BIGINT) AS ts
+                        ,ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),apptype,machinecode,clickobjectid,sessionid,subsessionid,shareid,rootshareid ORDER BY clienttimestamp DESC ) AS rn
+                        ,ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),shareid,clickobjectid ORDER BY clienttimestamp ) AS dthh_id
+                FROM    loghubods.user_share_log_flow -- return events: there should be a single row per subsessionid, but dirty data exists, hence the rn de-duplication.
+                WHERE   CONCAT(year,month,day,hour) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 25),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') --WHERE   CONCAT(year,month,day,hour) = TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 25),'YYYYMMDDHH')
+                AND     __topic__ = 'click'
+                AND     apptype IS NOT NULL
+                AND     apptype NOT IN ('12') -- apptype 12 has pagesource h5-share / h5-detail; filtered out for now and not handled
+                AND     machinecode IS NOT NULL
+                AND     clickobjectid IS NOT NULL
+                AND     pagesource REGEXP "-pages/user-videos-share$" -- dirty rows exist (e.g. ending in vlog-gzh /mine/mine-info); anything without this suffix is dropped.
+            ) 
+    WHERE   rn = 1
+)
+,t_share_from_sharelog AS -- share events (__topic__ = 'share') for the same hour window, one row per full key (latest clienttimestamp wins)
+(
+    SELECT  *
+    FROM    (
+                SELECT  CONCAT(year,month,day,hour) AS dthh
+                        ,apptype
+                        ,machinecode AS mid
+                        ,shareobjectid AS vid
+                        ,sessionid
+                        ,subsessionid
+                        ,pagesource
+                        ,shareid
+                        ,CAST(clienttimestamp / 1000 AS BIGINT) AS ts
+                        ,ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),apptype,machinecode,shareobjectid,sessionid,subsessionid,pagesource,shareid ORDER BY clienttimestamp DESC ) AS rn
+                FROM    loghubods.user_share_log_flow
+                WHERE   CONCAT(year,month,day,hour) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 25),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') --WHERE   CONCAT(year,month,day,hour) = TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 25),'YYYYMMDDHH')
+                AND     __topic__ = 'share'
+                AND     apptype IS NOT NULL
+                AND     apptype NOT IN ('12')
+                AND     machinecode IS NOT NULL
+                AND     shareobjectid IS NOT NULL
+            ) 
+    WHERE   rn = 1
+)
+,t_exposure AS -- exposure rows read from the base view for the hours between (run hour - 25h) and (run hour - 1h), inclusive
+(
+    SELECT  dthh_id
+            ,dthh
+            ,apptype
+            ,uid
+            ,mid
+            ,vid
+            ,sessionid
+            ,subsessionid
+            ,rootsessionid_new
+            ,pagesource
+            ,recommendlogvo
+            ,abcode
+            ,recommendpagetype
+            ,recomtraceid
+            ,headvideoid
+            ,rootsourceid
+            ,hotsencetype
+            ,animationscenetype
+            ,JSON_PARSE(IF(JSON_VALID(extparams),extparams,"{}")) AS extParams -- text that is not valid JSON is replaced by an empty object before parsing
+            ,flowpool
+            ,level
+            ,clientip
+            ,machineinfo_brand
+            ,machineinfo_model
+            ,machineinfo_system
+            ,machineinfo_wechatversion
+            ,machineinfo_sdkversion
+            ,province
+            ,city
+            ,versioncode
+            ,ts
+            ,rn
+            ,id
+            ,dt
+            ,hh
+    FROM    loghubods.dwd_recsys_alg_exposure_base_view_20250402
+    WHERE   CONCAT(dt,hh) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 25),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH')
+)
+,t_exposure_recommend AS -- subset of t_exposure used for return matching: pagesource ending in category, recommend or -pages/user-videos-detail
+(
+    SELECT  *
+    FROM    t_exposure
+    WHERE   pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+)
+,t_return_exposure_1 AS -- join exposures to returns (used to compute viewh24); pass 1 matches on mid + headvideoid = return vid + subsessionid, keeping the latest return per exposure
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.id AS exposure_id
+                        ,t1.mid AS mid
+                        ,t1.vid AS vid
+                        ,t1.subsessionid AS subsessionid
+                        ,t1.sessionid AS sessionid
+                        ,t1.headvideoid AS headvideoid
+                        ,t1.dthh
+                        ,t2.id AS return_id
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.id ORDER BY t2.ts DESC ) AS rn
+                FROM    t_exposure_recommend t1
+                LEFT JOIN t_return t2
+                ON      t1.mid = t2.mid
+                AND     t1.headvideoid = t2.vid
+                AND     t1.subsessionid = t2.subsessionid
+            ) 
+    WHERE   rn = 1
+)
+,t_return_exposure_2 AS -- join exposures to returns (used to compute viewh24); pass 2 retries exposures unmatched in pass 1 on mid + headvideoid = return vid + sessionid
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.exposure_id AS exposure_id
+                        ,t1.mid AS mid
+                        ,t1.vid AS vid
+                        ,t1.subsessionid AS subsessionid
+                        ,t1.sessionid AS sessionid
+                        ,t1.headvideoid AS headvideoid
+                        ,t1.dthh
+                        ,t2.id AS return_id
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.exposure_id ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_return_exposure_1
+                            WHERE   return_id IS NULL
+                        ) t1
+                LEFT JOIN t_return t2
+                ON      t1.mid = t2.mid
+                AND     t1.headvideoid = t2.vid
+                AND     t1.sessionid = t2.sessionid
+            ) 
+    WHERE   rn = 1
+)
+,t_return_exposure_3 AS -- join exposures to returns (used to compute viewh24); pass 3 retries exposures unmatched in pass 2 on mid + subsessionid only
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.exposure_id AS exposure_id
+                        ,t1.mid AS mid
+                        ,t1.vid AS vid
+                        ,t1.subsessionid AS subsessionid
+                        ,t1.sessionid AS sessionid
+                        ,t1.headvideoid AS headvideoid
+                        ,t1.dthh
+                        ,t2.id AS return_id
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.exposure_id ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_return_exposure_2
+                            WHERE   return_id IS NULL
+                        ) t1
+                LEFT JOIN t_return t2
+                ON      t1.mid = t2.mid
+                AND     t1.subsessionid = t2.subsessionid
+            ) 
+    WHERE   rn = 1
+)
+,t_return_exposure_4 AS -- join exposures to returns (used to compute viewh24); pass 4 retries exposures unmatched in pass 3 on mid + sessionid only
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.exposure_id AS exposure_id
+                        ,t1.mid AS mid
+                        ,t1.vid AS vid
+                        ,t1.subsessionid AS subsessionid
+                        ,t1.sessionid AS sessionid
+                        ,t1.headvideoid AS headvideoid
+                        ,t1.dthh
+                        ,t2.id AS return_id
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.exposure_id ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_return_exposure_3
+                            WHERE   return_id IS NULL
+                        ) t1
+                LEFT JOIN t_return t2
+                ON      t1.mid = t2.mid
+                AND     t1.sessionid = t2.sessionid
+            ) 
+    WHERE   rn = 1
+)
+,t_return_exposure AS -- every return row plus new_exposure_cnt = number of exposures matched to it across the four passes (NULL when none matched)
+(
+    SELECT  a.*
+            ,b.exposure_cnt AS new_exposure_cnt
+    FROM    t_return a
+    LEFT JOIN   (
+                    SELECT  return_id
+                            ,COUNT(1) AS exposure_cnt
+                    FROM    (
+                                SELECT  *
+                                FROM    t_return_exposure_1
+                                WHERE   return_id IS NOT NULL
+                                UNION ALL
+                                SELECT  *
+                                FROM    t_return_exposure_2
+                                WHERE   return_id IS NOT NULL
+                                UNION ALL
+                                SELECT  *
+                                FROM    t_return_exposure_3
+                                WHERE   return_id IS NOT NULL
+                                UNION ALL
+                                SELECT  *
+                                FROM    t_return_exposure_4
+                                WHERE   return_id IS NOT NULL
+                            ) 
+                    GROUP BY return_id
+                ) b
+    ON      a.id = b.return_id
+)
+,t_normal_share_exposure_1 AS -- start of the regular share-to-exposure matching; pass 1: apptype + mid + vid + subsessionid + pagesource, exposure not later than the share, latest exposure wins
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.dthh
+                        ,t1.apptype
+                        ,t1.mid
+                        ,t1.vid
+                        ,t1.sessionid
+                        ,t1.subsessionid
+                        ,t1.pagesource
+                        ,t1.shareid
+                        ,t1.ts
+                        ,t2.id AS exposure_id
+                        ,t2.ts AS exposure_ts
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.dthh,t1.apptype,t1.mid,t1.vid,t1.sessionid,t1.subsessionid,t1.pagesource,t1.shareid ORDER BY t2.ts DESC ) AS rn
+                FROM    t_share_from_sharelog t1
+                LEFT JOIN t_exposure t2
+                ON      t1.apptype = t2.apptype
+                AND     t1.mid = t2.mid
+                AND     t1.vid = t2.vid
+                AND     t1.subsessionid = t2.subsessionid
+                AND     t1.pagesource = t2.pagesource
+                AND     t1.ts >= t2.ts
+                WHERE   t1.pagesource NOT REGEXP "pages/detail-user-videos-share-recommend$"
+            ) 
+    WHERE   rn = 1
+)
+,t_normal_share_exposure_2 AS -- pass 2: shares unmatched in pass 1, matched on sessionid instead of subsessionid (pagesource and ts constraint kept)
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.dthh
+                        ,t1.apptype
+                        ,t1.mid
+                        ,t1.vid
+                        ,t1.sessionid
+                        ,t1.subsessionid
+                        ,t1.pagesource
+                        ,t1.shareid
+                        ,t1.ts
+                        ,t2.id AS exposure_id
+                        ,t2.ts AS exposure_ts
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.dthh,t1.apptype,t1.mid,t1.vid,t1.sessionid,t1.subsessionid,t1.pagesource,t1.shareid ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_normal_share_exposure_1
+                            WHERE   exposure_id IS NULL
+                        ) t1
+                LEFT JOIN t_exposure t2
+                ON      t1.apptype = t2.apptype
+                AND     t1.mid = t2.mid
+                AND     t1.vid = t2.vid
+                AND     t1.sessionid = t2.sessionid
+                AND     t1.pagesource = t2.pagesource
+                AND     t1.ts >= t2.ts
+            ) 
+    WHERE   rn = 1
+)
+,t_normal_share_exposure_3 AS -- pass 3: shares unmatched in pass 2, matched on subsessionid + pagesource with the ts constraint dropped
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.dthh
+                        ,t1.apptype
+                        ,t1.mid
+                        ,t1.vid
+                        ,t1.sessionid
+                        ,t1.subsessionid
+                        ,t1.pagesource
+                        ,t1.shareid
+                        ,t1.ts
+                        ,t2.id AS exposure_id
+                        ,t2.ts AS exposure_ts
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.dthh,t1.apptype,t1.mid,t1.vid,t1.sessionid,t1.subsessionid,t1.pagesource,t1.shareid ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_normal_share_exposure_2
+                            WHERE   exposure_id IS NULL
+                        ) t1
+                LEFT JOIN t_exposure t2
+                ON      t1.apptype = t2.apptype
+                AND     t1.mid = t2.mid
+                AND     t1.vid = t2.vid
+                AND     t1.subsessionid = t2.subsessionid
+                AND     t1.pagesource = t2.pagesource
+            ) 
+    WHERE   rn = 1
+)
+,t_normal_share_exposure_4 AS -- pass 4: shares unmatched in pass 3, matched on sessionid + pagesource with no ts constraint
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.dthh
+                        ,t1.apptype
+                        ,t1.mid
+                        ,t1.vid
+                        ,t1.sessionid
+                        ,t1.subsessionid
+                        ,t1.pagesource
+                        ,t1.shareid
+                        ,t1.ts
+                        ,t2.id AS exposure_id
+                        ,t2.ts AS exposure_ts
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.dthh,t1.apptype,t1.mid,t1.vid,t1.sessionid,t1.subsessionid,t1.pagesource,t1.shareid ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_normal_share_exposure_3
+                            WHERE   exposure_id IS NULL
+                        ) t1
+                LEFT JOIN t_exposure t2
+                ON      t1.apptype = t2.apptype
+                AND     t1.mid = t2.mid
+                AND     t1.vid = t2.vid
+                AND     t1.sessionid = t2.sessionid
+                AND     t1.pagesource = t2.pagesource
+            ) 
+    WHERE   rn = 1
+)
+,t_normal_share_exposure_5 AS -- pass 5: shares unmatched in pass 4, matched on subsessionid with the pagesource condition dropped as well
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.dthh
+                        ,t1.apptype
+                        ,t1.mid
+                        ,t1.vid
+                        ,t1.sessionid
+                        ,t1.subsessionid
+                        ,t1.pagesource
+                        ,t1.shareid
+                        ,t1.ts
+                        ,t2.id AS exposure_id
+                        ,t2.ts AS exposure_ts
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.dthh,t1.apptype,t1.mid,t1.vid,t1.sessionid,t1.subsessionid,t1.pagesource,t1.shareid ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_normal_share_exposure_4
+                            WHERE   exposure_id IS NULL
+                        ) t1
+                LEFT JOIN t_exposure t2
+                ON      t1.apptype = t2.apptype
+                AND     t1.mid = t2.mid
+                AND     t1.vid = t2.vid
+                AND     t1.subsessionid = t2.subsessionid
+            ) 
+    WHERE   rn = 1
+)
+,t_normal_share_exposure_6 AS -- pass 6 (last): shares unmatched in pass 5, matched on sessionid only; rows still unmatched keep a NULL exposure_id
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.dthh
+                        ,t1.apptype
+                        ,t1.mid
+                        ,t1.vid
+                        ,t1.sessionid
+                        ,t1.subsessionid
+                        ,t1.pagesource
+                        ,t1.shareid
+                        ,t1.ts
+                        ,t2.id AS exposure_id
+                        ,t2.ts AS exposure_ts
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.dthh,t1.apptype,t1.mid,t1.vid,t1.sessionid,t1.subsessionid,t1.pagesource,t1.shareid ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_normal_share_exposure_5
+                            WHERE   exposure_id IS NULL
+                        ) t1
+                LEFT JOIN t_exposure t2
+                ON      t1.apptype = t2.apptype
+                AND     t1.mid = t2.mid
+                AND     t1.vid = t2.vid
+                AND     t1.sessionid = t2.sessionid
+            ) 
+    WHERE   rn = 1
+)
+,t_exposure_detail AS -- subset of t_exposure matched against non-regular shares: pagesource ending in -pages/user-videos-detail or pages/detail-recommend
+(
+    SELECT  *
+    FROM    t_exposure
+    WHERE   pagesource REGEXP "-pages/user-videos-detail$|pages/detail-recommend$"
+)
+,t_no_normal_share_exposure_1 AS -- start of the non-regular share-to-exposure matching (shares whose pagesource ends in pages/detail-user-videos-share-recommend); pass 1: subsessionid, exposure not later than the share
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.dthh
+                        ,t1.apptype
+                        ,t1.mid
+                        ,t1.vid
+                        ,t1.sessionid
+                        ,t1.subsessionid
+                        ,t1.pagesource
+                        ,t1.shareid
+                        ,t1.ts
+                        ,t2.id AS exposure_id
+                        ,t2.ts AS exposure_ts
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.dthh,t1.apptype,t1.mid,t1.vid,t1.sessionid,t1.subsessionid,t1.pagesource,t1.shareid ORDER BY t2.ts DESC ) AS rn
+                FROM    t_share_from_sharelog t1
+                LEFT JOIN t_exposure_detail t2
+                ON      t1.apptype = t2.apptype
+                AND     t1.mid = t2.mid
+                AND     t1.vid = t2.vid
+                AND     t1.subsessionid = t2.subsessionid
+                AND     t1.ts >= t2.ts
+                WHERE   t1.pagesource REGEXP "pages/detail-user-videos-share-recommend$"
+            ) 
+    WHERE   rn = 1
+)
+,t_no_normal_share_exposure_2 AS -- pass 2: non-regular shares unmatched in pass 1, matched on sessionid with the ts constraint kept
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.dthh
+                        ,t1.apptype
+                        ,t1.mid
+                        ,t1.vid
+                        ,t1.sessionid
+                        ,t1.subsessionid
+                        ,t1.pagesource
+                        ,t1.shareid
+                        ,t1.ts
+                        ,t2.id AS exposure_id
+                        ,t2.ts AS exposure_ts
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.dthh,t1.apptype,t1.mid,t1.vid,t1.sessionid,t1.subsessionid,t1.pagesource,t1.shareid ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_no_normal_share_exposure_1
+                            WHERE   exposure_id IS NULL
+                        ) t1
+                LEFT JOIN t_exposure_detail t2
+                ON      t1.apptype = t2.apptype
+                AND     t1.mid = t2.mid
+                AND     t1.vid = t2.vid
+                AND     t1.sessionid = t2.sessionid
+                AND     t1.ts >= t2.ts
+            ) 
+    WHERE   rn = 1
+)
+,t_no_normal_share_exposure_3 AS -- pass 3: non-regular shares unmatched in pass 2, matched on subsessionid with no ts constraint
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.dthh
+                        ,t1.apptype
+                        ,t1.mid
+                        ,t1.vid
+                        ,t1.sessionid
+                        ,t1.subsessionid
+                        ,t1.pagesource
+                        ,t1.shareid
+                        ,t1.ts
+                        ,t2.id AS exposure_id
+                        ,t2.ts AS exposure_ts
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.dthh,t1.apptype,t1.mid,t1.vid,t1.sessionid,t1.subsessionid,t1.pagesource,t1.shareid ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_no_normal_share_exposure_2
+                            WHERE   exposure_id IS NULL
+                        ) t1
+                LEFT JOIN t_exposure_detail t2
+                ON      t1.apptype = t2.apptype
+                AND     t1.mid = t2.mid
+                AND     t1.vid = t2.vid
+                AND     t1.subsessionid = t2.subsessionid
+            ) 
+    WHERE   rn = 1
+)
+,t_no_normal_share_exposure_4 AS -- pass 4 (last): non-regular shares unmatched in pass 3, matched on sessionid only; rows still unmatched keep a NULL exposure_id
+(
+    SELECT  *
+    FROM    (
+                SELECT  t1.dthh
+                        ,t1.apptype
+                        ,t1.mid
+                        ,t1.vid
+                        ,t1.sessionid
+                        ,t1.subsessionid
+                        ,t1.pagesource
+                        ,t1.shareid
+                        ,t1.ts
+                        ,t2.id AS exposure_id
+                        ,t2.ts AS exposure_ts
+                        ,ROW_NUMBER() OVER (PARTITION BY t1.dthh,t1.apptype,t1.mid,t1.vid,t1.sessionid,t1.subsessionid,t1.pagesource,t1.shareid ORDER BY t2.ts DESC ) AS rn
+                FROM    (
+                            SELECT  *
+                            FROM    t_no_normal_share_exposure_3
+                            WHERE   exposure_id IS NULL
+                        ) t1
+                LEFT JOIN t_exposure_detail t2
+                ON      t1.apptype = t2.apptype
+                AND     t1.mid = t2.mid
+                AND     t1.vid = t2.vid
+                AND     t1.sessionid = t2.sessionid
+            ) 
+    WHERE   rn = 1
+)
+,t_share_exposure AS -- every share exactly once, tagged with the exposure found by the first pass that matched; relies on all pass CTEs exposing the same column order
+(
+    SELECT  *
+    FROM    t_normal_share_exposure_1
+    WHERE   exposure_id IS NOT NULL
+    UNION ALL
+    SELECT  *
+    FROM    t_normal_share_exposure_2
+    WHERE   exposure_id IS NOT NULL
+    UNION ALL
+    SELECT  *
+    FROM    t_normal_share_exposure_3
+    WHERE   exposure_id IS NOT NULL
+    UNION ALL
+    SELECT  *
+    FROM    t_normal_share_exposure_4
+    WHERE   exposure_id IS NOT NULL
+    UNION ALL
+    SELECT  *
+    FROM    t_normal_share_exposure_5
+    WHERE   exposure_id IS NOT NULL
+    UNION ALL
+    SELECT  *
+    FROM    t_normal_share_exposure_6 -- last regular pass is taken unfiltered, so shares that never matched come through with a NULL exposure_id
+    UNION ALL
+    SELECT  *
+    FROM    t_no_normal_share_exposure_1
+    WHERE   exposure_id IS NOT NULL
+    UNION ALL
+    SELECT  *
+    FROM    t_no_normal_share_exposure_2
+    WHERE   exposure_id IS NOT NULL
+    UNION ALL
+    SELECT  *
+    FROM    t_no_normal_share_exposure_3
+    WHERE   exposure_id IS NOT NULL
+    UNION ALL
+    SELECT  *
+    FROM    t_no_normal_share_exposure_4 -- last non-regular pass is taken unfiltered for the same reason
+)
+,t_share_with_label AS -- each share enriched with direct-return stats (returns whose shareid equals the share) and root-return stats (returns whose rootshareid equals the share)
+(
+    SELECT  a.dthh
+            ,a.apptype -- join key
+            ,a.mid
+            ,a.vid -- join key
+            ,a.sessionid
+            ,a.subsessionid
+            ,a.pagesource
+            ,a.shareid -- join key
+            ,a.ts
+            ,a.exposure_id
+            ,COALESCE(b.return_1_pv,0) AS return_1_pv
+            ,COALESCE(b.return_1_uv,0) AS return_1_uv
+            ,b.return_1_mids AS return_1_mids -- may be NULL; whether to handle that earlier is still to be decided.
+            ,COALESCE(c.return_n_pv,0) AS return_n_pv
+            ,COALESCE(c.return_n_uv,0) AS return_n_uv
+            ,c.return_n_mids AS return_n_mids -- may be NULL; whether to handle that earlier is still to be decided.
+            ,COALESCE(c.new_exposure_cnt,0) AS new_exposure_cnt
+    FROM    t_share_exposure a
+    LEFT JOIN   (
+                    SELECT  shareid
+                            ,vid
+                            ,apptype
+                            ,COUNT(1) AS return_1_pv
+                            ,COUNT(DISTINCT mid) AS return_1_uv
+                            ,CONCAT_WS(',',COLLECT_SET(mid)) AS return_1_mids
+                    FROM    t_return
+                    GROUP BY shareid
+                             ,vid
+                             ,apptype
+                ) b
+    ON      a.shareid = b.shareid
+    AND     a.vid = b.vid
+    AND     a.apptype = b.apptype
+    LEFT JOIN   (
+                    SELECT  rootshareid
+                            ,vid
+                            ,apptype
+                            ,COUNT(1) AS return_n_pv
+                            ,COUNT(DISTINCT mid) AS return_n_uv
+                            ,CONCAT_WS(',',COLLECT_SET(mid)) AS return_n_mids
+                            ,SUM(new_exposure_cnt) AS new_exposure_cnt
+                    FROM    t_return_exposure
+                    GROUP BY rootshareid
+                             ,vid
+                             ,apptype
+                ) c
+    ON      a.shareid = c.rootshareid
+    AND     a.vid = c.vid
+    AND     a.apptype = c.apptype
+)
+,t_share_with_label_group AS -- share/return labels rolled up to one row per exposure_id; DEDUPLICATION4LIST is a UDF not defined here, presumably de-duplicating a comma-separated list - confirm
+(
+    SELECT  exposure_id
+            ,COUNT(1) AS share_cnt
+            ,SUM(return_1_pv) AS return_1_pv
+            ,COALESCE(SIZE(SPLIT(DEDUPLICATION4LIST(CONCAT_WS(',',COLLECT_LIST(return_1_mids))),",")),0) AS return_1_uv
+            ,DEDUPLICATION4LIST(CONCAT_WS(',',COLLECT_LIST(return_1_mids))) AS return_1_mids -- may be NULL
+            ,SUM(return_n_pv) AS return_n_pv
+            ,COALESCE(SIZE(SPLIT(DEDUPLICATION4LIST(CONCAT_WS(',',COLLECT_LIST(return_n_mids))),",")),0) AS return_n_uv
+            ,DEDUPLICATION4LIST(CONCAT_WS(',',COLLECT_LIST(return_n_mids))) AS return_n_mids -- may be NULL
+            ,SUM(new_exposure_cnt) AS new_exposure_cnt
+    FROM    t_share_with_label
+    GROUP BY exposure_id
+)
+,t_root_source_id_group_name AS -- one group_name per root_source_id, read from the newest partition of the mapping table
+(
+    SELECT  *
+    FROM    (
+                SELECT  root_source_id
+                        ,group_name
+                        ,ROW_NUMBER() OVER (PARTITION BY root_source_id ORDER BY group_name ) AS rn -- explicit ORDER BY: without it the surviving row per root_source_id is arbitrary and can change between runs
+                FROM    loghubods.changwen_rootsourceid_group_hour
+                WHERE   dt = MAX_PT('loghubods.changwen_rootsourceid_group_hour')
+            ) 
+    WHERE   rn = 1
+)
+,t_exposure_share_return AS -- final row shape: every exposure, left-joined to its share/return labels and to the root-source group name; column order must match the target table because the insert is positional
+(
+    SELECT  apptype
+            ,uid
+            ,mid
+            ,vid
+            ,sessionid
+            ,subsessionid
+            ,pagesource
+            ,CASE   WHEN pagesource REGEXP 'pages/user-videos-share-recommend$' THEN '回流后沉浸页&内页feed'
+                    WHEN pagesource REGEXP 'pages/detail-recommend$' THEN '详情后沉浸页'
+                    WHEN pagesource REGEXP 'pages/user-videos-share$' THEN '回流页'
+                    WHEN pagesource REGEXP 'pages/user-videos-detail$' THEN '详情页'
+                    WHEN pagesource REGEXP 'pages/category$' THEN '首页feed'
+                    ELSE '其他'
+            END AS pagesource_new
+            ,recommendlogvo -- the recommender's response log is stored in this field
+            ,abcode -- AB group assigned by the recommender
+            ,recommendpagetype -- three kinds of return head; two kinds of scroll-down (immersive page and feed)
+            ,recomtraceid
+            ,headvideoid
+            ,rootsourceid
+            ,hotsencetype
+            ,flowpool -- e.g. 14#68#3#1735262438476#2
+            ,level
+            ,clientip
+            ,machineinfo_brand
+            ,machineinfo_model
+            ,machineinfo_system
+            ,machineinfo_wechatversion
+            ,machineinfo_sdkversion
+            ,province
+            ,city
+            ,ts
+            ,IF(COALESCE(share_cnt,0) > 0,1,0) AS is_share
+            ,COALESCE(share_cnt,0) AS share_cnt
+            ,IF(COALESCE(return_1_uv,0) > 0,1,0) AS is_return_1
+            ,COALESCE(return_1_pv,0) AS return_1_pv
+            ,COALESCE(return_1_uv,0) AS return_1_uv
+            ,return_1_mids -- may be NULL
+            ,IF(COALESCE(return_n_pv,0) > 0,1,0) AS is_return_n
+            ,COALESCE(return_n_pv,0) AS return_n_pv
+            ,COALESCE(return_n_uv,0) AS return_n_uv
+            ,return_n_mids -- may be NULL
+            ,IF(COALESCE(SIZE(ARRAY_REMOVE(SPLIT(return_1_mids,","),mid)),0) > 0,1,0) AS is_return_noself -- "noself" = the exposed user's own mid is removed from the returning mids
+            ,COALESCE(SIZE(ARRAY_REMOVE(SPLIT(return_1_mids,","),mid)),0) AS return_1_uv_noself
+            ,ARRAY_JOIN(ARRAY_REMOVE(SPLIT(return_1_mids,","),mid),",") AS return_1_mids_noself
+            ,IF(COALESCE(SIZE(ARRAY_REMOVE(SPLIT(return_n_mids,","),mid)),0) > 0,1,0) AS is_return_n_noself
+            ,COALESCE(SIZE(ARRAY_REMOVE(SPLIT(return_n_mids,","),mid)),0) AS return_n_uv_noself
+            ,ARRAY_JOIN(ARRAY_REMOVE(SPLIT(return_n_mids,","),mid),",") AS return_n_mids_noself
+            ,COALESCE(new_exposure_cnt,0) AS new_exposure_cnt -- default to 0 like the other counters; exposures without any share have no row in the label CTE
+            ,JSON_FORMAT(
+                        JSON_OBJECT("animationSceneType",animationSceneType,"extParams",extParams,"rootsessionid",rootsessionid_new,"versioncode",versioncode,"group_name",tc.group_name)
+            ) AS extend
+            ,SUBSTR(dthh,1,8) AS dt -- dynamic partition values are derived from each row's own dthh
+            ,SUBSTR(dthh,9,2) AS hh
+    FROM    t_exposure ta
+    LEFT JOIN t_share_with_label_group tb
+    ON      ta.id = tb.exposure_id
+    LEFT JOIN t_root_source_id_group_name tc
+    ON      ta.rootsourceid = tc.root_source_id
+)SELECT  *
+FROM    t_exposure_share_return
+;

+ 27 - 0
tasks/00_表的洞察/loghubods.dwd_recsys_alg_sample_all_20250212/query.sql

@@ -0,0 +1,27 @@
+-- Explore loghubods.dwd_recsys_alg_sample_all_20250212.
+-- Returns up to 20 rows for hour '10' of the requested dt, showing the label
+-- columns, the score column and the scoresMap entry stored in extend_alg.
+-- NOTE(review): confirm that 'zuiyou' is a value that really occurs in apptype.
+
+SELECT
+    s.dt,
+    s.hh,
+    s.vid,
+    s.apptype,
+    s.page,
+    s.abcode,
+    s.is_share,
+    s.share_cnt,
+    s.is_return_1,
+    s.is_return_n,
+    s.is_return_noself,
+    s.return_1_uv,
+    s.return_n_uv,
+    s.return_n_uv_noself,
+    s.new_exposure_cnt,
+    s.score,
+    -- scoresMap JSON with backslash characters removed
+    REPLACE(GET_JSON_OBJECT(s.extend_alg, '$.scoresMap'), '\\', '') AS scoresmap
+FROM loghubods.dwd_recsys_alg_sample_all_20250212 s
+WHERE s.dt = '${dt}'
+  AND s.hh = '10'
+  AND s.apptype = 'zuiyou'
+  AND s.extend_alg IS NOT NULL
+LIMIT 20
+;

+ 17 - 0
tasks/00_表的洞察/loghubods.dwd_recsys_alg_sample_all_20250212/query_all_hh.sql

@@ -0,0 +1,17 @@
+-- Explore loghubods.dwd_recsys_alg_sample_all_20250212 without an hh filter.
+-- Returns up to 10 rows from any hour of the requested dt; only the first
+-- 200 characters of the (backslash-stripped) scoresMap JSON are shown.
+
+SELECT
+    s.dt,
+    s.hh,
+    s.vid,
+    s.apptype,
+    s.page,
+    s.is_share,
+    s.score,
+    SUBSTR(
+        REPLACE(GET_JSON_OBJECT(s.extend_alg, '$.scoresMap'), '\\', ''),
+        1,
+        200
+    ) AS scoresmap_preview
+FROM loghubods.dwd_recsys_alg_sample_all_20250212 s
+WHERE s.dt = '${dt}'
+  AND s.apptype = 'zuiyou'
+  AND s.extend_alg IS NOT NULL
+LIMIT 10
+;

+ 7 - 0
tasks/00_表的洞察/loghubods.opengid_base_data/01_基本数据.sql

@@ -0,0 +1,7 @@
+-- Sample rows of loghubods.opengid_base_data for the requested dt (at most 10000 rows).
+-- Usage: python fetch_daily.py tasks/00_表的洞察/loghubods.opengid_base_data/01_基本数据.sql --date 20260107
+
+SELECT
+    *
+FROM
+    loghubods.opengid_base_data
+WHERE
+    dt = '${dt}'
+LIMIT
+    10000

+ 7 - 0
tasks/00_表的洞察/loghubods.unionid_base_action_data/01_基本数据.sql

@@ -0,0 +1,7 @@
+-- Sample rows of loghubods.unionid_base_action_data for the requested dt (at most 100 rows).
+-- Usage: python fetch_daily.py tasks/00_表的洞察/loghubods.unionid_base_action_data/01_基本数据.sql --date 20260107
+
+SELECT
+    *
+FROM
+    loghubods.unionid_base_action_data
+WHERE
+    dt = '${dt}'
+LIMIT
+    100

+ 9 - 0
tasks/00_表的洞察/loghubods.user_share_log/00_分区范围.sql

@@ -0,0 +1,9 @@
+-- Range of dt values present in loghubods.user_share_log.
+-- Output: one column `dt`, two rows -- the smallest dt first, then the largest.
+-- The original put ORDER BY / LIMIT directly in front of UNION ALL without
+-- parentheses, which does not reliably limit each branch on its own;
+-- MIN / MAX express the same intent without any per-branch ordering.
+SELECT  t.dt
+FROM    (
+            SELECT  MIN(dt) AS dt
+            FROM    loghubods.user_share_log
+            UNION ALL
+            SELECT  MAX(dt) AS dt
+            FROM    loghubods.user_share_log
+        ) t
+-- an empty table yields NULL from MIN / MAX; drop those rows
+WHERE   t.dt IS NOT NULL
+ORDER BY t.dt
+LIMIT   2

+ 4 - 0
tasks/00_表的洞察/loghubods.user_share_log/01_基本数据.sql

@@ -0,0 +1,4 @@
+-- Sample rows of loghubods.user_share_log for the requested dt (at most 100 rows).
+SELECT
+    *
+FROM
+    loghubods.user_share_log
+WHERE
+    dt = '${dt}'
+LIMIT
+    100

+ 11 - 0
tasks/00_表的洞察/loghubods.user_share_log/02_topic_group.sql

@@ -0,0 +1,11 @@
+-- Row count per topic in loghubods.user_share_log for the requested dt,
+-- largest topics first (at most 100 topics).
+-- Usage: python fetch_daily.py tasks/00_表的洞察/loghubods.user_share_log/02_topic_group.sql --date 20260107
+
+SELECT
+    topic,
+    COUNT(1) AS cnt
+FROM loghubods.user_share_log
+WHERE dt = '${dt}'
+GROUP BY topic
+-- topic as a tie-breaker keeps the top-100 cut stable between runs
+ORDER BY cnt DESC, topic
+LIMIT 100

+ 23 - 0
tasks/00_表的洞察/loghubods.user_share_log/03_topic_depth_group.sql

@@ -0,0 +1,23 @@
+-- Row count and distinct-machinecode count by topic x usersharedepth, with
+-- per-topic subtotals and a grand total (rollup rows are labelled '_SUM_').
+-- Usage: python fetch_daily.py tasks/00_表的洞察/loghubods.user_share_log/03_topic_depth_group.sql --date 20260107
+--
+-- Percentages are taken against the rollup rows themselves. Summing a window
+-- over the GROUPING SETS output would add the subtotal and grand-total rows
+-- into the denominator and shrink every percentage.
+-- Rollup rows are identified with GROUPING(), so rows whose topic or
+-- usersharedepth is genuinely NULL stay NULL instead of being shown as '_SUM_'.
+WITH t_agg AS
+(
+    SELECT
+        dt,
+        topic AS raw_topic,
+        usersharedepth AS raw_depth,
+        GROUPING(topic) AS is_topic_rollup,          -- 1 on the grand-total row
+        GROUPING(usersharedepth) AS is_depth_rollup, -- 1 on subtotal and grand-total rows
+        COUNT(1) AS cnt,
+        COUNT(DISTINCT machinecode) AS uv
+    FROM loghubods.user_share_log
+    WHERE dt = '${dt}'
+    GROUP BY dt, topic, usersharedepth
+    GROUPING SETS (
+        (dt, topic, usersharedepth),
+        (dt, topic),
+        (dt)
+    )
+)
+-- Attach the grand-total and per-topic denominators to every row.
+,t_with_totals AS
+(
+    SELECT
+        dt,
+        raw_topic,
+        raw_depth,
+        is_topic_rollup,
+        is_depth_rollup,
+        cnt,
+        uv,
+        MAX(CASE WHEN is_topic_rollup = 1 THEN cnt END) OVER (PARTITION BY dt) AS total_cnt,
+        MAX(CASE WHEN is_topic_rollup = 1 THEN uv END) OVER (PARTITION BY dt) AS total_uv,
+        -- is_topic_rollup in the partition keeps the grand-total row apart from a real NULL topic
+        MAX(CASE WHEN is_depth_rollup = 1 THEN cnt END) OVER (PARTITION BY dt, is_topic_rollup, raw_topic) AS topic_cnt,
+        MAX(CASE WHEN is_depth_rollup = 1 THEN uv END) OVER (PARTITION BY dt, is_topic_rollup, raw_topic) AS topic_uv
+    FROM t_agg
+)
+SELECT
+    dt,
+    CASE WHEN is_topic_rollup = 1 THEN '_SUM_' ELSE raw_topic END AS topic,
+    CASE WHEN is_depth_rollup = 1 THEN '_SUM_' ELSE CAST(raw_depth AS STRING) END AS usersharedepth,
+    cnt,
+    round(cnt * 1.0 / NULLIF(total_cnt, 0), 6) AS pct_total,
+    round(cnt * 1.0 / NULLIF(topic_cnt, 0), 6) AS pct_in_topic,
+    uv,
+    -- uv is a distinct count, so these ratios are coverage shares and do not add up to 1
+    round(uv * 1.0 / NULLIF(total_uv, 0), 6) AS uv_pct_total,
+    round(uv * 1.0 / NULLIF(topic_uv, 0), 6) AS uv_pct_in_topic
+FROM t_with_totals
+ORDER BY topic, usersharedepth
+LIMIT 2000

+ 11 - 0
tasks/00_表的洞察/loghubods.user_share_log/04_数据.sql

@@ -0,0 +1,11 @@
+-- 'click' rows of loghubods.user_share_log for the requested dt that carry a
+-- clickobjectid, with machinecode exposed as mid and clickobjectid as vid.
+SELECT
+    l.machinecode   AS mid,
+    l.clickobjectid AS vid,
+    l.shareid,
+    l.rootshareid,
+    l.sharedepth,
+    l.clienttimestamp
+FROM loghubods.user_share_log l
+WHERE l.dt = '${dt}'
+  AND l.topic = 'click'
+  AND l.clickobjectid IS NOT NULL

+ 10 - 0
tasks/00_表的洞察/loghubods.user_share_log/05_图数据.sql

@@ -0,0 +1,10 @@
+-- Edge list built from 'click' rows of loghubods.user_share_log for the requested dt:
+--   from_mid   = part of shareid before the first '-'
+--   vid        = clickobjectid
+--   target_mid = machinecode of the clicking row
+--   ts         = clienttimestamp
+-- NOTE(review): '_' inside the LIKE pattern is a single-character wildcard,
+-- so the prefix filter is slightly wider than the literal text 'weixin_openid'.
+SELECT
+    SPLIT(l.shareid, '-')[0] AS from_mid,
+    l.clickobjectid          AS vid,
+    l.machinecode            AS target_mid,
+    l.clienttimestamp        AS ts
+FROM loghubods.user_share_log l
+WHERE l.dt = '${dt}'
+  AND l.topic = 'click'
+  AND l.clickobjectid IS NOT NULL
+  AND l.shareid LIKE 'weixin_openid%'

+ 11 - 0
tasks/00_表的洞察/loghubods.user_share_log/05_数据.sql

@@ -0,0 +1,11 @@
+-- 'click' rows of loghubods.user_share_log for the requested dt that carry a
+-- clickobjectid, with machinecode exposed as mid and clickobjectid as vid.
+SELECT
+    l.machinecode   AS mid,
+    l.clickobjectid AS vid,
+    l.shareid,
+    l.rootshareid,
+    l.sharedepth,
+    l.clienttimestamp
+FROM loghubods.user_share_log l
+WHERE l.dt = '${dt}'
+  AND l.topic = 'click'
+  AND l.clickobjectid IS NOT NULL

+ 4 - 0
tasks/00_表的洞察/videoods.dim_user/01_基本数据.sql

@@ -0,0 +1,4 @@
+-- Total row count of videoods.dim_user.
+-- COUNT(1) returns 0 for an empty table (SUM(1) would return NULL); the
+-- aggregate always yields exactly one row, so no LIMIT is needed.
+SELECT
+    COUNT(1) AS cnt
+FROM videoods.dim_user

+ 4 - 0
tasks/00_表的洞察/videoods.wx_video/01_基本数据.sql

@@ -0,0 +1,4 @@
+-- Total row count of videoods.wx_video.
+-- COUNT(1) returns 0 for an empty table (SUM(1) would return NULL); the
+-- aggregate always yields exactly one row, so no LIMIT is needed.
+SELECT
+    COUNT(1) AS cnt
+FROM videoods.wx_video

+ 165 - 0
tasks/承接/rosn分析/01_实验组xTop10一级品类.sql

@@ -0,0 +1,165 @@
+-- Calibration (COPC) and business metrics per experiment group, broken down by
+-- the 10 most-exposed first-level categories of each group plus an 'all' row.
+-- v5: group by first-level category (merge_first_level_cate) + GROUPING SETS + exposure share
+-- Step 1: extract scoresMap from extend_alg and classify page into page_type.
+WITH t_raw AS
+(
+    SELECT  *
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Keep recommendation pages only.
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Map raw abcode values to experiment-group labels and pull model scores out of scoresmap.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- power-law transform of NorXGBScore (coefficient 1.22, exponent 1.15)
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            -- category fields; a missing value becomes 'unknown'
+            ,COALESCE(GET_JSON_OBJECT(v1_feature,'$.merge_first_level_cate'), 'unknown') AS cate1
+            ,COALESCE(GET_JSON_OBJECT(v1_feature,'$.merge_second_level_cate'), 'unknown') AS cate2
+    FROM    t_filtered
+)
+-- Rows that carry both predictions.
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred IS NOT NULL
+)
+-- Rank first-level categories by exposure count inside each (dt, apptype, abcode).
+,t_cate_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,cate1
+            ,COUNT(1) AS cate_exp_cnt
+            -- cate1 as a tie-breaker keeps the top-10 cut stable between runs
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC, cate1) AS cate_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, cate1
+)
+,t_top_cate AS
+(
+    SELECT  dt, apptype, abcode, cate1, cate_rank
+    FROM    t_cate_rank
+    WHERE   cate_rank <= 10
+)
+-- Tag each row with its category when that category is in the group's top 10, else NULL.
+,t_with_top AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.cate1 IS NOT NULL THEN a.cate1 ELSE NULL END AS top_cate1
+            ,b.cate_rank AS top_cate_rank
+    FROM    t_valid a
+    LEFT JOIN t_top_cate b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.cate1 = b.cate1
+)
+-- Aggregate per group (cate1 = 'all') and per group x top category.
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            ,COALESCE(top_cate1, 'all') AS cate1
+            ,CASE WHEN GROUPING(top_cate1) = 1 THEN NULL ELSE MAX(top_cate_rank) END AS cate_rank
+            -- COPC: observed value divided by mean predicted value
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- observed values and mean predictions
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- mean absolute error against return_n_uv_noself
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- business metrics; zero denominators are guarded with NULLIF and reported as 0
+            ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            -- raw counts
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top
+    GROUP BY dt, apptype, abcode, top_cate1
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top_cate1)
+    )
+    -- drop the bucket of non-top categories (top_cate1 NULL on a detail row); keep the rollup row
+    HAVING  top_cate1 IS NOT NULL OR GROUPING(top_cate1) = 1
+)
+-- Final output: add each row's exposure share relative to its group's 'all' row.
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,cate1
+        ,cate_rank
+        ,round(exp * 1.0 / MAX(CASE WHEN cate1 = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+        ,str_copc, rosn_copc, rosn_stat_copc, rovn_copc, rovn_stat_copc
+        ,str_real, str_pred, rosn_real, rosn_pred, rosn_stat
+        ,rovn_real, rovn_pred, rovn_stat
+        ,rosn_pred_mae, rosn_stat_mae
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+FROM    t_agg
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 249 - 0
tasks/承接/rosn分析/02_实验组xTop20视频_vs对照组_v2.sql

@@ -0,0 +1,249 @@
+-- Calibration (COPC) and business metrics per experiment group, broken down by
+-- the 20 most-exposed vids of each group plus an 'all' row, with relative
+-- change against the control group ("对照组").
+-- v4: top-20 vid grouping + GROUPING SETS + exposure share
+-- v5: change-rate columns relative to the control group
+-- v6: rosn_ori (raw, uncalibrated score) alongside the calibrated rosn_pred
+-- Step 1: extract scoresMap from extend_alg and classify page into page_type.
+WITH t_raw AS
+(
+    SELECT  *
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Keep recommendation pages only.
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Map raw abcode values to experiment-group labels and pull model scores out of scoresmap.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- calibrated score: power-law transform of NorXGBScore (coefficient 1.22, exponent 1.15)
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            -- raw NorXGBScore without the transform
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS rosn_ori
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            ,GET_JSON_OBJECT(v1_feature,'$.title') AS vid_title
+    FROM    t_filtered
+)
+-- Rows that carry both predictions.
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred IS NOT NULL
+)
+-- Rank vids by exposure count inside each (dt, apptype, abcode).
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            -- vid as a tie-breaker keeps the top-20 cut stable between runs
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC, vid) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+,t_top_vid AS
+(
+    SELECT  dt, apptype, abcode, vid, vid_rank
+    FROM    t_vid_rank
+    WHERE   vid_rank <= 20
+)
+-- Tag each row with its vid/title when that vid is in the group's top 20, else NULL.
+,t_with_top AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid ELSE NULL END AS top_vid
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid_title ELSE NULL END AS top_vid_title
+            ,b.vid_rank AS top_vid_rank
+    FROM    t_valid a
+    LEFT JOIN t_top_vid b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.vid = b.vid
+)
+-- Aggregate per group (vid = 'all') and per group x top vid.
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            ,COALESCE(CAST(top_vid AS STRING), 'all') AS vid
+            ,CASE WHEN GROUPING(top_vid) = 1 THEN NULL ELSE MAX(top_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top_vid) = 1 THEN NULL ELSE MAX(top_vid_rank) END AS vid_rank
+            -- COPC: observed value divided by mean predicted value
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_ori) / COUNT(1), 0), 4) AS rosn_ori_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_ori), 0), 4) AS rovn_ori_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- observed values and mean predictions
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_ori) / COUNT(1),0),6) AS rosn_ori
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_ori), 6) AS rovn_ori
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- mean absolute error against return_n_uv_noself
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_ori - return_n_uv_noself)),6) AS rosn_ori_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- business metrics; zero denominators are guarded with NULLIF and reported as 0
+            ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            -- raw counts
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top
+    GROUP BY dt, apptype, abcode, top_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top_vid)
+    )
+    -- drop the bucket of non-top vids (top_vid NULL on a detail row); keep the rollup row
+    HAVING  top_vid IS NOT NULL OR GROUPING(top_vid) = 1
+)
+-- Attach the control-group value of every metric to the rows sharing the same (dt, apptype, vid).
+-- The top-20 list is built per group, so a vid that is not in the control group's
+-- own top 20 has no control row: its *_base and *_chg columns are NULL.
+,t_with_baseline AS
+(
+    SELECT  *
+            -- exposure share relative to the group's 'all' row
+            ,round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+            -- control-group baseline: business metrics
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp_per_dau END) OVER (PARTITION BY dt, apptype, vid) AS exp_per_dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_one END) OVER (PARTITION BY dt, apptype, vid) AS str_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_one END) OVER (PARTITION BY dt, apptype, vid) AS ros_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str END) OVER (PARTITION BY dt, apptype, vid) AS str_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros END) OVER (PARTITION BY dt, apptype, vid) AS ros_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_plus END) OVER (PARTITION BY dt, apptype, vid) AS str_plus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_minus END) OVER (PARTITION BY dt, apptype, vid) AS ros_minus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn END) OVER (PARTITION BY dt, apptype, vid) AS rovn_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN vovh24 END) OVER (PARTITION BY dt, apptype, vid) AS vovh24_base
+            -- control-group baseline: COPC metrics
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_copc END) OVER (PARTITION BY dt, apptype, vid) AS str_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_ori_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_ori_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_ori_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_ori_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_stat_copc_base
+            -- control-group baseline: observed values
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_real END) OVER (PARTITION BY dt, apptype, vid) AS str_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_real END) OVER (PARTITION BY dt, apptype, vid) AS rosn_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_real END) OVER (PARTITION BY dt, apptype, vid) AS rovn_real_base
+            -- control-group baseline: raw counts
+            ,MAX(CASE WHEN abcode = '对照组' THEN dau END) OVER (PARTITION BY dt, apptype, vid) AS dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp END) OVER (PARTITION BY dt, apptype, vid) AS exp_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_share END) OVER (PARTITION BY dt, apptype, vid) AS is_share_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN share_cnt END) OVER (PARTITION BY dt, apptype, vid) AS share_cnt_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_return_1 END) OVER (PARTITION BY dt, apptype, vid) AS is_return_1_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN viewh24 END) OVER (PARTITION BY dt, apptype, vid) AS viewh24_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv_noself END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_noself_base
+    FROM    t_agg
+)
+-- Final output: original metrics followed by (value - base) / base change rates.
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,vid
+        ,vid_title
+        ,vid_rank
+        ,exp_pct
+        ,round((dau - dau_base) / NULLIF(dau_base, 0), 4) AS dau_chg
+        ,round((exp - exp_base) / NULLIF(exp_base, 0), 4) AS exp_chg
+        -- COPC
+        ,str_copc, rosn_copc, rosn_ori_copc, rosn_stat_copc, rovn_copc, rovn_ori_copc, rovn_stat_copc
+        -- observed values and predictions
+        ,str_real, str_pred, rosn_real, rosn_pred, rosn_ori, rosn_stat
+        ,rovn_real, rovn_pred, rovn_ori, rovn_stat
+        ,rosn_pred_mae, rosn_ori_mae, rosn_stat_mae
+        -- business metrics
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        -- raw counts
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+        -- ========== change-rate columns ==========
+        -- business metrics
+        ,round((exp_per_dau - exp_per_dau_base) / NULLIF(exp_per_dau_base, 0), 4) AS exp_per_dau_chg
+        ,round((str_one - str_one_base) / NULLIF(str_one_base, 0), 4) AS str_one_chg
+        ,round((ros_one - ros_one_base) / NULLIF(ros_one_base, 0), 4) AS ros_one_chg
+        ,round((str - str_base) / NULLIF(str_base, 0), 4) AS str_chg
+        ,round((ros - ros_base) / NULLIF(ros_base, 0), 4) AS ros_chg
+        ,round((str_plus - str_plus_base) / NULLIF(str_plus_base, 0), 4) AS str_plus_chg
+        ,round((ros_minus - ros_minus_base) / NULLIF(ros_minus_base, 0), 4) AS ros_minus_chg
+        ,round((rovn - rovn_base) / NULLIF(rovn_base, 0), 4) AS rovn_chg
+        ,round((vovh24 - vovh24_base) / NULLIF(vovh24_base, 0), 4) AS vovh24_chg
+        -- COPC
+        ,round((str_copc - str_copc_base) / NULLIF(str_copc_base, 0), 4) AS str_copc_chg
+        ,round((rosn_copc - rosn_copc_base) / NULLIF(rosn_copc_base, 0), 4) AS rosn_copc_chg
+        ,round((rosn_ori_copc - rosn_ori_copc_base) / NULLIF(rosn_ori_copc_base, 0), 4) AS rosn_ori_copc_chg
+        ,round((rosn_stat_copc - rosn_stat_copc_base) / NULLIF(rosn_stat_copc_base, 0), 4) AS rosn_stat_copc_chg
+        ,round((rovn_copc - rovn_copc_base) / NULLIF(rovn_copc_base, 0), 4) AS rovn_copc_chg
+        ,round((rovn_ori_copc - rovn_ori_copc_base) / NULLIF(rovn_ori_copc_base, 0), 4) AS rovn_ori_copc_chg
+        ,round((rovn_stat_copc - rovn_stat_copc_base) / NULLIF(rovn_stat_copc_base, 0), 4) AS rovn_stat_copc_chg
+        -- observed values
+        ,round((str_real - str_real_base) / NULLIF(str_real_base, 0), 4) AS str_real_chg
+        ,round((rosn_real - rosn_real_base) / NULLIF(rosn_real_base, 0), 4) AS rosn_real_chg
+        ,round((rovn_real - rovn_real_base) / NULLIF(rovn_real_base, 0), 4) AS rovn_real_chg
+        -- raw counts
+        ,round((is_share - is_share_base) / NULLIF(is_share_base, 0), 4) AS is_share_chg
+        ,round((share_cnt - share_cnt_base) / NULLIF(share_cnt_base, 0), 4) AS share_cnt_chg
+        ,round((is_return_1 - is_return_1_base) / NULLIF(is_return_1_base, 0), 4) AS is_return_1_chg
+        ,round((return_n_uv - return_n_uv_base) / NULLIF(return_n_uv_base, 0), 4) AS return_n_uv_chg
+        ,round((viewh24 - viewh24_base) / NULLIF(viewh24_base, 0), 4) AS viewh24_chg
+        ,round((return_n_uv_noself - return_n_uv_noself_base) / NULLIF(return_n_uv_noself_base, 0), 4) AS return_n_uv_noself_chg
+FROM    t_with_baseline
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 483 - 0
tasks/承接/rosn分析/03_实验组xTop20视频_vs对照组_metrics.sql

@@ -0,0 +1,483 @@
+-- 预处理:解析 scoresmap + page 分类
+-- v4: 新增 top20 vid 分组 + GROUPING SETS + 曝光占比
+-- v5: 新增相对对照组的变化率字段
+-- v6: 新增模型评估指标(AUC/GAUC/Spearman)
+WITH t_raw AS
+(
+    SELECT  *
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     abcode NOT IN ("ab100")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- 过滤:只保留推荐页面
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- 特征提取与维度映射
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            ,GET_JSON_OBJECT(v1_feature,'$.title') AS vid_title
+    FROM    t_filtered
+)
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred IS NOT NULL
+)
+-- 计算每个 abcode 下曝光量 top20 的 vid
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+,t_top5_vid AS
+(
+    SELECT  dt, apptype, abcode, vid, vid_rank
+    FROM    t_vid_rank
+    WHERE   vid_rank <= 20
+)
+-- 标记 top20 vid
+,t_with_top5 AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid ELSE NULL END AS top5_vid
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid_title ELSE NULL END AS top5_vid_title
+            ,b.vid_rank AS top5_vid_rank
+    FROM    t_valid a
+    LEFT JOIN t_top5_vid b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.vid = b.vid
+)
+-- ========== 模型评估指标计算 ==========
+-- 1. 添加排名(用于 AUC 和 Spearman 计算)
+,t_with_rank AS
+(
+    SELECT  *
+            -- 二分类标签
+            ,CASE WHEN is_return_noself > 0 THEN 1 ELSE 0 END AS is_pos
+            -- 整体排名(按实验组)
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY str_pred) AS str_rank_global
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY rosn_pred) AS rosn_pred_rank_global
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY rosn_stat) AS rosn_stat_rank_global
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY return_n_uv_noself) AS rosn_real_rank_global
+            -- 整体排名(按实验组×视频)
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode, top5_vid ORDER BY str_pred) AS str_rank_vid
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode, top5_vid ORDER BY rosn_pred) AS rosn_pred_rank_vid
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode, top5_vid ORDER BY rosn_stat) AS rosn_stat_rank_vid
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode, top5_vid ORDER BY return_n_uv_noself) AS rosn_real_rank_vid
+            -- 用户维度排名(按用户)
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY str_pred) AS str_rank_user
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY rosn_pred) AS rosn_pred_rank_user
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY rosn_stat) AS rosn_stat_rank_user
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY return_n_uv_noself) AS rosn_real_rank_user
+    FROM    t_with_top5
+)
+-- 2. 计算整体 AUC(按实验组)
+,t_auc_global AS
+(
+    SELECT  dt, apptype, abcode
+            ,'all' AS vid
+            ,COUNT(1) AS n_total
+            ,SUM(is_pos) AS n_pos
+            ,COUNT(1) - SUM(is_pos) AS n_neg
+            ,SUM(CASE WHEN is_pos = 1 THEN str_rank_global ELSE 0 END) AS sum_pos_rank
+    FROM    t_with_rank
+    GROUP BY dt, apptype, abcode
+)
+,t_auc_global_result AS
+(
+    SELECT  dt, apptype, abcode, vid
+            ,CASE
+                WHEN n_pos = 0 OR n_neg = 0 THEN NULL
+                ELSE round((sum_pos_rank * 1.0 / n_pos - (n_pos + 1) / 2.0) / n_neg, 6)
+            END AS str_auc
+    FROM    t_auc_global
+)
+-- 3. AUC per experiment group x video.
+--    Only rows with a non-NULL top5_vid take part; ranks come from str_rank_vid.
+,t_auc_vid AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,top5_vid AS vid
+            ,COUNT(1) AS n_total
+            ,SUM(is_pos) AS n_pos
+            ,COUNT(1) - SUM(is_pos) AS n_neg
+            ,SUM(CASE is_pos WHEN 1 THEN str_rank_vid ELSE 0 END) AS sum_pos_rank
+    FROM    t_with_rank
+    WHERE   top5_vid IS NOT NULL
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,top5_vid
+)
+-- vid is cast to STRING so it lines up with the 'all' rows of the global metrics.
+,t_auc_vid_result AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,CAST(vid AS STRING) AS vid
+            -- Same rank-sum AUC formula as the global variant; NULL when a class is empty.
+            ,CASE
+                WHEN n_pos <> 0 AND n_neg <> 0
+                THEN round((sum_pos_rank * 1.0 / n_pos - (n_pos + 1) / 2.0) / n_neg, 6)
+                ELSE NULL
+            END AS str_auc
+    FROM    t_auc_vid
+)
+-- 4. User-level GAUC per experiment group.
+--    Step 1: per-user (mid) sample counts and rank sum of positive rows.
+,t_user_auc AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,mid
+            ,COUNT(1) AS user_exp
+            ,SUM(is_pos) AS user_n_pos
+            ,COUNT(1) - SUM(is_pos) AS user_n_neg
+            ,SUM(CASE is_pos WHEN 1 THEN str_rank_user ELSE 0 END) AS user_sum_pos_rank
+    FROM    t_with_rank
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,mid
+)
+--    Step 2: per-user AUC.
+,t_user_auc_valid AS
+(
+    SELECT  *
+            -- A user needs at least 5 samples and both classes present;
+            -- the resulting AUC is clamped to [0, 1].
+            ,CASE
+                WHEN user_exp < 5 OR user_n_pos = 0 OR user_n_neg = 0 THEN NULL
+                ELSE GREATEST(
+                        0.0,
+                        LEAST(
+                            1.0,
+                            (user_sum_pos_rank * 1.0 / user_n_pos - (user_n_pos + 1) / 2.0) / user_n_neg
+                        )
+                     )
+            END AS user_auc
+    FROM    t_user_auc
+)
+--    Step 3: exposure-weighted mean of the per-user AUCs; users whose AUC is NULL
+--    add nothing to either the numerator or the denominator.
+,t_gauc_result AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,'all' AS vid
+            ,round(
+                SUM(user_exp * user_auc)
+                / NULLIF(SUM(CASE WHEN user_auc IS NULL THEN 0 ELSE user_exp END), 0)
+             , 6) AS str_gauc
+    FROM    t_user_auc_valid
+    GROUP BY dt
+             ,apptype
+             ,abcode
+)
+-- 5. Overall Spearman correlation per experiment group.
+--    Uses the closed form 1 - 6 * sum(d^2) / (n * (n^2 - 1)); rank differences are
+--    cast to DOUBLE before squaring to avoid large-number overflow.
+,t_spearman_global AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,'all' AS vid
+            ,CAST(COUNT(1) AS DOUBLE) AS n
+            -- d = predicted rank - real rank; accumulate sum(d^2) for the model prediction ...
+            ,SUM(
+                CAST((rosn_pred_rank_global - rosn_real_rank_global) AS DOUBLE)
+                * (rosn_pred_rank_global - rosn_real_rank_global)
+             ) AS sum_d2_pred
+            -- ... and for the statistic-based score.
+            ,SUM(
+                CAST((rosn_stat_rank_global - rosn_real_rank_global) AS DOUBLE)
+                * (rosn_stat_rank_global - rosn_real_rank_global)
+             ) AS sum_d2_stat
+    FROM    t_with_rank
+    GROUP BY dt
+             ,apptype
+             ,abcode
+)
+,t_spearman_global_result AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            -- NULLIF keeps the result NULL when n * (n^2 - 1) is 0 (n = 1).
+            ,round(1.0 - 6.0 * sum_d2_pred / NULLIF(n * (n * n - 1), 0), 6) AS rosn_corr
+            ,round(1.0 - 6.0 * sum_d2_stat / NULLIF(n * (n * n - 1), 0), 6) AS rosn_stat_corr
+    FROM    t_spearman_global
+)
+-- 6. Spearman correlation per experiment group x video
+--    (rows with a NULL top5_vid are left out; ranks are the *_rank_vid columns).
+,t_spearman_vid AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,top5_vid AS vid
+            ,CAST(COUNT(1) AS DOUBLE) AS n
+            ,SUM(
+                CAST((rosn_pred_rank_vid - rosn_real_rank_vid) AS DOUBLE)
+                * (rosn_pred_rank_vid - rosn_real_rank_vid)
+             ) AS sum_d2_pred
+            ,SUM(
+                CAST((rosn_stat_rank_vid - rosn_real_rank_vid) AS DOUBLE)
+                * (rosn_stat_rank_vid - rosn_real_rank_vid)
+             ) AS sum_d2_stat
+    FROM    t_with_rank
+    WHERE   top5_vid IS NOT NULL
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,top5_vid
+)
+,t_spearman_vid_result AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,CAST(vid AS STRING) AS vid
+            -- Spearman = 1 - 6 * sum(d^2) / (n * (n^2 - 1)); NULL when the denominator is 0.
+            ,round(1.0 - 6.0 * sum_d2_pred / NULLIF(n * (n * n - 1), 0), 6) AS rosn_corr
+            ,round(1.0 - 6.0 * sum_d2_stat / NULLIF(n * (n * n - 1), 0), 6) AS rosn_stat_corr
+    FROM    t_spearman_vid
+)
+-- 7. User-level Spearman per experiment group.
+--    Step 1: per-user (mid) sample count and sum of squared rank differences.
+,t_user_spearman AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,mid
+            ,COUNT(1) AS user_exp
+            ,CAST(COUNT(1) AS DOUBLE) AS n
+            ,SUM(
+                CAST((rosn_pred_rank_user - rosn_real_rank_user) AS DOUBLE)
+                * (rosn_pred_rank_user - rosn_real_rank_user)
+             ) AS sum_d2_pred
+            ,SUM(
+                CAST((rosn_stat_rank_user - rosn_real_rank_user) AS DOUBLE)
+                * (rosn_stat_rank_user - rosn_real_rank_user)
+             ) AS sum_d2_stat
+    FROM    t_with_rank
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,mid
+)
+--    Step 2: per-user correlation; users with fewer than 5 samples get NULL
+--    (minimum sample size kept for stability).
+,t_user_spearman_valid AS
+(
+    SELECT  *
+            ,CASE
+                WHEN n >= 5 THEN 1.0 - 6.0 * sum_d2_pred / NULLIF(n * (n * n - 1), 0)
+                ELSE NULL
+            END AS user_rosn_corr
+            ,CASE
+                WHEN n >= 5 THEN 1.0 - 6.0 * sum_d2_stat / NULLIF(n * (n * n - 1), 0)
+                ELSE NULL
+            END AS user_rosn_stat_corr
+    FROM    t_user_spearman
+)
+--    Step 3: exposure-weighted mean over users that have a non-NULL correlation.
+,t_gspearman_result AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,'all' AS vid
+            ,round(
+                SUM(user_exp * user_rosn_corr)
+                / NULLIF(SUM(CASE WHEN user_rosn_corr IS NULL THEN 0 ELSE user_exp END), 0)
+             , 6) AS rosn_gcorr
+            ,round(
+                SUM(user_exp * user_rosn_stat_corr)
+                / NULLIF(SUM(CASE WHEN user_rosn_stat_corr IS NULL THEN 0 ELSE user_exp END), 0)
+             , 6) AS rosn_stat_gcorr
+    FROM    t_user_spearman_valid
+    GROUP BY dt
+             ,apptype
+             ,abcode
+)
+-- 8. Combine all model metrics at experiment-group granularity (vid = 'all').
+,t_metrics_global AS
+(
+    SELECT  auc.dt
+            ,auc.apptype
+            ,auc.abcode
+            ,auc.vid
+            ,auc.str_auc
+            ,gauc.str_gauc
+            ,sp.rosn_corr
+            ,sp.rosn_stat_corr
+            ,gsp.rosn_gcorr
+            ,gsp.rosn_stat_gcorr
+    FROM    t_auc_global_result auc
+    LEFT JOIN t_gauc_result gauc
+    ON      auc.dt = gauc.dt
+    AND     auc.apptype = gauc.apptype
+    AND     auc.abcode = gauc.abcode
+    LEFT JOIN t_spearman_global_result sp
+    ON      auc.dt = sp.dt
+    AND     auc.apptype = sp.apptype
+    AND     auc.abcode = sp.abcode
+    LEFT JOIN t_gspearman_result gsp
+    ON      auc.dt = gsp.dt
+    AND     auc.apptype = gsp.apptype
+    AND     auc.abcode = gsp.abcode
+)
+-- 9. Combine metrics at video granularity. Only overall metrics exist here;
+--    the user-level (g*) columns are typed NULL placeholders.
+,t_metrics_vid AS
+(
+    SELECT  auc.dt
+            ,auc.apptype
+            ,auc.abcode
+            ,auc.vid
+            ,auc.str_auc
+            ,CAST(NULL AS DOUBLE) AS str_gauc
+            ,sp.rosn_corr
+            ,sp.rosn_stat_corr
+            ,CAST(NULL AS DOUBLE) AS rosn_gcorr
+            ,CAST(NULL AS DOUBLE) AS rosn_stat_gcorr
+    FROM    t_auc_vid_result auc
+    LEFT JOIN t_spearman_vid_result sp
+    ON      auc.dt = sp.dt
+    AND     auc.apptype = sp.apptype
+    AND     auc.abcode = sp.abcode
+    AND     auc.vid = sp.vid
+)
+-- 10. Stack both granularities. The two inputs expose the same columns in the
+--     same order, which the positional UNION ALL relies on.
+,t_metrics_all AS
+(
+    SELECT  *
+    FROM    t_metrics_global
+    UNION ALL
+    SELECT  *
+    FROM    t_metrics_vid
+)
+-- ========== Original aggregation logic ==========
+-- Aggregates t_with_rank twice via GROUPING SETS: one total row per
+-- (dt, apptype, abcode) reported as vid = 'all', and one row per top5_vid.
+-- The HAVING clause drops the detail bucket whose top5_vid is NULL.
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            ,COALESCE(CAST(top5_vid AS STRING), 'all') AS vid
+            -- Title and rank only make sense on per-video rows.
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_rank) END AS vid_rank
+            -- COPC: observed value divided by predicted value
+            -- NOTE(review): SUM(x) / COUNT(1) keeps rows where x is NULL in the denominator,
+            -- unlike AVG(x) - confirm that is intended for rosn_stat.
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions and observed values
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Errors (mean absolute error against return_n_uv_noself)
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- Business metrics. Every denominator that can be 0 is wrapped in NULLIF(..., 0),
+            -- like the ratios above, so an empty denominator yields NULL and COALESCE maps it to 0
+            -- instead of depending on the engine's division-by-zero behaviour.
+            ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            -- Raw counts
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_rank
+    GROUP BY dt, apptype, abcode, top5_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top5_vid)
+    )
+    HAVING  top5_vid IS NOT NULL OR GROUPING(top5_vid) = 1
+)
+-- Attach the model evaluation metrics to every aggregated row.
+-- Rows of t_agg without a match in t_metrics_all keep NULL metric columns.
+,t_agg_with_metrics AS
+(
+    SELECT  agg.*
+            ,m.str_auc
+            ,m.str_gauc
+            ,m.rosn_corr
+            ,m.rosn_gcorr
+            ,m.rosn_stat_corr
+            ,m.rosn_stat_gcorr
+    FROM    t_agg agg
+    LEFT JOIN t_metrics_all m
+    ON      agg.dt = m.dt
+    AND     agg.apptype = m.apptype
+    AND     agg.abcode = m.abcode
+    AND     agg.vid = m.vid
+)
+-- Added: attach control-group baseline values used later for change rates.
+-- Each *_base column is the value carried by the row whose abcode = '对照组' inside the
+-- same (dt, apptype, vid) partition; it is NULL when the partition has no such row.
+,t_with_baseline AS
+(
+    SELECT  *
+            -- Exposure share: this row's exp divided by the exp of the vid = 'all' row
+            -- of the same (dt, apptype, abcode)
+            ,round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+            -- Control-group baselines (business metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp_per_dau END) OVER (PARTITION BY dt, apptype, vid) AS exp_per_dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_one END) OVER (PARTITION BY dt, apptype, vid) AS str_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_one END) OVER (PARTITION BY dt, apptype, vid) AS ros_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str END) OVER (PARTITION BY dt, apptype, vid) AS str_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros END) OVER (PARTITION BY dt, apptype, vid) AS ros_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_plus END) OVER (PARTITION BY dt, apptype, vid) AS str_plus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_minus END) OVER (PARTITION BY dt, apptype, vid) AS ros_minus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn END) OVER (PARTITION BY dt, apptype, vid) AS rovn_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN vovh24 END) OVER (PARTITION BY dt, apptype, vid) AS vovh24_base
+            -- Control-group baselines (COPC metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_copc END) OVER (PARTITION BY dt, apptype, vid) AS str_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_stat_copc_base
+            -- Control-group baselines (observed values)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_real END) OVER (PARTITION BY dt, apptype, vid) AS str_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_real END) OVER (PARTITION BY dt, apptype, vid) AS rosn_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_real END) OVER (PARTITION BY dt, apptype, vid) AS rovn_real_base
+            -- Control-group baselines (count metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN dau END) OVER (PARTITION BY dt, apptype, vid) AS dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp END) OVER (PARTITION BY dt, apptype, vid) AS exp_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_share END) OVER (PARTITION BY dt, apptype, vid) AS is_share_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN share_cnt END) OVER (PARTITION BY dt, apptype, vid) AS share_cnt_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_return_1 END) OVER (PARTITION BY dt, apptype, vid) AS is_return_1_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN viewh24 END) OVER (PARTITION BY dt, apptype, vid) AS viewh24_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv_noself END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_noself_base
+            -- Control-group baselines (model evaluation metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_auc END) OVER (PARTITION BY dt, apptype, vid) AS str_auc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_gauc END) OVER (PARTITION BY dt, apptype, vid) AS str_gauc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_corr END) OVER (PARTITION BY dt, apptype, vid) AS rosn_corr_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_gcorr END) OVER (PARTITION BY dt, apptype, vid) AS rosn_gcorr_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_corr END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_corr_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_gcorr END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_gcorr_base
+    FROM    t_agg_with_metrics
+)
+-- Final output: original fields + model evaluation metrics + change rates.
+-- Every *_chg column is (value - base) / base rounded to 4 decimals, where base is the
+-- control-group value from t_with_baseline; it is NULL when base is 0 or NULL.
+-- NOTE(review): for metrics that can be negative (e.g. the *_corr columns) a negative
+-- base flips the sign of the change rate - confirm this is acceptable.
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,vid
+        ,vid_title
+        ,vid_rank
+        ,exp_pct
+        ,round((dau - dau_base) / NULLIF(dau_base, 0), 4) AS dau_chg
+        ,round((exp - exp_base) / NULLIF(exp_base, 0), 4) AS exp_chg
+        -- COPC
+        ,str_copc, rosn_copc, rosn_stat_copc, rovn_copc, rovn_stat_copc
+        -- Model predictions and observed values
+        ,str_real, str_pred, rosn_real, rosn_pred, rosn_stat
+        ,rovn_real, rovn_pred, rovn_stat
+        ,rosn_pred_mae, rosn_stat_mae
+        -- ========== Model evaluation metrics ==========
+        ,str_auc, str_gauc
+        ,rosn_corr, rosn_gcorr
+        ,rosn_stat_corr, rosn_stat_gcorr
+        -- Business metrics
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        -- Counts
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+        -- ========== Change-rate fields ==========
+        -- Business metric change rates
+        ,round((exp_per_dau - exp_per_dau_base) / NULLIF(exp_per_dau_base, 0), 4) AS exp_per_dau_chg
+        ,round((str_one - str_one_base) / NULLIF(str_one_base, 0), 4) AS str_one_chg
+        ,round((ros_one - ros_one_base) / NULLIF(ros_one_base, 0), 4) AS ros_one_chg
+        ,round((str - str_base) / NULLIF(str_base, 0), 4) AS str_chg
+        ,round((ros - ros_base) / NULLIF(ros_base, 0), 4) AS ros_chg
+        ,round((str_plus - str_plus_base) / NULLIF(str_plus_base, 0), 4) AS str_plus_chg
+        ,round((ros_minus - ros_minus_base) / NULLIF(ros_minus_base, 0), 4) AS ros_minus_chg
+        ,round((rovn - rovn_base) / NULLIF(rovn_base, 0), 4) AS rovn_chg
+        ,round((vovh24 - vovh24_base) / NULLIF(vovh24_base, 0), 4) AS vovh24_chg
+        -- COPC change rates
+        ,round((str_copc - str_copc_base) / NULLIF(str_copc_base, 0), 4) AS str_copc_chg
+        ,round((rosn_copc - rosn_copc_base) / NULLIF(rosn_copc_base, 0), 4) AS rosn_copc_chg
+        ,round((rosn_stat_copc - rosn_stat_copc_base) / NULLIF(rosn_stat_copc_base, 0), 4) AS rosn_stat_copc_chg
+        ,round((rovn_copc - rovn_copc_base) / NULLIF(rovn_copc_base, 0), 4) AS rovn_copc_chg
+        ,round((rovn_stat_copc - rovn_stat_copc_base) / NULLIF(rovn_stat_copc_base, 0), 4) AS rovn_stat_copc_chg
+        -- Observed-value change rates
+        ,round((str_real - str_real_base) / NULLIF(str_real_base, 0), 4) AS str_real_chg
+        ,round((rosn_real - rosn_real_base) / NULLIF(rosn_real_base, 0), 4) AS rosn_real_chg
+        ,round((rovn_real - rovn_real_base) / NULLIF(rovn_real_base, 0), 4) AS rovn_real_chg
+        -- Model evaluation metric change rates
+        ,round((str_auc - str_auc_base) / NULLIF(str_auc_base, 0), 4) AS str_auc_chg
+        ,round((str_gauc - str_gauc_base) / NULLIF(str_gauc_base, 0), 4) AS str_gauc_chg
+        ,round((rosn_corr - rosn_corr_base) / NULLIF(rosn_corr_base, 0), 4) AS rosn_corr_chg
+        ,round((rosn_gcorr - rosn_gcorr_base) / NULLIF(rosn_gcorr_base, 0), 4) AS rosn_gcorr_chg
+        ,round((rosn_stat_corr - rosn_stat_corr_base) / NULLIF(rosn_stat_corr_base, 0), 4) AS rosn_stat_corr_chg
+        ,round((rosn_stat_gcorr - rosn_stat_gcorr_base) / NULLIF(rosn_stat_gcorr_base, 0), 4) AS rosn_stat_gcorr_chg
+        -- Count metric change rates
+        ,round((is_share - is_share_base) / NULLIF(is_share_base, 0), 4) AS is_share_chg
+        ,round((share_cnt - share_cnt_base) / NULLIF(share_cnt_base, 0), 4) AS share_cnt_chg
+        ,round((is_return_1 - is_return_1_base) / NULLIF(is_return_1_base, 0), 4) AS is_return_1_chg
+        ,round((return_n_uv - return_n_uv_base) / NULLIF(return_n_uv_base, 0), 4) AS return_n_uv_chg
+        ,round((viewh24 - viewh24_base) / NULLIF(viewh24_base, 0), 4) AS viewh24_chg
+        ,round((return_n_uv_noself - return_n_uv_noself_base) / NULLIF(return_n_uv_noself_base, 0), 4) AS return_n_uv_noself_chg
+FROM    t_with_baseline
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 254 - 0
tasks/承接/rosn分析/04_实验组xTop20视频_vs对照组_mae_chg.sql

@@ -0,0 +1,254 @@
+-- Preprocessing: parse scoresMap and classify the page
+-- v4: adds top-20 vid grouping + GROUPING SETS + exposure share
+-- v5: adds change-rate fields relative to the control group
+-- v6: adds MAE change rates relative to the control group
+WITH t_raw AS
+(
+    SELECT  *
+            -- scoresMap JSON taken from extend_alg, with every "\\" occurrence removed
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            -- Bucket the page into recommendation / non-recommendation / other
+            ,CASE
+                WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                WHEN page IN ("回流页","其他") THEN "非推荐"
+                ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN (
+                "ab0","ab1","ab2","ab3","ab4",
+                "ab5","ab6","ab7","ab8","ab9"
+            )
+    AND     abcode NOT IN ("ab100")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Filter: keep only rows whose page_type is the recommendation bucket
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Feature extraction and dimension mapping.
+-- Raw abcode buckets are relabelled into experiment-group names (only for apptype "4";
+-- everything else becomes "其他"), and model scores are read from the scoresmap JSON.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            ,CASE
+                WHEN apptype = "4" THEN
+                    CASE
+                        WHEN abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                        WHEN abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                        WHEN abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                        WHEN abcode IN ("ab2","ab3") THEN "对照组"
+                        WHEN abcode IN ("ab4","ab5") THEN "ab4-5"
+                        ELSE "其他"
+                    END
+                ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            -- Scores parsed from scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- NOTE(review): 1.22 and 1.15 look like calibration constants - confirm their origin
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            ,GET_JSON_OBJECT(v1_feature,'$.title') AS vid_title
+    FROM    t_filtered
+)
+-- Keep only rows where both model predictions could be parsed
+-- (rosn_stat is not checked here and may still be NULL).
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   NOT (str_pred IS NULL OR rosn_pred IS NULL)
+)
+-- Rank vids by exposure count within each (dt, apptype, abcode).
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            -- vid acts as a tie-breaker: without it, vids with equal exposure counts are
+            -- ranked arbitrarily and the top-20 membership can change between runs.
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC, vid) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+-- Keep the 20 most-exposed vids per group.
+-- The CTE is still called t_top5_vid, but the cut-off is 20.
+,t_top5_vid AS
+(
+    SELECT  dt, apptype, abcode, vid, vid_rank
+    FROM    t_vid_rank
+    WHERE   vid_rank <= 20
+)
+-- Tag every sample with its top-20 vid information.
+-- Samples whose vid is not in t_top5_vid get NULL in all three top5_* columns.
+,t_with_top5 AS
+(
+    SELECT  smp.*
+            ,CASE WHEN top.vid IS NULL THEN NULL ELSE smp.vid END AS top5_vid
+            ,CASE WHEN top.vid IS NULL THEN NULL ELSE smp.vid_title END AS top5_vid_title
+            ,top.vid_rank AS top5_vid_rank
+    FROM    t_valid smp
+    LEFT JOIN t_top5_vid top
+    ON      smp.dt = top.dt
+    AND     smp.apptype = top.apptype
+    AND     smp.abcode = top.abcode
+    AND     smp.vid = top.vid
+)
+-- Aggregate first.
+-- GROUPING SETS yields one total row per (dt, apptype, abcode), reported as vid = 'all',
+-- and one row per top5_vid. The HAVING clause drops the detail bucket whose top5_vid is NULL.
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            ,COALESCE(CAST(top5_vid AS STRING), 'all') AS vid
+            -- Title and rank only make sense on per-video rows.
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_rank) END AS vid_rank
+            -- COPC: observed value divided by predicted value
+            -- NOTE(review): SUM(x) / COUNT(1) keeps rows where x is NULL in the denominator,
+            -- unlike AVG(x); rosn_stat is not filtered for NULL in t_valid - confirm this is intended.
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions and observed values
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Errors (mean absolute error of each prediction against its observed counterpart)
+            ,round(AVG(ABS(str_pred - is_return_noself)),6) AS str_pred_mae
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            ,round(AVG(ABS(str_pred * rosn_pred - return_n_uv_noself)),6) AS rovn_pred_mae
+            ,round(AVG(ABS(str_pred * rosn_stat - return_n_uv_noself)),6) AS rovn_stat_mae
+            -- Business metrics. Every denominator that can be 0 is wrapped in NULLIF(..., 0),
+            -- like the ratios above, so an empty denominator yields NULL and COALESCE maps it to 0
+            -- instead of depending on the engine's division-by-zero behaviour.
+            ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            -- Raw counts
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top5
+    GROUP BY dt, apptype, abcode, top5_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top5_vid)
+    )
+    HAVING  top5_vid IS NOT NULL OR GROUPING(top5_vid) = 1
+)
+-- Added: attach control-group baseline values used later for change rates.
+-- Each *_base column is the value carried by the row whose abcode = '对照组' inside the
+-- same (dt, apptype, vid) partition; it is NULL when the partition has no such row.
+,t_with_baseline AS
+(
+    SELECT  *
+            -- Exposure share: this row's exp divided by the exp of the vid = 'all' row
+            -- of the same (dt, apptype, abcode)
+            ,round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+            -- Control-group baselines (business metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp_per_dau END) OVER (PARTITION BY dt, apptype, vid) AS exp_per_dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_one END) OVER (PARTITION BY dt, apptype, vid) AS str_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_one END) OVER (PARTITION BY dt, apptype, vid) AS ros_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str END) OVER (PARTITION BY dt, apptype, vid) AS str_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros END) OVER (PARTITION BY dt, apptype, vid) AS ros_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_plus END) OVER (PARTITION BY dt, apptype, vid) AS str_plus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_minus END) OVER (PARTITION BY dt, apptype, vid) AS ros_minus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn END) OVER (PARTITION BY dt, apptype, vid) AS rovn_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN vovh24 END) OVER (PARTITION BY dt, apptype, vid) AS vovh24_base
+            -- Control-group baselines (COPC metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_copc END) OVER (PARTITION BY dt, apptype, vid) AS str_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_stat_copc_base
+            -- Control-group baselines (observed values)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_real END) OVER (PARTITION BY dt, apptype, vid) AS str_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_real END) OVER (PARTITION BY dt, apptype, vid) AS rosn_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_real END) OVER (PARTITION BY dt, apptype, vid) AS rovn_real_base
+            -- Control-group baselines (MAE metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_pred_mae END) OVER (PARTITION BY dt, apptype, vid) AS str_pred_mae_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_pred_mae END) OVER (PARTITION BY dt, apptype, vid) AS rosn_pred_mae_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_mae END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_mae_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_pred_mae END) OVER (PARTITION BY dt, apptype, vid) AS rovn_pred_mae_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_stat_mae END) OVER (PARTITION BY dt, apptype, vid) AS rovn_stat_mae_base
+            -- Control-group baselines (count metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN dau END) OVER (PARTITION BY dt, apptype, vid) AS dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp END) OVER (PARTITION BY dt, apptype, vid) AS exp_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_share END) OVER (PARTITION BY dt, apptype, vid) AS is_share_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN share_cnt END) OVER (PARTITION BY dt, apptype, vid) AS share_cnt_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_return_1 END) OVER (PARTITION BY dt, apptype, vid) AS is_return_1_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN viewh24 END) OVER (PARTITION BY dt, apptype, vid) AS viewh24_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv_noself END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_noself_base
+    FROM    t_agg
+)
+-- Final output: original fields + change rates.
+-- Every *_chg column is (value - base) / base rounded to 4 decimals, where base is the
+-- control-group value from t_with_baseline; it is NULL when base is 0 or NULL.
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,vid
+        ,vid_title
+        ,vid_rank
+        ,exp_pct
+        ,round((dau - dau_base) / NULLIF(dau_base, 0), 4) AS dau_chg
+        ,round((exp - exp_base) / NULLIF(exp_base, 0), 4) AS exp_chg
+        -- COPC
+        ,str_copc, rosn_copc, rosn_stat_copc, rovn_copc, rovn_stat_copc
+        -- Model predictions and observed values
+        ,str_real, str_pred, rosn_real, rosn_pred, rosn_stat
+        ,rovn_real, rovn_pred, rovn_stat
+        ,str_pred_mae, rosn_pred_mae, rosn_stat_mae, rovn_pred_mae, rovn_stat_mae
+        -- MAE change rates
+        ,round((str_pred_mae - str_pred_mae_base) / NULLIF(str_pred_mae_base, 0), 4) AS str_pred_mae_chg
+        ,round((rosn_pred_mae - rosn_pred_mae_base) / NULLIF(rosn_pred_mae_base, 0), 4) AS rosn_pred_mae_chg
+        ,round((rosn_stat_mae - rosn_stat_mae_base) / NULLIF(rosn_stat_mae_base, 0), 4) AS rosn_stat_mae_chg
+        ,round((rovn_pred_mae - rovn_pred_mae_base) / NULLIF(rovn_pred_mae_base, 0), 4) AS rovn_pred_mae_chg
+        ,round((rovn_stat_mae - rovn_stat_mae_base) / NULLIF(rovn_stat_mae_base, 0), 4) AS rovn_stat_mae_chg
+        -- Business metrics
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        -- Counts
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+        -- ========== Change-rate fields ==========
+        -- Business metric change rates
+        ,round((exp_per_dau - exp_per_dau_base) / NULLIF(exp_per_dau_base, 0), 4) AS exp_per_dau_chg
+        ,round((str_one - str_one_base) / NULLIF(str_one_base, 0), 4) AS str_one_chg
+        ,round((ros_one - ros_one_base) / NULLIF(ros_one_base, 0), 4) AS ros_one_chg
+        ,round((str - str_base) / NULLIF(str_base, 0), 4) AS str_chg
+        ,round((ros - ros_base) / NULLIF(ros_base, 0), 4) AS ros_chg
+        ,round((str_plus - str_plus_base) / NULLIF(str_plus_base, 0), 4) AS str_plus_chg
+        ,round((ros_minus - ros_minus_base) / NULLIF(ros_minus_base, 0), 4) AS ros_minus_chg
+        ,round((rovn - rovn_base) / NULLIF(rovn_base, 0), 4) AS rovn_chg
+        ,round((vovh24 - vovh24_base) / NULLIF(vovh24_base, 0), 4) AS vovh24_chg
+        -- COPC change rates
+        ,round((str_copc - str_copc_base) / NULLIF(str_copc_base, 0), 4) AS str_copc_chg
+        ,round((rosn_copc - rosn_copc_base) / NULLIF(rosn_copc_base, 0), 4) AS rosn_copc_chg
+        ,round((rosn_stat_copc - rosn_stat_copc_base) / NULLIF(rosn_stat_copc_base, 0), 4) AS rosn_stat_copc_chg
+        ,round((rovn_copc - rovn_copc_base) / NULLIF(rovn_copc_base, 0), 4) AS rovn_copc_chg
+        ,round((rovn_stat_copc - rovn_stat_copc_base) / NULLIF(rovn_stat_copc_base, 0), 4) AS rovn_stat_copc_chg
+        -- Observed-value change rates
+        ,round((str_real - str_real_base) / NULLIF(str_real_base, 0), 4) AS str_real_chg
+        ,round((rosn_real - rosn_real_base) / NULLIF(rosn_real_base, 0), 4) AS rosn_real_chg
+        ,round((rovn_real - rovn_real_base) / NULLIF(rovn_real_base, 0), 4) AS rovn_real_chg
+        -- Count metric change rates
+        ,round((is_share - is_share_base) / NULLIF(is_share_base, 0), 4) AS is_share_chg
+        ,round((share_cnt - share_cnt_base) / NULLIF(share_cnt_base, 0), 4) AS share_cnt_chg
+        ,round((is_return_1 - is_return_1_base) / NULLIF(is_return_1_base, 0), 4) AS is_return_1_chg
+        ,round((return_n_uv - return_n_uv_base) / NULLIF(return_n_uv_base, 0), 4) AS return_n_uv_chg
+        ,round((viewh24 - viewh24_base) / NULLIF(viewh24_base, 0), 4) AS viewh24_chg
+        ,round((return_n_uv_noself - return_n_uv_noself_base) / NULLIF(return_n_uv_noself_base, 0), 4) AS return_n_uv_noself_chg
+FROM    t_with_baseline
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 256 - 0
tasks/承接/rosn分析/05_实验组xTop20视频_vs对照组_vor.sql

@@ -0,0 +1,256 @@
+-- Preprocessing: extract the scoresMap JSON from extend_alg and bucket pages.
+-- v5: adds the vor statistic plus score_pred / score_stat / score_real.
+-- Ranking formula: str * ros * vor
+WITH t_raw AS
+(
+    SELECT  src.*
+            -- scoresMap with every backslash stripped so it can be parsed again downstream
+            ,REPLACE(GET_JSON_OBJECT(src.extend_alg, '$.scoresMap'), "\\", "") AS scoresmap
+            -- page bucket: four feed/detail pages -> "推荐", two listed pages -> "非推荐", rest -> "其他"
+            ,CASE
+                WHEN src.page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                WHEN src.page IN ("回流页","其他") THEN "非推荐"
+                ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212 src
+    WHERE   src.dt = '${dt}'
+    AND     src.apptype IN ("0","4")
+    -- only ab0..ab9 are kept, so any other bucket (e.g. "ab100") is already excluded
+    AND     src.abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     src.extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(src.extend_alg, '$.scoresMap') IS NOT NULL
+)
+-- Filter: keep only rows whose page_type is "推荐"
+,t_filtered AS
+(
+    SELECT  r.*
+    FROM    t_raw r
+    WHERE   r.page_type = "推荐"
+)
+-- Feature extraction and dimension mapping
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Collapse raw ab buckets into named groups. Only apptype "4" is mapped;
+            -- every other combination ends up as "其他".
+            ,CASE
+                WHEN apptype IN ("4") THEN
+                    CASE
+                        WHEN abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                        WHEN abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                        WHEN abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                        WHEN abcode IN ("ab2","ab3") THEN "对照组"
+                        WHEN abcode IN ("ab4","ab5") THEN "ab4-5"
+                        ELSE "其他"
+                    END
+                ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            -- scores read from scoresmap; rosn_pred is 1.22 * NorXGBScore ^ 1.15
+            ,CAST(GET_JSON_OBJECT(scoresmap, '$.fmRov') AS DOUBLE) AS str_pred
+            ,1.22 * POW(CAST(GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap, '$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            ,CAST(GET_JSON_OBJECT(scoresmap, '$.vor') AS DOUBLE) AS vor_stat
+            ,GET_JSON_OBJECT(v1_feature, '$.title') AS vid_title
+    FROM    t_filtered
+)
+-- Keep only rows where both str_pred and rosn_pred are present
+,t_valid AS
+(
+    SELECT  b.*
+    FROM    t_base b
+    WHERE   b.str_pred IS NOT NULL
+    AND     b.rosn_pred IS NOT NULL
+)
+-- Rank every vid by exposure count within each (dt, apptype, abcode) group
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            -- vid is a secondary sort key so that vids with the same exposure count
+            -- receive the same rank on every run (ROW_NUMBER alone breaks ties arbitrarily)
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC, vid) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+-- Top-ranked vids per group (the CTE name says "top5"; the cutoff applied here is 20)
+,t_top5_vid AS
+(
+    SELECT  r.dt
+            ,r.apptype
+            ,r.abcode
+            ,r.vid
+            ,r.vid_rank
+    FROM    t_vid_rank r
+    WHERE   r.vid_rank <= 20
+)
+-- Tag each exposure with its top-20 vid; rows of other vids get NULL in the three top5_* columns
+,t_with_top5 AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.vid IS NULL THEN NULL ELSE a.vid END AS top5_vid
+            ,CASE WHEN b.vid IS NULL THEN NULL ELSE a.vid_title END AS top5_vid_title
+            ,b.vid_rank AS top5_vid_rank
+    FROM    t_valid a
+    LEFT JOIN t_top5_vid b
+    ON      a.vid = b.vid
+    AND     a.abcode = b.abcode
+    AND     a.apptype = b.apptype
+    AND     a.dt = b.dt
+)
+-- Aggregate twice via GROUPING SETS: once per (dt, apptype, abcode) and once per
+-- (dt, apptype, abcode, top-20 vid).
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            -- 'all' labels the group-level row, where top5_vid is rolled up
+            ,COALESCE(CAST(top5_vid AS STRING), 'all') AS vid
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_rank) END AS vid_rank
+            -- COPC: observed value divided by the mean predicted value
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions and observed values
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(COALESCE(SUM(vor_stat) / COUNT(1),0),6) AS vor_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- score: str * ros * vor
+            -- NOTE(review): score_pred has no vor factor and is the same expression as
+            -- rovn_pred; score_real is the same expression as rovn_real. Confirm this is
+            -- intended given the str * ros * vor formula.
+            ,round(AVG(str_pred * rosn_pred), 6) AS score_pred
+            ,round(AVG(str_pred * rosn_stat * vor_stat), 6) AS score_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS score_real
+            -- Mean absolute error against return_n_uv_noself
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- Business metrics. Aggregate denominators go through NULLIF(..., 0), like the
+            -- COPC ratios above, so a zero denominator gives NULL and COALESCE reports 0.
+            ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            -- Raw counts
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top5
+    GROUP BY dt, apptype, abcode, top5_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top5_vid)
+    )
+    -- drop the per-vid bucket of rows that are not in the top 20 (top5_vid NULL without roll-up)
+    HAVING  top5_vid IS NOT NULL OR GROUPING(top5_vid) = 1
+)
+-- Attach to every t_agg row the value each metric takes in the control group
+-- (abcode = '对照组') for the same (dt, apptype, vid), as *_base columns.
+-- The final SELECT turns these into change rates.
+,t_with_baseline AS
+(
+    SELECT  *
+            -- Exposure share: this row's exp over the exp of the vid = 'all' row of the same (dt, apptype, abcode)
+            ,round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+            -- Control-group baselines (business metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp_per_dau END) OVER (PARTITION BY dt, apptype, vid) AS exp_per_dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_one END) OVER (PARTITION BY dt, apptype, vid) AS str_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_one END) OVER (PARTITION BY dt, apptype, vid) AS ros_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str END) OVER (PARTITION BY dt, apptype, vid) AS str_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros END) OVER (PARTITION BY dt, apptype, vid) AS ros_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_plus END) OVER (PARTITION BY dt, apptype, vid) AS str_plus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_minus END) OVER (PARTITION BY dt, apptype, vid) AS ros_minus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn END) OVER (PARTITION BY dt, apptype, vid) AS rovn_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN vovh24 END) OVER (PARTITION BY dt, apptype, vid) AS vovh24_base
+            -- Control-group baselines (COPC metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_copc END) OVER (PARTITION BY dt, apptype, vid) AS str_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_stat_copc_base
+            -- Control-group baselines (observed values)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_real END) OVER (PARTITION BY dt, apptype, vid) AS str_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_real END) OVER (PARTITION BY dt, apptype, vid) AS rosn_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_real END) OVER (PARTITION BY dt, apptype, vid) AS rovn_real_base
+            -- Control-group baselines (vor and score)
+            ,MAX(CASE WHEN abcode = '对照组' THEN vor_stat END) OVER (PARTITION BY dt, apptype, vid) AS vor_stat_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN score_pred END) OVER (PARTITION BY dt, apptype, vid) AS score_pred_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN score_stat END) OVER (PARTITION BY dt, apptype, vid) AS score_stat_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN score_real END) OVER (PARTITION BY dt, apptype, vid) AS score_real_base
+            -- Control-group baselines (count metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN dau END) OVER (PARTITION BY dt, apptype, vid) AS dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp END) OVER (PARTITION BY dt, apptype, vid) AS exp_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_share END) OVER (PARTITION BY dt, apptype, vid) AS is_share_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN share_cnt END) OVER (PARTITION BY dt, apptype, vid) AS share_cnt_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_return_1 END) OVER (PARTITION BY dt, apptype, vid) AS is_return_1_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN viewh24 END) OVER (PARTITION BY dt, apptype, vid) AS viewh24_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv_noself END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_noself_base
+    FROM    t_agg
+)
+-- Final output: the aggregated columns plus change rates relative to the control group.
+-- Every *_chg column is (value - base) / base and is NULL when base is 0 or missing.
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,vid
+        ,vid_title
+        ,vid_rank
+        ,exp_pct
+        ,ROUND((dau - dau_base) / NULLIF(dau_base, 0), 4) AS dau_chg
+        ,ROUND((exp - exp_base) / NULLIF(exp_base, 0), 4) AS exp_chg
+        -- COPC
+        ,str_copc
+        ,rosn_copc
+        ,rosn_stat_copc
+        ,rovn_copc
+        ,rovn_stat_copc
+        -- Model predictions and observed values
+        ,str_real
+        ,str_pred
+        ,rosn_real
+        ,rosn_pred
+        ,rosn_stat
+        ,vor_stat
+        ,rovn_real
+        ,rovn_pred
+        ,rovn_stat
+        -- score: str * ros * vor
+        ,score_pred
+        ,score_stat
+        ,score_real
+        ,rosn_pred_mae
+        ,rosn_stat_mae
+        -- Business metrics
+        ,exp_per_dau
+        ,str_one
+        ,ros_one
+        ,str
+        ,ros
+        ,str_plus
+        ,ros_minus
+        ,rovn
+        ,vovh24
+        -- Counts
+        ,dau
+        ,exp
+        ,is_share
+        ,share_cnt
+        ,is_return_1
+        ,return_n_uv
+        ,viewh24
+        ,return_n_uv_noself
+        -- ========== Change-rate columns ==========
+        -- Business metrics
+        ,ROUND((exp_per_dau - exp_per_dau_base) / NULLIF(exp_per_dau_base, 0), 4) AS exp_per_dau_chg
+        ,ROUND((str_one - str_one_base) / NULLIF(str_one_base, 0), 4) AS str_one_chg
+        ,ROUND((ros_one - ros_one_base) / NULLIF(ros_one_base, 0), 4) AS ros_one_chg
+        ,ROUND((str - str_base) / NULLIF(str_base, 0), 4) AS str_chg
+        ,ROUND((ros - ros_base) / NULLIF(ros_base, 0), 4) AS ros_chg
+        ,ROUND((str_plus - str_plus_base) / NULLIF(str_plus_base, 0), 4) AS str_plus_chg
+        ,ROUND((ros_minus - ros_minus_base) / NULLIF(ros_minus_base, 0), 4) AS ros_minus_chg
+        ,ROUND((rovn - rovn_base) / NULLIF(rovn_base, 0), 4) AS rovn_chg
+        ,ROUND((vovh24 - vovh24_base) / NULLIF(vovh24_base, 0), 4) AS vovh24_chg
+        -- COPC
+        ,ROUND((str_copc - str_copc_base) / NULLIF(str_copc_base, 0), 4) AS str_copc_chg
+        ,ROUND((rosn_copc - rosn_copc_base) / NULLIF(rosn_copc_base, 0), 4) AS rosn_copc_chg
+        ,ROUND((rosn_stat_copc - rosn_stat_copc_base) / NULLIF(rosn_stat_copc_base, 0), 4) AS rosn_stat_copc_chg
+        ,ROUND((rovn_copc - rovn_copc_base) / NULLIF(rovn_copc_base, 0), 4) AS rovn_copc_chg
+        ,ROUND((rovn_stat_copc - rovn_stat_copc_base) / NULLIF(rovn_stat_copc_base, 0), 4) AS rovn_stat_copc_chg
+        -- Observed values
+        ,ROUND((str_real - str_real_base) / NULLIF(str_real_base, 0), 4) AS str_real_chg
+        ,ROUND((rosn_real - rosn_real_base) / NULLIF(rosn_real_base, 0), 4) AS rosn_real_chg
+        ,ROUND((rovn_real - rovn_real_base) / NULLIF(rovn_real_base, 0), 4) AS rovn_real_chg
+        -- vor and score
+        ,ROUND((vor_stat - vor_stat_base) / NULLIF(vor_stat_base, 0), 4) AS vor_stat_chg
+        ,ROUND((score_pred - score_pred_base) / NULLIF(score_pred_base, 0), 4) AS score_pred_chg
+        ,ROUND((score_stat - score_stat_base) / NULLIF(score_stat_base, 0), 4) AS score_stat_chg
+        ,ROUND((score_real - score_real_base) / NULLIF(score_real_base, 0), 4) AS score_real_chg
+        -- Counts
+        ,ROUND((is_share - is_share_base) / NULLIF(is_share_base, 0), 4) AS is_share_chg
+        ,ROUND((share_cnt - share_cnt_base) / NULLIF(share_cnt_base, 0), 4) AS share_cnt_chg
+        ,ROUND((is_return_1 - is_return_1_base) / NULLIF(is_return_1_base, 0), 4) AS is_return_1_chg
+        ,ROUND((return_n_uv - return_n_uv_base) / NULLIF(return_n_uv_base, 0), 4) AS return_n_uv_chg
+        ,ROUND((viewh24 - viewh24_base) / NULLIF(viewh24_base, 0), 4) AS viewh24_chg
+        ,ROUND((return_n_uv_noself - return_n_uv_noself_base) / NULLIF(return_n_uv_noself_base, 0), 4) AS return_n_uv_noself_chg
+FROM    t_with_baseline
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 178 - 0
tasks/承接/rosn分析/06_rosn_gcindex.sql

@@ -0,0 +1,178 @@
+-- ROSN G-C-Index 指标计算(简化版)
+-- 只计算实验组粒度的 GAUC 类指标:
+--   str_gauc: 分享率预测的 GAUC(二分类,标签=is_return_noself>0)
+--   rov_gauc: ROV 预测的 GAUC(二分类,预测=str*rosn)
+--   rosn_gcindex: ROSN 预测的 G-C-Index(连续值,仅在有回流样本中计算)
+--   rosn_stat_gcindex: ROSN 统计量的 G-C-Index(连续值,仅在有回流样本中计算)
+
+-- ========== Preprocessing: extract scoresMap ==========
+WITH t_raw AS
+(
+    SELECT  src.*
+            -- scoresMap with every backslash stripped so it can be parsed again downstream
+            ,REPLACE(GET_JSON_OBJECT(src.extend_alg, '$.scoresMap'), "\\", "") AS scoresmap
+            -- page bucket: four feed/detail pages -> "推荐", two listed pages -> "非推荐", rest -> "其他"
+            ,CASE
+                WHEN src.page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                WHEN src.page IN ("回流页","其他") THEN "非推荐"
+                ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212 src
+    WHERE   src.dt = '${dt}'
+    AND     src.apptype IN ("0","4")
+    -- only ab0..ab9 are kept, so any other bucket (e.g. "ab100") is already excluded
+    AND     src.abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     src.extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(src.extend_alg, '$.scoresMap') IS NOT NULL
+)
+-- Filter: keep only rows whose page_type is "推荐"
+,t_filtered AS
+(
+    SELECT  r.*
+    FROM    t_raw r
+    WHERE   r.page_type = "推荐"
+)
+-- Feature extraction and dimension mapping
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Collapse raw ab buckets into named groups. Only apptype "4" is mapped;
+            -- every other combination ends up as "其他".
+            ,CASE
+                WHEN apptype IN ("4") THEN
+                    CASE
+                        WHEN abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                        WHEN abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                        WHEN abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                        WHEN abcode IN ("ab2","ab3") THEN "对照组"
+                        WHEN abcode IN ("ab4","ab5") THEN "ab4-5"
+                        ELSE "其他"
+                    END
+                ELSE "其他"
+            END AS abcode
+            ,mid
+            ,vid
+            ,is_return_noself
+            ,return_n_uv_noself
+            ,scoresmap
+            -- scores read from scoresmap; rosn_pred is 1.22 * NorXGBScore ^ 1.15
+            ,CAST(GET_JSON_OBJECT(scoresmap, '$.fmRov') AS DOUBLE) AS str_pred
+            ,1.22 * POW(CAST(GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap, '$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+    FROM    t_filtered
+)
+-- Keep only rows where both str_pred and rosn_pred are present
+,t_valid AS
+(
+    SELECT  b.*
+    FROM    t_base b
+    WHERE   b.str_pred IS NOT NULL
+    AND     b.rosn_pred IS NOT NULL
+)
+
+-- ========== str_gauc & rov_gauc (binary-label GAUC) ==========
+-- 1. Per-user ascending rank of each prediction.
+--    Ranks are mid-ranks: rows of one user that share a score all receive the
+--    average of the positions they occupy, RANK() + (tie_count - 1) / 2.
+--    With the rank-sum AUC formula a tied positive/negative pair then counts 0.5,
+--    the same credit the C-Index self-join in this file gives to tied predictions.
+--    ROW_NUMBER() would order tied rows arbitrarily, so the AUC could differ between runs.
+,t_with_rank AS
+(
+    SELECT  *
+            ,str_pred * rosn_pred AS rov_pred
+            ,str_pred * rosn_stat AS rov_stat_pred
+            ,RANK() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY str_pred)
+                + (COUNT(1) OVER (PARTITION BY dt, apptype, abcode, mid, str_pred) - 1) / 2.0 AS str_rank_user
+            ,RANK() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY str_pred * rosn_pred)
+                + (COUNT(1) OVER (PARTITION BY dt, apptype, abcode, mid, str_pred * rosn_pred) - 1) / 2.0 AS rov_rank_user
+            ,RANK() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY str_pred * rosn_stat)
+                + (COUNT(1) OVER (PARTITION BY dt, apptype, abcode, mid, str_pred * rosn_stat) - 1) / 2.0 AS rov_stat_rank_user
+    FROM    t_valid
+)
+-- 2. Per-user inputs of the rank-sum AUC (str, rov, rov_stat):
+--    exposure count, positive count and the rank sum of the positive rows.
+--    A row is positive when is_return_noself > 0.
+,t_user_auc AS
+(
+    SELECT  r.dt
+            ,r.apptype
+            ,r.abcode
+            ,r.mid
+            ,COUNT(*) AS user_exp
+            ,SUM(CASE WHEN r.is_return_noself > 0 THEN 1 ELSE 0 END) AS user_n_pos
+            ,SUM(CASE WHEN r.is_return_noself > 0 THEN r.str_rank_user ELSE 0 END) AS user_sum_str_pos_rank
+            ,SUM(CASE WHEN r.is_return_noself > 0 THEN r.rov_rank_user ELSE 0 END) AS user_sum_rov_pos_rank
+            ,SUM(CASE WHEN r.is_return_noself > 0 THEN r.rov_stat_rank_user ELSE 0 END) AS user_sum_rov_stat_pos_rank
+    FROM    t_with_rank r
+    GROUP BY r.dt, r.apptype, r.abcode, r.mid
+)
+-- Per-user AUC from the rank sums, clamped to [0, 1].
+-- A user is scored only with at least 5 exposures and with both positive and
+-- negative rows; otherwise the AUC column is NULL.
+,t_user_auc_valid AS
+(
+    SELECT  u.*
+            ,CASE
+                WHEN u.user_exp >= 5 AND u.user_n_pos <> 0 AND u.user_n_pos <> u.user_exp
+                THEN GREATEST(0.0, LEAST(1.0,
+                    (u.user_sum_str_pos_rank * 1.0 / u.user_n_pos - (u.user_n_pos + 1) / 2.0) / (u.user_exp - u.user_n_pos)
+                ))
+            END AS user_str_auc
+            ,CASE
+                WHEN u.user_exp >= 5 AND u.user_n_pos <> 0 AND u.user_n_pos <> u.user_exp
+                THEN GREATEST(0.0, LEAST(1.0,
+                    (u.user_sum_rov_pos_rank * 1.0 / u.user_n_pos - (u.user_n_pos + 1) / 2.0) / (u.user_exp - u.user_n_pos)
+                ))
+            END AS user_rov_auc
+            ,CASE
+                WHEN u.user_exp >= 5 AND u.user_n_pos <> 0 AND u.user_n_pos <> u.user_exp
+                THEN GREATEST(0.0, LEAST(1.0,
+                    (u.user_sum_rov_stat_pos_rank * 1.0 / u.user_n_pos - (u.user_n_pos + 1) / 2.0) / (u.user_exp - u.user_n_pos)
+                ))
+            END AS user_rov_stat_auc
+    FROM    t_user_auc u
+)
+-- Group-level GAUC: exposure-weighted mean of the per-user AUCs.
+-- Users with a NULL AUC contribute to neither numerator nor denominator.
+,t_gauc AS
+(
+    SELECT  v.dt
+            ,v.apptype
+            ,v.abcode
+            ,ROUND(
+                SUM(v.user_exp * v.user_str_auc)
+                / NULLIF(SUM(CASE WHEN v.user_str_auc IS NOT NULL THEN v.user_exp END), 0)
+             , 6) AS str_gauc
+            ,ROUND(
+                SUM(v.user_exp * v.user_rov_auc)
+                / NULLIF(SUM(CASE WHEN v.user_rov_auc IS NOT NULL THEN v.user_exp END), 0)
+             , 6) AS rov_gauc
+            ,ROUND(
+                SUM(v.user_exp * v.user_rov_stat_auc)
+                / NULLIF(SUM(CASE WHEN v.user_rov_stat_auc IS NOT NULL THEN v.user_exp END), 0)
+             , 6) AS rov_stat_gauc
+    FROM    t_user_auc_valid v
+    GROUP BY v.dt, v.apptype, v.abcode
+)
+
+-- ========== rosn_gcindex (continuous-label C-Index, returned samples only) ==========
+-- Keep only the samples with is_return_noself > 0
+,t_has_return AS
+(
+    SELECT  v.*
+    FROM    t_valid v
+    WHERE   v.is_return_noself > 0
+)
+-- 1. Per-user self-join that enumerates sample pairs.
+--    Only pairs whose return_n_uv_noself differ are kept, oriented so that a > b.
+--    Each pair scores 1.0 when the prediction is ordered the same way, 0.5 on a tie, 0.0 otherwise.
+,t_user_pairs AS
+(
+    SELECT  a.dt, a.apptype, a.abcode, a.mid
+            ,CASE WHEN a.rosn_pred > b.rosn_pred THEN 1.0
+                  WHEN a.rosn_pred = b.rosn_pred THEN 0.5
+                  ELSE 0.0 END AS concordant_pred
+            -- rosn_stat is not filtered for NULL upstream. A pair with a missing rosn_stat
+            -- cannot be ordered, so it yields NULL (skipped by AVG) instead of falling
+            -- through to ELSE and being counted as discordant.
+            ,CASE WHEN a.rosn_stat IS NULL OR b.rosn_stat IS NULL THEN NULL
+                  WHEN a.rosn_stat > b.rosn_stat THEN 1.0
+                  WHEN a.rosn_stat = b.rosn_stat THEN 0.5
+                  ELSE 0.0 END AS concordant_stat
+    FROM    t_has_return a
+    JOIN    t_has_return b
+    ON      a.dt = b.dt AND a.apptype = b.apptype AND a.abcode = b.abcode
+    AND     a.mid = b.mid
+    AND     a.return_n_uv_noself > b.return_n_uv_noself
+)
+-- 2. Per-user C-Index: mean pair score, for users with at least 3 pairs
+,t_user_cindex AS
+(
+    SELECT  p.dt
+            ,p.apptype
+            ,p.abcode
+            ,p.mid
+            ,COUNT(*) AS user_pairs
+            ,AVG(p.concordant_pred) AS user_cindex_pred
+            ,AVG(p.concordant_stat) AS user_cindex_stat
+    FROM    t_user_pairs p
+    GROUP BY p.dt, p.apptype, p.abcode, p.mid
+    HAVING  COUNT(*) >= 3
+)
+-- 3. Group-level C-Index: per-user C-Index averaged with pair-count weights.
+--    The denominator counts only users whose C-Index is not NULL and is guarded by
+--    NULLIF, the same pattern t_gauc uses for its exposure weights.
+,t_rosn_gcindex AS
+(
+    SELECT  dt, apptype, abcode
+            ,round(SUM(user_pairs * user_cindex_pred) / NULLIF(SUM(CASE WHEN user_cindex_pred IS NOT NULL THEN user_pairs END), 0), 6) AS rosn_gcindex
+            ,round(SUM(user_pairs * user_cindex_stat) / NULLIF(SUM(CASE WHEN user_cindex_stat IS NOT NULL THEN user_pairs END), 0), 6) AS rosn_stat_gcindex
+    FROM    t_user_cindex
+    GROUP BY dt, apptype, abcode
+)
+
+-- ========== Final output ==========
+-- One row per (dt, apptype, abcode): GAUC columns, plus the C-Index columns where available
+SELECT  g.dt
+        ,g.apptype
+        ,g.abcode
+        ,g.str_gauc
+        ,g.rov_gauc
+        ,g.rov_stat_gauc
+        ,c.rosn_gcindex
+        ,c.rosn_stat_gcindex
+FROM    t_gauc g
+LEFT JOIN t_rosn_gcindex c
+ON      c.dt = g.dt
+AND     c.apptype = g.apptype
+AND     c.abcode = g.abcode
+ORDER BY dt DESC, apptype, abcode
+;

+ 233 - 0
tasks/承接/rosn分析/07_rosn_gcindex.sql

@@ -0,0 +1,233 @@
+-- ROSN G-C-Index 指标计算(v2)
+--
+-- 过滤条件:
+--   原始版本:user_exp >= 5 且正负样本都存在
+--   新版本:user_pairs >= 3(样本对数 = n_pos × n_neg)
+--
+-- 加权方式后缀:
+--   无后缀: 原始版本(exp>=5 + 曝光量加权)
+--   _exp: 曝光量加权(业务影响)
+--   _pairs: 样本对数加权(统计最优)
+--   _equal: 等权(用户公平)
+--
+-- 指标说明:
+--   str_gauc: 原始版本(与 03_metrics 一致)
+--   str_gauc_*: 分享率预测的 GAUC(全量样本,标签=is_return_noself>0)
+--   str_gcindex: str_gauc_pairs 的 self-join 验证版本
+--   rosn_gcindex: ROSN 预测的 G-C-Index(仅有回流样本,标签=return_n_uv_noself)
+--   rosn_stat_gcindex: ROSN 统计量的 G-C-Index
+--   rov_gauc_*: ROV 预测的 GAUC(全量样本,预测=str×rosn)
+--   rov_stat_gauc_*: ROV 统计量预测的 GAUC
+
+-- ========== Preprocessing: extract scoresMap ==========
+WITH t_raw AS
+(
+    SELECT  src.*
+            -- scoresMap with every backslash stripped so it can be parsed again downstream
+            ,REPLACE(GET_JSON_OBJECT(src.extend_alg, '$.scoresMap'), "\\", "") AS scoresmap
+            -- page bucket: four feed/detail pages -> "推荐", two listed pages -> "非推荐", rest -> "其他"
+            ,CASE
+                WHEN src.page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                WHEN src.page IN ("回流页","其他") THEN "非推荐"
+                ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212 src
+    WHERE   src.dt = '${dt}'
+    AND     src.apptype IN ("0","4")
+    -- only ab0..ab9 are kept, so any other bucket (e.g. "ab100") is already excluded
+    AND     src.abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     src.extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(src.extend_alg, '$.scoresMap') IS NOT NULL
+)
+-- Filter: keep only rows whose page_type is "推荐"
+,t_filtered AS
+(
+    SELECT  r.*
+    FROM    t_raw r
+    WHERE   r.page_type = "推荐"
+)
+-- Feature extraction and dimension mapping
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Collapse raw ab buckets into named groups. Only apptype "4" is mapped;
+            -- every other combination ends up as "其他".
+            ,CASE
+                WHEN apptype IN ("4") THEN
+                    CASE
+                        WHEN abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                        WHEN abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                        WHEN abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                        WHEN abcode IN ("ab2","ab3") THEN "对照组"
+                        WHEN abcode IN ("ab4","ab5") THEN "ab4-5"
+                        ELSE "其他"
+                    END
+                ELSE "其他"
+            END AS abcode
+            ,mid
+            ,vid
+            ,is_return_noself
+            ,return_n_uv_noself
+            ,scoresmap
+            -- scores read from scoresmap; rosn_pred is 1.22 * NorXGBScore ^ 1.15
+            ,CAST(GET_JSON_OBJECT(scoresmap, '$.fmRov') AS DOUBLE) AS str_pred
+            ,1.22 * POW(CAST(GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap, '$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+    FROM    t_filtered
+)
+-- Keep rows where both str_pred and rosn_pred are present, and add the two rov products
+,t_valid AS
+(
+    SELECT  b.*
+            ,b.str_pred * b.rosn_pred AS rov_pred
+            ,b.str_pred * b.rosn_stat AS rov_stat_pred
+    FROM    t_base b
+    WHERE   b.str_pred IS NOT NULL
+    AND     b.rosn_pred IS NOT NULL
+)
+
+-- ========== 1. str_gauc (rank-sum formula) ==========
+-- Per-user ascending rank of each prediction.
+-- Ranks are mid-ranks: rows of one user that share a score all receive the
+-- average of the positions they occupy, RANK() + (tie_count - 1) / 2.
+-- With the rank-sum AUC formula a tied positive/negative pair then counts 0.5,
+-- the same credit the self-join C-Index in this file gives to tied predictions.
+-- ROW_NUMBER() would order tied rows arbitrarily, so the AUC could differ between runs.
+,t_with_rank AS
+(
+    SELECT  *
+            ,RANK() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY str_pred)
+                + (COUNT(1) OVER (PARTITION BY dt, apptype, abcode, mid, str_pred) - 1) / 2.0 AS str_rank_user
+            ,RANK() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY rov_pred)
+                + (COUNT(1) OVER (PARTITION BY dt, apptype, abcode, mid, rov_pred) - 1) / 2.0 AS rov_rank_user
+            ,RANK() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY rov_stat_pred)
+                + (COUNT(1) OVER (PARTITION BY dt, apptype, abcode, mid, rov_stat_pred) - 1) / 2.0 AS rov_stat_rank_user
+    FROM    t_valid
+)
+-- Per-user inputs of the rank-sum AUC (str, rov, rov_stat):
+-- exposure count, positive count and the rank sum of the positive rows.
+-- A row is positive when is_return_noself > 0.
+,t_user_auc AS
+(
+    SELECT  r.dt
+            ,r.apptype
+            ,r.abcode
+            ,r.mid
+            ,COUNT(*) AS user_exp
+            ,SUM(CASE WHEN r.is_return_noself > 0 THEN 1 ELSE 0 END) AS user_n_pos
+            ,SUM(CASE WHEN r.is_return_noself > 0 THEN r.str_rank_user ELSE 0 END) AS user_sum_str_pos_rank
+            ,SUM(CASE WHEN r.is_return_noself > 0 THEN r.rov_rank_user ELSE 0 END) AS user_sum_rov_pos_rank
+            ,SUM(CASE WHEN r.is_return_noself > 0 THEN r.rov_stat_rank_user ELSE 0 END) AS user_sum_rov_stat_pos_rank
+    FROM    t_with_rank r
+    GROUP BY r.dt, r.apptype, r.abcode, r.mid
+)
+-- Per-user AUC from the rank sums, clamped to [0, 1], under two eligibility rules
+,t_user_auc_valid AS
+(
+    SELECT  u.*
+            -- number of positive/negative pairs = n_pos * n_neg
+            ,u.user_n_pos * (u.user_exp - u.user_n_pos) AS user_pairs
+            -- Original rule: at least 5 exposures and both positive and negative rows
+            ,CASE
+                WHEN u.user_exp >= 5 AND u.user_n_pos <> 0 AND u.user_n_pos <> u.user_exp
+                THEN GREATEST(0.0, LEAST(1.0,
+                    (u.user_sum_str_pos_rank * 1.0 / u.user_n_pos - (u.user_n_pos + 1) / 2.0) / (u.user_exp - u.user_n_pos)
+                ))
+            END AS user_str_auc_orig
+            -- Unified rule for the three columns below: at least 3 pairs
+            ,CASE
+                WHEN u.user_n_pos * (u.user_exp - u.user_n_pos) >= 3
+                THEN GREATEST(0.0, LEAST(1.0,
+                    (u.user_sum_str_pos_rank * 1.0 / u.user_n_pos - (u.user_n_pos + 1) / 2.0) / (u.user_exp - u.user_n_pos)
+                ))
+            END AS user_str_auc
+            ,CASE
+                WHEN u.user_n_pos * (u.user_exp - u.user_n_pos) >= 3
+                THEN GREATEST(0.0, LEAST(1.0,
+                    (u.user_sum_rov_pos_rank * 1.0 / u.user_n_pos - (u.user_n_pos + 1) / 2.0) / (u.user_exp - u.user_n_pos)
+                ))
+            END AS user_rov_auc
+            ,CASE
+                WHEN u.user_n_pos * (u.user_exp - u.user_n_pos) >= 3
+                THEN GREATEST(0.0, LEAST(1.0,
+                    (u.user_sum_rov_stat_pos_rank * 1.0 / u.user_n_pos - (u.user_n_pos + 1) / 2.0) / (u.user_exp - u.user_n_pos)
+                ))
+            END AS user_rov_stat_auc
+    FROM    t_user_auc u
+)
+-- Group-level GAUC under three weightings: exposure (_exp), pair count (_pairs), equal (_equal).
+-- Users with a NULL AUC contribute to neither numerator nor denominator.
+,t_gauc AS
+(
+    SELECT  v.dt
+            ,v.apptype
+            ,v.abcode
+            -- str_gauc, original version (user_exp >= 5 rule, exposure-weighted)
+            ,ROUND(
+                SUM(v.user_exp * v.user_str_auc_orig)
+                / NULLIF(SUM(CASE WHEN v.user_str_auc_orig IS NOT NULL THEN v.user_exp END), 0)
+             , 6) AS str_gauc
+            -- str: three weightings
+            ,ROUND(
+                SUM(v.user_exp * v.user_str_auc)
+                / NULLIF(SUM(CASE WHEN v.user_str_auc IS NOT NULL THEN v.user_exp END), 0)
+             , 6) AS str_gauc_exp
+            ,ROUND(
+                SUM(v.user_pairs * v.user_str_auc)
+                / NULLIF(SUM(CASE WHEN v.user_str_auc IS NOT NULL THEN v.user_pairs END), 0)
+             , 6) AS str_gauc_pairs
+            ,ROUND(AVG(v.user_str_auc), 6) AS str_gauc_equal
+            -- rov: three weightings
+            ,ROUND(
+                SUM(v.user_exp * v.user_rov_auc)
+                / NULLIF(SUM(CASE WHEN v.user_rov_auc IS NOT NULL THEN v.user_exp END), 0)
+             , 6) AS rov_gauc_exp
+            ,ROUND(
+                SUM(v.user_pairs * v.user_rov_auc)
+                / NULLIF(SUM(CASE WHEN v.user_rov_auc IS NOT NULL THEN v.user_pairs END), 0)
+             , 6) AS rov_gauc_pairs
+            ,ROUND(AVG(v.user_rov_auc), 6) AS rov_gauc_equal
+            -- rov_stat: three weightings
+            ,ROUND(
+                SUM(v.user_exp * v.user_rov_stat_auc)
+                / NULLIF(SUM(CASE WHEN v.user_rov_stat_auc IS NOT NULL THEN v.user_exp END), 0)
+             , 6) AS rov_stat_gauc_exp
+            ,ROUND(
+                SUM(v.user_pairs * v.user_rov_stat_auc)
+                / NULLIF(SUM(CASE WHEN v.user_rov_stat_auc IS NOT NULL THEN v.user_pairs END), 0)
+             , 6) AS rov_stat_gauc_pairs
+            ,ROUND(AVG(v.user_rov_stat_auc), 6) AS rov_stat_gauc_equal
+    FROM    t_user_auc_valid v
+    GROUP BY v.dt, v.apptype, v.abcode
+)
+
+-- ========== 2. rosn_gauc (returned samples only, continuous label) ==========
+-- Keep only the samples with is_return_noself > 0
+,t_has_return AS
+(
+    SELECT  v.*
+    FROM    t_valid v
+    WHERE   v.is_return_noself > 0
+)
+-- Per-user self-join that enumerates sample pairs with different return_n_uv_noself,
+-- oriented so that a > b. Each pair scores 1.0 when the prediction is ordered the
+-- same way, 0.5 on a tie, 0.0 otherwise.
+,t_rosn_pairs AS
+(
+    SELECT  a.dt, a.apptype, a.abcode, a.mid
+            ,CASE WHEN a.rosn_pred > b.rosn_pred THEN 1.0
+                  WHEN a.rosn_pred = b.rosn_pred THEN 0.5
+                  ELSE 0.0 END AS concordant_pred
+            -- rosn_stat is not filtered for NULL upstream. A pair with a missing rosn_stat
+            -- cannot be ordered, so it yields NULL (skipped by AVG) instead of falling
+            -- through to ELSE and being counted as discordant.
+            ,CASE WHEN a.rosn_stat IS NULL OR b.rosn_stat IS NULL THEN NULL
+                  WHEN a.rosn_stat > b.rosn_stat THEN 1.0
+                  WHEN a.rosn_stat = b.rosn_stat THEN 0.5
+                  ELSE 0.0 END AS concordant_stat
+    FROM    t_has_return a
+    JOIN    t_has_return b
+    ON      a.dt = b.dt AND a.apptype = b.apptype AND a.abcode = b.abcode
+    AND     a.mid = b.mid
+    AND     a.return_n_uv_noself > b.return_n_uv_noself
+)
+-- Per-user number of returned samples, used as the exposure weight
+,t_user_rosn_exp AS
+(
+    SELECT  h.dt
+            ,h.apptype
+            ,h.abcode
+            ,h.mid
+            ,COUNT(*) AS user_exp
+    FROM    t_has_return h
+    GROUP BY h.dt, h.apptype, h.abcode, h.mid
+)
+-- Per-user C-Index (mean pair score) with its two weights, for users with at least 3 pairs
+,t_user_rosn_cindex AS
+(
+    SELECT  p.dt
+            ,p.apptype
+            ,p.abcode
+            ,p.mid
+            ,e.user_exp
+            ,COUNT(*) AS user_pairs
+            ,AVG(p.concordant_pred) AS user_cindex_pred
+            ,AVG(p.concordant_stat) AS user_cindex_stat
+    FROM    t_rosn_pairs p
+    JOIN    t_user_rosn_exp e
+    ON      p.dt = e.dt
+    AND     p.apptype = e.apptype
+    AND     p.abcode = e.abcode
+    AND     p.mid = e.mid
+    GROUP BY p.dt, p.apptype, p.abcode, p.mid, e.user_exp
+    HAVING  COUNT(*) >= 3
+)
+-- Group-level rosn C-Index under three weightings: 曝光 = exposure, 样本对 = pair count, 等权 = equal.
+-- Every weighted mean uses the same denominator pattern: only users whose C-Index
+-- is not NULL are counted, and NULLIF guards against a zero total weight.
+,t_rosn_gcindex AS
+(
+    SELECT  dt, apptype, abcode
+            -- rosn: three weightings
+            ,round(SUM(user_exp * user_cindex_pred) / NULLIF(SUM(CASE WHEN user_cindex_pred IS NOT NULL THEN user_exp END), 0), 6) AS `rosn_gauc_曝光`
+            ,round(SUM(user_pairs * user_cindex_pred) / NULLIF(SUM(CASE WHEN user_cindex_pred IS NOT NULL THEN user_pairs END), 0), 6) AS `rosn_gauc_样本对`
+            ,round(AVG(user_cindex_pred), 6) AS `rosn_gauc_等权`
+            -- rosn_stat: three weightings
+            ,round(SUM(user_exp * user_cindex_stat) / NULLIF(SUM(CASE WHEN user_cindex_stat IS NOT NULL THEN user_exp END), 0), 6) AS `rosn_stat_gauc_曝光`
+            ,round(SUM(user_pairs * user_cindex_stat) / NULLIF(SUM(CASE WHEN user_cindex_stat IS NOT NULL THEN user_pairs END), 0), 6) AS `rosn_stat_gauc_样本对`
+            ,round(AVG(user_cindex_stat), 6) AS `rosn_stat_gauc_等权`
+    FROM    t_user_rosn_cindex
+    GROUP BY dt, apptype, abcode
+)
+
+-- ========== Final output ==========
+-- One row per (dt, apptype, abcode). Column suffixes: 曝光 = exposure-weighted,
+-- 样本对 = pair-count-weighted, 等权 = equal-weighted.
+SELECT  a.dt
+        ,a.apptype
+        ,a.abcode
+        -- str gauc, three weightings, all under the user_pairs >= 3 rule
+        -- (the exposure-weighted column reads str_gauc_exp, like rov / rov_stat below)
+        ,a.str_gauc_exp AS `str_gauc_曝光`
+        ,a.str_gauc_pairs AS `str_gauc_样本对`
+        ,a.str_gauc_equal AS `str_gauc_等权`
+        -- rosn gauc (returned samples only), three weightings
+        ,c.`rosn_gauc_曝光`
+        ,c.`rosn_gauc_样本对`
+        ,c.`rosn_gauc_等权`
+        ,c.`rosn_stat_gauc_曝光`
+        ,c.`rosn_stat_gauc_样本对`
+        ,c.`rosn_stat_gauc_等权`
+        -- rov gauc, three weightings
+        ,a.rov_gauc_exp AS `rov_gauc_曝光`
+        ,a.rov_gauc_pairs AS `rov_gauc_样本对`
+        ,a.rov_gauc_equal AS `rov_gauc_等权`
+        -- rov_stat gauc, three weightings
+        ,a.rov_stat_gauc_exp AS `rov_stat_gauc_曝光`
+        ,a.rov_stat_gauc_pairs AS `rov_stat_gauc_样本对`
+        ,a.rov_stat_gauc_equal AS `rov_stat_gauc_等权`
+        -- original str gauc (user_exp >= 5 rule, exposure-weighted), appended as its own column
+        ,a.str_gauc
+FROM    t_gauc a
+LEFT JOIN t_rosn_gcindex c
+ON      a.dt = c.dt AND a.apptype = c.apptype AND a.abcode = c.abcode
+ORDER BY dt DESC, apptype, abcode
+;

+ 229 - 0
tasks/承接/rosn分析/07v2_rosn_gcindex.sql

@@ -0,0 +1,229 @@
+-- ROSN G-C-Index 指标计算(v2.3)
+-- 全部指标改为 C-Index 方式:rosn/rov 均基于连续值标签的 self-join
+-- 新增:差值加权(pair_diff 加权,仅限连续值指标)
+--
+-- 过滤条件:user_pairs >= 3
+--
+-- 加权方式后缀(中文):
+--   曝光: 曝光量加权(业务影响)
+--   样本对: 样本对数加权(统计最优)
+--   等权: 等权(用户公平)
+--   diff: 用户内线性 diff 加权 + 用户间等权
+--
+-- 指标说明:
+--   str_gauc_*: 分享率预测的 GAUC(全量样本,二分类,标签=is_return_noself>0,秩和公式)
+--   rosn_gauc_*: ROSN 预测的 C-Index(仅有回流样本,连续值,标签=return_n_uv_noself)
+--   rosn_stat_gauc_*: ROSN 统计量的 C-Index
+--   rov_gauc_*: ROV 预测的 C-Index(仅有回流样本,连续值,预测=str×rosn)
+--   rov_stat_gauc_*: ROV 统计量的 C-Index(预测=str×rosn_stat)
+
+-- ========== Preprocessing: extract scoresMap and classify pages ==========
+WITH t_raw AS (
+    -- One dt partition of the sample table, restricted to apptype 0/4 and buckets
+    -- ab0-ab9, keeping only rows whose extend_alg JSON contains a scoresMap entry.
+    SELECT
+        *,
+        -- scoresMap with backslashes stripped; t_base reads it with GET_JSON_OBJECT
+        REPLACE(GET_JSON_OBJECT(extend_alg, '$.scoresMap'), "\\", "") AS scoresmap,
+        CASE
+            WHEN page IN ("回流后沉浸页&内页feed", "详情后沉浸页", "首页feed", "详情页") THEN "推荐"
+            WHEN page IN ("回流页", "其他") THEN "非推荐"
+            ELSE "其他"
+        END AS page_type
+    FROM loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE
+        dt = '${dt}'
+        AND apptype IN ("0", "4")
+        AND abcode IN ("ab0", "ab1", "ab2", "ab3", "ab4", "ab5", "ab6", "ab7", "ab8", "ab9")
+        AND abcode NOT IN ("ab100")  -- already implied by the IN list above
+        AND extend_alg IS NOT NULL
+        AND GET_JSON_OBJECT(extend_alg, '$.scoresMap') IS NOT NULL
+)
+, t_filtered AS (
+    -- Keep recommendation pages only (page_type = "推荐").
+    SELECT *
+    FROM t_raw
+    WHERE page_type = "推荐"
+)
+, t_base AS (
+    -- Relabel raw buckets as experiment groups and pull the model scores out of scoresmap.
+    SELECT
+        dt,
+        apptype,
+        CASE
+            WHEN apptype IN ("4") AND abcode IN ("ab0", "ab1") THEN "实验组-先验地域降权"
+            WHEN apptype IN ("4") AND abcode IN ("ab6", "ab7") THEN "实验组-str+校准&ros-统计量"
+            WHEN apptype IN ("4") AND abcode IN ("ab8", "ab9") THEN "实验组-str+校准"
+            WHEN apptype IN ("4") AND abcode IN ("ab2", "ab3") THEN "对照组"
+            WHEN apptype IN ("4") AND abcode IN ("ab4", "ab5") THEN "ab4-5"
+            ELSE "其他"
+        END AS abcode,
+        mid,
+        vid,
+        is_return_noself,
+        return_n_uv_noself,
+        scoresmap,
+        CAST(GET_JSON_OBJECT(scoresmap, '$.fmRov') AS DOUBLE) AS str_pred,
+        -- NOTE(review): 1.22 and 1.15 look like calibration constants applied to NorXGBScore — confirm their origin
+        1.22 * POW(CAST(GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred,
+        CAST(GET_JSON_OBJECT(scoresmap, '$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+    FROM t_filtered
+)
+, t_valid AS (
+    -- Rows where both str_pred and rosn_pred are present, plus the two rov scores.
+    -- rosn_stat is not filtered here, so rosn_stat and rov_stat_pred may be NULL.
+    SELECT
+        *,
+        str_pred * rosn_pred AS rov_pred,
+        str_pred * rosn_stat AS rov_stat_pred
+    FROM t_base
+    WHERE
+        str_pred IS NOT NULL
+        AND rosn_pred IS NOT NULL
+)
+
+-- ========== 1. str_gauc (rank-sum formula, binary label) ==========
+-- Adds each row's rank of str_pred within its (dt, apptype, abcode, mid) group.
+,t_with_rank AS
+(
+    SELECT  *
+            -- Midrank (average rank over ties): RANK() yields the lowest rank of a tie
+            -- group and (tie_count - 1) / 2.0 shifts it to the group's mean rank.
+            -- ROW_NUMBER() ordered ties arbitrarily, which made the rank-sum AUC vary
+            -- between runs and gave tied positive/negative pairs no 0.5 credit, unlike
+            -- the tie handling in t_rosn_pairs.
+            ,RANK() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY str_pred)
+                + (COUNT(1) OVER (PARTITION BY dt, apptype, abcode, mid, str_pred) - 1) / 2.0 AS str_rank_user
+    FROM    t_valid
+)
+, t_user_auc AS (
+    -- Per-user inputs of the rank-sum AUC: exposures, positives, and the rank sum of positives.
+    -- A row is positive when is_return_noself > 0.
+    SELECT
+        dt,
+        apptype,
+        abcode,
+        mid,
+        COUNT(1) AS user_exp,
+        SUM(CASE WHEN is_return_noself > 0 THEN 1 ELSE 0 END) AS user_n_pos,
+        SUM(CASE WHEN is_return_noself > 0 THEN str_rank_user ELSE 0 END) AS user_sum_str_pos_rank
+    FROM t_with_rank
+    GROUP BY
+        dt,
+        apptype,
+        abcode,
+        mid
+)
+, t_user_auc_valid AS (
+    -- Per-user AUC = (mean positive rank - (n_pos + 1) / 2) / n_neg, clamped to [0, 1].
+    -- Users with fewer than 3 positive-negative pairs get NULL, which also avoids
+    -- dividing by zero when a user has no positives or no negatives.
+    SELECT
+        *,
+        user_n_pos * (user_exp - user_n_pos) AS user_pairs,
+        CASE
+            WHEN user_n_pos * (user_exp - user_n_pos) < 3 THEN NULL
+            ELSE GREATEST(
+                0.0,
+                LEAST(
+                    1.0,
+                    (user_sum_str_pos_rank * 1.0 / user_n_pos - (user_n_pos + 1) / 2.0) / (user_exp - user_n_pos)
+                )
+            )
+        END AS user_str_auc
+    FROM t_user_auc
+)
+, t_gauc AS (
+    -- Group-level str GAUC under exposure, pair-count and equal weighting.
+    -- Users whose user_str_auc is NULL are left out of every numerator and denominator.
+    SELECT
+        dt,
+        apptype,
+        abcode,
+        ROUND(
+            SUM(user_exp * user_str_auc)
+            / NULLIF(SUM(CASE WHEN user_str_auc IS NOT NULL THEN user_exp END), 0),
+            6
+        ) AS str_gauc_exp,
+        ROUND(
+            SUM(user_pairs * user_str_auc)
+            / NULLIF(SUM(CASE WHEN user_str_auc IS NOT NULL THEN user_pairs END), 0),
+            6
+        ) AS str_gauc_pairs,
+        ROUND(AVG(user_str_auc), 6) AS str_gauc_equal
+    FROM t_user_auc_valid
+    GROUP BY
+        dt,
+        apptype,
+        abcode
+)
+
+-- ========== 2. rosn / rov GAUC (samples with returns only, continuous label) ==========
+, t_has_return AS (
+    -- Samples that produced at least one non-self return.
+    SELECT *
+    FROM t_valid
+    WHERE is_return_noself > 0
+)
+, t_rosn_pairs AS (
+    -- Self-join within one user: every ordered pair (a, b) where a's label
+    -- return_n_uv_noself is strictly greater than b's. A score is concordant (1.0)
+    -- when it ranks a above b, tied (0.5) when equal, otherwise 0.0.
+    -- NOTE(review): a NULL rosn_stat / rov_stat_pred on either side makes both
+    -- comparisons NULL and therefore lands in ELSE 0.0 — confirm that is intended.
+    SELECT
+        a.dt,
+        a.apptype,
+        a.abcode,
+        a.mid,
+        a.return_n_uv_noself - b.return_n_uv_noself AS pair_diff,  -- label gap of the pair
+        -- rosn concordance
+        CASE
+            WHEN a.rosn_pred > b.rosn_pred THEN 1.0
+            WHEN a.rosn_pred = b.rosn_pred THEN 0.5
+            ELSE 0.0
+        END AS concordant_pred,
+        CASE
+            WHEN a.rosn_stat > b.rosn_stat THEN 1.0
+            WHEN a.rosn_stat = b.rosn_stat THEN 0.5
+            ELSE 0.0
+        END AS concordant_stat,
+        -- rov concordance
+        CASE
+            WHEN a.rov_pred > b.rov_pred THEN 1.0
+            WHEN a.rov_pred = b.rov_pred THEN 0.5
+            ELSE 0.0
+        END AS concordant_rov,
+        CASE
+            WHEN a.rov_stat_pred > b.rov_stat_pred THEN 1.0
+            WHEN a.rov_stat_pred = b.rov_stat_pred THEN 0.5
+            ELSE 0.0
+        END AS concordant_rov_stat
+    FROM t_has_return a
+    INNER JOIN t_has_return b
+        ON a.dt = b.dt
+        AND a.apptype = b.apptype
+        AND a.abcode = b.abcode
+        AND a.mid = b.mid
+        AND a.return_n_uv_noself > b.return_n_uv_noself
+)
+, t_user_rosn_exp AS (
+    -- Number of samples with returns per user.
+    SELECT
+        dt,
+        apptype,
+        abcode,
+        mid,
+        COUNT(1) AS user_exp
+    FROM t_has_return
+    GROUP BY
+        dt,
+        apptype,
+        abcode,
+        mid
+)
+, t_user_rosn_cindex AS (
+    -- Per-user C-index for each score: plain mean over pairs and a pair_diff-weighted mean.
+    -- Users with fewer than 3 pairs are dropped by the HAVING clause.
+    SELECT
+        a.dt,
+        a.apptype,
+        a.abcode,
+        a.mid,
+        b.user_exp,
+        COUNT(1) AS user_pairs,
+        -- rosn: equal weight, then label-gap weight
+        AVG(concordant_pred) AS user_cindex_pred,
+        AVG(concordant_stat) AS user_cindex_stat,
+        SUM(pair_diff * concordant_pred) / SUM(pair_diff) AS user_cindex_pred_diff,
+        SUM(pair_diff * concordant_stat) / SUM(pair_diff) AS user_cindex_stat_diff,
+        -- rov: equal weight, then label-gap weight
+        AVG(concordant_rov) AS user_cindex_rov,
+        AVG(concordant_rov_stat) AS user_cindex_rov_stat,
+        SUM(pair_diff * concordant_rov) / SUM(pair_diff) AS user_cindex_rov_diff,
+        SUM(pair_diff * concordant_rov_stat) / SUM(pair_diff) AS user_cindex_rov_stat_diff
+    FROM t_rosn_pairs a
+    INNER JOIN t_user_rosn_exp b
+        ON a.dt = b.dt
+        AND a.apptype = b.apptype
+        AND a.abcode = b.abcode
+        AND a.mid = b.mid
+    GROUP BY
+        a.dt,
+        a.apptype,
+        a.abcode,
+        a.mid,
+        b.user_exp
+    HAVING COUNT(1) >= 3
+)
+, t_rosn_gcindex AS (
+    -- Group-level C-index for rosn, rosn_stat, rov and rov_stat. Each has four variants:
+    -- exposure-weighted, pair-count-weighted, equal-weighted, and the equal-weighted
+    -- mean of the per-user label-gap-weighted value (*_diff).
+    SELECT
+        dt,
+        apptype,
+        abcode,
+        -- rosn
+        ROUND(SUM(user_exp * user_cindex_pred) / NULLIF(SUM(CASE WHEN user_cindex_pred IS NOT NULL THEN user_exp END), 0), 6) AS `rosn_gauc_曝光`,
+        ROUND(SUM(user_pairs * user_cindex_pred) / SUM(user_pairs), 6) AS `rosn_gauc_样本对`,
+        ROUND(AVG(user_cindex_pred), 6) AS `rosn_gauc_等权`,
+        ROUND(AVG(user_cindex_pred_diff), 6) AS rosn_gauc_diff,
+        -- rosn_stat
+        ROUND(SUM(user_exp * user_cindex_stat) / NULLIF(SUM(CASE WHEN user_cindex_stat IS NOT NULL THEN user_exp END), 0), 6) AS `rosn_stat_gauc_曝光`,
+        ROUND(SUM(user_pairs * user_cindex_stat) / SUM(user_pairs), 6) AS `rosn_stat_gauc_样本对`,
+        ROUND(AVG(user_cindex_stat), 6) AS `rosn_stat_gauc_等权`,
+        ROUND(AVG(user_cindex_stat_diff), 6) AS rosn_stat_gauc_diff,
+        -- rov
+        ROUND(SUM(user_exp * user_cindex_rov) / NULLIF(SUM(CASE WHEN user_cindex_rov IS NOT NULL THEN user_exp END), 0), 6) AS `rov_gauc_曝光`,
+        ROUND(SUM(user_pairs * user_cindex_rov) / SUM(user_pairs), 6) AS `rov_gauc_样本对`,
+        ROUND(AVG(user_cindex_rov), 6) AS `rov_gauc_等权`,
+        ROUND(AVG(user_cindex_rov_diff), 6) AS rov_gauc_diff,
+        -- rov_stat
+        ROUND(SUM(user_exp * user_cindex_rov_stat) / NULLIF(SUM(CASE WHEN user_cindex_rov_stat IS NOT NULL THEN user_exp END), 0), 6) AS `rov_stat_gauc_曝光`,
+        ROUND(SUM(user_pairs * user_cindex_rov_stat) / SUM(user_pairs), 6) AS `rov_stat_gauc_样本对`,
+        ROUND(AVG(user_cindex_rov_stat), 6) AS `rov_stat_gauc_等权`,
+        ROUND(AVG(user_cindex_rov_stat_diff), 6) AS rov_stat_gauc_diff
+    FROM t_user_rosn_cindex
+    GROUP BY
+        dt,
+        apptype,
+        abcode
+)
+
+-- ========== Final output ==========
+-- One row per t_gauc group; the C-index columns come from t_rosn_gcindex and are
+-- NULL when that CTE has no row for the same (dt, apptype, abcode) (LEFT JOIN).
+SELECT  a.dt
+        ,a.apptype
+        ,a.abcode
+        -- str GAUC (three weighting schemes: exposure / pair count / equal)
+        ,a.str_gauc_exp AS str_gauc_曝光
+        ,a.str_gauc_pairs AS str_gauc_样本对
+        ,a.str_gauc_equal AS str_gauc_等权
+        -- rosn GAUC (samples with returns only, four weighting schemes)
+        ,c.`rosn_gauc_曝光`
+        ,c.`rosn_gauc_样本对`
+        ,c.`rosn_gauc_等权`
+        ,c.rosn_gauc_diff
+        ,c.`rosn_stat_gauc_曝光`
+        ,c.`rosn_stat_gauc_样本对`
+        ,c.`rosn_stat_gauc_等权`
+        ,c.rosn_stat_gauc_diff
+        -- rov GAUC (samples with returns only, four weighting schemes)
+        ,c.`rov_gauc_曝光`
+        ,c.`rov_gauc_样本对`
+        ,c.`rov_gauc_等权`
+        ,c.rov_gauc_diff
+        -- rov_stat GAUC (samples with returns only, four weighting schemes)
+        ,c.`rov_stat_gauc_曝光`
+        ,c.`rov_stat_gauc_样本对`
+        ,c.`rov_stat_gauc_等权`
+        ,c.rov_stat_gauc_diff
+FROM    t_gauc a
+LEFT JOIN t_rosn_gcindex c
+ON      a.dt = c.dt AND a.apptype = c.apptype AND a.abcode = c.abcode
+ORDER BY dt DESC, apptype, abcode
+;

+ 387 - 0
tasks/承接/rosn分析/08_实验组xTop20视频_vs对照组_gauc.sql

@@ -0,0 +1,387 @@
+-- Preprocessing: extract scoresMap and classify pages
+-- v4: added top-20 vid grouping, GROUPING SETS and exposure share
+-- v5: added change-rate columns relative to the control group
+-- v6: added GAUC metrics (str / rosn / rov; three weightings: exposure / pair count / equal)
+WITH t_raw AS (
+    -- One dt partition of the sample table, restricted to apptype 0/4 and buckets
+    -- ab0-ab9, keeping only rows whose extend_alg JSON contains a scoresMap entry.
+    SELECT
+        *,
+        -- scoresMap with backslashes stripped; t_base reads it with GET_JSON_OBJECT
+        REPLACE(GET_JSON_OBJECT(extend_alg, '$.scoresMap'), "\\", "") AS scoresmap,
+        CASE
+            WHEN page IN ("回流后沉浸页&内页feed", "详情后沉浸页", "首页feed", "详情页") THEN "推荐"
+            WHEN page IN ("回流页", "其他") THEN "非推荐"
+            ELSE "其他"
+        END AS page_type
+    FROM loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE
+        dt = '${dt}'
+        AND apptype IN ("0", "4")
+        AND abcode IN ("ab0", "ab1", "ab2", "ab3", "ab4", "ab5", "ab6", "ab7", "ab8", "ab9")
+        AND abcode NOT IN ("ab100")  -- already implied by the IN list above
+        AND extend_alg IS NOT NULL
+        AND GET_JSON_OBJECT(extend_alg, '$.scoresMap') IS NOT NULL
+)
+-- Filter: keep recommendation pages only
+, t_filtered AS (
+    SELECT *
+    FROM t_raw
+    WHERE page_type = "推荐"
+)
+-- Feature extraction and dimension mapping
+, t_base AS (
+    -- Relabel raw buckets as experiment groups, carry the label/metric columns through,
+    -- and pull the model scores and the video title out of their JSON columns.
+    SELECT
+        dt,
+        apptype,
+        CASE
+            WHEN apptype IN ("4") AND abcode IN ("ab0", "ab1") THEN "实验组-先验地域降权"
+            WHEN apptype IN ("4") AND abcode IN ("ab6", "ab7") THEN "实验组-str+校准&ros-统计量"
+            WHEN apptype IN ("4") AND abcode IN ("ab8", "ab9") THEN "实验组-str+校准"
+            WHEN apptype IN ("4") AND abcode IN ("ab2", "ab3") THEN "对照组"
+            WHEN apptype IN ("4") AND abcode IN ("ab4", "ab5") THEN "ab4-5"
+            ELSE "其他"
+        END AS abcode,
+        page_type AS page,
+        mid,
+        vid,
+        is_share,
+        share_cnt,
+        is_return_1,
+        is_return_n,
+        is_return_noself,
+        return_1_uv,
+        return_n_uv,
+        return_n_uv_noself,
+        new_exposure_cnt,
+        flowpool,
+        scoresmap,
+        CAST(GET_JSON_OBJECT(scoresmap, '$.fmRov') AS DOUBLE) AS str_pred,
+        -- NOTE(review): 1.22 and 1.15 look like calibration constants applied to NorXGBScore — confirm their origin
+        1.22 * POW(CAST(GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred,
+        CAST(GET_JSON_OBJECT(scoresmap, '$.hasReturnRovScore') AS DOUBLE) AS rosn_stat,
+        GET_JSON_OBJECT(v1_feature, '$.title') AS vid_title
+    FROM t_filtered
+)
+, t_valid AS (
+    -- Rows where both str_pred and rosn_pred are present; rosn_stat may still be NULL.
+    SELECT *
+    FROM t_base
+    WHERE
+        str_pred IS NOT NULL
+        AND rosn_pred IS NOT NULL
+)
+-- Rank vids by exposure count within each (dt, apptype, abcode)
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            -- vid is a secondary sort key so that vids with equal exposure counts get a
+            -- stable rank; without it the top-20 cut-off could differ between runs.
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC, vid) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+-- Keeps the 20 highest-ranked vids per group (the CTE name still says "top5").
+,t_top5_vid AS
+(
+    SELECT  dt, apptype, abcode, vid, vid_rank
+    FROM    t_vid_rank
+    WHERE   vid_rank <= 20
+)
+-- Tag every sample with its top-20 vid; top5_* columns are NULL for all other vids
+,t_with_top5 AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid ELSE NULL END AS top5_vid
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid_title ELSE NULL END AS top5_vid_title
+            ,b.vid_rank AS top5_vid_rank
+    FROM    t_valid a
+    LEFT JOIN t_top5_vid b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.vid = b.vid
+)
+
+-- ========== GAUC computation (only at vid = 'all' granularity) ==========
+-- 1. str / rov GAUC (rank-sum formula)
+-- Adds, per (dt, apptype, abcode, mid), the rank of each row under three scores:
+-- str_pred, rov_pred (= str_pred * rosn_pred) and rov_stat_pred (= str_pred * rosn_stat).
+,t_with_rank AS
+(
+    SELECT  s.*
+            -- Midrank (average rank over ties): RANK() yields the lowest rank of a tie
+            -- group and (tie_count - 1) / 2.0 shifts it to the group's mean rank.
+            -- ROW_NUMBER() ordered ties arbitrarily, which made the rank-sum AUC vary
+            -- between runs and gave tied positive/negative pairs no 0.5 credit, unlike
+            -- the tie handling in t_rosn_pairs.
+            ,RANK() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY str_pred)
+                + (COUNT(1) OVER (PARTITION BY dt, apptype, abcode, mid, str_pred) - 1) / 2.0 AS str_rank_user
+            ,RANK() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY rov_pred)
+                + (COUNT(1) OVER (PARTITION BY dt, apptype, abcode, mid, rov_pred) - 1) / 2.0 AS rov_rank_user
+            ,RANK() OVER (PARTITION BY dt, apptype, abcode, mid ORDER BY rov_stat_pred)
+                + (COUNT(1) OVER (PARTITION BY dt, apptype, abcode, mid, rov_stat_pred) - 1) / 2.0 AS rov_stat_rank_user
+    FROM    (
+        -- Products are materialised once so ORDER BY and the tie partition see the same value.
+        SELECT  dt, apptype, abcode, mid, is_return_noself, return_n_uv_noself
+                ,str_pred, rosn_pred, rosn_stat
+                ,str_pred * rosn_pred AS rov_pred
+                ,str_pred * rosn_stat AS rov_stat_pred
+        FROM    t_valid
+    ) s
+)
+, t_user_auc AS (
+    -- Per-user inputs of the rank-sum AUC: exposures, positives, and the rank sums of
+    -- positives under each score. A row is positive when is_return_noself > 0.
+    SELECT
+        dt,
+        apptype,
+        abcode,
+        mid,
+        COUNT(1) AS user_exp,
+        SUM(CASE WHEN is_return_noself > 0 THEN 1 ELSE 0 END) AS user_n_pos,
+        SUM(CASE WHEN is_return_noself > 0 THEN str_rank_user ELSE 0 END) AS user_sum_str_pos_rank,
+        SUM(CASE WHEN is_return_noself > 0 THEN rov_rank_user ELSE 0 END) AS user_sum_rov_pos_rank,
+        SUM(CASE WHEN is_return_noself > 0 THEN rov_stat_rank_user ELSE 0 END) AS user_sum_rov_stat_pos_rank
+    FROM t_with_rank
+    GROUP BY
+        dt,
+        apptype,
+        abcode,
+        mid
+)
+, t_user_auc_valid AS (
+    -- Per-user AUC = (mean positive rank - (n_pos + 1) / 2) / n_neg, clamped to [0, 1].
+    -- Users with fewer than 3 positive-negative pairs get NULL for all three AUCs,
+    -- which also avoids dividing by zero when a user has no positives or no negatives.
+    SELECT
+        *,
+        user_n_pos * (user_exp - user_n_pos) AS user_pairs,
+        CASE
+            WHEN user_n_pos * (user_exp - user_n_pos) < 3 THEN NULL
+            ELSE GREATEST(
+                0.0,
+                LEAST(
+                    1.0,
+                    (user_sum_str_pos_rank * 1.0 / user_n_pos - (user_n_pos + 1) / 2.0) / (user_exp - user_n_pos)
+                )
+            )
+        END AS user_str_auc,
+        CASE
+            WHEN user_n_pos * (user_exp - user_n_pos) < 3 THEN NULL
+            ELSE GREATEST(
+                0.0,
+                LEAST(
+                    1.0,
+                    (user_sum_rov_pos_rank * 1.0 / user_n_pos - (user_n_pos + 1) / 2.0) / (user_exp - user_n_pos)
+                )
+            )
+        END AS user_rov_auc,
+        CASE
+            WHEN user_n_pos * (user_exp - user_n_pos) < 3 THEN NULL
+            ELSE GREATEST(
+                0.0,
+                LEAST(
+                    1.0,
+                    (user_sum_rov_stat_pos_rank * 1.0 / user_n_pos - (user_n_pos + 1) / 2.0) / (user_exp - user_n_pos)
+                )
+            )
+        END AS user_rov_stat_auc
+    FROM t_user_auc
+)
+, t_gauc AS (
+    -- Group-level GAUC for str, rov and rov_stat under exposure, pair-count and equal
+    -- weighting. Users with a NULL per-user AUC are left out of numerator and denominator.
+    SELECT
+        dt,
+        apptype,
+        abcode,
+        -- str
+        ROUND(SUM(user_exp * user_str_auc) / NULLIF(SUM(CASE WHEN user_str_auc IS NOT NULL THEN user_exp END), 0), 6) AS str_gauc_曝光,
+        ROUND(SUM(user_pairs * user_str_auc) / NULLIF(SUM(CASE WHEN user_str_auc IS NOT NULL THEN user_pairs END), 0), 6) AS str_gauc_样本对,
+        ROUND(AVG(user_str_auc), 6) AS str_gauc_等权,
+        -- rov
+        ROUND(SUM(user_exp * user_rov_auc) / NULLIF(SUM(CASE WHEN user_rov_auc IS NOT NULL THEN user_exp END), 0), 6) AS rov_gauc_曝光,
+        ROUND(SUM(user_pairs * user_rov_auc) / NULLIF(SUM(CASE WHEN user_rov_auc IS NOT NULL THEN user_pairs END), 0), 6) AS rov_gauc_样本对,
+        ROUND(AVG(user_rov_auc), 6) AS rov_gauc_等权,
+        -- rov_stat
+        ROUND(SUM(user_exp * user_rov_stat_auc) / NULLIF(SUM(CASE WHEN user_rov_stat_auc IS NOT NULL THEN user_exp END), 0), 6) AS rov_stat_gauc_曝光,
+        ROUND(SUM(user_pairs * user_rov_stat_auc) / NULLIF(SUM(CASE WHEN user_rov_stat_auc IS NOT NULL THEN user_pairs END), 0), 6) AS rov_stat_gauc_样本对,
+        ROUND(AVG(user_rov_stat_auc), 6) AS rov_stat_gauc_等权
+    FROM t_user_auc_valid
+    GROUP BY
+        dt,
+        apptype,
+        abcode
+)
+
+-- 2. rosn GAUC (samples with returns only, continuous label)
+, t_has_return AS (
+    -- Samples that produced at least one non-self return.
+    SELECT
+        dt,
+        apptype,
+        abcode,
+        mid,
+        return_n_uv_noself,
+        rosn_pred,
+        rosn_stat
+    FROM t_valid
+    WHERE is_return_noself > 0
+)
+, t_rosn_pairs AS (
+    -- Self-join within one user: every ordered pair (a, b) where a's label
+    -- return_n_uv_noself is strictly greater than b's. A score is concordant (1.0)
+    -- when it ranks a above b, tied (0.5) when equal, otherwise 0.0.
+    -- NOTE(review): a NULL rosn_stat on either side makes both comparisons NULL and
+    -- therefore lands in ELSE 0.0 — confirm that is intended.
+    SELECT
+        a.dt,
+        a.apptype,
+        a.abcode,
+        a.mid,
+        CASE
+            WHEN a.rosn_pred > b.rosn_pred THEN 1.0
+            WHEN a.rosn_pred = b.rosn_pred THEN 0.5
+            ELSE 0.0
+        END AS concordant_pred,
+        CASE
+            WHEN a.rosn_stat > b.rosn_stat THEN 1.0
+            WHEN a.rosn_stat = b.rosn_stat THEN 0.5
+            ELSE 0.0
+        END AS concordant_stat
+    FROM t_has_return a
+    INNER JOIN t_has_return b
+        ON a.dt = b.dt
+        AND a.apptype = b.apptype
+        AND a.abcode = b.abcode
+        AND a.mid = b.mid
+        AND a.return_n_uv_noself > b.return_n_uv_noself
+)
+, t_user_rosn_exp AS (
+    -- Number of samples with returns per user.
+    SELECT
+        dt,
+        apptype,
+        abcode,
+        mid,
+        COUNT(1) AS user_exp
+    FROM t_has_return
+    GROUP BY
+        dt,
+        apptype,
+        abcode,
+        mid
+)
+, t_user_rosn_cindex AS (
+    -- Per-user C-index (mean concordance over pairs) for rosn_pred and rosn_stat.
+    -- Users with fewer than 3 pairs are dropped by the HAVING clause.
+    SELECT
+        a.dt,
+        a.apptype,
+        a.abcode,
+        a.mid,
+        b.user_exp,
+        COUNT(1) AS user_pairs,
+        AVG(concordant_pred) AS user_cindex_pred,
+        AVG(concordant_stat) AS user_cindex_stat
+    FROM t_rosn_pairs a
+    INNER JOIN t_user_rosn_exp b
+        ON a.dt = b.dt
+        AND a.apptype = b.apptype
+        AND a.abcode = b.abcode
+        AND a.mid = b.mid
+    GROUP BY
+        a.dt,
+        a.apptype,
+        a.abcode,
+        a.mid,
+        b.user_exp
+    HAVING COUNT(1) >= 3
+)
+, t_rosn_gauc AS (
+    -- Group-level ROSN C-index under exposure, pair-count and equal weighting.
+    SELECT
+        dt,
+        apptype,
+        abcode,
+        -- rosn (model prediction)
+        ROUND(
+            SUM(user_exp * user_cindex_pred)
+            / NULLIF(SUM(CASE WHEN user_cindex_pred IS NOT NULL THEN user_exp END), 0),
+            6
+        ) AS rosn_gauc_曝光,
+        ROUND(SUM(user_pairs * user_cindex_pred) / SUM(user_pairs), 6) AS rosn_gauc_样本对,
+        ROUND(AVG(user_cindex_pred), 6) AS rosn_gauc_等权,
+        -- rosn_stat (statistic-based score)
+        ROUND(
+            SUM(user_exp * user_cindex_stat)
+            / NULLIF(SUM(CASE WHEN user_cindex_stat IS NOT NULL THEN user_exp END), 0),
+            6
+        ) AS rosn_stat_gauc_曝光,
+        ROUND(SUM(user_pairs * user_cindex_stat) / SUM(user_pairs), 6) AS rosn_stat_gauc_样本对,
+        ROUND(AVG(user_cindex_stat), 6) AS rosn_stat_gauc_等权
+    FROM t_user_rosn_cindex
+    GROUP BY
+        dt,
+        apptype,
+        abcode
+)
+
+-- Aggregate first.
+-- Produces one 'all' row per (dt, apptype, abcode) plus one row per top-20 vid in that
+-- group (GROUPING SETS); the HAVING clause drops the detail row of non-top-20 samples,
+-- whose top5_vid is NULL.
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            ,COALESCE(CAST(top5_vid AS STRING), 'all') AS vid
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_rank) END AS vid_rank
+            -- COPC (actual / predicted)
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions and actual values
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Errors
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- Business metrics.
+            -- Denominators that can be 0 are wrapped in NULLIF so a zero denominator
+            -- yields NULL (then 0 through COALESCE) regardless of the engine's
+            -- divide-by-zero behaviour, matching rosn_real above.
+            ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top5
+    GROUP BY dt, apptype, abcode, top5_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top5_vid)
+    )
+    HAVING  top5_vid IS NOT NULL OR GROUPING(top5_vid) = 1
+)
+-- Added: attach control-group baseline values used for the change-rate columns.
+-- Each *_base column repeats the value of the '对照组' row within the same
+-- (dt, apptype, vid); it is NULL when that partition has no control-group row.
+,t_with_baseline AS
+(
+    SELECT  *
+            -- Exposure share: this row's exp divided by the 'all' row's exp of the same (dt, apptype, abcode)
+            ,round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+            -- Control-group baselines (business metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp_per_dau END) OVER (PARTITION BY dt, apptype, vid) AS exp_per_dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_one END) OVER (PARTITION BY dt, apptype, vid) AS str_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_one END) OVER (PARTITION BY dt, apptype, vid) AS ros_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str END) OVER (PARTITION BY dt, apptype, vid) AS str_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros END) OVER (PARTITION BY dt, apptype, vid) AS ros_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_plus END) OVER (PARTITION BY dt, apptype, vid) AS str_plus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_minus END) OVER (PARTITION BY dt, apptype, vid) AS ros_minus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn END) OVER (PARTITION BY dt, apptype, vid) AS rovn_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN vovh24 END) OVER (PARTITION BY dt, apptype, vid) AS vovh24_base
+            -- Control-group baselines (COPC metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_copc END) OVER (PARTITION BY dt, apptype, vid) AS str_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_stat_copc_base
+            -- Control-group baselines (actual values)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_real END) OVER (PARTITION BY dt, apptype, vid) AS str_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_real END) OVER (PARTITION BY dt, apptype, vid) AS rosn_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_real END) OVER (PARTITION BY dt, apptype, vid) AS rovn_real_base
+            -- Control-group baselines (count metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN dau END) OVER (PARTITION BY dt, apptype, vid) AS dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp END) OVER (PARTITION BY dt, apptype, vid) AS exp_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_share END) OVER (PARTITION BY dt, apptype, vid) AS is_share_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN share_cnt END) OVER (PARTITION BY dt, apptype, vid) AS share_cnt_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_return_1 END) OVER (PARTITION BY dt, apptype, vid) AS is_return_1_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN viewh24 END) OVER (PARTITION BY dt, apptype, vid) AS viewh24_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv_noself END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_noself_base
+    FROM    t_agg
+)
+-- Final output: existing columns + GAUC + change rates.
+-- Every *_chg column is (value - control baseline) / baseline, NULL when the baseline is 0 or NULL.
+-- GAUC values are joined on (dt, apptype, abcode) and shown only on the vid = 'all' row.
+SELECT  a.dt
+        ,a.apptype
+        ,a.abcode
+        ,a.vid
+        ,a.vid_title
+        ,a.vid_rank
+        ,a.exp_pct
+        ,round((a.dau - a.dau_base) / NULLIF(a.dau_base, 0), 4) AS dau_chg
+        ,round((a.exp - a.exp_base) / NULLIF(a.exp_base, 0), 4) AS exp_chg
+        -- COPC
+        ,a.str_copc, a.rosn_copc, a.rosn_stat_copc, a.rovn_copc, a.rovn_stat_copc
+        -- Model predictions and actual values
+        ,a.str_real, a.str_pred, a.rosn_real, a.rosn_pred, a.rosn_stat
+        ,a.rovn_real, a.rovn_pred, a.rovn_stat
+        ,a.rosn_pred_mae, a.rosn_stat_mae
+        -- ========== GAUC metrics (populated only when vid = 'all') ==========
+        -- str gauc
+        ,CASE WHEN a.vid = 'all' THEN b.str_gauc_曝光 END AS str_gauc_曝光
+        ,CASE WHEN a.vid = 'all' THEN b.str_gauc_样本对 END AS str_gauc_样本对
+        ,CASE WHEN a.vid = 'all' THEN b.str_gauc_等权 END AS str_gauc_等权
+        -- rosn gauc
+        ,CASE WHEN a.vid = 'all' THEN c.rosn_gauc_曝光 END AS rosn_gauc_曝光
+        ,CASE WHEN a.vid = 'all' THEN c.rosn_gauc_样本对 END AS rosn_gauc_样本对
+        ,CASE WHEN a.vid = 'all' THEN c.rosn_gauc_等权 END AS rosn_gauc_等权
+        ,CASE WHEN a.vid = 'all' THEN c.rosn_stat_gauc_曝光 END AS rosn_stat_gauc_曝光
+        ,CASE WHEN a.vid = 'all' THEN c.rosn_stat_gauc_样本对 END AS rosn_stat_gauc_样本对
+        ,CASE WHEN a.vid = 'all' THEN c.rosn_stat_gauc_等权 END AS rosn_stat_gauc_等权
+        -- rov gauc
+        ,CASE WHEN a.vid = 'all' THEN b.rov_gauc_曝光 END AS rov_gauc_曝光
+        ,CASE WHEN a.vid = 'all' THEN b.rov_gauc_样本对 END AS rov_gauc_样本对
+        ,CASE WHEN a.vid = 'all' THEN b.rov_gauc_等权 END AS rov_gauc_等权
+        ,CASE WHEN a.vid = 'all' THEN b.rov_stat_gauc_曝光 END AS rov_stat_gauc_曝光
+        ,CASE WHEN a.vid = 'all' THEN b.rov_stat_gauc_样本对 END AS rov_stat_gauc_样本对
+        ,CASE WHEN a.vid = 'all' THEN b.rov_stat_gauc_等权 END AS rov_stat_gauc_等权
+        -- Business metrics
+        ,a.exp_per_dau, a.str_one, a.ros_one, a.str, a.ros, a.str_plus, a.ros_minus, a.rovn, a.vovh24
+        -- Counts
+        ,a.dau, a.exp, a.is_share, a.share_cnt, a.is_return_1, a.return_n_uv, a.viewh24, a.return_n_uv_noself
+        -- ========== Change-rate columns ==========
+        -- Business metric change rates
+        ,round((a.exp_per_dau - a.exp_per_dau_base) / NULLIF(a.exp_per_dau_base, 0), 4) AS exp_per_dau_chg
+        ,round((a.str_one - a.str_one_base) / NULLIF(a.str_one_base, 0), 4) AS str_one_chg
+        ,round((a.ros_one - a.ros_one_base) / NULLIF(a.ros_one_base, 0), 4) AS ros_one_chg
+        ,round((a.str - a.str_base) / NULLIF(a.str_base, 0), 4) AS str_chg
+        ,round((a.ros - a.ros_base) / NULLIF(a.ros_base, 0), 4) AS ros_chg
+        ,round((a.str_plus - a.str_plus_base) / NULLIF(a.str_plus_base, 0), 4) AS str_plus_chg
+        ,round((a.ros_minus - a.ros_minus_base) / NULLIF(a.ros_minus_base, 0), 4) AS ros_minus_chg
+        ,round((a.rovn - a.rovn_base) / NULLIF(a.rovn_base, 0), 4) AS rovn_chg
+        ,round((a.vovh24 - a.vovh24_base) / NULLIF(a.vovh24_base, 0), 4) AS vovh24_chg
+        -- COPC change rates
+        ,round((a.str_copc - a.str_copc_base) / NULLIF(a.str_copc_base, 0), 4) AS str_copc_chg
+        ,round((a.rosn_copc - a.rosn_copc_base) / NULLIF(a.rosn_copc_base, 0), 4) AS rosn_copc_chg
+        ,round((a.rosn_stat_copc - a.rosn_stat_copc_base) / NULLIF(a.rosn_stat_copc_base, 0), 4) AS rosn_stat_copc_chg
+        ,round((a.rovn_copc - a.rovn_copc_base) / NULLIF(a.rovn_copc_base, 0), 4) AS rovn_copc_chg
+        ,round((a.rovn_stat_copc - a.rovn_stat_copc_base) / NULLIF(a.rovn_stat_copc_base, 0), 4) AS rovn_stat_copc_chg
+        -- Actual-value change rates
+        ,round((a.str_real - a.str_real_base) / NULLIF(a.str_real_base, 0), 4) AS str_real_chg
+        ,round((a.rosn_real - a.rosn_real_base) / NULLIF(a.rosn_real_base, 0), 4) AS rosn_real_chg
+        ,round((a.rovn_real - a.rovn_real_base) / NULLIF(a.rovn_real_base, 0), 4) AS rovn_real_chg
+        -- Count metric change rates
+        ,round((a.is_share - a.is_share_base) / NULLIF(a.is_share_base, 0), 4) AS is_share_chg
+        ,round((a.share_cnt - a.share_cnt_base) / NULLIF(a.share_cnt_base, 0), 4) AS share_cnt_chg
+        ,round((a.is_return_1 - a.is_return_1_base) / NULLIF(a.is_return_1_base, 0), 4) AS is_return_1_chg
+        ,round((a.return_n_uv - a.return_n_uv_base) / NULLIF(a.return_n_uv_base, 0), 4) AS return_n_uv_chg
+        ,round((a.viewh24 - a.viewh24_base) / NULLIF(a.viewh24_base, 0), 4) AS viewh24_chg
+        ,round((a.return_n_uv_noself - a.return_n_uv_noself_base) / NULLIF(a.return_n_uv_noself_base, 0), 4) AS return_n_uv_noself_chg
+FROM    t_with_baseline a
+LEFT JOIN t_gauc b
+ON      a.dt = b.dt AND a.apptype = b.apptype AND a.abcode = b.abcode
+LEFT JOIN t_rosn_gauc c
+ON      a.dt = c.dt AND a.apptype = c.apptype AND a.abcode = c.abcode
+ORDER BY a.dt DESC, a.apptype, a.abcode, a.exp DESC
+;

+ 46 - 0
tasks/承接/rosn分析/debug_dau_scoresmap字段.sql

@@ -0,0 +1,46 @@
+-- Diagnostic: how often scoresMap fields are missing, per bucket
+-- Checks whether fmRov and NorXGBScore are present and cast to DOUBLE
+
+SELECT
+    dt,
+    abcode,
+    CASE
+        WHEN abcode IN ('ab0', 'ab1') THEN '实验组-先验地域降权'
+        WHEN abcode IN ('ab6', 'ab7') THEN '实验组-str+校准&ros-统计量'
+        WHEN abcode IN ('ab8', 'ab9') THEN '实验组-str+校准'
+        WHEN abcode IN ('ab2', 'ab3') THEN '对照组'
+        WHEN abcode IN ('ab4', 'ab5') THEN 'ab4-5'
+    END AS abcode_group,
+    -- totals
+    COUNT(1) AS total_cnt,
+    COUNT(DISTINCT mid) AS total_dau,
+    -- rows with a scoresMap
+    SUM(CASE WHEN scoresmap IS NOT NULL THEN 1 ELSE 0 END) AS has_scoresmap,
+    -- fmRov: key present / value casts to DOUBLE
+    SUM(CASE WHEN GET_JSON_OBJECT(scoresmap, '$.fmRov') IS NOT NULL THEN 1 ELSE 0 END) AS has_fmRov,
+    SUM(CASE WHEN CAST(GET_JSON_OBJECT(scoresmap, '$.fmRov') AS DOUBLE) IS NOT NULL THEN 1 ELSE 0 END) AS valid_fmRov,
+    -- NorXGBScore: key present / value casts to DOUBLE
+    SUM(CASE WHEN GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') IS NOT NULL THEN 1 ELSE 0 END) AS has_NorXGBScore,
+    SUM(CASE WHEN CAST(GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') AS DOUBLE) IS NOT NULL THEN 1 ELSE 0 END) AS valid_NorXGBScore,
+    -- both fields valid
+    SUM(
+        CASE
+            WHEN CAST(GET_JSON_OBJECT(scoresmap, '$.fmRov') AS DOUBLE) IS NOT NULL
+                AND CAST(GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') AS DOUBLE) IS NOT NULL
+            THEN 1
+            ELSE 0
+        END
+    ) AS both_valid,
+    -- both fields valid and on a recommendation page (the rows the analyses end up using)
+    SUM(
+        CASE
+            WHEN CAST(GET_JSON_OBJECT(scoresmap, '$.fmRov') AS DOUBLE) IS NOT NULL
+                AND CAST(GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') AS DOUBLE) IS NOT NULL
+                AND page IN ('回流后沉浸页&内页feed', '详情后沉浸页', '首页feed', '详情页')
+            THEN 1
+            ELSE 0
+        END
+    ) AS final_valid,
+    COUNT(
+        DISTINCT CASE
+            WHEN CAST(GET_JSON_OBJECT(scoresmap, '$.fmRov') AS DOUBLE) IS NOT NULL
+                AND CAST(GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') AS DOUBLE) IS NOT NULL
+                AND page IN ('回流后沉浸页&内页feed', '详情后沉浸页', '首页feed', '详情页')
+            THEN mid
+        END
+    ) AS final_dau
+FROM (
+    -- apptype 4, buckets ab0-ab9, with scoresMap extracted and backslashes stripped
+    SELECT
+        *,
+        REPLACE(GET_JSON_OBJECT(extend_alg, '$.scoresMap'), "\\", "") AS scoresmap
+    FROM loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE
+        dt = '${dt}'
+        AND apptype = '4'
+        AND abcode IN ('ab0', 'ab1', 'ab2', 'ab3', 'ab4', 'ab5', 'ab6', 'ab7', 'ab8', 'ab9')
+) t
+GROUP BY dt, abcode
+ORDER BY abcode
+;

+ 133 - 0
tasks/承接/rosn分析/debug_dau_str校准.sql

@@ -0,0 +1,133 @@
+-- Debug: DAU anomaly of all experiment buckets on one day (${dt}).
+-- Row count, distinct mid and distinct vid are reported after each successive
+-- filter so that the stage where volume drops can be located.
+-- Output: one row per (step, dt, abcode).
+--
+-- The five steps share one source CTE and one final ORDER BY. An ORDER BY placed
+-- in front of UNION ALL is not valid for an unparenthesised branch, and the final
+-- sort has to include step, otherwise rows of different stages interleave.
+WITH t_src AS
+(
+    -- Step 1 population: all rows of the day for apptype 4 and buckets ab0-ab9
+    SELECT  dt
+            ,abcode
+            ,CASE   WHEN abcode IN ('ab0','ab1') THEN '实验组-先验地域降权'
+                    WHEN abcode IN ('ab6','ab7') THEN '实验组-str+校准&ros-统计量'
+                    WHEN abcode IN ('ab8','ab9') THEN '实验组-str+校准'
+                    WHEN abcode IN ('ab2','ab3') THEN '对照组'
+                    WHEN abcode IN ('ab4','ab5') THEN 'ab4-5'
+            END AS abcode_group
+            ,mid
+            ,vid
+            ,page
+            ,extend_alg
+            ,GET_JSON_OBJECT(extend_alg,'$.scoresMap') AS scoresmap_raw
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype = '4'
+    AND     abcode IN ('ab0','ab1','ab2','ab3','ab4','ab5','ab6','ab7','ab8','ab9')
+)
+,t_rec AS
+(
+    -- Step 4 population: extend_alg and scoresMap present, recommendation pages only.
+    -- Backslashes are stripped from scoresMap before the two scores are parsed.
+    SELECT  dt
+            ,abcode
+            ,abcode_group
+            ,mid
+            ,vid
+            ,CAST(GET_JSON_OBJECT(REPLACE(scoresmap_raw,"\\",""),'$.fmRov') AS DOUBLE) AS str_pred
+            ,CAST(GET_JSON_OBJECT(REPLACE(scoresmap_raw,"\\",""),'$.NorXGBScore') AS DOUBLE) AS rosn_pred_raw
+    FROM    t_src
+    WHERE   extend_alg IS NOT NULL
+    AND     scoresmap_raw IS NOT NULL
+    AND     page IN ('回流后沉浸页&内页feed', '详情后沉浸页', '首页feed', '详情页')
+)
+,t_step AS
+(
+    -- 1. raw rows
+    SELECT  '1_原始数据' AS step, dt, abcode_group, abcode, mid, vid
+    FROM    t_src
+    UNION ALL
+    -- 2. rows that have extend_alg
+    SELECT  '2_有extend_alg' AS step, dt, abcode_group, abcode, mid, vid
+    FROM    t_src
+    WHERE   extend_alg IS NOT NULL
+    UNION ALL
+    -- 3. rows that also have scoresMap
+    SELECT  '3_有scoresMap' AS step, dt, abcode_group, abcode, mid, vid
+    FROM    t_src
+    WHERE   extend_alg IS NOT NULL
+    AND     scoresmap_raw IS NOT NULL
+    UNION ALL
+    -- 4. rows that are also on a recommendation page
+    SELECT  '4_推荐页面' AS step, dt, abcode_group, abcode, mid, vid
+    FROM    t_rec
+    UNION ALL
+    -- 5. rows whose fmRov and NorXGBScore both parse to a number
+    SELECT  '5_模型分数有效' AS step, dt, abcode_group, abcode, mid, vid
+    FROM    t_rec
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred_raw IS NOT NULL
+)
+SELECT  step
+        ,dt
+        ,abcode_group
+        ,abcode
+        ,COUNT(1) AS row_cnt
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(DISTINCT vid) AS vid_cnt
+FROM    t_step
+GROUP BY step, dt, abcode_group, abcode
+ORDER BY step, abcode_group, abcode
+;

+ 27 - 0
tasks/承接/rosn分析/debug_dau_str校准_多天.sql

@@ -0,0 +1,27 @@
+-- Debug: multi-day DAU trend of every experiment bucket, used to tell whether an
+-- anomaly is limited to one group or affects all of them.
+-- Output: one row per (dt, abcode) with volume counters plus the number of rows
+-- that each downstream filter (extend_alg, scoresMap, recommendation page) removes.
+
+SELECT  dt
+        ,CASE   WHEN abcode IN ('ab0','ab1') THEN '实验组-先验地域降权'
+                WHEN abcode IN ('ab6','ab7') THEN '实验组-str+校准&ros-统计量'
+                WHEN abcode IN ('ab8','ab9') THEN '实验组-str+校准'
+                WHEN abcode IN ('ab2','ab3') THEN '对照组'
+                WHEN abcode IN ('ab4','ab5') THEN 'ab4-5'
+                ELSE '其他'
+        END AS abcode_group
+        ,abcode AS abcode_raw
+        ,COUNT(1) AS exp_cnt
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(DISTINCT vid) AS vid_cnt
+        ,round(COUNT(1) / COUNT(DISTINCT mid), 2) AS exp_per_dau
+        -- Impact of the downstream filters
+        ,SUM(CASE WHEN extend_alg IS NULL THEN 1 ELSE 0 END) AS null_extend_alg
+        ,SUM(CASE WHEN GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NULL THEN 1 ELSE 0 END) AS null_scoresmap
+        -- Test the positive form: NOT IN yields NULL for a NULL page, which would leave
+        -- such rows uncounted although the page IN (...) filter drops them as well
+        ,SUM(CASE WHEN page IN ('回流后沉浸页&内页feed','详情后沉浸页','首页feed','详情页') THEN 0 ELSE 1 END) AS non_rec_page
+FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+WHERE   dt BETWEEN '20260120' AND '20260127'
+AND     apptype = '4'
+AND     abcode IN ('ab0','ab1','ab2','ab3','ab4','ab5','ab6','ab7','ab8','ab9')
+GROUP BY dt, abcode
+-- Sort by the output alias, the bare column is not part of the select list
+ORDER BY dt, abcode_group, abcode_raw
+;

+ 67 - 0
tasks/承接/rosn校准/01_原始校准数据.sql

@@ -0,0 +1,67 @@
+-- Raw calibration data for NorXGBScore.
+-- Positive-label samples are split into 100 NTILE buckets per apptype by score,
+-- each bucket reports its score range, mean score (predict) and mean
+-- return_n_uv_noself (label).
+WITH t_src AS
+(
+    -- Samples of the day that carry a scoresMap, backslashes stripped from the JSON
+    SELECT  apptype
+            ,"all" AS group_key
+            ,return_n_uv_noself
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,"$.scoresMap"),"\\","") AS scoresmap
+    FROM    dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = "${dt}"
+    AND     apptype IN ("0", "4")
+    AND     abcode NOT IN ("ab100")
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,"$.scoresMap") IS NOT NULL
+)
+,t_scored AS
+(
+    -- Keep rows with a NorXGBScore entry and a label greater than zero
+    SELECT  apptype
+            ,group_key
+            ,CAST(return_n_uv_noself AS BIGINT) AS return_n_uv_noself
+            ,CAST(GET_JSON_OBJECT(scoresmap,"$.NorXGBScore") AS DOUBLE) AS norXGBScore
+    FROM    t_src
+    WHERE   GET_JSON_OBJECT(scoresmap,"$.NorXGBScore") IS NOT NULL
+    AND     return_n_uv_noself IS NOT NULL
+    AND     CAST(return_n_uv_noself AS BIGINT) > 0
+)
+,t_bucketed AS
+(
+    -- Positive scores only, 100 buckets per apptype ordered by score
+    SELECT  apptype
+            ,group_key
+            ,return_n_uv_noself
+            ,norXGBScore
+            ,NTILE(100) OVER (PARTITION BY apptype ORDER BY norXGBScore ) AS bucket_id
+    FROM    t_scored
+    WHERE   norXGBScore > 0
+)
+SELECT  apptype
+        ,group_key
+        ,bucket_id
+        ,ROUND(MIN(norXGBScore),6) AS range_begin
+        ,ROUND(MAX(norXGBScore),6) AS range_end
+        ,ROUND(AVG(norXGBScore),6) AS predict
+        ,ROUND(AVG(return_n_uv_noself),6) AS label
+        ,COUNT(1) AS cnt
+FROM    t_bucketed
+GROUP BY apptype
+         ,group_key
+         ,bucket_id
+ORDER BY apptype,group_key,bucket_id
+LIMIT   1000
+;

+ 90 - 0
tasks/承接/rosn校准/02_分组校准数据.sql

@@ -0,0 +1,90 @@
+-- Calibration data per experiment group, bucketed on two dimensions.
+-- Every sample is counted once under group_key 'all' and once under its
+-- experiment group. Each (apptype, group_key) is then split into 100 NTILE
+-- buckets, once ordered by norXGBScore and once ordered by rosn_stat.
+WITH t_src AS
+(
+    -- Samples of the day with a scoresMap and a label greater than zero
+    SELECT  apptype
+            ,abcode
+            ,dt
+            ,CAST(return_n_uv_noself AS BIGINT) AS return_n_uv_noself
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),'\\','') AS scoresmap
+    FROM    dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ('0', '4')
+    AND     abcode NOT IN ('ab100')
+    AND     page IN ('回流后沉浸页&内页feed','详情后沉浸页','首页feed','详情页','回流页','其他')
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+    AND     return_n_uv_noself IS NOT NULL
+    AND     CAST(return_n_uv_noself AS BIGINT) > 0
+)
+,t_scored AS
+(
+    -- Parse the two scores, rows without a NorXGBScore entry are dropped
+    SELECT  apptype
+            ,abcode
+            ,dt
+            ,return_n_uv_noself
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS norXGBScore
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+    FROM    t_scored_src
+)
+SELECT 1
+;

+ 76 - 0
tasks/承接/rosn校准/03_label分桶校准数据.sql

@@ -0,0 +1,76 @@
+-- Buckets samples by the label (return_n_uv_noself) to compare the predict / stat
+-- distribution of the experiment groups.
+-- Every sample is counted once under group_key 'all' and once under its
+-- experiment group, each (apptype, group_key) is split into 100 NTILE buckets.
+WITH t_src AS
+(
+    -- Samples of the day with a scoresMap and a label greater than zero
+    SELECT  apptype
+            ,abcode
+            ,dt
+            ,CAST(return_n_uv_noself AS BIGINT) AS return_n_uv_noself
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),'\\','') AS scoresmap
+    FROM    dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ('0', '4')
+    AND     abcode NOT IN ('ab100')
+    AND     page IN ('回流后沉浸页&内页feed','详情后沉浸页','首页feed','详情页','回流页','其他')
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+    AND     return_n_uv_noself IS NOT NULL
+    AND     CAST(return_n_uv_noself AS BIGINT) > 0
+)
+,t_scored AS
+(
+    -- Parse the two scores, rows without a NorXGBScore entry are dropped
+    SELECT  apptype
+            ,abcode
+            ,dt
+            ,return_n_uv_noself
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS norXGBScore
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+    FROM    t_src
+    WHERE   GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+)
+,t_grouped AS
+(
+    -- Expand group_key: 'all' plus the experiment group of the row
+    SELECT  apptype, dt, return_n_uv_noself, norXGBScore, rosn_stat
+            ,'all' AS group_key
+    FROM    t_scored
+    UNION ALL
+    SELECT  apptype, dt, return_n_uv_noself, norXGBScore, rosn_stat
+            ,CASE   WHEN apptype IN ('4') AND abcode IN ('ab0','ab1') THEN '实验组-先验地域降权'
+                    WHEN apptype IN ('4') AND abcode IN ('ab6','ab7') THEN '实验组-str+校准&ros-统计量'
+                    WHEN apptype IN ('4') AND abcode IN ('ab8','ab9') THEN '实验组-str+校准'
+                    WHEN apptype IN ('4') AND abcode IN ('ab2','ab3') THEN '对照组'
+                    WHEN apptype IN ('4') AND abcode IN ('ab4','ab5') THEN 'ab4-5'
+                    ELSE '其他'
+            END AS group_key
+    FROM    t_scored
+)
+,t_bucketed AS
+(
+    -- 100 buckets per (apptype, group_key) ordered by the label
+    SELECT  apptype
+            ,dt
+            ,group_key
+            ,return_n_uv_noself
+            ,norXGBScore
+            ,rosn_stat
+            ,NTILE(100) OVER (PARTITION BY apptype, group_key ORDER BY return_n_uv_noself) AS bucket_id
+    FROM    t_grouped
+)
+SELECT  apptype
+        ,dt
+        ,group_key
+        ,'label' AS bucket_type
+        ,bucket_id
+        ,ROUND(MIN(return_n_uv_noself), 6) AS range_begin
+        ,ROUND(MAX(return_n_uv_noself), 6) AS range_end
+        ,ROUND(AVG(norXGBScore), 6) AS predict
+        ,ROUND(AVG(rosn_stat), 6) AS stat
+        ,ROUND(AVG(return_n_uv_noself), 6) AS label
+        ,COUNT(1) AS cnt
+FROM    t_bucketed
+GROUP BY apptype
+         ,dt
+         ,group_key
+         ,bucket_id
+ORDER BY apptype, group_key, bucket_id
+LIMIT   10000
+;

+ 210 - 0
tasks/承接/头部视频模型指标分析/query.sql

@@ -0,0 +1,210 @@
+-- Head-video model metric analysis (head-video table as the driver, model
+-- prediction fields added)
+-- Fetched day by day with fetch_daily.py, variable: ${dt}
+
+-- Model score table: aggregated to one row per (dt, vid)
+WITH tab_model AS (
+    SELECT  dt
+            ,vid
+            ,COUNT(1) AS sample_cnt
+            -- Model prediction metrics, per-sample values averaged over the video
+            ,AVG(fm_rov) AS pred_str
+            ,AVG(fm_rov_origin) AS pred_str_origin
+            ,AVG((0.059 * fm_rov_origin) / (1 - (1 - 0.059) * fm_rov_origin)) AS pred_str_online
+            ,AVG((0.036 * fm_rov_origin) / (1 - (1 - 0.036) * fm_rov_origin)) AS pred_str_real
+            ,AVG(1.22 * POW(nor_xgb_score, 1.15)) AS pred_ros
+            ,AVG(vor_score) AS pred_vor
+            ,AVG(fm_rov * 1.22 * POW(nor_xgb_score, 1.15) * vor_score) AS pred_vov
+            -- Sample-level observed metrics
+            ,SUM(is_return_noself) / COUNT(1) AS real_str
+            ,SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0) AS real_ros
+            ,SUM(return_n_uv_noself) / COUNT(1) AS real_rov
+            ,SUM(new_exposure_cnt) / COUNT(1) AS real_vov
+            ,SUM(new_exposure_cnt) / NULLIF(SUM(return_n_uv_noself), 0) AS real_vor
+    FROM    (
+                -- Parse each score once, rows must carry fmRov, NorXGBScore and vor entries
+                SELECT  dt
+                        ,vid
+                        ,is_return_noself
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS fm_rov
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRovOrigin') AS DOUBLE) AS fm_rov_origin
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS nor_xgb_score
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.vor') AS DOUBLE) AS vor_score
+                FROM    (
+                            -- Recommendation-page samples of buckets ab0-ab9 that have a scoresMap
+                            SELECT  dt
+                                    ,vid
+                                    ,is_return_noself
+                                    ,return_n_uv_noself
+                                    ,new_exposure_cnt
+                                    ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),'\\','') AS scoresmap
+                            FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+                            WHERE   dt = '${dt}'
+                            AND     extend_alg IS NOT NULL
+                            AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+                            AND     page IN ('回流后沉浸页&内页feed','详情后沉浸页','首页feed','详情页')
+                            AND     abcode IN ('ab0','ab1','ab2','ab3','ab4','ab5','ab6','ab7','ab8','ab9')
+                        ) s
+                WHERE   GET_JSON_OBJECT(scoresmap,'$.fmRov') IS NOT NULL
+                AND     GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+                AND     GET_JSON_OBJECT(scoresmap,'$.vor') IS NOT NULL
+            ) t
+    GROUP BY dt, vid
+)
+
+-- Main query: head-video table LEFT JOIN model fields.
+-- Reads loghubods.video_dimension_detail_add_column (alias v) for dt = '${dt}' and
+-- 曝光rank < 40, left-joins tab_model (alias m) on dt and 视频id = vid, and groups by
+-- dt, 视频id, 标题, merge二级品类 plus every m.* column, so videos without model rows
+-- keep NULL model fields. Result is sorted by 分发曝光pv descending.
+SELECT
+v.dt,
+v.视频id,v.标题,v.merge二级品类,
+sum(v.当日分发曝光pv) as 分发曝光pv,
+sum(v.当日分发拉回曝光pv) as 分发拉回曝光pv,
+sum(v.当日分发回流uv) AS 分发回流_当日,
+-- NOTE(review): the alias 总回流uv is emitted a second time further down for
+-- SUM(v.总回流uv), while this expression sums 累计分享回流uv - confirm which one is intended
+sum(v.累计分享回流uv) AS 总回流uv,
+sum(v.当日分发回流uv)/sum(v.当日分发曝光pv) as rov_t0,
+sum(v.当日分发回流uv)/sum(v.当日分发分享pv)  as ros_t0,
+sum(v.当日分发拉回曝光pv)/sum(v.当日分发曝光pv)  as vov0,
+sum(v.0_1日分发拉回曝光pv)/sum(v.当日分发曝光pv)  as vov1,
+sum(v.当日分发拉回曝光pv)/sum(v.当日分发回流uv)  as vor_t0,
+sum(v.当日分发分享pv)/sum(v.当日分发曝光pv)  as str_t0,
+-- ========== 模型预估字段(新增)==========
+m.sample_cnt,
+m.pred_str,
+m.pred_str_origin,
+m.pred_str_online,
+m.pred_str_real,
+m.pred_ros,
+m.pred_vor,
+m.pred_vov,
+m.real_str,
+m.real_ros,
+m.real_rov,
+m.real_vov,
+m.real_vor,
+-- ========== 原始头部视频字段(续)==========
+AVG(v.视频时长) as 视频时长,
+count(DISTINCT v.视频id) as 分发视频量,
+count(DISTINCT if(v.是否当日新推荐>0,v.视频id,null)) as 新推荐视频量,
+SUM(v.1008回流人数) / SUM(v.总回流uv) AS 群聊占比,
+SUM(v.头部分享pv)/SUM(v.总分享pv) AS 头部分享占比,
+SUM(v.当日分发头部分享pv)/SUM(v.当日分发曝光pv) AS 头部str_t0,
+SUM(v.当日分发头部分享pv)/SUM(v.当日分发头部分享pv+v.当日分发分享pv) AS 当日分发头部分享占比,
+sum(v.推荐回流)/sum(v.流量池曝光) AS 流量池曝光roi,
+sum(v.流量池曝光) AS 流量池分发曝光,
+sum(CASE WHEN v.推荐天数间隔 in (0,1,2,3) THEN v.当日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (0,1,2,3) THEN v.当日分发曝光pv END) as 新0_3VoV0,
+avg(v.曝光rank)-avg(v.回流rank) AS rankdiff,
+avg(v.回流rank) as 回流rank_avg,
+avg(v.曝光rank) as 曝光rank_avg,
+sum(v.流量池回流)/sum(v.流量池曝光) AS 流量池rov,
+sum(v.流量池回流)/sum(v.流量池分享) AS 流量池ros,
+sum(v.流量池分享)/sum(v.流量池曝光) AS 流量池str,
+sum(v.推荐回流)/sum(v.推荐曝光) AS 推荐rov,
+sum(v.推荐回流)/sum(v.推荐分享) AS 推荐ros,
+sum(v.推荐分享)/sum(v.推荐曝光) AS 推荐str,
+(SUM(v.带来1007回流的分享数)+SUM(v.带来1008回流的分享数))/SUM(v.总分享pv) AS 有效分享率,
+sum(CASE WHEN v.推荐天数间隔 in (0,1,2,3,4,5,6,7) THEN v.0_1日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (0,1,2,3,4,5,6,7) THEN v.当日分发曝光pv END) as 新0_7VoV1,
+sum(CASE WHEN v.推荐天数间隔 in (0,1,2,3,4,5,6,7) THEN v.当日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (0,1,2,3,4,5,6,7) THEN v.当日分发曝光pv END) as 新0_7VoV0,
+sum(CASE WHEN v.推荐天数间隔 in (0,1,2,3,4,5,6,7) THEN v.当日分发曝光pv END)/sum(v.当日分发曝光pv) as 新0_7曝光占比,
+sum(CASE WHEN v.推荐天数间隔 in (0,1,2,3) THEN v.0_1日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (0,1,2,3) THEN v.当日分发曝光pv END) as 新0_3VoV1,
+sum(CASE WHEN v.推荐天数间隔 in (0,1,2,3) THEN v.当日分发曝光pv END)/sum(v.当日分发曝光pv) as 新0_3曝光占比,
+sum(CASE WHEN v.推荐天数间隔 in (1,2,3) THEN v.0_1日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (1,2,3) THEN v.当日分发曝光pv END) as 新1_3VoV1,
+sum(CASE WHEN v.推荐天数间隔 in (1,2,3) THEN v.当日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (1,2,3) THEN v.当日分发曝光pv END) as 新1_3VoV0,
+sum(CASE WHEN v.推荐天数间隔 in (0) THEN v.0_1日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (0) THEN v.当日分发曝光pv END) as 新0VoV1,
+sum(CASE WHEN v.推荐天数间隔 in (0) THEN v.当日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (0) THEN v.当日分发曝光pv END) as 新0VoV0,
+sum(CASE WHEN v.推荐天数间隔 in (0) THEN v.当日分发曝光pv END)/sum(v.当日分发曝光pv) as 新0曝光占比,
+count(DISTINCT if(v.推荐天数间隔=1,v.视频id,null)) as 新1视频量,
+sum(CASE WHEN v.推荐天数间隔 in (1) THEN v.0_1日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (1) THEN v.当日分发曝光pv END) as 新1VoV1,
+sum(CASE WHEN v.推荐天数间隔 in (1) THEN v.当日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (1) THEN v.当日分发曝光pv END) as 新1VoV0,
+sum(CASE WHEN v.推荐天数间隔 in (1) THEN v.当日分发曝光pv END)/sum(v.当日分发曝光pv) as 新1曝光占比,
+count(DISTINCT if(v.推荐天数间隔=2,v.视频id,null)) as 新2视频量,
+sum(CASE WHEN v.推荐天数间隔 in (2) THEN v.0_1日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (2) THEN v.当日分发曝光pv END) as 新2VoV1,
+sum(CASE WHEN v.推荐天数间隔 in (2) THEN v.当日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (2) THEN v.当日分发曝光pv END) as 新2VoV0,
+sum(CASE WHEN v.推荐天数间隔 in (2) THEN v.当日分发曝光pv END)/sum(v.当日分发曝光pv) as 新2曝光占比,
+count(DISTINCT if(v.推荐天数间隔=3,v.视频id,null)) as 新3视频量,
+sum(CASE WHEN v.推荐天数间隔 in (3) THEN v.0_1日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (3) THEN v.当日分发曝光pv END) as 新3VoV1,
+sum(CASE WHEN v.推荐天数间隔 in (3) THEN v.当日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 in (3) THEN v.当日分发曝光pv END) as 新3VoV0,
+sum(CASE WHEN v.推荐天数间隔 in (3) THEN v.当日分发曝光pv END)/sum(v.当日分发曝光pv) as 新3曝光占比,
+sum(CASE WHEN v.推荐天数间隔 not in (0,1,2,3,4,5,6,7) THEN v.0_1日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 not in (0,1,2,3,4,5,6,7) THEN v.当日分发曝光pv END) as 非0_7_VoV1,
+sum(CASE WHEN v.推荐天数间隔 not in (0,1,2,3,4,5,6,7) THEN v.当日分发拉回曝光pv END)/sum(CASE WHEN v.推荐天数间隔 not in (0,1,2,3,4,5,6,7) THEN v.当日分发曝光pv END) as 非0_7_VoV0,
+sum(v.0_2日分发拉回曝光pv)/sum(v.当日分发曝光pv) as vov2,
+sum(v.0_7日分发拉回曝光pv)/sum(v.当日分发曝光pv) as vov7,
+sum(v.0_30日分发拉回曝光pv)/sum(v.当日分发曝光pv) as vov30,
+(sum(v.0_1日分发拉回曝光pv)/sum(v.当日分发曝光pv))-(sum(v.当日分发拉回曝光pv)/sum(v.当日分发曝光pv)) as vov1减vov0,
+sum(v.当日分发回流uv) as 分发回流uv,
+sum(v.当日分发分享pv) as 分发分享pv,
+SUM(v.1008回流人数)/ SUM(v.带来1008回流的分享数) AS 群聊ros,
+SUM(v.1007回流人数)/ SUM(v.带来1007回流的分享数) AS 单聊ros,
+SUM(v.1007进入分发曝光pv)/SUM(v.1007回流人数) AS 单聊vor,
+SUM(v.1008进入分发曝光pv)/SUM(v.1008回流人数) AS 群聊vor,
+(SUM(v.1007回流再分享pv)+ SUM(v.1008回流再分享pv))/(SUM(v.1007进入分发曝光pv)+ SUM(v.1008进入分发曝光pv)) AS 回流后str,
+(SUM(v.1008回流再分享pv))/(SUM(v.1008进入分发曝光pv)) AS 群聊后str,
+(SUM(v.1007回流再分享pv))/(SUM(v.1007进入分发曝光pv)) AS 单聊后str,
+SUM(v.总回流uv)/SUM(v.累计分享回流uv) AS 当日分享回流占比,
+SUM(v.当日分享当日回流首层uv)/SUM(v.当日分享当日回流uv) AS 当日分享当日回流首层比当日分享当日回流,
+count(DISTINCT if(v.是否七日内创建>0,v.视频id,null)) as 七日内新视频量,
+count(DISTINCT if(v.是否首发视频>0,v.视频id,null)) as 首发视频量,
+count(DISTINCT if(v.是否首发视频>0,v.视频id,null))/count(DISTINCT v.视频id) as 首发视频比例,
+count(DISTINCT v.站内uid) as 供给uid量,
+AVG(v.首发距今时间) as 首发距今间隔avg,
+AVG(v.推荐天数间隔) as 推荐距今间隔avg,
+AVG(v.创建天数间隔) as 创建距今间隔avg,
+sum(v.0_1日分发拉回曝光pv) as 0_1日分发拉回曝光pv,
+sum(v.0_2日分发拉回曝光pv) as 0_2日分发拉回曝光pv,
+sum(v.0_3日分发拉回曝光pv) as 0_3日分发拉回曝光pv,
+sum(v.0_7日分发拉回曝光pv) as 0_7日分发拉回曝光pv,
+sum(v.0_30日分发拉回曝光pv) as 0_30日分发拉回曝光pv,
+sum(v.0_1日分发回流uv)/sum(v.当日分发曝光pv) as rov1,
+sum(v.0_7日分发回流uv)/sum(v.当日分发曝光pv) as rov7,
+sum(v.0_30日分发回流uv)/sum(v.当日分发曝光pv) as rov30,
+sum(v.0_1日分发拉回曝光pv)/sum(v.0_1日分发回流uv) as vor1,
+sum(v.0_7日分发拉回曝光pv)/sum(v.0_7日分发回流uv) as vor7,
+sum(v.0_30日分发拉回曝光pv)/sum(v.0_30日分发回流uv) as vor30,
+sum(v.流量池曝光) AS 流量池曝光,
+sum(v.流量池播放) AS 流量池播放,
+sum(v.流量池分享) AS 流量池分享,
+sum(v.流量池回流) AS 流量池回流,
+sum(v.推荐曝光) AS 推荐曝光,
+sum(v.推荐播放) AS 推荐播放,
+sum(v.推荐分享) AS 推荐分享,
+sum(v.推荐回流) AS 推荐回流,
+SUM(v.总分享pv) AS 总分享pv,
+SUM(v.总回流uv) AS 总回流uv,
+SUM(v.1007回流人数) AS 1007回流人数,
+SUM(v.1008回流人数) AS 1008回流人数,
+SUM(v.带来1007回流的分享数) AS 带来1007回流的分享数,
+SUM(v.带来1008回流的分享数) AS 带来1008回流的分享数,
+SUM(v.1007进入分发曝光pv) AS 1007进入分发曝光pv,
+SUM(v.1008进入分发曝光pv) AS 1008进入分发曝光pv,
+SUM(v.1007回流再分享pv) AS 1007回流再分享pv,
+SUM(v.1008回流再分享pv) AS 1008回流再分享pv,
+SUM(v.有回流分享pv) AS 有回流分享pv,
+SUM(v.累计分享回流uv) AS 累计分享回流uv,
+SUM(v.分发分享pv) AS 分发分享pv2,
+SUM(v.头部分享pv) AS 头部分享pv,
+SUM(v.当日分发头部分享pv) AS 当日分发头部分享pv,
+SUM(v.当日分享当日回流uv) AS 当日分享当日回流uv,
+SUM(v.当日分享当日回流首层uv) AS 当日分享当日回流首层uv,
+SUM(v.当日分享当日回流非首层uv) AS 当日分享当日回流非首层uv,
+SUM(v.非当日分享回流uv) AS 非当日分享回流uv,
+SUM(v.n当日分发回流uv) AS n当日分发回流uv,
+SUM(v.非当日分发回流uv) AS 非当日分发回流uv,
+count(DISTINCT if(v.当日分发曝光pv>=100,v.视频id,null)) as t0_100曝光视频量,
+count(DISTINCT if(v.当日分发曝光pv>=500,v.视频id,null)) as t0_500曝光视频量,
+count(DISTINCT if(v.当日分发曝光pv>=1000,v.视频id,null)) as t0_1k曝光视频量,
+count(DISTINCT if(v.当日分发曝光pv>=10000,v.视频id,null)) as t0_1w曝光视频量,
+count(DISTINCT if((v.0_1日分发拉回曝光pv)/(v.当日分发曝光pv)-(v.当日分发拉回曝光pv)/(v.当日分发曝光pv)>=0.2 and v.当日分发曝光pv>=500,v.视频id,null)) as vov1_0_02_500视频量,
+count(DISTINCT if((v.0_1日分发拉回曝光pv)/(v.当日分发曝光pv)-(v.当日分发拉回曝光pv)/(v.当日分发曝光pv)>=0.2 and v.当日分发曝光pv>=500,v.视频id,null))/count(DISTINCT if(v.当日分发曝光pv>=500,v.视频id,null)) as vov1_0_02_500视频占比,
+count(DISTINCT if((v.当日分发拉回曝光pv)/(v.当日分发曝光pv)>=0.4 and v.当日分发曝光pv>=500,v.视频id,null)) as vov0_04_500视频量,
+count(DISTINCT if((v.当日分发拉回曝光pv)/(v.当日分发曝光pv)>=0.4 and v.当日分发曝光pv>=500,v.视频id,null))/count(DISTINCT if(v.当日分发曝光pv>=500,v.视频id,null)) as vov0_04_500视频占比,
+count(DISTINCT if((v.0_1日分发拉回曝光pv)/(v.当日分发曝光pv)>=0.7 and v.当日分发曝光pv>=500,v.视频id,null)) as vov1_07_500视频量,
+count(DISTINCT if((v.0_1日分发拉回曝光pv)/(v.当日分发曝光pv)>=0.7 and v.当日分发曝光pv>=500,v.视频id,null))/count(DISTINCT if(v.当日分发曝光pv>=500,v.视频id,null)) as vov1_07_500视频占比,
+count(DISTINCT if((v.0_1日分发拉回曝光pv)/(v.当日分发曝光pv)>=0.8 and v.当日分发曝光pv>=500,v.视频id,null)) as vov1_08_500视频量,
+count(DISTINCT if((v.0_1日分发拉回曝光pv)/(v.当日分发曝光pv)>=0.8 and v.当日分发曝光pv>=500,v.视频id,null))/count(DISTINCT if(v.当日分发曝光pv>=500,v.视频id,null)) as vov1_08_500视频占比,
+count(DISTINCT if(v.当日分发拉回曝光pv>=500,v.视频id,null)) as t0_500拉回曝光视频量,
+count(DISTINCT if(v.0_1日分发拉回曝光pv>=500,v.视频id,null)) as t1_500拉回曝光视频量,
+count(DISTINCT if(v.当日分发拉回曝光pv>=10000,v.视频id,null)) as t0_1w拉回曝光视频量,
+count(DISTINCT if(v.0_1日分发拉回曝光pv>=10000,v.视频id,null)) as t1_1w拉回曝光视频量,
+count(DISTINCT if(v.当日分发拉回曝光pv>=100000,v.视频id,null)) as t0_10w拉回曝光视频量,
+count(DISTINCT if(v.0_1日分发拉回曝光pv>=100000,v.视频id,null)) as t1_10w拉回曝光视频量,
+count(DISTINCT if(v.当日分发拉回曝光pv>=1000000,v.视频id,null)) as t0_100w拉回曝光视频量,
+count(DISTINCT if(v.0_1日分发拉回曝光pv>=1000000,v.视频id,null)) as t1_100w拉回曝光视频量,
+(SUM(v.带来流量池1007回流的分享数)+SUM(v.带来流量池1008回流的分享数))/SUM(v.带来流量池回流的分享数) AS 流量池有效分享率,
+SUM(v.流量池1008回流人数) / SUM(v.流量池回流人数) AS 流量池群聊占比
+
+FROM loghubods.video_dimension_detail_add_column v
+LEFT JOIN tab_model m
+ON v.dt = m.dt AND v.视频id = m.vid
+WHERE v.dt = '${dt}' AND v.曝光rank < 40
+GROUP BY m.sample_cnt, m.pred_str, m.pred_str_origin, m.pred_str_online, m.pred_str_real, m.pred_ros, m.pred_vor, m.pred_vov,
+         m.real_str, m.real_ros, m.real_rov, m.real_vov, m.real_vor,
+         v.dt, v.视频id, v.标题, v.merge二级品类
+ORDER BY 分发曝光pv DESC
+;

+ 96 - 0
tasks/承接/线上实验/01_线上实验+模型预测_曝光+特征表 copy.sql

@@ -0,0 +1,96 @@
+-- Online experiment report: model predictions against observed values, plus
+-- business metrics, per (dt, apptype, experiment group) on recommendation pages.
+--
+-- Preprocessing: parse scoresMap (backslashes stripped) and classify the page.
+-- Only the columns used downstream are selected, and abcode is restricted to
+-- ab0-ab9 by the IN list alone.
+WITH t_raw AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Filter: keep recommendation pages only
+,t_filtered AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,page_type
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Feature extraction and dimension mapping (abcode becomes the experiment group label)
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    t_filtered
+)
+SELECT  dt
+        ,COALESCE(apptype,"sum") AS apptype
+        ,COALESCE(abcode,"sum") AS abcode
+        -- Model prediction versus observed value
+        ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+        ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+        ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS ros_real
+        ,round(COALESCE(SUM(ros_pred) / COUNT(1),0),6) AS ros_pred
+        ,round(COALESCE(SUM(ros_stat) / COUNT(1),0),6) AS ros_stat
+        -- Prediction error metrics
+        ,round(AVG(ABS(ros_pred - return_n_uv_noself)),6) AS ros_pred_mae
+        ,round(SQRT(AVG(pow(ros_pred - return_n_uv_noself, 2))),6) AS ros_pred_rmse
+        ,round(AVG(ABS(ros_stat - return_n_uv_noself)),6) AS ros_stat_mae
+        ,round(SQRT(AVG(pow(ros_stat - return_n_uv_noself, 2))),6) AS ros_stat_rmse
+        -- Business metrics. Denominators that can be zero are wrapped in NULLIF, the
+        -- same guard ros_real uses, so an empty denominator falls through to COALESCE 0
+        ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+        ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+        ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+        ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+        ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+        ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+        -- Raw counters
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(1) AS exp
+        ,COALESCE(SUM(is_share),0) AS is_share
+        ,COALESCE(SUM(share_cnt),0) AS share_cnt
+        ,COALESCE(SUM(is_return_1),0) AS is_return_1
+        ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+        ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+        ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+FROM    t_base
+GROUP BY dt
+         ,apptype
+         ,abcode
+ORDER BY dt DESC,apptype,abcode
+;

+ 110 - 0
tasks/承接/线上实验/01_线上实验+模型预测_曝光+特征表_v2.sql

@@ -0,0 +1,110 @@
+-- Online experiment + model prediction report (v2) over the exposure+feature sample table.
+-- For partition dt = '${dt}' it keeps rows whose page maps to "推荐" and that carry both
+-- a str and a ros prediction, then reports per (dt, apptype, experiment group):
+--   * predicted vs. observed str / ros / rovn
+--   * MAE of the two ros estimates against return_n_uv_noself
+--   * business ratios together with their raw counts
+--
+-- Step 1: extract scoresMap from extend_alg and bucket page into page_type.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Drop every backslash from the extracted scoresMap text
+            -- (presumably un-escaping an embedded JSON string; TODO confirm).
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- The whitelist below already excludes "ab100", so no extra NOT IN filter is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep only rows classified as "推荐".
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw abcode values to experiment-group labels and pull the model scores.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" is split into groups; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- ros prediction = 1.22 * NorXGBScore ^ 1.15 (hard-coded transform; the origin
+            -- of the two constants is not visible in this file).
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    t_filtered
+)
+-- Step 4: keep only rows that have both predictions.
+-- NOTE(review): ros_stat is not filtered here, so rows with a NULL ros_stat still count in
+-- the COUNT(1) denominators below while AVG(...) skips them; confirm this is intended.
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     ros_pred IS NOT NULL
+)
+SELECT  dt
+        -- No subtotal rows are produced by this GROUP BY; COALESCE is kept so the output
+        -- columns stay unchanged.
+        ,COALESCE(apptype,"sum") AS apptype
+        ,COALESCE(abcode,"sum") AS abcode
+        -- Model prediction vs. observed value
+        ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+        ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+
+        ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS ros_real
+        ,round(COALESCE(SUM(ros_pred) / COUNT(1),0),6) AS ros_pred
+        ,round(COALESCE(SUM(ros_stat) / COUNT(1),0),6) AS ros_stat
+
+        ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+        ,round(AVG(str_pred * ros_pred), 6) AS rovn_pred
+        ,round(AVG(str_pred * ros_stat), 6) AS rovn_stat
+
+        -- Prediction error (per-row absolute error against return_n_uv_noself)
+        ,round(AVG(ABS(ros_pred - return_n_uv_noself)),6) AS ros_pred_mae
+        ,round(AVG(ABS(ros_stat - return_n_uv_noself)),6) AS ros_stat_mae
+        -- Business ratios. Every denominator that can be 0 is wrapped in NULLIF, matching
+        -- ros_real above, so an empty denominator yields 0 via COALESCE on any engine.
+        ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+        ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+        ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+        ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+        ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+        ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+        -- Raw counts behind the ratios
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(1) AS exp
+        ,COALESCE(SUM(is_share),0) AS is_share
+        ,COALESCE(SUM(share_cnt),0) AS share_cnt
+        ,COALESCE(SUM(is_return_1),0) AS is_return_1
+        ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+        ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+        ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+FROM    t_valid
+GROUP BY dt
+         ,apptype
+         ,abcode
+ORDER BY dt DESC,apptype,abcode
+;

+ 113 - 0
tasks/承接/线上实验/01_线上实验+模型预测_曝光+特征表_v3.sql

@@ -0,0 +1,113 @@
+-- Online experiment + model prediction report (v3) over the exposure+feature sample table.
+-- v3 adds COPC columns (calibration coefficient = observed value / predicted value).
+-- For partition dt = '${dt}' it keeps rows whose page maps to "推荐" and that carry both
+-- a str and a ros prediction, then reports one row per (dt, apptype, experiment group).
+--
+-- Step 1: extract scoresMap from extend_alg and bucket page into page_type.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Drop every backslash from the extracted scoresMap text
+            -- (presumably un-escaping an embedded JSON string; TODO confirm).
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- The whitelist below already excludes "ab100", so no extra NOT IN filter is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep only rows classified as "推荐".
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw abcode values to experiment-group labels and pull the model scores.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" is split into groups; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- ros prediction = 1.22 * NorXGBScore ^ 1.15 (hard-coded transform; the origin
+            -- of the two constants is not visible in this file).
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    t_filtered
+)
+-- Step 4: keep only rows that have both predictions.
+-- NOTE(review): ros_stat is not filtered here, so rows with a NULL ros_stat still count in
+-- the COUNT(1) denominators below while AVG(...) skips them; confirm this is intended.
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     ros_pred IS NOT NULL
+)
+SELECT  dt
+        -- No subtotal rows are produced by this GROUP BY; COALESCE is kept so the output
+        -- columns stay unchanged.
+        ,COALESCE(apptype,"sum") AS apptype
+        ,COALESCE(abcode,"sum") AS abcode
+        -- COPC = observed / predicted (>1 means under-prediction, <1 means over-prediction).
+        -- A zero prediction or zero observed denominator yields NULL rather than an error.
+        ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+        ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(ros_pred) / COUNT(1), 0), 4) AS ros_copc
+        ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(ros_stat) / COUNT(1), 0), 4) AS ros_stat_copc
+        ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * ros_pred), 0), 4) AS rovn_copc
+        ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * ros_stat), 0), 4) AS rovn_stat_copc
+        -- Model prediction vs. observed value
+        ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+        ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+
+        ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS ros_real
+        ,round(COALESCE(SUM(ros_pred) / COUNT(1),0),6) AS ros_pred
+        ,round(COALESCE(SUM(ros_stat) / COUNT(1),0),6) AS ros_stat
+
+        ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+        ,round(AVG(str_pred * ros_pred), 6) AS rovn_pred
+        ,round(AVG(str_pred * ros_stat), 6) AS rovn_stat
+
+        -- Prediction error (per-row absolute error against return_n_uv_noself)
+        ,round(AVG(ABS(ros_pred - return_n_uv_noself)),6) AS ros_pred_mae
+        ,round(AVG(ABS(ros_stat - return_n_uv_noself)),6) AS ros_stat_mae
+        -- Business ratios. Every denominator that can be 0 is wrapped in NULLIF, matching
+        -- ros_real above, so an empty denominator yields 0 via COALESCE on any engine.
+        ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+        ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+        ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+        ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+        ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+        ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+        -- Raw counts behind the ratios
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(1) AS exp
+        ,COALESCE(SUM(is_share),0) AS is_share
+        ,COALESCE(SUM(share_cnt),0) AS share_cnt
+        ,COALESCE(SUM(is_return_1),0) AS is_return_1
+        ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+        ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+        ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+FROM    t_valid
+GROUP BY dt
+         ,apptype
+         ,abcode
+ORDER BY dt DESC,apptype,abcode
+;

+ 166 - 0
tasks/承接/线上实验/01_线上实验+模型预测_曝光+特征表_v4.sql

@@ -0,0 +1,166 @@
+-- Online experiment + model prediction report (v4) over the exposure+feature sample table.
+-- v4 adds a breakdown by the top-10 most exposed vid per experiment group, produced with
+-- GROUPING SETS, plus each row's share of its group's exposure (exp_pct).
+-- Output: one "all" row per (dt, apptype, abcode) and one row per top-10 vid in that group.
+--
+-- Step 1: extract scoresMap from extend_alg and bucket page into page_type.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Drop every backslash from the extracted scoresMap text
+            -- (presumably un-escaping an embedded JSON string; TODO confirm).
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- The whitelist below already excludes "ab100", so no extra NOT IN filter is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep only rows classified as "推荐".
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw abcode values to experiment-group labels and pull the model scores.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" is split into groups; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- rosn prediction = 1.22 * NorXGBScore ^ 1.15 (hard-coded transform; the origin
+            -- of the two constants is not visible in this file).
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            ,GET_JSON_OBJECT(v1_feature,'$.title') AS vid_title
+    FROM    t_filtered
+)
+-- Step 4: keep only rows that have both predictions.
+-- NOTE(review): rosn_stat is not filtered here, so rows with a NULL rosn_stat still count in
+-- the COUNT(1) denominators below while AVG(...) skips them; confirm this is intended.
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred IS NOT NULL
+)
+-- Step 5: rank vids by exposure count inside each (dt, apptype, abcode).
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            -- vid is the tiebreaker so that equal exposure counts always rank the same way
+            -- and the top-10 set is reproducible between runs.
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC, vid) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+-- Step 6: the 10 most exposed vids of each group.
+,t_top_vid AS
+(
+    SELECT  dt, apptype, abcode, vid, vid_rank
+    FROM    t_vid_rank
+    WHERE   vid_rank <= 10
+)
+-- Step 7: tag every exposure row with its vid when that vid is in the group's top 10;
+-- rows for other vids get NULL in the top_vid* columns.
+,t_with_top AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid ELSE NULL END AS top_vid
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid_title ELSE NULL END AS top_vid_title
+            ,b.vid_rank AS top_vid_rank
+    FROM    t_valid a
+    LEFT JOIN t_top_vid b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.vid = b.vid
+)
+-- Step 8: aggregate at group level ("all") and at group x top vid level.
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            -- In the group-level grouping set top_vid is NULL, which is labelled 'all'.
+            ,COALESCE(CAST(top_vid AS STRING), 'all') AS vid
+            ,CASE WHEN GROUPING(top_vid) = 1 THEN NULL ELSE MAX(top_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top_vid) = 1 THEN NULL ELSE MAX(top_vid_rank) END AS vid_rank
+            -- COPC = observed / predicted
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model prediction vs. observed value
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Prediction error (per-row absolute error against return_n_uv_noself)
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- Business ratios. Every denominator that can be 0 is wrapped in NULLIF, matching
+            -- rosn_real above, so an empty denominator yields 0 via COALESCE on any engine.
+            ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            -- Raw counts behind the ratios
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top
+    GROUP BY dt, apptype, abcode, top_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top_vid)
+    )
+    -- Drop the detail-level bucket that lumps together all non-top vids (top_vid NULL
+    -- without being rolled up); keep real top vids and the group-level row.
+    HAVING  top_vid IS NOT NULL OR GROUPING(top_vid) = 1
+)
+-- Step 9: add each row's exposure share relative to its group's 'all' row.
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,vid
+        ,vid_title
+        ,vid_rank
+        ,round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+        ,str_copc, rosn_copc, rosn_stat_copc, rovn_copc, rovn_stat_copc
+        ,str_real, str_pred, rosn_real, rosn_pred, rosn_stat
+        ,rovn_real, rovn_pred, rovn_stat
+        ,rosn_pred_mae, rosn_stat_mae
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+FROM    t_agg
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 165 - 0
tasks/承接/线上实验/01_线上实验+模型预测_曝光+特征表_v5.sql

@@ -0,0 +1,165 @@
+-- Online experiment + model prediction report (v5) over the exposure+feature sample table.
+-- v5 breaks each experiment group down by first-level category (merge_first_level_cate),
+-- limited to the 10 most exposed categories per group, using GROUPING SETS, and adds each
+-- row's share of its group's exposure (exp_pct).
+-- Output: one "all" row per (dt, apptype, abcode) and one row per top-10 category.
+--
+-- Step 1: extract scoresMap from extend_alg and bucket page into page_type.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Drop every backslash from the extracted scoresMap text
+            -- (presumably un-escaping an embedded JSON string; TODO confirm).
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- The whitelist below already excludes "ab100", so no extra NOT IN filter is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep only rows classified as "推荐".
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw abcode values to experiment-group labels, pull the model scores and
+-- the category fields.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" is split into groups; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- rosn prediction = 1.22 * NorXGBScore ^ 1.15 (hard-coded transform; the origin
+            -- of the two constants is not visible in this file).
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            -- Category fields; a missing value becomes the literal 'unknown' so it still
+            -- joins and groups like any other category.
+            ,COALESCE(GET_JSON_OBJECT(v1_feature,'$.merge_first_level_cate'), 'unknown') AS cate1
+            ,COALESCE(GET_JSON_OBJECT(v1_feature,'$.merge_second_level_cate'), 'unknown') AS cate2
+    FROM    t_filtered
+)
+-- Step 4: keep only rows that have both predictions.
+-- NOTE(review): rosn_stat is not filtered here, so rows with a NULL rosn_stat still count in
+-- the COUNT(1) denominators below while AVG(...) skips them; confirm this is intended.
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred IS NOT NULL
+)
+-- Step 5: rank first-level categories by exposure count inside each (dt, apptype, abcode).
+,t_cate_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,cate1
+            ,COUNT(1) AS cate_exp_cnt
+            -- cate1 is the tiebreaker so that equal exposure counts always rank the same
+            -- way and the top-10 set is reproducible between runs.
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC, cate1) AS cate_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, cate1
+)
+-- Step 6: the 10 most exposed categories of each group.
+,t_top_cate AS
+(
+    SELECT  dt, apptype, abcode, cate1, cate_rank
+    FROM    t_cate_rank
+    WHERE   cate_rank <= 10
+)
+-- Step 7: tag every exposure row with its category when that category is in the group's
+-- top 10; rows for other categories get NULL in the top_cate* columns.
+,t_with_top AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.cate1 IS NOT NULL THEN a.cate1 ELSE NULL END AS top_cate1
+            ,b.cate_rank AS top_cate_rank
+    FROM    t_valid a
+    LEFT JOIN t_top_cate b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.cate1 = b.cate1
+)
+-- Step 8: aggregate at group level ("all") and at group x top category level.
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            -- In the group-level grouping set top_cate1 is NULL, which is labelled 'all'.
+            ,COALESCE(top_cate1, 'all') AS cate1
+            ,CASE WHEN GROUPING(top_cate1) = 1 THEN NULL ELSE MAX(top_cate_rank) END AS cate_rank
+            -- COPC = observed / predicted
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model prediction vs. observed value
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Prediction error (per-row absolute error against return_n_uv_noself)
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- Business ratios. Every denominator that can be 0 is wrapped in NULLIF, matching
+            -- rosn_real above, so an empty denominator yields 0 via COALESCE on any engine.
+            ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            -- Raw counts behind the ratios
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top
+    GROUP BY dt, apptype, abcode, top_cate1
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top_cate1)
+    )
+    -- Drop the detail-level bucket that lumps together all non-top categories (top_cate1
+    -- NULL without being rolled up); keep real top categories and the group-level row.
+    HAVING  top_cate1 IS NOT NULL OR GROUPING(top_cate1) = 1
+)
+-- Step 9: add each row's exposure share relative to its group's 'all' row.
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,cate1
+        ,cate_rank
+        ,round(exp * 1.0 / MAX(CASE WHEN cate1 = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+        ,str_copc, rosn_copc, rosn_stat_copc, rovn_copc, rovn_stat_copc
+        ,str_real, str_pred, rosn_real, rosn_pred, rosn_stat
+        ,rovn_real, rovn_pred, rovn_stat
+        ,rosn_pred_mae, rosn_stat_mae
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+FROM    t_agg
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 83 - 0
tasks/承接/线上实验/01_线上实验_曝光+特征表.sql

@@ -0,0 +1,83 @@
+-- Online experiment business metrics read from loghubods.dwd_recsys_alg_sample_all_20250212.
+-- For partition dt = '${dt}' it reports, per experiment group, one subtotal row
+-- (page = "sum") and one row per page bucket: exposure-based ratios and their raw counts.
+--
+-- t_base: label each exposure row with its experiment group and page bucket.
+WITH t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" is split into groups; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            -- The WHERE clause admits only the six page values listed here, so the ELSE
+            -- branch is a safety net rather than an expected outcome.
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+    -- The whitelist below already excludes "ab100", so no extra NOT IN filter is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+)
+SELECT  dt
+        -- Columns rolled up by GROUPING SETS come back as NULL and are labelled "sum".
+        ,COALESCE(apptype,"sum") AS apptype
+        ,COALESCE(abcode,"sum") AS abcode
+        ,COALESCE(page,"sum") AS page
+        -- Ratios. Every denominator that can be 0 is wrapped in NULLIF so an empty
+        -- denominator yields 0 via COALESCE regardless of the engine's divide-by-zero rule.
+        ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+        ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+        ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+        ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+        ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+        ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+        -- Raw counts behind the ratios
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(1) AS exp
+        ,COALESCE(SUM(is_share),0) AS is_share
+        ,COALESCE(SUM(share_cnt),0) AS share_cnt
+        ,COALESCE(SUM(is_return_1),0) AS is_return_1
+        ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+        ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+        ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+FROM    t_base
+GROUP BY dt
+         ,apptype
+         ,abcode
+         ,page
+GROUPING SETS ((dt,apptype,abcode)
+              ,(dt,apptype,abcode,page))
+ORDER BY dt DESC,apptype,page,abcode
+;

+ 83 - 0
tasks/承接/线上实验/01_线上实验_曝光表.sql

@@ -0,0 +1,83 @@
+-- Online experiment business metrics read from loghubods.dwd_recsys_alg_exposure_base_20250108.
+-- For partition dt = '${dt}' it reports, per experiment group, one subtotal row
+-- (page = "sum") and one row per page bucket: exposure-based ratios and their raw counts.
+--
+-- t_base: label each exposure row with its experiment group and page bucket.
+WITH t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" is split into groups; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            -- The WHERE clause admits only the six page values listed here, so the ELSE
+            -- branch is a safety net rather than an expected outcome.
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+    -- The whitelist below already excludes "ab100", so no extra NOT IN filter is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+)
+SELECT  dt
+        -- Columns rolled up by GROUPING SETS come back as NULL and are labelled "sum".
+        ,COALESCE(apptype,"sum") AS apptype
+        ,COALESCE(abcode,"sum") AS abcode
+        ,COALESCE(page,"sum") AS page
+        -- Ratios. Every denominator that can be 0 is wrapped in NULLIF so an empty
+        -- denominator yields 0 via COALESCE regardless of the engine's divide-by-zero rule.
+        ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+        ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+        ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+        ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+        ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+        ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+        -- Raw counts behind the ratios
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(1) AS exp
+        ,COALESCE(SUM(is_share),0) AS is_share
+        ,COALESCE(SUM(share_cnt),0) AS share_cnt
+        ,COALESCE(SUM(is_return_1),0) AS is_return_1
+        ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+        ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+        ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+FROM    t_base
+GROUP BY dt
+         ,apptype
+         ,abcode
+         ,page
+GROUPING SETS ((dt,apptype,abcode)
+              ,(dt,apptype,abcode,page))
+ORDER BY dt DESC,apptype,page,abcode
+;

+ 113 - 0
tasks/承接/线上实验/02_模型预测误差_仅回流样本.sql

@@ -0,0 +1,113 @@
+-- Model prediction error report for partition ${dt}, recommendation pages only.
+-- One output row per (dt, apptype, experiment group). Besides whole-sample
+-- means it reports error metrics restricted to samples with
+-- is_return_noself = 1 ("return samples") and to samples with is_return_noself = 0.
+--
+-- Step 1: extract scoresMap and classify each page.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Remove backslashes from the extracted scoresMap so that t_base can read its keys with GET_JSON_OBJECT.
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- This allow-list already rules out every other bucket (e.g. "ab100"), so no extra NOT IN is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep recommendation pages only.
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw buckets to experiment groups and extract the model scores.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" buckets get a named group; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- ros_pred = 1.22 * NorXGBScore ^ 1.15. NOTE(review): the origin of these constants is not visible here.
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    t_filtered
+)
+-- Step 4: aggregate per (dt, apptype, experiment group).
+-- page is not a grouping dimension: after step 2 every row has page = "推荐".
+SELECT  dt
+        ,COALESCE(apptype,"sum") AS apptype
+        ,COALESCE(abcode,"sum") AS abcode
+        -- Sample counts
+        ,COUNT(1) AS sample_cnt
+        ,SUM(is_return_noself) AS return_sample_cnt
+        ,round(SUM(is_return_noself) / COUNT(1), 6) AS return_sample_ratio
+        -- Whole-sample metrics
+        -- NOTE(review): SUM(x) / COUNT(1) keeps rows with a NULL prediction in the denominator; confirm this is intended.
+        ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+        ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+        ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS ros_real
+        ,round(COALESCE(SUM(ros_pred) / COUNT(1),0),6) AS ros_pred
+        ,round(COALESCE(SUM(ros_stat) / COUNT(1),0),6) AS ros_stat
+        -- Means over return samples only
+        ,round(AVG(IF(is_return_noself=1, return_n_uv_noself, NULL)),6) AS ros_real_return
+        ,round(AVG(IF(is_return_noself=1, ros_pred, NULL)),6) AS ros_pred_return
+        ,round(AVG(IF(is_return_noself=1, ros_stat, NULL)),6) AS ros_stat_return
+        -- Bias over return samples only (positive = over-estimate, negative = under-estimate)
+        ,round(AVG(IF(is_return_noself=1, ros_pred - return_n_uv_noself, NULL)),6) AS ros_pred_bias
+        ,round(AVG(IF(is_return_noself=1, ros_stat - return_n_uv_noself, NULL)),6) AS ros_stat_bias
+        -- Mean absolute error over return samples only
+        ,round(AVG(IF(is_return_noself=1, ABS(ros_pred - return_n_uv_noself), NULL)),6) AS ros_pred_mae_return
+        ,round(AVG(IF(is_return_noself=1, ABS(ros_stat - return_n_uv_noself), NULL)),6) AS ros_stat_mae_return
+        -- Error split into over- and under-estimation (return samples only)
+        ,round(AVG(IF(is_return_noself=1 AND ros_pred > return_n_uv_noself, ros_pred - return_n_uv_noself, NULL)),6) AS ros_pred_mae_over
+        ,round(AVG(IF(is_return_noself=1 AND ros_pred < return_n_uv_noself, return_n_uv_noself - ros_pred, NULL)),6) AS ros_pred_mae_under
+        ,round(AVG(IF(is_return_noself=1 AND ros_stat > return_n_uv_noself, ros_stat - return_n_uv_noself, NULL)),6) AS ros_stat_mae_over
+        ,round(AVG(IF(is_return_noself=1 AND ros_stat < return_n_uv_noself, return_n_uv_noself - ros_stat, NULL)),6) AS ros_stat_mae_under
+        -- Error on non-return samples (true value = 0)
+        ,round(AVG(IF(is_return_noself=0, ABS(ros_pred), NULL)),6) AS ros_pred_mae_noreturn
+        ,round(AVG(IF(is_return_noself=0, ABS(ros_stat), NULL)),6) AS ros_stat_mae_noreturn
+        -- Business metrics
+        -- Aggregate denominators are wrapped in NULLIF(..., 0): a zero denominator gives NULL, which COALESCE reports as 0.
+        ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+        ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+        ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+        ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+        ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+        ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(1) AS exp
+        ,COALESCE(SUM(is_share),0) AS is_share
+        ,COALESCE(SUM(share_cnt),0) AS share_cnt
+        ,COALESCE(SUM(is_return_1),0) AS is_return_1
+        ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+        ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+        ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+FROM    t_base
+GROUP BY dt
+         ,apptype
+         ,abcode
+ORDER BY dt DESC,apptype,abcode
+;

+ 82 - 0
tasks/承接/线上实验/03_模型预测分桶验证.sql

@@ -0,0 +1,82 @@
+-- Decile validation of the ROS scores for partition ${dt}, recommendation pages only.
+-- Samples are split into 10 NTILE buckets by ros_pred and, independently, by ros_stat;
+-- each bucketing is aggregated separately and tagged through bucket_type.
+--
+-- Step 1: extract scoresMap and classify each page.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Remove backslashes from the extracted scoresMap so that t_base can read its keys with GET_JSON_OBJECT.
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- This allow-list already rules out every other bucket (e.g. "ab100"), so no extra NOT IN is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep recommendation pages only.
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw buckets to experiment groups and extract the model scores.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" buckets get a named group; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,mid
+            ,vid
+            ,is_return_noself
+            ,return_n_uv_noself
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- ros_pred = 1.22 * NorXGBScore ^ 1.15. NOTE(review): the origin of these constants is not visible here.
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    t_filtered
+)
+-- Step 4: assign deciles within each (dt, apptype, experiment group).
+-- NOTE(review): rows whose score is NULL are not filtered and still receive a bucket; confirm this is intended.
+,t_bucket AS
+(
+    SELECT  *
+            ,NTILE(10) OVER (PARTITION BY dt, apptype, abcode ORDER BY ros_pred) AS ros_pred_bucket
+            ,NTILE(10) OVER (PARTITION BY dt, apptype, abcode ORDER BY ros_stat) AS ros_stat_bucket
+    FROM    t_base
+)
+-- Step 5: aggregate per bucket, once for each bucketing score.
+,t_report AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,ros_pred_bucket AS bucket
+            ,'ros_pred' AS bucket_type
+            ,COUNT(1) AS sample_cnt
+            ,SUM(is_return_noself) AS return_cnt
+            ,round(SUM(is_return_noself) / COUNT(1), 6) AS str_real
+            ,round(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_real
+            ,round(AVG(ros_pred), 4) AS ros_pred_avg
+            ,round(AVG(ros_stat), 4) AS ros_stat_avg
+    FROM    t_bucket
+    GROUP BY dt, apptype, abcode, ros_pred_bucket
+    UNION ALL
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,ros_stat_bucket AS bucket
+            ,'ros_stat' AS bucket_type
+            ,COUNT(1) AS sample_cnt
+            ,SUM(is_return_noself) AS return_cnt
+            ,round(SUM(is_return_noself) / COUNT(1), 6) AS str_real
+            ,round(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_real
+            ,round(AVG(ros_pred), 4) AS ros_pred_avg
+            ,round(AVG(ros_stat), 4) AS ros_stat_avg
+    FROM    t_bucket
+    GROUP BY dt, apptype, abcode, ros_stat_bucket
+)
+-- Sort outside the UNION ALL so the ORDER BY unambiguously applies to both branches
+-- (an ORDER BY written directly after a UNION may bind to the last branch only, depending on session settings).
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,bucket
+        ,bucket_type
+        ,sample_cnt
+        ,return_cnt
+        ,str_real
+        ,ros_real
+        ,ros_pred_avg
+        ,ros_stat_avg
+FROM    t_report
+ORDER BY dt DESC, apptype, abcode, bucket_type, bucket
+;

+ 118 - 0
tasks/承接/线上实验/04_裂变率预测对比.sql

@@ -0,0 +1,118 @@
+-- Fission-rate (rovn = str * ros) prediction comparison for partition ${dt},
+-- recommendation pages only. One output row per (dt, apptype, experiment group),
+-- comparing the realised value with the model prediction and with the statistic-based score.
+--
+-- Step 1: extract scoresMap and classify each page.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Remove backslashes from the extracted scoresMap so that t_base can read its keys with GET_JSON_OBJECT.
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- This allow-list already rules out every other bucket (e.g. "ab100"), so no extra NOT IN is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep recommendation pages only.
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw buckets to experiment groups and extract the model scores.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" buckets get a named group; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- ros_pred = 1.22 * NorXGBScore ^ 1.15. NOTE(review): the origin of these constants is not visible here.
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    t_filtered
+)
+-- Step 4: aggregate per (dt, apptype, experiment group).
+SELECT  dt
+        ,COALESCE(apptype,"sum") AS apptype
+        ,COALESCE(abcode,"sum") AS abcode
+        -- Sample counts
+        ,COUNT(1) AS sample_cnt
+        ,SUM(is_return_noself) AS return_sample_cnt
+        ,round(SUM(is_return_noself) / COUNT(1), 6) AS return_sample_ratio
+        -- Whole-sample metrics
+        ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+        ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+        ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS ros_real
+        ,round(COALESCE(SUM(ros_pred) / COUNT(1),0),6) AS ros_pred
+        ,round(COALESCE(SUM(ros_stat) / COUNT(1),0),6) AS ros_stat
+        -- Fission rate = str * ros (realised vs. predicted vs. statistic-based)
+        -- NOTE(review): these use SUM(x) / COUNT(1), which keeps rows with a NULL prediction in the
+        -- denominator, while the bias/MAE columns below use AVG(x), which skips them; confirm which is intended.
+        ,round(COALESCE(SUM(return_n_uv_noself) / COUNT(1),0),6) AS rovn_real
+        ,round(COALESCE(SUM(str_pred * ros_pred) / COUNT(1),0),6) AS rovn_pred
+        ,round(COALESCE(SUM(str_pred * ros_stat) / COUNT(1),0),6) AS rovn_stat
+        -- Fission-rate error over all samples
+        ,round(AVG(str_pred * ros_pred - return_n_uv_noself),6) AS rovn_pred_bias
+        ,round(AVG(str_pred * ros_stat - return_n_uv_noself),6) AS rovn_stat_bias
+        ,round(AVG(ABS(str_pred * ros_pred - return_n_uv_noself)),6) AS rovn_pred_mae
+        ,round(AVG(ABS(str_pred * ros_stat - return_n_uv_noself)),6) AS rovn_stat_mae
+        -- Means over return samples only (is_return_noself = 1)
+        ,round(AVG(IF(is_return_noself=1, return_n_uv_noself, NULL)),6) AS ros_real_return
+        ,round(AVG(IF(is_return_noself=1, ros_pred, NULL)),6) AS ros_pred_return
+        ,round(AVG(IF(is_return_noself=1, ros_stat, NULL)),6) AS ros_stat_return
+        -- Bias over return samples only (positive = over-estimate, negative = under-estimate)
+        ,round(AVG(IF(is_return_noself=1, ros_pred - return_n_uv_noself, NULL)),6) AS ros_pred_bias
+        ,round(AVG(IF(is_return_noself=1, ros_stat - return_n_uv_noself, NULL)),6) AS ros_stat_bias
+        -- Mean absolute error over return samples only
+        ,round(AVG(IF(is_return_noself=1, ABS(ros_pred - return_n_uv_noself), NULL)),6) AS ros_pred_mae_return
+        ,round(AVG(IF(is_return_noself=1, ABS(ros_stat - return_n_uv_noself), NULL)),6) AS ros_stat_mae_return
+        -- Error split into over- and under-estimation (return samples only)
+        ,round(AVG(IF(is_return_noself=1 AND ros_pred > return_n_uv_noself, ros_pred - return_n_uv_noself, NULL)),6) AS ros_pred_mae_over
+        ,round(AVG(IF(is_return_noself=1 AND ros_pred < return_n_uv_noself, return_n_uv_noself - ros_pred, NULL)),6) AS ros_pred_mae_under
+        ,round(AVG(IF(is_return_noself=1 AND ros_stat > return_n_uv_noself, ros_stat - return_n_uv_noself, NULL)),6) AS ros_stat_mae_over
+        ,round(AVG(IF(is_return_noself=1 AND ros_stat < return_n_uv_noself, return_n_uv_noself - ros_stat, NULL)),6) AS ros_stat_mae_under
+        -- Error on non-return samples (true value = 0)
+        ,round(AVG(IF(is_return_noself=0, ABS(ros_pred), NULL)),6) AS ros_pred_mae_noreturn
+        ,round(AVG(IF(is_return_noself=0, ABS(ros_stat), NULL)),6) AS ros_stat_mae_noreturn
+        -- Business metrics
+        -- Aggregate denominators are wrapped in NULLIF(..., 0): a zero denominator gives NULL, which COALESCE reports as 0.
+        ,round(COALESCE(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0),0),2) AS exp_per_dau
+        ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+        ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+        ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+        ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+        ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(1) AS exp
+        ,COALESCE(SUM(is_share),0) AS is_share
+        ,COALESCE(SUM(share_cnt),0) AS share_cnt
+        ,COALESCE(SUM(is_return_1),0) AS is_return_1
+        ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+        ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+        ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+FROM    t_base
+GROUP BY dt
+         ,apptype
+         ,abcode
+ORDER BY dt DESC,apptype,abcode
+;

+ 78 - 0
tasks/承接/线上实验/05_str分桶ros诊断.sql

@@ -0,0 +1,78 @@
+-- Bucket samples by str_pred to diagnose whether P(share) and E(return | share) are aligned.
+-- Goal: check whether ros is under-estimated more severely in the high share-probability buckets.
+-- Scope: partition ${dt}, recommendation pages only; one row per (dt, apptype, experiment group, decile).
+--
+-- Step 1: extract scoresMap and classify each page.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Remove backslashes from the extracted scoresMap so that t_base can read its keys with GET_JSON_OBJECT.
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- This allow-list already rules out every other bucket (e.g. "ab100"), so no extra NOT IN is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep recommendation pages only.
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw buckets to experiment groups and extract the model scores.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" buckets get a named group; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,mid
+            ,vid
+            ,is_return_noself
+            ,return_n_uv_noself
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- ros_pred = 1.22 * NorXGBScore ^ 1.15. NOTE(review): the origin of these constants is not visible here.
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    t_filtered
+)
+-- Step 4: assign str_pred deciles within each (dt, apptype, experiment group).
+-- NOTE(review): rows with a NULL str_pred are not filtered here and still receive a bucket.
+,t_bucket AS
+(
+    SELECT  *
+            ,NTILE(10) OVER (PARTITION BY dt, apptype, abcode ORDER BY str_pred) AS str_pred_bucket
+    FROM    t_base
+)
+-- Step 5: aggregate per decile.
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,str_pred_bucket AS bucket
+        ,COUNT(1) AS sample_cnt
+        ,SUM(is_return_noself) AS return_cnt
+        -- str: realised vs. predicted
+        ,round(SUM(is_return_noself) / COUNT(1), 6) AS str_real
+        ,round(AVG(str_pred), 6) AS str_pred_avg
+        -- ros over all samples: realised vs. predicted vs. statistic-based
+        ,round(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_real
+        ,round(AVG(ros_pred), 4) AS ros_pred_avg
+        ,round(AVG(ros_stat), 4) AS ros_stat_avg
+        -- ros bias (positive = over-estimate, negative = under-estimate)
+        ,round(AVG(ros_pred) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_pred_bias
+        ,round(AVG(ros_stat) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_stat_bias
+        -- ros over return samples only (is_return_noself = 1)
+        ,round(AVG(IF(is_return_noself=1, return_n_uv_noself, NULL)), 4) AS ros_real_return
+        ,round(AVG(IF(is_return_noself=1, ros_pred, NULL)), 4) AS ros_pred_return
+        ,round(AVG(IF(is_return_noself=1, ros_stat, NULL)), 4) AS ros_stat_return
+        -- rovn = str * ros (fission rate)
+        ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+        ,round(AVG(str_pred * ros_pred), 6) AS rovn_pred
+        ,round(AVG(str_pred * ros_stat), 6) AS rovn_stat
+FROM    t_bucket
+GROUP BY dt, apptype, abcode, str_pred_bucket
+-- Sort by the output alias (bucket) rather than the pre-alias column name.
+ORDER BY dt DESC, apptype, abcode, bucket
+;

+ 87 - 0
tasks/承接/线上实验/06_str分桶ros诊断_仅有预测值.sql

@@ -0,0 +1,87 @@
+-- Bucket samples by str_pred to diagnose whether P(share) and E(return | share) are aligned.
+-- Goal: check whether ros is under-estimated more severely in the high share-probability buckets.
+-- Note: only samples that carry predictions are analysed (rows with NULL str_pred or NULL ros_pred are dropped).
+-- Scope: partition ${dt}, recommendation pages only; one row per (dt, apptype, experiment group, decile).
+--
+-- Step 1: extract scoresMap and classify each page.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Remove backslashes from the extracted scoresMap so that t_base can read its keys with GET_JSON_OBJECT.
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- This allow-list already rules out every other bucket (e.g. "ab100"), so no extra NOT IN is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep recommendation pages only.
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw buckets to experiment groups and extract the model scores.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" buckets get a named group; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,mid
+            ,vid
+            ,is_return_noself
+            ,return_n_uv_noself
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- ros_pred = 1.22 * NorXGBScore ^ 1.15. NOTE(review): the origin of these constants is not visible here.
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    t_filtered
+)
+-- Step 4: keep only samples that have both predictions (ros_stat may still be NULL).
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     ros_pred IS NOT NULL
+)
+-- Step 5: assign str_pred deciles within each (dt, apptype, experiment group).
+,t_bucket AS
+(
+    SELECT  *
+            ,NTILE(10) OVER (PARTITION BY dt, apptype, abcode ORDER BY str_pred) AS str_pred_bucket
+    FROM    t_valid
+)
+-- Step 6: aggregate per decile.
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,str_pred_bucket AS bucket
+        ,COUNT(1) AS sample_cnt
+        ,SUM(is_return_noself) AS return_cnt
+        -- str: realised vs. predicted
+        ,round(SUM(is_return_noself) / COUNT(1), 6) AS str_real
+        ,round(AVG(str_pred), 6) AS str_pred_avg
+        -- ros over all samples: realised vs. predicted vs. statistic-based
+        ,round(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_real
+        ,round(AVG(ros_pred), 4) AS ros_pred_avg
+        ,round(AVG(ros_stat), 4) AS ros_stat_avg
+        -- ros bias (positive = over-estimate, negative = under-estimate)
+        ,round(AVG(ros_pred) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_pred_bias
+        ,round(AVG(ros_stat) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_stat_bias
+        -- ros over return samples only (is_return_noself = 1)
+        ,round(AVG(IF(is_return_noself=1, return_n_uv_noself, NULL)), 4) AS ros_real_return
+        ,round(AVG(IF(is_return_noself=1, ros_pred, NULL)), 4) AS ros_pred_return
+        ,round(AVG(IF(is_return_noself=1, ros_stat, NULL)), 4) AS ros_stat_return
+        -- rovn = str * ros (fission rate)
+        ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+        ,round(AVG(str_pred * ros_pred), 6) AS rovn_pred
+        ,round(AVG(str_pred * ros_stat), 6) AS rovn_stat
+FROM    t_bucket
+GROUP BY dt, apptype, abcode, str_pred_bucket
+-- Sort by the output alias (bucket) rather than the pre-alias column name.
+ORDER BY dt DESC, apptype, abcode, bucket
+;

+ 96 - 0
tasks/承接/线上实验/06a_str_pred分桶诊断.sql

@@ -0,0 +1,96 @@
+-- Decile diagnosis by str_pred (the predicted share probability).
+-- Goal: check how well the model is calibrated across ranges of predicted share probability.
+-- Output has one row per decile plus a '全部' total row per (dt, apptype, experiment group).
+-- Scope: partition ${dt}, recommendation pages only, samples with both predictions present.
+--
+-- Step 1: extract scoresMap and classify each page.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Remove backslashes from the extracted scoresMap so that t_base can read its keys with GET_JSON_OBJECT.
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- This allow-list already rules out every other bucket (e.g. "ab100"), so no extra NOT IN is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep recommendation pages only.
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw buckets to experiment groups and extract the model scores.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" buckets get a named group; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_noself
+            ,return_n_uv_noself
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- ros_pred = 1.22 * NorXGBScore ^ 1.15. NOTE(review): the origin of these constants is not visible here.
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    t_filtered
+)
+-- Step 4: keep only samples that have both predictions (ros_stat may still be NULL).
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     ros_pred IS NOT NULL
+)
+-- Step 5: assign str_pred deciles within each (dt, apptype, experiment group).
+,t_bucket AS
+(
+    SELECT  *
+            ,NTILE(10) OVER (PARTITION BY dt, apptype, abcode ORDER BY str_pred) AS bucket
+    FROM    t_valid
+)
+-- Step 6: aggregate per decile and per group total.
+SELECT  dt
+        ,apptype
+        ,abcode
+        -- bucket is NULL only on the subtotal rows produced by GROUPING SETS; those are labelled '全部'.
+        ,COALESCE(CAST(bucket AS STRING), '全部') AS bucket
+        -- Sample counts
+        ,COUNT(1) AS sample_cnt
+        ,SUM(is_return_noself) AS return_cnt
+        -- str: realised vs. predicted
+        ,round(SUM(is_return_noself) / COUNT(1), 6) AS str_real
+        ,round(AVG(str_pred), 6) AS str_pred_avg
+        ,round(AVG(str_pred) - SUM(is_return_noself) / COUNT(1), 6) AS str_bias
+        -- ros over all samples
+        ,round(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_real
+        ,round(AVG(ros_pred), 4) AS ros_pred_avg
+        ,round(AVG(ros_stat), 4) AS ros_stat_avg
+        -- ros bias (positive = over-estimate, negative = under-estimate)
+        ,round(AVG(ros_pred) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_pred_bias
+        ,round(AVG(ros_stat) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_stat_bias
+        -- ros over return samples only (is_return_noself = 1)
+        ,round(AVG(IF(is_return_noself=1, return_n_uv_noself, NULL)), 4) AS ros_real_return
+        ,round(AVG(IF(is_return_noself=1, ros_pred, NULL)), 4) AS ros_pred_return
+        ,round(AVG(IF(is_return_noself=1, ros_stat, NULL)), 4) AS ros_stat_return
+        -- Bias over return samples only
+        ,round(AVG(IF(is_return_noself=1, ros_pred - return_n_uv_noself, NULL)), 4) AS ros_pred_bias_return
+        ,round(AVG(IF(is_return_noself=1, ros_stat - return_n_uv_noself, NULL)), 4) AS ros_stat_bias_return
+        -- MAE over return samples only
+        ,round(AVG(IF(is_return_noself=1, ABS(ros_pred - return_n_uv_noself), NULL)), 4) AS ros_pred_mae_return
+        ,round(AVG(IF(is_return_noself=1, ABS(ros_stat - return_n_uv_noself), NULL)), 4) AS ros_stat_mae_return
+        -- rovn = str * ros (fission rate)
+        ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+        ,round(AVG(str_pred * ros_pred), 6) AS rovn_pred
+        ,round(AVG(str_pred * ros_stat), 6) AS rovn_stat
+        -- Numeric sort key: 1..10 for the deciles, 11 for the '全部' total row.
+        -- The bucket output column is a string, so sorting on it would place "10" before "2".
+        ,COALESCE(bucket, 11) AS bucket_order
+FROM    t_bucket
+GROUP BY dt, apptype, abcode, bucket
+GROUPING SETS ((dt, apptype, abcode, bucket), (dt, apptype, abcode))
+ORDER BY dt DESC, apptype, abcode, bucket_order
+;

+ 126 - 0
tasks/承接/线上实验/06a_str_pred分桶诊断_full.sql

@@ -0,0 +1,126 @@
+-- Decile diagnosis by str_pred (the predicted share probability) - full metric version.
+-- Intended to carry the metrics of the 02 error report plus decile bucketing.
+-- Output has one row per decile plus a '全部' total row per (dt, apptype, experiment group).
+-- Scope: partition ${dt}, recommendation pages only, samples with both predictions present.
+--
+-- Step 1: extract scoresMap and classify each page.
+WITH t_raw AS
+(
+    SELECT  *
+            -- Remove backslashes from the extracted scoresMap so that t_base can read its keys with GET_JSON_OBJECT.
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    -- This allow-list already rules out every other bucket (e.g. "ab100"), so no extra NOT IN is needed.
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Step 2: keep recommendation pages only.
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Step 3: map raw buckets to experiment groups and extract the model scores.
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            -- Only apptype "4" buckets get a named group; every other row falls into "其他".
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- ros_pred = 1.22 * NorXGBScore ^ 1.15. NOTE(review): the origin of these constants is not visible here.
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    t_filtered
+)
+-- Step 4: keep only samples that have both predictions (ros_stat may still be NULL).
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     ros_pred IS NOT NULL
+)
+-- Step 5: assign str_pred deciles within each (dt, apptype, experiment group).
+,t_bucket AS
+(
+    SELECT  *
+            ,NTILE(10) OVER (PARTITION BY dt, apptype, abcode ORDER BY str_pred) AS bucket
+    FROM    t_valid
+)
+-- Step 6: aggregate per decile and per group total.
+SELECT  dt
+        ,apptype
+        ,abcode
+        -- bucket is NULL only on the subtotal rows produced by GROUPING SETS; those are labelled '全部'.
+        ,COALESCE(CAST(bucket AS STRING), '全部') AS bucket
+        -- Sample counts
+        ,COUNT(1) AS sample_cnt
+        ,SUM(is_return_noself) AS return_cnt
+        ,round(SUM(is_return_noself) / COUNT(1), 6) AS return_rate
+        -- str: realised vs. predicted
+        ,round(SUM(is_return_noself) / COUNT(1), 6) AS str_real
+        ,round(AVG(str_pred), 6) AS str_pred_avg
+        ,round(MIN(str_pred), 6) AS str_pred_min
+        ,round(MAX(str_pred), 6) AS str_pred_max
+        ,round(AVG(str_pred) - SUM(is_return_noself) / COUNT(1), 6) AS str_bias
+        -- Range of ros predictions inside the bucket
+        ,round(MIN(ros_pred), 4) AS ros_pred_min
+        ,round(MAX(ros_pred), 4) AS ros_pred_max
+        -- ros over all samples
+        ,round(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_real
+        ,round(AVG(ros_pred), 4) AS ros_pred_avg
+        ,round(AVG(ros_stat), 4) AS ros_stat_avg
+        -- ros bias over all samples (positive = over-estimate, negative = under-estimate)
+        ,round(AVG(ros_pred) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_pred_bias
+        ,round(AVG(ros_stat) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_stat_bias
+        -- ros over return samples only (is_return_noself = 1)
+        ,round(AVG(IF(is_return_noself=1, return_n_uv_noself, NULL)), 4) AS ros_real_return
+        ,round(AVG(IF(is_return_noself=1, ros_pred, NULL)), 4) AS ros_pred_return
+        ,round(AVG(IF(is_return_noself=1, ros_stat, NULL)), 4) AS ros_stat_return
+        -- Bias over return samples only
+        ,round(AVG(IF(is_return_noself=1, ros_pred - return_n_uv_noself, NULL)), 4) AS ros_pred_bias_return
+        ,round(AVG(IF(is_return_noself=1, ros_stat - return_n_uv_noself, NULL)), 4) AS ros_stat_bias_return
+        -- MAE over return samples only
+        ,round(AVG(IF(is_return_noself=1, ABS(ros_pred - return_n_uv_noself), NULL)), 4) AS ros_pred_mae_return
+        ,round(AVG(IF(is_return_noself=1, ABS(ros_stat - return_n_uv_noself), NULL)), 4) AS ros_stat_mae_return
+        -- Error split into over- and under-estimation (return samples only)
+        ,round(AVG(IF(is_return_noself=1 AND ros_pred > return_n_uv_noself, ros_pred - return_n_uv_noself, NULL)), 4) AS ros_pred_mae_over
+        ,round(AVG(IF(is_return_noself=1 AND ros_pred < return_n_uv_noself, return_n_uv_noself - ros_pred, NULL)), 4) AS ros_pred_mae_under
+        ,round(AVG(IF(is_return_noself=1 AND ros_stat > return_n_uv_noself, ros_stat - return_n_uv_noself, NULL)), 4) AS ros_stat_mae_over
+        ,round(AVG(IF(is_return_noself=1 AND ros_stat < return_n_uv_noself, return_n_uv_noself - ros_stat, NULL)), 4) AS ros_stat_mae_under
+        -- Error on non-return samples (true value = 0)
+        ,round(AVG(IF(is_return_noself=0, ABS(ros_pred), NULL)), 4) AS ros_pred_mae_noreturn
+        ,round(AVG(IF(is_return_noself=0, ABS(ros_stat), NULL)), 4) AS ros_stat_mae_noreturn
+        -- rovn = str * ros (fission rate)
+        ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+        ,round(AVG(str_pred * ros_pred), 6) AS rovn_pred
+        ,round(AVG(str_pred * ros_stat), 6) AS rovn_stat
+        -- Business metrics (aggregate denominators are guarded with NULLIF(..., 0))
+        ,round(COUNT(1) / NULLIF(COUNT(DISTINCT mid), 0), 2) AS exp_per_dau
+        ,round(SUM(is_share) / COUNT(1), 6) AS str_one
+        ,round(SUM(return_n_uv) / NULLIF(SUM(is_share), 0), 4) AS ros_one
+        ,round(SUM(share_cnt) / COUNT(1), 6) AS str
+        ,round(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0), 4) AS ros
+        ,round(SUM(is_return_1) / COUNT(1), 6) AS str_plus
+        ,round(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0), 4) AS ros_minus
+        ,round(SUM(return_n_uv) / COUNT(1), 6) AS rovn_one
+        ,round(SUM(new_exposure_cnt) / COUNT(1), 6) AS vovh24
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(1) AS exp
+        -- Numeric sort key: 1..10 for the deciles, 11 for the '全部' total row.
+        -- The bucket output column is a string, so sorting on it would place "10" before "2".
+        ,COALESCE(bucket, 11) AS bucket_order
+FROM    t_bucket
+GROUP BY dt, apptype, abcode, bucket
+GROUPING SETS ((dt, apptype, abcode, bucket), (dt, apptype, abcode))
+ORDER BY dt DESC, apptype, abcode, bucket_order
+;

+ 99 - 0
tasks/承接/线上实验/06c_ros_pred分桶诊断.sql

@@ -0,0 +1,99 @@
+-- Calibration diagnosis bucketed by ros_pred (the predicted ros value).
+-- Goal: check how well the model is calibrated across ros prediction ranges.
+-- Every (dt, apptype, abcode) group yields ten NTILE rows plus one '全部' summary row.
+WITH rec_samples AS
+(
+    -- Samples of the requested day shown on recommendation pages that carry a scoresMap.
+    SELECT  dt,
+            apptype,
+            abcode,
+            mid,
+            vid,
+            is_share,
+            share_cnt,
+            is_return_noself,
+            return_n_uv_noself,
+            -- Remove every "\\" literal from the extracted scoresMap before re-parsing it.
+            REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     abcode NOT IN ("ab100")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+),
+scored AS
+(
+    -- Relabel abcode into experiment groups and extract the three scores.
+    SELECT  dt,
+            apptype,
+            CASE
+                WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                ELSE "其他"
+            END AS abcode,
+            mid,
+            vid,
+            is_share,
+            share_cnt,
+            is_return_noself,
+            return_n_uv_noself,
+            CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred,
+            -- ros_pred = 1.22 * NorXGBScore ^ 1.15
+            1.22 * POW(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred,
+            CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    rec_samples
+),
+bucketed AS
+(
+    -- Keep rows that have both predictions, then split each group into ten
+    -- equal-count buckets ordered by ros_pred.
+    SELECT  *,
+            NTILE(10) OVER (PARTITION BY dt, apptype, abcode ORDER BY ros_pred) AS bucket
+    FROM    scored
+    WHERE   str_pred IS NOT NULL
+    AND     ros_pred IS NOT NULL
+)
+SELECT  dt,
+        apptype,
+        abcode,
+        -- bucket is NULL only on the (dt, apptype, abcode) grouping set, i.e. the summary row.
+        COALESCE(CAST(bucket AS STRING), '全部') AS bucket,
+        -- Sample volume
+        COUNT(1) AS sample_cnt,
+        SUM(is_return_noself) AS return_cnt,
+        ROUND(SUM(is_return_noself) / COUNT(1), 6) AS return_rate,
+        -- ros_pred range
+        ROUND(MIN(ros_pred), 4) AS ros_pred_min,
+        ROUND(MAX(ros_pred), 4) AS ros_pred_max,
+        ROUND(AVG(ros_pred), 4) AS ros_pred_avg,
+        ROUND(AVG(ros_stat), 4) AS ros_stat_avg,
+        -- Observed ros (computed over all samples of the group)
+        ROUND(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_real,
+        -- ros bias (positive = over-estimate, negative = under-estimate)
+        ROUND(AVG(ros_pred) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_pred_bias,
+        ROUND(AVG(ros_stat) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_stat_bias,
+        -- ros restricted to samples with is_return_noself = 1
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END), 4) AS ros_real_return,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ros_pred END), 4) AS ros_pred_return,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ros_stat END), 4) AS ros_stat_return,
+        -- Bias restricted to samples with is_return_noself = 1
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ros_pred - return_n_uv_noself END), 4) AS ros_pred_bias_return,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ros_stat - return_n_uv_noself END), 4) AS ros_stat_bias_return,
+        -- MAE restricted to samples with is_return_noself = 1
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ABS(ros_pred - return_n_uv_noself) END), 4) AS ros_pred_mae_return,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ABS(ros_stat - return_n_uv_noself) END), 4) AS ros_stat_mae_return,
+        -- str
+        ROUND(SUM(is_return_noself) / COUNT(1), 6) AS str_real,
+        ROUND(AVG(str_pred), 6) AS str_pred_avg,
+        -- rovn = str * ros (fission rate)
+        ROUND(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real,
+        ROUND(AVG(str_pred * ros_pred), 6) AS rovn_pred,
+        ROUND(AVG(str_pred * ros_stat), 6) AS rovn_stat
+FROM    bucketed
+GROUP BY dt, apptype, abcode, bucket
+GROUPING SETS ((dt, apptype, abcode, bucket), (dt, apptype, abcode))
+-- NOTE(review): the output bucket is a STRING; if ORDER BY resolves to that alias the
+-- buckets sort lexically ('10' before '2') - confirm against the engine.
+ORDER BY dt DESC, apptype, abcode, bucket
+;

+ 124 - 0
tasks/承接/线上实验/06c_ros_pred分桶诊断_full.sql

@@ -0,0 +1,124 @@
+-- Calibration diagnosis bucketed by ros_pred (the predicted ros value) - full metric set.
+-- Every (dt, apptype, abcode) group yields ten NTILE rows plus one '全部' summary row.
+WITH rec_samples AS
+(
+    -- Samples of the requested day shown on recommendation pages that carry a scoresMap.
+    SELECT  dt,
+            apptype,
+            abcode,
+            mid,
+            vid,
+            is_share,
+            share_cnt,
+            is_return_1,
+            is_return_n,
+            is_return_noself,
+            return_1_uv,
+            return_n_uv,
+            return_n_uv_noself,
+            new_exposure_cnt,
+            -- Remove every "\\" literal from the extracted scoresMap before re-parsing it.
+            REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     abcode NOT IN ("ab100")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+),
+scored AS
+(
+    -- Relabel abcode into experiment groups and extract the three scores.
+    SELECT  dt,
+            apptype,
+            CASE
+                WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                ELSE "其他"
+            END AS abcode,
+            mid,
+            vid,
+            is_share,
+            share_cnt,
+            is_return_1,
+            is_return_n,
+            is_return_noself,
+            return_1_uv,
+            return_n_uv,
+            return_n_uv_noself,
+            new_exposure_cnt,
+            CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred,
+            -- ros_pred = 1.22 * NorXGBScore ^ 1.15
+            1.22 * POW(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred,
+            CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+    FROM    rec_samples
+),
+bucketed AS
+(
+    -- Keep rows that have both predictions, then split each group into ten
+    -- equal-count buckets ordered by ros_pred.
+    SELECT  *,
+            NTILE(10) OVER (PARTITION BY dt, apptype, abcode ORDER BY ros_pred) AS bucket
+    FROM    scored
+    WHERE   str_pred IS NOT NULL
+    AND     ros_pred IS NOT NULL
+)
+SELECT  dt,
+        apptype,
+        abcode,
+        -- bucket is NULL only on the (dt, apptype, abcode) grouping set, i.e. the summary row.
+        COALESCE(CAST(bucket AS STRING), '全部') AS bucket,
+        -- Sample volume
+        COUNT(1) AS sample_cnt,
+        SUM(is_return_noself) AS return_cnt,
+        ROUND(SUM(is_return_noself) / COUNT(1), 6) AS return_rate,
+        -- str: observed vs predicted
+        ROUND(SUM(is_return_noself) / COUNT(1), 6) AS str_real,
+        ROUND(AVG(str_pred), 6) AS str_pred_avg,
+        ROUND(MIN(str_pred), 6) AS str_pred_min,
+        ROUND(MAX(str_pred), 6) AS str_pred_max,
+        ROUND(AVG(str_pred) - SUM(is_return_noself) / COUNT(1), 6) AS str_bias,
+        -- ros over all samples of the group
+        ROUND(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_real,
+        ROUND(AVG(ros_pred), 4) AS ros_pred_avg,
+        ROUND(MIN(ros_pred), 4) AS ros_pred_min,
+        ROUND(MAX(ros_pred), 4) AS ros_pred_max,
+        ROUND(AVG(ros_stat), 4) AS ros_stat_avg,
+        -- ros bias over all samples (positive = over-estimate, negative = under-estimate)
+        ROUND(AVG(ros_pred) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_pred_bias,
+        ROUND(AVG(ros_stat) - SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS ros_stat_bias,
+        -- ros restricted to samples with is_return_noself = 1
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END), 4) AS ros_real_return,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ros_pred END), 4) AS ros_pred_return,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ros_stat END), 4) AS ros_stat_return,
+        -- Bias restricted to samples with is_return_noself = 1
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ros_pred - return_n_uv_noself END), 4) AS ros_pred_bias_return,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ros_stat - return_n_uv_noself END), 4) AS ros_stat_bias_return,
+        -- MAE restricted to samples with is_return_noself = 1
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ABS(ros_pred - return_n_uv_noself) END), 4) AS ros_pred_mae_return,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 THEN ABS(ros_stat - return_n_uv_noself) END), 4) AS ros_stat_mae_return,
+        -- Over-estimate and under-estimate errors reported separately (is_return_noself = 1 only)
+        ROUND(AVG(CASE WHEN is_return_noself = 1 AND ros_pred > return_n_uv_noself THEN ros_pred - return_n_uv_noself END), 4) AS ros_pred_mae_over,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 AND ros_pred < return_n_uv_noself THEN return_n_uv_noself - ros_pred END), 4) AS ros_pred_mae_under,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 AND ros_stat > return_n_uv_noself THEN ros_stat - return_n_uv_noself END), 4) AS ros_stat_mae_over,
+        ROUND(AVG(CASE WHEN is_return_noself = 1 AND ros_stat < return_n_uv_noself THEN return_n_uv_noself - ros_stat END), 4) AS ros_stat_mae_under,
+        -- Error on samples with is_return_noself = 0 (true value taken as 0)
+        ROUND(AVG(CASE WHEN is_return_noself = 0 THEN ABS(ros_pred) END), 4) AS ros_pred_mae_noreturn,
+        ROUND(AVG(CASE WHEN is_return_noself = 0 THEN ABS(ros_stat) END), 4) AS ros_stat_mae_noreturn,
+        -- rovn = str * ros (fission rate)
+        ROUND(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real,
+        ROUND(AVG(str_pred * ros_pred), 6) AS rovn_pred,
+        ROUND(AVG(str_pred * ros_stat), 6) AS rovn_stat,
+        -- Business metrics
+        ROUND(COUNT(1) / COUNT(DISTINCT mid), 2) AS exp_per_dau,
+        ROUND(SUM(is_share) / COUNT(1), 6) AS str_one,
+        ROUND(SUM(return_n_uv) / NULLIF(SUM(is_share), 0), 4) AS ros_one,
+        ROUND(SUM(share_cnt) / COUNT(1), 6) AS str,
+        ROUND(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0), 4) AS ros,
+        ROUND(SUM(is_return_1) / COUNT(1), 6) AS str_plus,
+        ROUND(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0), 4) AS ros_minus,
+        ROUND(SUM(return_n_uv) / COUNT(1), 6) AS rovn_one,
+        ROUND(SUM(new_exposure_cnt) / COUNT(1), 6) AS vovh24,
+        COUNT(DISTINCT mid) AS dau,
+        COUNT(1) AS exp
+FROM    bucketed
+GROUP BY dt, apptype, abcode, bucket
+GROUPING SETS ((dt, apptype, abcode, bucket), (dt, apptype, abcode))
+-- NOTE(review): the output bucket is a STRING; if ORDER BY resolves to that alias the
+-- buckets sort lexically ('10' before '2') - confirm against the engine.
+ORDER BY dt DESC, apptype, abcode, bucket
+;

+ 116 - 0
tasks/承接/线上实验/07_预测值覆盖率分析.sql

@@ -0,0 +1,116 @@
+-- Splits samples by whether they carry prediction values and reports, per
+-- (dt, apptype, abcode, has_pred) group, the coverage share and the group's metrics.
+WITH t_raw AS
+(
+    -- Samples of the requested day that carry a scoresMap, tagged with a page type.
+    SELECT  *
+            -- Remove every "\\" literal from the extracted scoresMap before re-parsing it.
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     abcode NOT IN ("ab100")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+,t_filtered AS
+(
+    -- Recommendation pages only.
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+,t_base AS
+(
+    -- Relabel abcode into experiment groups, extract the scores and flag coverage.
+    SELECT  dt
+            ,apptype
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- ros_pred = 1.22 * NorXGBScore ^ 1.15
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS ros_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS ros_stat
+            -- A sample "has predictions" only when both fmRov and NorXGBScore are present.
+            ,CASE WHEN GET_JSON_OBJECT(scoresmap,'$.fmRov') IS NOT NULL
+                   AND GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+                  THEN '有预测值'
+                  ELSE '无预测值'
+            END AS has_pred
+    FROM    t_filtered
+)
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,has_pred
+        -- Sample volume and its share inside the (dt, apptype, abcode) group
+        ,COUNT(1) AS sample_cnt
+        ,round(COUNT(1) / SUM(COUNT(1)) OVER (PARTITION BY dt, apptype, abcode), 4) AS sample_ratio
+        ,SUM(is_return_noself) AS return_sample_cnt
+        ,round(SUM(is_return_noself) / COUNT(1), 6) AS return_sample_ratio
+        -- Metrics over all samples (missing values are reported as 0)
+        ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+        ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+        ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS ros_real
+        ,round(COALESCE(SUM(ros_pred) / COUNT(1),0),6) AS ros_pred
+        ,round(COALESCE(SUM(ros_stat) / COUNT(1),0),6) AS ros_stat
+        -- Means restricted to samples with is_return_noself = 1
+        ,round(AVG(IF(is_return_noself=1, return_n_uv_noself, NULL)),6) AS ros_real_return
+        ,round(AVG(IF(is_return_noself=1, ros_pred, NULL)),6) AS ros_pred_return
+        ,round(AVG(IF(is_return_noself=1, ros_stat, NULL)),6) AS ros_stat_return
+        -- Bias restricted to is_return_noself = 1 (positive = over-estimate, negative = under-estimate)
+        ,round(AVG(IF(is_return_noself=1, ros_pred - return_n_uv_noself, NULL)),6) AS ros_pred_bias
+        ,round(AVG(IF(is_return_noself=1, ros_stat - return_n_uv_noself, NULL)),6) AS ros_stat_bias
+        -- Absolute error restricted to is_return_noself = 1
+        ,round(AVG(IF(is_return_noself=1, ABS(ros_pred - return_n_uv_noself), NULL)),6) AS ros_pred_mae_return
+        ,round(AVG(IF(is_return_noself=1, ABS(ros_stat - return_n_uv_noself), NULL)),6) AS ros_stat_mae_return
+        -- Over-estimate and under-estimate errors reported separately (is_return_noself = 1 only)
+        ,round(AVG(IF(is_return_noself=1 AND ros_pred > return_n_uv_noself, ros_pred - return_n_uv_noself, NULL)),6) AS ros_pred_mae_over
+        ,round(AVG(IF(is_return_noself=1 AND ros_pred < return_n_uv_noself, return_n_uv_noself - ros_pred, NULL)),6) AS ros_pred_mae_under
+        ,round(AVG(IF(is_return_noself=1 AND ros_stat > return_n_uv_noself, ros_stat - return_n_uv_noself, NULL)),6) AS ros_stat_mae_over
+        ,round(AVG(IF(is_return_noself=1 AND ros_stat < return_n_uv_noself, return_n_uv_noself - ros_stat, NULL)),6) AS ros_stat_mae_under
+        -- Error on samples with is_return_noself = 0 (true value taken as 0)
+        ,round(AVG(IF(is_return_noself=0, ABS(ros_pred), NULL)),6) AS ros_pred_mae_noreturn
+        ,round(AVG(IF(is_return_noself=0, ABS(ros_stat), NULL)),6) AS ros_stat_mae_noreturn
+        -- Business metrics; ratio denominators are guarded with NULLIF so an empty
+        -- denominator yields NULL (then 0 via COALESCE) instead of a division by zero.
+        ,round(COALESCE(COUNT(1) / COUNT(DISTINCT mid),0),2) AS exp_per_dau
+        ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_share), 0),0),6) AS ros_one
+        ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(share_cnt), 0),0),6) AS ros
+        ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+        ,round(COALESCE(SUM(return_n_uv) / NULLIF(SUM(is_return_1), 0),0),6) AS ros_minus
+        ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+        ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(1) AS exp
+        -- Raw totals behind the ratios above
+        ,COALESCE(SUM(is_share),0) AS is_share
+        ,COALESCE(SUM(share_cnt),0) AS share_cnt
+        ,COALESCE(SUM(is_return_1),0) AS is_return_1
+        ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+        ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+        ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+FROM    t_base
+GROUP BY dt
+         ,apptype
+         ,abcode
+         ,has_pred
+ORDER BY dt DESC,apptype,abcode,has_pred DESC
+;

+ 17 - 0
tmp/低vov高曝光分析/step1_验证现象.sql

@@ -0,0 +1,17 @@
+-- Step 1: verify whether the "low vov, high exposure" pattern exists.
+-- Lists the head videos (曝光rank < 50) together with their same-day funnel ratios.

+SELECT
+    dt,
+    视频id AS vid,
+    曝光rank,
+    当日分发曝光pv AS exp,
+    -- Every ratio guards its denominator with NULLIF, so a zero base yields NULL.
+    ROUND(当日分发拉回曝光pv / NULLIF(当日分发曝光pv, 0), 4) AS vov0,
+    ROUND(当日分发分享pv / NULLIF(当日分发曝光pv, 0), 4) AS str_t0,
+    ROUND(当日分发回流uv / NULLIF(当日分发分享pv, 0), 4) AS ros_t0,
+    ROUND(当日分发拉回曝光pv / NULLIF(当日分发回流uv, 0), 4) AS vor_t0,
+    标题
+FROM
+    loghubods.video_dimension_detail_add_column
+WHERE
+    曝光rank < 50
+    AND dt BETWEEN '${start}' AND '${end}'
+ORDER BY
+    dt DESC,
+    曝光rank ASC
+;

+ 20 - 0
tmp/低vov高曝光分析/step2_影响面.sql

@@ -0,0 +1,20 @@
+-- Step 2: quantify the impact.
+-- Same head-video listing over a wider date range, with extra columns, so the number
+-- of affected videos can be counted downstream.

+SELECT
+    dt,
+    视频id AS vid,
+    曝光rank,
+    当日分发曝光pv AS exp,
+    当日分发拉回曝光pv AS 拉回曝光,
+    -- Every ratio guards its denominator with NULLIF, so a zero base yields NULL.
+    ROUND(当日分发拉回曝光pv / NULLIF(当日分发曝光pv, 0), 4) AS vov0,
+    ROUND(当日分发分享pv / NULLIF(当日分发曝光pv, 0), 4) AS str_t0,
+    ROUND(当日分发回流uv / NULLIF(当日分发分享pv, 0), 4) AS ros_t0,
+    ROUND(当日分发拉回曝光pv / NULLIF(当日分发回流uv, 0), 4) AS vor_t0,
+    推荐天数间隔,
+    标题,
+    merge二级品类 AS category
+FROM
+    loghubods.video_dimension_detail_add_column
+WHERE
+    曝光rank < 50
+    AND dt BETWEEN '${start}' AND '${end}'
+ORDER BY
+    dt DESC,
+    曝光rank ASC
+;

+ 54 - 0
tmp/低vov高曝光分析/step3_原因分析.sql

@@ -0,0 +1,54 @@
+-- Step 3: root-cause analysis.
+-- Compares the model estimates (str / ros / vor) with the observed values for a fixed
+-- list of problem videos to locate where the bias comes from.
+-- Parameterised by ${dt}; the header of the original notes it is fetched per day by fetch_daily.py.

+WITH raw_samples AS
+(
+    -- Recommendation-page samples of the selected videos that carry a scoresMap.
+    SELECT  dt,
+            vid,
+            is_return_noself,
+            return_n_uv_noself,
+            new_exposure_cnt,
+            score,
+            -- Remove every "\\" literal from the extracted scoresMap before re-parsing it.
+            REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype = '36'
+    -- The problem videos under investigation
+    AND     vid IN ('62421458','63497994','63444428','62955809','55931081','63535473','63214561','63378562','60237250','62875331')
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+),
+tab_base AS
+(
+    -- Keep samples where all three model scores are present and cast them to DOUBLE.
+    SELECT  dt,
+            vid,
+            is_return_noself,
+            return_n_uv_noself,
+            new_exposure_cnt,
+            CAST(score AS DOUBLE) AS score,
+            CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS fmRov,
+            CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS NorXGBScore,
+            CAST(GET_JSON_OBJECT(scoresmap,'$.vor') AS DOUBLE) AS vor
+    FROM    raw_samples
+    WHERE   GET_JSON_OBJECT(scoresmap,'$.fmRov') IS NOT NULL
+    AND     GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+    AND     GET_JSON_OBJECT(scoresmap,'$.vor') IS NOT NULL
+)
+SELECT  vid,
+        dt,
+        COUNT(1) AS sample_cnt,
+        -- Estimates
+        ROUND(AVG(score), 6) AS avg_score,
+        ROUND(AVG(fmRov), 6) AS pred_str,
+        ROUND(AVG(1.22 * POW(NorXGBScore, 1.15)), 6) AS pred_ros,
+        ROUND(AVG(vor), 6) AS pred_vor,
+        -- Observed values
+        ROUND(SUM(is_return_noself) / COUNT(1), 6) AS real_str,
+        ROUND(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 6) AS real_ros,
+        ROUND(SUM(new_exposure_cnt) / NULLIF(SUM(return_n_uv_noself), 0), 6) AS real_vor,
+        ROUND(SUM(new_exposure_cnt) / COUNT(1), 6) AS real_vov
+FROM    tab_base
+GROUP BY vid, dt
+ORDER BY vid, dt
+;

+ 51 - 0
tmp/低vov高曝光分析/step3b_整体偏差.sql

@@ -0,0 +1,51 @@
+-- Step 3b: overall estimation bias (no vid restriction).
+-- Per (vid, dt) comparison of estimated vs observed metrics, limited to groups with
+-- at least 100 samples.

+WITH raw_samples AS
+(
+    -- Recommendation-page samples of the requested day that carry a scoresMap.
+    SELECT  dt,
+            vid,
+            is_return_noself,
+            return_n_uv_noself,
+            new_exposure_cnt,
+            score,
+            -- Remove every "\\" literal from the extracted scoresMap before re-parsing it.
+            REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype = '36'
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+),
+tab_base AS
+(
+    -- Keep samples where all three model scores are present and cast them to DOUBLE.
+    SELECT  dt,
+            vid,
+            is_return_noself,
+            return_n_uv_noself,
+            new_exposure_cnt,
+            CAST(score AS DOUBLE) AS score,
+            CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS fmRov,
+            CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS NorXGBScore,
+            CAST(GET_JSON_OBJECT(scoresmap,'$.vor') AS DOUBLE) AS vor
+    FROM    raw_samples
+    WHERE   GET_JSON_OBJECT(scoresmap,'$.fmRov') IS NOT NULL
+    AND     GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+    AND     GET_JSON_OBJECT(scoresmap,'$.vor') IS NOT NULL
+)
+SELECT  vid,
+        dt,
+        COUNT(1) AS sample_cnt,
+        -- Estimates
+        ROUND(AVG(fmRov), 6) AS pred_str,
+        ROUND(AVG(1.22 * POW(NorXGBScore, 1.15)), 6) AS pred_ros,
+        ROUND(AVG(vor), 6) AS pred_vor,
+        -- Observed values
+        ROUND(SUM(is_return_noself) / COUNT(1), 6) AS real_str,
+        ROUND(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 6) AS real_ros,
+        ROUND(SUM(new_exposure_cnt) / NULLIF(SUM(return_n_uv_noself), 0), 6) AS real_vor,
+        ROUND(SUM(new_exposure_cnt) / COUNT(1), 6) AS real_vov
+FROM    tab_base
+GROUP BY vid, dt
+-- Drop small groups: fewer than 100 samples is treated as too noisy.
+HAVING  COUNT(1) >= 100
+ORDER BY sample_cnt DESC
+;

+ 39 - 0
tmp/低vov高曝光分析/step5_时间趋势.sql

@@ -0,0 +1,39 @@
+-- Step 5: find out when the problem started.
+-- Run per day (${dt}) over a longer period to follow the trend of the ROS bias.

+WITH tab_base AS
+(
+    -- Samples with a NorXGBScore, cast to DOUBLE.
+    SELECT  dt
+            ,vid
+            ,is_return_noself
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS NorXGBScore
+    FROM    (
+                -- Recommendation-page samples of the requested day that carry a scoresMap.
+                SELECT  dt
+                        ,vid
+                        ,is_return_noself
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        -- Remove every "\\" literal from the extracted scoresMap before re-parsing it.
+                        ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+                FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+                WHERE   dt = '${dt}'
+                AND     apptype = '36'
+                AND     extend_alg IS NOT NULL
+                AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+                AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+            )
+    WHERE   GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+)
+SELECT  dt
+        ,COUNT(DISTINCT vid) AS vid_cnt
+        ,COUNT(1) AS sample_cnt
+        -- Estimated vs observed ros
+        ,ROUND(AVG(1.22 * POW(NorXGBScore, 1.15)), 4) AS pred_ros
+        ,ROUND(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS real_ros
+        -- COPC = observed ros / estimated ros; the NULLIF guard returns NULL instead of
+        -- dividing by zero when the average estimate is 0.
+        ,ROUND(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0) / NULLIF(AVG(1.22 * POW(NorXGBScore, 1.15)), 0), 4) AS ros_copc
+FROM    tab_base
+GROUP BY dt
+;

+ 15 - 0
tmp/低vov高曝光分析/step7_check_exp.sql

@@ -0,0 +1,15 @@
+-- Inspect the distribution of new_exposure_cnt in the sample table for one day.
+SELECT
+    dt,
+    COUNT(1) AS total_cnt,
+    -- Number of samples above each threshold
+    SUM(IF(new_exposure_cnt > 10000, 1, 0)) AS exp_gt_10k,
+    SUM(IF(new_exposure_cnt > 1000, 1, 0)) AS exp_gt_1k,
+    SUM(IF(new_exposure_cnt > 100, 1, 0)) AS exp_gt_100,
+    ROUND(AVG(new_exposure_cnt), 0) AS avg_exp,
+    -- 90th and 99th percentile of the value cast to BIGINT
+    PERCENTILE(CAST(new_exposure_cnt AS BIGINT), 0.9) AS p90_exp,
+    PERCENTILE(CAST(new_exposure_cnt AS BIGINT), 0.99) AS p99_exp
+FROM
+    loghubods.dwd_recsys_alg_sample_all_20250212
+WHERE
+    dt = '${dt}'
+    AND apptype = '36'
+    AND extend_alg IS NOT NULL
+GROUP BY
+    dt
+;

+ 25 - 0
tmp/低vov高曝光分析/step7_头部vov趋势.sql

@@ -0,0 +1,25 @@
+-- Step 7: daily vov trend of the head videos (曝光rank <= 10).
+SELECT  dt
+        ,COUNT(1) AS top10_cnt
+        ,ROUND(AVG(vov0), 4) AS vov_mean
+        -- PERCENTILE is fed a BIGINT, so vov0 is scaled by 10000 first. The product is
+        -- rounded before the cast because CAST truncates: a value such as
+        -- 28.999999999999996 would otherwise become 28 and shift the median by 0.0001.
+        ,ROUND(PERCENTILE(CAST(ROUND(vov0*10000) AS BIGINT), 0.5)/10000, 4) AS vov_median
+        ,ROUND(AVG(str_t0), 4) AS str_mean
+        ,ROUND(AVG(ros_t0), 2) AS ros_mean
+        ,ROUND(AVG(vor_t0), 2) AS vor_mean
+        -- "Low vov, high exposure" rows (vov0 < 0.35 and 曝光rank <= 5): count and
+        -- percentage of all rows in the day's top-10 set
+        ,SUM(CASE WHEN vov0 < 0.35 AND 曝光rank <= 5 THEN 1 ELSE 0 END) AS problem_cnt
+        ,ROUND(SUM(CASE WHEN vov0 < 0.35 AND 曝光rank <= 5 THEN 1 ELSE 0 END) / COUNT(1) * 100, 1) AS problem_pct
+FROM    (
+            -- Per-video same-day ratios, already rounded to 4 decimals; zero
+            -- denominators yield NULL through NULLIF.
+            SELECT  dt
+                    ,视频id AS vid
+                    ,曝光rank
+                    ,ROUND(当日分发拉回曝光pv / NULLIF(当日分发曝光pv, 0), 4) AS vov0
+                    ,ROUND(当日分发分享pv / NULLIF(当日分发曝光pv, 0), 4) AS str_t0
+                    ,ROUND(当日分发回流uv / NULLIF(当日分发分享pv, 0), 4) AS ros_t0
+                    ,ROUND(当日分发拉回曝光pv / NULLIF(当日分发回流uv, 0), 4) AS vor_t0
+            FROM    loghubods.video_dimension_detail_add_column
+            WHERE   dt = '${dt}'
+            AND     曝光rank <= 10
+        )
+GROUP BY dt
+;

+ 41 - 0
tmp/低vov高曝光分析/step7_头部视频时间趋势.sql

@@ -0,0 +1,41 @@
+-- Step 7: daily ROS COPC trend restricted to samples with new_exposure_cnt > 10000.
+-- NOTE(review): other queries in this repository use SUM(new_exposure_cnt) / COUNT(1)
+-- as vov, so new_exposure_cnt may be a per-sample pulled-back exposure rather than a
+-- video-level exposure count - confirm that this filter selects "head videos".

+WITH tab_base AS
+(
+    -- Samples with a NorXGBScore (cast to DOUBLE) above the exposure threshold.
+    SELECT  dt
+            ,vid
+            ,is_return_noself
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS NorXGBScore
+    FROM    (
+                -- Recommendation-page samples of the requested day that carry a scoresMap.
+                SELECT  dt
+                        ,vid
+                        ,is_return_noself
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        -- Remove every "\\" literal from the extracted scoresMap before re-parsing it.
+                        ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+                FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+                WHERE   dt = '${dt}'
+                AND     apptype = '36'
+                AND     extend_alg IS NOT NULL
+                AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+                AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+            )
+    WHERE   GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+    AND     new_exposure_cnt > 10000  -- exposure threshold
+)
+SELECT  dt
+        ,COUNT(DISTINCT vid) AS vid_cnt
+        ,COUNT(1) AS sample_cnt
+        ,ROUND(AVG(new_exposure_cnt), 0) AS avg_exp
+        -- Estimated vs observed ros
+        ,ROUND(AVG(1.22 * POW(NorXGBScore, 1.15)), 4) AS pred_ros
+        ,ROUND(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0), 4) AS real_ros
+        -- COPC = observed ros / estimated ros; the NULLIF guard returns NULL instead of
+        -- dividing by zero when the average estimate is 0.
+        ,ROUND(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0) / NULLIF(AVG(1.22 * POW(NorXGBScore, 1.15)), 0), 4) AS ros_copc
+FROM    tab_base
+GROUP BY dt
+;

+ 18 - 0
tmp/低vov高曝光分析/step8_月度对比.sql

@@ -0,0 +1,18 @@
+-- Step 8: pull the detailed rows of the top-10 videos (曝光rank <= 10) for a chosen
+-- date range, including the raw counts behind each ratio, to analyse the cause.
+SELECT
+    dt,
+    视频id AS vid,
+    曝光rank,
+    当日分发曝光pv AS exp,
+    -- Every ratio guards its denominator with NULLIF, so a zero base yields NULL.
+    ROUND(当日分发拉回曝光pv / NULLIF(当日分发曝光pv, 0), 4) AS vov0,
+    ROUND(当日分发分享pv / NULLIF(当日分发曝光pv, 0), 4) AS str_t0,
+    ROUND(当日分发回流uv / NULLIF(当日分发分享pv, 0), 4) AS ros_t0,
+    ROUND(当日分发拉回曝光pv / NULLIF(当日分发回流uv, 0), 4) AS vor_t0,
+    -- Raw counts
+    当日分发分享pv AS share_pv,
+    当日分发回流uv AS return_uv,
+    当日分发拉回曝光pv AS return_exp,
+    标题
+FROM
+    loghubods.video_dimension_detail_add_column
+WHERE
+    曝光rank <= 10
+    AND dt BETWEEN '${start}' AND '${end}'
+ORDER BY
+    dt,
+    曝光rank
+;

+ 66 - 0
tmp/低vov高曝光分析/v2_step2_统一口径.sql

@@ -0,0 +1,66 @@
+-- V2 Step 2: analysis under one consistent metric definition.
+-- Estimates and observed values are both read from the same sample table so the two
+-- sides are comparable. Rows are aggregated per video and the relative bias of the
+-- estimates is reported.

+WITH sample_base AS (
+    -- Samples that carry both NorXGBScore and fmRov, cast to DOUBLE.
+    SELECT  vid
+            ,is_return_noself
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS NorXGBScore
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS fmRov
+    FROM    (
+                -- Recommendation-page samples of the requested day that carry a scoresMap.
+                SELECT  vid
+                        ,is_return_noself
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        -- Remove every "\\" literal from the extracted scoresMap before re-parsing it.
+                        ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+                FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+                WHERE   dt = '${dt}'
+                AND     apptype = '36'
+                AND     extend_alg IS NOT NULL
+                AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+                AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+            )
+    WHERE   GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+    AND     GET_JSON_OBJECT(scoresmap,'$.fmRov') IS NOT NULL
+)
+,vid_agg AS (
+    -- Aggregate to video level
+    SELECT  vid
+            ,COUNT(1) AS sample_cnt
+            -- NOTE(review): other queries in this repository use
+            -- SUM(new_exposure_cnt) / COUNT(1) as vov, so this sum looks like pulled-back
+            -- exposure rather than the video's own exposure count - confirm.
+            ,SUM(new_exposure_cnt) AS total_exp
+            -- Estimates (sample means)
+            ,AVG(1.22 * POW(NorXGBScore, 1.15)) AS pred_ros
+            ,AVG(fmRov) AS pred_str
+            -- Observed values
+            ,SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0) AS real_ros
+            -- One row is one sample, so the observed rate is per sample (COUNT(1)),
+            -- the same basis on which pred_str is averaged.
+            ,SUM(is_return_noself) / COUNT(1) AS real_str
+    FROM    sample_base
+    GROUP BY vid
+    HAVING  SUM(new_exposure_cnt) > 100  -- keep videos whose summed new_exposure_cnt exceeds 100
+)
+SELECT  vid
+        ,sample_cnt
+        ,total_exp
+        -- Estimates
+        ,ROUND(pred_ros, 4) AS pred_ros
+        ,ROUND(pred_str, 6) AS pred_str
+        -- Observed values
+        ,ROUND(real_ros, 4) AS real_ros
+        ,ROUND(real_str, 6) AS real_str
+        -- Relative bias in percent
+        ,ROUND((pred_ros - real_ros) / NULLIF(real_ros, 0) * 100, 2) AS ros_bias_pct
+        ,ROUND((pred_str - real_str) / NULLIF(real_str, 0) * 100, 2) AS str_bias_pct
+        -- Estimated and observed str * ros products (the original comment treats these
+        -- as vov under the assumption of an identical vor)
+        ,ROUND(pred_ros * pred_str, 6) AS pred_score
+        ,ROUND(real_ros * real_str, 6) AS real_score
+FROM    vid_agg
+WHERE   real_ros IS NOT NULL
+AND     real_str IS NOT NULL
+AND     real_ros > 0
+AND     real_str > 0
+ORDER BY total_exp DESC
+LIMIT   500
+;

+ 74 - 0
tmp/低vov高曝光分析/v3_扩展特征.sql

@@ -0,0 +1,74 @@
+-- V3: extended feature analysis.
+-- Adds the page dimension to the per-video comparison of estimated vs observed
+-- metrics to support deeper attribution.

+WITH sample_base AS (
+    -- Samples that carry both NorXGBScore and fmRov, cast to DOUBLE.
+    SELECT  vid
+            ,page
+            ,is_return_noself
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS NorXGBScore
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS fmRov
+    FROM    (
+                -- Recommendation-page samples of the requested day that carry a scoresMap.
+                SELECT  vid
+                        ,page
+                        ,is_return_noself
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        -- Remove every "\\" literal from the extracted scoresMap before re-parsing it.
+                        ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+                FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+                WHERE   dt = '${dt}'
+                AND     apptype = '36'
+                AND     extend_alg IS NOT NULL
+                AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+                AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+            )
+    WHERE   GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+    AND     GET_JSON_OBJECT(scoresmap,'$.fmRov') IS NOT NULL
+)
+,vid_page_agg AS (
+    -- Aggregate per video and page
+    SELECT  vid
+            ,page
+            ,COUNT(1) AS sample_cnt
+            -- NOTE(review): other queries in this repository use
+            -- SUM(new_exposure_cnt) / COUNT(1) as vov, so this sum looks like pulled-back
+            -- exposure rather than the video's own exposure count - confirm.
+            ,SUM(new_exposure_cnt) AS total_exp
+            -- Estimates (sample means)
+            ,AVG(1.22 * POW(NorXGBScore, 1.15)) AS pred_ros
+            ,AVG(fmRov) AS pred_str
+            -- Observed values
+            ,SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0) AS real_ros
+            -- One row is one sample, so the observed rate is per sample (COUNT(1)),
+            -- the same basis on which pred_str is averaged.
+            ,SUM(is_return_noself) / COUNT(1) AS real_str
+            -- Raw totals kept for downstream calculations
+            ,SUM(return_n_uv_noself) AS total_return_uv
+            ,SUM(is_return_noself) AS total_share_cnt
+    FROM    sample_base
+    GROUP BY vid, page
+    HAVING  SUM(new_exposure_cnt) > 50  -- keep groups whose summed new_exposure_cnt exceeds 50
+)
+SELECT  vid
+        ,page
+        ,sample_cnt
+        ,total_exp
+        ,total_share_cnt
+        ,total_return_uv
+        -- Estimates
+        ,ROUND(pred_ros, 4) AS pred_ros
+        ,ROUND(pred_str, 6) AS pred_str
+        -- Observed values
+        ,ROUND(real_ros, 4) AS real_ros
+        ,ROUND(real_str, 6) AS real_str
+        -- Relative bias in percent
+        ,ROUND((pred_ros - real_ros) / NULLIF(real_ros, 0) * 100, 2) AS ros_bias_pct
+        ,ROUND((pred_str - real_str) / NULLIF(real_str, 0) * 100, 2) AS str_bias_pct
+        -- Estimated and observed str * ros products
+        ,ROUND(pred_ros * pred_str, 6) AS pred_score
+        ,ROUND(real_ros * real_str, 6) AS real_score
+FROM    vid_page_agg
+WHERE   real_ros IS NOT NULL
+AND     real_str IS NOT NULL
+AND     real_ros > 0
+AND     real_str > 0
+ORDER BY total_exp DESC
+LIMIT   2000
+;

+ 360 - 0
tmp_sql/头部视频.sql

@@ -0,0 +1,360 @@
+-- Per-day, per-video metrics read from loghubods.video_dimension_detail_add_column
+-- for rows with 曝光rank < 40. Rows are grouped by dt, 视频id, 标题, merge二级品类
+-- (plus the literal 4..9 entries), sorted by dt descending and then by 分发曝光pv
+-- descending, and capped at 50000 rows.
+-- All ratios are plain divisions with no guard against a zero denominator.
+SELECT
+    -- Grouping keys.
+    dt,
+    视频id,
+    标题,
+    merge二级品类,
+    -- NOTE(review): 4..9 are integer literals that are also repeated in GROUP BY;
+    -- they look like unfilled dimension placeholders. Depending on engine settings,
+    -- integers in GROUP BY may be read as select-list positions. Confirm intent.
+    4, 5, 6, 7, 8, 9,
+    -- Same-day distribution totals and ratios.
+    SUM(当日分发曝光pv) AS 分发曝光pv,
+    SUM(当日分发拉回曝光pv) AS 分发拉回曝光pv,
+    SUM(当日分发回流uv) AS 分发回流_当日,
+    -- NOTE(review): the alias 总回流uv is used again further down for SUM(总回流uv).
+    SUM(累计分享回流uv) AS 总回流uv,
+    SUM(当日分发回流uv) / SUM(当日分发曝光pv) AS rov_t0,
+    SUM(当日分发回流uv) / SUM(当日分发分享pv) AS ros_t0,
+    SUM(当日分发拉回曝光pv) / SUM(当日分发曝光pv) AS vov0,
+    SUM(0_1日分发拉回曝光pv) / SUM(当日分发曝光pv) AS vov1,
+    SUM(当日分发拉回曝光pv) / SUM(当日分发回流uv) AS vor_t0,
+    SUM(当日分发分享pv) / SUM(当日分发曝光pv) AS str_t0,
+    AVG(视频时长) AS 视频时长,
+    -- 视频id is a grouping key, so each output row covers a single 视频id.
+    COUNT(DISTINCT 视频id) AS 分发视频量,
+    COUNT(DISTINCT IF(是否当日新推荐 > 0, 视频id, NULL)) AS 新推荐视频量,
+    SUM(1008回流人数) / SUM(总回流uv) AS 群聊占比,
+    SUM(头部分享pv) / SUM(总分享pv) AS 头部分享占比,
+    SUM(当日分发头部分享pv) / SUM(当日分发曝光pv) AS 头部str_t0,
+    SUM(当日分发头部分享pv) / SUM(当日分发头部分享pv + 当日分发分享pv) AS 当日分发头部分享占比,
+    SUM(推荐回流) / SUM(流量池曝光) AS 流量池曝光roi,
+    SUM(流量池曝光) AS 流量池分发曝光,
+    SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3) THEN 当日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3) THEN 当日分发曝光pv END) AS 新0_3VoV0,
+    -- Rank metrics.
+    AVG(曝光rank) - AVG(回流rank) AS rankdiff,
+    AVG(回流rank) AS 回流rank_avg,
+    AVG(曝光rank) AS 曝光rank_avg,
+    -- 流量池 / 推荐 funnel ratios.
+    SUM(流量池回流) / SUM(流量池曝光) AS 流量池rov,
+    SUM(流量池回流) / SUM(流量池分享) AS 流量池ros,
+    SUM(流量池分享) / SUM(流量池曝光) AS 流量池str,
+    SUM(推荐回流) / SUM(推荐曝光) AS 推荐rov,
+    SUM(推荐回流) / SUM(推荐分享) AS 推荐ros,
+    SUM(推荐分享) / SUM(推荐曝光) AS 推荐str,
+    (SUM(带来1007回流的分享数) + SUM(带来1008回流的分享数)) / SUM(总分享pv) AS 有效分享率,
+    -- Metrics restricted to rows whose 推荐天数间隔 falls in a given bucket.
+    -- Rows outside the bucket contribute NULL to the conditional sums.
+    -- Bucket: 推荐天数间隔 IN (0..7)
+    SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3, 4, 5, 6, 7) THEN 0_1日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3, 4, 5, 6, 7) THEN 当日分发曝光pv END) AS 新0_7VoV1,
+    SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3, 4, 5, 6, 7) THEN 当日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3, 4, 5, 6, 7) THEN 当日分发曝光pv END) AS 新0_7VoV0,
+    SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3, 4, 5, 6, 7) THEN 当日分发曝光pv END)
+        / SUM(当日分发曝光pv) AS 新0_7曝光占比,
+    -- Bucket: 推荐天数间隔 IN (0..3)
+    SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3) THEN 0_1日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3) THEN 当日分发曝光pv END) AS 新0_3VoV1,
+    -- NOTE(review): same expression and alias as the earlier 新0_3VoV0 column;
+    -- kept so the output column list is unchanged.
+    SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3) THEN 当日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3) THEN 当日分发曝光pv END) AS 新0_3VoV0,
+    SUM(CASE WHEN 推荐天数间隔 IN (0, 1, 2, 3) THEN 当日分发曝光pv END)
+        / SUM(当日分发曝光pv) AS 新0_3曝光占比,
+    -- Bucket: 推荐天数间隔 IN (1..3)
+    SUM(CASE WHEN 推荐天数间隔 IN (1, 2, 3) THEN 0_1日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (1, 2, 3) THEN 当日分发曝光pv END) AS 新1_3VoV1,
+    SUM(CASE WHEN 推荐天数间隔 IN (1, 2, 3) THEN 当日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (1, 2, 3) THEN 当日分发曝光pv END) AS 新1_3VoV0,
+    -- Bucket: 推荐天数间隔 IN (0)
+    SUM(CASE WHEN 推荐天数间隔 IN (0) THEN 0_1日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (0) THEN 当日分发曝光pv END) AS 新0VoV1,
+    SUM(CASE WHEN 推荐天数间隔 IN (0) THEN 当日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (0) THEN 当日分发曝光pv END) AS 新0VoV0,
+    SUM(CASE WHEN 推荐天数间隔 IN (0) THEN 当日分发曝光pv END)
+        / SUM(当日分发曝光pv) AS 新0曝光占比,
+    -- Bucket: 推荐天数间隔 IN (1)
+    COUNT(DISTINCT IF(推荐天数间隔 = 1, 视频id, NULL)) AS 新1视频量,
+    SUM(CASE WHEN 推荐天数间隔 IN (1) THEN 0_1日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (1) THEN 当日分发曝光pv END) AS 新1VoV1,
+    SUM(CASE WHEN 推荐天数间隔 IN (1) THEN 当日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (1) THEN 当日分发曝光pv END) AS 新1VoV0,
+    SUM(CASE WHEN 推荐天数间隔 IN (1) THEN 当日分发曝光pv END)
+        / SUM(当日分发曝光pv) AS 新1曝光占比,
+    -- Bucket: 推荐天数间隔 IN (2)
+    COUNT(DISTINCT IF(推荐天数间隔 = 2, 视频id, NULL)) AS 新2视频量,
+    SUM(CASE WHEN 推荐天数间隔 IN (2) THEN 0_1日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (2) THEN 当日分发曝光pv END) AS 新2VoV1,
+    SUM(CASE WHEN 推荐天数间隔 IN (2) THEN 当日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (2) THEN 当日分发曝光pv END) AS 新2VoV0,
+    SUM(CASE WHEN 推荐天数间隔 IN (2) THEN 当日分发曝光pv END)
+        / SUM(当日分发曝光pv) AS 新2曝光占比,
+    -- Bucket: 推荐天数间隔 IN (3)
+    COUNT(DISTINCT IF(推荐天数间隔 = 3, 视频id, NULL)) AS 新3视频量,
+    SUM(CASE WHEN 推荐天数间隔 IN (3) THEN 0_1日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (3) THEN 当日分发曝光pv END) AS 新3VoV1,
+    SUM(CASE WHEN 推荐天数间隔 IN (3) THEN 当日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 IN (3) THEN 当日分发曝光pv END) AS 新3VoV0,
+    SUM(CASE WHEN 推荐天数间隔 IN (3) THEN 当日分发曝光pv END)
+        / SUM(当日分发曝光pv) AS 新3曝光占比,
+    -- Bucket: 推荐天数间隔 NOT IN (0..7)
+    SUM(CASE WHEN 推荐天数间隔 NOT IN (0, 1, 2, 3, 4, 5, 6, 7) THEN 0_1日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 NOT IN (0, 1, 2, 3, 4, 5, 6, 7) THEN 当日分发曝光pv END) AS 非0_7_VoV1,
+    SUM(CASE WHEN 推荐天数间隔 NOT IN (0, 1, 2, 3, 4, 5, 6, 7) THEN 当日分发拉回曝光pv END)
+        / SUM(CASE WHEN 推荐天数间隔 NOT IN (0, 1, 2, 3, 4, 5, 6, 7) THEN 当日分发曝光pv END) AS 非0_7_VoV0,
+    -- Longer-window pull-back exposure ratios over same-day exposure.
+    SUM(0_2日分发拉回曝光pv) / SUM(当日分发曝光pv) AS vov2,
+    SUM(0_7日分发拉回曝光pv) / SUM(当日分发曝光pv) AS vov7,
+    SUM(0_30日分发拉回曝光pv) / SUM(当日分发曝光pv) AS vov30,
+    (SUM(0_1日分发拉回曝光pv) / SUM(当日分发曝光pv))
+        - (SUM(当日分发拉回曝光pv) / SUM(当日分发曝光pv)) AS vov1减vov0,
+    SUM(当日分发回流uv) AS 分发回流uv,
+    -- NOTE(review): the alias 分发分享pv is used again further down for SUM(分发分享pv).
+    SUM(当日分发分享pv) AS 分发分享pv,
+    -- Ratios split by the 1007 / 1008 columns.
+    SUM(1008回流人数) / SUM(带来1008回流的分享数) AS 群聊ros,
+    SUM(1007回流人数) / SUM(带来1007回流的分享数) AS 单聊ros,
+    SUM(1007进入分发曝光pv) / SUM(1007回流人数) AS 单聊vor,
+    SUM(1008进入分发曝光pv) / SUM(1008回流人数) AS 群聊vor,
+    (SUM(1007回流再分享pv) + SUM(1008回流再分享pv))
+        / (SUM(1007进入分发曝光pv) + SUM(1008进入分发曝光pv)) AS 回流后str,
+    SUM(1008回流再分享pv) / SUM(1008进入分发曝光pv) AS 群聊后str,
+    SUM(1007回流再分享pv) / SUM(1007进入分发曝光pv) AS 单聊后str,
+    SUM(总回流uv) / SUM(累计分享回流uv) AS 当日分享回流占比,
+    SUM(当日分享当日回流首层uv) / SUM(当日分享当日回流uv) AS 当日分享当日回流首层比当日分享当日回流,
+    -- Video counts by flag, and average age columns.
+    COUNT(DISTINCT IF(是否七日内创建 > 0, 视频id, NULL)) AS 七日内新视频量,
+    COUNT(DISTINCT IF(是否首发视频 > 0, 视频id, NULL)) AS 首发视频量,
+    COUNT(DISTINCT IF(是否首发视频 > 0, 视频id, NULL)) / COUNT(DISTINCT 视频id) AS 首发视频比例,
+    COUNT(DISTINCT 站内uid) AS 供给uid量,
+    AVG(首发距今时间) AS 首发距今间隔avg,
+    AVG(推荐天数间隔) AS 推荐距今间隔avg,
+    AVG(创建天数间隔) AS 创建距今间隔avg,
+    -- Raw windowed totals and the ratios built from them.
+    SUM(0_1日分发拉回曝光pv) AS 0_1日分发拉回曝光pv,
+    SUM(0_2日分发拉回曝光pv) AS 0_2日分发拉回曝光pv,
+    SUM(0_3日分发拉回曝光pv) AS 0_3日分发拉回曝光pv,
+    SUM(0_7日分发拉回曝光pv) AS 0_7日分发拉回曝光pv,
+    SUM(0_30日分发拉回曝光pv) AS 0_30日分发拉回曝光pv,
+    SUM(0_1日分发回流uv) / SUM(当日分发曝光pv) AS rov1,
+    SUM(0_7日分发回流uv) / SUM(当日分发曝光pv) AS rov7,
+    SUM(0_30日分发回流uv) / SUM(当日分发曝光pv) AS rov30,
+    SUM(0_1日分发拉回曝光pv) / SUM(0_1日分发回流uv) AS vor1,
+    SUM(0_7日分发拉回曝光pv) / SUM(0_7日分发回流uv) AS vor7,
+    SUM(0_30日分发拉回曝光pv) / SUM(0_30日分发回流uv) AS vor30,
+    -- Plain totals, each exposed under its source column name.
+    SUM(流量池曝光) AS 流量池曝光,
+    SUM(流量池播放) AS 流量池播放,
+    SUM(流量池分享) AS 流量池分享,
+    SUM(流量池回流) AS 流量池回流,
+    SUM(推荐曝光) AS 推荐曝光,
+    SUM(推荐播放) AS 推荐播放,
+    SUM(推荐分享) AS 推荐分享,
+    SUM(推荐回流) AS 推荐回流,
+    SUM(总分享pv) AS 总分享pv,
+    SUM(总回流uv) AS 总回流uv,
+    SUM(1007回流人数) AS 1007回流人数,
+    SUM(1008回流人数) AS 1008回流人数,
+    SUM(带来1007回流的分享数) AS 带来1007回流的分享数,
+    SUM(带来1008回流的分享数) AS 带来1008回流的分享数,
+    SUM(1007进入分发曝光pv) AS 1007进入分发曝光pv,
+    SUM(1008进入分发曝光pv) AS 1008进入分发曝光pv,
+    SUM(1007回流再分享pv) AS 1007回流再分享pv,
+    SUM(1008回流再分享pv) AS 1008回流再分享pv,
+    SUM(有回流分享pv) AS 有回流分享pv,
+    SUM(累计分享回流uv) AS 累计分享回流uv,
+    SUM(分发分享pv) AS 分发分享pv,
+    SUM(头部分享pv) AS 头部分享pv,
+    SUM(当日分发头部分享pv) AS 当日分发头部分享pv,
+    SUM(当日分享当日回流uv) AS 当日分享当日回流uv,
+    SUM(当日分享当日回流首层uv) AS 当日分享当日回流首层uv,
+    SUM(当日分享当日回流非首层uv) AS 当日分享当日回流非首层uv,
+    SUM(非当日分享回流uv) AS 非当日分享回流uv,
+    SUM(n当日分发回流uv) AS n当日分发回流uv,
+    SUM(非当日分发回流uv) AS 非当日分发回流uv,
+    -- Video counts by row-level same-day exposure threshold.
+    COUNT(DISTINCT IF(当日分发曝光pv >= 100, 视频id, NULL)) AS t0_100曝光视频量,
+    COUNT(DISTINCT IF(当日分发曝光pv >= 500, 视频id, NULL)) AS t0_500曝光视频量,
+    COUNT(DISTINCT IF(当日分发曝光pv >= 1000, 视频id, NULL)) AS t0_1k曝光视频量,
+    COUNT(DISTINCT IF(当日分发曝光pv >= 10000, 视频id, NULL)) AS t0_1w曝光视频量,
+    -- Video counts by row-level ratio thresholds, limited to rows with
+    -- 当日分发曝光pv >= 500, and their share of all rows meeting that exposure floor.
+    COUNT(DISTINCT IF((0_1日分发拉回曝光pv / 当日分发曝光pv) - (当日分发拉回曝光pv / 当日分发曝光pv) >= 0.2 AND 当日分发曝光pv >= 500, 视频id, NULL)) AS vov1_0_02_500视频量,
+    COUNT(DISTINCT IF((0_1日分发拉回曝光pv / 当日分发曝光pv) - (当日分发拉回曝光pv / 当日分发曝光pv) >= 0.2 AND 当日分发曝光pv >= 500, 视频id, NULL))
+        / COUNT(DISTINCT IF(当日分发曝光pv >= 500, 视频id, NULL)) AS vov1_0_02_500视频占比,
+    COUNT(DISTINCT IF((当日分发拉回曝光pv / 当日分发曝光pv) >= 0.4 AND 当日分发曝光pv >= 500, 视频id, NULL)) AS vov0_04_500视频量,
+    COUNT(DISTINCT IF((当日分发拉回曝光pv / 当日分发曝光pv) >= 0.4 AND 当日分发曝光pv >= 500, 视频id, NULL))
+        / COUNT(DISTINCT IF(当日分发曝光pv >= 500, 视频id, NULL)) AS vov0_04_500视频占比,
+    COUNT(DISTINCT IF((0_1日分发拉回曝光pv / 当日分发曝光pv) >= 0.7 AND 当日分发曝光pv >= 500, 视频id, NULL)) AS vov1_07_500视频量,
+    COUNT(DISTINCT IF((0_1日分发拉回曝光pv / 当日分发曝光pv) >= 0.7 AND 当日分发曝光pv >= 500, 视频id, NULL))
+        / COUNT(DISTINCT IF(当日分发曝光pv >= 500, 视频id, NULL)) AS vov1_07_500视频占比,
+    COUNT(DISTINCT IF((0_1日分发拉回曝光pv / 当日分发曝光pv) >= 0.8 AND 当日分发曝光pv >= 500, 视频id, NULL)) AS vov1_08_500视频量,
+    COUNT(DISTINCT IF((0_1日分发拉回曝光pv / 当日分发曝光pv) >= 0.8 AND 当日分发曝光pv >= 500, 视频id, NULL))
+        / COUNT(DISTINCT IF(当日分发曝光pv >= 500, 视频id, NULL)) AS vov1_08_500视频占比,
+    -- Video counts by row-level pull-back exposure threshold (same-day and 0_1日).
+    COUNT(DISTINCT IF(当日分发拉回曝光pv >= 500, 视频id, NULL)) AS t0_500拉回曝光视频量,
+    COUNT(DISTINCT IF(0_1日分发拉回曝光pv >= 500, 视频id, NULL)) AS t1_500拉回曝光视频量,
+    COUNT(DISTINCT IF(当日分发拉回曝光pv >= 10000, 视频id, NULL)) AS t0_1w拉回曝光视频量,
+    COUNT(DISTINCT IF(0_1日分发拉回曝光pv >= 10000, 视频id, NULL)) AS t1_1w拉回曝光视频量,
+    COUNT(DISTINCT IF(当日分发拉回曝光pv >= 100000, 视频id, NULL)) AS t0_10w拉回曝光视频量,
+    COUNT(DISTINCT IF(0_1日分发拉回曝光pv >= 100000, 视频id, NULL)) AS t1_10w拉回曝光视频量,
+    COUNT(DISTINCT IF(当日分发拉回曝光pv >= 1000000, 视频id, NULL)) AS t0_100w拉回曝光视频量,
+    COUNT(DISTINCT IF(0_1日分发拉回曝光pv >= 1000000, 视频id, NULL)) AS t1_100w拉回曝光视频量,
+    -- 流量池 share-quality ratios.
+    (SUM(带来流量池1007回流的分享数) + SUM(带来流量池1008回流的分享数)) / SUM(带来流量池回流的分享数) AS 流量池有效分享率,
+    SUM(流量池1008回流人数) / SUM(流量池回流人数) AS 流量池群聊占比
+FROM loghubods.video_dimension_detail_add_column
+-- NOTE(review): dt is compared against integer literals; this assumes dt holds
+-- yyyymmdd values comparable to integers. TODO: confirm the column type.
+WHERE dt >= 20251201
+  AND dt <= 20270101
+  AND 曝光rank < 40
+  -- The remaining predicates are always true; they look like unfilled filter
+  -- placeholders and are kept as in the original.
+  AND 0 = 0
+  AND 0 = 0
+  AND 0 = 0
+  AND (0 = 0 OR 0 = 0)
+GROUP BY dt, 视频id, 标题, merge二级品类, 4, 5, 6, 7, 8, 9
+ORDER BY dt DESC, 分发曝光pv DESC
+LIMIT 50000;