Jelajahi Sumber

feat: 新增 CSV 合并功能及 rosn 分析任务

- fetch_daily.py: 新增 --merge 参数,支持合并所有日期 CSV 为单文件
- 更新 opengid_base_data 表结构(新增 27 个字段)
- 新增 rosn 分析 SQL(实验组 x Top10/20 视频、一级品类)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 1 bulan lalu
induk
melakukan
88b872d918

+ 44 - 0
fetch_daily.py

@@ -45,6 +45,39 @@ def get_existing_dates(daily_dir):
     return existing
 
 
def merge_csv_files(daily_dir, output_file=None):
    """Merge every ``*.csv`` file in ``daily_dir`` into one CSV file.

    Files are processed in sorted filename order. The first line of the first
    non-empty file is written as the header; the first line of every later
    non-empty file is treated as a repeated header and dropped (it is not
    compared against the first header).

    Args:
        daily_dir: ``pathlib.Path`` of the directory holding the CSV files.
        output_file: Destination path. Defaults to
            ``<daily_dir.parent>/<daily_dir.name>_merged.csv``.

    Returns:
        ``output_file`` on success, or ``None`` when the directory contains no
        CSV files (nothing is written in that case).
    """
    csv_files = sorted(daily_dir.glob("*.csv"))
    if not csv_files:
        print("没有找到 CSV 文件")
        return None

    if output_file is None:
        output_file = daily_dir.parent / f"{daily_dir.name}_merged.csv"

    header_written = False
    total_rows = 0  # counts physical lines written after the header

    with open(output_file, "w", encoding="utf-8") as out:
        for csv_file in csv_files:
            # Stream each input instead of loading it whole; daily files can be large.
            with open(csv_file, "r", encoding="utf-8") as f:
                header = f.readline()
                if not header:
                    continue  # empty file: nothing to contribute

                if not header_written:
                    # A header-only file may lack its trailing newline.
                    out.write(header if header.endswith("\n") else header + "\n")
                    header_written = True

                for line in f:
                    # The last line of a file may lack a newline; terminate it
                    # so it is not fused with the next file's first row.
                    if not line.endswith("\n"):
                        line += "\n"
                    out.write(line)
                    total_rows += 1

    print(f"合并完成: {len(csv_files)} 个文件, {total_rows} 行数据")
    print(f"输出文件: {output_file}")
    return output_file
+
+
 def get_date_range(start_str, end_str):
     """生成日期范围列表"""
     start = datetime.strptime(start_str, "%Y%m%d")
@@ -106,6 +139,7 @@ def main():
     parser.add_argument("--force", action="store_true", help="强制重新获取")
     parser.add_argument("--workers", type=int, default=5, help="天级并发数 (默认5)")
     parser.add_argument("--parallel", type=int, default=50, help="单天多线程下载 (默认50, 大数据量推荐)")
+    parser.add_argument("--merge", action="store_true", help="合并所有日期数据到一个文件")
     args = parser.parse_args()
 
     # 解析 SQL 文件路径
@@ -122,6 +156,16 @@ def main():
     print(f"SQL文件: {sql_file}")
     print(f"数据目录: {daily_dir}")
 
+    # 仅合并模式:不获取数据,直接合并已有文件
+    if args.merge:
+        existing_dates = get_existing_dates(daily_dir)
+        print(f"已有数据: {len(existing_dates)}天")
+        if existing_dates:
+            merge_csv_files(daily_dir)
+        else:
+            print("没有可合并的数据")
+        return
+
     # 确定日期范围
     if args.date:
         target_dates = [args.date]

+ 27 - 1
tables/loghubods/opengid_base_data.txt

@@ -1,7 +1,7 @@
 表名: loghubods.opengid_base_data
 注释: (无)
 创建时间: 2025-10-28 16:34:36
-最后修改: 2026-01-05 13:38:09
+最后修改: 2026-01-21 11:00:23
 
 ============================================================
 字段名                            类型              注释
@@ -92,6 +92,32 @@ contenturl                     string
 t0裂变人数                         bigint          
 t0分发裂变人数                       bigint          
 t0头部裂变人数                       bigint          
+位置                             string          
+文章品类                           string          
+ghid                           string          
+人群包                            string          
+人群_创意名称                        string          
+长文封面                           string          
+画像_二级品类                        string          
+vlog产品层实验组                     string          
+票圈视频产品层实验组                     string          
+进入小程序场景                        string          
+点击小时                           string          
+24h一层回流                        bigint          
+rootsharemid                   string          
+24_fission_uv_root             bigint          
+t0_fission_uv_root             bigint          
+创意id                           string          
+创意名称                           string          
+素材标题                           string          
+素材封面                           string          
+账号id                           string          
+账号名称                           string          
+代理名称                           string          
+广告id                           string          
+广告名称                           string          
+包名                             string          
+real_user_share_depth          bigint          
 dt                             string          分区
 
 分区字段:

+ 165 - 0
tasks/承接/rosn分析/01_实验组xTop10一级品类sql

@@ -0,0 +1,165 @@
+-- Preprocessing: extract scoresMap from extend_alg (backslashes removed) and classify page into page_type
+-- v5: group by first-level category (merge_first_level_cate) + GROUPING SETS + exposure share
+WITH t_raw AS
+(
+    SELECT  *
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     abcode NOT IN ("ab100")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Filter: keep only rows whose page_type is "推荐" (recommendation pages)
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Feature extraction and dimension mapping: abcode is remapped to an experiment-group label (only apptype "4" gets a named group, everything else becomes "其他")
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            -- Category info read from v1_feature; falls back to 'unknown' when the extracted value is NULL
+            ,COALESCE(GET_JSON_OBJECT(v1_feature,'$.merge_first_level_cate'), 'unknown') AS cate1
+            ,COALESCE(GET_JSON_OBJECT(v1_feature,'$.merge_second_level_cate'), 'unknown') AS cate2
+    FROM    t_filtered
+)
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred IS NOT NULL
+)
+-- Rank first-level categories by exposure count within each (dt, apptype, abcode); t_top_cate below keeps ranks 1-10
+,t_cate_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,cate1
+            ,COUNT(1) AS cate_exp_cnt
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC) AS cate_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, cate1
+)
+,t_top_cate AS
+(
+    SELECT  dt, apptype, abcode, cate1, cate_rank
+    FROM    t_cate_rank
+    WHERE   cate_rank <= 10
+)
+-- Tag rows whose category is in the top 10: top_cate1 / top_cate_rank stay NULL for every other row
+,t_with_top AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.cate1 IS NOT NULL THEN a.cate1 ELSE NULL END AS top_cate1
+            ,b.cate_rank AS top_cate_rank
+    FROM    t_valid a
+    LEFT JOIN t_top_cate b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.cate1 = b.cate1
+)
+-- Aggregate first: one total row ('all') per (dt, apptype, abcode) plus one row per top category; HAVING drops the NULL (non-top) category group
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            ,COALESCE(top_cate1, 'all') AS cate1
+            ,CASE WHEN GROUPING(top_cate1) = 1 THEN NULL ELSE MAX(top_cate_rank) END AS cate_rank
+            -- COPC ratios: observed value divided by the corresponding predicted value (NULL when the prediction term is 0)
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions and observed values
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Error: mean absolute difference between each prediction and return_n_uv_noself
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- Business metrics
+            ,round(COALESCE(COUNT(1) / COUNT(DISTINCT mid),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_share),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / SUM(share_cnt),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_return_1),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top
+    GROUP BY dt, apptype, abcode, top_cate1
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top_cate1)
+    )
+    HAVING  top_cate1 IS NOT NULL OR GROUPING(top_cate1) = 1
+)
+-- Exposure share: each row's exp divided by the exp of the 'all' row in the same (dt, apptype, abcode)
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,cate1
+        ,cate_rank
+        ,round(exp * 1.0 / MAX(CASE WHEN cate1 = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+        ,str_copc, rosn_copc, rosn_stat_copc, rovn_copc, rovn_stat_copc
+        ,str_real, str_pred, rosn_real, rosn_pred, rosn_stat
+        ,rovn_real, rovn_pred, rovn_stat
+        ,rosn_pred_mae, rosn_stat_mae
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+FROM    t_agg
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 166 - 0
tasks/承接/rosn分析/01_实验组xTop10视频.sql

@@ -0,0 +1,166 @@
+-- Preprocessing: extract scoresMap from extend_alg (backslashes removed) and classify page into page_type
+-- v4: adds top-10 vid grouping + GROUPING SETS + exposure share
+WITH t_raw AS
+(
+    SELECT  *
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     abcode NOT IN ("ab100")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Filter: keep only rows whose page_type is "推荐" (recommendation pages)
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Feature extraction and dimension mapping: abcode is remapped to an experiment-group label (only apptype "4" gets a named group, everything else becomes "其他")
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            ,GET_JSON_OBJECT(v1_feature,'$.title') AS vid_title
+    FROM    t_filtered
+)
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred IS NOT NULL
+)
+-- Rank vids by exposure count within each (dt, apptype, abcode). NOTE: identifiers below say "top5", but the cutoff actually applied is vid_rank <= 10
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+,t_top5_vid AS
+(
+    SELECT  dt, apptype, abcode, vid, vid_rank
+    FROM    t_vid_rank
+    WHERE   vid_rank <= 10
+)
+-- Tag rows whose vid is in the selected top 10: the top5_* columns stay NULL for every other row
+,t_with_top5 AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid ELSE NULL END AS top5_vid
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid_title ELSE NULL END AS top5_vid_title
+            ,b.vid_rank AS top5_vid_rank
+    FROM    t_valid a
+    LEFT JOIN t_top5_vid b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.vid = b.vid
+)
+-- Aggregate first: one total row ('all') per (dt, apptype, abcode) plus one row per top vid; HAVING drops the NULL (non-top) vid group
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            ,COALESCE(CAST(top5_vid AS STRING), 'all') AS vid
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_rank) END AS vid_rank
+            -- COPC ratios: observed value divided by the corresponding predicted value (NULL when the prediction term is 0)
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions and observed values
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Error: mean absolute difference between each prediction and return_n_uv_noself
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- Business metrics
+            ,round(COALESCE(COUNT(1) / COUNT(DISTINCT mid),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_share),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / SUM(share_cnt),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_return_1),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top5
+    GROUP BY dt, apptype, abcode, top5_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top5_vid)
+    )
+    HAVING  top5_vid IS NOT NULL OR GROUPING(top5_vid) = 1
+)
+-- Exposure share: each row's exp divided by the exp of the 'all' row in the same (dt, apptype, abcode)
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,vid
+        ,vid_title
+        ,vid_rank
+        ,round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+        ,str_copc, rosn_copc, rosn_stat_copc, rovn_copc, rovn_stat_copc
+        ,str_real, str_pred, rosn_real, rosn_pred, rosn_stat
+        ,rovn_real, rovn_pred, rovn_stat
+        ,rosn_pred_mae, rosn_stat_mae
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+FROM    t_agg
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 166 - 0
tasks/承接/rosn分析/01_实验组xTop20视频.sql

@@ -0,0 +1,166 @@
+-- Preprocessing: extract scoresMap from extend_alg (backslashes removed) and classify page into page_type
+-- v4: adds top-N vid grouping (the cutoff applied in this query is vid_rank <= 20) + GROUPING SETS + exposure share
+WITH t_raw AS
+(
+    SELECT  *
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     abcode NOT IN ("ab100")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Filter: keep only rows whose page_type is "推荐" (recommendation pages)
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Feature extraction and dimension mapping: abcode is remapped to an experiment-group label (only apptype "4" gets a named group, everything else becomes "其他")
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            ,GET_JSON_OBJECT(v1_feature,'$.title') AS vid_title
+    FROM    t_filtered
+)
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred IS NOT NULL
+)
+-- Rank vids by exposure count within each (dt, apptype, abcode). NOTE: identifiers below say "top5", but the cutoff actually applied is vid_rank <= 20
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+,t_top5_vid AS
+(
+    SELECT  dt, apptype, abcode, vid, vid_rank
+    FROM    t_vid_rank
+    WHERE   vid_rank <= 20
+)
+-- Tag rows whose vid is in the selected top 20: the top5_* columns stay NULL for every other row
+,t_with_top5 AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid ELSE NULL END AS top5_vid
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid_title ELSE NULL END AS top5_vid_title
+            ,b.vid_rank AS top5_vid_rank
+    FROM    t_valid a
+    LEFT JOIN t_top5_vid b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.vid = b.vid
+)
+-- Aggregate first: one total row ('all') per (dt, apptype, abcode) plus one row per top vid; HAVING drops the NULL (non-top) vid group
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            ,COALESCE(CAST(top5_vid AS STRING), 'all') AS vid
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_rank) END AS vid_rank
+            -- COPC ratios: observed value divided by the corresponding predicted value (NULL when the prediction term is 0)
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions and observed values
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Error: mean absolute difference between each prediction and return_n_uv_noself
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- Business metrics
+            ,round(COALESCE(COUNT(1) / COUNT(DISTINCT mid),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_share),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / SUM(share_cnt),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_return_1),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top5
+    GROUP BY dt, apptype, abcode, top5_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top5_vid)
+    )
+    HAVING  top5_vid IS NOT NULL OR GROUPING(top5_vid) = 1
+)
+-- Exposure share: each row's exp divided by the exp of the 'all' row in the same (dt, apptype, abcode)
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,vid
+        ,vid_title
+        ,vid_rank
+        ,round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+        ,str_copc, rosn_copc, rosn_stat_copc, rovn_copc, rovn_stat_copc
+        ,str_real, str_pred, rosn_real, rosn_pred, rosn_stat
+        ,rovn_real, rovn_pred, rovn_stat
+        ,rosn_pred_mae, rosn_stat_mae
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+FROM    t_agg
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;