Przeglądaj źródła

feat: 新增尾号实验 v3/v4_v1 新版 SQL,fetch_daily 支持自定义列值排序,修复 base_v3_new_v2 实验映射重复行

- 新增 base_v3_new/v4_v1_new 系列 SQL 及飞书配置 JSON
- fetch_daily 新增 order 参数支持自定义列值顺序排序
- 修复 base_v3_new_v2.sql 中 ef 桶 DNN模型 映射重复导致 exp_per_dau 翻倍
- 新增 ECS/ARP 指标计算公式及说明文档

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
yangxiaohui 4 tygodni temu
rodzic
commit
fa5d821f49

+ 39 - 7
fetch_daily.py

@@ -164,6 +164,7 @@ def load_feishu_config(sql_file):
         "filter": None,
         "limit": None,
         "append_cols": False,
+        "order": None,  # 自定义列值顺序,如 {"group": ["5d", "01", "34"]}
     }
 
     root_dir = Path(__file__).parent
@@ -186,6 +187,29 @@ def load_feishu_config(sql_file):
     return defaults
 
 
def make_custom_order_key(idx, custom_order):
    """Build a sort key for whitelist-driven custom value ordering.

    Values listed in *custom_order* sort in exactly that order; any value
    not on the list sinks to the end, and unknown values stay in a stable
    lexicographic order relative to each other.

    Args:
        idx: index of the target column within the header row.
        custom_order: desired value order, e.g. ["5d", "01", "34"].

    Returns:
        A row -> sortable key function for use with list.sort(key=...).
    """
    rank_of = {}
    for position, value in enumerate(custom_order):
        rank_of[str(value)] = position
    unknown_rank = len(custom_order)

    def _key(row):
        # Rows shorter than the header are treated as having an empty cell.
        if idx < len(row):
            cell = row[idx]
        else:
            cell = ""
        # Primary: whitelist rank (unknowns share the sentinel rank past the
        # end); secondary: the raw value, which orders unknowns lexically.
        return (rank_of.get(cell, unknown_rank), cell)

    return _key
+
+
 def parse_sort_spec(sort_spec):
     """解析排序规格,如 'dt:desc,name:asc' -> [('dt', True), ('name', False)]"""
     if not sort_spec:
@@ -262,7 +286,7 @@ def column_index_to_letter(col_idx):
     return result
 
 
-def upload_to_feishu(csv_file, sheet_token, sheet_id=None, sort_spec="dt:desc", cols_spec=None, filter_spec=None, limit=None, append_cols=False):
+def upload_to_feishu(csv_file, sheet_token, sheet_id=None, sort_spec="dt:desc", cols_spec=None, filter_spec=None, limit=None, append_cols=False, order_spec=None):
     """上传 CSV 文件到飞书表格(通过模板行继承样式)
 
     第1行: 表头
@@ -278,6 +302,7 @@ def upload_to_feishu(csv_file, sheet_token, sheet_id=None, sort_spec="dt:desc",
         filter_spec: 过滤条件,dict {"字段": "值"} 或 str "字段=值,字段=值"
         limit: 上传行数上限
         append_cols: 是否将飞书中没有的新列追加到右侧(默认 False 忽略)
+        order_spec: 自定义列值顺序,dict {字段: [值1, 值2, ...]}
     """
     from feishu import Client, LARK_HOST, APP_ID, APP_SECRET, request
 
@@ -300,8 +325,14 @@ def upload_to_feishu(csv_file, sheet_token, sheet_id=None, sort_spec="dt:desc",
         for field, desc in reversed(sort_fields):
             if field in header:
                 idx = header.index(field)
-                data_rows.sort(key=lambda row: row[idx] if idx < len(row) else "", reverse=desc)
-                applied.append(f"{field}:{'desc' if desc else 'asc'}")
+                if order_spec and field in order_spec:
+                    # 自定义顺序排序(asc/desc 被忽略)
+                    custom_order = order_spec[field]
+                    data_rows.sort(key=make_custom_order_key(idx, custom_order))
+                    applied.append(f"{field}:custom({len(custom_order)})")
+                else:
+                    data_rows.sort(key=lambda row: row[idx] if idx < len(row) else "", reverse=desc)
+                    applied.append(f"{field}:{'desc' if desc else 'asc'}")
         if applied:
             print(f"排序: {', '.join(reversed(applied))}")
 
@@ -734,10 +765,11 @@ def main():
     if args.limit is None:
         args.limit = feishu_config["limit"]
     append_cols = feishu_config.get("append_cols", False)
+    order_spec = feishu_config.get("order")
 
     # 打印飞书配置
     if args.feishu:
-        print(f"飞书配置: token={args.feishu}, sheet_id={args.sheet_id}, sort={args.sort}, cols={args.cols}")
+        print(f"飞书配置: token={args.feishu}, sheet_id={args.sheet_id}, sort={args.sort}, cols={args.cols}, order={order_spec}")
 
     # 输出目录:SQL 同目录下的 output/SQL文件名/
     output_dir = sql_file.parent / "output"
@@ -755,7 +787,7 @@ def main():
             merged_file = merge_csv_files(daily_dir)
             # 如果指定了飞书上传
             if args.feishu and merged_file:
-                upload_to_feishu(merged_file, args.feishu, args.sheet_id, args.sort, args.cols, args.filter, args.limit, append_cols)
+                upload_to_feishu(merged_file, args.feishu, args.sheet_id, args.sort, args.cols, args.filter, args.limit, append_cols, order_spec)
         else:
             print("没有可合并的数据")
         return
@@ -819,7 +851,7 @@ def main():
             print(f"数据目录: {output_file}")
             # 如果指定了飞书上传
             if args.feishu and output_file.exists():
-                upload_to_feishu(output_file, args.feishu, args.sheet_id, args.sort, args.cols, args.filter, args.limit, append_cols)
+                upload_to_feishu(output_file, args.feishu, args.sheet_id, args.sort, args.cols, args.filter, args.limit, append_cols, order_spec)
         except Exception as e:
             print(f"✗ 执行失败: {e}")
         return
@@ -859,7 +891,7 @@ def main():
     if args.feishu:
         merged_file = merge_csv_files(daily_dir)
         if merged_file:
-            upload_to_feishu(merged_file, args.feishu, args.sheet_id, args.sort, args.cols, args.filter, args.limit, append_cols)
+            upload_to_feishu(merged_file, args.feishu, args.sheet_id, args.sort, args.cols, args.filter, args.limit, append_cols, order_spec)
 
 
 if __name__ == "__main__":

+ 11 - 0
tasks/00_尾号实验/base_v3_new.json

@@ -0,0 +1,11 @@
+{
+  "token": "ONZqsxB9BhGH8tt90EScSJT5nHh",
+  "sheet_id": "XQA2ek",
+  "sort": "dt:desc,suffix_group:asc",
+  "order": {
+      "suffix_group": ["67", "01", "5d", "ef", "89"]
+  },
+  "cols": null,
+  "append_cols": false,
+  "filter": "abcode!=前基线,abcode!=6,abcode!=e,abcode!=f"
+}

+ 538 - 0
tasks/00_尾号实验/base_v3_new.sql

@@ -0,0 +1,538 @@
+-- ════════════════════════════════════════════════════════════════════════════
+-- 两层尾号映射 (SCD Type 2 模式)
+--
+-- 第一层 t_suffix_group:物理尾号 → 分流桶 ID(16 个 hex 尾号分成 8 个 2-元桶)
+--   - 分流规则不变时,此层永不改
+--
+-- 第二层 t_experiment_map:分流桶 → 实验名 + 生效日期
+--   - 只列出"分配了具体实验"的桶,未列出的桶自动默认为"对照组"
+--   - 支持 1 对多:同一个实验占多个桶时,用同一 abcode 字符串多加几行
+--   - 实验切换:不删旧行,关闭 end_dt + 追加新行(保留历史可回溯)
+-- ════════════════════════════════════════════════════════════════════════════
+WITH t_suffix_group AS
+(
+    SELECT "a" AS suffix, "ab" AS suffix_group
+    UNION ALL SELECT "b", "ab"
+    UNION ALL SELECT "0", "01"
+    UNION ALL SELECT "1", "01"
+    UNION ALL SELECT "2", "2c"
+    UNION ALL SELECT "c", "2c"
+    UNION ALL SELECT "3", "34"
+    UNION ALL SELECT "4", "34"
+    UNION ALL SELECT "5", "5d"
+    UNION ALL SELECT "d", "5d"
+    UNION ALL SELECT "6", "67"
+    UNION ALL SELECT "7", "67"
+    UNION ALL SELECT "8", "89"
+    UNION ALL SELECT "9", "89"
+    UNION ALL SELECT "e", "ef"
+    UNION ALL SELECT "f", "ef"
+)
+-- 当前实验映射
+--   未列出的桶(89 / 2c)→ 自动默认为"对照组"
+--   同一个 suffix_group 可以有多行(SCD Type 2),但同一时间只能命中一行
,t_experiment_map AS
(
    -- Pre-baseline (ab bucket, never changed)
    SELECT "ab" AS suffix_group, "前基线" AS abcode, "20250101" AS start_dt, "29991231" AS end_dt

    -- str*ros modeling-target experiment (staged rollout, currently 4 buckets; 1-to-many)
    --   20260320: first wave on bucket 01
    --   20260330: extended to bucket 67 (its bn_ros experiment had been offline 10 days ⚠️)
    --   20260407: same-day extension to bucket 5d (its 解构str experiment ended) and
    --             bucket 34 (previously the default control group)
    UNION ALL SELECT "01", "实验组:变更str*ros建模目标实验", "20260320", "29991231"
    UNION ALL SELECT "67", "实验组:变更str*ros建模目标实验", "20260330", "29991231"
    UNION ALL SELECT "5d", "实验组:变更str*ros建模目标实验", "20260407", "29991231"
    UNION ALL SELECT "34", "实验组:变更str*ros建模目标实验", "20260407", "29991231"

    -- Previous experiment on bucket 67: bn_ros new loss function
    --   20260320~20260329 is a gap (10 days); during it 67 falls back to the control group
    UNION ALL SELECT "67", "实验组:bn_ros新损失函数",    "20260311", "20260319"

    -- Previous experiment on bucket 5d: 解构 feature-ranking str model
    --   Handed over to the modeling-target experiment on 20260407, no gap
    UNION ALL SELECT "5d", "实验组:解构特征排序str模型", "20260314", "20260406"

    -- Bucket ef history: 解构str&召回 (offline) → 17-day gap → DNN model (still running)
    --   20260321~20260406 is a gap; during it ef falls back to the control group
    -- NOTE: exactly ONE row per (suffix_group, active date range). A duplicate row here
    -- fans out the downstream LEFT JOINs in t_base / t_dau2_bucket and doubles every
    -- COUNT(1)-based metric (exp_per_dau etc.) — this block previously carried the
    -- DNN-model row twice; it has been deduplicated.
    UNION ALL SELECT "ef", "实验组:解构特征排序str模型&召回", "20260314", "20260320"
    UNION ALL SELECT "ef", "实验组:DNN模型",                   "20260407", "29991231"

    -- ────────────────────────────────────────────────────────────────────
    -- 📖 Edit recipes (copy a line below into the UNION ALL list above)
    --
    -- Recipe A: add an experiment occupying a single bucket
    --   UNION ALL SELECT "2c", "实验组:新策略 X", "20260501", "29991231"
    --
    -- Recipe B: add a 1-to-many experiment (same experiment on buckets 01 + 34)
    --   Add two rows with the identical abcode string; downstream GROUP BY merges them:
    --   UNION ALL SELECT "01", "实验组:大流量 Y", "20260601", "29991231"
    --   UNION ALL SELECT "34", "实验组:大流量 Y", "20260601", "29991231"
    --
    -- Recipe C: experiment switch (SCD Type 2 — keep history)
    --   Suppose bucket 01 switches from experiment A to B on 20260701:
    --   Step 1: change the old row's end_dt to the day before the switch:
    --     SELECT "01", "实验组:A", "20260320", "20260630"
    --   Step 2: append the new experiment row:
    --     UNION ALL SELECT "01", "实验组:B", "20260701", "29991231"
    --
    -- Recipe D: retire an experiment back to control (creates a gap)
    --   Just change that row's end_dt to the day before retirement (no new row):
    --     SELECT "5d", "实验组:A", "20250101", "20260630"
    --   After 20260701 no valid row covers bucket 5d, so it falls back to the control group
    --   ⚠️ Fine if the gap is intentional; if you simply forgot the next experiment, add it later
    -- ────────────────────────────────────────────────────────────────────
)
+,t_base AS
+(
+    SELECT  sub.*
+            ,sg.suffix_group
+            ,COALESCE(m.abcode,"对照组") AS abcode
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,SUBSTR(GET_JSON_OBJECT(extend,'$.rootsessionid'),LENGTH(GET_JSON_OBJECT(extend,'$.rootsessionid')),1) AS suffix
+                        ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                                WHEN page IN ("回流页","其他") THEN "非推荐"
+                                ELSE "其他"
+                        END AS page
+                        ,a.mid
+                        ,a.vid
+                        ,is_share
+                        ,share_cnt
+                        ,is_return_1
+                        ,is_return_n
+                        ,is_return_noself
+                        ,return_1_uv
+                        ,return_n_uv
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        ,flowpool
+                        ,cc.cn
+                        ,cc.c1
+                        ,dd.dn
+                        ,dd.d1
+                FROM    loghubods.dwd_recsys_alg_exposure_base_20250108 a
+                LEFT JOIN   (
+                                -- c1/cn:分享后被点击的回流 UV
+                                SELECT  a.machinecode AS mid
+                                        ,a.subsessionid
+                                        ,a.videoid AS vid
+                                        ,COUNT(DISTINCT CASE WHEN b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS cn
+                                        ,COUNT(DISTINCT CASE WHEN b2.sharedepth = 1 AND b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS c1
+                                FROM    (
+                                            SELECT  DISTINCT machinecode
+                                                    ,shareobjectid AS videoid
+                                                    ,recomTraceId
+                                                    ,subsessionid
+                                                    ,sharedepth
+                                                    ,shareid
+                                            FROM    loghubods.user_share_log
+                                            WHERE   dt = '${dt}'
+                                            AND     topic = 'share'
+                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                        ) a
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b
+                                ON      a.shareid = b.rootshareid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,shareobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'share'
+                                                AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                            ) b1
+                                ON      b.machinecode = b1.machinecode
+                                AND     b.subsessionid = b1.subsessionid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b2
+                                ON      b1.shareid = b2.rootshareid
+                                GROUP BY a.machinecode
+                                         ,a.subsessionid
+                                         ,a.videoid
+                            ) cc
+                ON      a.mid = cc.mid
+                AND     a.subsessionid = cc.subsessionid
+                AND     a.vid = cc.vid
+                LEFT JOIN   (
+                                -- d1/dn:下一条视频带来的回流
+                                SELECT  *
+                                        ,LAG(回流,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS dn
+                                        ,LAG(回流1,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS d1
+                                FROM    (
+                                            SELECT  a.mid AS mid
+                                                    ,a.subsessionid
+                                                    ,a.videoid AS vid
+                                                    ,COUNT(DISTINCT b.shareid) AS 分享次数
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode THEN c.machinecode END) AS 回流
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode AND c.sharedepth = 1 THEN c.machinecode END) AS 回流1
+                                                    ,ROW_NUMBER() OVER (PARTITION BY a.subsessionid ORDER BY a.logtimestamp ASC) AS rn
+                                            FROM    (
+                                                        SELECT  *
+                                                        FROM    (
+                                                                    SELECT  DISTINCT mid
+                                                                            ,subsessionid
+                                                                            ,videoid
+                                                                            ,logtimestamp
+                                                                            ,ROW_NUMBER() OVER (PARTITION BY mid,subsessionid,videoid ORDER BY logtimestamp ASC) AS rn
+                                                                    FROM    loghubods.video_action_log_rp
+                                                                    WHERE   dt = '${dt}'
+                                                                    AND     businesstype = 'videoView'
+                                                                    AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                                )
+                                                        WHERE   rn = 1
+                                                    ) a
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,shareobjectid AS videoid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,shareid
+                                                                    ,clienttimestamp
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'share'
+                                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                        ) b
+                                            ON      a.mid = b.machinecode
+                                            AND     a.subsessionid = b.subsessionid
+                                            AND     a.videoid = b.videoid
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,clickobjectid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,rootshareid
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'click'
+                                                        ) c
+                                            ON      b.shareid = c.rootshareid
+                                            GROUP BY a.mid
+                                                     ,a.subsessionid
+                                                     ,a.videoid
+                                                     ,a.logtimestamp
+                                        )
+                            ) dd
+                ON      a.mid = dd.mid
+                AND     a.subsessionid = dd.subsessionid
+                AND     a.vid = dd.vid
+                WHERE   dt="${dt}"
+                AND     apptype IN ("4")
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+                AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+                AND     abcode NOT IN ("ab100")
+            ) sub
+    -- INNER JOIN: 合法尾号(在 16 个 hex 里)才进分析;防御异常数据
+    INNER JOIN t_suffix_group sg
+    ON      sub.suffix = sg.suffix
+    -- LEFT JOIN: 可无实验匹配,此时 m.abcode 为 NULL → COALESCE 为"对照组"
+    LEFT JOIN t_experiment_map m
+    ON      sg.suffix_group = m.suffix_group
+    AND     '${dt}' BETWEEN m.start_dt AND m.end_dt
+)
+-- 桶内每个 vid 的曝光数(ECS / ARP 的共同中间件,避免重复扫 t_base)
+,t_vid_exp AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix
+             ,vid
+)
+-- 桶内 ECS (Effective Catalog Size):曝光实际"相当于推了多少条视频"
+-- ECS = 2 * Σ(p_i * rank_i) - 1
+--   p_i    = vid 在桶内曝光占比
+--   rank_i = 按曝光降序的排名(1 起)
+-- 值域 [1, distinct_vid_cnt],越大越分散,越小越头部集中
+,t_bucket_ecs AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix
+            ,2 * SUM(p * rn) - 1 AS ecs
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,abcode
+                        ,suffix
+                        ,vid_exp_cnt / SUM(vid_exp_cnt) OVER (
+                            PARTITION BY dt, apptype, abcode, suffix
+                        ) AS p
+                        ,ROW_NUMBER() OVER (
+                            PARTITION BY dt, apptype, abcode, suffix
+                            ORDER BY vid_exp_cnt DESC
+                        ) AS rn
+                FROM    t_vid_exp
+            ) t
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix
+)
+-- 全平台每个 vid 的曝光度(作为 ARP 的 popularity reference)
+-- 注意:不过滤 abcode,让 reference 覆盖全部合法尾号
+,t_vid_global_pop AS
+(
+    SELECT  dt
+            ,apptype
+            ,vid
+            ,COUNT(1) AS vid_global_pop
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,vid
+)
+-- 桶内 ARP (Average Recommendation Popularity):推荐视频的平均热门度
+-- 按桶内曝光量加权:曝光越多的 vid 对 ARP 影响越大
+-- 组合 ECS + ARP 可识别四象限:
+--   高ECS + 低ARP = 分散 + 偏长尾        ✅ 理想
+--   高ECS + 高ARP = 分散 + 头部内部多样化  ⚠️ 需警惕
+--   低ECS + 低ARP = 集中 + 冷门(小众爆发) ❓ 特殊
+--   低ECS + 高ARP = 集中 + 头部            ❌ 模型坍缩
+,t_bucket_arp AS
+(
+    SELECT  v.dt
+            ,v.apptype
+            ,v.abcode
+            ,v.suffix
+            ,SUM(v.vid_exp_cnt * g.vid_global_pop) / SUM(v.vid_exp_cnt) AS arp
+    FROM    t_vid_exp v
+    LEFT JOIN t_vid_global_pop g
+    ON      v.dt = g.dt
+    AND     v.apptype = g.apptype
+    AND     v.vid = g.vid
+    GROUP BY v.dt
+             ,v.apptype
+             ,v.abcode
+             ,v.suffix
+)
+-- dau2:按单尾号聚合
+,t_dau2_bucket AS
+(
+    SELECT  SUBSTR(sub.dt,1,8) AS dt
+            ,sub.apptype
+            ,COALESCE(m.abcode,"对照组") AS abcode
+            ,sg.suffix_group
+            ,sub.suffix
+            ,COUNT(DISTINCT sub.machinecode) AS dau2
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,machinecode
+                        ,SUBSTR(GET_JSON_OBJECT(extparams,'$.rootSessionId'),LENGTH(GET_JSON_OBJECT(extparams,'$.rootSessionId')),1) AS suffix
+                FROM    loghubods.useractive_log
+                WHERE   dt="${dt}"
+                -- FROM    loghubods.useractive_log_per5min
+                -- WHERE   dt BETWEEN CONCAT("${dt}","000000") AND CONCAT("${dt}","235500")
+                AND     apptype IN ("4")
+            ) sub
+    INNER JOIN t_suffix_group sg
+    ON      sub.suffix = sg.suffix
+    LEFT JOIN t_experiment_map m
+    ON      sg.suffix_group = m.suffix_group
+    AND     '${dt}' BETWEEN m.start_dt AND m.end_dt
+    GROUP BY SUBSTR(sub.dt,1,8)
+             ,sub.apptype
+             ,COALESCE(m.abcode,"对照组")
+             ,sg.suffix_group
+             ,sub.suffix
+)
+-- dau2:按 suffix_group 求尾号均值
+,t_dau2 AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix_group
+            ,AVG(dau2) AS dau2
+    FROM    t_dau2_bucket
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix_group
+)
+-- 按单尾号聚合(尾号内 UV 去重)
+,t_bucket AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix_group
+            ,suffix
+            ,COALESCE(COUNT(1) / COUNT(DISTINCT mid),0) AS exp_per_dau
+            ,COALESCE(SUM(is_share) / COUNT(1),0) AS str_one
+            ,COALESCE(SUM(return_n_uv) / SUM(is_share),0) AS ros_one
+            ,COALESCE(SUM(share_cnt) / COUNT(1),0) AS str
+            ,COALESCE(SUM(return_n_uv) / SUM(share_cnt),0) AS ros
+            ,COALESCE(SUM(is_return_1) / COUNT(1),0) AS str_plus
+            ,COALESCE(SUM(return_n_uv) / SUM(is_return_1),0) AS ros_minus
+            ,COALESCE(SUM(return_n_uv) / COUNT(1),0) AS bn_rov
+            ,COALESCE(SUM(c1) / COUNT(1),0) AS c1_rov
+            ,COALESCE(SUM(cn) / COUNT(1),0) AS cn_rov
+            ,COALESCE(SUM(d1) / COUNT(1),0) AS d1_rov
+            ,COALESCE(SUM(dn) / COUNT(1),0) AS dn_rov
+            -- [NEW] 合并 ROV = bn_rov + cn_rov + dn_rov(三者分母同为 COUNT(1),可合并)
+            ,COALESCE((SUM(return_n_uv) + SUM(cn) + SUM(dn)) / COUNT(1),0) AS total_rov
+            ,COALESCE(SUM(new_exposure_cnt) / COUNT(1),0) AS vovh24
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            -- [NEW] 桶内去重 vid 数(ECS 的天然配套)
+            ,COUNT(DISTINCT vid) AS distinct_vid_cnt
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+            ,COALESCE(SUM(cn),0) AS cn
+            ,COALESCE(SUM(c1),0) AS c1
+            ,COALESCE(SUM(dn),0) AS dn
+            ,COALESCE(SUM(d1),0) AS d1
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix_group
+             ,suffix
+)
+-- 按实验组求尾号均值(新增:合并 ROV + 分发多样性三件套)
+,t_metrics AS
+(
+    SELECT  b.dt
+            ,b.apptype
+            ,b.abcode
+            ,b.suffix_group
+            ,ROUND(AVG(b.exp_per_dau),2) AS exp_per_dau
+            ,ROUND(AVG(b.str_one),6) AS str_one
+            ,ROUND(AVG(b.ros_one),6) AS ros_one
+            ,ROUND(AVG(b.str),6) AS str
+            ,ROUND(AVG(b.ros),6) AS ros
+            ,ROUND(AVG(b.str_plus),6) AS str_plus
+            ,ROUND(AVG(b.ros_minus),6) AS ros_minus
+            ,ROUND(AVG(b.bn_rov),6) AS bn_rov
+            ,ROUND(AVG(b.c1_rov),6) AS c1_rov
+            ,ROUND(AVG(b.cn_rov),6) AS cn_rov
+            ,ROUND(AVG(b.d1_rov),6) AS d1_rov
+            ,ROUND(AVG(b.dn_rov),6) AS dn_rov
+            ,ROUND(AVG(b.total_rov),6) AS total_rov
+            ,ROUND(AVG(b.vovh24),6) AS vovh24
+            ,AVG(b.dau) AS dau
+            ,AVG(b.exp) AS exp
+            ,ROUND(AVG(b.distinct_vid_cnt),0) AS distinct_vid_cnt
+            ,ROUND(AVG(e.ecs),1) AS ecs
+            -- ECS 归一化比值:去掉池子大小的影响,纯形态指标
+            ,ROUND(AVG(e.ecs) / NULLIF(AVG(b.distinct_vid_cnt),0),6) AS ecs_ratio
+            -- Gini 系数:快手/Twitter/Netflix 业界标准,数学上 Gini = 1 - ecs_ratio
+            ,ROUND(1 - AVG(e.ecs) / NULLIF(AVG(b.distinct_vid_cnt),0),6) AS gini
+            ,ROUND(AVG(a.arp),0) AS arp
+            ,AVG(b.is_share) AS is_share
+            ,AVG(b.share_cnt) AS share_cnt
+            ,AVG(b.is_return_1) AS is_return_1
+            ,AVG(b.return_n_uv) AS return_n_uv
+            ,AVG(b.viewh24) AS viewh24
+            ,AVG(b.return_n_uv_noself) AS return_n_uv_noself
+            ,AVG(b.cn) AS cn
+            ,AVG(b.c1) AS c1
+            ,AVG(b.dn) AS dn
+            ,AVG(b.d1) AS d1
+            ,WM_CONCAT(DISTINCT ',',b.suffix) AS suffix
+    FROM    t_bucket b
+    LEFT JOIN t_bucket_ecs e
+    ON      b.dt = e.dt
+    AND     b.apptype = e.apptype
+    AND     b.abcode = e.abcode
+    AND     b.suffix = e.suffix
+    LEFT JOIN t_bucket_arp a
+    ON      b.dt = a.dt
+    AND     b.apptype = a.apptype
+    AND     b.abcode = a.abcode
+    AND     b.suffix = a.suffix
+    GROUP BY b.dt
+             ,b.apptype
+             ,b.abcode
+             ,b.suffix_group
+)
+SELECT  a.dt
+        ,a.apptype
+        ,a.abcode
+        ,a.suffix_group
+        ,a.suffix
+        ,a.exp_per_dau
+        ,a.str_one
+        ,a.ros_one
+        ,a.str
+        ,a.ros
+        ,a.str_plus
+        ,a.ros_minus
+        ,a.bn_rov
+        ,a.c1_rov
+        ,a.cn_rov
+        ,a.d1_rov
+        ,a.dn_rov
+        ,a.total_rov
+        ,a.vovh24
+        ,a.dau
+        ,a.exp
+        ,a.distinct_vid_cnt
+        ,a.ecs
+        ,a.ecs_ratio
+        ,a.gini
+        ,a.arp
+        ,a.is_share
+        ,a.share_cnt
+        ,a.is_return_1
+        ,a.return_n_uv
+        ,a.viewh24
+        ,a.return_n_uv_noself
+        ,a.cn
+        ,a.c1
+        ,a.dn
+        ,a.d1
+        ,b.dau2
+FROM    t_metrics a
+LEFT JOIN t_dau2 b
+ON      a.dt = b.dt
+AND     a.apptype = b.apptype
+AND     a.abcode = b.abcode
+AND     a.suffix_group = b.suffix_group
+ORDER BY a.dt DESC,a.apptype,a.abcode,a.suffix_group
+;

+ 10 - 0
tasks/00_尾号实验/base_v3_new_v2.json

@@ -0,0 +1,10 @@
+{
+  "token": "ONZqsxB9BhGH8tt90EScSJT5nHh",
+  "sheet_id": "8rEAkY",
+  "sort": "dt:desc,suffix_group:asc",
+  "order": {
+      "suffix_group": ["ab", "34", "2c", "67", "01", "5d", "ef", "89"]
+  },
+  "cols": null,
+  "append_cols": false
+}

+ 542 - 0
tasks/00_尾号实验/base_v3_new_v2.sql

@@ -0,0 +1,542 @@
+-- ════════════════════════════════════════════════════════════════════════════
+-- 两层尾号映射 (SCD Type 2 模式)
+--
+-- 第一层 t_suffix_group:物理尾号 → 分流桶 ID(16 个 hex 尾号分成 8 个 2-元桶)
+--   - 分流规则不变时,此层永不改
+--
+-- 第二层 t_experiment_map:分流桶 → 实验名 + 生效日期
+--   - 只列出"分配了具体实验"的桶,未列出的桶自动默认为"对照组"
+--   - 支持 1 对多:同一个实验占多个桶时,用同一 abcode 字符串多加几行
+--   - 实验切换:不删旧行,关闭 end_dt + 追加新行(保留历史可回溯)
+-- ════════════════════════════════════════════════════════════════════════════
+WITH t_suffix_group AS
+(
+    SELECT "a" AS suffix, "ab" AS suffix_group
+    UNION ALL SELECT "b", "ab"
+    UNION ALL SELECT "0", "01"
+    UNION ALL SELECT "1", "01"
+    UNION ALL SELECT "2", "2c"
+    UNION ALL SELECT "c", "2c"
+    UNION ALL SELECT "3", "34"
+    UNION ALL SELECT "4", "34"
+    UNION ALL SELECT "5", "5d"
+    UNION ALL SELECT "d", "5d"
+    UNION ALL SELECT "6", "67"
+    UNION ALL SELECT "7", "67"
+    UNION ALL SELECT "8", "89"
+    UNION ALL SELECT "9", "89"
+    UNION ALL SELECT "e", "ef"
+    UNION ALL SELECT "f", "ef"
+)
+-- 当前实验映射
+--   未列出或不在任何生效日期窗口内的桶 → 自动默认为"对照组"(当前 8 个桶均已显式列出,89 / 2c 不再是隐式对照)
+--   同一个 suffix_group 可以有多行(SCD Type 2),但同一时间只能命中一行
+,t_experiment_map AS
+(
+    -- ab 桶:原前基线,20260413 起切换为"变更str*ros建模目标实验"
+    SELECT "ab" AS suffix_group, "实验组:变更str*ros建模目标实验" AS abcode, "20260413" AS start_dt, "29991231" AS end_dt
+
+    -- 变更str*ros建模目标实验(分阶段扩量,当前占用 4 个桶;1 对多)
+    --   20260320: 首批上 01 桶
+    --   20260330: 扩到 67 桶(此时 67 桶的 bn_ros 实验已下线 10 天 ⚠️)
+    --   20260407: 同日扩到 5d 桶(5d 的解构str 实验下线)和 34 桶(34 此前是默认对照组)
+    UNION ALL SELECT "01", "实验组:变更str*ros建模目标实验", "20260320", "29991231"
+    UNION ALL SELECT "67", "实验组:变更str*ros建模目标实验", "20260330", "29991231"
+    UNION ALL SELECT "5d", "实验组:变更str*ros建模目标实验", "20260407", "29991231"
+    UNION ALL SELECT "34", "实验组:变更str*ros建模目标实验", "20260407", "29991231"
+
+    -- 67 桶的前实验:bn_ros 新损失函数
+    --   20260320~20260329 为空窗期(10 天),此间 67 → 默认"对照组"
+    UNION ALL SELECT "67", "实验组:bn_ros新损失函数",    "20260311", "20260319"
+
+    -- 5d 桶的前实验:解构特征排序 str 模型
+    --   20260407 直接被建模目标实验接手,无空窗
+    UNION ALL SELECT "5d", "实验组:解构特征排序str模型", "20260314", "20260406"
+
+    -- ef 桶的历史:解构str&召回(已下线)→ 空窗 17 天 → DNN 模型(仍在运行)
+    --   20260321~20260406 为空窗期,此间 ef → 默认"对照组"
+    UNION ALL SELECT "ef", "实验组:解构特征排序str模型&召回", "20260314", "20260320"
+    UNION ALL SELECT "ef", "实验组:DNN模型",                   "20260407", "29991231"
+    UNION ALL SELECT "2c", "实验组:DNN模型-调参", "20260413", "29991231"
+
+    UNION ALL SELECT "89", "对照组", "20260301", "20260412"
+    UNION ALL SELECT "89", "实验组:变更str*ros建模目标实验", "20260413", "29991231"
+
+
+    -- ────────────────────────────────────────────────────────────────────
+    -- 📖 修改样例(复制下面的行到上面 UNION ALL 列表里使用)
+    --
+    -- 样例 A:新增一个占用单桶的实验
+    --   UNION ALL SELECT "2c", "实验组:新策略 X", "20260501", "29991231"
+    --
+    -- 样例 B:新增一个 1 对多 实验(同一实验占 01 + 34 两个桶)
+    --   用同一 abcode 字符串加两行即可,下游 GROUP BY 自动合并:
+    --   UNION ALL SELECT "01", "实验组:大流量 Y", "20260601", "29991231"
+    --   UNION ALL SELECT "34", "实验组:大流量 Y", "20260601", "29991231"
+    --
+    -- 样例 C:实验切换(SCD Type 2 —— 保留历史)
+    --   假设 01 桶 20260701 从 实验 A 切换到 实验 B:
+    --   Step 1: 把原来那行 end_dt 改成切换前一天:
+    --     SELECT "01", "实验组:A", "20260320", "20260630"
+    --   Step 2: 追加新实验行:
+    --     UNION ALL SELECT "01", "实验组:B", "20260701", "29991231"
+    --
+    -- 样例 D:实验下线回到对照组(产生空窗)
+    --   直接把该行的 end_dt 改成下线前一天即可(不用追加行):
+    --     SELECT "5d", "实验组:A", "20250101", "20260630"
+    --   20260701 之后 5d 桶没有任何有效行覆盖,自动进入"对照组"
+    --   ⚠️ 如果这是有意的空窗,没问题;如果只是忘了接新实验,后续记得补
+    -- ────────────────────────────────────────────────────────────────────
+)
+,t_base AS
+(
+    SELECT  sub.*
+            ,sg.suffix_group
+            ,COALESCE(m.abcode,"对照组") AS abcode
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,SUBSTR(GET_JSON_OBJECT(extend,'$.rootsessionid'),LENGTH(GET_JSON_OBJECT(extend,'$.rootsessionid')),1) AS suffix
+                        ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                                WHEN page IN ("回流页","其他") THEN "非推荐"
+                                ELSE "其他"
+                        END AS page
+                        ,a.mid
+                        ,a.vid
+                        ,is_share
+                        ,share_cnt
+                        ,is_return_1
+                        ,is_return_n
+                        ,is_return_noself
+                        ,return_1_uv
+                        ,return_n_uv
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        ,flowpool
+                        ,cc.cn
+                        ,cc.c1
+                        ,dd.dn
+                        ,dd.d1
+                FROM    loghubods.dwd_recsys_alg_exposure_base_20250108 a
+                LEFT JOIN   (
+                                -- c1/cn:分享后被点击的回流 UV
+                                SELECT  a.machinecode AS mid
+                                        ,a.subsessionid
+                                        ,a.videoid AS vid
+                                        ,COUNT(DISTINCT CASE WHEN b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS cn
+                                        ,COUNT(DISTINCT CASE WHEN b2.sharedepth = 1 AND b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS c1
+                                FROM    (
+                                            SELECT  DISTINCT machinecode
+                                                    ,shareobjectid AS videoid
+                                                    ,recomTraceId
+                                                    ,subsessionid
+                                                    ,sharedepth
+                                                    ,shareid
+                                            FROM    loghubods.user_share_log
+                                            WHERE   dt = '${dt}'
+                                            AND     topic = 'share'
+                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                        ) a
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b
+                                ON      a.shareid = b.rootshareid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,shareobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'share'
+                                                AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                            ) b1
+                                ON      b.machinecode = b1.machinecode
+                                AND     b.subsessionid = b1.subsessionid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b2
+                                ON      b1.shareid = b2.rootshareid
+                                GROUP BY a.machinecode
+                                         ,a.subsessionid
+                                         ,a.videoid
+                            ) cc
+                ON      a.mid = cc.mid
+                AND     a.subsessionid = cc.subsessionid
+                AND     a.vid = cc.vid
+                LEFT JOIN   (
+                                -- d1/dn:下一条视频带来的回流
+                                SELECT  *
+                                        ,LAG(回流,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS dn
+                                        ,LAG(回流1,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS d1
+                                FROM    (
+                                            SELECT  a.mid AS mid
+                                                    ,a.subsessionid
+                                                    ,a.videoid AS vid
+                                                    ,COUNT(DISTINCT b.shareid) AS 分享次数
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode THEN c.machinecode END) AS 回流
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode AND c.sharedepth = 1 THEN c.machinecode END) AS 回流1
+                                                    ,ROW_NUMBER() OVER (PARTITION BY a.subsessionid ORDER BY a.logtimestamp ASC) AS rn
+                                            FROM    (
+                                                        SELECT  *
+                                                        FROM    (
+                                                                    SELECT  DISTINCT mid
+                                                                            ,subsessionid
+                                                                            ,videoid
+                                                                            ,logtimestamp
+                                                                            ,ROW_NUMBER() OVER (PARTITION BY mid,subsessionid,videoid ORDER BY logtimestamp ASC) AS rn
+                                                                    FROM    loghubods.video_action_log_rp
+                                                                    WHERE   dt = '${dt}'
+                                                                    AND     businesstype = 'videoView'
+                                                                    AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                                )
+                                                        WHERE   rn = 1
+                                                    ) a
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,shareobjectid AS videoid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,shareid
+                                                                    ,clienttimestamp
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'share'
+                                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                        ) b
+                                            ON      a.mid = b.machinecode
+                                            AND     a.subsessionid = b.subsessionid
+                                            AND     a.videoid = b.videoid
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,clickobjectid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,rootshareid
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'click'
+                                                        ) c
+                                            ON      b.shareid = c.rootshareid
+                                            GROUP BY a.mid
+                                                     ,a.subsessionid
+                                                     ,a.videoid
+                                                     ,a.logtimestamp
+                                        )
+                            ) dd
+                ON      a.mid = dd.mid
+                AND     a.subsessionid = dd.subsessionid
+                AND     a.vid = dd.vid
+                WHERE   dt="${dt}"
+                AND     apptype IN ("4")
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+                AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+                AND     abcode NOT IN ("ab100")
+            ) sub
+    -- INNER JOIN: 合法尾号(在 16 个 hex 里)才进分析;防御异常数据
+    INNER JOIN t_suffix_group sg
+    ON      sub.suffix = sg.suffix
+    -- LEFT JOIN: 可无实验匹配,此时 m.abcode 为 NULL → COALESCE 为"对照组"
+    LEFT JOIN t_experiment_map m
+    ON      sg.suffix_group = m.suffix_group
+    AND     '${dt}' BETWEEN m.start_dt AND m.end_dt
+)
+-- 桶内每个 vid 的曝光数(ECS / ARP 的共同中间件,避免重复扫 t_base)
+,t_vid_exp AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix
+             ,vid
+)
+-- 桶内 ECS (Effective Catalog Size):曝光实际"相当于推了多少条视频"
+-- ECS = 2 * Σ(p_i * rank_i) - 1
+--   p_i    = vid 在桶内曝光占比
+--   rank_i = 按曝光降序的排名(1 起)
+-- 值域 [1, distinct_vid_cnt],越大越分散,越小越头部集中
+,t_bucket_ecs AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix
+            ,2 * SUM(p * rn) - 1 AS ecs
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,abcode
+                        ,suffix
+                        ,vid_exp_cnt / SUM(vid_exp_cnt) OVER (
+                            PARTITION BY dt, apptype, abcode, suffix
+                        ) AS p
+                        ,ROW_NUMBER() OVER (
+                            PARTITION BY dt, apptype, abcode, suffix
+                            ORDER BY vid_exp_cnt DESC
+                        ) AS rn
+                FROM    t_vid_exp
+            ) t
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix
+)
+-- 全平台每个 vid 的曝光度(作为 ARP 的 popularity reference)
+-- 注意:不过滤 abcode,让 reference 覆盖全部合法尾号
+,t_vid_global_pop AS
+(
+    SELECT  dt
+            ,apptype
+            ,vid
+            ,COUNT(1) AS vid_global_pop
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,vid
+)
+-- 桶内 ARP (Average Recommendation Popularity):推荐视频的平均热门度
+-- 按桶内曝光量加权:曝光越多的 vid 对 ARP 影响越大
+-- 组合 ECS + ARP 可识别四象限:
+--   高ECS + 低ARP = 分散 + 偏长尾        ✅ 理想
+--   高ECS + 高ARP = 分散 + 头部内部多样化  ⚠️ 需警惕
+--   低ECS + 低ARP = 集中 + 冷门(小众爆发) ❓ 特殊
+--   低ECS + 高ARP = 集中 + 头部            ❌ 模型坍缩
+,t_bucket_arp AS
+(
+    SELECT  v.dt
+            ,v.apptype
+            ,v.abcode
+            ,v.suffix
+            ,SUM(v.vid_exp_cnt * g.vid_global_pop) / SUM(v.vid_exp_cnt) AS arp
+    FROM    t_vid_exp v
+    LEFT JOIN t_vid_global_pop g
+    ON      v.dt = g.dt
+    AND     v.apptype = g.apptype
+    AND     v.vid = g.vid
+    GROUP BY v.dt
+             ,v.apptype
+             ,v.abcode
+             ,v.suffix
+)
+-- dau2:按单尾号聚合
+,t_dau2_bucket AS
+(
+    SELECT  SUBSTR(sub.dt,1,8) AS dt
+            ,sub.apptype
+            ,COALESCE(m.abcode,"对照组") AS abcode
+            ,sg.suffix_group
+            ,sub.suffix
+            ,COUNT(DISTINCT sub.machinecode) AS dau2
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,machinecode
+                        ,SUBSTR(GET_JSON_OBJECT(extparams,'$.rootSessionId'),LENGTH(GET_JSON_OBJECT(extparams,'$.rootSessionId')),1) AS suffix
+                FROM    loghubods.useractive_log
+                WHERE   dt="${dt}"
+                -- FROM    loghubods.useractive_log_per5min
+                -- WHERE   dt BETWEEN CONCAT("${dt}","000000") AND CONCAT("${dt}","235500")
+                AND     apptype IN ("4")
+            ) sub
+    INNER JOIN t_suffix_group sg
+    ON      sub.suffix = sg.suffix
+    LEFT JOIN t_experiment_map m
+    ON      sg.suffix_group = m.suffix_group
+    AND     '${dt}' BETWEEN m.start_dt AND m.end_dt
+    GROUP BY SUBSTR(sub.dt,1,8)
+             ,sub.apptype
+             ,COALESCE(m.abcode,"对照组")
+             ,sg.suffix_group
+             ,sub.suffix
+)
+-- dau2:按 suffix_group 求尾号均值
+,t_dau2 AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix_group
+            ,AVG(dau2) AS dau2
+    FROM    t_dau2_bucket
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix_group
+)
+-- 按单尾号聚合(尾号内 UV 去重)
+,t_bucket AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix_group
+            ,suffix
+            ,COALESCE(COUNT(1) / COUNT(DISTINCT mid),0) AS exp_per_dau
+            ,COALESCE(SUM(is_share) / COUNT(1),0) AS str_one
+            ,COALESCE(SUM(return_n_uv) / SUM(is_share),0) AS ros_one
+            ,COALESCE(SUM(share_cnt) / COUNT(1),0) AS str
+            ,COALESCE(SUM(return_n_uv) / SUM(share_cnt),0) AS ros
+            ,COALESCE(SUM(is_return_1) / COUNT(1),0) AS str_plus
+            ,COALESCE(SUM(return_n_uv) / SUM(is_return_1),0) AS ros_minus
+            ,COALESCE(SUM(return_n_uv) / COUNT(1),0) AS bn_rov
+            ,COALESCE(SUM(c1) / COUNT(1),0) AS c1_rov
+            ,COALESCE(SUM(cn) / COUNT(1),0) AS cn_rov
+            ,COALESCE(SUM(d1) / COUNT(1),0) AS d1_rov
+            ,COALESCE(SUM(dn) / COUNT(1),0) AS dn_rov
+            -- [NEW] 合并 ROV = bn_rov + cn_rov + dn_rov(三者分母同为 COUNT(1),可合并)
+            ,COALESCE((SUM(return_n_uv) + SUM(cn) + SUM(dn)) / COUNT(1),0) AS total_rov
+            ,COALESCE(SUM(new_exposure_cnt) / COUNT(1),0) AS vovh24
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            -- [NEW] 桶内去重 vid 数(ECS 的天然配套)
+            ,COUNT(DISTINCT vid) AS distinct_vid_cnt
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+            ,COALESCE(SUM(cn),0) AS cn
+            ,COALESCE(SUM(c1),0) AS c1
+            ,COALESCE(SUM(dn),0) AS dn
+            ,COALESCE(SUM(d1),0) AS d1
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix_group
+             ,suffix
+)
+-- 按实验组求尾号均值(新增:合并 ROV + 分发多样性三件套)
+,t_metrics AS
+(
+    SELECT  b.dt
+            ,b.apptype
+            ,b.abcode
+            ,b.suffix_group
+            ,ROUND(AVG(b.exp_per_dau),2) AS exp_per_dau
+            ,ROUND(AVG(b.str_one),6) AS str_one
+            ,ROUND(AVG(b.ros_one),6) AS ros_one
+            ,ROUND(AVG(b.str),6) AS str
+            ,ROUND(AVG(b.ros),6) AS ros
+            ,ROUND(AVG(b.str_plus),6) AS str_plus
+            ,ROUND(AVG(b.ros_minus),6) AS ros_minus
+            ,ROUND(AVG(b.bn_rov),6) AS bn_rov
+            ,ROUND(AVG(b.c1_rov),6) AS c1_rov
+            ,ROUND(AVG(b.cn_rov),6) AS cn_rov
+            ,ROUND(AVG(b.d1_rov),6) AS d1_rov
+            ,ROUND(AVG(b.dn_rov),6) AS dn_rov
+            ,ROUND(AVG(b.total_rov),6) AS total_rov
+            ,ROUND(AVG(b.vovh24),6) AS vovh24
+            ,AVG(b.dau) AS dau
+            ,AVG(b.exp) AS exp
+            ,ROUND(AVG(b.distinct_vid_cnt),0) AS distinct_vid_cnt
+            ,ROUND(AVG(e.ecs),1) AS ecs
+            -- ECS 归一化比值:去掉池子大小的影响,纯形态指标
+            ,ROUND(AVG(e.ecs) / NULLIF(AVG(b.distinct_vid_cnt),0),6) AS ecs_ratio
+            -- Gini 系数:快手/Twitter/Netflix 业界标准,数学上 Gini = 1 - ecs_ratio
+            ,ROUND(1 - AVG(e.ecs) / NULLIF(AVG(b.distinct_vid_cnt),0),6) AS gini
+            ,ROUND(AVG(a.arp),0) AS arp
+            ,AVG(b.is_share) AS is_share
+            ,AVG(b.share_cnt) AS share_cnt
+            ,AVG(b.is_return_1) AS is_return_1
+            ,AVG(b.return_n_uv) AS return_n_uv
+            ,AVG(b.viewh24) AS viewh24
+            ,AVG(b.return_n_uv_noself) AS return_n_uv_noself
+            ,AVG(b.cn) AS cn
+            ,AVG(b.c1) AS c1
+            ,AVG(b.dn) AS dn
+            ,AVG(b.d1) AS d1
+            ,WM_CONCAT(DISTINCT ',',b.suffix) AS suffix
+    FROM    t_bucket b
+    LEFT JOIN t_bucket_ecs e
+    ON      b.dt = e.dt
+    AND     b.apptype = e.apptype
+    AND     b.abcode = e.abcode
+    AND     b.suffix = e.suffix
+    LEFT JOIN t_bucket_arp a
+    ON      b.dt = a.dt
+    AND     b.apptype = a.apptype
+    AND     b.abcode = a.abcode
+    AND     b.suffix = a.suffix
+    GROUP BY b.dt
+             ,b.apptype
+             ,b.abcode
+             ,b.suffix_group
+)
+SELECT  a.dt
+        ,a.apptype
+        ,a.abcode
+        ,a.suffix_group
+        ,a.suffix
+        ,a.exp_per_dau
+        ,a.str_one
+        ,a.ros_one
+        ,a.str
+        ,a.ros
+        ,a.str_plus
+        ,a.ros_minus
+        ,a.bn_rov
+        ,a.c1_rov
+        ,a.cn_rov
+        ,a.d1_rov
+        ,a.dn_rov
+        ,a.total_rov
+        ,a.vovh24
+        ,a.dau
+        ,a.exp
+        ,a.distinct_vid_cnt
+        ,a.ecs
+        ,a.ecs_ratio
+        ,a.gini
+        ,a.arp
+        ,a.is_share
+        ,a.share_cnt
+        ,a.is_return_1
+        ,a.return_n_uv
+        ,a.viewh24
+        ,a.return_n_uv_noself
+        ,a.cn
+        ,a.c1
+        ,a.dn
+        ,a.d1
+        ,b.dau2
+FROM    t_metrics a
+LEFT JOIN t_dau2 b
+ON      a.dt = b.dt
+AND     a.apptype = b.apptype
+AND     a.abcode = b.abcode
+AND     a.suffix_group = b.suffix_group
+ORDER BY a.dt DESC,a.apptype,a.abcode,a.suffix_group
+;

+ 11 - 0
tasks/00_尾号实验/base_v4_v1_new.json

@@ -0,0 +1,11 @@
+{
+  "token": "ONZqsxB9BhGH8tt90EScSJT5nHh",
+  "sheet_id": "LLWyyA",
+  "sort": "dt:desc,suffix_group:asc",
+  "order": {
+      "suffix_group": ["34", "67", "01", "5d", "ef", "89"]
+  },
+  "cols": null,
+  "append_cols": false,
+  "filter": "abcode!=前基线,abcode!=6,abcode!=e,abcode!=f"
+}

+ 537 - 0
tasks/00_尾号实验/base_v4_v1_new.sql

@@ -0,0 +1,537 @@
+-- ════════════════════════════════════════════════════════════════════════════
+-- 两层尾号映射 (SCD Type 2 模式) — apptype = 0
+--
+-- 第一层 t_suffix_group:物理尾号 → 分流桶 ID(16 个 hex 尾号分成 8 个 2-元桶)
+--   - 分流规则不变时,此层永不改
+--
+-- 第二层 t_experiment_map:分流桶 → 实验名 + 生效日期
+--   - 只列出"分配了具体实验"的桶,未列出的桶自动默认为"对照组"
+--   - 支持 1 对多:同一个实验占多个桶时,用同一 abcode 字符串多加几行
+--   - 实验切换:不删旧行,关闭 end_dt + 追加新行(保留历史可回溯)
+-- ════════════════════════════════════════════════════════════════════════════
+WITH t_suffix_group AS
+(
+    SELECT "a" AS suffix, "ab" AS suffix_group
+    UNION ALL SELECT "b", "ab"
+    UNION ALL SELECT "0", "01"
+    UNION ALL SELECT "1", "01"
+    UNION ALL SELECT "2", "2c"
+    UNION ALL SELECT "c", "2c"
+    UNION ALL SELECT "3", "34"
+    UNION ALL SELECT "4", "34"
+    UNION ALL SELECT "5", "5d"
+    UNION ALL SELECT "d", "5d"
+    UNION ALL SELECT "6", "67"
+    UNION ALL SELECT "7", "67"
+    UNION ALL SELECT "8", "89"
+    UNION ALL SELECT "9", "89"
+    UNION ALL SELECT "e", "ef"
+    UNION ALL SELECT "f", "ef"
+)
+-- 当前实验映射 (apptype = 0)
+--   未在下表列出的桶 → 自动默认为"对照组"(当前 8 个桶均已显式列出)
+--   同一个 suffix_group 可以有多行(SCD Type 2),但同一时间只能命中一行
+--   注意:新增/切换实验时填写真实上线日期,并确保同一桶的日期区间互不重叠,否则 JOIN 会产生重复行
+,t_experiment_map AS
+(
+    -- ab 桶(原前基线,20260413 起并入 str*ros 建模目标实验)
+    SELECT "ab" AS suffix_group, "实验组:变更str*ros建模目标实验" AS abcode, "20260413" AS start_dt, "29991231" AS end_dt
+   
+    -- 建模目标实验
+    UNION ALL SELECT "01", "实验组:变更str*ros建模目标实验", "20260320", "29991231"
+
+    -- bn_ros 新损失函数
+    UNION ALL SELECT "34", "实验组:变更str*ros建模目标实验", "20260330", "29991231"
+
+    -- cn_rov 实验
+    UNION ALL SELECT "67", "实验组:变更str*ros建模目标实验", "20260330", "29991231"
+
+    -- 解构特征排序 str 模型
+    UNION ALL SELECT "5d", "实验组:变更str*ros建模目标实验", "20260407", "29991231"
+
+    UNION ALL SELECT "ef", "实验组:DNN模型-调参", "20260410", "29991231"
+
+    UNION ALL SELECT "2c", "实验组:DNN模型", "20260413", "29991231"
+
+    UNION ALL SELECT "89", "实验组:DNN模型", "20260413", "29991231"
+
+    UNION ALL SELECT "89", "对照组", "20260301", "20260412"
+
+    -- ────────────────────────────────────────────────────────────────────
+    -- 📖 修改样例(复制下面的行到上面 UNION ALL 列表里使用)
+    --
+    -- 样例 A:新增一个占用单桶的实验
+    --   UNION ALL SELECT "2c", "实验组:新策略 X", "20260501", "29991231"
+    --
+    -- 样例 B:新增一个 1 对多 实验(同一实验占 01 + 34 两个桶)
+    --   用同一 abcode 字符串加两行即可,下游 GROUP BY 自动合并:
+    --   UNION ALL SELECT "01", "实验组:大流量 Y", "20260601", "29991231"
+    --   UNION ALL SELECT "34", "实验组:大流量 Y", "20260601", "29991231"
+    --
+    -- 样例 C:实验切换(SCD Type 2 —— 保留历史)
+    --   假设 01 桶 20260701 从 实验 A 切换到 实验 B:
+    --   Step 1: 把原来那行 end_dt 改成切换前一天:
+    --     SELECT "01", "实验组:A", "20250101", "20260630"
+    --   Step 2: 追加新实验行:
+    --     UNION ALL SELECT "01", "实验组:B", "20260701", "29991231"
+    --
+    -- 样例 D:实验下线回到对照组(产生空窗)
+    --   直接把该行的 end_dt 改成下线前一天即可(不用追加行):
+    --     SELECT "5d", "实验组:A", "20250101", "20260630"
+    --   20260701 之后 5d 桶没有任何有效行覆盖,自动进入"对照组"
+    -- ────────────────────────────────────────────────────────────────────
+)
+,t_base AS
+(
+    SELECT  sub.*
+            ,sg.suffix_group
+            ,COALESCE(m.abcode,"对照组") AS abcode
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,SUBSTR(GET_JSON_OBJECT(extend,'$.rootsessionid'),LENGTH(GET_JSON_OBJECT(extend,'$.rootsessionid')),1) AS suffix
+                        ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                                WHEN page IN ("回流页","其他") THEN "非推荐"
+                                ELSE "其他"
+                        END AS page
+                        ,a.mid
+                        ,a.vid
+                        ,is_share
+                        ,share_cnt
+                        ,is_return_1
+                        ,is_return_n
+                        ,is_return_noself
+                        ,return_1_uv
+                        ,return_n_uv
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        ,flowpool
+                        ,cc.cn
+                        ,cc.c1
+                        ,dd.dn
+                        ,dd.d1
+                FROM    loghubods.dwd_recsys_alg_exposure_base_20250108 a
+                LEFT JOIN   (
+                                -- c1/cn:分享后被点击的回流 UV
+                                SELECT  a.machinecode AS mid
+                                        ,a.subsessionid
+                                        ,a.videoid AS vid
+                                        ,COUNT(DISTINCT CASE WHEN b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS cn
+                                        ,COUNT(DISTINCT CASE WHEN b2.sharedepth = 1 AND b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS c1
+                                FROM    (
+                                            SELECT  DISTINCT machinecode
+                                                    ,shareobjectid AS videoid
+                                                    ,recomTraceId
+                                                    ,subsessionid
+                                                    ,sharedepth
+                                                    ,shareid
+                                            FROM    loghubods.user_share_log
+                                            WHERE   dt = '${dt}'
+                                            AND     topic = 'share'
+                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                        ) a
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b
+                                ON      a.shareid = b.rootshareid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,shareobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'share'
+                                                AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                            ) b1
+                                ON      b.machinecode = b1.machinecode
+                                AND     b.subsessionid = b1.subsessionid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b2
+                                ON      b1.shareid = b2.rootshareid
+                                GROUP BY a.machinecode
+                                         ,a.subsessionid
+                                         ,a.videoid
+                            ) cc
+                ON      a.mid = cc.mid
+                AND     a.subsessionid = cc.subsessionid
+                AND     a.vid = cc.vid
+                LEFT JOIN   (
+                                -- d1/dn:下一条视频带来的回流
+                                SELECT  *
+                                        ,LAG(回流,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS dn
+                                        ,LAG(回流1,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS d1
+                                FROM    (
+                                            SELECT  a.mid AS mid
+                                                    ,a.subsessionid
+                                                    ,a.videoid AS vid
+                                                    ,COUNT(DISTINCT b.shareid) AS 分享次数
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode THEN c.machinecode END) AS 回流
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode AND c.sharedepth = 1 THEN c.machinecode END) AS 回流1
+                                                    ,ROW_NUMBER() OVER (PARTITION BY a.subsessionid ORDER BY a.logtimestamp ASC) AS rn
+                                            FROM    (
+                                                        SELECT  *
+                                                        FROM    (
+                                                                    SELECT  DISTINCT mid
+                                                                            ,subsessionid
+                                                                            ,videoid
+                                                                            ,logtimestamp
+                                                                            ,ROW_NUMBER() OVER (PARTITION BY mid,subsessionid,videoid ORDER BY logtimestamp ASC) AS rn
+                                                                    FROM    loghubods.video_action_log_rp
+                                                                    WHERE   dt = '${dt}'
+                                                                    AND     businesstype = 'videoView'
+                                                                    AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                                )
+                                                        WHERE   rn = 1
+                                                    ) a
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,shareobjectid AS videoid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,shareid
+                                                                    ,clienttimestamp
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'share'
+                                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                        ) b
+                                            ON      a.mid = b.machinecode
+                                            AND     a.subsessionid = b.subsessionid
+                                            AND     a.videoid = b.videoid
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,clickobjectid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,rootshareid
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'click'
+                                                        ) c
+                                            ON      b.shareid = c.rootshareid
+                                            GROUP BY a.mid
+                                                     ,a.subsessionid
+                                                     ,a.videoid
+                                                     ,a.logtimestamp
+                                        )
+                            ) dd
+                ON      a.mid = dd.mid
+                AND     a.subsessionid = dd.subsessionid
+                AND     a.vid = dd.vid
+                WHERE   dt="${dt}"
+                AND     apptype IN ("0")
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+                AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab8","ab9")
+                AND     abcode NOT IN ("ab100")
+            ) sub
+    -- INNER JOIN: 合法尾号(在 16 个 hex 里)才进分析;防御异常数据
+    INNER JOIN t_suffix_group sg
+    ON      sub.suffix = sg.suffix
+    -- LEFT JOIN: 可无实验匹配,此时 m.abcode 为 NULL → COALESCE 为"对照组"
+    LEFT JOIN t_experiment_map m
+    ON      sg.suffix_group = m.suffix_group
+    AND     '${dt}' BETWEEN m.start_dt AND m.end_dt
+)
+-- 桶内每个 vid 的曝光数(ECS / ARP 的共同中间件,避免重复扫 t_base)
+,t_vid_exp AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix
+             ,vid
+)
+-- 桶内 ECS (Effective Catalog Size):曝光实际"相当于推了多少条视频"
+-- ECS = 2 * Σ(p_i * rank_i) - 1
+--   p_i    = vid 在桶内曝光占比
+--   rank_i = 按曝光降序的排名(1 起)
+-- 值域 [1, distinct_vid_cnt],越大越分散,越小越头部集中
+,t_bucket_ecs AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix
+            ,2 * SUM(p * rn) - 1 AS ecs
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,abcode
+                        ,suffix
+                        ,vid_exp_cnt / SUM(vid_exp_cnt) OVER (
+                            PARTITION BY dt, apptype, abcode, suffix
+                        ) AS p
+                        ,ROW_NUMBER() OVER (
+                            PARTITION BY dt, apptype, abcode, suffix
+                            ORDER BY vid_exp_cnt DESC
+                        ) AS rn
+                FROM    t_vid_exp
+            ) t
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix
+)
+-- 全平台每个 vid 的曝光度(作为 ARP 的 popularity reference)
+-- 注意:不过滤 abcode,让 reference 覆盖全部合法尾号
+,t_vid_global_pop AS
+(
+    SELECT  dt
+            ,apptype
+            ,vid
+            ,COUNT(1) AS vid_global_pop
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,vid
+)
+-- 桶内 ARP (Average Recommendation Popularity):推荐视频的平均热门度
+-- 按桶内曝光量加权:曝光越多的 vid 对 ARP 影响越大
+-- 组合 ECS + ARP 可识别四象限:
+--   高ECS + 低ARP = 分散 + 偏长尾        ✅ 理想
+--   高ECS + 高ARP = 分散 + 头部内部多样化  ⚠️ 需警惕
+--   低ECS + 低ARP = 集中 + 冷门(小众爆发) ❓ 特殊
+--   低ECS + 高ARP = 集中 + 头部            ❌ 模型坍缩
+,t_bucket_arp AS
+(
+    SELECT  v.dt
+            ,v.apptype
+            ,v.abcode
+            ,v.suffix
+            ,SUM(v.vid_exp_cnt * g.vid_global_pop) / SUM(v.vid_exp_cnt) AS arp
+    FROM    t_vid_exp v
+    LEFT JOIN t_vid_global_pop g
+    ON      v.dt = g.dt
+    AND     v.apptype = g.apptype
+    AND     v.vid = g.vid
+    GROUP BY v.dt
+             ,v.apptype
+             ,v.abcode
+             ,v.suffix
+)
+-- dau2:按单尾号聚合
+,t_dau2_bucket AS
+(
+    SELECT  SUBSTR(sub.dt,1,8) AS dt
+            ,sub.apptype
+            ,COALESCE(m.abcode,"对照组") AS abcode
+            ,sg.suffix_group
+            ,sub.suffix
+            ,COUNT(DISTINCT sub.machinecode) AS dau2
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,machinecode
+                        ,SUBSTR(GET_JSON_OBJECT(extparams,'$.rootSessionId'),LENGTH(GET_JSON_OBJECT(extparams,'$.rootSessionId')),1) AS suffix
+                FROM    loghubods.useractive_log
+                WHERE   dt="${dt}"
+                -- FROM    loghubods.useractive_log_per5min
+                -- WHERE   dt BETWEEN CONCAT("${dt}","000000") AND CONCAT("${dt}","235500")
+                AND     apptype IN ("0")
+                AND     GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003') IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+                AND     GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003') NOT IN ("ab100")
+            ) sub
+    INNER JOIN t_suffix_group sg
+    ON      sub.suffix = sg.suffix
+    LEFT JOIN t_experiment_map m
+    ON      sg.suffix_group = m.suffix_group
+    AND     '${dt}' BETWEEN m.start_dt AND m.end_dt
+    GROUP BY SUBSTR(sub.dt,1,8)
+             ,sub.apptype
+             ,COALESCE(m.abcode,"对照组")
+             ,sg.suffix_group
+             ,sub.suffix
+)
+-- dau2:按 suffix_group 求尾号均值
+,t_dau2 AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix_group
+            ,AVG(dau2) AS dau2
+    FROM    t_dau2_bucket
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix_group
+)
+-- 按单尾号聚合(尾号内 UV 去重)
+,t_bucket AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix_group
+            ,suffix
+            ,COALESCE(COUNT(1) / COUNT(DISTINCT mid),0) AS exp_per_dau
+            ,COALESCE(SUM(is_share) / COUNT(1),0) AS str_one
+            ,COALESCE(SUM(return_n_uv) / SUM(is_share),0) AS ros_one
+            ,COALESCE(SUM(share_cnt) / COUNT(1),0) AS str
+            ,COALESCE(SUM(return_n_uv) / SUM(share_cnt),0) AS ros
+            ,COALESCE(SUM(is_return_1) / COUNT(1),0) AS str_plus
+            ,COALESCE(SUM(return_n_uv) / SUM(is_return_1),0) AS ros_minus
+            ,COALESCE(SUM(return_n_uv) / COUNT(1),0) AS bn_rov
+            ,COALESCE(SUM(c1) / COUNT(1),0) AS c1_rov
+            ,COALESCE(SUM(cn) / COUNT(1),0) AS cn_rov
+            ,COALESCE(SUM(d1) / COUNT(1),0) AS d1_rov
+            ,COALESCE(SUM(dn) / COUNT(1),0) AS dn_rov
+            -- 合并 ROV = bn_rov + cn_rov + dn_rov(三者分母同为 COUNT(1),可合并)
+            ,COALESCE((SUM(return_n_uv) + SUM(cn) + SUM(dn)) / COUNT(1),0) AS total_rov
+            ,COALESCE(SUM(new_exposure_cnt) / COUNT(1),0) AS vovh24
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            -- 桶内去重 vid 数(ECS 的天然配套)
+            ,COUNT(DISTINCT vid) AS distinct_vid_cnt
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+            ,COALESCE(SUM(cn),0) AS cn
+            ,COALESCE(SUM(c1),0) AS c1
+            ,COALESCE(SUM(dn),0) AS dn
+            ,COALESCE(SUM(d1),0) AS d1
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix_group
+             ,suffix
+)
+-- 按 suffix_group 求尾号均值(含合并 ROV + 分发多样性三件套)
+,t_metrics AS
+(
+    SELECT  b.dt
+            ,b.apptype
+            ,b.abcode
+            ,b.suffix_group
+            ,ROUND(AVG(b.exp_per_dau),2) AS exp_per_dau
+            ,ROUND(AVG(b.str_one),6) AS str_one
+            ,ROUND(AVG(b.ros_one),6) AS ros_one
+            ,ROUND(AVG(b.str),6) AS str
+            ,ROUND(AVG(b.ros),6) AS ros
+            ,ROUND(AVG(b.str_plus),6) AS str_plus
+            ,ROUND(AVG(b.ros_minus),6) AS ros_minus
+            ,ROUND(AVG(b.bn_rov),6) AS bn_rov
+            ,ROUND(AVG(b.c1_rov),6) AS c1_rov
+            ,ROUND(AVG(b.cn_rov),6) AS cn_rov
+            ,ROUND(AVG(b.d1_rov),6) AS d1_rov
+            ,ROUND(AVG(b.dn_rov),6) AS dn_rov
+            ,ROUND(AVG(b.total_rov),6) AS total_rov
+            ,ROUND(AVG(b.vovh24),6) AS vovh24
+            ,AVG(b.dau) AS dau
+            ,AVG(b.exp) AS exp
+            ,ROUND(AVG(b.distinct_vid_cnt),0) AS distinct_vid_cnt
+            ,ROUND(AVG(e.ecs),1) AS ecs
+            -- ECS 归一化比值:去掉池子大小的影响,纯形态指标
+            ,ROUND(AVG(e.ecs) / NULLIF(AVG(b.distinct_vid_cnt),0),6) AS ecs_ratio
+            -- gini:此处定义为 1 - ecs_ratio,作为分发集中度的近似度量(与严格意义的 Gini 系数相关但不完全等价)
+            ,ROUND(1 - AVG(e.ecs) / NULLIF(AVG(b.distinct_vid_cnt),0),6) AS gini
+            ,ROUND(AVG(a.arp),0) AS arp
+            ,AVG(b.is_share) AS is_share
+            ,AVG(b.share_cnt) AS share_cnt
+            ,AVG(b.is_return_1) AS is_return_1
+            ,AVG(b.return_n_uv) AS return_n_uv
+            ,AVG(b.viewh24) AS viewh24
+            ,AVG(b.return_n_uv_noself) AS return_n_uv_noself
+            ,AVG(b.cn) AS cn
+            ,AVG(b.c1) AS c1
+            ,AVG(b.dn) AS dn
+            ,AVG(b.d1) AS d1
+            ,WM_CONCAT(DISTINCT ',',b.suffix) AS suffix
+    FROM    t_bucket b
+    LEFT JOIN t_bucket_ecs e
+    ON      b.dt = e.dt
+    AND     b.apptype = e.apptype
+    AND     b.abcode = e.abcode
+    AND     b.suffix = e.suffix
+    LEFT JOIN t_bucket_arp a
+    ON      b.dt = a.dt
+    AND     b.apptype = a.apptype
+    AND     b.abcode = a.abcode
+    AND     b.suffix = a.suffix
+    GROUP BY b.dt
+             ,b.apptype
+             ,b.abcode
+             ,b.suffix_group
+)
+SELECT  a.dt
+        ,a.apptype
+        ,a.abcode
+        ,a.suffix_group
+        ,a.suffix
+        ,a.exp_per_dau
+        ,a.str_one
+        ,a.ros_one
+        ,a.str
+        ,a.ros
+        ,a.str_plus
+        ,a.ros_minus
+        ,a.bn_rov
+        ,a.c1_rov
+        ,a.cn_rov
+        ,a.d1_rov
+        ,a.dn_rov
+        ,a.total_rov
+        ,a.vovh24
+        ,a.dau
+        ,a.exp
+        ,a.distinct_vid_cnt
+        ,a.ecs
+        ,a.ecs_ratio
+        ,a.gini
+        ,a.arp
+        ,a.is_share
+        ,a.share_cnt
+        ,a.is_return_1
+        ,a.return_n_uv
+        ,a.viewh24
+        ,a.return_n_uv_noself
+        ,a.cn
+        ,a.c1
+        ,a.dn
+        ,a.d1
+        ,b.dau2
+FROM    t_metrics a
+LEFT JOIN t_dau2 b
+ON      a.dt = b.dt
+AND     a.apptype = b.apptype
+AND     a.abcode = b.abcode
+AND     a.suffix_group = b.suffix_group
+ORDER BY a.dt DESC,a.apptype,a.abcode,a.suffix_group
+;

+ 11 - 0
tasks/00_尾号实验/base_v4_v1_new_v2.json

@@ -0,0 +1,11 @@
+{
+  "token": "ONZqsxB9BhGH8tt90EScSJT5nHh",
+  "sheet_id": "vJQSTF",
+  "sort": "dt:desc,suffix_group:asc",
+  "order": {
+      "suffix_group": ["ab", "34", "2c", "67", "01", "5d", "ef", "89"]
+
+  },
+  "cols": null,
+  "append_cols": false
+}

+ 537 - 0
tasks/00_尾号实验/base_v4_v1_new_v2.sql

@@ -0,0 +1,537 @@
+-- ════════════════════════════════════════════════════════════════════════════
+-- 两层尾号映射 (SCD Type 2 模式) — apptype = 0
+--
+-- 第一层 t_suffix_group:物理尾号 → 分流桶 ID(16 个 hex 尾号分成 8 个 2-元桶)
+--   - 分流规则不变时,此层永不改
+--
+-- 第二层 t_experiment_map:分流桶 → 实验名 + 生效日期
+--   - 只列出"分配了具体实验"的桶,未列出的桶自动默认为"对照组"
+--   - 支持 1 对多:同一个实验占多个桶时,用同一 abcode 字符串多加几行
+--   - 实验切换:不删旧行,关闭 end_dt + 追加新行(保留历史可回溯)
+-- ════════════════════════════════════════════════════════════════════════════
+WITH t_suffix_group AS
+(
+    SELECT "a" AS suffix, "ab" AS suffix_group
+    UNION ALL SELECT "b", "ab"
+    UNION ALL SELECT "0", "01"
+    UNION ALL SELECT "1", "01"
+    UNION ALL SELECT "2", "2c"
+    UNION ALL SELECT "c", "2c"
+    UNION ALL SELECT "3", "34"
+    UNION ALL SELECT "4", "34"
+    UNION ALL SELECT "5", "5d"
+    UNION ALL SELECT "d", "5d"
+    UNION ALL SELECT "6", "67"
+    UNION ALL SELECT "7", "67"
+    UNION ALL SELECT "8", "89"
+    UNION ALL SELECT "9", "89"
+    UNION ALL SELECT "e", "ef"
+    UNION ALL SELECT "f", "ef"
+)
+-- 当前实验映射 (apptype = 0)
+--   当前 8 个桶均已显式列出;若某桶在某日期没有任何有效行覆盖,自动默认为"对照组"
+--   同一个 suffix_group 可以有多行(SCD Type 2),但同一时间只能命中一行
+--   注意:start_dt/end_dt 为闭区间,按 '${dt}' BETWEEN start_dt AND end_dt 匹配
+,t_experiment_map AS
+(
+    -- 前基线(ab 桶)
+    SELECT "ab" AS suffix_group, "实验组:变更str*ros建模目标实验" AS abcode, "20260413" AS start_dt, "29991231" AS end_dt
+   
+    -- 建模目标实验
+    UNION ALL SELECT "01", "实验组:变更str*ros建模目标实验", "20260320", "29991231"
+
+    -- bn_ros 新损失函数
+    UNION ALL SELECT "34", "实验组:变更str*ros建模目标实验", "20260330", "29991231"
+
+    -- cn_rov 实验
+    UNION ALL SELECT "67", "实验组:变更str*ros建模目标实验", "20260330", "29991231"
+
+    -- 解构特征排序 str 模型
+    UNION ALL SELECT "5d", "实验组:变更str*ros建模目标实验", "20260407", "29991231"
+
+    UNION ALL SELECT "ef", "实验组:DNN模型-调参", "20260410", "29991231"
+
+    UNION ALL SELECT "2c", "实验组:DNN模型", "20260413", "29991231"
+
+    UNION ALL SELECT "89", "实验组:变更str*ros建模目标实验", "20260413", "29991231"
+
+    UNION ALL SELECT "89", "对照组", "20260301", "20260412"
+
+    -- ────────────────────────────────────────────────────────────────────
+    -- 📖 修改样例(复制下面的行到上面 UNION ALL 列表里使用)
+    --
+    -- 样例 A:新增一个占用单桶的实验
+    --   UNION ALL SELECT "2c", "实验组:新策略 X", "20260501", "29991231"
+    --
+    -- 样例 B:新增一个 1 对多 实验(同一实验占 01 + 34 两个桶)
+    --   用同一 abcode 字符串加两行即可,下游 GROUP BY 自动合并:
+    --   UNION ALL SELECT "01", "实验组:大流量 Y", "20260601", "29991231"
+    --   UNION ALL SELECT "34", "实验组:大流量 Y", "20260601", "29991231"
+    --
+    -- 样例 C:实验切换(SCD Type 2 —— 保留历史)
+    --   假设 01 桶 20260701 从 实验 A 切换到 实验 B:
+    --   Step 1: 把原来那行 end_dt 改成切换前一天:
+    --     SELECT "01", "实验组:A", "20250101", "20260630"
+    --   Step 2: 追加新实验行:
+    --     UNION ALL SELECT "01", "实验组:B", "20260701", "29991231"
+    --
+    -- 样例 D:实验下线回到对照组(产生空窗)
+    --   直接把该行的 end_dt 改成下线前一天即可(不用追加行):
+    --     SELECT "5d", "实验组:A", "20250101", "20260630"
+    --   20260701 之后 5d 桶没有任何有效行覆盖,自动进入"对照组"
+    -- ────────────────────────────────────────────────────────────────────
+)
+,t_base AS
+(
+    SELECT  sub.*
+            ,sg.suffix_group
+            ,COALESCE(m.abcode,"对照组") AS abcode
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,SUBSTR(GET_JSON_OBJECT(extend,'$.rootsessionid'),LENGTH(GET_JSON_OBJECT(extend,'$.rootsessionid')),1) AS suffix
+                        ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                                WHEN page IN ("回流页","其他") THEN "非推荐"
+                                ELSE "其他"
+                        END AS page
+                        ,a.mid
+                        ,a.vid
+                        ,is_share
+                        ,share_cnt
+                        ,is_return_1
+                        ,is_return_n
+                        ,is_return_noself
+                        ,return_1_uv
+                        ,return_n_uv
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        ,flowpool
+                        ,cc.cn
+                        ,cc.c1
+                        ,dd.dn
+                        ,dd.d1
+                FROM    loghubods.dwd_recsys_alg_exposure_base_20250108 a
+                LEFT JOIN   (
+                                -- c1/cn:分享后被点击的回流 UV
+                                SELECT  a.machinecode AS mid
+                                        ,a.subsessionid
+                                        ,a.videoid AS vid
+                                        ,COUNT(DISTINCT CASE WHEN b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS cn
+                                        ,COUNT(DISTINCT CASE WHEN b2.sharedepth = 1 AND b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS c1
+                                FROM    (
+                                            SELECT  DISTINCT machinecode
+                                                    ,shareobjectid AS videoid
+                                                    ,recomTraceId
+                                                    ,subsessionid
+                                                    ,sharedepth
+                                                    ,shareid
+                                            FROM    loghubods.user_share_log
+                                            WHERE   dt = '${dt}'
+                                            AND     topic = 'share'
+                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                        ) a
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b
+                                ON      a.shareid = b.rootshareid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,shareobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'share'
+                                                AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                            ) b1
+                                ON      b.machinecode = b1.machinecode
+                                AND     b.subsessionid = b1.subsessionid
+                                LEFT JOIN   (
+                                                SELECT  DISTINCT machinecode
+                                                        ,clickobjectid
+                                                        ,recomTraceId
+                                                        ,subsessionid
+                                                        ,sharedepth
+                                                        ,shareid
+                                                        ,rootshareid
+                                                FROM    loghubods.user_share_log
+                                                WHERE   dt = '${dt}'
+                                                AND     topic = 'click'
+                                            ) b2
+                                ON      b1.shareid = b2.rootshareid
+                                GROUP BY a.machinecode
+                                         ,a.subsessionid
+                                         ,a.videoid
+                            ) cc
+                ON      a.mid = cc.mid
+                AND     a.subsessionid = cc.subsessionid
+                AND     a.vid = cc.vid
+                LEFT JOIN   (
+                                -- d1/dn:下一条视频带来的回流
+                                SELECT  *
+                                        ,LAG(回流,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS dn
+                                        ,LAG(回流1,1,0) OVER (PARTITION BY mid,subsessionid ORDER BY rn DESC) AS d1
+                                FROM    (
+                                            SELECT  a.mid AS mid
+                                                    ,a.subsessionid
+                                                    ,a.videoid AS vid
+                                                    ,COUNT(DISTINCT b.shareid) AS 分享次数
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode THEN c.machinecode END) AS 回流
+                                                    ,COUNT(DISTINCT CASE WHEN c.machinecode <> b.machinecode AND c.sharedepth = 1 THEN c.machinecode END) AS 回流1
+                                                    ,ROW_NUMBER() OVER (PARTITION BY a.subsessionid ORDER BY a.logtimestamp ASC) AS rn
+                                            FROM    (
+                                                        SELECT  *
+                                                        FROM    (
+                                                                    SELECT  DISTINCT mid
+                                                                            ,subsessionid
+                                                                            ,videoid
+                                                                            ,logtimestamp
+                                                                            ,ROW_NUMBER() OVER (PARTITION BY mid,subsessionid,videoid ORDER BY logtimestamp ASC) AS rn
+                                                                    FROM    loghubods.video_action_log_rp
+                                                                    WHERE   dt = '${dt}'
+                                                                    AND     businesstype = 'videoView'
+                                                                    AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                                )
+                                                        WHERE   rn = 1
+                                                    ) a
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,shareobjectid AS videoid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,shareid
+                                                                    ,clienttimestamp
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'share'
+                                                            AND     pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
+                                                        ) b
+                                            ON      a.mid = b.machinecode
+                                            AND     a.subsessionid = b.subsessionid
+                                            AND     a.videoid = b.videoid
+                                            LEFT JOIN   (
+                                                            SELECT  DISTINCT machinecode
+                                                                    ,clickobjectid
+                                                                    ,recomTraceId
+                                                                    ,subsessionid
+                                                                    ,sharedepth
+                                                                    ,rootshareid
+                                                            FROM    loghubods.user_share_log
+                                                            WHERE   dt = '${dt}'
+                                                            AND     topic = 'click'
+                                                        ) c
+                                            ON      b.shareid = c.rootshareid
+                                            GROUP BY a.mid
+                                                     ,a.subsessionid
+                                                     ,a.videoid
+                                                     ,a.logtimestamp
+                                        )
+                            ) dd
+                ON      a.mid = dd.mid
+                AND     a.subsessionid = dd.subsessionid
+                AND     a.vid = dd.vid
+                WHERE   dt="${dt}"
+                AND     apptype IN ("0")
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+                AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab8","ab9")
+                AND     abcode NOT IN ("ab100")
+            ) sub
+    -- INNER JOIN: 合法尾号(在 16 个 hex 里)才进分析;防御异常数据
+    INNER JOIN t_suffix_group sg
+    ON      sub.suffix = sg.suffix
+    -- LEFT JOIN: 可无实验匹配,此时 m.abcode 为 NULL → COALESCE 为"对照组"
+    LEFT JOIN t_experiment_map m
+    ON      sg.suffix_group = m.suffix_group
+    AND     '${dt}' BETWEEN m.start_dt AND m.end_dt
+)
+-- 桶内每个 vid 的曝光数(ECS / ARP 的共同中间件,避免重复扫 t_base)
+,t_vid_exp AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix
+             ,vid
+)
+-- 桶内 ECS (Effective Catalog Size):曝光实际"相当于推了多少条视频"
+-- ECS = 2 * Σ(p_i * rank_i) - 1
+--   p_i    = vid 在桶内曝光占比
+--   rank_i = 按曝光降序的排名(1 起)
+-- 值域 [1, distinct_vid_cnt],越大越分散,越小越头部集中
+,t_bucket_ecs AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix
+            ,2 * SUM(p * rn) - 1 AS ecs
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,abcode
+                        ,suffix
+                        ,vid_exp_cnt / SUM(vid_exp_cnt) OVER (
+                            PARTITION BY dt, apptype, abcode, suffix
+                        ) AS p
+                        ,ROW_NUMBER() OVER (
+                            PARTITION BY dt, apptype, abcode, suffix
+                            ORDER BY vid_exp_cnt DESC
+                        ) AS rn
+                FROM    t_vid_exp
+            ) t
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix
+)
+-- 全平台每个 vid 的曝光度(作为 ARP 的 popularity reference)
+-- 注意:不过滤 abcode,让 reference 覆盖全部合法尾号
+,t_vid_global_pop AS
+(
+    SELECT  dt
+            ,apptype
+            ,vid
+            ,COUNT(1) AS vid_global_pop
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,vid
+)
+-- 桶内 ARP (Average Recommendation Popularity):推荐视频的平均热门度
+-- 按桶内曝光量加权:曝光越多的 vid 对 ARP 影响越大
+-- 组合 ECS + ARP 可识别四象限:
+--   高ECS + 低ARP = 分散 + 偏长尾        ✅ 理想
+--   高ECS + 高ARP = 分散 + 头部内部多样化  ⚠️ 需警惕
+--   低ECS + 低ARP = 集中 + 冷门(小众爆发) ❓ 特殊
+--   低ECS + 高ARP = 集中 + 头部            ❌ 模型坍缩
+,t_bucket_arp AS
+(
+    SELECT  v.dt
+            ,v.apptype
+            ,v.abcode
+            ,v.suffix
+            ,SUM(v.vid_exp_cnt * g.vid_global_pop) / SUM(v.vid_exp_cnt) AS arp
+    FROM    t_vid_exp v
+    LEFT JOIN t_vid_global_pop g
+    ON      v.dt = g.dt
+    AND     v.apptype = g.apptype
+    AND     v.vid = g.vid
+    GROUP BY v.dt
+             ,v.apptype
+             ,v.abcode
+             ,v.suffix
+)
+-- dau2:按单尾号聚合
+,t_dau2_bucket AS
+(
+    SELECT  SUBSTR(sub.dt,1,8) AS dt
+            ,sub.apptype
+            ,COALESCE(m.abcode,"对照组") AS abcode
+            ,sg.suffix_group
+            ,sub.suffix
+            ,COUNT(DISTINCT sub.machinecode) AS dau2
+    FROM    (
+                SELECT  dt
+                        ,apptype
+                        ,machinecode
+                        ,SUBSTR(GET_JSON_OBJECT(extparams,'$.rootSessionId'),LENGTH(GET_JSON_OBJECT(extparams,'$.rootSessionId')),1) AS suffix
+                FROM    loghubods.useractive_log
+                WHERE   dt="${dt}"
+                -- FROM    loghubods.useractive_log_per5min
+                -- WHERE   dt BETWEEN CONCAT("${dt}","000000") AND CONCAT("${dt}","235500")
+                AND     apptype IN ("0")
+                AND     GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003') IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+                AND     GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003') NOT IN ("ab100")
+            ) sub
+    INNER JOIN t_suffix_group sg
+    ON      sub.suffix = sg.suffix
+    LEFT JOIN t_experiment_map m
+    ON      sg.suffix_group = m.suffix_group
+    AND     '${dt}' BETWEEN m.start_dt AND m.end_dt
+    GROUP BY SUBSTR(sub.dt,1,8)
+             ,sub.apptype
+             ,COALESCE(m.abcode,"对照组")
+             ,sg.suffix_group
+             ,sub.suffix
+)
+-- dau2:按 suffix_group 求尾号均值
+,t_dau2 AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix_group
+            ,AVG(dau2) AS dau2
+    FROM    t_dau2_bucket
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix_group
+)
+-- 按单尾号聚合(尾号内 UV 去重)
+,t_bucket AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,suffix_group
+            ,suffix
+            ,COALESCE(COUNT(1) / COUNT(DISTINCT mid),0) AS exp_per_dau
+            ,COALESCE(SUM(is_share) / COUNT(1),0) AS str_one
+            ,COALESCE(SUM(return_n_uv) / SUM(is_share),0) AS ros_one
+            ,COALESCE(SUM(share_cnt) / COUNT(1),0) AS str
+            ,COALESCE(SUM(return_n_uv) / SUM(share_cnt),0) AS ros
+            ,COALESCE(SUM(is_return_1) / COUNT(1),0) AS str_plus
+            ,COALESCE(SUM(return_n_uv) / SUM(is_return_1),0) AS ros_minus
+            ,COALESCE(SUM(return_n_uv) / COUNT(1),0) AS bn_rov
+            ,COALESCE(SUM(c1) / COUNT(1),0) AS c1_rov
+            ,COALESCE(SUM(cn) / COUNT(1),0) AS cn_rov
+            ,COALESCE(SUM(d1) / COUNT(1),0) AS d1_rov
+            ,COALESCE(SUM(dn) / COUNT(1),0) AS dn_rov
+            -- 合并 ROV = bn_rov + cn_rov + dn_rov(三者分母同为 COUNT(1),可合并)
+            ,COALESCE((SUM(return_n_uv) + SUM(cn) + SUM(dn)) / COUNT(1),0) AS total_rov
+            ,COALESCE(SUM(new_exposure_cnt) / COUNT(1),0) AS vovh24
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            -- 桶内去重 vid 数(ECS 的天然配套)
+            ,COUNT(DISTINCT vid) AS distinct_vid_cnt
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+            ,COALESCE(SUM(cn),0) AS cn
+            ,COALESCE(SUM(c1),0) AS c1
+            ,COALESCE(SUM(dn),0) AS dn
+            ,COALESCE(SUM(d1),0) AS d1
+    FROM    t_base
+    WHERE   page = "推荐"
+    GROUP BY dt
+             ,apptype
+             ,abcode
+             ,suffix_group
+             ,suffix
+)
+-- 按 suffix_group 求尾号均值(含合并 ROV + 分发多样性三件套)
+,t_metrics AS
+(
+    SELECT  b.dt
+            ,b.apptype
+            ,b.abcode
+            ,b.suffix_group
+            ,ROUND(AVG(b.exp_per_dau),2) AS exp_per_dau
+            ,ROUND(AVG(b.str_one),6) AS str_one
+            ,ROUND(AVG(b.ros_one),6) AS ros_one
+            ,ROUND(AVG(b.str),6) AS str
+            ,ROUND(AVG(b.ros),6) AS ros
+            ,ROUND(AVG(b.str_plus),6) AS str_plus
+            ,ROUND(AVG(b.ros_minus),6) AS ros_minus
+            ,ROUND(AVG(b.bn_rov),6) AS bn_rov
+            ,ROUND(AVG(b.c1_rov),6) AS c1_rov
+            ,ROUND(AVG(b.cn_rov),6) AS cn_rov
+            ,ROUND(AVG(b.d1_rov),6) AS d1_rov
+            ,ROUND(AVG(b.dn_rov),6) AS dn_rov
+            ,ROUND(AVG(b.total_rov),6) AS total_rov
+            ,ROUND(AVG(b.vovh24),6) AS vovh24
+            ,AVG(b.dau) AS dau
+            ,AVG(b.exp) AS exp
+            ,ROUND(AVG(b.distinct_vid_cnt),0) AS distinct_vid_cnt
+            ,ROUND(AVG(e.ecs),1) AS ecs
+            -- ECS 归一化比值:去掉池子大小的影响,纯形态指标
+            ,ROUND(AVG(e.ecs) / NULLIF(AVG(b.distinct_vid_cnt),0),6) AS ecs_ratio
+            -- Gini 系数:快手/Twitter/Netflix 业界标准,数学上 Gini = 1 - ecs_ratio
+            ,ROUND(1 - AVG(e.ecs) / NULLIF(AVG(b.distinct_vid_cnt),0),6) AS gini
+            ,ROUND(AVG(a.arp),0) AS arp
+            ,AVG(b.is_share) AS is_share
+            ,AVG(b.share_cnt) AS share_cnt
+            ,AVG(b.is_return_1) AS is_return_1
+            ,AVG(b.return_n_uv) AS return_n_uv
+            ,AVG(b.viewh24) AS viewh24
+            ,AVG(b.return_n_uv_noself) AS return_n_uv_noself
+            ,AVG(b.cn) AS cn
+            ,AVG(b.c1) AS c1
+            ,AVG(b.dn) AS dn
+            ,AVG(b.d1) AS d1
+            ,WM_CONCAT(DISTINCT ',',b.suffix) AS suffix
+    FROM    t_bucket b
+    LEFT JOIN t_bucket_ecs e
+    ON      b.dt = e.dt
+    AND     b.apptype = e.apptype
+    AND     b.abcode = e.abcode
+    AND     b.suffix = e.suffix
+    LEFT JOIN t_bucket_arp a
+    ON      b.dt = a.dt
+    AND     b.apptype = a.apptype
+    AND     b.abcode = a.abcode
+    AND     b.suffix = a.suffix
+    GROUP BY b.dt
+             ,b.apptype
+             ,b.abcode
+             ,b.suffix_group
+)
+SELECT  a.dt
+        ,a.apptype
+        ,a.abcode
+        ,a.suffix_group
+        ,a.suffix
+        ,a.exp_per_dau
+        ,a.str_one
+        ,a.ros_one
+        ,a.str
+        ,a.ros
+        ,a.str_plus
+        ,a.ros_minus
+        ,a.bn_rov
+        ,a.c1_rov
+        ,a.cn_rov
+        ,a.d1_rov
+        ,a.dn_rov
+        ,a.total_rov
+        ,a.vovh24
+        ,a.dau
+        ,a.exp
+        ,a.distinct_vid_cnt
+        ,a.ecs
+        ,a.ecs_ratio
+        ,a.gini
+        ,a.arp
+        ,a.is_share
+        ,a.share_cnt
+        ,a.is_return_1
+        ,a.return_n_uv
+        ,a.viewh24
+        ,a.return_n_uv_noself
+        ,a.cn
+        ,a.c1
+        ,a.dn
+        ,a.d1
+        ,b.dau2
+FROM    t_metrics a
+LEFT JOIN t_dau2 b
+ON      a.dt = b.dt
+AND     a.apptype = b.apptype
+AND     a.abcode = b.abcode
+AND     a.suffix_group = b.suffix_group
+ORDER BY a.dt DESC,a.apptype,a.abcode,a.suffix_group
+;

+ 313 - 0
tasks/00_尾号实验/指标计算公式_ecs_arp.md

@@ -0,0 +1,313 @@
+# ECS / ecs_ratio / ARP 计算公式与举例
+
+---
+
+## 一、ECS(Effective Catalog Size)— 有效视频数
+
+### 含义
+
+"今天的曝光分布,**等效于均匀推了多少条视频**。"
+
+### 公式
+
+```
+ECS = 2 × Σ(pᵢ × i) − 1
+```
+
+其中:
+- 把桶内每条视频按**曝光量从多到少排序**,排名记为 `i`(第 1 名、第 2 名 …)
+- `pᵢ` = 第 `i` 名视频的曝光量 ÷ 桶内总曝光量(即"曝光占比")
+- 对所有视频求和后乘 2 再减 1
+
+### 值域
+
+- **最小值 = 1**:所有曝光都给了 1 条视频
+- **最大值 = N**(N = 桶内去重视频数):所有视频曝光次数完全相同
+
+### 举例计算
+
+假设某个桶当天有 **5 条视频**被曝光,曝光次数如下:
+
+| 视频 | 曝光次数 | 排名 i(降序)| 占比 pᵢ |
+|---|---|---|---|
+| A | 600 | 1 | 600/1000 = 0.60 |
+| B | 200 | 2 | 200/1000 = 0.20 |
+| C | 100 | 3 | 100/1000 = 0.10 |
+| D | 60 | 4 | 60/1000 = 0.06 |
+| E | 40 | 5 | 40/1000 = 0.04 |
+| **合计** | **1000** | | **1.00** |
+
+**第一步:计算 Σ(pᵢ × i)**
+
+```
+= 0.60×1 + 0.20×2 + 0.10×3 + 0.06×4 + 0.04×5
+= 0.60   + 0.40   + 0.30   + 0.24   + 0.20
+= 1.74
+```
+
+**第二步:代入公式**
+
+```
+ECS = 2 × 1.74 − 1 = 2.48
+```
+
+**解读**:虽然真实有 5 条视频,但因为 A 占了 60% 的曝光,实际上**等效于均匀推了 2.48 条视频**。头部集中效应吃掉了 5 − 2.48 = 2.52 条视频的"有效性"。
+
+### 验证两个极端
+
+**极端 1:完全集中**(A 占 100%)
+
+| 视频 | 占比 pᵢ | 排名 i |
+|---|---|---|
+| A | 1.00 | 1 |
+| B~E | 0.00 | 2~5 |
+
+```
+Σ(pᵢ × i) = 1.00×1 + 0×2 + 0×3 + 0×4 + 0×5 = 1.00
+ECS = 2×1.00 − 1 = 1
+```
+
+ECS = 1,等效只有 1 条视频 ✓
+
+**极端 2:完全均匀**(每条各 200 次,各占 20%)
+
+| 视频 | 占比 pᵢ | 排名 i |
+|---|---|---|
+| A~E | 各 0.20 | 1~5 |
+
+```
+Σ(pᵢ × i) = 0.20×1 + 0.20×2 + 0.20×3 + 0.20×4 + 0.20×5
+          = 0.20 × (1+2+3+4+5)
+          = 0.20 × 15
+          = 3.00
+ECS = 2×3.00 − 1 = 5
+```
+
+ECS = 5 = N,所有视频等效全部被推 ✓
+
+---
+
+## 二、ecs_ratio — 分发均匀度比率
+
+### 含义
+
+"ECS 占真实视频池子的几成。去掉池子大小的影响后,只看**分布形态有多均匀**。"
+
+### 公式
+
+```
+ecs_ratio = ECS / distinct_vid_cnt
+```
+
+其中 `distinct_vid_cnt` = 桶内当天被曝光的去重视频数。
+
+### 值域
+
+- **(0, 1]**(通常以百分比展示)
+- → 100%:完全均匀
+- → 0%:完全集中在一条视频
+
+### 举例计算(接上面的例子)
+
+```
+ecs_ratio = ECS / N = 2.48 / 5 = 0.496 = 49.6%
+```
+
+**解读**:分发的均匀度达到了理想值的 49.6%。
+
+### 实际数据中的量级
+
+你们的短视频 Feed:
+```
+distinct_vid_cnt ≈ 16,000
+ECS ≈ 1,150
+ecs_ratio ≈ 1,150 / 16,000 ≈ 7.2%
+```
+
+意思是:"**菜单上有 1.6 万条视频,但实际等效只推了约 1,150 条**,均匀度是 7.2%。"
+
+### 三者的关系
+
+```
+ECS = distinct_vid_cnt × ecs_ratio
+```
+
+| 量 | 含义 | 类比 |
+|---|---|---|
+| `distinct_vid_cnt` | 菜单有多少道菜 | 池子大小 |
+| `ecs_ratio` | 每道菜是否都有人点 | 分配均匀度 |
+| `ECS` | 这家店实际在"运营"多少道菜 | 池子 × 均匀度 |
+
+### 为什么需要 ecs_ratio 而不只看 ECS
+
+**场景**:A 组 ECS = 1100,B 组 ECS = 1200。看 ECS 觉得 B 更分散。
+
+但如果:
+- A 组 `distinct_vid_cnt` = 15,000 → `ecs_ratio = 7.33%`
+- B 组 `distinct_vid_cnt` = 18,000 → `ecs_ratio = 6.67%`
+
+**真相是 A 的分布形态更均匀**,B 只是池子更大。
+
+**结论**:
+- 跨组/跨天比较 → 看 `ecs_ratio`(去掉池子大小干扰)
+- 向老板解释 → 说 `ECS`("等效推了 1150 条视频"更直观)
+- 诊断问题 → 两个一起拆(ECS 跌了是池子小了还是形态变集中了?)
+
+---
+
+## 三、ARP(Average Recommendation Popularity)— 平均推荐热度
+
+### 含义
+
+"用户每次刷到一条推荐,这条视频在全平台当天有多火。**按曝光次数加权的平均热度**。"
+
+### 公式
+
+```
+          Σ(桶内曝光次数ᵢ × 全平台曝光次数ᵢ)
+ARP = ────────────────────────────────────────
+                Σ 桶内曝光次数ᵢ
+```
+
+其中:
+- `i` 遍历桶内所有被曝光的视频
+- `桶内曝光次数ᵢ` = 这条视频在**这个桶**内被推了几次
+- `全平台曝光次数ᵢ` = 这条视频在**全平台**当天被推了几次(= 它的"热门度")
+
+### 为什么要加权
+
+因为每一次"推荐事件"对用户体验的贡献是等权的。如果一条视频被推了 500 次,它对"用户平均看到什么"的影响就是另一条只推了 10 次的视频的 50 倍。
+
+### 举例计算
+
+假设某个桶当天推了 3 条视频:
+
+| 视频 | 桶内曝光 | 全平台曝光(热度) | 桶内曝光 × 全平台曝光 |
+|---|---|---|---|
+| A(爆款) | 500 | 200,000 | 500 × 200,000 = 100,000,000 |
+| B(中等) | 300 | 50,000 | 300 × 50,000 = 15,000,000 |
+| C(长尾) | 200 | 3,000 | 200 × 3,000 = 600,000 |
+| **合计** | **1,000** | | **115,600,000** |
+
+**代入公式**:
+
+```
+         115,600,000
+ARP = ──────────────── = 115,600
+            1,000
+```
+
+**解读**:这个桶推给用户的视频,**平均每条在全平台当天被推了 11.56 万次**。比较偏头部——因为 A(20 万次爆款)被推了 500 次,权重最大。
+
+### 对比:如果不加权(简单平均)
+
+```
+简单平均 = (200,000 + 50,000 + 3,000) / 3 = 84,333
+```
+
+加权后 ARP = 115,600 > 简单平均 84,333。差异来自 A 视频被推得多(500 次),拉高了加权值。
+
+**这个差异就是"用户真实体验的热度" vs "视频池子本身的热度"之间的差**。ARP 反映的是前者。
+
+### 值域和解读
+
+- **没有绝对好坏**,只能同日同组比较
+- 你们当前 ARP 在 **12 万左右**
+- 两组之间差异 > 0.5% 就是显著信号(这是所有指标里噪声最小的)
+
+---
+
+## 四、三个指标的综合应用
+
+### 完整的诊断思路
+
+```
+Step 1:  看 bn_rov / cn_rov / total_rov  →  数值赢没赢?
+Step 2:  看 ecs_ratio                     →  分发有没有变集中?
+Step 3:  看 ARP                           →  推的内容有没有变头部?
+Step 4:  结合判断                          →  落在四象限的哪个位置?
+```
+
+### 四象限诊断矩阵
+
+|  | ARP ↓(推更长尾) | ARP ↑(推更头部) |
+|---|---|---|
+| **ecs_ratio ↑**(更分散) | ✅ **最健康**:分散推长尾 = 真·个性化 | ⚠️ **可接受**:分散但推头部 |
+| **ecs_ratio ↓**(更集中) | ❓ **少见**:集中推长尾 | ❌ **最差**:集中推头部 = 作弊 |
+
+### 用真实数据走一遍
+
+以建模目标实验 01 桶(8 天均值 vs 对照组 89)为例:
+
+| 指标 | 01 桶值 | 89 桶值 | 相对差 |
+|---|---|---|---|
+| bn_rov | 0.04115 | 0.03840 | **+7.95%** |
+| cn_rov | 0.00912 | 0.00844 | **+11.43%** |
+| total_rov | 0.07491 | 0.07057 | **+7.35%** |
+| ecs_ratio | 7.11% | 7.23% | **-1.66%**(按表中均值计算,01 桶略更集中;低于 2.88% 噪声门槛,不显著)|
+| ARP | 133,856 | 131,685 | **+1.65%**(按表中均值计算,略偏头部;TODO(review): 原相对差 +1.02%/-0.36% 与均值方向矛盾,疑为列值互换或按逐日口径计算,需核对)|
+
+**诊断**:
+- Step 1: 数值全面赢(bn_rov、cn_rov、total_rov 均显著为正,远超各自噪声门槛)✅
+- Step 2: ecs_ratio 差异 -1.66%,低于 2.88% 噪声门槛 → 分发集中度无显著变化 ✅
+- Step 3: ARP 差异方向与原相对差列矛盾(见上表 TODO),按均值约 +1.65% → 需核对口径后再下结论 ⚠️
+- Step 4: 数值收益未伴随显著的"集中 + 推头部"信号 = **健康**(待 ARP 口径核对确认)
+
+**汇报一句话**:
+> "bn_rov 明显上涨,而分发集中度(ecs_ratio)这项 guardrail 没有出现显著恶化——没有证据表明收益是靠集中推头部换来的;ARP 口径核对后再补充结论。"
+
+---
+
+## 五、SQL 实现速查
+
+### ECS 计算(`t_bucket_ecs` CTE)
+
+```sql
+-- 内层:算每条 vid 的占比 p 和降序排名 rn
+SELECT  vid_exp_cnt / SUM(vid_exp_cnt) OVER (
+            PARTITION BY dt, apptype, abcode, suffix
+        ) AS p
+        ,ROW_NUMBER() OVER (
+            PARTITION BY dt, apptype, abcode, suffix
+            ORDER BY vid_exp_cnt DESC
+        ) AS rn
+FROM    t_vid_exp
+
+-- 外层:公式 2·Σ(p·rn) - 1
+SELECT  2 * SUM(p * rn) - 1 AS ecs
+GROUP BY dt, apptype, abcode, suffix
+```
+
+### ecs_ratio 计算(`t_metrics` 内)
+
+```sql
+ROUND(AVG(ecs) / NULLIF(AVG(distinct_vid_cnt), 0), 6) AS ecs_ratio
+```
+
+### ARP 计算(`t_bucket_arp` CTE)
+
+```sql
+-- 分子:桶内曝光 × 全平台曝光;分母:桶内总曝光
+SELECT  SUM(v.vid_exp_cnt * g.vid_global_pop)
+      / SUM(v.vid_exp_cnt) AS arp
+FROM    t_vid_exp v
+LEFT JOIN t_vid_global_pop g
+ON      v.vid = g.vid AND v.dt = g.dt
+GROUP BY v.dt, v.apptype, v.abcode, v.suffix
+```
+
+---
+
+## 六、噪声基底(实测值,用于判断信号显著性)
+
+基于对照组内部 89 vs 2c 的 8 天实测:
+
+| 指标 | 噪声门槛 | 低于此值视为噪声 |
+|---|---|---|
+| ecs_ratio | **2.88%** | 组间差异 < 2.9% 不显著 |
+| ARP | **0.43%** | 组间差异 < 0.5% 不显著 |
+| bn_rov | **1.10%** | 组间差异 < 1.1% 不显著 |
+| cn_rov | **4.10%** | cn 噪声最大,< 4% 不显著 |
+| total_rov | **1.79%** | 组间差异 < 1.8% 不显著 |
+
+**用法**:每次看数据,先查这张表。低于门槛的差异不要汇报为"发现"。

+ 311 - 0
tasks/00_尾号实验/指标说明_ecs_arp.md

@@ -0,0 +1,311 @@
+# ecs_ratio 和 ARP 指标说明
+
+> 这两个指标是推荐实验报表的"健康度 guardrail"——专门用来回答老板关心的那句话:
+> **"数值涨了,是真本事还是靠推头部换来的?"**
+
+---
+
+## 📌 TL;DR(30 秒电梯版)
+
+两个健康度指标:
+
+- **`ecs_ratio`** = **分发均匀度**:内容是集中在头部几条视频,还是分散推出去?越高越分散。我们现在 7% 左右,意思是"**菜单上 1.6 万条视频,实际等效只推了 1100 条**"。
+- **`ARP`** = **平均推荐热度**:推给用户的视频平均有多火?越高说明越在推爆款,越低说明越在推长尾。
+
+**两个一起看**,就能判断 AB 涨的数值是真本事还是靠推头部换的。
+
+---
+
+## 一、ecs_ratio — 分发均匀度
+
+### 一句话定义
+**回答"内容有没有集中"的问题**。把分发的形态(多均匀 / 多集中)量化成一个 0~100% 的数字。
+
+### 大白话解释
+- 我们菜单上有 16,000 种视频,但每天的曝光量 90% 都集中在前 1,000 条上
+- `ecs_ratio` 告诉你:**实际等效推了多少种,占菜单的几成**
+- 比如 7%,就是"**菜单 16,000 条,等效只推了 1,100 条**"——剩下 14,900 条要么曝光少得可忽略,要么根本没上架
+
+### 数值范围
+
+| ecs_ratio 值 | 含义 |
+|---|---|
+| → 100% | 每条视频曝光次数完全相同(完全均匀,不现实) |
+| **7% 左右** | **你们短视频 Feed 的典型水平**(基线形态) |
+| → 0% | 所有曝光都集中在一条视频(极端集中) |
+
+### 业务含义
+- 这个数字**越高越健康**(分发越分散,长尾视频有机会)
+- **变化 > 2.9%** 才是显著信号(噪声门槛,实测 7 天得出)
+- 如果从 7% 掉到 6%,意思是"**推荐系统比以前更偏向头部了,长尾内容被挤掉了**"——这就是老板担心的"内容在集中"
+
+### 计算方法(3 步,不用公式)
+
+**第 1 步:数每条视频当天被推了多少次**
+
+```
+A 视频: 50,000 次     ← 头部
+B 视频: 30,000 次
+C 视频: 15,000 次
+...
+Z 视频:     10 次     ← 长尾
+```
+
+**第 2 步:算"等效视频数"**
+
+如果把这些曝光**平均分配**给所有视频,等于推了多少条?
+
+> 举个简单的例子:3 条视频,曝光分别是 500 / 300 / 200。
+> 如果**完全平均**,每条应该 333.3 次,相当于推了 3 条。
+> 但实际是 500/300/200 不均匀,所以"等效视频数"**小于 3**——算出来大约 **2.4 条**。
+
+**第 3 步:等效视频数 ÷ 真实视频数 = ecs_ratio**
+
+```
+2.4 / 3 = 80%   ← 这个例子的 ecs_ratio
+```
+
+在真实数据里:**1158(等效)/ 16,757(真实)= 6.9%**。
+
+---
+
+## 二、ARP — 平均推荐热度
+
+### 一句话定义
+**回答"推的都是爆款吗"的问题**。把"推荐内容的平均热门度"量化成一个数字。
+
+### 大白话解释
+- 用户刷到的每一条推荐,**这条视频在全平台当天被推过多少次**
+- 如果推给用户的都是"全平台日曝光 50 万次的爆款",ARP 就很高
+- 如果推给用户的是"一天只被推 500 次的长尾视频",ARP 就很低
+- 我们现在 ARP 在 **12 万左右**,意思是"**用户平均每次刷到的视频,当天被全平台推过 12 万次**"——中等偏上热门
+
+### 数值范围
+- **这个数字没有绝对好坏**,只能和对比组比相对值
+- 两组之间 ARP **差 > 0.5%** 就是显著信号(最稳的指标,噪声极小)
+- **越高** = 推得越头部
+- **越低** = 推得越长尾
+
+### 业务含义
+- ARP 单独看不够,要和 `ecs_ratio` 一起看
+- 如果一个实验组的 ARP 比对照组高 5%,说明这组**倾向于多推热门内容**
+- 如果同时 bn_rov 也涨了,要问:"**涨的 bn_rov 是不是靠多推爆款换来的?**"
+
+### 计算方法(2 步)
+
+**第 1 步:查每条被推视频在全平台的当天总曝光**
+
+```
+A 视频(头部爆款):全平台日曝光 500,000
+B 视频(中等):    全平台日曝光 100,000
+C 视频(长尾):    全平台日曝光   5,000
+```
+
+**第 2 步:按"我们推了多少次"加权求平均**
+
+> 假设我们今天推了:A 给 500 个用户,B 给 300 个用户,C 给 200 个用户
+>
+> 加权平均 =
+> `(500×500,000 + 300×100,000 + 200×5,000) / (500+300+200)`
+> `= 281,000`
+>
+> 含义:**我们推的视频平均每条当天全平台被推了 28.1 万次**。
+
+**为什么要加权**:因为用户不是"每条视频看 1 次",而是"**头部的 A 视频被推了 500 次**"。加权后才能反映"用户实际看到的内容平均有多火"。
+
+---
+
+## 三、两个一起看 = 实验健康度四象限
+
+这是**汇报时的核心图**,老板一看就懂:
+
+| | **ARP ↓**(推更长尾) | **ARP ↑**(推更头部) |
+|---|---|---|
+| **ecs_ratio ↑**<br>(更分散) | ✅ **最健康**<br>分散推冷门<br>= 真·个性化提升 | ⚠️ **可接受**<br>分散但推头部<br>= 头部内部多样化 |
+| **ecs_ratio ↓**<br>(更集中) | ❓ **少见**<br>集中推冷门<br>= 小众爆发 | ❌ **最差**<br>集中推头部<br>= popularity bias 作弊 |
+
+**解读规则**:
+- 落在左上 = 模型真的变聪明了(扩量放心)
+- 落在右上 = 模型倾向头部但还保持多样性(可以接受)
+- 落在右下 = 警报!数值涨可能是靠推头部作弊换来的
+- 落在左下 = 少见情况,需要单独分析
+
+---
+
+## 四、真实数据应用:建模目标 01 桶
+
+以跑了 19 天的建模目标 01 桶 vs 对照组 89(7 天均值)为例:
+
+| 指标 | 变化 | 象限判定 |
+|---|---|---|
+| `ecs_ratio` | **+1.4%**(更分散) | ⬆️ 纵轴正向 |
+| `ARP` | **-0.64%**(推更少头部) | ⬅️ 横轴负向 |
+| **落在哪个象限** | **左上:最健康** ✅ | |
+
+同时数值指标:
+- `bn_rov` +8.04%(7/7 天全胜)
+- `cn_rov` +11.92%(7/7 天全胜)
+- `total_rov` +7.51%(7/7 天全胜)
+
+### 翻译成大白话
+
+> "建模目标实验 01 桶数据全面健康。bn_rov 涨了 8%,**但不是靠推头部换来的**——
+>
+> 分发均匀度 **ecs_ratio 反而升了 1.4%**(说明内容更分散,没有向头部集中),
+> 平均推荐热度 **ARP 还降了 0.64%**(说明推的视频比以前平均**更冷门**)。
+>
+> 这意味着:模型在**推得更分散、更长尾的前提下,还拿到了 +8% 的收益**。这才是真·个性化提升,不是 popularity bias 作弊。"
+
+---
+
+## 五、汇报话术模板
+
+### 👔 汇报给老板的标准格式
+
+```
+【实验名称】XX 实验
+【结论】数据全面健康 / 有弱风险 / 还需观察
+【数值】bn_rov +X%, cn_rov +Y%, total_rov +Z%(N/7 天全胜)
+【健康度】ecs_ratio 变化 +/-X%(门槛 2.9%),ARP +/-Y%
+          => 翻译成"是否靠推头部换数值"
+【建议】继续扩量 / 暂缓扩量 / 再观察 X 天
+```
+
+### 🎤 一句话终极版
+
+> "这两个指标就是老板您说的'**内容有没有集中、有没有靠推头部换数值**'的量化版本。ecs_ratio 测分散度,ARP 测头部偏向。建模目标的 bn_rov 涨 8%,同时 ecs_ratio 略升、ARP 反降——**涨得很干净,不是作弊**。"
+
+---
+
+## 六、餐厅比喻(适合 business sense 的人)
+
+> 您可以这么想——
+>
+> **`ecs_ratio` 就像一家餐厅的菜单使用率**:菜单有 100 道菜,但客人天天只点前 5 道,这家店'**实际在运营 5 道菜**'。这个比例越高,店铺越活,所有菜都有人点。
+>
+> **`ARP` 就像这家餐厅推荐给你的菜的平均热度**:服务员推荐你点'**招牌烤鸭**'(爆款)还是'**上海独家本帮小菜**'(长尾)。数字越高,说明推的越是'大家都点的爆款'。
+>
+> 两个一起看,就能判断我们的'餐厅'(推荐系统)到底是在**精准上菜**还是**懒人式只推爆款**。
+
+---
+
+## 七、技术细节(给同事/自己留的)
+
+### ecs_ratio 的数学公式
+
+**ECS (Effective Catalog Size)**:
+
+$$ECS = 2 \cdot \sum_{i=1}^{N} p_i \cdot i - 1$$
+
+其中:
+- `N` = 桶内去重的 vid 数
+- `p_i` = 第 `i` 名 vid 的曝光占比(按曝光量**降序**排好)
+- `i` = 排名(第 1 名、第 2 名 ...)
+
+**ecs_ratio**:
+
+$$\text{ecs\_ratio} = \frac{ECS}{\text{distinct\_vid\_cnt}}$$
+
+**值域**:`(0, 1]`,越大越分散。
+
+**验算**:
+- 3 条视频曝光 500/300/200:ECS = 2·(0.5·1 + 0.3·2 + 0.2·3) - 1 = 2.4
+- 完全均匀(各 1/3):ECS = 2·(1/3·1 + 1/3·2 + 1/3·3) - 1 = 3(= N 上限)
+- 完全集中(一个占 100%):ECS = 2·1 - 1 = 1(下限)
+
+### ARP 的数学公式
+
+$$ARP = \frac{\sum_{i=1}^{N} (\text{桶内曝光次数}_i \times \text{全平台曝光次数}_i)}{\sum_{i=1}^{N} \text{桶内曝光次数}_i}$$
+
+这是一个**按桶内曝光量加权的平均**——推得越多的 vid,对 ARP 的贡献越大。
+
+**关键设计**:用"桶内曝光加权"而不是"简单平均",是因为用户不是均匀看每条 vid,而是**每次推荐事件**贡献 1 个权重。加权后 ARP 真正反映"**随机挑一次推荐,这条 vid 平均有多火**"。
+
+---
+
+## 八、SQL 实现位置
+
+在 `base_v3_new.sql` 里:
+
+| CTE | 作用 | 大致行号 |
+|---|---|---|
+| `t_vid_exp` | 桶内每条 vid 的曝光次数(ECS/ARP 共用的中间表) | ~259 |
+| `t_bucket_ecs` | 用窗口函数算 ECS | ~280 |
+| `t_vid_global_pop` | 全平台每条 vid 的当天总曝光(ARP 的 popularity reference) | ~308 |
+| `t_bucket_arp` | 按桶内曝光加权算 ARP | ~327 |
+| `t_metrics` | 把 ecs_ratio 算出来:`AVG(ecs) / AVG(distinct_vid_cnt)` | ~436 |
+
+### 关键 SQL 片段
+
+```sql
+-- ECS 计算
+,t_bucket_ecs AS (
+    SELECT  dt, apptype, abcode, suffix
+            ,2 * SUM(p * rn) - 1 AS ecs
+    FROM    (
+        SELECT  dt, apptype, abcode, suffix
+                ,vid_exp_cnt / SUM(vid_exp_cnt) OVER (
+                    PARTITION BY dt, apptype, abcode, suffix
+                ) AS p
+                ,ROW_NUMBER() OVER (
+                    PARTITION BY dt, apptype, abcode, suffix
+                    ORDER BY vid_exp_cnt DESC
+                ) AS rn
+        FROM    t_vid_exp
+    ) t
+    GROUP BY dt, apptype, abcode, suffix
+)
+
+-- ARP 计算
+,t_bucket_arp AS (
+    SELECT  v.dt, v.apptype, v.abcode, v.suffix
+            ,SUM(v.vid_exp_cnt * g.vid_global_pop) / SUM(v.vid_exp_cnt) AS arp
+    FROM    t_vid_exp v
+    LEFT JOIN t_vid_global_pop g
+    ON      v.dt = g.dt AND v.apptype = g.apptype AND v.vid = g.vid
+    GROUP BY v.dt, v.apptype, v.abcode, v.suffix
+)
+
+-- ecs_ratio 计算(t_metrics 里)
+,ROUND(AVG(e.ecs) / NULLIF(AVG(b.distinct_vid_cnt), 0), 6) AS ecs_ratio
+```
+
+---
+
+## 九、噪声基底(重要!)
+
+**基于 7 天对照组内部(89 vs 2c)实测的噪声门槛**:
+
+| 指标 | 噪声门槛 | 含义 |
+|---|---|---|
+| `ecs_ratio` | **2.88%** | 实验组 ratio 变化 < 2.9% 都算噪声 |
+| `bn_rov` | **1.06%** | 数值差 < 1% 不算赢 |
+| `cn_rov` | **4.10%** | cn 噪声最大(分享行为方差大) |
+| `total_rov` | **1.79%** | 数值差 < 1.8% 不算赢 |
+| `ARP` | **0.43%** | **最稳定**,> 0.5% 就是信号 |
+
+**记住**:任何低于门槛的变化都不是信号,是噪声。不要把噪声当结论上报。
+
+---
+
+## 十、常见误解
+
+### ❌ 误解 1:"ARP 高就是坏"
+**正确理解**:ARP 高只是说"推得更头部",本身没有好坏。关键看 `ecs_ratio` 是不是同时跌——只有**两者同时往坏的方向走**(ARP ↑ + ecs_ratio ↓)才是 popularity bias。
+
+### ❌ 误解 2:"ecs_ratio 7% 很差,应该更高"
+**正确理解**:7% 是**短视频 Feed 的正常形态**。Netflix 的同类指标也在类似量级(10-15%)。**绝对值不重要,变化才重要**。
+
+### ❌ 误解 3:"这两个指标和 bn_rov/cn_rov 是一回事"
+**正确理解**:bn_rov/cn_rov 是**数值指标**(赚了多少),ecs_ratio/ARP 是**健康度指标**(赚得健不健康)。前者回答"有没有涨",后者回答"涨得对不对"。两者是正交维度,不能互相替代。
+
+### ❌ 误解 4:"看单日 ecs_ratio 差 1% 就能下结论"
+**正确理解**:单日差异 < 2.9% 都是噪声(见上面的噪声基底表)。**要看 7 天趋势和"赢的天数"**,不是看单点。
+
+---
+
+## 参考
+
+- Netflix 的 ECS 指标原始论文:《The Netflix Recommender System: Algorithms, Business Value, and Innovation》(ACM TMIS 2015)
+- 业界综述:《Fairness and Diversity in Recommender Systems: A Survey》(arXiv 2307.04644)
+- 本项目实现:`tasks/00_尾号实验/base_v3_new.sql`

+ 229 - 0
tasks/00_尾号实验/指标调研_业界对比.md

@@ -0,0 +1,229 @@
+# 推荐系统分发健康度指标:业界对比调研
+
+> 核心问题:**我们用的 ECS / ecs_ratio / ARP 这三个指标,业界也用吗?有没有更好的?**
+
+---
+
+## 一句话结论
+
+| 我们的指标 | 业界地位 | 评价 |
+|---|---|---|
+| **ECS / ecs_ratio** | Netflix 发明,其他公司几乎没有采用这个名字 | 数学上 = Shannon Entropy 的指数变换,概念没问题,但命名小众 |
+| **ARP** | 学术标准指标,**但没有任何大公司公开在生产环境用它** | 是我们三个里最弱的一个,建议补充 Gini |
+| **(缺失)Gini 系数** | **快手产品级内置、Twitter 生产审计、Netflix 评估都在用** | 业界最广泛使用的聚合多样性指标,我们没有 |
+
+**调研建议**:保留 ECS/ecs_ratio(概念正确),**新增 Gini 系数**(业界共识最强),ARP 可保留但优先级降低。
+
+---
+
+## 业界主要公司用什么指标(附证据)
+
+### 快手(Kuaishou)— 最直接的同行参考
+
+**指标:Gini 系数(产品级内置)**
+
+快手联合创始人宿华公开确认:快手在产品底层内置了**"GDP + Gini 系数分布实验"**,确保流量增长不以极端集中为代价。这是**创始团队设定的不可协商的约束**——优化器不能覆盖这个限制。
+
+快手的具体做法:用 `5× 内容质量 + 5× 关系 + 1× 双向互动` 的加权来分散流量到中腰部创作者,而不是集中在头部。
+
+**证据**:[Sheldon on China: Scrolling Douyin, Playing Kuaishou](https://sheldononchina.substack.com/p/scrolling-douyin-playing-kuaishou)
+
+**启示**:快手是和我们最像的短视频平台,他们用的是 **Gini** 不是 ECS。
+
+---
+
+### Netflix — ECS 的发明者
+
+**指标:ECS(业务级 KPI)+ Gini + HHI(评估用)**
+
+Netflix 2015 年论文(Gomez-Uribe & Hunt, ACM TMIS)定义了 ECS。核心发现:**个性化推荐把 ECS 提高了约 4 倍**(对比纯热门排序)。
+
+但 Netflix 2025 年的经济学论文(Aridor, Bibaut et al., arXiv:2511.07280)在做因果分析时,用的是 **Gini 系数 + HHI**,不是 ECS。论文发现:
+- 基于热门度的推荐让 HHI 恶化 **+42.5%**
+- Netflix 当前推荐系统维持了相对高的消费多样性
+
+**证据**:
+- [The Netflix Recommender System (ACM TMIS 2015)](https://dl.acm.org/doi/10.1145/2843948)
+- [The Value of Personalized Recommendations: Evidence from Netflix (arXiv 2511.07280)](https://arxiv.org/abs/2511.07280)
+
+**启示**:**连 Netflix 自己做严肃分析时也更倾向用 Gini/HHI,而不是 ECS。**
+
+---
+
+### Spotify — 自研 GS-Score,底层等价于 Shannon Entropy
+
+**指标:GS-Score(Generalist-Specialist Score)**
+
+Spotify 2020 年 WWW 论文验证了 GS-score 和 Shannon Entropy 的相关性达 r = -0.77,和 Gini 的相关性 r = -0.60。
+
+关键生产发现:**听歌更多样化的用户,流失率低 10-20 个百分点,付费转化率高 25 个百分点**。这是业界最有力的"多样性 → 商业价值"因果证据。
+
+**证据**:[Algorithmic Effects on the Diversity of Consumption on Spotify (WWW 2020)](https://research.atspotify.com/2020/12/algorithmic-effects-on-the-diversity-of-consumption-on-spotify)
+
+**启示**:Spotify 用的是自研指标,但**底层和 Shannon Entropy(= log(ECS))等价**。ECS 的数学是对的,只是名字没流行。
+
+---
+
+### Twitter/X — Gini + Top-N% 曝光占比
+
+**指标:Gini 系数 + Top 1% 作者曝光占比**
+
+Twitter 2022 年论文用 Gini 和 Top-N 占比审计了生产环境的推荐时间线,发现**前 1% 的作者获得了约 80% 的推文浏览量**。
+
+Twitter 开源的推荐算法中包含"Author Diversity Scorer"——对重复作者的内容评分减半,本质是 Coverage 约束。
+
+**证据**:
+- [Measuring Disparate Outcomes (ScienceDirect)](https://www.sciencedirect.com/science/article/pii/S2666389922001799)
+- [Twitter Open-Source Algorithm Blog](https://blog.x.com/engineering/en_us/topics/open-source/2023/twitter-recommendation-algorithm)
+
+**启示**:Twitter 生产审计用 **Gini + Top-N%**,不用 ECS 也不用 ARP。
+
+---
+
+### YouTube — Coverage 约束(多样性上限)
+
+YouTube 在推荐后处理阶段应用"diversity caps"(限制同频道/同话题的视频数量)和"novelty boosts"(新内容加权)。2024 年算法更新明确优先推荐较小频道的内容。
+
+**证据**:[How YouTube's Algorithm Works (Shaped.ai)](https://www.shaped.ai/blog/how-youtubes-algorithm-works)
+
+**启示**:YouTube 用的是**硬规则约束**(Coverage 限制),不是连续指标度量。
+
+---
+
+### Airbnb — ILD(列表内多样性)作为优化目标
+
+Airbnb 2022-2023 年发布"Learning to Rank Diversely",用神经网络估计房源相似度来减少搜索结果的重复性。A/B 测试结果:未取消预订 +0.29%,预订价值 +0.8%,五星评价 +0.4%。
+
+**关键**:ILD 是**优化目标**,但 A/B 报告的是**业务指标**(预订量、评价),不是 ILD 本身。
+
+**证据**:[Airbnb: Learning to Rank Diversely](https://medium.com/airbnb-engineering/learning-to-rank-diversely-add6b1929621)
+
+---
+
+### Pinterest — 表征覆盖率(DIV@k)
+
+Pinterest 用确定性点过程(DPP)做多样性重排序,用 DIV@k(R)(前 k 个结果中所有人群组别都出现的查询比例)作为主要多样性指标。A/B 测试:体型表征提升 454%,近景推荐提升 772%。
+
+**证据**:[Pinterest: Representation Online Matters](https://medium.com/pinterest-engineering/representation-online-matters-practical-end-to-end-diversification-in-search-and-recommender-cb60b547f2e0)
+
+---
+
+## 各指标的业界使用汇总
+
+| 指标 | 生产环境证据 | 使用公司 | 用作 A/B guardrail? | 老板可解释性 |
+|---|---|---|---|---|
+| **Gini 系数** | **强** | 快手(产品内置)、Twitter(审计)、Netflix(评估) | 快手:是 | **高**(收入不平等类比)|
+| **ECS** | 中 | Netflix | 否(业务 KPI) | 中(需解释"有效视频数")|
+| **Shannon Entropy** | 中 | Spotify(via GS-score)、快手(action entropy) | 间接是 | 低(信息论概念)|
+| **HHI** | 中 | Netflix(评估) | 否(离线评估) | 中 |
+| **Coverage** | **强** | Twitter、YouTube、LinkedIn(硬规则) | 是(但作为规则不是连续指标)| **很高** |
+| **ARP** | **弱** | 学术标准,无大公司公开生产使用 | 否 | 低(绝对数字不直观)|
+| **ILD** | 中 | Airbnb(优化目标)、Pinterest(间接) | 否(优化目标,不是报告指标)| 低(需要 embedding)|
+| **Top-N% 占比** | 中 | Twitter(审计) | 是(审计用) | **很高**("前 1% 视频吃了 80% 曝光")|
+
+---
+
+## 我们三个指标的诚实评估
+
+### ECS / ecs_ratio — 概念对,名字小众
+
+**优点**:
+- Netflix 背书,数学上等价于 Shannon Entropy 的指数形式(`ECS = 2^H`)
+- 有物理意义:"等效推了多少条视频",比 Gini 的 0~1 更直观
+- ecs_ratio 归一化后跨天跨组可比
+
+**问题**:
+- 除 Netflix 外**没有其他公司公开采用这个名字**
+- 业界更常用 Gini(快手、Twitter、Netflix 评估都用)
+- 学术论文中 ECS 引用率远低于 Gini 和 Entropy
+
+**结论**:**保留,但不作为唯一的分发健康指标**。ECS/ecs_ratio 的"等效视频数"概念对老板解释很友好,这是它胜过 Gini 的唯一优势。
+
+### ARP — 我们三个里最弱的
+
+**优点**:
+- 学术评估框架(Elliot 等)中是标准指标
+- 计算简单
+
+**问题**:
+- **没有任何大公司公开在生产 A/B 中使用 ARP**
+- 2019 年 AAAI 论文指出 ARP 有盲区:两个系统可以有相同的 ARP 但分布完全不同
+- 2024 年 Springer 综述确认 ARP 需要配合 APLT/ACLT 才有完整画面
+- 绝对数字(12 万)对非技术人员不直观
+
+**结论**:**可保留作为辅助,但优先级应低于 Gini**。如果要精简指标,ARP 是第一个被砍的候选。
+
+### (缺失)Gini 系数 — 业界共识最强,我们没有
+
+**为什么应该加**:
+1. **快手用它**——最直接的同行参考,产品级内置
+2. **Twitter 用它**——生产环境审计
+3. **Netflix 用它**——严肃的因果分析论文中选了 Gini 而不是 ECS
+4. **解释性强**——"Gini = 0.7 意味着头部 10% 的视频拿走了约 70% 的曝光",老板秒懂
+5. **和 ECS 互补**——ECS 衡量"等效多少条"(绝对量),Gini 衡量"分布多不平等"(形态)。两者从不同角度看同一个问题。
+
+---
+
+## 建议的指标组合
+
+### 当前(3 个)
+
+```
+ECS / ecs_ratio  →  "等效推了多少条视频"
+ARP              →  "推的视频平均多热门"
+```
+
+### 建议调整为(4 个,加 Gini)
+
+```
+ECS / ecs_ratio  →  保留,给老板解释用("等效 1100 条视频"很直观)
+Gini 系数        →  新增,业界共识最强,对标快手/Twitter/Netflix
+ARP              →  保留但降低优先级,作为 Gini 的辅助验证
+Top-10% 占比     →  可选新增,最简单的 executive metric("前 10% 视频吃了 X% 曝光")
+```
+
+### 为什么不直接砍掉 ECS 只用 Gini?
+
+| 维度 | ECS/ecs_ratio | Gini |
+|---|---|---|
+| 老板沟通 | ✅ "等效推了 1100 条" | ⚠️ "Gini = 0.85" 需要解释 |
+| 诊断能力 | ECS = distinct_vid × ecs_ratio → 可拆分 | 单一数字,不能拆分 |
+| 业界认可 | Netflix 发明,其他公司少用 | 快手/Twitter/Netflix 都用 |
+| 敏感度 | 对长尾更敏感 | 对头部更敏感 |
+
+**两者互补而非替代**。ECS 对"长尾消失"敏感(长尾视频没了,ECS 会快速下降),Gini 对"头部膨胀"敏感(头部视频吃更多曝光,Gini 会快速上升)。你们的场景两边都需要监控。
+
+---
+
+## 如果老板问"为什么用这些指标"
+
+### 30 秒版
+> "ECS 是 Netflix 发明的,Gini 是快手在用的。我们两个都看:ECS 告诉你'等效推了多少条视频',Gini 告诉你'头部有多集中'。ARP 作为辅助看'推的内容有多热门'。"
+
+### 2 分钟版
+> "我们参考了业界主要公司的做法:
+>
+> - **快手**在产品底层内置了 Gini 系数,确保流量不过度集中到头部创作者。
+> - **Netflix** 发明了 ECS(有效视频数),用来证明个性化推荐把内容多样性提高了 4 倍。
+> - **Twitter** 在生产环境用 Gini 做算法审计,发现前 1% 的作者拿走了 80% 的浏览量。
+> - **Spotify** 用多样性指标发现:听歌更多样的用户,流失率低 20 个百分点,付费转化高 25 个百分点。
+>
+> 我们选了 ECS + Gini + ARP 三个指标组合:ECS 给出'等效视频数'的物理量,Gini 给出'分布不平等度'的形态量,ARP 给出'推荐热度'的方向量。三个一起看,能判断数值涨幅是真·个性化还是靠推头部换来的。"
+
+---
+
+## 参考文献
+
+| 来源 | 内容 | 链接 |
+|---|---|---|
+| Netflix (2015) | ECS 定义,个性化提升 4x ECS | [ACM TMIS](https://dl.acm.org/doi/10.1145/2843948) |
+| Netflix (2025) | Gini + HHI 因果分析 | [arXiv:2511.07280](https://arxiv.org/abs/2511.07280) |
+| 快手 | Gini 系数产品内置 | [Sheldon on China](https://sheldononchina.substack.com/p/scrolling-douyin-playing-kuaishou) |
+| Spotify (2020) | GS-score ↔ Entropy 相关性 r=-0.77 | [WWW 2020](https://research.atspotify.com/2020/12/algorithmic-effects-on-the-diversity-of-consumption-on-spotify) |
+| Twitter (2022) | Gini 审计,Top 1% 占 80% | [ScienceDirect](https://www.sciencedirect.com/science/article/pii/S2666389922001799) |
+| Twitter (2023) | Author Diversity Scorer 开源 | [X Blog](https://blog.x.com/engineering/en_us/topics/open-source/2023/twitter-recommendation-algorithm) |
+| Airbnb (2022) | ILD 优化,预订量 +0.29% | [Engineering Blog](https://medium.com/airbnb-engineering/learning-to-rank-diversely-add6b1929621) |
+| Pinterest (2023) | DIV@k 表征覆盖率 +454% | [Engineering Blog](https://medium.com/pinterest-engineering/representation-online-matters-practical-end-to-end-diversification-in-search-and-recommender-cb60b547f2e0) |
+| AAAI (2019) | ARP 的盲区分析 | [arXiv:1901.07555](https://arxiv.org/pdf/1901.07555) |
+| Springer (2024) | Popularity bias 综述 | [UMUAI](https://link.springer.com/article/10.1007/s11257-024-09406-0) |
+| SIGIR (2022) | 广义 Gini 指数优化 | [ACM DL](https://dl.acm.org/doi/10.1145/3477495.3532035) |