Просмотр исходного кода

chore: 提交未跟踪的分析任务、表结构文档和配置

新增 AB效果/推荐AB实时效果、承接/rosn分析与线上实验脚本、
低vov高曝光分析全流程、表结构文档等;更新 default.json sheet_id

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 1 месяц назад
Родитель
Commit
2f8fee0358
39 измененных файлов с 6924 добавлено и 1 удалено
  1. 1 1
      default.json
  2. 125 0
      req/new.md
  3. 124 0
      req/低.md
  4. 347 0
      tables/loghubods/loghubods.dwd_recsys_alg_exposure_base_20250108.md
  5. 68 0
      tables/loghubods/user_share_log.txt
  6. 50 0
      tables/videoods/dim_user.txt
  7. 55 0
      tables/videoods/wx_video.txt
  8. 6 0
      tasks/00_AB效果/01_推荐AB实时效果.json
  9. 86 0
      tasks/00_AB效果/01_推荐AB实时效果.sql
  10. 90 0
      tasks/00_表的洞察/loghubods.user_share_log/export_neo4j.py
  11. 256 0
      tasks/承接/rosn分析/05_实验组xTop20视频_vs对照组_vor.config
  12. 391 0
      tasks/承接/rosn校准/plot_calibration.py
  13. 495 0
      tasks/承接/线上实验/08_预测覆盖率效果分析.py
  14. 177 0
      tasks/承接/线上实验/分桶诊断分析.py
  15. 177 0
      tasks/承接/线上实验/分桶诊断分析_full.py
  16. 174 0
      tasks/承接/线上实验/分桶诊断分析_full_v2.py
  17. 293 0
      tasks/指标分析/02_实验组xTop20视频_vs对照组_误差分析_v2_hh.sql
  18. 586 0
      tmp/低vov高曝光分析/step10_可视化.py
  19. 100 0
      tmp/低vov高曝光分析/step1_分析.py
  20. 115 0
      tmp/低vov高曝光分析/step2_分析.py
  21. 122 0
      tmp/低vov高曝光分析/step3_分析.py
  22. 91 0
      tmp/低vov高曝光分析/step3b_分析.py
  23. 93 0
      tmp/低vov高曝光分析/step3c_分析.py
  24. 82 0
      tmp/低vov高曝光分析/step3d_分析.py
  25. 80 0
      tmp/低vov高曝光分析/step3e_copc分析.py
  26. 89 0
      tmp/低vov高曝光分析/step3f_copc详细.py
  27. 134 0
      tmp/低vov高曝光分析/step4_建议.md
  28. 94 0
      tmp/低vov高曝光分析/step5_分析.py
  29. 93 0
      tmp/低vov高曝光分析/step6_历史抽样.py
  30. 130 0
      tmp/低vov高曝光分析/step7_分析.py
  31. 153 0
      tmp/低vov高曝光分析/step8_分析.py
  32. 167 0
      tmp/低vov高曝光分析/step9_天级趋势.py
  33. 91 0
      tmp/低vov高曝光分析/v2_step1_数据驱动定义.py
  34. 106 0
      tmp/低vov高曝光分析/v2_step2_分析.py
  35. 135 0
      tmp/低vov高曝光分析/v2_step3_对比分析.py
  36. 225 0
      tmp/低vov高曝光分析/v2_最终报告.py
  37. 264 0
      tmp/低vov高曝光分析/v2_深入分析.py
  38. 594 0
      tmp/低vov高曝光分析/v3_可视化报告.py
  39. 465 0
      tmp/低vov高曝光分析/v3_综合分析.py

+ 1 - 1
default.json

@@ -1,6 +1,6 @@
 {
   "token": "ONZqsxB9BhGH8tt90EScSJT5nHh",
-  "sheet_id": null,
+  "sheet_id": "oYkbVB",
   "sort": "dt:desc",
   "cols": null
 }

+ 125 - 0
req/new.md

@@ -0,0 +1,125 @@
+
+现在有一个推荐场景的问题,
+我的推荐预估的是 str、ros, vor 是统计出来的,最终的排序公式是str * ros * vor = vov
+str用的是 fm 模型,ros用的是 xgb模型,vor 用的是 24 小时的统计量;
+现在有一个问题,最近看到一些头部的 item,vov 低,但是给的曝光量很多,现在要你分析具体的原因;
+我会给你一个 sql:
+exp 是曝光的意思;具体 sql 如下,请你基于以下 sql 来分析具体低vov高曝光的原因是什么?这个问题的影响面有哪些?怎么解决:
+
+模型:
+1. fmRov = str
+2. fmRovOrigin = str采样还原前
+3. NorXGBScore = ros 
+4. vor=vor
+真实
+  1. Str = str-plus-noself
+  2. Ros = ros-minus-noself
+  3. Rov = return-n-uv-noself / exp
+  4. Vov = new-exposure-cnt / exp
+
+
+WITH tab_base AS 
+(
+    SELECT  *
+            ,((0.059 * fmRovOrigin) / (1 - (1 - 0.059) * fmRovOrigin)) AS online_fmrov
+            ,((0.036 * fmRovOrigin) / (1 - (1 - 0.036) * fmRovOrigin)) AS real_fmrov
+    FROM    (
+                SELECT  dt
+                        ,hh
+                        ,vid
+                        ,is_share
+                        ,share_cnt
+                        ,is_return_1
+                        ,is_return_n
+                        ,is_return_noself
+                        ,return_1_uv
+                        ,return_n_uv
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        ,CAST(score AS DOUBLE) AS score
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS fmRov
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRovOrigin') AS DOUBLE) AS fmRovOrigin
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS NorXGBScore
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.vor') AS DOUBLE) AS vor
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS hasReturnRovScore
+                FROM    (
+                            SELECT  dt
+                                    ,hh
+                                    ,vid
+                                    ,is_share
+                                    ,share_cnt
+                                    ,is_return_1
+                                    ,is_return_n
+                                    ,is_return_noself
+                                    ,return_1_uv
+                                    ,return_n_uv
+                                    ,return_n_uv_noself
+                                    ,new_exposure_cnt
+                                    ,score
+                                    ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+                            FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+                            WHERE   dt BETWEEN '${start_dt}' AND '${end_dt}'
+                            AND     hh BETWEEN '${start_hh}' AND '${end_hh}'
+                            AND     apptype = '${apptype}'
+                            --AND     vid IN ('62421458','55931081','62955809','58807530') 
+                            AND     vid IN ('62967014','63159658','62151288')
+                            AND     extend_alg IS NOT NULL
+                            AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+                            AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+                            AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+                            AND     abcode NOT IN ("ab100")
+                        ) 
+                WHERE   GET_JSON_OBJECT(scoresmap,'$.fmRov') IS NOT NULL
+                AND     GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+                AND     GET_JSON_OBJECT(scoresmap,'$.vor') IS NOT NULL
+                AND     GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') IS NOT NULL
+            ) 
+)
+,tab_pre AS 
+(
+    SELECT  dt --,hh
+            ,vid
+            ,COUNT(1) AS cnt
+            ,AVG(score) AS score
+            ,AVG(fmRov) AS fmRov
+            ,AVG(online_fmrov) AS online_fmrov
+            ,AVG(real_fmrov) AS real_fmrov
+            ,AVG(1.22 * pow(NorXGBScore,1.15)) AS NorXGBScore
+            ,AVG(vor) AS vor
+            ,AVG(hasReturnRovScore) AS hasReturnRovScore
+            ,STDDEV(score) AS std_score
+            ,STDDEV(fmRov) AS std_fmRov
+            ,STDDEV(NorXGBScore) AS std_NorXGBScore
+            ,STDDEV(vor) AS std_vor
+            ,STDDEV(hasReturnRovScore) AS std_hasReturnRovScore
+    FROM    tab_base
+    GROUP BY dt -- ,hh
+             ,vid
+)
+,tab_post AS 
+(
+    SELECT  dt --,hh
+            ,vid
+            ,COUNT(1) AS exp
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_share),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / SUM(share_cnt),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_return_1),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_plus_noself
+            ,round(COALESCE(SUM(return_n_uv_noself) / SUM(is_return_noself),0),6) AS ros_minus_noself
+    FROM    tab_base
+    GROUP BY dt -- ,hh
+             ,vid
+)
+SELECT  t1.*
+        ,t2.*
+FROM    tab_post t1
+LEFT JOIN tab_pre t2
+ON      t1.dt = t2.dt --AND     t1.hh = t2.hh
+AND     t1.vid = t2.vid
+ORDER BY t1.dt,t1.vid
+;

+ 124 - 0
req/低.md

@@ -0,0 +1,124 @@
+
+现在有一个推荐场景的问题,
+我的推荐预估的是 str、ros, vor 是统计出来的,最终的排序公式是str * ros * vor = vov
+str用的是 fm 模型,ros用的是 xgb模型,vor 用的是 24 小时的统计量;
+现在有一个问题,最近看到一些头部的 item,vov 低,但是给的曝光量很多,现在要你分析具体的原因;
+我会给你一个 sql:
+模型:
+1. fmRov = str
+2. fmRovOrigin = str采样还原前
+3. NorXGBScore = ros 
+4. vor=vor
+真实
+  1. Str = str-plus-noself
+  2. Ros = ros-minus-noself
+  3. Rov = return-n-uv-noself / exp
+  4. Vov = new-exposure-cnt / exp
+
+exp 是曝光的意思;具体 sql 如下,请你基于以下 sql 来分析具体低vov高曝光的原因是什么?这个问题的影响面有哪些?怎么解决:
+
+WITH tab_base AS 
+(
+    SELECT  *
+            ,((0.059 * fmRovOrigin) / (1 - (1 - 0.059) * fmRovOrigin)) AS online_fmrov
+            ,((0.036 * fmRovOrigin) / (1 - (1 - 0.036) * fmRovOrigin)) AS real_fmrov
+    FROM    (
+                SELECT  dt
+                        ,hh
+                        ,vid
+                        ,is_share
+                        ,share_cnt
+                        ,is_return_1
+                        ,is_return_n
+                        ,is_return_noself
+                        ,return_1_uv
+                        ,return_n_uv
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        ,CAST(score AS DOUBLE) AS score
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS fmRov
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRovOrigin') AS DOUBLE) AS fmRovOrigin
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS NorXGBScore
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.vor') AS DOUBLE) AS vor
+                        ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS hasReturnRovScore
+                FROM    (
+                            SELECT  dt
+                                    ,hh
+                                    ,vid
+                                    ,is_share
+                                    ,share_cnt
+                                    ,is_return_1
+                                    ,is_return_n
+                                    ,is_return_noself
+                                    ,return_1_uv
+                                    ,return_n_uv
+                                    ,return_n_uv_noself
+                                    ,new_exposure_cnt
+                                    ,score
+                                    ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+                            FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+                            WHERE   dt BETWEEN '${start_dt}' AND '${end_dt}'
+                            AND     hh BETWEEN '${start_hh}' AND '${end_hh}'
+                            AND     apptype = '${apptype}'
+                            --AND     vid IN ('62421458','55931081','62955809','58807530') 
+                            AND     vid IN ('62967014','63159658','62151288')
+                            AND     extend_alg IS NOT NULL
+                            AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+                            AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+                            AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+                            AND     abcode NOT IN ("ab100")
+                        ) 
+                WHERE   GET_JSON_OBJECT(scoresmap,'$.fmRov') IS NOT NULL
+                AND     GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') IS NOT NULL
+                AND     GET_JSON_OBJECT(scoresmap,'$.vor') IS NOT NULL
+                AND     GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') IS NOT NULL
+            ) 
+)
+,tab_pre AS 
+(
+    SELECT  dt --,hh
+            ,vid
+            ,COUNT(1) AS cnt
+            ,AVG(score) AS score
+            ,AVG(fmRov) AS fmRov
+            ,AVG(online_fmrov) AS online_fmrov
+            ,AVG(real_fmrov) AS real_fmrov
+            ,AVG(1.22 * pow(NorXGBScore,1.15)) AS NorXGBScore
+            ,AVG(vor) AS vor
+            ,AVG(hasReturnRovScore) AS hasReturnRovScore
+            ,STDDEV(score) AS std_score
+            ,STDDEV(fmRov) AS std_fmRov
+            ,STDDEV(NorXGBScore) AS std_NorXGBScore
+            ,STDDEV(vor) AS std_vor
+            ,STDDEV(hasReturnRovScore) AS std_hasReturnRovScore
+    FROM    tab_base
+    GROUP BY dt -- ,hh
+             ,vid
+)
+,tab_post AS 
+(
+    SELECT  dt --,hh
+            ,vid
+            ,COUNT(1) AS exp
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_share),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / SUM(share_cnt),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_return_1),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_plus_noself
+            ,round(COALESCE(SUM(return_n_uv_noself) / SUM(is_return_noself),0),6) AS ros_minus_noself
+    FROM    tab_base
+    GROUP BY dt -- ,hh
+             ,vid
+)
+SELECT  t1.*
+        ,t2.*
+FROM    tab_post t1
+LEFT JOIN tab_pre t2
+ON      t1.dt = t2.dt --AND     t1.hh = t2.hh
+AND     t1.vid = t2.vid
+ORDER BY t1.dt,t1.vid
+;

+ 347 - 0
tables/loghubods/loghubods.dwd_recsys_alg_exposure_base_20250108.md

@@ -0,0 +1,347 @@
+# dwd_recsys_alg_exposure_base_20250108 表逻辑说明
+
+## 一、表定位
+
+**曝光-分享-回流** 链路分析表,在曝光粒度上统计分享和回流指标。
+
+---
+
+## 二、整体流程(4 步)
+
+```
+数据源 → 分享关联曝光 → 分享关联回流 → 汇总输出
+```
+
+---
+
+## 三、每步详解
+
+### Step 1: 数据准备
+
+| CTE | 数据源 | 说明 |
+|-----|--------|------|
+| `t_return` | `user_share_log_flow` (topic=click) | 回流点击数据 |
+| `t_share_from_sharelog` | `user_share_log_flow` (topic=share) | 分享行为数据 |
+| `t_exposure` | `dwd_recsys_alg_exposure_base_view_20250402` | 曝光数据 |
+
+**贡献字段**:
+- 曝光维度:apptype, uid, mid, vid, sessionid, subsessionid
+- 场景信息:pagesource, recommendlogvo, abcode, recommendpagetype, recomtraceid, headvideoid, rootsourceid, hotsencetype
+- 流量池:flowpool, level
+- 设备信息:clientip, machineinfo_brand/model/system/wechatversion/sdkversion
+- 地理信息:province, city
+- 时间:ts
+
+---
+
+### Step 2: 分享 → 曝光关联
+
+**目的**:找到每次分享对应的曝光记录
+
+**关联方向**:分享 → 曝光(多对一)
+
+**关联 Key(6 级 Fallback)**:
+
+| 级别 | 关联条件 | 说明 |
+|------|---------|------|
+| 1 | apptype + mid + vid + **subsessionid** + pagesource + ts>= | 最严格 |
+| 2 | apptype + mid + vid + **sessionid** + pagesource + ts>= | 放宽会话 |
+| 3 | apptype + mid + vid + **subsessionid** + pagesource | 去掉时间 |
+| 4 | apptype + mid + vid + **sessionid** + pagesource | 去掉时间 |
+| 5 | apptype + mid + vid + **subsessionid** | 去掉 pagesource |
+| 6 | apptype + mid + vid + **sessionid** | 最宽松 |
+
+**必须 Key**:`apptype + mid + vid`
+
+**贡献字段**:
+- `is_share` = 该曝光是否产生分享
+- `share_cnt` = 该曝光产生的分享次数
+
+---
+
+### Step 3: 分享 ← 回流关联
+
+**目的**:统计每次分享带来的回流
+
+**关联方向**:回流 → 分享(多对一)
+
+**关联 Key**:
+
+| 回流类型 | 关联条件 | 说明 |
+|---------|---------|------|
+| 一级回流 | `shareid` + vid + apptype | 直接分享带来的回流 |
+| N级回流 | `rootshareid` + vid + apptype | 裂变链路所有回流 |
+
+**贡献字段**:
+
+| 字段 | 说明 |
+|------|------|
+| `is_return_1` | 是否有一级回流 |
+| `return_1_pv` | 一级回流 PV |
+| `return_1_uv` | 一级回流 UV |
+| `return_1_mids` | 一级回流用户列表 |
+| `is_return_n` | 是否有 N 级回流 |
+| `return_n_pv` | N 级回流 PV |
+| `return_n_uv` | N 级回流 UV |
+| `return_n_mids` | N 级回流用户列表 |
+| `is_return_noself` | 是否有非自己的一级回流 |
+| `return_1_uv_noself` | 排除自己的一级回流 UV |
+| `return_1_mids_noself` | 排除自己的一级回流用户列表 |
+| `is_return_n_noself` | 是否有非自己的 N 级回流 |
+| `return_n_uv_noself` | 排除自己的 N 级回流 UV |
+| `return_n_mids_noself` | 排除自己的 N 级回流用户列表 |
+| `new_exposure_cnt` | 回流带来的新曝光数 |
+
+**注**:`return_n` 包含 `return_1`(rootshareid = shareid 时)
+
+---
+
+### Step 4: 汇总输出
+
+**关联 Key**:`exposure_id`
+
+```sql
+t_exposure
+LEFT JOIN t_share_with_label_group
+ON exposure_id
+```
+
+**派生字段**:
+
+| 字段 | 计算逻辑 |
+|------|---------|
+| `pagesource_new` | pagesource 分类映射(回流后沉浸页、详情页、首页feed等) |
+| `extend` | JSON 扩展字段(animationSceneType, extParams, group_name 等) |
+
+---
+
+## 四、关联关系总图
+
+```
+┌─────────────────────────────────────────────────────────────────────────┐
+│                                                                         │
+│   t_exposure                t_share                    t_return         │
+│   (曝光)                    (分享)                     (回流)           │
+│       │                        │                          │             │
+│       │                        │                          │             │
+│       │◄───────────────────────┤                          │             │
+│       │  Step2: 分享→曝光       │◄─────────────────────────┤             │
+│       │  key: apptype+mid+vid  │  Step3: 回流→分享         │             │
+│       │       +subsession/     │  key: shareid+vid+apptype│             │
+│       │        session         │       rootshareid+vid    │             │
+│       │       +pagesource+ts   │       +apptype           │             │
+│       │                        │                          │             │
+│       ▼                        ▼                          ▼             │
+│  ┌─────────────────────────────────────────────────────────────────┐    │
+│  │                    最终输出字段                                  │    │
+│  ├─────────────────────────────────────────────────────────────────┤    │
+│  │  曝光字段: apptype,uid,mid,vid,sessionid,pagesource...          │    │
+│  │  分享字段: is_share, share_cnt                                  │    │
+│  │  回流字段: is_return_1, return_1_pv/uv/mids                     │    │
+│  │           is_return_n, return_n_pv/uv/mids                      │    │
+│  │           *_noself (排除分享者自己)                              │    │
+│  │           new_exposure_cnt                                      │    │
+│  │  派生字段: pagesource_new, extend                               │    │
+│  └─────────────────────────────────────────────────────────────────┘    │
+│                                                                         │
+└─────────────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## 五、核心 Key 汇总
+
+| 步骤 | 关联 Key | 必须 Key |
+|------|---------|---------|
+| 分享→曝光 | apptype + mid + vid + subsessionid/sessionid + pagesource + ts | apptype + mid + vid |
+| 回流→分享(1级) | shareid + vid + apptype | shareid + vid + apptype |
+| 回流→分享(N级) | rootshareid + vid + apptype | rootshareid + vid + apptype |
+| 汇总→曝光 | exposure_id | exposure_id |
+
+---
+
+## 六、附:曝光→回流关联(new_exposure_cnt 计算)
+
+**目的**:统计每次回流带来多少新曝光
+
+**关联方向**:曝光 → 回流(多对一)
+
+**关联 Key(4 级 Fallback)**:
+
+| 级别 | 关联条件 |
+|------|---------|
+| 1 | mid + headvideoid + subsessionid |
+| 2 | mid + headvideoid + sessionid |
+| 3 | mid + subsessionid |
+| 4 | mid + sessionid |
+
+**必须 Key**:`mid`
+
+**贡献字段**:`new_exposure_cnt`(回流后用户浏览了多少新内容)
+
+---
+
+## 七、重要洞见
+
+### 1. new_exposure_cnt 包含整条裂变链路
+
+`new_exposure_cnt` **包括所有裂变用户的新曝光**,不只是直接回流用户。
+
+```
+A 分享视频 V(shareid_A = rootshareid)
+    │
+    ├─ B 回流,浏览了 5 个视频 → new_exposure_cnt = 5
+    │      │
+    │      └─ B 再分享视频 V
+    │              │
+    │              └─ C 回流,浏览了 3 个视频 → new_exposure_cnt = 3
+    │
+    └─ 最终汇总:SUM(new_exposure_cnt) = 5 + 3 = 8
+```
+
+**原因**:在 `t_share_with_label` 中按 `rootshareid` 分组后 `SUM(new_exposure_cnt)`。
+
+---
+
+### 2. return_n 只统计同一视频的裂变
+
+`return_n_uv` **不包括**用户往下滑分享其他视频带回的人。
+
+```sql
+-- 关联条件
+ON  a.shareid = c.rootshareid
+AND a.vid = c.vid              -- vid 必须匹配
+```
+
+**场景说明**:
+
+| 场景 | 是否计入 return_n |
+|------|------------------|
+| 直接点击分享链接回流看视频 V | ✓ |
+| 裂变用户继续分享**同一视频 V** 带回的人 | ✓ |
+| 裂变用户往下滑分享**其他视频 W** 带回的人 | ✗ |
+
+```
+A 分享视频 V
+    │
+    ├─ B 点击回流看视频 V ─────────────────► return_n ✓
+    │      │
+    │      ├─ B 往下滑分享视频 W
+    │      │      └─ D 点击回流 ──────────► return_n ✗(vid 不同)
+    │      │
+    │      └─ B 再分享视频 V
+    │              └─ C 点击回流 ──────────► return_n ✓
+    │
+    └─ A 的 return_n_uv = B + C(只统计视频 V 的裂变)
+```
+
+**结论**:`return_n` 只追踪**同一视频**的裂变链路,不跨视频统计。
+
+---
+
+## 八、字段说明
+
+### 曝光维度字段
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `apptype` | STRING | 应用类型 |
+| `uid` | STRING | 用户 ID |
+| `mid` | STRING | 设备 ID |
+| `vid` | STRING | 视频 ID |
+| `sessionid` | STRING | 会话 ID |
+| `subsessionid` | STRING | 子会话 ID(更细粒度的会话划分) |
+| `pagesource` | STRING | 页面来源 |
+| `page` | STRING | 页面标识 |
+
+### 推荐算法字段
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `recommendlogvo` | STRING | 推荐算法的返回结果日志 |
+| `abcode` | STRING | 推荐算法的 AB 分组(如 ab0) |
+| `recommendpagetype` | STRING | 区分 pagesource 相同时的场景(三种回流头部、沉浸页下滑、feed下滑) |
+| `recomtraceid` | STRING | 推荐服务追踪 ID(后端调取推荐服务前生成,前端降级或后端异常时可能为空) |
+| `headvideoid` | STRING | 头部视频 ID(用于回流场景关联) |
+| `rootsourceid` | STRING | 区分流量来源(如投流等) |
+| `hotsencetype` | STRING | 热点场景类型 |
+| `flowpool` | STRING | 流量池标识(非流量池为空字符串,无 null) |
+| `level` | STRING | 流量池层级(非流量池为 null) |
+
+### 设备信息字段
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `clientip` | STRING | 客户端 IP |
+| `machineinfo_brand` | STRING | 设备品牌 |
+| `machineinfo_model` | STRING | 设备型号 |
+| `machineinfo_system` | STRING | 操作系统 |
+| `machineinfo_wechatversion` | STRING | 微信版本 |
+| `machineinfo_sdkversion` | STRING | SDK 版本 |
+| `province` | STRING | 省份 |
+| `city` | STRING | 城市 |
+
+### 时间字段
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `ts` | STRING | 曝光时间戳 |
+| `dt` | STRING | 分区字段:日期(格式:20240105) |
+| `hh` | STRING | 分区字段:小时(格式:04) |
+
+### 分享指标字段
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `is_share` | STRING | 该曝光是否产生分享(1/0) |
+| `share_cnt` | STRING | 该曝光产生的分享次数 |
+
+### 一级回流字段(直接回流)
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `is_return_1` | STRING | 该曝光的分享是否带来一级回流(1/0) |
+| `return_1_pv` | STRING | 一级回流 PV(点击次数) |
+| `return_1_uv` | STRING | 一级回流 UV(回流人数) |
+| `return_1_mids` | STRING | 一级回流用户的 mid 列表 |
+| `is_return_noself` | STRING | 是否有非自己的一级回流(1/0) |
+| `return_1_uv_noself` | STRING | 排除分享者自己的一级回流 UV |
+| `return_1_mids_noself` | STRING | 排除分享者自己的一级回流 mid 列表 |
+
+### N级回流字段(裂变回流)
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `is_return_n` | STRING | 该曝光的分享是否带来 N 级回流(1/0) |
+| `return_n_pv` | STRING | N 级回流 PV(整条裂变链的点击次数) |
+| `return_n_uv` | STRING | N 级回流 UV(整条裂变链的回流人数) |
+| `return_n_mids` | STRING | N 级回流用户的 mid 列表 |
+| `is_return_n_noself` | STRING | 是否有非自己的 N 级回流(1/0) |
+| `return_n_uv_noself` | STRING | 排除分享者自己的 N 级回流 UV |
+| `return_n_mids_noself` | STRING | 排除分享者自己的 N 级回流 mid 列表 |
+
+### 新曝光指标
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `new_exposure_cnt` | STRING | 回流带来的新曝光数(整条裂变链所有用户浏览的内容总数) |
+
+### 扩展字段
+
+| 字段 | 类型 | 说明 |
+|------|------|------|
+| `extend` | STRING | JSON 扩展字段(包含 animationSceneType, extParams, group_name 等) |
+
+---
+
+## 九、关键字段关系
+
+```
+return_1 ⊂ return_n
+├── return_1:只统计直接点击分享链接的回流(by shareid)
+└── return_n:统计整条裂变链的回流(by rootshareid),包含 return_1
+
+*_noself 系列:从对应指标中排除分享者自己的回流
+├── return_1_uv_noself = return_1_uv - (分享者自己点击)
+└── return_n_uv_noself = return_n_uv - (分享者自己在裂变链中的点击)
+```

+ 68 - 0
tables/loghubods/user_share_log.txt

@@ -0,0 +1,68 @@
+表名: loghubods.user_share_log
+注释: *
+创建时间: 2019-07-06 18:03:41
+最后修改: 2026-01-22 00:11:48
+
+============================================================
+字段名                            类型              注释
+============================================================
+topic                          string          null
+machinecode                    string          null
+apptype                        string          null
+pagesource                     string          null
+shareid                        string          null
+shareobjectid                  string          null
+type                           string          null
+clienttimestamp                string          null
+parentshareid                  string          null
+versioncode                    string          null
+pagecategoryid                 string          null
+rootshareid                    string          null
+rootpagecategoryid             string          null
+sharedepth                     string          null
+rootpagesource                 string          null
+sessionid                      string          
+returnid                       string          
+rootlaunchshareid              string          
+subsessionid                   string          
+sharebuttontype                string          
+rootjumphomevideoid            string          
+rootpagetimestamp              string          
+abinfodata                     string          
+playid                         string          
+jumphomevideoid                string          
+eventid                        string          
+loginuid                       string          
+eventids                       string          实验组分组
+parenteventids                 string          
+parentrootpagesource           string          
+eventinfos                     string          
+clickobjectid                  string          
+clientip                       string          
+sharetitle                     string          
+shareimageurl                  string          
+rooteventinfos                 string          
+rootapptype                    string          
+wxconfigerr                    string          
+sharetitleid                   string          
+recomtraceid                   string          
+shareimgid                     string          
+isfeedcom                      string          
+rootsharemid                   string          
+rootsourceid                   string          
+rootsessionid                  string          
+opengid                        string          
+sequence                       string          
+usersharedepth                 string          
+progress                       string          
+groupshare                     string          
+sencetype                      string          
+hotsencetype                   string          
+ghid                           string          
+expstrategy                    string          
+creativeid                     string          
+dt                             string          
+
+分区字段:
+------------------------------------------------------------
+dt                             string          

+ 50 - 0
tables/videoods/dim_user.txt

@@ -0,0 +1,50 @@
+表名: videoods.dim_user
+注释: (无)
+创建时间: 2020-06-08 16:44:22
+最后修改: 2026-01-23 03:29:45
+
+============================================================
+字段名                            类型              注释
+============================================================
+uid                            bigint          
+mids                           string          设备唯一标识
+nick_name                      string          微信昵称
+longvideo_nick_name            string          小程序昵称
+gender                         string          性别
+user_type                      string          用户身份
+phone_number                   string          联系方式
+gmt_create                     datetime        创建时间
+gmt_create_timestamp           bigint          创建时间戳
+tags                           string          用户内容标签
+category_name                  string          场景
+isvip                          string          是否开通vip
+isreward                       string          是否开通赞赏
+isad                           string          是否开通广告
+isgood                         string          是否开通商品权限
+first_up_datetime              string          首次上传时间
+last_up_datetime               string          最后一次上传时间
+next_to_last_up_datetime       string          倒数第二次上传时间
+videos                         bigint          上传视频数
+today_videos                   bigint          今日上传视频数
+idols                          bigint          关注的人数
+fans                           bigint          粉丝数
+play_count                     bigint          累计播放人数
+play_count_total               bigint          累计播放次数
+total_reward                   double          赞赏总金额
+currentday_reward              double          当日赞赏金额
+reward_person                  bigint          赞赏人数
+total_reward_times             bigint          赞赏次数
+reward_videos                  bigint          赞赏视频数
+total_price                    bigint          付费总金额
+currentday_price               bigint          当日付费金额
+total_price_times              bigint          付费次数
+total_price_person             bigint          付费人数
+total_price_videos             bigint          付费视频数
+cgrain_user_type               string          粗粒度身份
+identity_tagname               string          用户身份标签
+operation_tags                 string          用户运营标签
+identity_tag_id                bigint          用户身份标签号
+identity_create_time           datetime        用户身份标签创建时间
+country                        string          国家
+province                       string          省份
+city                           string          市

+ 55 - 0
tables/videoods/wx_video.txt

@@ -0,0 +1,55 @@
+表名: videoods.wx_video
+注释: 视频表
+创建时间: 2019-07-02 19:43:22
+最后修改: 2026-01-23 00:30:45
+
+============================================================
+字段名                            类型              注释
+============================================================
+id                             bigint          主键编号,取值来源为redis分布式主键
+uid                            bigint          用户编号,用户信息表中的uid字段
+title                          string          标题
+video_path                     string          视频地址
+cover_img_path                 string          封面图片地址
+self_cover_img_path            string          自定义封面图片地址
+share_moment_img_path          string          分享到朋友圈的图片保存地址
+qrimg_path                     string          pc端生成二维码的保存路径
+width                          bigint          
+height                         bigint          
+cover_img_width                bigint          
+cover_img_height               bigint          
+play_count                     bigint          播放次数,去重
+play_count_total               bigint          被播放总次数,不去重
+share_count                    bigint          分享次数,去重
+share_count_total              bigint          被分享到朋友圈总次数,不去重
+reported_count                 bigint          被举报次数
+share_count_friend             bigint          微信分享给朋友的次数,不去重
+share_count_friend_total       bigint          被分享给微信好友的总次数,不去重
+favoriteds                     bigint          视频被收藏的次数
+total_time                     bigint          视频时长
+rotate                         string          
+bit_rate                       bigint          比率
+transcode_status               bigint          转码状态(1:发送转码失败,2:转码中,3:转码完成,4:转码失败)
+transcode_done_datetime        datetime        转码完成时间
+request_id                     string          
+job_id                         string          
+transed_video_path             string          
+gmt_create                     datetime        创建时间
+changed_by                     bigint          由谁修改
+gmt_modified                   datetime        最后修改时间
+gmt_create_timestamp           bigint          创建时间戳,用来排序和分页查询
+gmt_modified_timestamp         bigint          最后修改时间戳,用来排序和分页查询
+version                        bigint          数据版本号,用来做版本控制和乐观锁
+status                         bigint          数据状态,1有效,2 已删除,3 已屏蔽,4关注可见,5分享可见,6自己可见
+system                         string          发送视频时的操作系统
+file_extensions                string          视频后缀名
+examine_status                 bigint          审核状态(0:上传未审,1:上传已审)
+content_md5                    string          原视频的md5
+size                           bigint          原文件大小
+code_name                      string          原文件编码格式
+video_collection_id            bigint          用户的视频集编号
+recommend_status               bigint          推荐状态(0:未推荐,-6:待推荐,1:普通推荐,10:编辑推荐,-7:可搜索)
+tag_count                      bigint          标签个数
+stage_recommend_examine_status bigint          待推荐审核状态(0:待推荐未审,1:待推荐已审)
+sensitive_status               bigint          内容敏感状态(0:未检验,1:不敏感,2:敏感,3:敏感已审)
+is_foreogn_bucket              bigint          是否是存放在境外bucket,针对境外用户上传后转码前的地址

+ 6 - 0
tasks/00_AB效果/01_推荐AB实时效果.json

@@ -0,0 +1,6 @@
+{
+  "token": "ONZqsxB9BhGH8tt90EScSJT5nHh",
+  "sheet_id": "zxVjf5",
+  "sort": "dt:desc",
+  "cols": null
+}

+ 86 - 0
tasks/00_AB效果/01_推荐AB实时效果.sql

@@ -0,0 +1,86 @@
+-- Real-time A/B-test effect report for the recommendation feed.
+-- Maps raw ab buckets to named experiment arms, collapses pages into
+-- 推荐/非推荐, then aggregates share/return/exposure metrics per
+-- (dt, apptype, abcode[, page]) via GROUPING SETS.
+WITH t_base AS 
+(
+    SELECT  dt
+            ,apptype 
+            -- ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+            --         WHEN apptype IN ("4") AND abcode IN ("ab4","ab5","ab6","ab7","ab8","ab9") THEN "实验组-str+校准"
+            --         WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+            --         WHEN apptype IN ("0") AND abcode IN ("ab0","ab1","ab4","ab5","ab6","ab7","ab8","ab9") THEN "实验组-str+校准"
+            --         WHEN apptype IN ("0") AND abcode IN ("ab2","ab3") THEN "对照组"
+            --         ELSE "其他"
+            -- END AS abcode
+            -- Current arm mapping; the commented variants above/below are
+            -- earlier experiment layouts kept for reference.
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准&ros损失函数优化"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "实验组-str+校准&ros天级更新"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    ELSE "其他"
+            END AS abcode
+            -- ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+            --         WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+            --         WHEN apptype IN ("4") AND abcode IN ("ab2","ab3","ab4","ab5","ab6","ab7") THEN "对照组"
+            --         ELSE "其他"
+            -- END AS abcode
+            -- Collapse raw page names into 推荐 / 非推荐 buckets.
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt = '${dt}'
+    -- NOTE(review): hh is compared as strings; hourly partitions normally
+    -- stop at "23", so "24" presumably means "through end of day" — confirm.
+    AND     hh BETWEEN "16" AND "24"
+    AND     apptype IN ("4")
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    -- Redundant given the IN list above; kept as a safety net.
+    AND     abcode NOT IN ("ab100")
+)
+SELECT  dt
+        ,COALESCE(apptype,"sum") AS apptype
+        ,COALESCE(abcode,"sum") AS abcode
+        ,COALESCE(page,"sum") AS page
+        -- Ratio metrics (rounded); denominators of 0 surface as NULL->0 via COALESCE.
+        ,round(COALESCE(COUNT(1) / COUNT(DISTINCT mid),0),2) AS exp_per_dau
+        ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+        ,round(COALESCE(SUM(return_n_uv) / SUM(is_share),0),6) AS ros_one
+        ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+        ,round(COALESCE(SUM(return_n_uv) / SUM(share_cnt),0),6) AS ros
+        ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+        ,round(COALESCE(SUM(return_n_uv) / SUM(is_return_1),0),6) AS ros_minus
+        ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+        ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+        -- Raw counts backing the ratios above.
+        ,COUNT(DISTINCT mid) AS dau
+        ,COUNT(1) AS exp
+        ,COALESCE(SUM(is_share),0) AS is_share
+        ,COALESCE(SUM(share_cnt),0) AS share_cnt
+        ,COALESCE(SUM(is_return_1),0) AS is_return_1
+        ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+        ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+        ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself -- ,MAX(CAST(COALESCE(share_cnt,'0') AS BIGINT)) AS max_share_cnt
+        -- ,MAX(CAST(COALESCE(return_1_uv,'0') AS BIGINT)) AS max_return_1_uv
+        -- ,MAX(CAST(COALESCE(return_n_uv,'0') AS BIGINT)) AS max_return_n_uv
+        -- ,MAX(CAST(COALESCE(return_n_uv_noself,'0') AS BIGINT)) AS max_return_n_uv_noself
+        -- ,COALESCE(SUM(is_return_noself),0) AS is_return_noself
+        -- ,COALESCE(SUM(return_1_uv),0) AS return_1_uv
+        -- ,COUNT(DISTINCT vid) AS exp_vid_cnt
+        -- ,COUNT(DISTINCT CASE    WHEN is_share = '1' THEN vid ELSE NULL END) AS share_vid_cnt
+        -- ,COUNT(DISTINCT CASE    WHEN is_return_n = '1' THEN vid ELSE NULL END) AS return_vid_cnt
+FROM    t_base
+-- Keep only recommendation-page exposures in the final aggregation.
+where page in ("推荐")
+GROUP BY dt
+         ,apptype
+         ,abcode
+         ,page
+-- Two grain levels: per-arm rollup (page collapsed to "sum") and per-arm-per-page.
+GROUPING SETS ((dt,apptype,abcode)
+              ,(dt,apptype,abcode,page))
+ORDER BY dt DESC,apptype,page,abcode
+;

+ 90 - 0
tasks/00_表的洞察/loghubods.user_share_log/export_neo4j.py

@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+将分享数据转换为 Neo4j 导入格式
+
+图模型: (User) -[:SHARED {vid, ts}]-> (User)
+
+用法:
+    python export_neo4j.py output/05_图数据/20260111.csv
+    python export_neo4j.py output/05_图数据/*.csv  # 多文件
+"""
+
+import sys
+import csv
+from pathlib import Path
+
+def export_neo4j(input_files, output_dir):
+    """Convert share-log CSVs into neo4j-admin bulk-import files.
+
+    Reads rows with columns ``from_mid``/``vid``/``target_mid``/``ts`` and
+    writes three artifacts into *output_dir*: ``users.csv`` (User nodes,
+    deduplicated), ``shared.csv`` (SHARED relationships, not deduplicated)
+    and ``import_cmd.sh`` (the neo4j-admin command to run offline).
+
+    Args:
+        input_files: iterable of CSV paths to read.
+        output_dir: target directory; created if missing.
+    """
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    users = set()
+    relationships = []
+
+    # Read every input file; collect unique users and all share edges.
+    for input_file in input_files:
+        print(f"读取: {input_file}")
+        with open(input_file, 'r', encoding='utf-8') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                from_mid = row['from_mid']
+                vid = row['vid']
+                target_mid = row['target_mid']
+                ts = row['ts']
+
+                users.add(from_mid)
+                users.add(target_mid)
+                relationships.append((from_mid, target_mid, vid, ts))
+
+    print(f"用户数: {len(users)}, 分享关系数: {len(relationships)}")
+
+    # 1. Export user nodes (neo4j-admin import header format).
+    #    NOTE: set iteration order is arbitrary, so node row order varies
+    #    between runs; neo4j-admin does not require a stable order.
+    users_file = output_dir / 'users.csv'
+    with open(users_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow(['mid:ID', ':LABEL'])
+        for mid in users:
+            writer.writerow([mid, 'User'])
+    print(f"写入: {users_file}")
+
+    # 2. Export SHARED relationships (neo4j-admin import header format).
+    rels_file = output_dir / 'shared.csv'
+    with open(rels_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow([':START_ID', ':END_ID', 'vid', 'ts:long', ':TYPE'])
+        for from_mid, target_mid, vid, ts in relationships:
+            writer.writerow([from_mid, target_mid, vid, ts, 'SHARED'])
+    print(f"写入: {rels_file}")
+
+    # 3. Emit the offline import command (run with the database stopped).
+    cmd = f"""# Neo4j 导入命令 (停止数据库后执行):
+neo4j-admin database import full \\
+    --nodes={users_file.absolute()} \\
+    --relationships={rels_file.absolute()} \\
+    --overwrite-destination \\
+    neo4j
+"""
+    print(cmd)
+
+    cmd_file = output_dir / 'import_cmd.sh'
+    with open(cmd_file, 'w') as f:
+        f.write(cmd.strip())
+    print(f"写入: {cmd_file}")
+
+if __name__ == '__main__':
+    import glob
+
+    if len(sys.argv) < 2:
+        print("用法: python export_neo4j.py <csv文件或通配符>")
+        sys.exit(1)
+
+    # Expand each argument as a glob pattern; a literal path matches itself,
+    # so both plain filenames and unexpanded wildcards work.
+    input_files = []
+    for pattern in sys.argv[1:]:
+        input_files.extend(glob.glob(pattern))
+
+    if not input_files:
+        print(f"找不到文件: {sys.argv[1:]}")
+        sys.exit(1)
+
+    # Write outputs into a 'neo4j' subdirectory next to the first input file.
+    output_dir = Path(input_files[0]).parent / 'neo4j'
+    export_neo4j(input_files, output_dir)

+ 256 - 0
tasks/承接/rosn分析/05_实验组xTop20视频_vs对照组_vor.config

@@ -0,0 +1,256 @@
+-- Preprocessing: parse scoresMap + classify pages.
+-- v5: adds the vor statistic plus score_pred/score_stat/score_real.
+-- Ranking formula: str * ros * vor
+WITH t_raw AS
+(
+    SELECT  *
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    AND     abcode NOT IN ("ab100")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Filter: keep recommendation pages only.
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Feature extraction and dimension mapping (arm names per abcode).
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- Power-law calibration of the model score; keep the constants
+            -- 1.22 / 1.15 in sync with plot_calibration.py.
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.vor') AS DOUBLE) AS vor_stat
+            ,GET_JSON_OBJECT(v1_feature,'$.title') AS vid_title
+    FROM    t_filtered
+)
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred IS NOT NULL
+)
+-- Top-20 vids by exposure count within each (dt, apptype, abcode).
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+-- NOTE(review): names below still say "top5" but the cutoff is 20.
+,t_top5_vid AS
+(
+    SELECT  dt, apptype, abcode, vid, vid_rank
+    FROM    t_vid_rank
+    WHERE   vid_rank <= 20
+)
+-- Tag each exposure row with its top-20 vid (NULL when not in top-20).
+,t_with_top5 AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid ELSE NULL END AS top5_vid
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid_title ELSE NULL END AS top5_vid_title
+            ,b.vid_rank AS top5_vid_rank
+    FROM    t_valid a
+    LEFT JOIN t_top5_vid b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.vid = b.vid
+)
+-- First-stage aggregation: per-arm totals plus per-top-20-vid slices.
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            ,COALESCE(CAST(top5_vid AS STRING), 'all') AS vid
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_rank) END AS vid_rank
+            -- COPC (observed / predicted click-through-style calibration ratios)
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_pred) / COUNT(1), 0), 4) AS rosn_copc
+            ,round((SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)) / NULLIF(SUM(rosn_stat) / COUNT(1), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions vs. observed values
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(COALESCE(SUM(rosn_pred) / COUNT(1),0),6) AS rosn_pred
+            ,round(COALESCE(SUM(rosn_stat) / COUNT(1),0),6) AS rosn_stat
+            ,round(COALESCE(SUM(vor_stat) / COUNT(1),0),6) AS vor_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- score: str * ros * vor
+            -- NOTE(review): score_pred omits vor (identical to rovn_pred),
+            -- while score_stat includes vor_stat — confirm intentional.
+            ,round(AVG(str_pred * rosn_pred), 6) AS score_pred
+            ,round(AVG(str_pred * rosn_stat * vor_stat), 6) AS score_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS score_real
+            -- Error metrics
+            ,round(AVG(ABS(rosn_pred - return_n_uv_noself)),6) AS rosn_pred_mae
+            ,round(AVG(ABS(rosn_stat - return_n_uv_noself)),6) AS rosn_stat_mae
+            -- Business metrics
+            ,round(COALESCE(COUNT(1) / COUNT(DISTINCT mid),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_share),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / SUM(share_cnt),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_return_1),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top5
+    GROUP BY dt, apptype, abcode, top5_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top5_vid)
+    )
+    -- Drop per-vid rows for non-top-20 vids; keep the arm-level rollup.
+    HAVING  top5_vid IS NOT NULL OR GROUPING(top5_vid) = 1
+)
+-- New: fetch control-group baselines (window over the 对照组 rows) and
+-- compute change rates against them.
+,t_with_baseline AS
+(
+    SELECT  *
+            -- Exposure share of this vid within its arm ('all' row = 1.0)
+            ,round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+            -- Control-group baselines (business metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp_per_dau END) OVER (PARTITION BY dt, apptype, vid) AS exp_per_dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_one END) OVER (PARTITION BY dt, apptype, vid) AS str_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_one END) OVER (PARTITION BY dt, apptype, vid) AS ros_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str END) OVER (PARTITION BY dt, apptype, vid) AS str_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros END) OVER (PARTITION BY dt, apptype, vid) AS ros_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_plus END) OVER (PARTITION BY dt, apptype, vid) AS str_plus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_minus END) OVER (PARTITION BY dt, apptype, vid) AS ros_minus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn END) OVER (PARTITION BY dt, apptype, vid) AS rovn_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN vovh24 END) OVER (PARTITION BY dt, apptype, vid) AS vovh24_base
+            -- Control-group baselines (COPC metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_copc END) OVER (PARTITION BY dt, apptype, vid) AS str_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_stat_copc_base
+            -- Control-group baselines (observed values)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_real END) OVER (PARTITION BY dt, apptype, vid) AS str_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_real END) OVER (PARTITION BY dt, apptype, vid) AS rosn_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_real END) OVER (PARTITION BY dt, apptype, vid) AS rovn_real_base
+            -- Control-group baselines (vor and score)
+            ,MAX(CASE WHEN abcode = '对照组' THEN vor_stat END) OVER (PARTITION BY dt, apptype, vid) AS vor_stat_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN score_pred END) OVER (PARTITION BY dt, apptype, vid) AS score_pred_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN score_stat END) OVER (PARTITION BY dt, apptype, vid) AS score_stat_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN score_real END) OVER (PARTITION BY dt, apptype, vid) AS score_real_base
+            -- Control-group baselines (count metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN dau END) OVER (PARTITION BY dt, apptype, vid) AS dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp END) OVER (PARTITION BY dt, apptype, vid) AS exp_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_share END) OVER (PARTITION BY dt, apptype, vid) AS is_share_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN share_cnt END) OVER (PARTITION BY dt, apptype, vid) AS share_cnt_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_return_1 END) OVER (PARTITION BY dt, apptype, vid) AS is_return_1_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN viewh24 END) OVER (PARTITION BY dt, apptype, vid) AS viewh24_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv_noself END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_noself_base
+    FROM    t_agg
+)
+-- Final output: original fields + change rates vs. control group.
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,vid
+        ,vid_title
+        ,vid_rank
+        ,exp_pct
+        ,round((dau - dau_base) / NULLIF(dau_base, 0), 4) AS dau_chg
+        ,round((exp - exp_base) / NULLIF(exp_base, 0), 4) AS exp_chg
+        -- COPC
+        ,str_copc, rosn_copc, rosn_stat_copc, rovn_copc, rovn_stat_copc
+        -- Model predictions vs. observed values
+        ,str_real, str_pred, rosn_real, rosn_pred, rosn_stat, vor_stat
+        ,rovn_real, rovn_pred, rovn_stat
+        -- score: str * ros * vor
+        ,score_pred, score_stat, score_real
+        ,rosn_pred_mae, rosn_stat_mae
+        -- Business metrics
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        -- Counts
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+        -- ========== change-rate fields ==========
+        -- Business-metric change rates
+        ,round((exp_per_dau - exp_per_dau_base) / NULLIF(exp_per_dau_base, 0), 4) AS exp_per_dau_chg
+        ,round((str_one - str_one_base) / NULLIF(str_one_base, 0), 4) AS str_one_chg
+        ,round((ros_one - ros_one_base) / NULLIF(ros_one_base, 0), 4) AS ros_one_chg
+        ,round((str - str_base) / NULLIF(str_base, 0), 4) AS str_chg
+        ,round((ros - ros_base) / NULLIF(ros_base, 0), 4) AS ros_chg
+        ,round((str_plus - str_plus_base) / NULLIF(str_plus_base, 0), 4) AS str_plus_chg
+        ,round((ros_minus - ros_minus_base) / NULLIF(ros_minus_base, 0), 4) AS ros_minus_chg
+        ,round((rovn - rovn_base) / NULLIF(rovn_base, 0), 4) AS rovn_chg
+        ,round((vovh24 - vovh24_base) / NULLIF(vovh24_base, 0), 4) AS vovh24_chg
+        -- COPC change rates
+        ,round((str_copc - str_copc_base) / NULLIF(str_copc_base, 0), 4) AS str_copc_chg
+        ,round((rosn_copc - rosn_copc_base) / NULLIF(rosn_copc_base, 0), 4) AS rosn_copc_chg
+        ,round((rosn_stat_copc - rosn_stat_copc_base) / NULLIF(rosn_stat_copc_base, 0), 4) AS rosn_stat_copc_chg
+        ,round((rovn_copc - rovn_copc_base) / NULLIF(rovn_copc_base, 0), 4) AS rovn_copc_chg
+        ,round((rovn_stat_copc - rovn_stat_copc_base) / NULLIF(rovn_stat_copc_base, 0), 4) AS rovn_stat_copc_chg
+        -- Observed-value change rates
+        ,round((str_real - str_real_base) / NULLIF(str_real_base, 0), 4) AS str_real_chg
+        ,round((rosn_real - rosn_real_base) / NULLIF(rosn_real_base, 0), 4) AS rosn_real_chg
+        ,round((rovn_real - rovn_real_base) / NULLIF(rovn_real_base, 0), 4) AS rovn_real_chg
+        -- vor and score change rates
+        ,round((vor_stat - vor_stat_base) / NULLIF(vor_stat_base, 0), 4) AS vor_stat_chg
+        ,round((score_pred - score_pred_base) / NULLIF(score_pred_base, 0), 4) AS score_pred_chg
+        ,round((score_stat - score_stat_base) / NULLIF(score_stat_base, 0), 4) AS score_stat_chg
+        ,round((score_real - score_real_base) / NULLIF(score_real_base, 0), 4) AS score_real_chg
+        -- Count-metric change rates
+        ,round((is_share - is_share_base) / NULLIF(is_share_base, 0), 4) AS is_share_chg
+        ,round((share_cnt - share_cnt_base) / NULLIF(share_cnt_base, 0), 4) AS share_cnt_chg
+        ,round((is_return_1 - is_return_1_base) / NULLIF(is_return_1_base, 0), 4) AS is_return_1_chg
+        ,round((return_n_uv - return_n_uv_base) / NULLIF(return_n_uv_base, 0), 4) AS return_n_uv_chg
+        ,round((viewh24 - viewh24_base) / NULLIF(viewh24_base, 0), 4) AS viewh24_chg
+        ,round((return_n_uv_noself - return_n_uv_noself_base) / NULLIF(return_n_uv_noself_base, 0), 4) AS return_n_uv_noself_chg
+FROM    t_with_baseline
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 391 - 0
tasks/承接/rosn校准/plot_calibration.py

@@ -0,0 +1,391 @@
+"""
+校准曲线可视化:读取 output/02_分组校准数据 下的 CSV,生成交互式 HTML。
+
+用法:
+    python tasks/承接/rosn校准/plot_calibration.py
+"""
+
+import os
+import json
+import glob
+import pandas as pd
+from collections import defaultdict
+
+DATA_DIRS = [
+    os.path.join(os.path.dirname(__file__), "output", "02_分组校准数据"),
+    os.path.join(os.path.dirname(__file__), "output", "03_label分桶校准数据"),
+]
+OUT_HTML = os.path.join(os.path.dirname(__file__), "output", "calibration.html")
+
+
+def load_data():
+    """Load and concatenate every calibration CSV under DATA_DIRS.
+
+    Returns:
+        pandas.DataFrame of all rows, with ``dt`` coerced to str so date
+        partitions compare/group consistently.
+
+    Raises:
+        FileNotFoundError: when no CSV file exists in any data dir.
+    """
+    files = []
+    for d in DATA_DIRS:
+        files.extend(sorted(glob.glob(os.path.join(d, "*.csv"))))
+    if not files:
+        raise FileNotFoundError(f"No CSV files found in {DATA_DIRS}")
+    df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
+    df["dt"] = df["dt"].astype(str)
+    return df
+
+
+def build_charts_data(df):
+    """
+    Flatten the DataFrame into a nested mapping:
+    ALL_DATA[apptype][bucket_type][group_key][dt] = [ {bucket_id, predict, stat, label, ...}, ... ]
+
+    Each leaf list is sorted by bucket_id; ``pred_校准`` is the power-law
+    calibrated prediction 1.22 * predict ** 1.15 — keep these constants in
+    sync with the SQL that applies the same calibration.
+    """
+    all_data = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list))))
+    for _, row in df.iterrows():
+        at = str(row["apptype"])
+        bt = row["bucket_type"]
+        gk = row["group_key"]
+        dt = row["dt"]
+        predict = float(row["predict"])
+        all_data[at][bt][gk][dt].append({
+            "bucket_id": int(row["bucket_id"]),
+            "predict": predict,
+            "pred_校准": round(1.22 * predict ** 1.15, 6),
+            # stat may be missing in some source files; store None then.
+            "stat": float(row["stat"]) if pd.notna(row.get("stat")) else None,
+            "label": float(row["label"]),
+            "cnt": int(row["cnt"]),
+            "range_begin": float(row["range_begin"]),
+            "range_end": float(row["range_end"]),
+        })
+    # Sort each day's rows by bucket_id so chart x-axes are monotonic.
+    for at in all_data:
+        for bt in all_data[at]:
+            for gk in all_data[at][bt]:
+                for dt in all_data[at][bt][gk]:
+                    all_data[at][bt][gk][dt].sort(key=lambda r: r["bucket_id"])
+    return all_data
+
+
+def render_html(all_data, all_dts):
+    """生成单图 ECharts HTML — 动态配色,5 项筛选"""
+
+    # 收集维度值
+    all_apptypes = sorted(all_data.keys())
+    all_bucket_types = sorted({bt for at in all_data for bt in all_data[at]})
+    all_group_keys = sorted({gk for at in all_data for bt in all_data[at] for gk in all_data[at][bt]})
+
+    # apptype 标签映射
+    apptype_labels = {}
+    for at in all_apptypes:
+        apptype_labels[at] = "视频号" if at == "4" else f"apptype={at}"
+
+    # ---- 序列化数据 ----
+    js_all_data = {}
+    for at in all_data:
+        js_all_data[at] = {}
+        for bt in all_data[at]:
+            js_all_data[at][bt] = {}
+            for gk in all_data[at][bt]:
+                js_all_data[at][bt][gk] = {}
+                for dt, rows in all_data[at][bt][gk].items():
+                    js_all_data[at][bt][gk][dt] = rows
+
+    # ---- 构建筛选控件 HTML ----
+    dt_options = ''.join(
+        f'<option value="{dt}">{dt}</option>' for dt in sorted(all_dts, reverse=True)
+    )
+    gk_checkboxes = ''.join(
+        '<label class="cb-item">'
+        f'<input type="checkbox" value="{gk}" {"checked" if gk == "对照组" else ""} onchange="renderChart()">'
+        f'{gk}</label>'
+        for gk in all_group_keys
+    )
+    bt_checkboxes = ''.join(
+        f'<label class="cb-item"><input type="checkbox" value="{bt}" checked onchange="renderChart()">{bt}</label>'
+        for bt in all_bucket_types
+    )
+    at_checkboxes = ''.join(
+        '<label class="cb-item">'
+        f'<input type="checkbox" value="{at}" checked onchange="renderChart()">'
+        f'{apptype_labels[at]}</label>'
+        for at in all_apptypes
+    )
+
+    html = f"""<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8">
+<title>校准曲线</title>
+<script src="https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"></script>
+<style>
+    body {{ font-family: -apple-system, sans-serif; margin: 20px; background: #fafafa; }}
+    .filter-row {{ display: flex; align-items: center; gap: 12px; margin-bottom: 12px; }}
+    .filter-row > label {{ font-weight: 600; min-width: 56px; }}
+    select {{ padding: 4px 8px; border-radius: 4px; border: 1px solid #ccc; }}
+    .checkbox-group {{ display: flex; flex-wrap: wrap; gap: 6px 14px; }}
+    .cb-item {{ display: inline-flex; align-items: center; gap: 4px; cursor: pointer; font-size: 13px; }}
+    .cb-line {{ display: inline-block; width: 18px; height: 0; margin-bottom: 1px; }}
+    #chart-container {{ background: #fff; border-radius: 8px; padding: 16px; box-shadow: 0 1px 3px rgba(0,0,0,.1); margin-top: 16px; }}
+    #mae-info {{ background: #fff; border-radius: 8px; padding: 16px; box-shadow: 0 1px 3px rgba(0,0,0,.1); margin-top: 12px; font-size: 13px; line-height: 1.6; }}
+    #mae-info table {{ font-size: 12px; font-variant-numeric: tabular-nums; }}
+    #mae-info th {{ font-weight: 600; white-space: nowrap; }}
+    #mae-info td {{ white-space: nowrap; }}
+</style>
+</head>
+<body>
+<h2>校准曲线</h2>
+
+<div class="filter-row">
+    <label>dt:</label>
+    <select id="sel_dt" onchange="renderChart()">{dt_options}</select>
+</div>
+<div class="filter-row">
+    <label>分组:</label>
+    <span id="gk_checks" class="checkbox-group">{gk_checkboxes}</span>
+</div>
+<div class="filter-row">
+    <label>指标:</label>
+    <span id="metric_checks" class="checkbox-group">
+        <label class="cb-item"><input type="checkbox" value="predict" checked onchange="renderChart()"><span class="cb-line" style="border-top:2px solid #333"></span>predict</label>
+        <label class="cb-item"><input type="checkbox" value="pred_校准" checked onchange="renderChart()"><span class="cb-line" style="border-top:2px dashed #333"></span>pred_校准</label>
+        <label class="cb-item"><input type="checkbox" value="stat" checked onchange="renderChart()"><span class="cb-line" style="border-top:2px dashed #333"></span>stat</label>
+        <label class="cb-item"><input type="checkbox" value="label" checked onchange="renderChart()"><span class="cb-line" style="border-top:2px dotted #333"></span>label</label>
+    </span>
+</div>
+<div class="filter-row">
+    <label>分桶:</label>
+    <span id="bt_checks" class="checkbox-group">{bt_checkboxes}</span>
+</div>
+<div class="filter-row">
+    <label>apptype:</label>
+    <span id="at_checks" class="checkbox-group">{at_checkboxes}</span>
+</div>
+
+<div id="chart-container">
+    <div id="main-chart" style="width:100%;height:600px;"></div>
+</div>
+<div id="mae-info"></div>
+
+<script>
+var ALL_DATA = {json.dumps(js_all_data, ensure_ascii=False)};
+var APPTYPE_LABELS = {json.dumps(apptype_labels, ensure_ascii=False)};
+
+var PALETTE = [
+    '#3b82f6', '#f97316', '#22c55e', '#ef4444', '#a855f7',
+    '#06b6d4', '#f59e0b', '#ec4899', '#64748b', '#84cc16',
+    '#0ea5e9', '#d946ef', '#14b8a6', '#f43f5e', '#8b5cf6',
+    '#e11d48', '#059669', '#7c3aed', '#ea580c', '#0284c7',
+];
+
+var LINE_STYLES = {{
+    'predict': 'solid',
+    'pred_校准': [10, 3],
+    'stat':    [6, 3],
+    'label':   [2, 2]
+}};
+
+var chart = echarts.init(document.getElementById('main-chart'));
+
+function getCheckedValues(id) {{
+    var vals = [];
+    document.getElementById(id).querySelectorAll('input[type=checkbox]').forEach(function(cb) {{
+        if (cb.checked) vals.push(cb.value);
+    }});
+    return vals;
+}}
+
+function renderChart() {{
+    var dt = document.getElementById('sel_dt').value;
+    var selGroups = getCheckedValues('gk_checks');
+    var selMetrics = getCheckedValues('metric_checks');
+    var selBucketTypes = getCheckedValues('bt_checks');
+    var selApptypes = getCheckedValues('at_checks');
+
+    // 1. 构建 combo 列表 (apptype × bucket_type × group_key)
+    var combos = [];
+    selApptypes.forEach(function(at) {{
+        selBucketTypes.forEach(function(bt) {{
+            selGroups.forEach(function(gk) {{
+                if (ALL_DATA[at] && ALL_DATA[at][bt] && ALL_DATA[at][bt][gk]) {{
+                    combos.push({{ at: at, bt: bt, gk: gk }});
+                }}
+            }});
+        }});
+    }});
+
+    // 2. 动态配色: 每个 combo 分配一个颜色
+    var series = [];
+    var bucketIds = null;
+
+    combos.forEach(function(combo, idx) {{
+        var color = PALETTE[idx % PALETTE.length];
+        var rows = (ALL_DATA[combo.at][combo.bt][combo.gk][dt]) || [];
+        if (!bucketIds && rows.length > 0) {{
+            bucketIds = rows.map(function(r) {{ return r.bucket_id; }});
+        }}
+        var atLabel = APPTYPE_LABELS[combo.at] || combo.at;
+        var comboLabel = atLabel + '·' + combo.bt + '·' + combo.gk;
+
+        // 3. 每个 combo × selMetrics → 一条 series
+        selMetrics.forEach(function(metric) {{
+            series.push({{
+                name: comboLabel + '·' + metric,
+                type: 'line',
+                data: rows.map(function(r) {{ return r[metric]; }}),
+                symbol: 'none',
+                lineStyle: {{
+                    type: LINE_STYLES[metric],
+                    color: color,
+                    width: metric === 'label' ? 2 : 1.5
+                }},
+                itemStyle: {{ color: color }}
+            }});
+        }});
+    }});
+
+    chart.setOption({{
+        tooltip: {{
+            trigger: 'axis',
+            formatter: function(params) {{
+                if (!params.length) return '';
+                var dataIdx = params[0].dataIndex;
+                var s = 'bucket: ' + (bucketIds ? bucketIds[dataIdx] : dataIdx) + '<br/>';
+                // 按 combo 分组显示,从 ALL_DATA 取原始行
+                var groups = {{}};
+                params.forEach(function(p) {{
+                    var parts = p.seriesName.split('·');
+                    var metric = parts.pop();
+                    var comboKey = parts.join('·');
+                    if (!groups[comboKey]) groups[comboKey] = {{}};
+                    groups[comboKey][metric] = p.value;
+                    groups[comboKey].color = p.color;
+                }});
+                combos.forEach(function(combo, ci) {{
+                    var atLabel = APPTYPE_LABELS[combo.at] || combo.at;
+                    var ck = atLabel + '·' + combo.bt + '·' + combo.gk;
+                    var g = groups[ck];
+                    if (!g) return;
+                    var rows = (ALL_DATA[combo.at][combo.bt][combo.gk][dt]) || [];
+                    var r = rows[dataIdx];
+                    s += '<br/><b style="color:' + g.color + '">' + ck + '</b>';
+                    if (r) s += ' &nbsp;<span style="color:#999">cnt=' + r.cnt + ' range=[' + r.range_begin.toFixed(4) + ', ' + r.range_end.toFixed(4) + ']</span>';
+                    s += '<br/>';
+                    if (g.predict != null) s += '  predict: ' + g.predict.toFixed(4) + '<br/>';
+                    if (g['pred_校准'] != null) s += '  pred_校准: ' + g['pred_校准'].toFixed(4) + '<br/>';
+                    if (g.stat != null) s += '  stat: ' + g.stat.toFixed(4) + '<br/>';
+                    if (g.label != null) s += '  label: ' + g.label.toFixed(4) + '<br/>';
+                }});
+                return s;
+            }}
+        }},
+        legend: {{ type: 'scroll', bottom: 0 }},
+        grid: {{ left: 60, right: 30, top: 30, bottom: 60 }},
+        xAxis: {{ name: 'bucket_id', type: 'category', data: bucketIds || [], nameLocation: 'center', nameGap: 28 }},
+        yAxis: {{ name: 'value', type: 'value', nameLocation: 'center', nameGap: 45 }},
+        series: series
+    }}, true);
+
+    // 计算并展示 MAE(label 加权,总体 + 分段)
+    function calcWMAE(rows, metric) {{
+        var wSum = 0, wTotal = 0;
+        rows.forEach(function(r) {{
+            if (r[metric] != null && r.label != null) {{
+                wSum += r.label * Math.abs(r[metric] - r.label);
+                wTotal += r.label;
+            }}
+        }});
+        return wTotal > 0 ? (wSum / wTotal) : null;
+    }}
+    function rangeInfo(rows) {{
+        var labelMin = Infinity, labelMax = -Infinity, rngMin = Infinity, rngMax = -Infinity;
+        rows.forEach(function(r) {{
+            if (r.label != null) {{ labelMin = Math.min(labelMin, r.label); labelMax = Math.max(labelMax, r.label); }}
+            if (r.range_begin != null) rngMin = Math.min(rngMin, r.range_begin);
+            if (r.range_end != null) rngMax = Math.max(rngMax, r.range_end);
+        }});
+        return 'label=[' + labelMin.toFixed(4) + ', ' + labelMax.toFixed(4) + ']'
+            + ' bucket=[' + rngMin.toFixed(4) + ', ' + rngMax.toFixed(4) + ']';
+    }}
+    function maeRow(metrics, rows) {{
+        var maeMap = {{}};
+        metrics.forEach(function(m) {{
+            maeMap[m] = calcWMAE(rows, m);
+        }});
+        // 校准Δ
+        var delta = null, deltaPct = null;
+        if (maeMap['predict'] != null && maeMap['pred_校准'] != null) {{
+            delta = maeMap['pred_校准'] - maeMap['predict'];
+            deltaPct = delta / maeMap['predict'] * 100;
+        }}
+        return {{ maeMap: maeMap, delta: delta, deltaPct: deltaPct, rng: rangeInfo(rows) }};
+    }}
+    function fmtVal(v) {{ return v != null ? v.toFixed(4) : '-'; }}
+    function fmtDelta(d) {{
+        if (d == null) return '-';
+        var sign = d.delta <= 0 ? '' : '+';
+        var color = d.delta <= 0 ? '#16a34a' : '#dc2626';
+        return '<span style="color:' + color + '">' + sign + d.delta.toFixed(4) + ' (' + sign + d.deltaPct.toFixed(1) + '%)</span>';
+    }}
+
+    var maeHtml = '';
+    var maeMetrics = selMetrics.filter(function(m) {{ return m !== 'label'; }});
+    if (maeMetrics.length > 0) {{
+        // 表头
+        var thMetrics = '';
+        maeMetrics.forEach(function(m) {{ thMetrics += '<th>' + m + '</th>'; }});
+        var hasCalDelta = maeMetrics.indexOf('predict') >= 0 && maeMetrics.indexOf('pred_校准') >= 0;
+
+        combos.forEach(function(combo, idx) {{
+            var color = PALETTE[idx % PALETTE.length];
+            var rows = (ALL_DATA[combo.at][combo.bt][combo.gk][dt]) || [];
+            if (rows.length === 0) return;
+            var atLabel = APPTYPE_LABELS[combo.at] || combo.at;
+            var comboLabel = atLabel + '·' + combo.bt + '·' + combo.gk;
+
+            var n = rows.length;
+            var cut1 = Math.floor(n / 3), cut2 = Math.floor(n * 2 / 3);
+            var segments = [
+                {{ name: '总体', rows: rows }},
+                {{ name: '低(0~' + (cut1 - 1) + ')', rows: rows.slice(0, cut1) }},
+                {{ name: '中(' + cut1 + '~' + (cut2 - 1) + ')', rows: rows.slice(cut1, cut2) }},
+                {{ name: '高(' + cut2 + '~' + (n - 1) + ')', rows: rows.slice(cut2) }}
+            ];
+
+            maeHtml += '<div style="margin-bottom:12px"><b style="color:' + color + '">' + comboLabel + '</b>';
+            maeHtml += '<table style="border-collapse:collapse;margin-top:4px;width:100%"><tr style="background:#f5f5f5">';
+            maeHtml += '<th style="text-align:left;padding:3px 8px">分段</th>' + thMetrics.replace(/<th>/g, '<th style="padding:3px 8px">');
+            if (hasCalDelta) maeHtml += '<th style="padding:3px 8px">校准Δ</th>';
+            maeHtml += '<th style="padding:3px 8px">label范围</th><th style="padding:3px 8px">bucket范围</th></tr>';
+
+            segments.forEach(function(seg) {{
+                var d = maeRow(maeMetrics, seg.rows);
+                maeHtml += '<tr style="border-top:1px solid #eee"><td style="padding:3px 8px">' + seg.name + '</td>';
+                maeMetrics.forEach(function(m) {{
+                    maeHtml += '<td style="padding:3px 8px;text-align:right">' + fmtVal(d.maeMap[m]) + '</td>';
+                }});
+                if (hasCalDelta) maeHtml += '<td style="padding:3px 8px;text-align:right">' + fmtDelta(d) + '</td>';
+                maeHtml += '<td style="padding:3px 8px;color:#888">' + d.rng.split(' ')[0] + '</td>';
+                maeHtml += '<td style="padding:3px 8px;color:#888">' + d.rng.split(' ')[1] + '</td>';
+                maeHtml += '</tr>';
+            }});
+            maeHtml += '</table></div>';
+        }});
+    }}
+    document.getElementById('mae-info').innerHTML = maeHtml;
+}}
+
+renderChart();
+window.addEventListener('resize', function() {{ chart.resize(); }});
+</script>
+</body>
+</html>"""
+
+    os.makedirs(os.path.dirname(OUT_HTML), exist_ok=True)
+    with open(OUT_HTML, "w", encoding="utf-8") as f:
+        f.write(html)
+    print(f"HTML saved to: {OUT_HTML}")
+
+
def main():
    """Entry point: load calibration data, build the chart payload, render HTML."""
    frame = load_data()
    charts = build_charts_data(frame)
    dts = sorted(frame["dt"].unique())
    render_html(charts, dts)


if __name__ == "__main__":
    main()

+ 495 - 0
tasks/承接/线上实验/08_预测覆盖率效果分析.py

@@ -0,0 +1,495 @@
"""
Prediction-coverage effect analysis.

Compares business metrics between samples with and without prediction values.
Primary metrics of interest: str_plus, ros_minus, rovn, vovh24.
"""

import pandas as pd
import glob
import os

# Directory holding the daily CSV exports produced by step 07
DATA_DIR = "tasks/承接/线上实验/output/07_预测值覆盖率分析"

# Core business metrics compared throughout this script
METRICS = ['str_plus', 'ros_minus', 'rovn', 'vovh24']

# Auxiliary metrics (counts, ratios, raw rates) carried along for context
AUX_METRICS = ['sample_cnt', 'sample_ratio', 'str_one', 'ros_one', 'str', 'ros', 'dau', 'exp']
+
+
def load_all_data():
    """Load every daily CSV under DATA_DIR into one DataFrame.

    Returns:
        Concatenated DataFrame of all daily files, or None when the
        directory contains no CSVs.
    """
    files = sorted(glob.glob(os.path.join(DATA_DIR, "*.csv")))
    if not files:
        print(f"未找到数据文件: {DATA_DIR}")
        return None

    frames = []
    for path in files:
        frame = pd.read_csv(path)
        print(f"加载: {os.path.basename(path)} ({len(frame)} 行)")
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)
+
+
def filter_app4(df):
    """Return an independent copy of the apptype==4 rows (the experiment platform)."""
    mask = df['apptype'] == 4
    return df.loc[mask].copy()
+
+
def calc_lift(exp_val, ctrl_val):
    """Relative lift of exp_val over ctrl_val, in percent.

    A zero baseline yields +inf for a positive exp_val and 0 otherwise,
    so callers never see a ZeroDivisionError.
    """
    if ctrl_val == 0:
        return 0 if exp_val <= 0 else float('inf')
    return (exp_val - ctrl_val) / ctrl_val * 100
+
+
def analyze_by_has_pred(df):
    """Summarize business metrics per (abcode, has_pred) cohort.

    Filters to apptype==4 rows, averages metrics across days, and sums
    sample counts. Prints the core-metric means and returns the summary.
    """
    print("\n" + "=" * 80)
    print("【有/无预测值 业务指标对比】")
    print("=" * 80)

    # Experiment platform only
    df4 = filter_app4(df)

    # Metric columns are averaged across days ...
    agg_cols = {}
    for metric in METRICS + AUX_METRICS:
        if metric in df4.columns:
            agg_cols[metric] = 'mean'
    # ... except sample_cnt, which should add up
    agg_cols['sample_cnt'] = 'sum'

    summary = df4.groupby(['abcode', 'has_pred']).agg(agg_cols).round(6)

    print("\n各组指标均值(多天汇总):")
    print(summary[METRICS].to_string())

    return summary
+
+
def compare_vs_baseline(df):
    """Compare each experiment group against the control group ('对照组').

    For each has_pred cohort, prints the control baseline and each
    experiment group's metric values with their relative lift.

    Args:
        df: concatenated daily rows as produced by load_all_data().

    Returns:
        DataFrame with one row per (has_pred, experiment abcode) holding
        raw metric values (*_val) and percent lifts vs control (*_lift).
    """
    print("\n" + "=" * 80)
    print("【实验组 vs 对照组 业务指标对比】")
    print("=" * 80)

    df4 = filter_app4(df)

    results = []

    for has_pred in ['有预测值', '无预测值']:
        print(f"\n--- {has_pred} ---")

        sub = df4[df4['has_pred'] == has_pred]

        # Per-abcode means of the core metrics (averaged over days)
        group_means = sub.groupby('abcode')[METRICS].mean()

        # Without a control baseline the lift is undefined; skip this cohort
        if '对照组' not in group_means.index:
            print("  [缺少对照组数据]")
            continue

        baseline = group_means.loc['对照组']

        print(f"\n对照组基线: {baseline.to_dict()}")
        print(f"\n各实验组 vs 对照组 提升幅度 (%):")

        for abcode in group_means.index:
            if abcode == '对照组':
                continue

            exp_vals = group_means.loc[abcode]
            lifts = {m: calc_lift(exp_vals[m], baseline[m]) for m in METRICS}

            print(f"\n  {abcode}:")
            for m in METRICS:
                sign = '+' if lifts[m] > 0 else ''
                print(f"    {m}: {exp_vals[m]:.6f} ({sign}{lifts[m]:.2f}%)")

            results.append({
                'has_pred': has_pred,
                'abcode': abcode,
                **{f'{m}_val': exp_vals[m] for m in METRICS},
                **{f'{m}_lift': lifts[m] for m in METRICS}
            })

    return pd.DataFrame(results)
+
+
def compare_pred_vs_nopred(df):
    """Within each abcode, compare with-prediction vs without-prediction metrics.

    Args:
        df: concatenated daily rows as produced by load_all_data().

    Returns:
        DataFrame with one row per abcode holding both cohorts' metric means
        and the percent lift (*_lift) of with-prediction over without.
    """
    print("\n" + "=" * 80)
    print("【同组内 有预测值 vs 无预测值 对比】")
    print("=" * 80)

    df4 = filter_app4(df)

    results = []

    for abcode in df4['abcode'].unique():
        sub = df4[df4['abcode'] == abcode]

        # Cohort means over days; NaN if a cohort is absent for this abcode
        has_pred = sub[sub['has_pred'] == '有预测值'][METRICS].mean()
        no_pred = sub[sub['has_pred'] == '无预测值'][METRICS].mean()

        print(f"\n{abcode}:")
        print(f"  有预测值: {has_pred.to_dict()}")
        print(f"  无预测值: {no_pred.to_dict()}")

        diffs = {}
        for m in METRICS:
            diff = calc_lift(has_pred[m], no_pred[m])
            sign = '+' if diff > 0 else ''
            print(f"  {m} 提升: {sign}{diff:.2f}%")
            diffs[m] = diff

        results.append({
            'abcode': abcode,
            **{f'{m}_有预测值': has_pred[m] for m in METRICS},
            **{f'{m}_无预测值': no_pred[m] for m in METRICS},
            **{f'{m}_lift': diffs[m] for m in METRICS}
        })

    return pd.DataFrame(results)
+
+
def daily_trend(df):
    """Print the day-by-day rovn trend for the with-prediction cohort.

    Args:
        df: concatenated daily rows as produced by load_all_data().

    Returns:
        Pivot DataFrame (index=dt, columns=abcode) of mean rovn, rounded to 6dp.
    """
    print("\n" + "=" * 80)
    print("【日趋势 - 有预测值样本】")
    print("=" * 80)

    df4 = filter_app4(df)
    has_pred = df4[df4['has_pred'] == '有预测值']

    pivot = has_pred.pivot_table(
        index='dt',
        columns='abcode',
        values='rovn',
        aggfunc='mean'
    ).round(6)

    print("\nrovn 日趋势:")
    print(pivot.to_string())

    return pivot
+
+
def coverage_stability(df):
    """Check how stable the with-prediction sample ratio is across days.

    Args:
        df: concatenated daily rows as produced by load_all_data().

    Returns:
        Pivot DataFrame (index=dt, columns=abcode) of the with-prediction
        sample_ratio, rounded to 4dp; summary stats are printed via describe().
    """
    print("\n" + "=" * 80)
    print("【预测覆盖率稳定性】")
    print("=" * 80)

    df4 = filter_app4(df)

    coverage = df4[df4['has_pred'] == '有预测值'].pivot_table(
        index='dt',
        columns='abcode',
        values='sample_ratio',
        aggfunc='mean'
    ).round(4)

    print("\n各组有预测值样本占比 (按天):")
    print(coverage.to_string())

    print("\n各组覆盖率统计:")
    print(coverage.describe().round(4).to_string())

    return coverage
+
+
def daily_lift_stability(df):
    """Compute per-day experiment-vs-control lifts and flag their stability.

    Only the with-prediction cohort is considered. For each experiment group
    the per-day lifts are summarized (mean/std/min/max) and labeled stable or
    volatile by a simple heuristic.

    Args:
        df: concatenated daily rows as produced by load_all_data().

    Returns:
        DataFrame with one row per (dt, experiment abcode) of percent lifts.
    """
    print("\n" + "=" * 80)
    print("【多天稳定性分析 - 实验组 vs 对照组 提升幅度】")
    print("=" * 80)

    df4 = filter_app4(df)
    has_pred = df4[df4['has_pred'] == '有预测值']

    results = []
    for dt in sorted(has_pred['dt'].unique()):
        day_data = has_pred[has_pred['dt'] == dt]

        # Days without a control row cannot be compared
        if '对照组' not in day_data['abcode'].values:
            continue

        baseline = day_data[day_data['abcode'] == '对照组'][METRICS].iloc[0]

        for abcode in day_data['abcode'].unique():
            if abcode == '对照组':
                continue
            exp_vals = day_data[day_data['abcode'] == abcode][METRICS].iloc[0]
            lifts = {m: calc_lift(exp_vals[m], baseline[m]) for m in METRICS}
            results.append({
                'dt': dt,
                'abcode': abcode,
                **{f'{m}_lift': lifts[m] for m in METRICS}
            })

    lift_df = pd.DataFrame(results)

    for abcode in lift_df['abcode'].unique():
        print(f"\n{abcode}:")
        sub = lift_df[lift_df['abcode'] == abcode]

        for m in METRICS:
            col = f'{m}_lift'
            values = sub[col].values
            mean_lift = values.mean()
            # NOTE: numpy .std() uses ddof=0 (population std), not the sample std
            std_lift = values.std()
            min_lift = values.min()
            max_lift = values.max()

            # Heuristic: stable when day-to-day std is below 50% of |mean|
            # (or below 1 percentage point when the mean lift is exactly 0)
            is_stable = std_lift < abs(mean_lift) * 0.5 if mean_lift != 0 else std_lift < 1
            stability = "✓稳定" if is_stable else "⚠波动"

            print(f"  {m}: 均值{mean_lift:+.2f}%, 标准差{std_lift:.2f}%, 范围[{min_lift:+.2f}%, {max_lift:+.2f}%] {stability}")

        # Per-day detail
        print(f"  日明细:")
        for _, row in sub.iterrows():
            lifts_str = " | ".join([f"{m}:{row[f'{m}_lift']:+.2f}%" for m in METRICS])
            print(f"    {row['dt']}: {lifts_str}")

    return lift_df
+
+
def compare_pred_effect_by_day(df):
    """Contrast with- vs without-prediction metrics inside the control group, day by day.

    Restricting to the control group ('对照组') removes strategy interference,
    so a per-day gap between the two cohorts reflects prediction coverage itself.

    Args:
        df: concatenated daily rows as produced by load_all_data().

    Returns:
        DataFrame with one row per day: cohort sample counts, raw metric
        values for both cohorts, and relative differences (*_diff, percent).
    """
    print("\n" + "=" * 80)
    print("【有/无预测值效果差异分析】")
    print("=" * 80)

    df4 = filter_app4(df)

    # Control group only, to exclude strategy interference
    ctrl = df4[df4['abcode'] == '对照组']

    print("\n对照组 - 有/无预测值对比 (排除策略干扰):")

    results = []
    for dt in sorted(ctrl['dt'].unique()):
        day_data = ctrl[ctrl['dt'] == dt]
        has_rows = day_data[day_data['has_pred'] == '有预测值']
        no_rows = day_data[day_data['has_pred'] == '无预测值']
        # Fix: the original took .iloc[0] unconditionally, so a day missing
        # either cohort raised IndexError. Skip incomplete days instead.
        if has_rows.empty or no_rows.empty:
            continue
        has_pred = has_rows[METRICS + ['sample_cnt']].iloc[0]
        no_pred = no_rows[METRICS + ['sample_cnt']].iloc[0]

        diffs = {m: calc_lift(has_pred[m], no_pred[m]) for m in METRICS}
        results.append({
            'dt': dt,
            'sample_有预测值': has_pred['sample_cnt'],
            'sample_无预测值': no_pred['sample_cnt'],
            **{f'{m}_有': has_pred[m] for m in METRICS},
            **{f'{m}_无': no_pred[m] for m in METRICS},
            **{f'{m}_diff': diffs[m] for m in METRICS}
        })

    diff_df = pd.DataFrame(results)

    # Guard: with no complete day the summary loops below would KeyError
    if diff_df.empty:
        print("\n[无可对比的完整日期数据]")
        return diff_df

    # Summary across days
    print("\n指标差异汇总 (有预测值 vs 无预测值):")
    for m in METRICS:
        col = f'{m}_diff'
        mean_diff = diff_df[col].mean()
        std_diff = diff_df[col].std()
        print(f"  {m}: 均值差异 {mean_diff:+.1f}%, 标准差 {std_diff:.1f}%")

    print("\n日明细:")
    for _, row in diff_df.iterrows():
        print(f"  {row['dt']}:")
        print(f"    样本量: 有预测值 {row['sample_有预测值']:,.0f} | 无预测值 {row['sample_无预测值']:,.0f}")
        for m in METRICS:
            print(f"    {m}: {row[f'{m}_有']:.6f} vs {row[f'{m}_无']:.6f} ({row[f'{m}_diff']:+.1f}%)")

    return diff_df
+
+
def daily_metrics_by_pred(df):
    """Dump per-day, per-group, per-cohort metric detail to CSV and stdout.

    Args:
        df: concatenated daily rows as produced by load_all_data().

    Returns:
        The sorted detail DataFrame that was written to
        DATA_DIR/分天指标_有无预测值.csv.
    """
    print("\n" + "=" * 80)
    print("【分天指标明细 - 有/无预测值】")
    print("=" * 80)

    df4 = filter_app4(df)

    # Columns to export
    cols = ['dt', 'abcode', 'has_pred', 'sample_cnt', 'sample_ratio'] + METRICS

    # has_pred descending puts '有预测值' before '无预测值' within each group
    result = df4[cols].sort_values(['dt', 'abcode', 'has_pred'], ascending=[True, True, False])

    # Write to CSV (utf-8-sig so Excel opens it with correct encoding)
    output_file = os.path.join(DATA_DIR, "分天指标_有无预测值.csv")
    result.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"\n输出文件: {output_file}")

    # Console preview
    print("\n数据预览:")
    print(result.to_string(index=False))

    return result
+
+
def daily_lift_to_file(df):
    """Per-day experiment-vs-control lifts for three cohorts, written to CSV.

    Cohorts: with-prediction ('有预测值'), without-prediction ('无预测值'),
    and '整体' (overall, sample_cnt-weighted across the two cohorts).
    Control rows are kept with lift fixed at 0 so the output is self-contained.

    Args:
        df: concatenated daily rows as produced by load_all_data().

    Returns:
        DataFrame written to DATA_DIR/分天提升幅度.csv (one row per
        dt × cohort × abcode with raw values and *_lift percentages).
    """
    print("\n" + "=" * 80)
    print("【分天提升幅度 - 实验组 vs 对照组】")
    print("=" * 80)

    df4 = filter_app4(df)

    results = []
    # Iterate the three cohorts: with-pred, without-pred, overall
    for has_pred in ['有预测值', '无预测值', '整体']:
        if has_pred == '整体':
            # Overall: aggregate per dt+abcode with sample_cnt-weighted means
            # NOTE(review): groupby().apply with a lambda emits a FutureWarning
            # on recent pandas; behavior is unchanged, left as-is here.
            sub = df4.groupby(['dt', 'abcode']).apply(
                lambda g: pd.Series({
                    'sample_cnt': g['sample_cnt'].sum(),
                    **{m: (g[m] * g['sample_cnt']).sum() / g['sample_cnt'].sum() for m in METRICS}
                })
            ).reset_index()
        else:
            sub = df4[df4['has_pred'] == has_pred]

        for dt in sorted(sub['dt'].unique()):
            day_data = sub[sub['dt'] == dt]

            # Skip days with no control baseline
            if '对照组' not in day_data['abcode'].values:
                continue

            baseline = day_data[day_data['abcode'] == '对照组'][METRICS].iloc[0]

            for abcode in day_data['abcode'].unique():
                exp_vals = day_data[day_data['abcode'] == abcode][METRICS].iloc[0]
                sample_cnt = day_data[day_data['abcode'] == abcode]['sample_cnt'].iloc[0]
                if abcode == '对照组':
                    # Control vs itself: lift is 0 by definition
                    lifts = {m: 0 for m in METRICS}
                else:
                    lifts = {m: calc_lift(exp_vals[m], baseline[m]) for m in METRICS}

                results.append({
                    'dt': dt,
                    'has_pred': has_pred,
                    'abcode': abcode,
                    'sample_cnt': int(sample_cnt),
                    **{m: exp_vals[m] for m in METRICS},
                    **{f'{m}_lift': round(lifts[m], 2) for m in METRICS}
                })

    lift_df = pd.DataFrame(results)
    # Order cohorts so '整体' (overall) sorts last within each day
    order = {'有预测值': 0, '无预测值': 1, '整体': 2}
    lift_df['_order'] = lift_df['has_pred'].map(order)
    lift_df = lift_df.sort_values(['dt', '_order', 'abcode'], ascending=[True, True, True])
    lift_df = lift_df.drop('_order', axis=1)

    # Write to CSV (utf-8-sig so Excel opens it with correct encoding)
    output_file = os.path.join(DATA_DIR, "分天提升幅度.csv")
    lift_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"\n输出文件: {output_file}")

    # Console preview
    print("\n数据预览:")
    print(lift_df.to_string(index=False))

    return lift_df
+
+
def overall_lift_summary(df):
    """Summarize overall experiment-vs-control lifts, ignoring the pred split.

    Cohorts are merged per dt+abcode via sample_cnt-weighted means before
    computing lifts, so this reflects total traffic.

    Args:
        df: concatenated daily rows as produced by load_all_data().

    Returns:
        DataFrame with one row per (dt, experiment abcode) of percent lifts.
    """
    print("\n" + "=" * 80)
    print("【整体效果汇总 - 不分有无预测值】")
    print("=" * 80)

    df4 = filter_app4(df)

    # Aggregate per dt+abcode with sample_cnt-weighted means
    # NOTE(review): groupby().apply with a lambda emits a FutureWarning on
    # recent pandas; behavior is unchanged, left as-is here.
    overall = df4.groupby(['dt', 'abcode']).apply(
        lambda g: pd.Series({
            'sample_cnt': g['sample_cnt'].sum(),
            **{m: (g[m] * g['sample_cnt']).sum() / g['sample_cnt'].sum() for m in METRICS}
        })
    ).reset_index()

    results = []
    for dt in sorted(overall['dt'].unique()):
        day_data = overall[overall['dt'] == dt]

        # Skip days with no control baseline
        if '对照组' not in day_data['abcode'].values:
            continue

        baseline = day_data[day_data['abcode'] == '对照组'][METRICS].iloc[0]

        for abcode in day_data['abcode'].unique():
            if abcode == '对照组':
                continue
            exp_vals = day_data[day_data['abcode'] == abcode][METRICS].iloc[0]
            lifts = {m: calc_lift(exp_vals[m], baseline[m]) for m in METRICS}
            results.append({
                'dt': dt,
                'abcode': abcode,
                **{f'{m}_lift': lifts[m] for m in METRICS}
            })

    lift_df = pd.DataFrame(results)

    print("\n各实验组整体效果(多天汇总):")
    for abcode in lift_df['abcode'].unique():
        sub = lift_df[lift_df['abcode'] == abcode]
        print(f"\n{abcode}:")
        for m in METRICS:
            col = f'{m}_lift'
            mean_lift = sub[col].mean()
            std_lift = sub[col].std()
            # Same stability heuristic as daily_lift_stability: std < 50% of
            # |mean| (or < 1pp when mean is exactly 0)
            is_stable = std_lift < abs(mean_lift) * 0.5 if mean_lift != 0 else std_lift < 1
            stability = "✓稳定" if is_stable else "⚠波动"
            print(f"  {m}: 均值{mean_lift:+.2f}%, 标准差{std_lift:.2f}% {stability}")

        # Per-day detail
        print(f"  日明细:")
        for _, row in sub.iterrows():
            lifts_str = " | ".join([f"{m}:{row[f'{m}_lift']:+.2f}%" for m in METRICS])
            print(f"    {row['dt']}: {lifts_str}")

    return lift_df
+
+
def main():
    """Entry point: load the daily exports and run every analysis section."""
    print("=" * 80)
    print("预测值覆盖率效果分析")
    print("=" * 80)

    # Load all daily CSVs; bail out when the data directory is empty
    df = load_all_data()
    if df is None:
        return

    print(f"\n总数据量: {len(df)} 行")
    print(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}")
    print(f"包含天数: {df['dt'].nunique()} 天")

    # 1. Per-day metric detail (with/without prediction), exported to CSV
    daily_metrics_by_pred(df)

    # 2. Per-day lift vs control for all three cohorts, exported to CSV
    daily_lift_to_file(df)

    # 3. Overall effect summary across cohorts
    overall_lift_summary(df)

    # 4. Stability of the with-prediction cohort across days
    print("\n" + "=" * 80)
    print("【有预测值样本 - 多天稳定性】")
    print("=" * 80)
    daily_lift_stability(df)

    print("\n" + "=" * 80)
    print("分析完成")
    print("=" * 80)


if __name__ == "__main__":
    main()

+ 177 - 0
tasks/承接/线上实验/分桶诊断分析.py

@@ -0,0 +1,177 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# CJK-capable font so Chinese labels render; keep the minus sign ASCII
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False

# Load the three bucketing dimensions (06a: str_pred, 06b: ros_real, 06c: ros_pred)
df_a = pd.read_csv('output/06a_str_pred分桶诊断/20260125.csv')
df_b = pd.read_csv('output/06b_ros_real分桶诊断/20260125.csv')
df_c = pd.read_csv('output/06c_ros_pred分桶诊断/20260125.csv')

# Keep only control-group rows, dropping the '全部' (all-buckets) summary row
df_a_ctrl = df_a[(df_a['abcode'] == '对照组') & (df_a['bucket'] != '全部')].copy()
df_b_ctrl = df_b[(df_b['abcode'] == '对照组') & (df_b['bucket'] != '全部')].copy()
df_c_ctrl = df_c[(df_c['abcode'] == '对照组') & (df_c['bucket'] != '全部')].copy()

# bucket mixes numeric ids with the '全部' marker, so it arrives as strings;
# cast to int so sorting below is numeric
df_a_ctrl['bucket'] = df_a_ctrl['bucket'].astype(int)
df_b_ctrl['bucket'] = df_b_ctrl['bucket'].astype(int)
df_c_ctrl['bucket'] = df_c_ctrl['bucket'].astype(int)

df_a_ctrl = df_a_ctrl.sort_values('bucket')
df_b_ctrl = df_b_ctrl.sort_values('bucket')
df_c_ctrl = df_c_ctrl.sort_values('bucket')
+
# Composite figure: 3x3 grid, one row per bucketing dimension (06a/06b/06c)
fig, axes = plt.subplots(3, 3, figsize=(16, 14))

# ========== 06a: bucketed by str_pred ==========
ax = axes[0, 0]
ax.bar(df_a_ctrl['bucket'], df_a_ctrl['ros_pred_bias'], alpha=0.7, label='ros_pred偏差', color='steelblue')
ax.bar(df_a_ctrl['bucket'], df_a_ctrl['ros_stat_bias'], alpha=0.5, label='ros_stat偏差', color='orange')
ax.axhline(y=0, color='red', linestyle='--', linewidth=1)
ax.set_xlabel('str_pred 分桶')
ax.set_ylabel('ROS 偏差')
ax.set_title('06a: str_pred分桶 - ROS偏差趋势\n(正=高估, 负=低估)')
ax.legend()
ax.set_xticks(range(1, 11))

ax = axes[0, 1]
ax.plot(df_a_ctrl['bucket'], df_a_ctrl['str_real'], 'o-', label='str_real', color='green')
ax.plot(df_a_ctrl['bucket'], df_a_ctrl['str_pred_avg'], 's--', label='str_pred', color='blue')
ax.set_xlabel('str_pred 分桶')
ax.set_ylabel('分享率')
ax.set_title('06a: str_pred分桶 - STR真实vs预测')
ax.legend()
ax.set_xticks(range(1, 11))

ax = axes[0, 2]
ax.plot(df_a_ctrl['bucket'], df_a_ctrl['ros_real'], 'o-', label='ros_real', color='green')
ax.plot(df_a_ctrl['bucket'], df_a_ctrl['ros_pred_avg'], 's--', label='ros_pred', color='blue')
ax.plot(df_a_ctrl['bucket'], df_a_ctrl['ros_stat_avg'], '^--', label='ros_stat', color='orange')
ax.set_xlabel('str_pred 分桶')
ax.set_ylabel('ROS')
ax.set_title('06a: str_pred分桶 - ROS真实vs预测vs统计量')
ax.legend()
ax.set_xticks(range(1, 11))

# ========== 06b: bucketed by ros_real (return-flow samples only) ==========
ax = axes[1, 0]
ax.bar(df_b_ctrl['bucket'], df_b_ctrl['ros_pred_bias'], alpha=0.7, label='ros_pred偏差', color='steelblue')
ax.bar(df_b_ctrl['bucket'], df_b_ctrl['ros_stat_bias'], alpha=0.5, label='ros_stat偏差', color='orange')
ax.axhline(y=0, color='red', linestyle='--', linewidth=1)
ax.set_xlabel('ros_real 分桶 (仅回流)')
ax.set_ylabel('ROS 偏差')
ax.set_title('06b: ros_real分桶 - ROS偏差趋势\n(仅回流样本)')
ax.legend()
ax.set_xticks(range(1, 11))

ax = axes[1, 1]
ax.plot(df_b_ctrl['bucket'], df_b_ctrl['ros_real_avg'], 'o-', label='ros_real', color='green')
ax.plot(df_b_ctrl['bucket'], df_b_ctrl['ros_pred_avg'], 's--', label='ros_pred', color='blue')
ax.plot(df_b_ctrl['bucket'], df_b_ctrl['ros_stat_avg'], '^--', label='ros_stat', color='orange')
ax.set_xlabel('ros_real 分桶')
ax.set_ylabel('ROS')
ax.set_title('06b: ros_real分桶 - ROS各指标对比')
ax.legend()
ax.set_xticks(range(1, 11))

ax = axes[1, 2]
ax.plot(df_b_ctrl['bucket'], df_b_ctrl['ros_pred_mae'], 'o-', label='ros_pred MAE', color='blue')
ax.plot(df_b_ctrl['bucket'], df_b_ctrl['ros_stat_mae'], 's-', label='ros_stat MAE', color='orange')
ax.set_xlabel('ros_real 分桶')
ax.set_ylabel('MAE')
ax.set_title('06b: ros_real分桶 - MAE趋势\n(高回流区间误差更大)')
ax.legend()
ax.set_xticks(range(1, 11))

# ========== 06c: bucketed by ros_pred ==========
ax = axes[2, 0]
ax.bar(df_c_ctrl['bucket'], df_c_ctrl['ros_pred_bias'], alpha=0.7, label='ros_pred偏差', color='steelblue')
ax.bar(df_c_ctrl['bucket'], df_c_ctrl['ros_stat_bias'], alpha=0.5, label='ros_stat偏差', color='orange')
ax.axhline(y=0, color='red', linestyle='--', linewidth=1)
ax.set_xlabel('ros_pred 分桶')
ax.set_ylabel('ROS 偏差')
ax.set_title('06c: ros_pred分桶 - ROS偏差趋势')
ax.legend()
ax.set_xticks(range(1, 11))

ax = axes[2, 1]
ax.plot(df_c_ctrl['bucket'], df_c_ctrl['ros_real'], 'o-', label='ros_real', color='green')
ax.plot(df_c_ctrl['bucket'], df_c_ctrl['ros_pred_avg'], 's--', label='ros_pred', color='blue')
ax.set_xlabel('ros_pred 分桶')
ax.set_ylabel('ROS')
ax.set_title('06c: ros_pred分桶 - ROS校准度')
ax.legend()
ax.set_xticks(range(1, 11))

ax = axes[2, 2]
ax.plot(df_c_ctrl['bucket'], df_c_ctrl['return_rate'] * 100, 'o-', color='purple')
ax.set_xlabel('ros_pred 分桶')
ax.set_ylabel('回流率 (%)')
ax.set_title('06c: ros_pred分桶 - 各桶回流率')
ax.set_xticks(range(1, 11))

plt.tight_layout()
plt.savefig('output/分桶诊断综合分析.png', dpi=150, bbox_inches='tight')
plt.close()
+
# ========== Print key findings ==========
print("=" * 60)
print("分桶诊断综合分析 - 关键发现")
print("=" * 60)

# 06a: compare ROS bias between low (1-3) and high (8-10) str_pred buckets
print("\n【06a: str_pred分桶】高分享预测样本的ROS表现")
print("-" * 50)
a_low = df_a_ctrl[df_a_ctrl['bucket'] <= 3]['ros_pred_bias'].mean()
a_high = df_a_ctrl[df_a_ctrl['bucket'] >= 8]['ros_pred_bias'].mean()
print(f"  低str_pred桶(1-3) ROS偏差: {a_low:.4f}")
print(f"  高str_pred桶(8-10) ROS偏差: {a_high:.4f}")
if a_high > a_low:
    print(f"  → 高分享预测样本ROS高估更严重 (+{a_high - a_low:.4f})")
else:
    print(f"  → 低分享预测样本ROS高估更严重 (+{a_low - a_high:.4f})")

# 06b: bias and MAE by true return volume
print("\n【06b: ros_real分桶】真实回流量级与预测误差")
print("-" * 50)
b_low = df_b_ctrl[df_b_ctrl['bucket'] <= 3]['ros_pred_bias'].mean()
b_high = df_b_ctrl[df_b_ctrl['bucket'] >= 8]['ros_pred_bias'].mean()
print(f"  低ros_real桶(1-3) ROS偏差: {b_low:.4f}")
print(f"  高ros_real桶(8-10) ROS偏差: {b_high:.4f}")
print(f"  → 高回流样本被严重低估 (bias={b_high:.2f})")

mae_low = df_b_ctrl[df_b_ctrl['bucket'] <= 3]['ros_pred_mae'].mean()
mae_high = df_b_ctrl[df_b_ctrl['bucket'] >= 8]['ros_pred_mae'].mean()
print(f"  低ros_real桶MAE: {mae_low:.4f}")
print(f"  高ros_real桶MAE: {mae_high:.4f}")

# 06c: calibration of the ROS model against its own predicted buckets
print("\n【06c: ros_pred分桶】ROS模型自身校准度")
print("-" * 50)
c_low = df_c_ctrl[df_c_ctrl['bucket'] <= 3]['ros_pred_bias'].mean()
c_high = df_c_ctrl[df_c_ctrl['bucket'] >= 8]['ros_pred_bias'].mean()
print(f"  低ros_pred桶(1-3) ROS偏差: {c_low:.4f}")
print(f"  高ros_pred桶(8-10) ROS偏差: {c_high:.4f}")
if c_high < 0:
    print(f"  → 高预测区间实际低于预测 (低估程度={abs(c_high):.2f})")

# Global summary from the '全部' (all-buckets) rows of the control group
print("\n" + "=" * 60)
print("核心结论")
print("=" * 60)
df_a_all = df_a[(df_a['abcode'] == '对照组') & (df_a['bucket'] == '全部')]
df_b_all = df_b[(df_b['abcode'] == '对照组') & (df_b['bucket'] == '全部')]
df_c_all = df_c[(df_c['abcode'] == '对照组') & (df_c['bucket'] == '全部')]

print(f"\n全量样本指标(对照组):")
print(f"  样本量: {df_a_all['sample_cnt'].values[0]:,}")
print(f"  回流样本: {df_a_all['return_cnt'].values[0]:,}")
print(f"  str_real: {df_a_all['str_real'].values[0]:.4%}")
print(f"  ros_real: {df_a_all['ros_real'].values[0]:.4f}")
print(f"  ros_pred偏差: {df_a_all['ros_pred_bias'].values[0]:.4f}")
print(f"  ros_stat偏差: {df_a_all['ros_stat_bias'].values[0]:.4f}")

print(f"\n图表已保存至: output/分桶诊断综合分析.png")
+ 177 - 0
tasks/承接/线上实验/分桶诊断分析_full.py

@@ -0,0 +1,177 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# CJK-capable font so Chinese labels render; keep the minus sign ASCII
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False

# Load the two full-bucket exports, excluding yesterday (20260125)
dates = ['20260123', '20260124']
dfs_a, dfs_c = [], []

for dt in dates:
    dfs_a.append(pd.read_csv(f'output/06a_str_pred分桶诊断_full/{dt}.csv'))
    dfs_c.append(pd.read_csv(f'output/06c_ros_pred分桶诊断_full/{dt}.csv'))

df_a = pd.concat(dfs_a, ignore_index=True)
df_c = pd.concat(dfs_c, ignore_index=True)

print(f"数据日期: {dates} (排除昨天)")
print("=" * 70)

# Experiment arms included in the comparisons below (control first)
exp_groups = ['对照组', '实验组-str+校准', '实验组-str+校准&ros-统计量', '实验组-先验地域降权']
+
# ========== 1. Business-metric comparison (all-buckets summary rows) ==========
print("\n【1. 业务指标对比 - 全量汇总】")
print("-" * 70)

# Take the '全部' (all-buckets) rows on the experiment platform, aggregate per arm
df_a_all = df_a[(df_a['bucket'] == '全部') & (df_a['apptype'] == 4)].copy()
df_a_summary = df_a_all.groupby('abcode').agg({
    'sample_cnt': 'sum',
    'return_cnt': 'sum',
    'dau': 'sum',
    'exp': 'sum'
}).reset_index()

# Sample-count-weighted means of the rate metrics across days
metrics = ['str_real', 'str_pred_avg', 'ros_real', 'ros_pred_avg', 'ros_stat_avg',
           'ros_pred_bias', 'ros_stat_bias', 'rovn_real', 'rovn_pred',
           'str_one', 'ros_one', 'str', 'ros', 'vovh24']

for m in metrics:
    df_a_all[f'{m}_weighted'] = df_a_all[m] * df_a_all['sample_cnt']

# NOTE: the .values division below relies on both groupbys producing the same
# (sorted) abcode order, which pandas groupby guarantees by default
weighted_agg = df_a_all.groupby('abcode').agg({f'{m}_weighted': 'sum' for m in metrics})
for m in metrics:
    df_a_summary[m] = weighted_agg[f'{m}_weighted'].values / df_a_summary['sample_cnt'].values

# Comparison table
print(f"\n{'组别':<30} {'样本量':>12} {'str_real':>10} {'ros_real':>10} {'rovn_real':>12} {'vovh24':>10}")
print("-" * 90)
ctrl_row = df_a_summary[df_a_summary['abcode'] == '对照组'].iloc[0]
for _, row in df_a_summary[df_a_summary['abcode'].isin(exp_groups)].iterrows():
    print(f"{row['abcode']:<30} {int(row['sample_cnt']):>12,} {row['str_real']:>10.4%} {row['ros_real']:>10.4f} {row['rovn_real']:>12.6f} {row['vovh24']:>10.6f}")

# Lift of each experiment arm relative to the control row
print("\n【相对对照组提升】")
print("-" * 70)
print(f"{'组别':<35} {'str提升':>10} {'ros提升':>10} {'rovn提升':>10} {'vovh24提升':>10}")
print("-" * 70)
for _, row in df_a_summary[df_a_summary['abcode'].isin(exp_groups)].iterrows():
    if row['abcode'] == '对照组':
        continue
    str_lift = (row['str_real'] / ctrl_row['str_real'] - 1) * 100
    ros_lift = (row['ros_real'] / ctrl_row['ros_real'] - 1) * 100
    rovn_lift = (row['rovn_real'] / ctrl_row['rovn_real'] - 1) * 100
    vovh24_lift = (row['vovh24'] / ctrl_row['vovh24'] - 1) * 100
    print(f"{row['abcode']:<35} {str_lift:>+9.2f}% {ros_lift:>+9.2f}% {rovn_lift:>+9.2f}% {vovh24_lift:>+9.2f}%")
+
# ========== 2. Per-bucket bias analysis (control group) ==========
print("\n\n【2. 分桶偏差分析 - 对照组】")
print("-" * 70)

# Control-group bucket rows only, for both bucketing dimensions
for name, df in [('str_pred分桶', df_a), ('ros_pred分桶', df_c)]:
    df_ctrl = df[(df['abcode'] == '对照组') & (df['bucket'] != '全部') & (df['apptype'] == 4)].copy()
    df_ctrl['bucket'] = df_ctrl['bucket'].astype(int)

    # Aggregate per bucket across dates; the lambdas close over df_ctrl
    # (rebound each loop iteration before .agg runs) to fetch the weights
    df_bucket = df_ctrl.groupby('bucket').agg({
        'sample_cnt': 'sum',
        'return_cnt': 'sum',
        'ros_pred_bias': lambda x: np.average(x, weights=df_ctrl.loc[x.index, 'sample_cnt']),
        'ros_stat_bias': lambda x: np.average(x, weights=df_ctrl.loc[x.index, 'sample_cnt']),
        'ros_pred_mae_return': lambda x: np.average(x, weights=df_ctrl.loc[x.index, 'return_cnt']),
    }).reset_index()

    print(f"\n{name}(对照组):")
    low_bias = df_bucket[df_bucket['bucket'] <= 3]['ros_pred_bias'].mean()
    high_bias = df_bucket[df_bucket['bucket'] >= 8]['ros_pred_bias'].mean()
    print(f"  低桶(1-3) ros_pred偏差: {low_bias:+.4f}")
    print(f"  高桶(8-10) ros_pred偏差: {high_bias:+.4f}")
    print(f"  偏差差异: {high_bias - low_bias:+.4f}")
+
# ========== 3. Visualization ==========
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# 3.1 ROVN comparison bar chart per arm
ax = axes[0, 0]
groups = ['对照组', 'str+校准', 'str+校准\n&ros统计量', '先验地域降权']
group_names = ['对照组', '实验组-str+校准', '实验组-str+校准&ros-统计量', '实验组-先验地域降权']
rovn_values = [df_a_summary[df_a_summary['abcode'] == g]['rovn_real'].values[0] * 1000 for g in group_names]
colors = ['gray', 'steelblue', 'orange', 'green']
bars = ax.bar(groups, rovn_values, color=colors, alpha=0.8)
ax.set_ylabel('rovn (×1000)')
ax.set_title('各组 ROVN 对比')
for bar, val in zip(bars, rovn_values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height(), f'{val:.3f}', ha='center', va='bottom')

# 3.2 STR comparison
ax = axes[0, 1]
str_values = [df_a_summary[df_a_summary['abcode'] == g]['str_real'].values[0] * 100 for g in group_names]
bars = ax.bar(groups, str_values, color=colors, alpha=0.8)
ax.set_ylabel('str_real (%)')
ax.set_title('各组 STR 对比')
for bar, val in zip(bars, str_values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height(), f'{val:.3f}', ha='center', va='bottom')

# 3.3 ROS comparison
ax = axes[0, 2]
ros_values = [df_a_summary[df_a_summary['abcode'] == g]['ros_real'].values[0] for g in group_names]
bars = ax.bar(groups, ros_values, color=colors, alpha=0.8)
ax.set_ylabel('ros_real')
ax.set_title('各组 ROS 对比')
for bar, val in zip(bars, ros_values):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height(), f'{val:.3f}', ha='center', va='bottom')

# 3.4-3.5 Bias trend across buckets for both bucketing dimensions
for idx, (name, df, title) in enumerate([
    ('06a', df_a, 'str_pred分桶 - ROS偏差'),
    ('06c', df_c, 'ros_pred分桶 - ROS偏差')
]):
    ax = axes[1, idx]
    for gname, color in [('对照组', 'gray'), ('实验组-str+校准', 'steelblue'),
                          ('实验组-str+校准&ros-统计量', 'orange')]:
        df_g = df[(df['abcode'] == gname) & (df['bucket'] != '全部') & (df['apptype'] == 4)].copy()
        df_g['bucket'] = df_g['bucket'].astype(int)
        df_g = df_g.groupby('bucket')['ros_pred_bias'].mean().reset_index()
        label = gname.replace('实验组-', '')
        ax.plot(df_g['bucket'], df_g['ros_pred_bias'], 'o-', label=label, color=color, alpha=0.8)
    ax.axhline(y=0, color='red', linestyle='--', linewidth=1, alpha=0.5)
    ax.set_xlabel('分桶')
    ax.set_ylabel('ros_pred偏差')
    ax.set_title(title)
    ax.legend(fontsize=8)
    ax.set_xticks(range(1, 11))

# 3.6 Real vs predicted ROS across str_pred buckets (control group)
ax = axes[1, 2]
df_ctrl = df_a[(df_a['abcode'] == '对照组') & (df_a['bucket'] != '全部') & (df_a['apptype'] == 4)].copy()
df_ctrl['bucket'] = df_ctrl['bucket'].astype(int)
df_bucket = df_ctrl.groupby('bucket').agg({'ros_real': 'mean', 'ros_pred_avg': 'mean', 'ros_stat_avg': 'mean'}).reset_index()
ax.plot(df_bucket['bucket'], df_bucket['ros_real'], 'o-', label='ros_real', color='green', linewidth=2)
ax.plot(df_bucket['bucket'], df_bucket['ros_pred_avg'], 's--', label='ros_pred', color='blue', linewidth=2)
ax.plot(df_bucket['bucket'], df_bucket['ros_stat_avg'], '^--', label='ros_stat', color='orange', linewidth=2)
ax.set_xlabel('str_pred分桶')
ax.set_ylabel('ROS')
ax.set_title('str_pred分桶 - ROS真实vs预测 (对照组)')
ax.legend()
ax.set_xticks(range(1, 11))

plt.tight_layout()
plt.savefig('output/分桶诊断综合分析_full.png', dpi=150, bbox_inches='tight')
plt.close()

print(f"\n\n图表已保存至: output/分桶诊断综合分析_full.png")
+
# ========== 4. Experiment-arm summary table ==========
print("\n\n【3. 实验组 vs 对照组 完整指标】")
print("=" * 70)

# Full metric set per arm, rounded for console display
cols = ['abcode', 'sample_cnt', 'str_real', 'ros_real', 'ros_pred_bias', 'ros_stat_bias',
        'rovn_real', 'str_one', 'ros_one', 'vovh24']
summary_table = df_a_summary[df_a_summary['abcode'].isin(exp_groups)][cols].copy()
summary_table = summary_table.round(6)
print(summary_table.to_string(index=False))
+ 174 - 0
tasks/承接/线上实验/分桶诊断分析_full_v2.py

@@ -0,0 +1,174 @@
+"""Bucket diagnosis v2: actual vs predicted STR/ROS comparison per AB group."""
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+
+# CJK-capable font so Chinese labels render; keep plain ASCII minus signs.
+plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']
+plt.rcParams['axes.unicode_minus'] = False
+
+# Load per-date bucket-diagnosis exports (yesterday, 20260125, deliberately excluded).
+dates = ['20260123', '20260124']
+dfs_a, dfs_c = [], []
+
+for dt in dates:
+    dfs_a.append(pd.read_csv(f'output/06a_str_pred分桶诊断_full/{dt}.csv'))
+    dfs_c.append(pd.read_csv(f'output/06c_ros_pred分桶诊断_full/{dt}.csv'))
+
+df_a = pd.concat(dfs_a, ignore_index=True)  # str_pred-bucketed rows
+df_c = pd.concat(dfs_c, ignore_index=True)  # ros_pred-bucketed rows
+
+print(f"数据日期: {dates} (排除昨天)")
+print("=" * 80)
+
+# Groups shown in every table: control plus the two calibration experiment arms.
+exp_groups = ['对照组', '实验组-str+校准', '实验组-str+校准&ros-统计量']
+
+# ========== 1. Actual vs predicted comparison (overall roll-up) ==========
+print("\n【1. 真实值 vs 预测值对比 - 全量汇总】")
+print("-" * 80)
+
+# bucket == '全部' rows are the per-date overall roll-ups; apptype 4 only.
+df_all = df_a[(df_a['bucket'] == '全部') & (df_a['apptype'] == 4)].copy()
+# One row per group: sample-count-weighted averages across the loaded dates.
+df_summary = df_all.groupby('abcode', group_keys=False).apply(lambda x: pd.Series({
+    'sample_cnt': x['sample_cnt'].sum(),
+    'str_real': np.average(x['str_real'], weights=x['sample_cnt']),
+    'str_pred': np.average(x['str_pred_avg'], weights=x['sample_cnt']),
+    'ros_real': np.average(x['ros_real'], weights=x['sample_cnt']),
+    'ros_pred': np.average(x['ros_pred_avg'], weights=x['sample_cnt']),
+    'ros_stat': np.average(x['ros_stat_avg'], weights=x['sample_cnt']),
+    'rovn_real': np.average(x['rovn_real'], weights=x['sample_cnt']),
+    'rovn_pred': np.average(x['rovn_pred'], weights=x['sample_cnt']),
+})).reset_index()
+
+print(f"\n{'组别':<35} {'str_real':>10} {'str_pred':>10} {'str差异':>10}")
+print("-" * 70)
+for _, row in df_summary[df_summary['abcode'].isin(exp_groups)].iterrows():
+    str_diff = row['str_pred'] - row['str_real']
+    print(f"{row['abcode']:<35} {row['str_real']:>10.4%} {row['str_pred']:>10.4%} {str_diff:>+10.4%}")
+
+print(f"\n{'组别':<35} {'ros_real':>10} {'ros_pred':>10} {'ros_stat':>10} {'pred差异':>10} {'stat差异':>10}")
+print("-" * 100)
+for _, row in df_summary[df_summary['abcode'].isin(exp_groups)].iterrows():
+    pred_diff = row['ros_pred'] - row['ros_real']
+    stat_diff = row['ros_stat'] - row['ros_real']
+    print(f"{row['abcode']:<35} {row['ros_real']:>10.4f} {row['ros_pred']:>10.4f} {row['ros_stat']:>10.4f} {pred_diff:>+10.4f} {stat_diff:>+10.4f}")
+
+# ========== 2. Per-bucket actual vs predicted detail (control group) ==========
+print("\n\n【2. 分桶真实值 vs 预测值详细对比 - 对照组】")
+print("=" * 80)
+
+for name, df in [('str_pred分桶', df_a), ('ros_pred分桶', df_c)]:
+    print(f"\n{name}:")
+    print("-" * 90)
+
+    df_ctrl = df[(df['abcode'] == '对照组') & (df['bucket'] != '全部') & (df['apptype'] == 4)].copy()
+    df_ctrl['bucket'] = df_ctrl['bucket'].astype(int)
+
+    # Aggregate per bucket. Each lambda receives the group's values with their
+    # original df_ctrl index, so .loc[x.index, ...] realigns the weight column.
+    # STR-side metrics are weighted by sample_cnt; ros_real by return_cnt.
+    agg_cols = {
+        'sample_cnt': 'sum',
+        'return_cnt': 'sum',
+        'str_real': lambda x: np.average(x, weights=df_ctrl.loc[x.index, 'sample_cnt']),
+        'str_pred_avg': lambda x: np.average(x, weights=df_ctrl.loc[x.index, 'sample_cnt']),
+        'ros_real': lambda x: np.average(x, weights=df_ctrl.loc[x.index, 'return_cnt']) if df_ctrl.loc[x.index, 'return_cnt'].sum() > 0 else np.nan,
+        'ros_pred_avg': lambda x: np.average(x, weights=df_ctrl.loc[x.index, 'sample_cnt']),
+        'ros_stat_avg': lambda x: np.average(x, weights=df_ctrl.loc[x.index, 'sample_cnt']),
+    }
+    df_bucket = df_ctrl.groupby('bucket').agg(agg_cols).reset_index()
+
+    print(f"{'桶':>4} {'样本量':>12} {'str_real':>10} {'str_pred':>10} {'str差异':>10} {'ros_real':>10} {'ros_pred':>10} {'ros_stat':>10} {'pred差异':>10}")
+    print("-" * 110)
+    for _, row in df_bucket.iterrows():
+        str_diff = row['str_pred_avg'] - row['str_real']
+        ros_pred_diff = row['ros_pred_avg'] - row['ros_real'] if pd.notna(row['ros_real']) else np.nan
+        print(f"{int(row['bucket']):>4} {int(row['sample_cnt']):>12,} {row['str_real']:>10.4%} {row['str_pred_avg']:>10.4%} {str_diff:>+10.4%} {row['ros_real']:>10.4f} {row['ros_pred_avg']:>10.4f} {row['ros_stat_avg']:>10.4f} {ros_pred_diff:>+10.4f}")
+
+# ========== 3. Visualization: actual vs predicted trends ==========
+fig, axes = plt.subplots(2, 3, figsize=(16, 10))
+date_label = f"数据: {', '.join(dates)}"
+fig.suptitle(f'分桶诊断 - 真实值 vs 预测值对比\n({date_label})', fontsize=14, fontweight='bold')
+
+# One chart row per bucketing scheme; columns: STR, ROS (all), ROS (returned only).
+for row_idx, (name, df, title_prefix) in enumerate([
+    ('str_pred分桶', df_a, 'str_pred分桶'),
+    ('ros_pred分桶', df_c, 'ros_pred分桶')
+]):
+    df_ctrl = df[(df['abcode'] == '对照组') & (df['bucket'] != '全部') & (df['apptype'] == 4)].copy()
+    df_ctrl['bucket'] = df_ctrl['bucket'].astype(int)
+    # NOTE(review): plain means here (unweighted across dates), unlike the weighted tables above.
+    df_bucket = df_ctrl.groupby('bucket').agg({
+        'str_real': 'mean',
+        'str_pred_avg': 'mean',
+        'ros_real': 'mean',
+        'ros_pred_avg': 'mean',
+        'ros_stat_avg': 'mean',
+        'ros_real_return': 'mean',
+        'ros_pred_return': 'mean',
+    }).reset_index()
+
+    # STR: actual vs predicted
+    ax = axes[row_idx, 0]
+    ax.plot(df_bucket['bucket'], df_bucket['str_real'] * 100, 'o-', label='str_real', color='green', linewidth=2)
+    ax.plot(df_bucket['bucket'], df_bucket['str_pred_avg'] * 100, 's--', label='str_pred', color='blue', linewidth=2)
+    ax.set_xlabel('分桶')
+    ax.set_ylabel('STR (%)')
+    ax.set_title(f'{title_prefix} - STR真实vs预测')
+    ax.legend()
+    ax.set_xticks(range(1, 11))
+
+    # ROS: actual vs predicted vs statistical score (all samples)
+    ax = axes[row_idx, 1]
+    ax.plot(df_bucket['bucket'], df_bucket['ros_real'], 'o-', label='ros_real', color='green', linewidth=2)
+    ax.plot(df_bucket['bucket'], df_bucket['ros_pred_avg'], 's--', label='ros_pred', color='blue', linewidth=2)
+    ax.plot(df_bucket['bucket'], df_bucket['ros_stat_avg'], '^--', label='ros_stat', color='orange', linewidth=2)
+    ax.set_xlabel('分桶')
+    ax.set_ylabel('ROS')
+    ax.set_title(f'{title_prefix} - ROS真实vs预测vs统计量')
+    ax.legend()
+    ax.set_xticks(range(1, 11))
+
+    # ROS restricted to returned samples only
+    ax = axes[row_idx, 2]
+    ax.plot(df_bucket['bucket'], df_bucket['ros_real_return'], 'o-', label='ros_real(回流)', color='green', linewidth=2)
+    ax.plot(df_bucket['bucket'], df_bucket['ros_pred_return'], 's--', label='ros_pred(回流)', color='blue', linewidth=2)
+    ax.set_xlabel('分桶')
+    ax.set_ylabel('ROS (仅回流样本)')
+    ax.set_title(f'{title_prefix} - ROS回流样本真实vs预测')
+    ax.legend()
+    ax.set_xticks(range(1, 11))
+
+plt.tight_layout(rect=[0, 0, 1, 0.95])
+plt.savefig('output/分桶诊断_真实vs预测对比.png', dpi=150, bbox_inches='tight')
+plt.close()
+
+print(f"\n\n图表已保存至: output/分桶诊断_真实vs预测对比.png")
+
+# ========== 4. Per-group prediction-error summary (MAE) ==========
+print("\n\n【3. 各组预测误差汇总 - MAE对比】")
+print("=" * 80)
+
+df_all = df_a[(df_a['bucket'] == '全部') & (df_a['apptype'] == 4)].copy()
+# MAE columns are weighted by return counts; over/under splits drop NaN rows first.
+mae_summary = df_all.groupby('abcode', group_keys=False).apply(lambda x: pd.Series({
+    'ros_pred_mae_return': np.average(x['ros_pred_mae_return'], weights=x['return_cnt']),
+    'ros_stat_mae_return': np.average(x['ros_stat_mae_return'], weights=x['return_cnt']),
+    'ros_pred_mae_over': np.average(x['ros_pred_mae_over'].dropna(), weights=x.loc[x['ros_pred_mae_over'].dropna().index, 'return_cnt']) if x['ros_pred_mae_over'].notna().any() else np.nan,
+    'ros_pred_mae_under': np.average(x['ros_pred_mae_under'].dropna(), weights=x.loc[x['ros_pred_mae_under'].dropna().index, 'return_cnt']) if x['ros_pred_mae_under'].notna().any() else np.nan,
+})).reset_index()
+
+print(f"\n{'组别':<35} {'ros_pred_MAE':>12} {'ros_stat_MAE':>12} {'高估MAE':>10} {'低估MAE':>10}")
+print("-" * 85)
+for _, row in mae_summary[mae_summary['abcode'].isin(exp_groups)].iterrows():
+    print(f"{row['abcode']:<35} {row['ros_pred_mae_return']:>12.4f} {row['ros_stat_mae_return']:>12.4f} {row['ros_pred_mae_over']:>10.4f} {row['ros_pred_mae_under']:>10.4f}")
+
+# ========== 5. Key findings summary ==========
+print("\n\n" + "=" * 80)
+print("【关键发现总结】")
+print("=" * 80)
+
+# Control-group baseline first, then each experiment arm with the same layout.
+ctrl = df_summary[df_summary['abcode'] == '对照组'].iloc[0]
+print(f"\n对照组基准:")
+print(f"  STR: 真实={ctrl['str_real']:.4%}, 预测={ctrl['str_pred']:.4%}, 差异={ctrl['str_pred']-ctrl['str_real']:+.4%}")
+print(f"  ROS: 真实={ctrl['ros_real']:.4f}, 预测={ctrl['ros_pred']:.4f}, 统计量={ctrl['ros_stat']:.4f}")
+print(f"       pred差异={ctrl['ros_pred']-ctrl['ros_real']:+.4f}, stat差异={ctrl['ros_stat']-ctrl['ros_real']:+.4f}")
+
+for gname in ['实验组-str+校准', '实验组-str+校准&ros-统计量']:
+    row = df_summary[df_summary['abcode'] == gname].iloc[0]
+    print(f"\n{gname}:")
+    print(f"  STR: 真实={row['str_real']:.4%}, 预测={row['str_pred']:.4%}, 差异={row['str_pred']-row['str_real']:+.4%}")
+    print(f"  ROS: 真实={row['ros_real']:.4f}, 预测={row['ros_pred']:.4f}, 统计量={row['ros_stat']:.4f}")
+    print(f"       pred差异={row['ros_pred']-row['ros_real']:+.4f}, stat差异={row['ros_stat']-row['ros_real']:+.4f}")

+ 293 - 0
tasks/指标分析/02_实验组xTop20视频_vs对照组_误差分析_v2_hh.sql

@@ -0,0 +1,293 @@
+-- Preprocessing: parse scoresMap + classify page
+-- v4: add top20-vid grouping + GROUPING SETS + exposure share
+-- v5: add change-rate columns relative to the control group
+-- v6: add rosn_ori (uncalibrated raw score) vs the calibrated rosn_pred
+-- v7: add rosn log-space stddev (dispersion of relative multiplicative error)
+WITH t_raw AS
+(
+    SELECT  *
+            ,REPLACE(GET_JSON_OBJECT(extend_alg,'$.scoresMap'),"\\","") AS scoresmap
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_type
+    FROM    loghubods.dwd_recsys_alg_sample_all_20250212
+    WHERE   dt = '${dt}'
+    AND     apptype IN ("0","4")
+    AND     abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
+    -- NOTE(review): redundant filter — "ab100" can never pass the IN list above.
+    AND     abcode NOT IN ("ab100")
+    AND     extend_alg IS NOT NULL
+    AND     GET_JSON_OBJECT(extend_alg,'$.scoresMap') IS NOT NULL
+)
+-- Filter: keep recommendation pages only
+,t_filtered AS
+(
+    SELECT  *
+    FROM    t_raw
+    WHERE   page_type = "推荐"
+)
+-- Feature extraction and dimension mapping (abcode pairs -> experiment-arm labels)
+,t_base AS
+(
+    SELECT  dt
+            ,apptype
+            ,CASE   WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
+                    WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
+                    WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
+                    WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
+                    WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
+                    ELSE "其他"
+            END AS abcode
+            ,page_type AS page
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,is_return_noself
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+            ,scoresmap
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.fmRov') AS DOUBLE) AS str_pred
+            -- Calibrated rosn; NOTE(review): 1.22/1.15 look like offline-fitted calibration constants — confirm provenance.
+            ,1.22 * pow(CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.NorXGBScore') AS DOUBLE) AS rosn_ori
+            ,CAST(GET_JSON_OBJECT(scoresmap,'$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
+            ,GET_JSON_OBJECT(v1_feature,'$.title') AS vid_title
+    FROM    t_filtered
+)
+-- Keep only rows where both model scores parsed successfully
+,t_valid AS
+(
+    SELECT  *
+    FROM    t_base
+    WHERE   str_pred IS NOT NULL
+    AND     rosn_pred IS NOT NULL
+)
+-- Top-20 vids by exposure count within each dt/apptype/abcode
+,t_vid_rank AS
+(
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,vid
+            ,COUNT(1) AS vid_exp_cnt
+            ,ROW_NUMBER() OVER (PARTITION BY dt, apptype, abcode ORDER BY COUNT(1) DESC) AS vid_rank
+    FROM    t_valid
+    GROUP BY dt, apptype, abcode, vid
+)
+-- NOTE(review): name says top5 but the cutoff is top-20 (vid_rank <= 20); the top5_* names below carry the same mismatch.
+,t_top5_vid AS
+(
+    SELECT  dt, apptype, abcode, vid, vid_rank
+    FROM    t_vid_rank
+    WHERE   vid_rank <= 20
+)
+-- Mark each sample row with its vid if that vid is in the group's top-20
+,t_with_top5 AS
+(
+    SELECT  a.*
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid ELSE NULL END AS top5_vid
+            ,CASE WHEN b.vid IS NOT NULL THEN a.vid_title ELSE NULL END AS top5_vid_title
+            ,b.vid_rank AS top5_vid_rank
+    FROM    t_valid a
+    LEFT JOIN t_top5_vid b
+    ON      a.dt = b.dt
+    AND     a.apptype = b.apptype
+    AND     a.abcode = b.abcode
+    AND     a.vid = b.vid
+)
+-- Aggregate first: GROUPING SETS yields both the per-group roll-up (vid='all') and per-top20-vid rows
+,t_agg AS
+(
+    SELECT  dt
+            ,COALESCE(apptype, 'sum') AS apptype
+            ,COALESCE(abcode, 'sum') AS abcode
+            ,COALESCE(CAST(top5_vid AS STRING), 'all') AS vid
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_title) END AS vid_title
+            ,CASE WHEN GROUPING(top5_vid) = 1 THEN NULL ELSE MAX(top5_vid_rank) END AS vid_rank
+            -- COPC (actual / predicted calibration ratios)
+            ,round((SUM(is_return_noself) / COUNT(1)) / NULLIF(SUM(str_pred) / COUNT(1), 0), 4) AS str_copc
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN rosn_pred END), 0), 4) AS rosn_copc
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN rosn_ori END), 0), 4) AS rosn_ori_copc
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN return_n_uv_noself END) / NULLIF(AVG(CASE WHEN is_return_noself = 1 THEN rosn_stat END), 0), 4) AS rosn_stat_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_pred), 0), 4) AS rovn_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_ori), 0), 4) AS rovn_ori_copc
+            ,round((SUM(return_n_uv_noself) / COUNT(1)) / NULLIF(AVG(str_pred * rosn_stat), 0), 4) AS rovn_stat_copc
+            -- Model predictions vs actuals
+            ,round(COALESCE(SUM(is_return_noself) / COUNT(1),0),6) AS str_real
+            ,round(COALESCE(SUM(str_pred) / COUNT(1),0),6) AS str_pred
+            ,round(COALESCE(SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0),0),6) AS rosn_real
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN rosn_pred END),6) AS rosn_pred
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN rosn_ori END),6) AS rosn_ori
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN rosn_stat END),6) AS rosn_stat
+            ,round(SUM(return_n_uv_noself) / COUNT(1), 6) AS rovn_real
+            ,round(AVG(str_pred * rosn_pred), 6) AS rovn_pred
+            ,round(AVG(str_pred * rosn_ori), 6) AS rovn_ori
+            ,round(AVG(str_pred * rosn_stat), 6) AS rovn_stat
+            -- Error: str (unconditional)
+            ,round(AVG(ABS(str_pred - is_return_noself)),6) AS str_mae
+            -- Error: rosn (conditional on is_return_noself=1)
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_pred - return_n_uv_noself) END),6) AS rosn_pred_mae
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_ori - return_n_uv_noself) END),6) AS rosn_ori_mae
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_stat - return_n_uv_noself) END),6) AS rosn_stat_mae
+            -- Error: rovn (unconditional)
+            ,round(AVG(ABS(str_pred * rosn_pred - return_n_uv_noself)),6) AS rovn_pred_mae
+            ,round(AVG(ABS(str_pred * rosn_ori - return_n_uv_noself)),6) AS rovn_ori_mae
+            ,round(AVG(ABS(str_pred * rosn_stat - return_n_uv_noself)),6) AS rovn_stat_mae
+            -- MAPE: relative error (rosn only, returned samples where actual > 0)
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_pred - return_n_uv_noself) / return_n_uv_noself END),6) AS rosn_pred_mape
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_ori - return_n_uv_noself) / return_n_uv_noself END),6) AS rosn_ori_mape
+            ,round(AVG(CASE WHEN is_return_noself = 1 THEN ABS(rosn_stat - return_n_uv_noself) / return_n_uv_noself END),6) AS rosn_stat_mape
+            -- Variance: error dispersion (high = erratic, low = stable offset)
+            ,round(VARIANCE(str_pred - is_return_noself),6) AS str_var
+            ,round(VARIANCE(CASE WHEN is_return_noself = 1 THEN rosn_pred - return_n_uv_noself END),6) AS rosn_pred_var
+            ,round(VARIANCE(CASE WHEN is_return_noself = 1 THEN rosn_ori - return_n_uv_noself END),6) AS rosn_ori_var
+            ,round(VARIANCE(CASE WHEN is_return_noself = 1 THEN rosn_stat - return_n_uv_noself END),6) AS rosn_stat_var
+            ,round(VARIANCE(str_pred * rosn_pred - return_n_uv_noself),6) AS rovn_pred_var
+            ,round(VARIANCE(str_pred * rosn_ori - return_n_uv_noself),6) AS rovn_ori_var
+            ,round(VARIANCE(str_pred * rosn_stat - return_n_uv_noself),6) AS rovn_stat_var
+            -- Log-space error: LN(actual) - LN(pred), used for calibration analysis
+            ,round(VARIANCE(CASE WHEN is_return_noself = 1 AND rosn_pred > 0 AND return_n_uv_noself > 0 THEN LN(return_n_uv_noself) - LN(rosn_pred) END),6) AS rosn_pred_log_var
+            ,round(VARIANCE(CASE WHEN is_return_noself = 1 AND rosn_ori > 0 AND return_n_uv_noself > 0 THEN LN(return_n_uv_noself) - LN(rosn_ori) END),6) AS rosn_ori_log_var
+            ,round(VARIANCE(CASE WHEN is_return_noself = 1 AND rosn_stat > 0 AND return_n_uv_noself > 0 THEN LN(return_n_uv_noself) - LN(rosn_stat) END),6) AS rosn_stat_log_var
+            ,round(STDDEV(CASE WHEN is_return_noself = 1 AND rosn_pred > 0 AND return_n_uv_noself > 0 THEN LN(return_n_uv_noself) - LN(rosn_pred) END),6) AS rosn_pred_log_std
+            ,round(STDDEV(CASE WHEN is_return_noself = 1 AND rosn_ori > 0 AND return_n_uv_noself > 0 THEN LN(return_n_uv_noself) - LN(rosn_ori) END),6) AS rosn_ori_log_std
+            ,round(STDDEV(CASE WHEN is_return_noself = 1 AND rosn_stat > 0 AND return_n_uv_noself > 0 THEN LN(return_n_uv_noself) - LN(rosn_stat) END),6) AS rosn_stat_log_std
+            -- Theoretical calibration factor: EXP(log_variance / 2)
+            ,round(EXP(VARIANCE(CASE WHEN is_return_noself = 1 AND rosn_pred > 0 AND return_n_uv_noself > 0 THEN LN(return_n_uv_noself) - LN(rosn_pred) END) / 2),6) AS rosn_pred_correction
+            ,round(EXP(VARIANCE(CASE WHEN is_return_noself = 1 AND rosn_ori > 0 AND return_n_uv_noself > 0 THEN LN(return_n_uv_noself) - LN(rosn_ori) END) / 2),6) AS rosn_ori_correction
+            ,round(EXP(VARIANCE(CASE WHEN is_return_noself = 1 AND rosn_stat > 0 AND return_n_uv_noself > 0 THEN LN(return_n_uv_noself) - LN(rosn_stat) END) / 2),6) AS rosn_stat_correction
+            -- Sample counts
+            ,COUNT(1) AS str_samples
+            ,SUM(CASE WHEN is_return_noself = 1 THEN 1 ELSE 0 END) AS rosn_samples
+            ,COUNT(1) AS rovn_samples
+            -- Business metrics
+            ,round(COALESCE(COUNT(1) / COUNT(DISTINCT mid),0),2) AS exp_per_dau
+            ,round(COALESCE(SUM(is_share) / COUNT(1),0),6) AS str_one
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_share),0),6) AS ros_one
+            ,round(COALESCE(SUM(share_cnt) / COUNT(1),0),6) AS str
+            ,round(COALESCE(SUM(return_n_uv) / SUM(share_cnt),0),6) AS ros
+            ,round(COALESCE(SUM(is_return_1) / COUNT(1),0),6) AS str_plus
+            ,round(COALESCE(SUM(return_n_uv) / SUM(is_return_1),0),6) AS ros_minus
+            ,round(COALESCE(SUM(return_n_uv) / COUNT(1),0),6) AS rovn
+            ,round(COALESCE(SUM(new_exposure_cnt) / COUNT(1),0),6) AS vovh24
+            ,COUNT(DISTINCT mid) AS dau
+            ,COUNT(1) AS exp
+            ,COALESCE(SUM(is_share),0) AS is_share
+            ,COALESCE(SUM(share_cnt),0) AS share_cnt
+            ,COALESCE(SUM(is_return_1),0) AS is_return_1
+            ,COALESCE(SUM(return_n_uv),0) AS return_n_uv
+            ,COALESCE(SUM(new_exposure_cnt),0) AS viewh24
+            ,COALESCE(SUM(return_n_uv_noself),0) AS return_n_uv_noself
+    FROM    t_with_top5
+    GROUP BY dt, apptype, abcode, top5_vid
+    GROUPING SETS (
+        (dt, apptype, abcode),
+        (dt, apptype, abcode, top5_vid)
+    )
+    -- Drop the NULL-vid rows from the finer grouping set; keep the roll-up rows.
+    HAVING  top5_vid IS NOT NULL OR GROUPING(top5_vid) = 1
+)
+-- New: fetch control-group baselines via window functions and compute change rates
+,t_with_baseline AS
+(
+    SELECT  *
+            -- Exposure share of each vid row vs the group's 'all' roll-up row
+            ,round(exp * 1.0 / MAX(CASE WHEN vid = 'all' THEN exp END) OVER (PARTITION BY dt, apptype, abcode), 4) AS exp_pct
+            -- Control-group baselines (business metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp_per_dau END) OVER (PARTITION BY dt, apptype, vid) AS exp_per_dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_one END) OVER (PARTITION BY dt, apptype, vid) AS str_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_one END) OVER (PARTITION BY dt, apptype, vid) AS ros_one_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str END) OVER (PARTITION BY dt, apptype, vid) AS str_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros END) OVER (PARTITION BY dt, apptype, vid) AS ros_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_plus END) OVER (PARTITION BY dt, apptype, vid) AS str_plus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN ros_minus END) OVER (PARTITION BY dt, apptype, vid) AS ros_minus_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn END) OVER (PARTITION BY dt, apptype, vid) AS rovn_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN vovh24 END) OVER (PARTITION BY dt, apptype, vid) AS vovh24_base
+            -- Control-group baselines (COPC metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_copc END) OVER (PARTITION BY dt, apptype, vid) AS str_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_ori_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_ori_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rosn_stat_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_ori_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_ori_copc_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_stat_copc END) OVER (PARTITION BY dt, apptype, vid) AS rovn_stat_copc_base
+            -- Control-group baselines (actuals)
+            ,MAX(CASE WHEN abcode = '对照组' THEN str_real END) OVER (PARTITION BY dt, apptype, vid) AS str_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rosn_real END) OVER (PARTITION BY dt, apptype, vid) AS rosn_real_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN rovn_real END) OVER (PARTITION BY dt, apptype, vid) AS rovn_real_base
+            -- Control-group baselines (count metrics)
+            ,MAX(CASE WHEN abcode = '对照组' THEN dau END) OVER (PARTITION BY dt, apptype, vid) AS dau_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN exp END) OVER (PARTITION BY dt, apptype, vid) AS exp_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_share END) OVER (PARTITION BY dt, apptype, vid) AS is_share_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN share_cnt END) OVER (PARTITION BY dt, apptype, vid) AS share_cnt_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN is_return_1 END) OVER (PARTITION BY dt, apptype, vid) AS is_return_1_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN viewh24 END) OVER (PARTITION BY dt, apptype, vid) AS viewh24_base
+            ,MAX(CASE WHEN abcode = '对照组' THEN return_n_uv_noself END) OVER (PARTITION BY dt, apptype, vid) AS return_n_uv_noself_base
+    FROM    t_agg
+)
+-- Final output: original columns + change rates vs control
+SELECT  dt
+        ,apptype
+        ,abcode
+        ,vid
+        ,vid_title
+        ,vid_rank
+        ,exp_pct
+        ,round((dau - dau_base) / NULLIF(dau_base, 0), 4) AS dau_chg
+        ,round((exp - exp_base) / NULLIF(exp_base, 0), 4) AS exp_chg
+        -- str (all samples; predicts whether a return happens)
+        ,str_samples, str_real, str_pred, str_copc, str_mae, str_var
+        -- rosn (returned subset; predicts return UV)
+        ,rosn_samples, rosn_real, rosn_pred, rosn_ori, rosn_stat
+        ,rosn_copc, rosn_ori_copc, rosn_stat_copc
+        ,rosn_pred_mae, rosn_ori_mae, rosn_stat_mae
+        ,rosn_pred_var, rosn_ori_var, rosn_stat_var
+        ,rosn_pred_mape, rosn_ori_mape, rosn_stat_mape
+        ,rosn_pred_log_var, rosn_ori_log_var, rosn_stat_log_var
+        ,rosn_pred_log_std, rosn_ori_log_std, rosn_stat_log_std
+        ,rosn_pred_correction, rosn_ori_correction, rosn_stat_correction
+        -- rovn (all samples; predicts return value)
+        ,rovn_samples, rovn_real, rovn_pred, rovn_ori, rovn_stat
+        ,rovn_copc, rovn_ori_copc, rovn_stat_copc
+        ,rovn_pred_mae, rovn_ori_mae, rovn_stat_mae
+        ,rovn_pred_var, rovn_ori_var, rovn_stat_var
+        -- Business metrics
+        ,exp_per_dau, str_one, ros_one, str, ros, str_plus, ros_minus, rovn, vovh24
+        -- Counts
+        ,dau, exp, is_share, share_cnt, is_return_1, return_n_uv, viewh24, return_n_uv_noself
+        -- ========== Change-rate columns ==========
+        -- Business-metric change rates
+        ,round((exp_per_dau - exp_per_dau_base) / NULLIF(exp_per_dau_base, 0), 4) AS exp_per_dau_chg
+        ,round((str_one - str_one_base) / NULLIF(str_one_base, 0), 4) AS str_one_chg
+        ,round((ros_one - ros_one_base) / NULLIF(ros_one_base, 0), 4) AS ros_one_chg
+        ,round((str - str_base) / NULLIF(str_base, 0), 4) AS str_chg
+        ,round((ros - ros_base) / NULLIF(ros_base, 0), 4) AS ros_chg
+        ,round((str_plus - str_plus_base) / NULLIF(str_plus_base, 0), 4) AS str_plus_chg
+        ,round((ros_minus - ros_minus_base) / NULLIF(ros_minus_base, 0), 4) AS ros_minus_chg
+        ,round((rovn - rovn_base) / NULLIF(rovn_base, 0), 4) AS rovn_chg
+        ,round((vovh24 - vovh24_base) / NULLIF(vovh24_base, 0), 4) AS vovh24_chg
+        -- COPC change rates
+        ,round((str_copc - str_copc_base) / NULLIF(str_copc_base, 0), 4) AS str_copc_chg
+        ,round((rosn_copc - rosn_copc_base) / NULLIF(rosn_copc_base, 0), 4) AS rosn_copc_chg
+        ,round((rosn_ori_copc - rosn_ori_copc_base) / NULLIF(rosn_ori_copc_base, 0), 4) AS rosn_ori_copc_chg
+        ,round((rosn_stat_copc - rosn_stat_copc_base) / NULLIF(rosn_stat_copc_base, 0), 4) AS rosn_stat_copc_chg
+        ,round((rovn_copc - rovn_copc_base) / NULLIF(rovn_copc_base, 0), 4) AS rovn_copc_chg
+        ,round((rovn_ori_copc - rovn_ori_copc_base) / NULLIF(rovn_ori_copc_base, 0), 4) AS rovn_ori_copc_chg
+        ,round((rovn_stat_copc - rovn_stat_copc_base) / NULLIF(rovn_stat_copc_base, 0), 4) AS rovn_stat_copc_chg
+        -- Actual-value change rates
+        ,round((str_real - str_real_base) / NULLIF(str_real_base, 0), 4) AS str_real_chg
+        ,round((rosn_real - rosn_real_base) / NULLIF(rosn_real_base, 0), 4) AS rosn_real_chg
+        ,round((rovn_real - rovn_real_base) / NULLIF(rovn_real_base, 0), 4) AS rovn_real_chg
+        -- Count-metric change rates
+        ,round((is_share - is_share_base) / NULLIF(is_share_base, 0), 4) AS is_share_chg
+        ,round((share_cnt - share_cnt_base) / NULLIF(share_cnt_base, 0), 4) AS share_cnt_chg
+        ,round((is_return_1 - is_return_1_base) / NULLIF(is_return_1_base, 0), 4) AS is_return_1_chg
+        ,round((return_n_uv - return_n_uv_base) / NULLIF(return_n_uv_base, 0), 4) AS return_n_uv_chg
+        ,round((viewh24 - viewh24_base) / NULLIF(viewh24_base, 0), 4) AS viewh24_chg
+        ,round((return_n_uv_noself - return_n_uv_noself_base) / NULLIF(return_n_uv_noself_base, 0), 4) AS return_n_uv_noself_chg
+FROM    t_with_baseline
+ORDER BY dt DESC, apptype, abcode, exp DESC
+;

+ 586 - 0
tmp/低vov高曝光分析/step10_可视化.py

@@ -0,0 +1,586 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+Step10: build the final analysis report as a single HTML page,
+presenting the findings one analysis step at a time.
+"""
+import pandas as pd
+import json
+from pathlib import Path
+
+output_dir = Path(__file__).parent / "output"
+
+# Load the per-step outputs produced by the earlier analysis steps
+df_daily = pd.read_csv(output_dir / "step7_头部vov趋势_合并.csv").sort_values('dt')
+df_monthly = pd.read_csv(output_dir / "step7_月度统计.csv")
+
+# Daily chart series: dates plus VoV / problem-ratio / STR / VOR trends
+# (STR is scaled x100 so it is readable on the same axis style as VOR)
+dates = df_daily['dt'].astype(str).tolist()
+vov_data = df_daily['vov_mean'].round(4).tolist()
+problem_data = df_daily['problem_pct'].round(1).tolist()
+str_data = [round(x*100, 2) for x in df_daily['str_mean'].tolist()]
+vor_data = df_daily['vor_mean'].round(2).tolist()
+
+# Monthly aggregates, rendered client-side into the month-over-month table
+monthly_data = []
+for _, row in df_monthly.iterrows():
+    monthly_data.append({
+        'month': str(row['month']),
+        'vov': round(row['vov均值'], 4),
+        'str': round(row['str'], 4),
+        'ros': round(row['ros'], 2),
+        'vor': round(row['vor'], 2),
+        'problem': round(row['问题比例%'], 1)
+    })
+
+# NOTE: the whole page is one f-string; literal CSS/JS braces are doubled
+# ({{ }}), and all embedded HTML/JS text (including Chinese labels and the
+# hard-coded step-2/3/4 figures) is runtime content, not comments.
+html_content = f'''<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>低VoV高曝光问题分析报告</title>
+    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
+    <script src="https://cdn.jsdelivr.net/npm/chartjs-plugin-annotation"></script>
+    <style>
+        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+        body {{
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            min-height: 100vh;
+            padding: 40px 20px;
+        }}
+        .report {{
+            max-width: 1000px;
+            margin: 0 auto;
+            background: white;
+            border-radius: 16px;
+            box-shadow: 0 20px 60px rgba(0,0,0,0.3);
+            overflow: hidden;
+        }}
+        .header {{
+            background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
+            color: white;
+            padding: 40px;
+            text-align: center;
+        }}
+        .header h1 {{ font-size: 32px; margin-bottom: 10px; }}
+        .header p {{ opacity: 0.8; }}
+
+        .step {{
+            padding: 40px;
+            border-bottom: 1px solid #eee;
+        }}
+        .step:last-child {{ border-bottom: none; }}
+
+        .step-header {{
+            display: flex;
+            align-items: center;
+            margin-bottom: 24px;
+        }}
+        .step-number {{
+            width: 48px;
+            height: 48px;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            border-radius: 50%;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            font-weight: bold;
+            font-size: 18px;
+            margin-right: 16px;
+        }}
+        .step-title {{
+            font-size: 22px;
+            color: #1a1a2e;
+        }}
+        .step-question {{
+            color: #666;
+            font-size: 14px;
+        }}
+
+        .answer-box {{
+            background: #f8f9fa;
+            border-left: 4px solid #27ae60;
+            padding: 20px;
+            margin: 20px 0;
+            border-radius: 0 8px 8px 0;
+        }}
+        .answer-box.warning {{ border-left-color: #f39c12; }}
+        .answer-box.danger {{ border-left-color: #e74c3c; }}
+
+        .answer-title {{
+            font-weight: bold;
+            color: #333;
+            margin-bottom: 8px;
+        }}
+        .answer-value {{
+            font-size: 28px;
+            font-weight: bold;
+            color: #27ae60;
+        }}
+        .answer-box.warning .answer-value {{ color: #f39c12; }}
+        .answer-box.danger .answer-value {{ color: #e74c3c; }}
+
+        .metric-grid {{
+            display: grid;
+            grid-template-columns: repeat(3, 1fr);
+            gap: 16px;
+            margin: 20px 0;
+        }}
+        .metric-card {{
+            background: #f8f9fa;
+            padding: 20px;
+            border-radius: 8px;
+            text-align: center;
+        }}
+        .metric-card.good {{ background: #d4edda; }}
+        .metric-card.bad {{ background: #f8d7da; }}
+        .metric-label {{ font-size: 14px; color: #666; margin-bottom: 8px; }}
+        .metric-value {{ font-size: 24px; font-weight: bold; }}
+        .metric-card.good .metric-value {{ color: #155724; }}
+        .metric-card.bad .metric-value {{ color: #721c24; }}
+
+        .chart-container {{
+            height: 300px;
+            margin: 20px 0;
+        }}
+
+        table {{
+            width: 100%;
+            border-collapse: collapse;
+            margin: 20px 0;
+        }}
+        th, td {{
+            padding: 12px;
+            text-align: center;
+            border-bottom: 1px solid #eee;
+        }}
+        th {{ background: #f8f9fa; font-weight: 600; }}
+        tr.problem {{ background: #fff5f5; }}
+        .badge {{
+            display: inline-block;
+            padding: 4px 12px;
+            border-radius: 20px;
+            font-size: 12px;
+            font-weight: bold;
+        }}
+        .badge-danger {{ background: #f8d7da; color: #721c24; }}
+        .badge-warning {{ background: #fff3cd; color: #856404; }}
+        .badge-success {{ background: #d4edda; color: #155724; }}
+
+        .conclusion {{
+            background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
+            color: white;
+            padding: 40px;
+        }}
+        .conclusion h2 {{ margin-bottom: 20px; }}
+        .conclusion ul {{ margin-left: 20px; }}
+        .conclusion li {{ margin-bottom: 12px; line-height: 1.8; }}
+        .highlight {{ background: rgba(255,255,255,0.2); padding: 2px 8px; border-radius: 4px; }}
+
+        .timeline {{
+            position: relative;
+            padding-left: 30px;
+        }}
+        .timeline::before {{
+            content: '';
+            position: absolute;
+            left: 10px;
+            top: 0;
+            bottom: 0;
+            width: 2px;
+            background: #ddd;
+        }}
+        .timeline-item {{
+            position: relative;
+            margin-bottom: 20px;
+        }}
+        .timeline-item::before {{
+            content: '';
+            position: absolute;
+            left: -24px;
+            top: 6px;
+            width: 10px;
+            height: 10px;
+            border-radius: 50%;
+            background: #667eea;
+        }}
+        .timeline-item.bad::before {{ background: #e74c3c; }}
+        .timeline-item.good::before {{ background: #27ae60; }}
+    </style>
+</head>
+<body>
+    <div class="report">
+        <!-- 报告头部 -->
+        <div class="header">
+            <h1>📊 低VoV高曝光问题分析报告</h1>
+            <p>数据范围: {dates[0]} ~ {dates[-1]} | 共 {len(dates)} 天</p>
+        </div>
+
+        <!-- Step 0: 问题背景 -->
+        <div class="step">
+            <div class="step-header">
+                <div class="step-number">0</div>
+                <div>
+                    <div class="step-title">问题背景</div>
+                    <div class="step-question">什么是低VoV高曝光问题?</div>
+                </div>
+            </div>
+            <p style="line-height:1.8;color:#555;">
+                <strong>排序公式</strong>:score = STR × ROS × VOR = VoV<br><br>
+                • <strong>STR</strong> (分享率): FM模型预估,用户看到视频后分享的概率<br>
+                • <strong>ROS</strong> (回流/分享比): XGB模型预估,每次分享能带来多少回流用户<br>
+                • <strong>VOR</strong> (病毒因子): 24h统计量,每个回流用户能带来多少曝光<br>
+                • <strong>VoV</strong> (病毒系数): 最终的传播效率 = STR × ROS × VOR<br><br>
+                <strong style="color:#e74c3c;">问题现象</strong>:某些视频真实VoV很低,但却获得了很高的曝光排名
+            </p>
+        </div>
+
+        <!-- Step 1: 验证现象 -->
+        <div class="step">
+            <div class="step-header">
+                <div class="step-number">1</div>
+                <div>
+                    <div class="step-title">验证现象</div>
+                    <div class="step-question">低VoV高曝光问题存在吗?</div>
+                </div>
+            </div>
+            <div class="answer-box danger">
+                <div class="answer-title">结论:问题确实存在 ✓</div>
+                <div class="answer-value">43.5% 的 Top10 记录存在此问题</div>
+            </div>
+            <p style="color:#666;margin-top:12px;">
+                定义:VoV < 0.35 但曝光排名 ≤ 5 的视频<br>
+                验证方法:统计每日 Top10 曝光视频中符合条件的比例
+            </p>
+        </div>
+
+        <!-- Step 2: 影响面 -->
+        <div class="step">
+            <div class="step-header">
+                <div class="step-number">2</div>
+                <div>
+                    <div class="step-title">量化影响</div>
+                    <div class="step-question">问题影响有多大?</div>
+                </div>
+            </div>
+            <div class="metric-grid">
+                <div class="metric-card bad">
+                    <div class="metric-label">问题记录占比</div>
+                    <div class="metric-value">38.1%</div>
+                </div>
+                <div class="metric-card bad">
+                    <div class="metric-label">问题视频数</div>
+                    <div class="metric-value">21 个</div>
+                </div>
+                <div class="metric-card bad">
+                    <div class="metric-label">影响曝光量</div>
+                    <div class="metric-value">35.7%</div>
+                </div>
+            </div>
+            <p style="color:#666;">
+                即:超过1/3的头部曝光被低效率视频占据
+            </p>
+        </div>
+
+        <!-- Step 3: 原因分析 -->
+        <div class="step">
+            <div class="step-header">
+                <div class="step-number">3</div>
+                <div>
+                    <div class="step-title">原因分析</div>
+                    <div class="step-question">STR、ROS、VOR 哪个出了问题?</div>
+                </div>
+            </div>
+            <table>
+                <thead>
+                    <tr>
+                        <th>指标</th>
+                        <th>预估偏差</th>
+                        <th>偏高比例</th>
+                        <th>诊断</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr>
+                        <td>STR (分享率)</td>
+                        <td>-5.7%</td>
+                        <td>36.4%</td>
+                        <td><span class="badge badge-success">正常</span></td>
+                    </tr>
+                    <tr class="problem">
+                        <td><strong>ROS (回流比)</strong></td>
+                        <td><strong>+72.2%</strong></td>
+                        <td><strong>70.1%</strong></td>
+                        <td><span class="badge badge-danger">系统性偏高 ⚠️</span></td>
+                    </tr>
+                    <tr>
+                        <td>VOR (病毒因子)</td>
+                        <td>混合</td>
+                        <td>44.8%</td>
+                        <td><span class="badge badge-warning">有异常值</span></td>
+                    </tr>
+                </tbody>
+            </table>
+            <div class="answer-box danger">
+                <div class="answer-title">核心问题找到了!</div>
+                <div class="answer-value">ROS 预估系统性偏高 72.2%</div>
+            </div>
+        </div>
+
+        <!-- Step 4: 深入分析 ROS -->
+        <div class="step">
+            <div class="step-header">
+                <div class="step-number">4</div>
+                <div>
+                    <div class="step-title">ROS 偏差深入分析</div>
+                    <div class="step-question">ROS 为什么会偏高?</div>
+                </div>
+            </div>
+            <p style="margin-bottom:20px;color:#555;">发现 ROS 偏差与真实 ROS 呈强负相关 (r = -0.607):</p>
+            <table>
+                <thead>
+                    <tr>
+                        <th>真实 ROS</th>
+                        <th>预估偏差</th>
+                        <th>说明</th>
+                    </tr>
+                </thead>
+                <tbody>
+                    <tr class="problem">
+                        <td>< 2</td>
+                        <td><strong>+152%</strong></td>
+                        <td><span class="badge badge-danger">严重偏高</span></td>
+                    </tr>
+                    <tr>
+                        <td>2-4</td>
+                        <td>+20%</td>
+                        <td>轻微偏高</td>
+                    </tr>
+                    <tr>
+                        <td>4-6</td>
+                        <td>-27%</td>
+                        <td>偏低</td>
+                    </tr>
+                    <tr>
+                        <td>> 6</td>
+                        <td>-54%</td>
+                        <td>严重偏低</td>
+                    </tr>
+                </tbody>
+            </table>
+            <div class="answer-box warning">
+                <div class="answer-title">根本原因</div>
+                <div class="answer-value">XGB 模型存在「回归均值」问题</div>
+            </div>
+            <p style="color:#666;margin-top:12px;">
+                模型倾向于把极端值往中间拉:低的预估偏高,高的预估偏低
+            </p>
+        </div>
+
+        <!-- Step 5: 时间趋势 -->
+        <div class="step">
+            <div class="step-header">
+                <div class="step-number">5</div>
+                <div>
+                    <div class="step-title">时间趋势分析</div>
+                    <div class="step-question">问题是什么时候开始的?</div>
+                </div>
+            </div>
+            <div class="chart-container">
+                <canvas id="problemChart"></canvas>
+            </div>
+            <div class="answer-box warning">
+                <div class="answer-title">结论</div>
+                <div class="answer-value">问题从一开始就存在,不是某个时间点突然出现</div>
+            </div>
+        </div>
+
+        <!-- Step 6: 月度对比 -->
+        <div class="step">
+            <div class="step-header">
+                <div class="step-number">6</div>
+                <div>
+                    <div class="step-title">月度对比分析</div>
+                    <div class="step-question">哪些月份问题严重?为什么?</div>
+                </div>
+            </div>
+            <table id="monthlyTable">
+                <thead>
+                    <tr>
+                        <th>月份</th>
+                        <th>VoV</th>
+                        <th>STR</th>
+                        <th>VOR</th>
+                        <th>问题比例</th>
+                        <th>诊断</th>
+                    </tr>
+                </thead>
+                <tbody></tbody>
+            </table>
+            <div class="timeline" style="margin-top:30px;">
+                <div class="timeline-item">
+                    <strong>2025年7月</strong>:问题比例 22%(数据起点)
+                </div>
+                <div class="timeline-item bad">
+                    <strong>2025年8-10月</strong>:问题恶化到 28-31%
+                    <span class="badge badge-danger">主因: VOR↓9%</span>
+                </div>
+                <div class="timeline-item">
+                    <strong>2025年11月</strong>:开始改善 27%
+                </div>
+                <div class="timeline-item good">
+                    <strong>2025年12月</strong>:大幅改善到 10%
+                    <span class="badge badge-success">原因: STR↑34%</span>
+                </div>
+                <div class="timeline-item bad">
+                    <strong>2026年1月</strong>:回升到 18%
+                    <span class="badge badge-warning">原因: STR↓10%</span>
+                </div>
+            </div>
+        </div>
+
+        <!-- Step 7: VoV 分解趋势 -->
+        <div class="step">
+            <div class="step-header">
+                <div class="step-number">7</div>
+                <div>
+                    <div class="step-title">VoV 分解趋势</div>
+                    <div class="step-question">STR、VOR 的变化趋势如何?</div>
+                </div>
+            </div>
+            <div class="chart-container">
+                <canvas id="decomposeChart"></canvas>
+            </div>
+            <p style="color:#666;margin-top:12px;">
+                可以看到:12月 STR 明显上升(紫色线),问题比例随之下降
+            </p>
+        </div>
+
+        <!-- 最终结论 -->
+        <div class="conclusion">
+            <h2>📋 最终结论与建议</h2>
+            <ul>
+                <li><strong>问题确认</strong>:低VoV高曝光问题 <span class="highlight">确实存在</span>,影响 35.7% 的头部曝光</li>
+                <li><strong>根本原因</strong>:<span class="highlight">ROS 预估偏高 72%</span>,XGB 模型存在「回归均值」问题</li>
+                <li><strong>时间规律</strong>:问题一直存在,8-10月最严重(VOR↓),12月改善(STR↑)</li>
+                <li><strong>问题视频特征</strong>:<span class="highlight">低 STR</span> 的视频更容易出问题</li>
+            </ul>
+            <h3 style="margin-top:30px;margin-bottom:16px;">💡 建议</h3>
+            <ul>
+                <li><strong>短期</strong>:对真实 ROS < 2 的视频增加预估衰减系数</li>
+                <li><strong>中期</strong>:改进 XGB 模型,解决「回归均值」问题</li>
+                <li><strong>长期</strong>:考虑端到端 VoV 预估,避免三模型误差累积</li>
+            </ul>
+        </div>
+    </div>
+
+    <script>
+        const dates = {json.dumps(dates)};
+        const formattedDates = dates.map(d => d.slice(4,6) + '/' + d.slice(6,8));
+        const problemData = {json.dumps(problem_data)};
+        const strData = {json.dumps(str_data)};
+        const vorData = {json.dumps(vor_data)};
+
+        // 问题比例趋势图
+        const problemColors = problemData.map(v => v > 25 ? '#e74c3c' : v > 15 ? '#f39c12' : '#27ae60');
+        new Chart(document.getElementById('problemChart'), {{
+            type: 'bar',
+            data: {{
+                labels: formattedDates,
+                datasets: [{{
+                    label: '问题比例 (%)',
+                    data: problemData,
+                    backgroundColor: problemColors,
+                    borderRadius: 2
+                }}]
+            }},
+            options: {{
+                responsive: true,
+                maintainAspectRatio: false,
+                plugins: {{
+                    legend: {{ display: false }},
+                    annotation: {{
+                        annotations: {{
+                            line1: {{
+                                type: 'line',
+                                yMin: 20, yMax: 20,
+                                borderColor: '#e74c3c',
+                                borderWidth: 2,
+                                borderDash: [6, 6],
+                                label: {{ display: true, content: '严重线 20%', position: 'end', backgroundColor: '#e74c3c' }}
+                            }}
+                        }}
+                    }}
+                }},
+                scales: {{
+                    y: {{ beginAtZero: true, max: 50, title: {{ display: true, text: '问题比例 (%)' }} }}
+                }}
+            }}
+        }});
+
+        // STR/VOR 分解图
+        new Chart(document.getElementById('decomposeChart'), {{
+            type: 'line',
+            data: {{
+                labels: formattedDates,
+                datasets: [{{
+                    label: 'STR (×100)',
+                    data: strData,
+                    borderColor: '#9b59b6',
+                    tension: 0.3,
+                    yAxisID: 'y'
+                }}, {{
+                    label: 'VOR',
+                    data: vorData,
+                    borderColor: '#f39c12',
+                    tension: 0.3,
+                    yAxisID: 'y1'
+                }}]
+            }},
+            options: {{
+                responsive: true,
+                maintainAspectRatio: false,
+                interaction: {{ mode: 'index', intersect: false }},
+                plugins: {{ legend: {{ display: true, position: 'top' }} }},
+                scales: {{
+                    y: {{ type: 'linear', position: 'left', title: {{ display: true, text: 'STR (×100)' }} }},
+                    y1: {{ type: 'linear', position: 'right', title: {{ display: true, text: 'VOR' }}, grid: {{ drawOnChartArea: false }} }}
+                }}
+            }}
+        }});
+
+        // 月度表格
+        const monthlyData = {json.dumps(monthly_data)};
+        const tbody = document.querySelector('#monthlyTable tbody');
+        monthlyData.forEach(row => {{
+            const tr = document.createElement('tr');
+            if (row.problem > 25) tr.classList.add('problem');
+
+            let diagnosis = '';
+            if (row.problem <= 15) diagnosis = '<span class="badge badge-success">✓ 良好</span>';
+            else if (row.vor < 6.2) diagnosis = '<span class="badge badge-danger">VOR↓</span>';
+            else if (row.str < 0.045) diagnosis = '<span class="badge badge-danger">STR↓</span>';
+            else diagnosis = '<span class="badge badge-warning">待分析</span>';
+
+            tr.innerHTML = `
+                <td>${{row.month}}</td>
+                <td>${{row.vov.toFixed(3)}}</td>
+                <td>${{row.str.toFixed(4)}}</td>
+                <td>${{row.vor.toFixed(2)}}</td>
+                <td>${{row.problem.toFixed(1)}}%</td>
+                <td>${{diagnosis}}</td>
+            `;
+            tbody.appendChild(tr);
+        }});
+    </script>
+</body>
+</html>
+'''
+
+# Write the report next to the other step outputs
+html_path = output_dir / "分析报告.html"
+with open(html_path, 'w', encoding='utf-8') as f:
+    f.write(html_content)
+
+print(f"分析报告已生成: {html_path}")

+ 100 - 0
tmp/低vov高曝光分析/step1_分析.py

@@ -0,0 +1,100 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+Step1: verify whether the "low VoV, high exposure" phenomenon actually exists.
+"""
+import pandas as pd
+from pathlib import Path
+
+# Load the extracted sample: one row per (item, day)
+data_file = Path(__file__).parent / "output" / "step1_验证现象_20260111_20260113.csv"
+df = pd.read_csv(data_file)
+
+print("=" * 70)
+print("Step1: 验证低vov高曝光现象是否存在")
+print("=" * 70)
+
+# 1. Data overview
+print("\n【1. 数据概览】")
+print(f"样本数: {len(df)} 条(item x 天)")
+print(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}")
+print(f"曝光rank范围: {df['曝光rank'].min()} ~ {df['曝光rank'].max()}")
+
+# 2. Overall distribution of vov0
+print("\n【2. vov0 整体分布】")
+print(f"均值: {df['vov0'].mean():.4f}")
+print(f"中位数: {df['vov0'].median():.4f}")
+print(f"标准差: {df['vov0'].std():.4f}")
+print(f"最小值: {df['vov0'].min():.4f}")
+print(f"最大值: {df['vov0'].max():.4f}")
+
+# 3. vov0 statistics per exposure-rank bucket (Top1-5 / 6-10 / 11-20 / 21-50)
+print("\n【3. 按曝光rank分组看vov0】")
+df['rank_group'] = pd.cut(df['曝光rank'], bins=[0, 5, 10, 20, 50],
+                          labels=['Top1-5', 'Top6-10', 'Top11-20', 'Top21-50'])
+group_stats = df.groupby('rank_group', observed=True).agg({
+    'vov0': ['mean', 'median', 'min', 'count'],
+    'exp': 'mean'
+}).round(4)
+group_stats.columns = ['vov0均值', 'vov0中位数', 'vov0最小', '样本数', '平均曝光']
+print(group_stats.to_string())
+
+# 4. Identify "problem" items: exposure rank <= 10 yet vov0 below the overall median
+median_vov = df['vov0'].median()
+print(f"\n【4. 识别低vov高曝光item】")
+print(f"定义: 曝光rank<=10 且 vov0<中位数({median_vov:.4f})")
+
+problem_items = df[(df['曝光rank'] <= 10) & (df['vov0'] < median_vov)]
+top10 = df[df['曝光rank'] <= 10]
+print(f"符合条件的记录数: {len(problem_items)}")
+print(f"占Top10总记录数: {len(problem_items)}/{len(top10)} = {len(problem_items)/len(top10)*100:.1f}%")
+
+if len(problem_items) > 0:
+    print("\n问题item详情:")
+    print(problem_items[['dt', 'vid', '曝光rank', 'exp', 'vov0', '标题']].to_string(index=False))
+
+# 5. Conclusion: the phenomenon exists iff any qualifying record was found
+print("\n" + "=" * 70)
+print("【Step1 结论】")
+print("=" * 70)
+if len(problem_items) > 0:
+    print(f"现象存在: {len(problem_items)}/{len(top10)} ({len(problem_items)/len(top10)*100:.1f}%) 的Top10记录存在低vov高曝光问题")
+    print(f"  - 这些item的vov0低于中位数{median_vov:.4f},但曝光rank进入Top10")
+else:
+    print("现象不存在")
+
+# Persist the problem items for the follow-up steps
+output_file = Path(__file__).parent / "output" / "step1_问题item.csv"
+problem_items.to_csv(output_file, index=False)
+print(f"\n问题item已保存到: {output_file}")
+
+# Persist a plain-text summary of this step's conclusion
+conclusion_file = Path(__file__).parent / "output" / "step1_结论.txt"
+with open(conclusion_file, 'w', encoding='utf-8') as f:
+    f.write("=" * 70 + "\n")
+    f.write("Step1: 验证低vov高曝光现象是否存在\n")
+    f.write("=" * 70 + "\n\n")
+
+    f.write("【数据概览】\n")
+    f.write(f"样本数: {len(df)} 条(item x 天)\n")
+    f.write(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}\n")
+    f.write(f"曝光rank范围: {df['曝光rank'].min()} ~ {df['曝光rank'].max()}\n\n")
+
+    f.write("【vov0 整体分布】\n")
+    f.write(f"均值: {df['vov0'].mean():.4f}\n")
+    f.write(f"中位数: {df['vov0'].median():.4f}\n")
+    f.write(f"标准差: {df['vov0'].std():.4f}\n")
+    f.write(f"最小值: {df['vov0'].min():.4f}\n")
+    f.write(f"最大值: {df['vov0'].max():.4f}\n\n")
+
+    f.write("【按曝光rank分组看vov0】\n")
+    f.write(group_stats.to_string() + "\n\n")
+
+    f.write("【结论】\n")
+    if len(problem_items) > 0:
+        f.write(f"现象存在: {len(problem_items)}/{len(top10)} ({len(problem_items)/len(top10)*100:.1f}%) 的Top10记录存在低vov高曝光问题\n")
+        f.write(f"定义: 曝光rank<=10 且 vov0<中位数({median_vov:.4f})\n")
+    else:
+        f.write("现象不存在\n")
+
+print(f"分析结论已保存到: {conclusion_file}")

+ 115 - 0
tmp/低vov高曝光分析/step2_分析.py

@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+Step2: quantify the blast radius — how many videos are affected and what share
+of Top10 records/exposure they account for.
+"""
+import pandas as pd
+from pathlib import Path
+
+# Load the wider date-range sample (one row per item per day)
+data_file = Path(__file__).parent / "output" / "step2_影响面_20260101_20260113.csv"
+df = pd.read_csv(data_file)
+
+print("=" * 70)
+print("Step2: 量化影响面")
+print("=" * 70)
+
+# 1. Data overview
+print("\n【1. 数据概览】")
+print(f"样本数: {len(df)} 条(item x 天)")
+print(f"唯一视频数: {df['vid'].nunique()}")
+print(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}")
+print(f"天数: {df['dt'].nunique()}")
+
+# 2. Problem definition — same rule as Step1:
+#    the overall vov0 median is used as the "low VoV" threshold
+median_vov = df['vov0'].median()
+print(f"\n【2. 问题定义】")
+print(f"vov0中位数: {median_vov:.4f}")
+print(f"问题定义: 曝光rank<=10 且 vov0<{median_vov:.4f}")
+
+# 3. Count problem records among the Top10-exposure rows
+top10 = df[df['曝光rank'] <= 10]
+problem_items = top10[top10['vov0'] < median_vov]
+
+print(f"\n【3. 问题item统计】")
+print(f"Top10总记录数: {len(top10)}")
+print(f"问题记录数: {len(problem_items)}")
+print(f"问题占比: {len(problem_items)/len(top10)*100:.1f}%")
+
+# 4. Per-day problem ratio (guards against division by zero on empty days)
+print(f"\n【4. 按天统计问题占比】")
+daily_stats = []
+for dt in sorted(df['dt'].unique()):
+    day_top10 = df[(df['dt'] == dt) & (df['曝光rank'] <= 10)]
+    day_problem = day_top10[day_top10['vov0'] < median_vov]
+    daily_stats.append({
+        'dt': dt,
+        'top10_count': len(day_top10),
+        'problem_count': len(day_problem),
+        'problem_ratio': len(day_problem) / len(day_top10) * 100 if len(day_top10) > 0 else 0
+    })
+daily_df = pd.DataFrame(daily_stats)
+print(daily_df.to_string(index=False))
+
+# 5. Deduplicated list of problem videos, aggregated over the period
+#    ('dt': 'count' = number of days the video appeared as a problem)
+print(f"\n【5. 问题视频列表(去重)】")
+problem_vids = problem_items.groupby('vid').agg({
+    'dt': 'count',
+    'vov0': 'mean',
+    '曝光rank': 'mean',
+    'exp': 'sum',
+    '标题': 'first'
+}).reset_index()
+problem_vids.columns = ['vid', '出现天数', '平均vov0', '平均rank', '总曝光', '标题']
+problem_vids = problem_vids.sort_values('出现天数', ascending=False)
+print(f"问题视频数: {len(problem_vids)}")
+print(problem_vids.to_string(index=False))
+
+# 6. Exposure volume captured by problem items vs. all Top10 exposure
+print(f"\n【6. 影响面量化】")
+total_exp_top10 = top10['exp'].sum()
+problem_exp = problem_items['exp'].sum()
+print(f"Top10总曝光: {total_exp_top10:,.0f}")
+print(f"问题item曝光: {problem_exp:,.0f}")
+print(f"问题曝光占比: {problem_exp/total_exp_top10*100:.1f}%")
+
+# Print the step conclusion
+print("\n" + "=" * 70)
+print("【Step2 结论】")
+print("=" * 70)
+print(f"1. 问题占比: {len(problem_items)}/{len(top10)} ({len(problem_items)/len(top10)*100:.1f}%) 的Top10记录存在低vov高曝光")
+print(f"2. 问题视频: {len(problem_vids)} 个视频至少出现过1次问题")
+print(f"3. 影响曝光: {problem_exp:,.0f} ({problem_exp/total_exp_top10*100:.1f}% 的Top10曝光)")
+
+# Persist all artifacts under output/
+output_dir = Path(__file__).parent / "output"
+
+# Deduplicated problem-video list
+problem_vids.to_csv(output_dir / "step2_问题视频列表.csv", index=False)
+
+# Per-day statistics
+daily_df.to_csv(output_dir / "step2_每日统计.csv", index=False)
+
+# Plain-text summary of this step
+with open(output_dir / "step2_结论.txt", 'w', encoding='utf-8') as f:
+    f.write("=" * 70 + "\n")
+    f.write("Step2: 量化影响面\n")
+    f.write("=" * 70 + "\n\n")
+
+    f.write("【数据范围】\n")
+    f.write(f"日期: {df['dt'].min()} ~ {df['dt'].max()} ({df['dt'].nunique()}天)\n")
+    f.write(f"样本数: {len(df)} 条\n\n")
+
+    f.write("【问题定义】\n")
+    f.write(f"曝光rank<=10 且 vov0<{median_vov:.4f}(中位数)\n\n")
+
+    f.write("【影响面统计】\n")
+    f.write(f"问题记录占比: {len(problem_items)}/{len(top10)} = {len(problem_items)/len(top10)*100:.1f}%\n")
+    f.write(f"问题视频数: {len(problem_vids)} 个\n")
+    f.write(f"影响曝光量: {problem_exp:,.0f} ({problem_exp/total_exp_top10*100:.1f}%)\n\n")
+
+    f.write("【每日问题占比】\n")
+    f.write(daily_df.to_string(index=False))
+
+print(f"\n结果已保存到 output/ 目录")

+ 122 - 0
tmp/低vov高曝光分析/step3_分析.py

@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+Step3: root-cause analysis — compare model predictions (str/ros/vor) against
+realized values and locate the dominant source of bias.
+"""
+import pandas as pd
+from pathlib import Path
+import glob
+
+# Load and concatenate all per-day CSV extracts for this step
+data_dir = Path(__file__).parent / "output" / "step3_原因分析"
+all_files = glob.glob(str(data_dir / "*.csv"))
+df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
+
+print("=" * 70)
+print("Step3: 分析原因")
+print("=" * 70)
+
+# 1. Data overview
+print("\n【1. 数据概览】")
+print(f"样本数: {len(df)} 条")
+print(f"视频数: {df['vid'].nunique()}")
+print(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}")
+
+# 2. Absolute bias per component (predicted minus realized)
+df['str_bias'] = df['pred_str'] - df['real_str']
+df['ros_bias'] = df['pred_ros'] - df['real_ros']
+df['vor_bias'] = df['pred_vor'] - df['real_vor']
+
+# Relative bias in percent; rows with a zero/NaN denominator get None so they
+# are excluded from the means rather than producing inf.
+# NOTE(review): these three row-wise apply() calls could be vectorized with
+# numpy.where / pandas mask for speed — left as-is to keep behavior identical.
+df['str_bias_pct'] = df.apply(lambda x: (x['pred_str'] - x['real_str']) / x['real_str'] * 100 if x['real_str'] > 0 else None, axis=1)
+df['ros_bias_pct'] = df.apply(lambda x: (x['pred_ros'] - x['real_ros']) / x['real_ros'] * 100 if pd.notna(x['real_ros']) and x['real_ros'] > 0 else None, axis=1)
+df['vor_bias_pct'] = df.apply(lambda x: (x['pred_vor'] - x['real_vor']) / x['real_vor'] * 100 if pd.notna(x['real_vor']) and x['real_vor'] > 0 else None, axis=1)
+
+# 3. Overall bias summary
+print("\n【2. 各分项偏差统计】")
+print(f"str 偏差: 平均 {df['str_bias'].mean():+.6f} ({df['str_bias_pct'].mean():+.1f}%)")
+print(f"ros 偏差: 平均 {df['ros_bias'].mean():+.4f} ({df['ros_bias_pct'].mean():+.1f}%)")
+print(f"vor 偏差: 平均 {df['vor_bias'].mean():+.4f} ({df['vor_bias_pct'].mean():+.1f}%)")
+
+# 4. Per-video aggregation of predictions, realized values and bias
+print("\n【3. 按视频汇总偏差】")
+vid_stats = df.groupby('vid').agg({
+    'sample_cnt': 'sum',
+    'pred_str': 'mean',
+    'real_str': 'mean',
+    'str_bias_pct': 'mean',
+    'pred_ros': 'mean',
+    'real_ros': 'mean',
+    'ros_bias_pct': 'mean',
+    'pred_vor': 'mean',
+    'real_vor': 'mean',
+    'vor_bias_pct': 'mean',
+    'real_vov': 'mean'
+}).round(4)
+vid_stats.columns = ['样本数', '预估str', '真实str', 'str偏差%', '预估ros', '真实ros', 'ros偏差%', '预估vor', '真实vor', 'vor偏差%', '真实vov']
+vid_stats = vid_stats.reset_index()
+print(vid_stats.to_string(index=False))
+
+# 5. Rank the three components by mean |relative bias| to find the main driver
+print("\n【4. 主要偏差来源分析】")
+avg_str_bias = abs(df['str_bias_pct'].mean())
+avg_ros_bias = abs(df['ros_bias_pct'].mean())
+avg_vor_bias = abs(df['vor_bias_pct'].mean())
+
+biases = [('str', avg_str_bias), ('ros', avg_ros_bias), ('vor', avg_vor_bias)]
+biases.sort(key=lambda x: x[1], reverse=True)
+
+print(f"偏差排序: {biases[0][0]}({biases[0][1]:.1f}%) > {biases[1][0]}({biases[1][1]:.1f}%) > {biases[2][0]}({biases[2][1]:.1f}%)")
+print(f"\n主要偏差来源: {biases[0][0].upper()}")
+
+# 6. Print a canned hypothesis list for whichever component dominates
+main_bias = biases[0][0]
+if main_bias == 'ros':
+    print("\n【5. ROS 偏差详细分析】")
+    print("ros 预估偏高的可能原因:")
+    print("  1. XGB 模型使用历史 ros 作为特征,但头部视频的 ros 会随时间衰减")
+    print("  2. 模型训练样本偏向高分享场景,导致对低分享场景预估偏高")
+    print("  3. 头部视频的分享用户结构变化(早期核心用户 → 后期普通用户)")
+elif main_bias == 'str':
+    print("\n【5. STR 偏差详细分析】")
+    print("str 预估偏差的可能原因:")
+    print("  1. FM 模型采样率校正有偏差")
+    print("  2. 头部视频的分享行为与训练样本分布不同")
+elif main_bias == 'vor':
+    print("\n【5. VOR 偏差详细分析】")
+    print("vor 预估偏差的可能原因:")
+    print("  1. vor 使用 24h 统计量,存在滞后性")
+    print("  2. 头部视频的裂变效率在快速变化")
+
+# Print the step conclusion.
+# NOTE(review): biases[*][1] are absolute magnitudes (abs() above), so the
+# ':+.1f' format always renders '+' here and the bias direction is lost —
+# consider printing the signed means instead. TODO confirm intent.
+print("\n" + "=" * 70)
+print("【Step3 结论】")
+print("=" * 70)
+print(f"主要偏差来源: {biases[0][0].upper()} (平均偏差 {biases[0][1]:+.1f}%)")
+print(f"次要偏差来源: {biases[1][0].upper()} (平均偏差 {biases[1][1]:+.1f}%)")
+
+# Persist all artifacts under output/
+output_dir = Path(__file__).parent / "output"
+
+# Merged per-row data with the derived bias columns
+df.to_csv(output_dir / "step3_预估vs真实_合并.csv", index=False)
+
+# Per-video aggregation
+vid_stats.to_csv(output_dir / "step3_按视频汇总.csv", index=False)
+
+# Plain-text summary of this step
+with open(output_dir / "step3_结论.txt", 'w', encoding='utf-8') as f:
+    f.write("=" * 70 + "\n")
+    f.write("Step3: 分析原因\n")
+    f.write("=" * 70 + "\n\n")
+
+    f.write("【各分项偏差统计】\n")
+    f.write(f"str 偏差: 平均 {df['str_bias'].mean():+.6f} ({df['str_bias_pct'].mean():+.1f}%)\n")
+    f.write(f"ros 偏差: 平均 {df['ros_bias'].mean():+.4f} ({df['ros_bias_pct'].mean():+.1f}%)\n")
+    f.write(f"vor 偏差: 平均 {df['vor_bias'].mean():+.4f} ({df['vor_bias_pct'].mean():+.1f}%)\n\n")
+
+    f.write("【主要偏差来源】\n")
+    f.write(f"排序: {biases[0][0]}({biases[0][1]:.1f}%) > {biases[1][0]}({biases[1][1]:.1f}%) > {biases[2][0]}({biases[2][1]:.1f}%)\n")
+    f.write(f"结论: {biases[0][0].upper()} 是主要偏差来源\n")
+
+print(f"\n结果已保存到 output/ 目录")

+ 91 - 0
tmp/低vov高曝光分析/step3b_分析.py

@@ -0,0 +1,91 @@
#!/usr/bin/env python
# coding=utf-8
"""
Step3b: prediction-bias analysis across all head videos.

Reads the per-day CSV exports under output/step3b_整体偏差, merges them,
computes the predicted-vs-real bias of STR / ROS / VOR, slices the bias
by exposure volume, and writes a merged CSV plus a short text summary
back to output/.
"""
import pandas as pd
from pathlib import Path
import glob

# Read and merge every exported CSV shard into one frame.
data_dir = Path(__file__).parent / "output" / "step3b_整体偏差"
all_files = glob.glob(str(data_dir / "*.csv"))
df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)

print("=" * 70)
print("Step3b: 整体预估偏差分析")
print("=" * 70)

# 1. Data overview
print("\n【1. 数据概览】")
print(f"样本数: {len(df)} 条")
print(f"视频数: {df['vid'].nunique()}")
print(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}")

# 2. Keep only valid rows (real_ros present and strictly positive).
df_valid = df[df['real_ros'].notna() & (df['real_ros'] > 0)].copy()
print(f"有效样本: {len(df_valid)} 条(real_ros > 0)")

# 3. Absolute and relative bias per metric.
#    FIX: the filter above only guarantees real_ros > 0; real_str and
#    real_vov may still be 0, which previously produced +/-inf in the
#    relative-bias columns and poisoned every mean below. Zero
#    denominators are masked to NaN so those rows are simply excluded
#    from the aggregate statistics.
for pred_col, real_col, prefix in [
    ('pred_str', 'real_str', 'str'),
    ('pred_ros', 'real_ros', 'ros'),
    ('pred_vor', 'real_vor', 'vor'),
]:
    df_valid[f'{prefix}_bias'] = df_valid[pred_col] - df_valid[real_col]
    denom = df_valid[real_col].where(df_valid[real_col] != 0)  # 0 -> NaN
    df_valid[f'{prefix}_bias_pct'] = (df_valid[pred_col] - df_valid[real_col]) / denom * 100

# 4. Overall bias statistics.
print("\n【2. 整体偏差统计】")
print(f"STR: 预估均值={df_valid['pred_str'].mean():.6f}, 真实均值={df_valid['real_str'].mean():.6f}, 偏差={df_valid['str_bias_pct'].mean():+.1f}%")
print(f"ROS: 预估均值={df_valid['pred_ros'].mean():.4f}, 真实均值={df_valid['real_ros'].mean():.4f}, 偏差={df_valid['ros_bias_pct'].mean():+.1f}%")
print(f"VOR: 预估均值={df_valid['pred_vor'].mean():.4f}, 真实均值={df_valid['real_vor'].mean():.4f}, 偏差={df_valid['vor_bias_pct'].mean():+.1f}%")

# 5. Share of over- vs. under-predicted samples per metric.
print("\n【3. 偏差分布】")
for col, name in [('str_bias_pct', 'STR'), ('ros_bias_pct', 'ROS'), ('vor_bias_pct', 'VOR')]:
    high = (df_valid[col] > 0).sum()
    low = (df_valid[col] < 0).sum()
    print(f"{name}: 偏高{high}个({high/len(df_valid)*100:.1f}%), 偏低{low}个({low/len(df_valid)*100:.1f}%)")

# 6. Bias sliced by exposure quartile (sample_cnt as exposure proxy).
print("\n【4. 按曝光量分组看偏差】")
df_valid['exp_group'] = pd.qcut(df_valid['sample_cnt'], q=4, labels=['低曝光', '中低曝光', '中高曝光', '高曝光'])
group_stats = df_valid.groupby('exp_group', observed=True).agg({
    'str_bias_pct': 'mean',
    'ros_bias_pct': 'mean',
    'vor_bias_pct': 'mean',
    'sample_cnt': ['mean', 'count']
}).round(2)
group_stats.columns = ['STR偏差%', 'ROS偏差%', 'VOR偏差%', '平均样本', '视频数']
print(group_stats.to_string())

# 7. Conclusion: label each metric using a +/-10% tolerance band.
print("\n" + "=" * 70)
print("【Step3b 结论】")
print("=" * 70)

str_bias = df_valid['str_bias_pct'].mean()
ros_bias = df_valid['ros_bias_pct'].mean()
vor_bias = df_valid['vor_bias_pct'].mean()

print(f"整体偏差情况:")
print(f"  STR: {str_bias:+.1f}% {'(偏高)' if str_bias > 10 else '(偏低)' if str_bias < -10 else '(正常)'}")
print(f"  ROS: {ros_bias:+.1f}% {'(偏高)' if ros_bias > 10 else '(偏低)' if ros_bias < -10 else '(正常)'}")
print(f"  VOR: {vor_bias:+.1f}% {'(偏高)' if vor_bias > 10 else '(偏低)' if vor_bias < -10 else '(正常)'}")

# Persist merged data and the conclusion.
output_dir = Path(__file__).parent / "output"
df_valid.to_csv(output_dir / "step3b_整体偏差_合并.csv", index=False)

with open(output_dir / "step3b_结论.txt", 'w', encoding='utf-8') as f:
    f.write("=" * 70 + "\n")
    f.write("Step3b: 整体预估偏差分析\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"样本: {len(df_valid)} 条, {df_valid['vid'].nunique()} 个视频\n\n")
    f.write("【整体偏差】\n")
    f.write(f"STR: {str_bias:+.1f}%\n")
    f.write(f"ROS: {ros_bias:+.1f}%\n")
    f.write(f"VOR: {vor_bias:+.1f}%\n")

print(f"\n结果已保存到 output/ 目录")

+ 93 - 0
tmp/低vov高曝光分析/step3c_分析.py

@@ -0,0 +1,93 @@
#!/usr/bin/env python
# coding=utf-8
"""
Step3c: drill-down into why ROS is over-predicted.

Loads the merged predicted-vs-real table produced by step3b and slices
the ROS prediction bias by sample size, real STR and real ROS, reports
Pearson correlations, then writes a text summary to output/.
"""
import pandas as pd
from pathlib import Path

# Load the merged table written by step3b.
data_file = Path(__file__).parent / "output" / "step3b_整体偏差_合并.csv"
df = pd.read_csv(data_file)

print("=" * 70)
print("Step3c: ROS 偏高原因深入分析")
print("=" * 70)

# 1. ROS bias vs. sample size (sample_cnt acts as an exposure proxy).
print("\n【1. ROS 偏差与样本量(曝光量)的关系】")
df['sample_group'] = pd.cut(df['sample_cnt'], bins=[0, 150, 300, 500, 10000],
                            labels=['<150', '150-300', '300-500', '>500'])
group_ros = df.groupby('sample_group', observed=True).agg({
    'ros_bias_pct': ['mean', 'std'],
    'pred_ros': 'mean',
    'real_ros': 'mean',
    'sample_cnt': 'count'
}).round(2)
group_ros.columns = ['ROS偏差%', '偏差标准差', '预估ROS', '真实ROS', '视频数']
print(group_ros.to_string())

# 2. ROS bias vs. real share rate (is ROS less accurate for low-STR videos?).
print("\n【2. ROS 偏差与真实分享率(STR)的关系】")
df['str_group'] = pd.cut(df['real_str'], bins=[0, 0.005, 0.01, 0.02, 1],
                         labels=['<0.5%', '0.5-1%', '1-2%', '>2%'])
group_str = df.groupby('str_group', observed=True).agg({
    'ros_bias_pct': 'mean',
    'pred_ros': 'mean',
    'real_ros': 'mean',
    'sample_cnt': 'count'
}).round(2)
group_str.columns = ['ROS偏差%', '预估ROS', '真实ROS', '视频数']
print(group_str.to_string())

# 3. ROS bias vs. real ROS level (regression-to-the-mean check).
print("\n【3. ROS 偏差与真实ROS的关系】")
df['real_ros_group'] = pd.cut(df['real_ros'], bins=[0, 2, 4, 6, 100],
                              labels=['<2', '2-4', '4-6', '>6'])
group_real_ros = df.groupby('real_ros_group', observed=True).agg({
    'ros_bias_pct': 'mean',
    'pred_ros': 'mean',
    'real_ros': 'mean',
    'sample_cnt': 'count'
}).round(2)
group_real_ros.columns = ['ROS偏差%', '预估ROS', '真实ROS', '视频数']
print(group_real_ros.to_string())

# 4. Pearson correlations between ROS bias and candidate drivers.
print("\n【4. 相关性分析】")
print(f"ROS偏差 vs 样本量: r = {df['ros_bias_pct'].corr(df['sample_cnt']):.3f}")
print(f"ROS偏差 vs 真实STR: r = {df['ros_bias_pct'].corr(df['real_str']):.3f}")
print(f"ROS偏差 vs 真实ROS: r = {df['ros_bias_pct'].corr(df['real_ros']):.3f}")
print(f"ROS偏差 vs 真实VOV: r = {df['ros_bias_pct'].corr(df['real_vov']):.3f}")

# 5. Conclusion
print("\n" + "=" * 70)
print("【Step3c 结论】")
print("=" * 70)

# Compare low-exposure vs. high-exposure groups directly.
low_sample = df[df['sample_cnt'] < 150]['ros_bias_pct'].mean()
high_sample = df[df['sample_cnt'] > 500]['ros_bias_pct'].mean()
print(f"低曝光组 ROS 偏差: {low_sample:+.1f}%")
print(f"高曝光组 ROS 偏差: {high_sample:+.1f}%")

if low_sample > high_sample:
    print("\n规律: 曝光量越低,ROS 预估偏差越大")
    print("原因推测: 低曝光视频的分享样本少,ROS 真实值波动大,模型难以准确预估")

# Persist the conclusion as a small text report.
# NOTE(review): the written "findings" restate the low-vs-high pattern
# unconditionally, even when the printed branch above did not trigger.
output_dir = Path(__file__).parent / "output"
with open(output_dir / "step3c_结论.txt", 'w', encoding='utf-8') as f:
    f.write("=" * 70 + "\n")
    f.write("Step3c: ROS 偏高原因分析\n")
    f.write("=" * 70 + "\n\n")
    f.write("【发现】\n")
    f.write(f"1. 低曝光组 ROS 偏差: {low_sample:+.1f}%\n")
    f.write(f"2. 高曝光组 ROS 偏差: {high_sample:+.1f}%\n")
    f.write(f"3. 曝光量越低,ROS 预估偏差越大\n\n")
    f.write("【原因推测】\n")
    f.write("1. XGB 模型整体对 ROS 预估偏高(系统性偏差)\n")
    f.write("2. 低曝光视频分享样本少,真实 ROS 波动大\n")
    f.write("3. 模型可能使用了全局平均 ROS 作为先验,导致预估值被拉高\n")

print(f"\n结果已保存到 output/ 目录")

+ 82 - 0
tmp/低vov高曝光分析/step3d_分析.py

@@ -0,0 +1,82 @@
#!/usr/bin/env python
# coding=utf-8
"""
Step3d: relationship between ROS and days-on-recommendation.

Uses the step2 export (which carries the 推荐天数间隔 recommendation-age
field) to check whether ROS and VOV decay the longer a video stays on
the recommendation list, and writes a text conclusion to output/.
"""
import pandas as pd
from pathlib import Path

# Load the step2 impact-scope export (contains the recommendation-age field).
data_file = Path(__file__).parent / "output" / "step2_影响面_20260101_20260113.csv"
df = pd.read_csv(data_file)

print("=" * 70)
print("Step3d: ROS 与推荐天数的关系")
print("=" * 70)

# 1. Data overview
print("\n【1. 数据概览】")
print(f"样本数: {len(df)} 条")
print(f"推荐天数范围: {df['推荐天数间隔'].min()} ~ {df['推荐天数间隔'].max()}")

# 2. ROS grouped by recommendation-age buckets.
print("\n【2. 按推荐天数分组看 ROS】")
df['天数分组'] = pd.cut(df['推荐天数间隔'], bins=[-1, 3, 7, 14, 30, 1000],
                       labels=['0-3天', '4-7天', '8-14天', '15-30天', '>30天'])
group_day = df.groupby('天数分组', observed=True).agg({
    'ros_t0': ['mean', 'std'],
    'vov0': 'mean',
    'exp': 'sum',
    'vid': 'count'
}).round(4)
group_day.columns = ['ROS均值', 'ROS标准差', 'VOV均值', '总曝光', '视频数']
print(group_day.to_string())

# 3. Recommendation-age distribution of problem videos.
print("\n【3. 问题视频的推荐天数分布】")
# Problem video := vov0 below the median AND exposure rank <= 10.
median_vov = df['vov0'].median()
problem_df = df[(df['曝光rank'] <= 10) & (df['vov0'] < median_vov)]
print(f"问题视频数: {len(problem_df)}")

if len(problem_df) > 0:
    problem_day = problem_df.groupby('天数分组', observed=True).agg({
        'ros_t0': 'mean',
        'vov0': 'mean',
        'vid': 'count'
    }).round(4)
    problem_day.columns = ['ROS均值', 'VOV均值', '视频数']
    print(problem_day.to_string())

# 4. New videos (<=7 days) vs. old videos (>30 days).
print("\n【4. 新视频 vs 老视频对比】")
new_videos = df[df['推荐天数间隔'] <= 7]
old_videos = df[df['推荐天数间隔'] > 30]
print(f"新视频(<=7天): ROS={new_videos['ros_t0'].mean():.4f}, VOV={new_videos['vov0'].mean():.4f}, 数量={len(new_videos)}")
print(f"老视频(>30天): ROS={old_videos['ros_t0'].mean():.4f}, VOV={old_videos['vov0'].mean():.4f}, 数量={len(old_videos)}")

# 5. Conclusion
print("\n" + "=" * 70)
print("【Step3d 结论】")
print("=" * 70)
new_ros = new_videos['ros_t0'].mean()
old_ros = old_videos['ros_t0'].mean()
# NOTE(review): the decay percentage divides by new_ros — assumes new_ros > 0
# and that both buckets are non-empty (NaN means otherwise); confirm upstream.
if new_ros > old_ros:
    print(f"新视频 ROS ({new_ros:.4f}) > 老视频 ROS ({old_ros:.4f})")
    print(f"ROS 随推荐天数增加而下降 (衰减 {(1-old_ros/new_ros)*100:.1f}%)")
else:
    print(f"新视频 ROS ({new_ros:.4f}) <= 老视频 ROS ({old_ros:.4f})")
    print("ROS 没有随推荐天数明显衰减")

# Persist the conclusion as a small text report.
output_dir = Path(__file__).parent / "output"
with open(output_dir / "step3d_结论.txt", 'w', encoding='utf-8') as f:
    f.write("=" * 70 + "\n")
    f.write("Step3d: ROS 与推荐天数的关系\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"新视频(<=7天): ROS={new_ros:.4f}\n")
    f.write(f"老视频(>30天): ROS={old_ros:.4f}\n")
    if new_ros > old_ros:
        f.write(f"结论: ROS 随推荐天数增加而下降\n")

print(f"\n结果已保存到 output/ 目录")

+ 80 - 0
tmp/低vov高曝光分析/step3e_copc分析.py

@@ -0,0 +1,80 @@
#!/usr/bin/env python
# coding=utf-8
"""
Step3e: COPC calibration analysis.

Buckets videos by predicted ROS / predicted STR and compares the real
value inside each bucket (calibration curve). COPC = real mean / pred
mean; 1.0 is perfectly calibrated, <1 means over-prediction, >1 means
under-prediction. Writes the bucket tables and a text summary to output/.
"""
import pandas as pd
from pathlib import Path

# Load the merged predicted-vs-real table produced by step3b.
data_file = Path(__file__).parent / "output" / "step3b_整体偏差_合并.csv"
df = pd.read_csv(data_file)

print("=" * 70)
print("Step3e: COPC 校准分析")
print("=" * 70)

# 1. Bucket by predicted ROS and compare the real ROS per bucket.
print("\n【1. ROS 校准曲线(按预估值分桶)】")
df['pred_ros_bucket'] = pd.cut(df['pred_ros'], bins=[0, 2, 3, 4, 5, 6, 100],
                               labels=['0-2', '2-3', '3-4', '4-5', '5-6', '>6'])
copc = df.groupby('pred_ros_bucket', observed=True).agg({
    'pred_ros': 'mean',
    'real_ros': 'mean',
    'vid': 'count'
}).round(4)
copc.columns = ['预估ROS均值', '真实ROS均值', '样本数']
copc['COPC'] = (copc['真实ROS均值'] / copc['预估ROS均值']).round(4)
print(copc.to_string())

# 2. Overall ROS COPC (unweighted means).
overall_copc = df['real_ros'].mean() / df['pred_ros'].mean()
print(f"\n整体 COPC = {overall_copc:.4f}")
print(f"(理想值为 1.0,<1 表示预估偏高,>1 表示预估偏低)")

# 3. Same calibration curve for predicted STR.
print("\n【2. STR 校准曲线(按预估值分桶)】")
df['pred_str_bucket'] = pd.cut(df['pred_str'], bins=[0, 0.005, 0.008, 0.01, 0.015, 1],
                               labels=['0-0.5%', '0.5-0.8%', '0.8-1%', '1-1.5%', '>1.5%'])
copc_str = df.groupby('pred_str_bucket', observed=True).agg({
    'pred_str': 'mean',
    'real_str': 'mean',
    'vid': 'count'
}).round(6)
copc_str.columns = ['预估STR均值', '真实STR均值', '样本数']
copc_str['COPC'] = (copc_str['真实STR均值'] / copc_str['预估STR均值']).round(4)
print(copc_str.to_string())

overall_copc_str = df['real_str'].mean() / df['pred_str'].mean()
print(f"\nSTR 整体 COPC = {overall_copc_str:.4f}")

# 4. Conclusion: flag badly mis-calibrated buckets (COPC < 0.8 or > 1.2).
# NOTE(review): the two summary lines word the direction as 偏高/偏低
# respectively; if a COPC falls on the other side of 1.0 the printed
# percentage goes negative — confirm whether that wording is intended.
print("\n" + "=" * 70)
print("【Step3e 结论】")
print("=" * 70)
print(f"ROS COPC = {overall_copc:.4f}(预估偏高 {(1-overall_copc)*100:.1f}%)")
print(f"STR COPC = {overall_copc_str:.4f}(预估偏低 {(overall_copc_str-1)*100:.1f}%)")

print("\nROS 校准问题:")
for idx, row in copc.iterrows():
    if row['COPC'] < 0.8:
        print(f"  - 预估 {idx}: COPC={row['COPC']:.2f},严重偏高")
    elif row['COPC'] > 1.2:
        print(f"  - 预估 {idx}: COPC={row['COPC']:.2f},严重偏低")

# Persist bucket tables and the conclusion.
output_dir = Path(__file__).parent / "output"
copc.to_csv(output_dir / "step3e_ros_copc.csv")
copc_str.to_csv(output_dir / "step3e_str_copc.csv")

with open(output_dir / "step3e_结论.txt", 'w', encoding='utf-8') as f:
    f.write("=" * 70 + "\n")
    f.write("Step3e: COPC 校准分析\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"ROS 整体 COPC = {overall_copc:.4f}\n")
    f.write(f"STR 整体 COPC = {overall_copc_str:.4f}\n\n")
    f.write("ROS 分桶 COPC:\n")
    f.write(copc.to_string())

print(f"\n结果已保存到 output/ 目录")

+ 89 - 0
tmp/低vov高曝光分析/step3f_copc详细.py

@@ -0,0 +1,89 @@
#!/usr/bin/env python
# coding=utf-8
"""
Step3f: detailed COPC analysis — reconcile the different bias metrics.

Computes three bias definitions over the step3b merged table
(COPC = mean(real)/mean(pred); aggregate relative bias of the means;
mean of per-sample relative bias), plus sample-count-weighted COPC,
and explains why they disagree. Writes a text summary to output/.
"""
import pandas as pd
from pathlib import Path

# Load the merged predicted-vs-real table produced by step3b.
data_file = Path(__file__).parent / "output" / "step3b_整体偏差_合并.csv"
df = pd.read_csv(data_file)

print("=" * 70)
print("Step3f: COPC 详细分析(统一口径)")
print("=" * 70)

# 1. Three different bias definitions.
print("\n【1. 不同口径的偏差计算】")

# Definition 1: mean(real) / mean(pred) — overall COPC.
copc1_ros = df['real_ros'].mean() / df['pred_ros'].mean()
copc1_str = df['real_str'].mean() / df['pred_str'].mean()

# Definition 2: (mean(pred) - mean(real)) / mean(real) — aggregate bias rate.
bias2_ros = (df['pred_ros'].mean() - df['real_ros'].mean()) / df['real_ros'].mean()
bias2_str = (df['pred_str'].mean() - df['real_str'].mean()) / df['real_str'].mean()

# Definition 3: mean((pred - real) / real) — per-sample bias averaged
# (this is what earlier steps reported).
bias3_ros = df['ros_bias_pct'].mean() / 100
bias3_str = df['str_bias_pct'].mean() / 100

print("ROS:")
print(f"  口径1 COPC = {copc1_ros:.4f} (real均值/pred均值)")
print(f"  口径2 偏差 = {bias2_ros:+.4f} ((pred均值-real均值)/real均值)")
print(f"  口径3 偏差 = {bias3_ros:+.4f} (样本偏差率平均)")

print("\nSTR:")
print(f"  口径1 COPC = {copc1_str:.4f}")
print(f"  口径2 偏差 = {bias2_str:+.4f}")
print(f"  口径3 偏差 = {bias3_str:+.4f}")

# 2. Explain why definitions 2 and 3 diverge.
print("\n【2. 口径差异原因分析】")
print("口径2和口径3的差异来源于:")
print("  - 口径2: 用整体均值计算,大样本权重高")
print("  - 口径3: 每个样本权重相同,对小样本/极端值更敏感")

# 3. COPC weighted by per-video sample count.
print("\n【3. 按样本量加权的分析】")
total_samples = df['sample_cnt'].sum()
weighted_pred_ros = (df['pred_ros'] * df['sample_cnt']).sum() / total_samples
weighted_real_ros = (df['real_ros'] * df['sample_cnt']).sum() / total_samples
weighted_copc_ros = weighted_real_ros / weighted_pred_ros

weighted_pred_str = (df['pred_str'] * df['sample_cnt']).sum() / total_samples
weighted_real_str = (df['real_str'] * df['sample_cnt']).sum() / total_samples
weighted_copc_str = weighted_real_str / weighted_pred_str

print(f"ROS 加权COPC = {weighted_copc_ros:.4f}")
print(f"STR 加权COPC = {weighted_copc_str:.4f}")

# 4. Conclusion and guidance on which definition to use where.
print("\n" + "=" * 70)
print("【Step3f 结论】")
print("=" * 70)
print("口径选择建议:")
print("  - 评估整体效果用 COPC(口径1)")
print("  - 评估单个视频准确度用样本偏差(口径3)")
print(f"\n整体结论:")
print(f"  - ROS COPC={copc1_ros:.2f},整体偏高 {(1-copc1_ros)*100:.0f}%")
print(f"  - ROS 样本偏差={bias3_ros*100:+.0f}%,单个视频预估普遍偏高")
print(f"  - 差异原因:极端值拉高了样本偏差")

# Persist the conclusion as a small text report.
output_dir = Path(__file__).parent / "output"
with open(output_dir / "step3f_结论.txt", 'w', encoding='utf-8') as f:
    f.write("=" * 70 + "\n")
    f.write("Step3f: COPC 详细分析\n")
    f.write("=" * 70 + "\n\n")
    f.write("【ROS】\n")
    f.write(f"COPC = {copc1_ros:.4f}\n")
    f.write(f"整体偏差 = {bias2_ros:+.4f}\n")
    f.write(f"样本偏差 = {bias3_ros:+.4f}\n\n")
    f.write("【STR】\n")
    f.write(f"COPC = {copc1_str:.4f}\n")
    f.write(f"整体偏差 = {bias2_str:+.4f}\n")
    f.write(f"样本偏差 = {bias3_str:+.4f}\n")

print(f"\n结果已保存到 output/ 目录")

+ 134 - 0
tmp/低vov高曝光分析/step4_建议.md

@@ -0,0 +1,134 @@
+# 低 VoV 高曝光问题分析报告
+
+## 一、问题概述
+
+**现象**:部分头部视频 vov 低,但曝光量很大
+
+**排序公式**:`score = str × ros × vor`
+- str: FM 模型预估
+- ros: XGB 模型预估
+- vor: 24h 统计量
+
+## 二、数据验证结果
+
+### Step1: 现象验证 ✓
+- **结论**:现象存在
+- **数据**:43.5% 的 Top10 记录存在低vov高曝光问题
+
+### Step2: 影响面量化 ✓
+| 指标 | 数值 |
+|------|------|
+| 问题记录占比 | 38.1% (43/113) |
+| 问题视频数 | 21 个 |
+| 影响曝光量 | 4164万 (35.7%) |
+
+### Step3: 原因分析 ✓
+| 分项 | 整体偏差 | 偏高比例 | 结论 |
+|------|----------|----------|------|
+| STR | -5.7% | 36.4% | 正常 |
+| **ROS** | **+72.2%** | **70.1%** | **系统性偏高** |
+| VOR | 混合 | 44.8% | 有异常值 |
+
+## 三、根因分析
+
+### 核心问题:XGB 模型对 ROS 预估存在系统性偏差
+
+### 发现1: ROS 偏差与真实 ROS 强负相关 (r = -0.607)
+| 真实 ROS | 预估偏差 | 说明 |
+|----------|----------|------|
+| < 2 | **+152%** | 严重偏高 |
+| 2-4 | +20% | 轻微偏高 |
+| 4-6 | -27% | 偏低 |
+| > 6 | -54% | 严重偏低 |
+
+**结论**:模型存在"回归均值"问题,对极端值预估不准
+
+### 发现2: ROS 偏差与分享率(STR)负相关
+| 真实 STR | ROS 偏差 |
+|----------|----------|
+| < 0.5% | +115% |
+| 0.5-1% | +113% |
+| 1-2% | +60% |
+| > 2% | +16% |
+
+**结论**:分享率低的视频,ROS 预估偏差更大
+
+### 发现3: ROS/VOV 随推荐天数衰减
+| 视频类型 | ROS | VOV |
+|----------|-----|-----|
+| 新视频(≤7天) | 1.08 | 0.39 |
+| 老视频(>30天) | 0.98 | 0.25 |
+| **衰减** | **-8.7%** | **-35%** |
+
+**结论**:VOV 衰减比 ROS 更明显,老视频问题更严重
+
+### 典型案例
+| vid | 预估ros | 真实ros | 偏差 |
+|-----|---------|---------|------|
+| 63535473 | 5.99 | 3.11 | +218% |
+| 62421458 | 4.86 | 3.34 | +154% |
+| 55931081 | 6.79 | 4.90 | +112% |
+
+## 四、问题根因总结
+
+```
+问题链路:
+1. XGB 模型对 ROS 整体偏高 72.2%
+2. 真实 ROS 越低,预估偏差越大(回归均值问题)
+3. 分享率低的视频,ROS 预估更不准
+4. 老视频的 VOV 衰减 35%,但模型没有及时感知
+   ↓
+结果: 低 VOV 视频获得高曝光
+```
+
+## 五、解决建议
+
+### 短期方案(快速生效)
+1. **ROS 模型校准**
+   - 识别真实 ROS < 2 的视频,对其预估 ROS 增加衰减系数
+   - 或增加 ROS 置信度阈值
+
+2. **曝光频控**
+   - 对推荐天数 > 30 天的视频增加曝光限制
+   - 避免老视频长期霸榜
+
+### 中期方案
+3. **ROS 模型改进**
+   - 解决"回归均值"问题:对极端值使用专门的预估策略
+   - 增加分享率分层:低 STR 和高 STR 使用不同的 ROS 预估模型
+   - 增加时效性特征:近 6h ros vs 24h ros
+
+4. **在线学习**
+   - 使用实时反馈更新 ros 预估
+   - 减少模型滞后性
+
+### 长期方案
+5. **端到端 VoV 预估**
+   - 直接预估 vov,而非 str × ros × vor
+   - 避免三个模型误差累积
+
+6. **Exploration 机制**
+   - 对推荐天数 > 14 天的视频增加探索比例
+   - 及时发现效率下降
+
+## 六、监控建议
+
+建议增加以下监控指标:
+1. Top10 视频的 ros 预估 vs 真实 偏差率
+2. 按真实 ROS 分组的预估偏差(监控回归均值问题)
+3. 按推荐天数分组的 VOV 趋势(监控衰减)
+4. 单视频连续进入 Top10 的天数
+
+## 七、文件清单
+
+```
+tmp/低vov高曝光分析/
+├── step1_验证现象.sql / step1_分析.py
+├── step2_影响面.sql / step2_分析.py
+├── step3_原因分析.sql / step3_分析.py
+├── step3b_整体偏差.sql / step3b_分析.py
+├── step3c_分析.py (ROS偏差深入分析)
+├── step3d_分析.py (ROS时间衰减分析)
+├── step4_建议.md (本报告)
+└── output/ (所有中间数据)
+```

+ 94 - 0
tmp/低vov高曝光分析/step5_分析.py

@@ -0,0 +1,94 @@
#!/usr/bin/env python
# coding=utf-8
"""
Step5: when did the problem start — ROS COPC trend over time.

Merges the daily exports under output/step5_时间趋势, prints the daily
COPC series (bias% = (1 - COPC) * 100), weekly aggregates, the extreme
days, and a first-week vs. last-week comparison; saves the merged data
and a text conclusion to output/.
"""
import pandas as pd
from pathlib import Path
import glob

# Read and merge all daily CSV shards, ordered by date.
data_dir = Path(__file__).parent / "output" / "step5_时间趋势"
all_files = glob.glob(str(data_dir / "*.csv"))
df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
df = df.sort_values('dt')

print("=" * 70)
print("Step5: ROS COPC 时间趋势分析")
print("=" * 70)

# 1. Data overview
print("\n【1. 数据概览】")
print(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}")
print(f"天数: {len(df)}")

# 2. Daily trend table; missing COPC is displayed as 0, not dropped.
print("\n【2. ROS COPC 时间趋势】")
print(f"{'日期':<12} {'预估ROS':>10} {'真实ROS':>10} {'COPC':>8} {'偏差':>10}")
print("-" * 55)
for _, row in df.iterrows():
    bias = (1 - row['ros_copc']) * 100 if pd.notna(row['ros_copc']) else 0
    copc = row['ros_copc'] if pd.notna(row['ros_copc']) else 0
    print(f"{int(row['dt']):<12} {row['pred_ros']:>10.4f} {row['real_ros']:>10.4f} {copc:>8.4f} {bias:>+9.1f}%")

# 3. Weekly aggregates (ISO week number; weeks spanning year ends collapse).
print("\n【3. 分段统计】")
df['week'] = pd.to_datetime(df['dt'].astype(str)).dt.isocalendar().week
weekly = df.groupby('week').agg({
    'pred_ros': 'mean',
    'real_ros': 'mean',
    'ros_copc': 'mean'
}).round(4)
weekly['偏差%'] = ((1 - weekly['ros_copc']) * 100).round(1)
print(weekly.to_string())

# 4. Locate turning points / extremes.
print("\n【4. 趋势分析】")
df['bias_pct'] = (1 - df['ros_copc']) * 100

# 3-day rolling mean for smoothing.
df['bias_ma3'] = df['bias_pct'].rolling(3, min_periods=1).mean()

# Days with the largest / smallest bias.
max_bias_idx = df['bias_pct'].idxmax()
max_bias_date = df.loc[max_bias_idx, 'dt']
max_bias_val = df.loc[max_bias_idx, 'bias_pct']

min_bias_idx = df['bias_pct'].idxmin()
min_bias_date = df.loc[min_bias_idx, 'dt']
min_bias_val = df.loc[min_bias_idx, 'bias_pct']

print(f"最大偏差: {int(max_bias_date)} ({max_bias_val:+.1f}%)")
print(f"最小偏差: {int(min_bias_date)} ({min_bias_val:+.1f}%)")

# First 7 days vs. last 7 days.
first_week = df.head(7)['bias_pct'].mean()
last_week = df.tail(7)['bias_pct'].mean()
print(f"\n前7天平均偏差: {first_week:+.1f}%")
print(f"后7天平均偏差: {last_week:+.1f}%")

# 5. Conclusion: worsening / improving / stable (±5 pp threshold).
print("\n" + "=" * 70)
print("【Step5 结论】")
print("=" * 70)
if last_week > first_week + 5:
    print(f"问题在恶化: 偏差从 {first_week:+.1f}% 上升到 {last_week:+.1f}%")
elif last_week < first_week - 5:
    print(f"问题在改善: 偏差从 {first_week:+.1f}% 下降到 {last_week:+.1f}%")
else:
    print(f"问题一直存在: 偏差稳定在 {df['bias_pct'].mean():+.1f}% 左右")

# Persist merged data and the conclusion.
output_dir = Path(__file__).parent / "output"
df.to_csv(output_dir / "step5_时间趋势_合并.csv", index=False)

with open(output_dir / "step5_结论.txt", 'w', encoding='utf-8') as f:
    f.write("=" * 70 + "\n")
    f.write("Step5: ROS COPC 时间趋势\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}\n")
    f.write(f"平均偏差: {df['bias_pct'].mean():+.1f}%\n")
    f.write(f"前7天: {first_week:+.1f}%\n")
    f.write(f"后7天: {last_week:+.1f}%\n")

print(f"\n结果已保存到 output/ 目录")

+ 93 - 0
tmp/低vov高曝光分析/step6_历史抽样.py

@@ -0,0 +1,93 @@
#!/usr/bin/env python
# coding=utf-8
"""
Step6: historical sampling — has the problem existed all along?

Reads the step1 verification exports for several sampled periods,
computes the Top10 low-vov/high-exposure "problem ratio" per period,
prints a cross-period comparison, and writes it to
output/step6_历史对比.csv.
"""
import pandas as pd
from pathlib import Path
import glob  # NOTE(review): unused in this script — candidate for removal

output_dir = Path(__file__).parent / "output"

# Sampled periods: (display label, step1 export filename).
periods = [
    ("2025年3月", "step1_验证现象_20250301_20250303.csv"),
    ("2025年5月", "step1_验证现象_20250501_20250503.csv"),
    ("2025年7月", "step1_验证现象_20250701_20250703.csv"),
    ("2025年9月", "step1_验证现象_20250901_20250903.csv"),
    ("2025年10月", "step1_验证现象_20251001_20251003.csv"),
    ("2026年1月", "step1_验证现象_20260107_20260113.csv"),
]

print("=" * 70)
print("Step6: 历史抽样分析 - 问题是否一直存在")
print("=" * 70)

# Analyse each sampled period; missing files are skipped, not fatal.
results = []
for period_name, filename in periods:
    filepath = output_dir / filename
    if not filepath.exists():
        print(f"\n{period_name}: 文件不存在")
        continue

    df = pd.read_csv(filepath)

    # Restrict to Top10 exposure records.
    df_top10 = df[df['曝光rank'] <= 10].copy()

    if len(df_top10) == 0:
        print(f"\n{period_name}: 无 Top10 数据")
        continue

    # Problem ratio: vov below the period's Top10 median AND rank <= 5.
    # NOTE(review): the median is per-period, so the threshold differs
    # across periods (step8 later uses a fixed 0.35 cutoff instead).
    vov_median = df_top10['vov0'].median()
    problem_mask = (df_top10['vov0'] < vov_median) & (df_top10['曝光rank'] <= 5)
    problem_ratio = problem_mask.sum() / len(df_top10) * 100

    # Per-period summary statistics.
    stats = {
        '时间段': period_name,
        '天数': df_top10['dt'].nunique(),
        'Top10记录数': len(df_top10),
        'vov中位数': round(df_top10['vov0'].median(), 4),
        'vov均值': round(df_top10['vov0'].mean(), 4),
        'str均值': round(df_top10['str_t0'].mean(), 4),
        'ros均值': round(df_top10['ros_t0'].mean(), 2),
        'vor均值': round(df_top10['vor_t0'].mean(), 2),
        '问题比例%': round(problem_ratio, 1)
    }
    results.append(stats)

    print(f"\n【{period_name}】")
    print(f"  天数: {stats['天数']}, 记录数: {stats['Top10记录数']}")
    print(f"  vov: 中位数={stats['vov中位数']}, 均值={stats['vov均值']}")
    print(f"  str={stats['str均值']}, ros={stats['ros均值']}, vor={stats['vor均值']}")
    print(f"  低vov高曝光比例: {stats['问题比例%']}%")

# Cross-period comparison table.
print("\n" + "=" * 70)
print("【时间段对比】")
print("=" * 70)
if results:
    result_df = pd.DataFrame(results)
    print(result_df.to_string(index=False))

    # Persist the comparison table.
    result_df.to_csv(output_dir / "step6_历史对比.csv", index=False)

# Conclusion: compare earliest vs. latest available period.
print("\n" + "=" * 70)
print("【Step6 结论】")
print("=" * 70)
if len(results) >= 2:
    first = results[0]
    last = results[-1]
    print(f"问题比例变化: {first['时间段']} {first['问题比例%']}% → {last['时间段']} {last['问题比例%']}%")

    vov_change = (last['vov均值'] - first['vov均值']) / first['vov均值'] * 100
    ros_change = (last['ros均值'] - first['ros均值']) / first['ros均值'] * 100
    print(f"vov 变化: {vov_change:+.1f}%")
    print(f"ros 变化: {ros_change:+.1f}%")

print(f"\n结果已保存到 output/ 目录")

+ 130 - 0
tmp/低vov高曝光分析/step7_分析.py

@@ -0,0 +1,130 @@
#!/usr/bin/env python
# coding=utf-8
"""
Step7: time trend of head-video (Top10 exposure) vov — locate when the
problem started.

Merges the daily exports under output/step7_头部vov趋势, aggregates by
month and ISO week, compares the first half vs. second half of the
window, fits a linear trend on the problem ratio and vov mean, and
writes merged data, monthly stats and a text conclusion to output/.
"""
import pandas as pd
import numpy as np  # hoisted to top-of-file (was imported mid-script in §6)
from pathlib import Path
import glob

# Read and merge all daily CSV shards, ordered by date.
data_dir = Path(__file__).parent / "output" / "step7_头部vov趋势"
all_files = glob.glob(str(data_dir / "*.csv"))
df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)
df = df.sort_values('dt').reset_index(drop=True)

print("=" * 70)
print("Step7: 头部视频(Top10曝光) vov 时间趋势分析")
print("=" * 70)

# 1. Data overview
print("\n【1. 数据概览】")
print(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}")
print(f"天数: {len(df)}")

# 2. Monthly aggregates (dt is yyyymmdd, so the first 6 chars are the month).
print("\n【2. 按月统计】")
df['month'] = df['dt'].astype(str).str[:6]
monthly = df.groupby('month').agg({
    'vov_mean': 'mean',
    'vov_median': 'mean',
    'str_mean': 'mean',
    'ros_mean': 'mean',
    'vor_mean': 'mean',
    'problem_pct': 'mean'
}).round(4)
monthly.columns = ['vov均值', 'vov中位数', 'str', 'ros', 'vor', '问题比例%']
print(monthly.to_string())

# 3. Problem-ratio trend: 7-day rolling mean plus weekly aggregates.
print("\n【3. 问题比例趋势】")
df['problem_ma7'] = df['problem_pct'].rolling(7, min_periods=1).mean()

# ISO year-week key so weeks don't collide across years.
df['week'] = pd.to_datetime(df['dt'].astype(str)).dt.isocalendar().week
df['year'] = pd.to_datetime(df['dt'].astype(str)).dt.year
df['yearweek'] = df['year'].astype(str) + '-W' + df['week'].astype(str).str.zfill(2)

weekly = df.groupby('yearweek').agg({
    'dt': ['min', 'max'],
    'vov_mean': 'mean',
    'problem_pct': 'mean'
}).round(2)
weekly.columns = ['开始日期', '结束日期', 'vov均值', '问题比例%']
print(weekly.tail(20).to_string())

# 4. First half vs. second half of the observation window.
print("\n【4. 趋势分析】")

mid_point = len(df) // 2
first_half = df.iloc[:mid_point]
second_half = df.iloc[mid_point:]

print(f"前半段 ({first_half['dt'].min()}~{first_half['dt'].max()}):")
print(f"  vov均值: {first_half['vov_mean'].mean():.4f}")
print(f"  问题比例: {first_half['problem_pct'].mean():.1f}%")

print(f"\n后半段 ({second_half['dt'].min()}~{second_half['dt'].max()}):")
print(f"  vov均值: {second_half['vov_mean'].mean():.4f}")
print(f"  问题比例: {second_half['problem_pct'].mean():.1f}%")

# 5. Weeks with the highest / lowest problem ratio.
print("\n【5. 极值分析】")
weekly_problem = df.groupby('yearweek')['problem_pct'].mean().round(1)
max_week = weekly_problem.idxmax()
min_week = weekly_problem.idxmin()
print(f"问题最严重的周: {max_week} ({weekly_problem[max_week]}%)")
print(f"问题最轻的周: {min_week} ({weekly_problem[min_week]}%)")

# 6. Overall trend via a linear fit; slope is per day, reported per 30 days.
print("\n【6. 趋势回归分析】")
x = np.arange(len(df))
y = df['problem_pct'].values

slope, intercept = np.polyfit(x, y, 1)
trend_per_month = slope * 30
print(f"问题比例趋势: 每月变化 {trend_per_month:+.2f}%")

y_vov = df['vov_mean'].values
slope_vov, _ = np.polyfit(x, y_vov, 1)
trend_vov_per_month = slope_vov * 30
print(f"vov均值趋势: 每月变化 {trend_vov_per_month:+.4f}")

# 7. Conclusion: stable within ±3 pp, otherwise worsening / improving.
print("\n" + "=" * 70)
print("【Step7 结论】")
print("=" * 70)

change_problem = second_half['problem_pct'].mean() - first_half['problem_pct'].mean()
change_vov = (second_half['vov_mean'].mean() - first_half['vov_mean'].mean()) / first_half['vov_mean'].mean() * 100

if abs(change_problem) < 3:
    print("问题比例基本稳定,一直存在")
elif change_problem > 0:
    print(f"问题在恶化: 比例上升 {change_problem:+.1f}%")
else:
    print(f"问题在改善: 比例下降 {change_problem:+.1f}%")

print(f"vov变化: {change_vov:+.1f}%")
print(f"\n问题从数据开始({df['dt'].min()})就存在,不是某个时间点突然出现")

# Persist merged data, monthly stats, and the conclusion.
output_dir = Path(__file__).parent / "output"
df.to_csv(output_dir / "step7_头部vov趋势_合并.csv", index=False)
monthly.to_csv(output_dir / "step7_月度统计.csv")

with open(output_dir / "step7_结论.txt", 'w', encoding='utf-8') as f:
    f.write("=" * 70 + "\n")
    f.write("Step7: 头部视频 vov 时间趋势\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}\n")
    f.write(f"问题比例变化: {change_problem:+.1f}%\n")
    f.write(f"vov变化: {change_vov:+.1f}%\n")
    f.write(f"\n月度统计:\n")
    f.write(monthly.to_string())

print(f"\n结果已保存到 output/ 目录")

+ 153 - 0
tmp/低vov高曝光分析/step8_分析.py

@@ -0,0 +1,153 @@
#!/usr/bin/env python
# coding=utf-8
"""
Step8: detailed month-over-month comparison — why was September worse?

Compares the July (baseline), September (problem) and December
(improved) exports: overall metrics, problem-video share (fixed
threshold vov < 0.35 & rank <= 5), a vov ≈ str×ros×vor decomposition,
September per-video detail, and pooled problem-vs-normal features.
Writes a text summary to output/.
"""
import pandas as pd
import numpy as np
from pathlib import Path

output_dir = Path(__file__).parent / "output"

# Load the three monthly exports produced by the step8 SQL.
df_jul = pd.read_csv(output_dir / "step8_月度对比_20250718_20250731.csv")
df_sep = pd.read_csv(output_dir / "step8_月度对比_20250901_20250930.csv")
df_dec = pd.read_csv(output_dir / "step8_月度对比_20251201_20251231.csv")

df_jul['month'] = '7月(基线)'
df_sep['month'] = '9月(问题)'
df_dec['month'] = '12月(改善)'

print("=" * 70)
print("Step8: 月度详细对比分析")
print("=" * 70)

# 1. Overall metric comparison across the three months.
print("\n【1. 整体指标对比】")
print(f"{'月份':<12} {'记录数':>8} {'vov均值':>10} {'str均值':>10} {'ros均值':>10} {'vor均值':>10}")
print("-" * 65)

for name, df in [('7月(基线)', df_jul), ('9月(问题)', df_sep), ('12月(改善)', df_dec)]:
    print(f"{name:<12} {len(df):>8} {df['vov0'].mean():>10.4f} {df['str_t0'].mean():>10.4f} "
          f"{df['ros_t0'].mean():>10.2f} {df['vor_t0'].mean():>10.2f}")

# 2. Problem-video share per month (fixed cutoff, unlike step6's
#    per-period median definition).
print("\n【2. 问题视频分析】")
print("定义: vov < 0.35 且 rank <= 5")

for name, df in [('7月', df_jul), ('9月', df_sep), ('12月', df_dec)]:
    problem = df[(df['vov0'] < 0.35) & (df['曝光rank'] <= 5)]
    normal = df[~((df['vov0'] < 0.35) & (df['曝光rank'] <= 5))]

    print(f"\n{name}:")
    print(f"  问题视频: {len(problem)} 条 ({len(problem)/len(df)*100:.1f}%)")
    if len(problem) > 0:
        print(f"    - vov均值: {problem['vov0'].mean():.4f}")
        print(f"    - str均值: {problem['str_t0'].mean():.4f}")
        print(f"    - ros均值: {problem['ros_t0'].mean():.2f}")
        print(f"    - vor均值: {problem['vor_t0'].mean():.2f}")
    if len(normal) > 0:
        print(f"  正常视频: {len(normal)} 条")
        print(f"    - vov均值: {normal['vov0'].mean():.4f}")

# 3. Decompose vov ≈ str × ros × vor to locate the source of the change.
print("\n【3. vov 分解分析 (vov ≈ str × ros × vor)】")

def decompose_vov(df):
    """Return month-level means of str/ros/vor/vov plus their product."""
    return {
        'str': df['str_t0'].mean(),
        'ros': df['ros_t0'].mean(),
        'vor': df['vor_t0'].mean(),
        'vov': df['vov0'].mean(),
        'str*ros*vor': df['str_t0'].mean() * df['ros_t0'].mean() * df['vor_t0'].mean()
    }

jul_stats = decompose_vov(df_jul)
sep_stats = decompose_vov(df_sep)
dec_stats = decompose_vov(df_dec)

print(f"\n{'指标':<8} {'7月':>10} {'9月':>10} {'12月':>10} {'9月vs7月':>12} {'12月vs9月':>12}")
print("-" * 65)
for key in ['str', 'ros', 'vor', 'vov']:
    j, s, d = jul_stats[key], sep_stats[key], dec_stats[key]
    chg1 = (s - j) / j * 100 if j != 0 else 0
    chg2 = (d - s) / s * 100 if s != 0 else 0
    print(f"{key:<8} {j:>10.4f} {s:>10.4f} {d:>10.4f} {chg1:>+11.1f}% {chg2:>+11.1f}%")

# 4. Per-video detail for September's problem videos.
print("\n【4. 9月问题视频详情】")
sep_problem = df_sep[(df_sep['vov0'] < 0.35) & (df_sep['曝光rank'] <= 5)].copy()

# Aggregate per video id, ranked by how many days it appeared.
if len(sep_problem) > 0:
    vid_stats = sep_problem.groupby('vid').agg({
        'dt': 'count',
        'vov0': 'mean',
        'str_t0': 'mean',
        'ros_t0': 'mean',
        'vor_t0': 'mean',
        'exp': 'sum',
        '标题': 'first'
    }).round(4)
    vid_stats.columns = ['出现天数', 'vov均值', 'str', 'ros', 'vor', '总曝光', '标题']
    vid_stats = vid_stats.sort_values('出现天数', ascending=False)

    print(f"问题视频数: {len(vid_stats)}")
    print(f"\n出现最多的问题视频 Top5:")
    for i, (vid, row) in enumerate(vid_stats.head(5).iterrows()):
        print(f"  {i+1}. vid={vid}, 出现{int(row['出现天数'])}天")
        print(f"     vov={row['vov均值']:.4f}, str={row['str']:.4f}, ros={row['ros']:.2f}, vor={row['vor']:.2f}")
        print(f"     标题: {row['标题'][:30]}...")

# 5. Problem vs. normal video features, pooled across all three months.
print("\n【5. 问题视频 vs 正常视频特征对比】")
df_all = pd.concat([df_jul, df_sep, df_dec])
df_all['is_problem'] = (df_all['vov0'] < 0.35) & (df_all['曝光rank'] <= 5)

problem_df = df_all[df_all['is_problem']]
normal_df = df_all[~df_all['is_problem']]

print(f"\n{'特征':<12} {'问题视频':>12} {'正常视频':>12} {'差异':>12}")
print("-" * 50)
for col, name in [('str_t0', 'str'), ('ros_t0', 'ros'), ('vor_t0', 'vor')]:
    p_val = problem_df[col].mean()
    n_val = normal_df[col].mean()
    diff = (p_val - n_val) / n_val * 100 if n_val != 0 else 0
    print(f"{name:<12} {p_val:>12.4f} {n_val:>12.4f} {diff:>+11.1f}%")

# 6. Conclusion
print("\n" + "=" * 70)
print("【Step8 结论:9月问题严重的原因】")
print("=" * 70)

# Relative change of each factor, September vs. July.
str_contrib = (sep_stats['str'] - jul_stats['str']) / jul_stats['str'] * 100
ros_contrib = (sep_stats['ros'] - jul_stats['ros']) / jul_stats['ros'] * 100
vor_contrib = (sep_stats['vor'] - jul_stats['vor']) / jul_stats['vor'] * 100

print(f"\n9月 vs 7月 各因素变化:")
print(f"  STR: {str_contrib:+.1f}%")
print(f"  ROS: {ros_contrib:+.1f}%")
print(f"  VOR: {vor_contrib:+.1f}%")

# The factor with the largest absolute change is declared the main cause;
# the parenthetical prints the three magnitudes in fixed STR/ROS/VOR order.
factors = [('STR', abs(str_contrib)), ('ROS', abs(ros_contrib)), ('VOR', abs(vor_contrib))]
main_factor = max(factors, key=lambda x: x[1])

print(f"\n主要原因: {main_factor[0]} 变化最大 ({factors[0][1]:.1f}% / {factors[1][1]:.1f}% / {factors[2][1]:.1f}%)")

# Persist the conclusion as a small text report.
with open(output_dir / "step8_结论.txt", 'w', encoding='utf-8') as f:
    f.write("=" * 70 + "\n")
    f.write("Step8: 月度对比分析\n")
    f.write("=" * 70 + "\n\n")
    f.write("【整体指标】\n")
    f.write(f"7月: vov={jul_stats['vov']:.4f}, str={jul_stats['str']:.4f}, ros={jul_stats['ros']:.2f}, vor={jul_stats['vor']:.2f}\n")
    f.write(f"9月: vov={sep_stats['vov']:.4f}, str={sep_stats['str']:.4f}, ros={sep_stats['ros']:.2f}, vor={sep_stats['vor']:.2f}\n")
    f.write(f"12月: vov={dec_stats['vov']:.4f}, str={dec_stats['str']:.4f}, ros={dec_stats['ros']:.2f}, vor={dec_stats['vor']:.2f}\n")
    f.write(f"\n【9月问题原因】\n")
    f.write(f"STR变化: {str_contrib:+.1f}%\n")
    f.write(f"ROS变化: {ros_contrib:+.1f}%\n")
    f.write(f"VOR变化: {vor_contrib:+.1f}%\n")

print(f"\n结果已保存到 output/ 目录")

+ 167 - 0
tmp/低vov高曝光分析/step9_天级趋势.py

@@ -0,0 +1,167 @@
#!/usr/bin/env python
# coding=utf-8
"""
Step9: daily-level data and trend-line analysis.

Reads the merged 180-day head-video VoV trend CSV produced by step7
(``step7_头部vov趋势_合并.csv``) and prints:
  1. a day-by-day table for Dec-Jan with an arrow trend marker,
  2. ASCII bar charts for the problem ratio and mean VoV,
  3. the largest day-over-day rises/drops of the problem ratio,
  4. a December-vs-January comparison,
  5. a weekly rollup.
The Dec/Jan daily slice is saved back to ``output/``.
"""
import pandas as pd
import numpy as np  # NOTE(review): imported but not used in this script
from pathlib import Path

# All inputs/outputs live next to this script under ./output
output_dir = Path(__file__).parent / "output"

# Load the merged 180-day trend data; 'dt' is an int date like 20251201
df = pd.read_csv(output_dir / "step7_头部vov趋势_合并.csv")
df = df.sort_values('dt').reset_index(drop=True)
# Parse the int date into a real datetime for week/year extraction later
df['date'] = pd.to_datetime(df['dt'].astype(str))

print("=" * 80)
print("Step9: 天级数据和趋势线分析")
print("=" * 80)

# 1. Daily detail for December and January
print("\n【1. 12月-1月 天级数据】")
df_recent = df[df['dt'] >= 20251201].copy()

print(f"{'日期':<10} {'vov均值':>8} {'str':>8} {'ros':>6} {'vor':>6} {'问题%':>8} {'趋势':>6}")
print("-" * 60)

# 7-day rolling means (min_periods=1 so the first days are not NaN).
# NOTE(review): 'vov_ma7' is computed but never displayed below.
df_recent['problem_ma7'] = df_recent['problem_pct'].rolling(7, min_periods=1).mean()
df_recent['vov_ma7'] = df_recent['vov_mean'].rolling(7, min_periods=1).mean()

# Arrow marker: double arrow when the day-over-day move exceeds 5 points
prev_problem = None
for _, row in df_recent.iterrows():
    trend = ""
    if prev_problem is not None:
        if row['problem_pct'] > prev_problem + 5:
            trend = "↑↑"
        elif row['problem_pct'] > prev_problem:
            trend = "↑"
        elif row['problem_pct'] < prev_problem - 5:
            trend = "↓↓"
        elif row['problem_pct'] < prev_problem:
            trend = "↓"
        else:
            trend = "→"
    prev_problem = row['problem_pct']

    print(f"{int(row['dt']):<10} {row['vov_mean']:>8.4f} {row['str_mean']:>8.4f} "
          f"{row['ros_mean']:>6.2f} {row['vor_mean']:>6.2f} {row['problem_pct']:>7.1f}% {trend:>6}")

# 2. ASCII trend chart of the problem ratio (Dec-Jan)
print("\n【2. 问题比例趋势线(12月-1月)】")
print("日期        0%    10%   20%   30%   40%   50%")
print("           |-----|-----|-----|-----|-----|")

for _, row in df_recent.iterrows():
    bar_len = int(row['problem_pct'] / 2)  # one block char per 2 percentage points
    bar = "█" * bar_len
    # NOTE(review): ma_pos (7-day MA position) is computed but never drawn
    ma_pos = int(row['problem_ma7'] / 2)

    date_str = str(int(row['dt']))
    print(f"{date_str}   {bar:<25} {row['problem_pct']:.0f}%")

# 3. ASCII trend chart of the mean VoV
print("\n【3. vov 均值趋势线(12月-1月)】")
print("日期       0.2   0.3   0.4   0.5   0.6   0.7")
print("           |-----|-----|-----|-----|-----|")

for _, row in df_recent.iterrows():
    # Map vov in [0.2, 0.7] onto a 0-25 character bar, clamped at both ends
    bar_len = int((row['vov_mean'] - 0.2) / 0.02)
    bar_len = max(0, min(25, bar_len))
    bar = "█" * bar_len

    date_str = str(int(row['dt']))
    print(f"{date_str}   {bar:<25} {row['vov_mean']:.2f}")

# 4. Key turning points of the problem ratio
print("\n【4. 关键时间点分析】")

# Day-over-day delta of the problem ratio (first row is NaN)
df_recent['problem_diff'] = df_recent['problem_pct'].diff()

# Largest single-day drop
min_diff_idx = df_recent['problem_diff'].idxmin()
if pd.notna(min_diff_idx):
    row = df_recent.loc[min_diff_idx]
    print(f"问题比例最大下降: {int(row['dt'])} (下降 {row['problem_diff']:.1f}%)")

# Largest single-day rise
max_diff_idx = df_recent['problem_diff'].idxmax()
if pd.notna(max_diff_idx):
    row = df_recent.loc[max_diff_idx]
    print(f"问题比例最大上升: {int(row['dt'])} (上升 {row['problem_diff']:.1f}%)")

# 5. December vs January averages per metric
print("\n【5. 12月 vs 1月 对比】")
dec = df_recent[df_recent['dt'] < 20260101]
jan = df_recent[df_recent['dt'] >= 20260101]

print(f"\n{'指标':<12} {'12月均值':>12} {'1月均值':>12} {'变化':>12}")
print("-" * 50)
for col, name in [('vov_mean', 'vov'), ('str_mean', 'str'), ('ros_mean', 'ros'),
                  ('vor_mean', 'vor'), ('problem_pct', '问题比例%')]:
    dec_val = dec[col].mean()
    jan_val = jan[col].mean()
    # Guard against division by zero; rows with dec_val == 0 are skipped silently
    if dec_val != 0:
        change = (jan_val - dec_val) / dec_val * 100
        print(f"{name:<12} {dec_val:>12.4f} {jan_val:>12.4f} {change:>+11.1f}%")

# 6. Weekly rollup using ISO calendar weeks
print("\n【6. 周度汇总】")
df_recent['week'] = df_recent['date'].dt.isocalendar().week
df_recent['year'] = df_recent['date'].dt.year

weekly = df_recent.groupby(['year', 'week']).agg({
    'dt': ['min', 'max', 'count'],
    'vov_mean': 'mean',
    'str_mean': 'mean',
    'ros_mean': 'mean',
    'vor_mean': 'mean',
    'problem_pct': 'mean'
}).round(4)

print(f"{'周':<10} {'日期范围':<20} {'天数':>4} {'vov':>8} {'str':>8} {'问题%':>8}")
print("-" * 65)
# Columns are a MultiIndex after the multi-agg, hence the tuple indexing
for (year, week), row in weekly.iterrows():
    start = int(row[('dt', 'min')])
    end = int(row[('dt', 'max')])
    days = int(row[('dt', 'count')])
    vov = row[('vov_mean', 'mean')]
    str_v = row[('str_mean', 'mean')]
    prob = row[('problem_pct', 'mean')]
    print(f"{year}-W{week:<4} {start}-{end:<12} {days:>4} {vov:>8.4f} {str_v:>8.4f} {prob:>7.1f}%")

# 7. Conclusion: did the problem ratio rebound in January?
print("\n" + "=" * 80)
print("【Step9 结论】")
print("=" * 80)

dec_prob = dec['problem_pct'].mean()
jan_prob = jan['problem_pct'].mean()
change = jan_prob - dec_prob

print(f"\n12月平均问题比例: {dec_prob:.1f}%")
print(f"1月平均问题比例: {jan_prob:.1f}%")
print(f"变化: {change:+.1f}%")

# A rebound of more than 5 points triggers a cause breakdown (STR / VOR)
if change > 5:
    print(f"\n⚠️  1月问题回升明显,需要关注")

    # Attribute the rebound to STR vs VOR movement
    dec_str = dec['str_mean'].mean()
    jan_str = jan['str_mean'].mean()
    str_change = (jan_str - dec_str) / dec_str * 100

    dec_vor = dec['vor_mean'].mean()
    jan_vor = jan['vor_mean'].mean()
    vor_change = (jan_vor - dec_vor) / dec_vor * 100

    print(f"\n原因分析:")
    print(f"  STR 变化: {str_change:+.1f}%")
    print(f"  VOR 变化: {vor_change:+.1f}%")

# Persist the Dec/Jan daily slice (including derived MA/diff columns)
df_recent.to_csv(output_dir / "step9_天级数据_12月1月.csv", index=False)
print(f"\n结果已保存到 output/ 目录")

+ 91 - 0
tmp/低vov高曝光分析/v2_step1_数据驱动定义.py

@@ -0,0 +1,91 @@
#!/usr/bin/env python
# coding=utf-8
"""
V2 Step1: define the "low VoV, high exposure" problem from the data.

Instead of a hand-picked threshold, derive the "low VoV" cutoff from the
observed distribution (P25/P50), compare problem ratios under different
definitions, and test whether exposure rank actually correlates with VoV.
"""
import pandas as pd
import numpy as np  # NOTE(review): imported but not used in this script
from pathlib import Path

output_dir = Path(__file__).parent / "output"

# Load data. The filename shows this is the December month-file only
# ("use one month first" per the original note), not the full 180 days.
df = pd.read_csv(output_dir / "step8_月度对比_20251201_20251231.csv")  # 先用一个月

print("=" * 70)
print("V2 Step1: 数据驱动定义问题")
print("=" * 70)

# 1. Distribution of the VoV column ('vov0')
print("\n【1. VoV 分布统计】")
print(f"样本数: {len(df)}")
print(f"VoV 均值: {df['vov0'].mean():.4f}")
print(f"VoV 中位数: {df['vov0'].median():.4f}")
print(f"VoV P25: {df['vov0'].quantile(0.25):.4f}")
print(f"VoV P75: {df['vov0'].quantile(0.75):.4f}")

# 2. VoV distribution per exposure-rank bucket
print("\n【2. 不同曝光rank的VoV分布】")
df['rank_group'] = pd.cut(df['曝光rank'], bins=[0, 3, 5, 10, 50], labels=['Top3', 'Top4-5', 'Top6-10', 'Top11-50'])
rank_stats = df.groupby('rank_group', observed=True).agg({
    'vov0': ['mean', 'median', lambda x: x.quantile(0.25)],
    '曝光rank': 'count'
}).round(4)
rank_stats.columns = ['vov均值', 'vov中位数', 'vov_P25', '样本数']
print(rank_stats)

# 3. Data-driven "low VoV" definition:
#    a video is "low VoV" when vov falls below the distribution's P25 (or P50)
print("\n【3. 数据驱动的问题定义】")
overall_p25 = df['vov0'].quantile(0.25)
overall_p50 = df['vov0'].median()

print(f"方案A: vov < P25 ({overall_p25:.4f}) 为低vov")
print(f"方案B: vov < P50 ({overall_p50:.4f}) 为低vov")

# 4. Problem ratio under each candidate definition
#    (P25 / P50 / the legacy hand-picked 0.35)
print("\n【4. 不同定义下的问题比例】")

for threshold_name, threshold in [('P25', overall_p25), ('P50', overall_p50), ('0.35(旧)', 0.35)]:
    # Low VoV combined with a top exposure rank
    problem_a = df[(df['vov0'] < threshold) & (df['曝光rank'] <= 3)]
    problem_b = df[(df['vov0'] < threshold) & (df['曝光rank'] <= 5)]

    print(f"\n阈值={threshold_name} ({threshold:.4f}):")
    print(f"  低vov且Top3: {len(problem_a)} ({len(problem_a)/len(df)*100:.1f}%)")
    print(f"  低vov且Top5: {len(problem_b)} ({len(problem_b)/len(df)*100:.1f}%)")

# 5. The essence of the problem: does a better (smaller) rank imply higher VoV?
print("\n【5. 核心问题:高rank是否对应高vov?】")
# If the ranking is sound, rank and vov should be negatively correlated
corr = df['曝光rank'].corr(df['vov0'])
print(f"曝光rank 与 vov 相关系数: {corr:.4f}")
print(f"(负数表示rank越小vov越高,这是期望的;正数表示排序有问题)")

# Mean VoV at a few representative rank positions
print("\n各rank组vov均值:")
for rank in [1, 2, 3, 4, 5, 10]:
    subset = df[df['曝光rank'] == rank]
    if len(subset) > 0:
        print(f"  Rank {rank}: vov={subset['vov0'].mean():.4f} (n={len(subset)})")

# 6. Conclusion summary
print("\n" + "=" * 70)
print("【V2 Step1 结论】")
print("=" * 70)
print(f"""
问题重新定义:
  - 旧定义: vov < 0.35 且 rank ≤ 5 (拍脑袋)
  - 新定义: vov < P25 ({overall_p25:.4f}) 且 rank ≤ 3 (数据驱动)

核心问题验证:
  - rank 与 vov 相关系数 = {corr:.4f}
  - 如果接近0或为正,说明排序没有很好地按vov排
  - 这才是"低vov高曝光"问题的本质
""")

# Persist the derived threshold and correlation for later steps
with open(output_dir / "v2_step1_结论.txt", 'w', encoding='utf-8') as f:
    f.write(f"问题定义阈值: vov < {overall_p25:.4f} (P25)\n")
    f.write(f"rank与vov相关系数: {corr:.4f}\n")

+ 106 - 0
tmp/低vov高曝光分析/v2_step2_分析.py

@@ -0,0 +1,106 @@
#!/usr/bin/env python
# coding=utf-8
"""
V2 Step2: unified-scope analysis — predicted vs. realized on one table.

Loads every per-day CSV exported under ``output/v2_step2_统一口径/`` and
reports: overall bias, COPC, bias by exposure bucket, bias by realized
ROS bucket, and how well the predicted ranking matches the realized
ranking. The annotated frame is written back to ``output/``.

Fix vs. the previous revision: the "predicted Top10 but realized rank
worse than 15" percentage was computed as ``len(problem_vids)/10*100``,
silently assuming that ``pred_rank <= 10`` selects exactly 10 rows.
``Series.rank()`` assigns average ranks to ties, so the selection can
hold fewer or more than 10 rows; the denominator is now the actual
selection size (guarded against an empty selection).
"""
import glob
from pathlib import Path

import pandas as pd
import numpy as np  # kept for parity with the other analysis scripts

output_dir = Path(__file__).parent / "output"

# Merge all per-day exports into a single frame (one row per video-day)
data_dir = output_dir / "v2_step2_统一口径"
all_files = glob.glob(str(data_dir / "*.csv"))
df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)

print("=" * 70)
print("V2 Step2: 统一口径分析")
print("=" * 70)

print(f"\n数据量: {len(df)} 个视频")

# 1. Overall bias statistics (per-video mean/median of the bias columns)
print("\n【1. 整体偏差统计】")
print(f"ROS 偏差均值: {df['ros_bias_pct'].mean():+.1f}%")
print(f"ROS 偏差中位数: {df['ros_bias_pct'].median():+.1f}%")
print(f"STR 偏差均值: {df['str_bias_pct'].mean():+.1f}%")
print(f"STR 偏差中位数: {df['str_bias_pct'].median():+.1f}%")

# 2. Global COPC = sum(realized) / sum(predicted); 1.0 means perfectly
#    calibrated in aggregate
print("\n【2. 整体 COPC】")
ros_copc = df['real_ros'].sum() / df['pred_ros'].sum()
str_copc = df['real_str'].sum() / df['pred_str'].sum()
print(f"ROS COPC = {ros_copc:.4f} (1.0为理想)")
print(f"STR COPC = {str_copc:.4f}")

# 3. Bias by exposure tercile (head vs. tail videos)
print("\n【3. 按曝光量分组】")
df['exp_group'] = pd.qcut(df['total_exp'], q=3, labels=['低曝光', '中曝光', '高曝光'])
group_stats = df.groupby('exp_group', observed=True).agg({
    'ros_bias_pct': ['mean', 'median'],
    'str_bias_pct': ['mean', 'median'],
    'vid': 'count'
}).round(1)
print(group_stats)

# 4. Bias by realized-ROS bucket (checks for regression-to-the-mean)
print("\n【4. 按真实ROS分组】")
df['real_ros_group'] = pd.cut(df['real_ros'], bins=[0, 2, 4, 6, 100], labels=['<2', '2-4', '4-6', '>6'])
ros_group_stats = df.groupby('real_ros_group', observed=True).agg({
    'pred_ros': 'mean',
    'real_ros': 'mean',
    'ros_bias_pct': ['mean', 'count']
}).round(2)
ros_group_stats.columns = ['预估ROS', '真实ROS', '偏差%', '样本数']
print(ros_group_stats)

# 5. Core check: videos the model scores high but that realize a low score
print("\n【5. 核心问题验证】")
# Rank by predicted and realized composite score (rank 1 = best)
df['pred_rank'] = df['pred_score'].rank(ascending=False)
df['real_rank'] = df['real_score'].rank(ascending=False)
df['rank_diff'] = df['real_rank'] - df['pred_rank']  # >0: realized rank worse than predicted

# Videos predicted into the Top10; with rank ties this is not necessarily
# exactly 10 rows, so all percentages below use the actual selection size.
pred_top10 = df[df['pred_rank'] <= 10].copy()
top_n = len(pred_top10)
print(f"预估排名Top10的视频:")
print(f"  真实排名均值: {pred_top10['real_rank'].mean():.1f}")
print(f"  排名差距均值: {pred_top10['rank_diff'].mean():.1f}")

# Problem videos: predicted Top10 but realized rank worse than 15
problem_vids = pred_top10[pred_top10['real_rank'] > 15]
problem_top_pct = (len(problem_vids) / top_n * 100) if top_n else 0.0
print(f"  预估Top10但真实排名>15: {len(problem_vids)} 个 ({problem_top_pct:.0f}%)")

# 6. Rank-correlation between predicted and realized ordering
rank_corr = df['pred_rank'].corr(df['real_rank'])
print(f"\n预估排名与真实排名相关系数: {rank_corr:.4f}")
print(f"(1.0为完美,越低说明排序越不准)")

# 7. Per-video detail for the problem set
if len(problem_vids) > 0:
    print("\n【6. 问题视频详情】")
    print("预估排名靠前但真实排名差的视频:")
    for _, row in problem_vids.iterrows():
        print(f"  vid={row['vid']}: 预估rank={int(row['pred_rank'])}, 真实rank={int(row['real_rank'])}")
        print(f"    ROS偏差={row['ros_bias_pct']:+.1f}%, STR偏差={row['str_bias_pct']:+.1f}%")

# 8. Conclusion summary
print("\n" + "=" * 70)
print("【V2 Step2 结论】")
print("=" * 70)
print(f"""
口径统一后的发现:
  1. ROS 整体 COPC = {ros_copc:.2f} ({'偏高' if ros_copc < 1 else '偏低' if ros_copc > 1 else '正常'})
  2. STR 整体 COPC = {str_copc:.2f}
  3. 预估排名与真实排名相关系数 = {rank_corr:.2f}
  4. 预估Top10中,{problem_top_pct:.0f}% 真实排名 > 15

结论:
  - 相关系数 {rank_corr:.2f} {'较高,排序整体合理' if rank_corr > 0.7 else '偏低,排序有改进空间' if rank_corr > 0.5 else '较低,排序问题明显'}
""")

# Persist the annotated frame (ranks, groups) for step3
df.to_csv(output_dir / "v2_step2_统一口径_分析.csv", index=False)

+ 135 - 0
tmp/低vov高曝光分析/v2_step3_对比分析.py

@@ -0,0 +1,135 @@
#!/usr/bin/env python
# coding=utf-8
"""
V2 Step3: problem videos vs. normal videos.

Defines "problem" as predicted rank in the Top 25% but realized rank in
the Bottom 50%, then contrasts the two groups' predicted/realized
ROS/STR and bias to find what actually drives the ranking misses.
"""
import pandas as pd
import numpy as np  # NOTE(review): imported but not used in this script
from pathlib import Path
import glob

output_dir = Path(__file__).parent / "output"

# Merge the per-day exports from step2 into one frame
data_dir = output_dir / "v2_step2_统一口径"
all_files = glob.glob(str(data_dir / "*.csv"))
df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)

print("=" * 70)
print("V2 Step3: 问题视频 vs 正常视频对比分析")
print("=" * 70)

# 1. Define problem videos: good predicted rank but poor realized rank.
# NOTE(review): these two per-slice rank columns (grouped by an ad-hoc
# index//(len//7) bucketing) are immediately superseded by the global
# ranks below and are never used again — they only survive into the
# saved CSV. Candidates for removal.
df['pred_rank'] = df.groupby(df.index // (len(df)//7) if len(df) > 7 else 0)['pred_score'].rank(ascending=False)
df['real_rank'] = df.groupby(df.index // (len(df)//7) if len(df) > 7 else 0)['real_score'].rank(ascending=False)

# Simplification: rank over the whole frame instead (rank 1 = best)
df['pred_rank_all'] = df['pred_score'].rank(ascending=False)
df['real_rank_all'] = df['real_score'].rank(ascending=False)
df['rank_diff'] = df['real_rank_all'] - df['pred_rank_all']

# Problem video: predicted Top 25% but realized Bottom 50%
top_25_pct = df['pred_rank_all'].quantile(0.25)
bottom_50_pct = df['real_rank_all'].quantile(0.50)

df['is_problem'] = (df['pred_rank_all'] <= top_25_pct) & (df['real_rank_all'] > bottom_50_pct)

print(f"\n问题定义: 预估排名 Top 25% 但真实排名 Bottom 50%")
print(f"问题视频数: {df['is_problem'].sum()} / {len(df)} ({df['is_problem'].mean()*100:.1f}%)")

# 2. Side-by-side means for the two groups
print("\n【对比分析】")
problem = df[df['is_problem']]
normal = df[~df['is_problem']]

comparison = pd.DataFrame({
    '问题视频': [
        len(problem),
        problem['pred_ros'].mean(),
        problem['real_ros'].mean(),
        problem['ros_bias_pct'].mean(),
        problem['pred_str'].mean(),
        problem['real_str'].mean(),
        problem['str_bias_pct'].mean(),
    ],
    '正常视频': [
        len(normal),
        normal['pred_ros'].mean(),
        normal['real_ros'].mean(),
        normal['ros_bias_pct'].mean(),
        normal['pred_str'].mean(),
        normal['real_str'].mean(),
        normal['str_bias_pct'].mean(),
    ]
}, index=['样本数', '预估ROS', '真实ROS', 'ROS偏差%', '预估STR', '真实STR', 'STR偏差%'])

print(comparison.round(4).to_string())

# 3. Key numbers: problem-group predicted vs. realized
print("\n【关键发现】")
if len(problem) > 0:
    print(f"\n问题视频特征:")
    print(f"  预估ROS: {problem['pred_ros'].mean():.2f}")
    print(f"  真实ROS: {problem['real_ros'].mean():.2f}")
    print(f"  ROS偏差: {problem['ros_bias_pct'].mean():+.1f}%")
    print(f"  预估STR: {problem['pred_str'].mean():.6f}")
    print(f"  真实STR: {problem['real_str'].mean():.6f}")
    print(f"  STR偏差: {problem['str_bias_pct'].mean():+.1f}%")

print(f"\n正常视频特征:")
print(f"  预估ROS: {normal['pred_ros'].mean():.2f}")
print(f"  真实ROS: {normal['real_ros'].mean():.2f}")
print(f"  ROS偏差: {normal['ros_bias_pct'].mean():+.1f}%")

# 4. Attribute the misses to ROS vs. STR bias.
# ros_diff/str_diff are defined here and reused in the conclusion below;
# both uses are guarded by the same len(problem) > 0 condition.
print("\n【偏差来源分析】")
if len(problem) > 0:
    ros_diff = problem['ros_bias_pct'].mean() - normal['ros_bias_pct'].mean()
    str_diff = problem['str_bias_pct'].mean() - normal['str_bias_pct'].mean()

    print(f"问题视频 vs 正常视频:")
    print(f"  ROS偏差差异: {ros_diff:+.1f}%")
    print(f"  STR偏差差异: {str_diff:+.1f}%")

    if abs(ros_diff) > abs(str_diff):
        print(f"\n主要原因: ROS预估问题(差异更大)")
    else:
        print(f"\n主要原因: STR预估问题(差异更大)")

# 5. Realized-ROS distribution in each group
print("\n【真实ROS分布对比】")
if len(problem) > 0:
    print(f"问题视频真实ROS分布:")
    print(f"  < 2: {(problem['real_ros'] < 2).sum()} ({(problem['real_ros'] < 2).mean()*100:.1f}%)")
    print(f"  2-4: {((problem['real_ros'] >= 2) & (problem['real_ros'] < 4)).sum()}")
    print(f"  > 4: {(problem['real_ros'] >= 4).sum()}")

print(f"\n正常视频真实ROS分布:")
print(f"  < 2: {(normal['real_ros'] < 2).sum()} ({(normal['real_ros'] < 2).mean()*100:.1f}%)")
print(f"  2-4: {((normal['real_ros'] >= 2) & (normal['real_ros'] < 4)).sum()}")
print(f"  > 4: {(normal['real_ros'] >= 4).sum()}")

# 6. Conclusion summary
print("\n" + "=" * 70)
print("【V2 Step3 结论】")
print("=" * 70)

if len(problem) > 0:
    main_cause = "ROS" if abs(ros_diff) > abs(str_diff) else "STR"
    print(f"""
核心发现:
  1. 问题视频占比: {df['is_problem'].mean()*100:.1f}%
  2. 主要原因: {main_cause} 预估不准
  3. 问题视频特征:
     - {'ROS预估偏高' if problem['ros_bias_pct'].mean() > 0 else 'ROS预估偏低'}: {problem['ros_bias_pct'].mean():+.1f}%
     - {'STR预估偏高' if problem['str_bias_pct'].mean() > 0 else 'STR预估偏低'}: {problem['str_bias_pct'].mean():+.1f}%
  4. 真实ROS < 2 的比例:
     - 问题视频: {(problem['real_ros'] < 2).mean()*100:.1f}%
     - 正常视频: {(normal['real_ros'] < 2).mean()*100:.1f}%
""")
else:
    print("问题视频数量为0,可能定义阈值需要调整")

# Persist the annotated frame (ranks + is_problem flag)
df.to_csv(output_dir / "v2_step3_对比分析.csv", index=False)

+ 225 - 0
tmp/低vov高曝光分析/v2_最终报告.py

@@ -0,0 +1,225 @@
#!/usr/bin/env python
# coding=utf-8
"""
V2 final report: renders the corrected analysis conclusions as a
self-contained HTML page under ``output/``.
"""
import pandas as pd
import json  # NOTE(review): unused in this script — safe to drop
from pathlib import Path
import glob

output_dir = Path(__file__).parent / "output"

# Merge the per-day exports from step2 into one frame
data_dir = output_dir / "v2_step2_统一口径"
all_files = glob.glob(str(data_dir / "*.csv"))
df = pd.concat([pd.read_csv(f) for f in all_files], ignore_index=True)

# Global COPC = sum(realized) / sum(predicted); 1.0 = calibrated
ros_copc = df['real_ros'].sum() / df['pred_ros'].sum()
str_copc = df['real_str'].sum() / df['pred_str'].sum()

# Predicted vs. realized rank correlation (rank 1 = best)
df['pred_rank'] = df['pred_score'].rank(ascending=False)
df['real_rank'] = df['real_score'].rank(ascending=False)
rank_corr = df['pred_rank'].corr(df['real_rank'])

# Problem videos: predicted Top 25% but realized Bottom 50%
# (same definition as v2_step3)
top_25_pct = df['pred_rank'].quantile(0.25)
bottom_50_pct = df['real_rank'].quantile(0.50)
df['is_problem'] = (df['pred_rank'] <= top_25_pct) & (df['real_rank'] > bottom_50_pct)
problem_ratio = df['is_problem'].mean() * 100

# Bias summary per realized-ROS bucket; feeds the HTML table
ros_groups = []
for label, low, high in [('<2', 0, 2), ('2-4', 2, 4), ('4-6', 4, 6), ('>6', 6, 100)]:
    subset = df[(df['real_ros'] >= low) & (df['real_ros'] < high)]
    if len(subset) > 0:
        ros_groups.append({
            'group': label,
            'count': len(subset),
            'pred_ros': round(subset['pred_ros'].mean(), 2),
            'real_ros': round(subset['real_ros'].mean(), 2),
            'bias': round(subset['ros_bias_pct'].mean(), 1)
        })
+
# Build the self-contained HTML report as one big f-string; literal CSS
# braces are doubled ({{ }}).
# FIX: the rank-correlation commentary in section 4 previously hardcoded
# "0.40" while the metric card interpolated rank_corr — the sentence went
# stale whenever the data changed. It now interpolates {rank_corr:.2f}.
html_content = f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>低VoV高曝光问题分析报告 V2(修正版)</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: -apple-system, BlinkMacSystemFont, sans-serif; background: #f0f2f5; padding: 40px; }}
        .report {{ max-width: 900px; margin: 0 auto; background: white; border-radius: 16px; overflow: hidden; box-shadow: 0 4px 20px rgba(0,0,0,0.1); }}
        .header {{ background: linear-gradient(135deg, #2c3e50 0%, #3498db 100%); color: white; padding: 40px; text-align: center; }}
        .header h1 {{ font-size: 28px; margin-bottom: 8px; }}
        .section {{ padding: 30px 40px; border-bottom: 1px solid #eee; }}
        .section:last-child {{ border-bottom: none; }}
        h2 {{ color: #2c3e50; margin-bottom: 20px; font-size: 20px; display: flex; align-items: center; gap: 10px; }}
        .step-num {{ background: #3498db; color: white; width: 32px; height: 32px; border-radius: 50%; display: flex; align-items: center; justify-content: center; font-size: 14px; }}
        .old-new {{ display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin: 20px 0; }}
        .old {{ background: #ffebee; padding: 20px; border-radius: 8px; border-left: 4px solid #e74c3c; }}
        .new {{ background: #e8f5e9; padding: 20px; border-radius: 8px; border-left: 4px solid #27ae60; }}
        .old h3 {{ color: #c0392b; margin-bottom: 10px; }}
        .new h3 {{ color: #27ae60; margin-bottom: 10px; }}
        .metric-grid {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 16px; margin: 20px 0; }}
        .metric {{ background: #f8f9fa; padding: 20px; border-radius: 8px; text-align: center; }}
        .metric-value {{ font-size: 28px; font-weight: bold; color: #2c3e50; }}
        .metric-label {{ font-size: 14px; color: #666; margin-top: 4px; }}
        table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
        th, td {{ padding: 12px; text-align: center; border-bottom: 1px solid #eee; }}
        th {{ background: #f8f9fa; color: #555; font-weight: 600; }}
        tr.highlight {{ background: #fff3cd; }}
        .badge {{ display: inline-block; padding: 4px 12px; border-radius: 20px; font-size: 12px; }}
        .badge-danger {{ background: #ffebee; color: #c0392b; }}
        .badge-success {{ background: #e8f5e9; color: #27ae60; }}
        .badge-warning {{ background: #fff3cd; color: #856404; }}
        .conclusion {{ background: #2c3e50; color: white; padding: 40px; }}
        .conclusion h2 {{ color: white; }}
        .conclusion ul {{ margin-left: 20px; }}
        .conclusion li {{ margin-bottom: 12px; line-height: 1.8; }}
        .highlight-text {{ background: rgba(255,255,255,0.2); padding: 2px 8px; border-radius: 4px; }}
        .correction {{ background: #fff3cd; border: 2px solid #ffc107; border-radius: 8px; padding: 20px; margin: 20px 0; }}
        .correction h3 {{ color: #856404; margin-bottom: 10px; }}
    </style>
</head>
<body>
    <div class="report">
        <div class="header">
            <h1>📊 低VoV高曝光问题分析报告</h1>
            <p>V2 修正版 | 自我批判迭代后的严谨结论</p>
        </div>

        <!-- 修正说明 -->
        <div class="section">
            <h2><span class="step-num">!</span> 重要修正</h2>
            <div class="correction">
                <h3>⚠️ V1 版本存在的问题</h3>
                <ul style="margin-left:20px;">
                    <li><strong>问题定义拍脑袋</strong>:vov < 0.35 阈值无依据</li>
                    <li><strong>数据口径混乱</strong>:用两张表的数据混合分析</li>
                    <li><strong>COPC结论错误</strong>:说"ROS偏高72%"实际上是极端值拉高的均值</li>
                    <li><strong>夸大问题</strong>:说43.5%存在问题,实际只有5.8%</li>
                </ul>
            </div>
        </div>

        <!-- Step 1: 修正问题定义 -->
        <div class="section">
            <h2><span class="step-num">1</span> 修正问题定义</h2>
            <div class="old-new">
                <div class="old">
                    <h3>❌ 旧定义</h3>
                    <p>vov < 0.35 且 rank ≤ 5</p>
                    <p style="color:#666;font-size:14px;">阈值拍脑袋定的</p>
                </div>
                <div class="new">
                    <h3>✓ 新定义</h3>
                    <p>预估排名 Top 25% 但真实排名 Bottom 50%</p>
                    <p style="color:#666;font-size:14px;">数据驱动,定义"排序失准"</p>
                </div>
            </div>
        </div>

        <!-- Step 2: 修正影响面 -->
        <div class="section">
            <h2><span class="step-num">2</span> 修正影响面</h2>
            <div class="old-new">
                <div class="old">
                    <h3>❌ 旧结论</h3>
                    <p style="font-size:24px;font-weight:bold;">43.5%</p>
                    <p style="color:#666;">Top10记录存在问题</p>
                </div>
                <div class="new">
                    <h3>✓ 新结论</h3>
                    <p style="font-size:24px;font-weight:bold;">{problem_ratio:.1f}%</p>
                    <p style="color:#666;">排序失准的视频占比</p>
                </div>
            </div>
            <p style="color:#666;margin-top:12px;">问题规模比之前估计的小很多</p>
        </div>

        <!-- Step 3: 修正COPC结论 -->
        <div class="section">
            <h2><span class="step-num">3</span> 修正 COPC 结论</h2>
            <div class="old-new">
                <div class="old">
                    <h3>❌ 旧结论</h3>
                    <p>ROS 整体偏高 72%</p>
                    <p style="color:#666;font-size:14px;">这是样本偏差均值,被极端值拉高</p>
                </div>
                <div class="new">
                    <h3>✓ 新结论</h3>
                    <p>ROS COPC = {ros_copc:.2f}</p>
                    <p style="color:#666;font-size:14px;">整体预估偏低,不是偏高</p>
                </div>
            </div>

            <h3 style="margin-top:24px;margin-bottom:12px;">真正的问题:回归均值</h3>
            <table>
                <thead>
                    <tr><th>真实ROS</th><th>样本数</th><th>预估ROS</th><th>真实ROS</th><th>偏差</th></tr>
                </thead>
                <tbody>
                    {"".join([f'<tr class="{"highlight" if g["bias"] > 50 else ""}">'
                              f'<td>{g["group"]}</td>'
                              f'<td>{g["count"]}</td>'
                              f'<td>{g["pred_ros"]}</td>'
                              f'<td>{g["real_ros"]}</td>'
                              f'<td><span class="badge {"badge-danger" if g["bias"] > 50 else "badge-warning" if g["bias"] > 0 else "badge-success"}">{g["bias"]:+.1f}%</span></td>'
                              f'</tr>' for g in ros_groups])}
                </tbody>
            </table>
            <p style="color:#666;">真正的问题:<strong>极端值预估不准</strong>,低ROS偏高估,高ROS偏低估</p>
        </div>

        <!-- Step 4: 排序效果 -->
        <div class="section">
            <h2><span class="step-num">4</span> 排序效果评估</h2>
            <div class="metric-grid">
                <div class="metric">
                    <div class="metric-value">{rank_corr:.2f}</div>
                    <div class="metric-label">预估排名与真实排名相关系数</div>
                </div>
                <div class="metric">
                    <div class="metric-value">{problem_ratio:.1f}%</div>
                    <div class="metric-label">严重排序失准比例</div>
                </div>
                <div class="metric">
                    <div class="metric-value">{len(df)}</div>
                    <div class="metric-label">分析样本数</div>
                </div>
            </div>
            <p style="color:#666;">相关系数 {rank_corr:.2f} 说明排序有改进空间,但不是"严重失效"</p>
        </div>

        <!-- 修正后的结论 -->
        <div class="conclusion">
            <h2>📋 修正后的结论</h2>
            <ul>
                <li><strong>问题规模</strong>:约 <span class="highlight-text">{problem_ratio:.1f}%</span> 的视频存在排序失准,比之前估计的小</li>
                <li><strong>整体COPC</strong>:ROS预估整体偏<span class="highlight-text">低</span>(COPC={ros_copc:.2f}),不是偏高</li>
                <li><strong>真正问题</strong>:模型存在<span class="highlight-text">回归均值</span>问题,极端值预估不准</li>
                <li><strong>排序效果</strong>:相关系数 {rank_corr:.2f},有改进空间但非灾难性</li>
            </ul>

            <h3 style="margin-top:30px;margin-bottom:16px;">💡 可执行的建议</h3>
            <ul>
                <li><strong>短期</strong>:对预估 ROS < 2 且 STR < P25 的视频增加不确定性惩罚</li>
                <li><strong>中期</strong>:在模型训练时增加极端值样本权重,改善尾部预估</li>
                <li><strong>监控</strong>:增加"预估排名vs真实排名相关系数"的日常监控</li>
                <li><strong>注意</strong>:<del>对真实ROS<2的视频衰减</del> 不可行(事先不知道真实值)</li>
            </ul>
        </div>
    </div>
</body>
</html>
'''
+
# Persist the rendered report under output/ and announce where it landed.
html_path = output_dir / "v2_分析报告.html"
html_path.write_text(html_content, encoding='utf-8')

print(f"V2 修正版报告已生成: {html_path}")

+ 264 - 0
tmp/低vov高曝光分析/v2_深入分析.py

@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+V2 深入分析:全面、严谨的分析
+"""
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import glob
+from datetime import datetime
+
+output_dir = Path(__file__).parent / "output"
+
+# 读取30天数据
+data_dir = output_dir / "v2_step2_统一口径"
+all_files = sorted(glob.glob(str(data_dir / "*.csv")))
+dfs = []
+for f in all_files:
+    df_tmp = pd.read_csv(f)
+    df_tmp['dt'] = int(Path(f).stem)
+    dfs.append(df_tmp)
+df = pd.concat(dfs, ignore_index=True)
+
+print("=" * 80)
+print("V2 深入分析:30天数据全面分析")
+print("=" * 80)
+
+print(f"\n数据概览:")
+print(f"  日期范围: {df['dt'].min()} ~ {df['dt'].max()}")
+print(f"  天数: {df['dt'].nunique()}")
+print(f"  总视频数: {len(df)}")
+print(f"  去重视频数: {df['vid'].nunique()}")
+
+# =============================================================================
+# Part 1: 整体指标
+# =============================================================================
+print("\n" + "=" * 80)
+print("Part 1: 整体指标")
+print("=" * 80)
+
+# COPC
+ros_copc = df['real_ros'].sum() / df['pred_ros'].sum()
+str_copc = df['real_str'].sum() / df['pred_str'].sum()
+
+print(f"\n【1.1 COPC 整体】")
+print(f"  ROS COPC = {ros_copc:.4f} ({'预估偏低' if ros_copc > 1 else '预估偏高'})")
+print(f"  STR COPC = {str_copc:.4f} ({'预估偏低' if str_copc > 1 else '预估偏高'})")
+
+# 偏差分布
+print(f"\n【1.2 偏差分布】")
+print(f"  ROS偏差: 均值={df['ros_bias_pct'].mean():+.1f}%, 中位数={df['ros_bias_pct'].median():+.1f}%, std={df['ros_bias_pct'].std():.1f}%")
+print(f"  STR偏差: 均值={df['str_bias_pct'].mean():+.1f}%, 中位数={df['str_bias_pct'].median():+.1f}%, std={df['str_bias_pct'].std():.1f}%")
+
+# 偏差分位数
+print(f"\n【1.3 ROS偏差分位数】")
+for p in [0.1, 0.25, 0.5, 0.75, 0.9]:
+    val = df['ros_bias_pct'].quantile(p)
+    print(f"  P{int(p*100)}: {val:+.1f}%")
+
+# =============================================================================
+# Part 2: 分段COPC(回归均值分析)
+# =============================================================================
+print("\n" + "=" * 80)
+print("Part 2: 分段COPC分析(验证回归均值问题)")
+print("=" * 80)
+
+print(f"\n【2.1 按真实ROS分段】")
+df['real_ros_bin'] = pd.cut(df['real_ros'], bins=[0, 1, 2, 3, 4, 6, 10, 100],
+                           labels=['0-1', '1-2', '2-3', '3-4', '4-6', '6-10', '>10'])
+ros_segment = df.groupby('real_ros_bin', observed=True).agg({
+    'pred_ros': 'mean',
+    'real_ros': 'mean',
+    'ros_bias_pct': ['mean', 'median', 'count']
+}).round(2)
+ros_segment.columns = ['预估ROS', '真实ROS', '偏差均值%', '偏差中位数%', '样本数']
+ros_segment['COPC'] = (ros_segment['真实ROS'] / ros_segment['预估ROS']).round(3)
+print(ros_segment.to_string())
+
+print(f"\n【2.2 按真实STR分段】")
+df['real_str_bin'] = pd.cut(df['real_str'], bins=[0, 0.005, 0.01, 0.02, 0.05, 1],
+                           labels=['<0.5%', '0.5-1%', '1-2%', '2-5%', '>5%'])
+str_segment = df.groupby('real_str_bin', observed=True).agg({
+    'pred_str': 'mean',
+    'real_str': 'mean',
+    'str_bias_pct': ['mean', 'count']
+}).round(4)
+str_segment.columns = ['预估STR', '真实STR', '偏差均值%', '样本数']
+print(str_segment.to_string())
+
+# =============================================================================
+# Part 3: 排序效果分析
+# =============================================================================
+print("\n" + "=" * 80)
+print("Part 3: 排序效果分析")
+print("=" * 80)
+
+# 按天计算排序相关性
+print(f"\n【3.1 每日排序相关性】")
+daily_corr = []
+for dt in sorted(df['dt'].unique()):
+    df_day = df[df['dt'] == dt].copy()
+    if len(df_day) > 10:
+        df_day['pred_rank'] = df_day['pred_score'].rank(ascending=False)
+        df_day['real_rank'] = df_day['real_score'].rank(ascending=False)
+        corr = df_day['pred_rank'].corr(df_day['real_rank'])
+        daily_corr.append({'dt': dt, 'corr': corr, 'n': len(df_day)})
+
# Summarize day-level stability of the prediction-vs-reality rank correlation
# (daily_corr is assembled in the loop above).
corr_df = pd.DataFrame(daily_corr)
print(f"  平均相关系数: {corr_df['corr'].mean():.4f}")
print(f"  最低: {corr_df['corr'].min():.4f} ({corr_df.loc[corr_df['corr'].idxmin(), 'dt']})")
print(f"  最高: {corr_df['corr'].max():.4f} ({corr_df.loc[corr_df['corr'].idxmax(), 'dt']})")
print(f"  标准差: {corr_df['corr'].std():.4f}")

# Ranking-miss ratio: share of videos predicted into the daily top 25% whose
# real score actually lands in the bottom 50%.
print(f"\n【3.2 排序失准分析】")
problem_stats = []
for dt in sorted(df['dt'].unique()):
    df_day = df[df['dt'] == dt].copy()
    if len(df_day) > 10:  # skip days with too few samples for stable ranks
        df_day['pred_rank'] = df_day['pred_score'].rank(ascending=False)
        df_day['real_rank'] = df_day['real_score'].rank(ascending=False)
        n = len(df_day)
        top25 = n * 0.25
        bottom50 = n * 0.5
        problem_cnt = ((df_day['pred_rank'] <= top25) & (df_day['real_rank'] > bottom50)).sum()
        problem_stats.append({'dt': dt, 'problem_pct': problem_cnt / len(df_day) * 100})

problem_df = pd.DataFrame(problem_stats)
print(f"  平均问题比例: {problem_df['problem_pct'].mean():.1f}%")
print(f"  最低: {problem_df['problem_pct'].min():.1f}%")
print(f"  最高: {problem_df['problem_pct'].max():.1f}%")

# =============================================================================
# Part 4: time-trend analysis
# =============================================================================
print("\n" + "=" * 80)
print("Part 4: 时间趋势分析")
print("=" * 80)

# Daily COPC trend (COPC = sum(real) / sum(pred); > 1 means under-prediction)
print(f"\n【4.1 每日COPC趋势】")
daily_stats = df.groupby('dt').agg({
    'pred_ros': 'sum',
    'real_ros': 'sum',
    'ros_bias_pct': 'mean',
    'vid': 'count'
}).round(4)
daily_stats['ros_copc'] = daily_stats['real_ros'] / daily_stats['pred_ros']
daily_stats = daily_stats.round(4)

print(f"{'日期':<12} {'样本数':>8} {'ROS COPC':>10} {'偏差均值%':>12}")
print("-" * 45)
for dt, row in daily_stats.iterrows():
    print(f"{dt:<12} {int(row['vid']):>8} {row['ros_copc']:>10.3f} {row['ros_bias_pct']:>+11.1f}%")

# Weekly roll-up keyed by ISO week number.
print(f"\n【4.2 周度汇总】")
df['week'] = pd.to_datetime(df['dt'].astype(str)).dt.isocalendar().week
weekly = df.groupby('week').agg({
    'pred_ros': 'sum',
    'real_ros': 'sum',
    'ros_bias_pct': 'mean',
    'vid': 'count'
})
weekly['ros_copc'] = (weekly['real_ros'] / weekly['pred_ros']).round(3)
print(weekly[['vid', 'ros_copc', 'ros_bias_pct']].round(2).to_string())

# =============================================================================
# Part 5: deep dive into problem videos
# =============================================================================
print("\n" + "=" * 80)
print("Part 5: 问题视频深入分析")
print("=" * 80)

# Flag "problem" rows (percentile ranks computed independently per day):
# predicted into the top 25% while real score falls in the bottom 50%.
df['pred_rank_daily'] = df.groupby('dt')['pred_score'].rank(ascending=False, pct=True)
df['real_rank_daily'] = df.groupby('dt')['real_score'].rank(ascending=False, pct=True)
df['is_problem'] = (df['pred_rank_daily'] <= 0.25) & (df['real_rank_daily'] > 0.5)

problem = df[df['is_problem']]
normal = df[~df['is_problem']]

print(f"\n【5.1 问题视频统计】")
print(f"  问题视频: {len(problem)} ({len(problem)/len(df)*100:.1f}%)")
print(f"  正常视频: {len(normal)} ({len(normal)/len(df)*100:.1f}%)")

print(f"\n【5.2 问题视频 vs 正常视频对比】")
metrics = ['pred_ros', 'real_ros', 'ros_bias_pct', 'pred_str', 'real_str', 'str_bias_pct', 'total_exp']
comparison = pd.DataFrame({
    '问题视频': problem[metrics].mean(),
    '正常视频': normal[metrics].mean(),
})
comparison['差异'] = comparison['问题视频'] - comparison['正常视频']
print(comparison.round(4).to_string())

print(f"\n【5.3 问题视频的真实ROS分布】")
print("问题视频:")
print(f"  ROS < 2: {(problem['real_ros'] < 2).sum()} ({(problem['real_ros'] < 2).mean()*100:.1f}%)")
print(f"  ROS 2-4: {((problem['real_ros'] >= 2) & (problem['real_ros'] < 4)).sum()} ({((problem['real_ros'] >= 2) & (problem['real_ros'] < 4)).mean()*100:.1f}%)")
print(f"  ROS > 4: {(problem['real_ros'] >= 4).sum()} ({(problem['real_ros'] >= 4).mean()*100:.1f}%)")
print("\n正常视频:")
print(f"  ROS < 2: {(normal['real_ros'] < 2).sum()} ({(normal['real_ros'] < 2).mean()*100:.1f}%)")
print(f"  ROS 2-4: {((normal['real_ros'] >= 2) & (normal['real_ros'] < 4)).sum()} ({((normal['real_ros'] >= 2) & (normal['real_ros'] < 4)).mean()*100:.1f}%)")
print(f"  ROS > 4: {(normal['real_ros'] >= 4).sum()} ({(normal['real_ros'] >= 4).mean()*100:.1f}%)")

# =============================================================================
# Part 6: recurring problem videos (flagged on multiple days)
# =============================================================================
print("\n" + "=" * 80)
print("Part 6: 高频问题视频分析(多次出现的问题视频)")
print("=" * 80)

vid_problem_count = problem.groupby('vid').size().sort_values(ascending=False)
print(f"\n问题视频出现次数分布:")
print(f"  出现1次: {(vid_problem_count == 1).sum()} 个视频")
print(f"  出现2-3次: {((vid_problem_count >= 2) & (vid_problem_count <= 3)).sum()} 个视频")
print(f"  出现4次以上: {(vid_problem_count >= 4).sum()} 个视频")

if len(vid_problem_count) > 0:
    print(f"\n出现次数最多的问题视频 Top5:")
    for vid, cnt in vid_problem_count.head(5).items():
        vid_data = problem[problem['vid'] == vid]
        print(f"  vid={vid}: 出现{cnt}次")
        print(f"    预估ROS={vid_data['pred_ros'].mean():.2f}, 真实ROS={vid_data['real_ros'].mean():.2f}, 偏差={vid_data['ros_bias_pct'].mean():+.1f}%")

# =============================================================================
# Part 7: overall conclusions
# =============================================================================
print("\n" + "=" * 80)
print("Part 7: 综合结论")
print("=" * 80)

print(f"""
【数据基础】
  - 分析时间: {df['dt'].min()} ~ {df['dt'].max()} ({df['dt'].nunique()}天)
  - 分析样本: {len(df)} 条记录,{df['vid'].nunique()} 个视频

【COPC分析】
  - ROS 整体 COPC = {ros_copc:.3f} ({'整体预估偏低' if ros_copc > 1 else '整体预估偏高'})
  - 但存在明显的回归均值问题:
    · 真实ROS < 2: 预估偏高 {df[df['real_ros'] < 2]['ros_bias_pct'].mean():+.0f}%
    · 真实ROS > 6: 预估偏低 {df[df['real_ros'] > 6]['ros_bias_pct'].mean():+.0f}%

【排序效果】
  - 预估排名与真实排名相关系数: {corr_df['corr'].mean():.2f} (±{corr_df['corr'].std():.2f})
  - 排序失准比例: {problem_df['problem_pct'].mean():.1f}% (最高{problem_df['problem_pct'].max():.0f}%)

【问题视频特征】
  - 问题视频占比: {len(problem)/len(df)*100:.1f}%
  - 问题视频ROS偏差: {problem['ros_bias_pct'].mean():+.1f}% (vs 正常视频 {normal['ros_bias_pct'].mean():+.1f}%)
  - 问题视频中真实ROS<2占比: {(problem['real_ros'] < 2).mean()*100:.0f}% (vs 正常视频 {(normal['real_ros'] < 2).mean()*100:.0f}%)

【根本原因】
  模型存在「回归均值」问题,对极端值预估不准确:
  - 低ROS视频被高估 → 排名靠前 → 获得过多曝光
  - 高ROS视频被低估 → 排名靠后 → 曝光不足
""")

# Persist the full table plus the per-day series for downstream steps.
df.to_csv(output_dir / "v2_深入分析_全量.csv", index=False)
corr_df.to_csv(output_dir / "v2_每日相关系数.csv", index=False)
problem_df.to_csv(output_dir / "v2_每日问题比例.csv", index=False)

print(f"\n数据已保存到 output/ 目录")

+ 594 - 0
tmp/低vov高曝光分析/v3_可视化报告.py

@@ -0,0 +1,594 @@
#!/usr/bin/env python
# coding=utf-8
"""
V3 visualization report: builds the complete analysis report (with charts)
from the per-day feature CSVs; downstream code renders it as a single HTML page.
"""
import pandas as pd
import numpy as np
from pathlib import Path
import glob

output_dir = Path(__file__).parent / "output"

# =============================================================================
# Data loading: one CSV per day under output/v3_扩展特征/; the file stem is the
# date and becomes the 'dt' key.
# =============================================================================
data_dir = output_dir / "v3_扩展特征"
all_files = sorted(glob.glob(str(data_dir / "*.csv")))
dfs = []
for f in all_files:
    df_tmp = pd.read_csv(f)
    df_tmp['dt'] = int(Path(f).stem)  # date key derived from the file name
    dfs.append(df_tmp)
df = pd.concat(dfs, ignore_index=True)

# =============================================================================
# Compute every metric consumed by the report
# =============================================================================

# Overall COPC (sum(real) / sum(pred); > 1 means systematic under-prediction)
ros_copc = df['real_ros'].sum() / df['pred_ros'].sum()
str_copc = df['real_str'].sum() / df['pred_str'].sum()

# COPC broken down by real-ROS segment
segments = []
for label, low, high in [('0-1', 0, 1), ('1-2', 1, 2), ('2-3', 2, 3), ('3-4', 3, 4),
                         ('4-5', 4, 5), ('5-6', 5, 6), ('6-8', 6, 8), ('8-10', 8, 10), ('>10', 10, 100)]:
    subset = df[(df['real_ros'] >= low) & (df['real_ros'] < high)]
    if len(subset) > 0:
        copc = subset['real_ros'].sum() / subset['pred_ros'].sum()
        segments.append({
            'label': label,
            'count': len(subset),
            'pred': round(subset['pred_ros'].mean(), 2),
            'real': round(subset['real_ros'].mean(), 2),
            'bias': round(subset['ros_bias_pct'].mean(), 1),
            'copc': round(copc, 3)
        })

# Per-scene (page) statistics: COPC, mean bias, rank correlation, miss ratio
page_stats = []
for page in df['page'].unique():
    subset = df[df['page'] == page]
    copc = subset['real_ros'].sum() / subset['pred_ros'].sum()
    subset_copy = subset.copy()
    subset_copy['pred_rank'] = subset_copy['pred_score'].rank(ascending=False, pct=True)
    subset_copy['real_rank'] = subset_copy['real_score'].rank(ascending=False, pct=True)
    corr = subset_copy['pred_rank'].corr(subset_copy['real_rank'])
    problem_pct = ((subset_copy['pred_rank'] <= 0.25) & (subset_copy['real_rank'] > 0.5)).mean() * 100
    page_stats.append({
        'page': page,
        'count': len(subset),
        'exp_pct': round(subset['total_exp'].sum() / df['total_exp'].sum() * 100, 1),
        'copc': round(copc, 3),
        'bias': round(subset['ros_bias_pct'].mean(), 1),
        'corr': round(corr, 3),
        'problem_pct': round(problem_pct, 1)
    })

# Daily trend series (same metrics computed per day)
daily_data = []
for dt in sorted(df['dt'].unique()):
    subset = df[df['dt'] == dt]
    copc = subset['real_ros'].sum() / subset['pred_ros'].sum()
    subset_copy = subset.copy()
    subset_copy['pred_rank'] = subset_copy['pred_score'].rank(ascending=False, pct=True)
    subset_copy['real_rank'] = subset_copy['real_score'].rank(ascending=False, pct=True)
    corr = subset_copy['pred_rank'].corr(subset_copy['real_rank'])
    problem_pct = ((subset_copy['pred_rank'] <= 0.25) & (subset_copy['real_rank'] > 0.5)).mean() * 100
    daily_data.append({
        'dt': str(dt),
        'copc': round(copc, 3),
        'bias': round(subset['ros_bias_pct'].mean(), 1),
        'corr': round(corr, 3),
        'problem_pct': round(problem_pct, 1),
        'n': len(subset)
    })

# Problem rows: predicted top-25% but real score in the bottom 50%, per day
df['pred_rank_daily'] = df.groupby('dt')['pred_score'].rank(ascending=False, pct=True)
df['real_rank_daily'] = df.groupby('dt')['real_score'].rank(ascending=False, pct=True)
df['is_problem'] = (df['pred_rank_daily'] <= 0.25) & (df['real_rank_daily'] > 0.5)
problem = df[df['is_problem']]
normal = df[~df['is_problem']]

# Exposure quintiles vs mean bias
exp_bins = pd.qcut(df['total_exp'], q=5, labels=['极低', '低', '中', '高', '极高'])
exp_bias_data = []
for label in ['极低', '低', '中', '高', '极高']:
    subset = df[exp_bins == label]
    exp_bias_data.append({
        'label': label,
        'bias': round(subset['ros_bias_pct'].mean(), 1),
        'count': len(subset)
    })

# Videos most frequently flagged as problems (top 10 by flag count)
vid_problem_count = problem.groupby('vid').size().sort_values(ascending=False)
top_problem_vids = []
for vid, cnt in vid_problem_count.head(10).items():
    vid_data = df[df['vid'] == vid]
    prob_data = problem[problem['vid'] == vid]
    top_problem_vids.append({
        'vid': str(vid),
        'problem_cnt': int(cnt),
        'total_days': len(vid_data),
        'pred_ros': round(prob_data['pred_ros'].mean(), 2),
        'real_ros': round(prob_data['real_ros'].mean(), 2),
        'bias': round(prob_data['ros_bias_pct'].mean(), 1),
        'page': prob_data['page'].mode().iloc[0] if len(prob_data) > 0 else '',
        'exp': int(prob_data['total_exp'].sum())
    })
+# =============================================================================
+# 辅助函数:生成HTML片段
+# =============================================================================
def get_bias_badge(bias):
    """Return an HTML badge span for a bias percentage.

    Positive biases get a '+' prefix and are colored by severity
    (danger above 50, warning otherwise); non-positive biases render
    as a success badge.
    """
    if bias > 0:
        css = 'badge-danger' if bias > 50 else 'badge-warning'
        text = f'+{bias:.1f}%'
    else:
        css = 'badge-success'
        text = f'{bias:.1f}%'
    return f'<span class="badge {css}">{text}</span>'
+
def get_bar_class(bias):
    """CSS class for a bias bar: 'positive' when bias > 0, else 'negative'."""
    if bias > 0:
        return 'positive'
    return 'negative'
+
def gen_segment_rows():
    """Render one <tr> per real-ROS segment for the COPC table."""
    template = '''<tr>
            <td>{label}</td>
            <td>{count}</td>
            <td>{pred}</td>
            <td>{real}</td>
            <td>{badge}</td>
            <td>{copc}</td>
        </tr>'''
    rendered = [
        template.format(label=seg['label'], count=seg['count'],
                        pred=seg['pred'], real=seg['real'],
                        badge=get_bias_badge(seg['bias']), copc=seg['copc'])
        for seg in segments
    ]
    return '\n'.join(rendered)
+
def gen_segment_bars():
    """Render a horizontal bias bar per real-ROS segment.

    Bar width is |bias| / 2, capped at 100%; color comes from get_bar_class.
    """
    def _bar(seg):
        cls = get_bar_class(seg['bias'])
        pct = min(abs(seg['bias']) / 2, 100)
        return f'''<div class="bar-row">
            <div class="bar-label">{seg['label']}</div>
            <div class="bar-track">
                <div class="bar-fill {cls}" style="width: {pct}%;">
                    {seg['bias']:+.0f}%
                </div>
            </div>
            <div class="bar-value">n={seg['count']}</div>
        </div>'''

    return '\n'.join(_bar(seg) for seg in segments)
+
def gen_page_rows():
    """Render one <tr> per page scene, ordered by exposure share (desc)."""
    rows = []
    for p in sorted(page_stats, key=lambda x: x['exp_pct'], reverse=True):
        if p['problem_pct'] > 10:
            prob_cls = 'badge-danger'
        else:
            prob_cls = 'badge-warning'
        rows.append(f'''<tr>
            <td>{p['page']}</td>
            <td>{p['count']}</td>
            <td>{p['exp_pct']}%</td>
            <td>{p['copc']}</td>
            <td>{get_bias_badge(p['bias'])}</td>
            <td>{p['corr']}</td>
            <td><span class="badge {prob_cls}">{p['problem_pct']}%</span></td>
        </tr>''')
    return '\n'.join(rows)
+
def gen_timeline_bars():
    """Render one hoverable bar per day for the COPC timeline chart.

    Height maps COPC linearly, with 2.5 corresponding to full height.
    """
    def _bar(day):
        pct = day['copc'] / 2.5 * 100
        return f'''<div class="timeline-bar" style="height: {pct}%;">
            <div class="tooltip">
                日期: {day['dt']}<br>
                COPC: {day['copc']}<br>
                偏差: {day['bias']:+.1f}%<br>
                样本: {day['n']}
            </div>
        </div>'''

    return '\n'.join(_bar(day) for day in daily_data)
+
def gen_exp_bias_bars():
    """Render a bias bar per exposure quintile (width = |bias| / 0.6, capped)."""
    rendered = []
    for entry in exp_bias_data:
        css = get_bar_class(entry['bias'])
        pct = min(abs(entry['bias']) / 0.6, 100)
        rendered.append(f'''<div class="bar-row">
            <div class="bar-label">{entry['label']}</div>
            <div class="bar-track">
                <div class="bar-fill {css}" style="width: {pct}%;">
                    {entry['bias']:+.0f}%
                </div>
            </div>
            <div class="bar-value">n={entry['count']}</div>
        </div>''')
    return '\n'.join(rendered)
+
def gen_problem_vid_rows():
    """Render one <tr> per top recurring problem video."""
    def _row(v):
        return f'''<tr>
            <td>{v['vid']}</td>
            <td><span class="badge badge-danger">{v['problem_cnt']}次</span></td>
            <td>{v['total_days']}</td>
            <td>{v['pred_ros']}</td>
            <td>{v['real_ros']}</td>
            <td>{get_bias_badge(v['bias'])}</td>
            <td style="font-size:12px;">{v['page'][:12]}</td>
            <td>{v['exp']:,}</td>
        </tr>'''

    return '\n'.join(_row(v) for v in top_problem_vids)
+
# Headline statistics used in the executive summary and conclusions
low_ros_bias = df[df['real_ros'] < 2]['ros_bias_pct'].mean()   # low-ROS rows: mean bias (over-prediction)
high_ros_bias = df[df['real_ros'] > 6]['ros_bias_pct'].mean()  # high-ROS rows: mean bias (under-prediction)
problem_pct = len(problem) / len(df) * 100
problem_exp_pct = problem['total_exp'].sum() / df['total_exp'].sum() * 100
copc_mean = np.mean([d['copc'] for d in daily_data])
copc_std = np.std([d['copc'] for d in daily_data])
avg_problem_pct = np.mean([d['problem_pct'] for d in daily_data])

# Real-ROS distribution shares for problem vs normal rows (percent of each group)
problem_ros_lt2_pct = (problem['real_ros'] < 2).mean() * 100
problem_ros_24_pct = ((problem['real_ros'] >= 2) & (problem['real_ros'] < 4)).mean() * 100
problem_ros_gt6_pct = (problem['real_ros'] >= 6).mean() * 100
normal_ros_lt2_pct = (normal['real_ros'] < 2).mean() * 100
normal_ros_24_pct = ((normal['real_ros'] >= 2) & (normal['real_ros'] < 4)).mean() * 100
normal_ros_gt6_pct = (normal['real_ros'] >= 6).mean() * 100
+
# =============================================================================
# Assemble the final report as one self-contained HTML page (inline CSS, no JS).
# Doubled braces {{ }} are literal CSS braces inside the f-string.
# =============================================================================
html_content = f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>低VoV高曝光问题深度分析报告 V3</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: #f5f7fa; color: #333; line-height: 1.6; }}
        .container {{ max-width: 1200px; margin: 0 auto; padding: 20px; }}
        .header {{ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%); color: white; padding: 60px 40px; text-align: center; border-radius: 16px; margin-bottom: 30px; }}
        .header h1 {{ font-size: 36px; margin-bottom: 16px; font-weight: 700; }}
        .header .subtitle {{ font-size: 18px; opacity: 0.9; }}
        .header .meta {{ margin-top: 24px; display: flex; justify-content: center; gap: 40px; font-size: 14px; opacity: 0.8; }}
        .card {{ background: white; border-radius: 12px; padding: 30px; margin-bottom: 24px; box-shadow: 0 2px 8px rgba(0,0,0,0.06); }}
        .card h2 {{ font-size: 22px; color: #1a1a2e; margin-bottom: 24px; padding-bottom: 12px; border-bottom: 2px solid #f0f0f0; display: flex; align-items: center; gap: 12px; }}
        .card h2 .num {{ background: #0f3460; color: white; width: 32px; height: 32px; border-radius: 8px; display: flex; align-items: center; justify-content: center; font-size: 16px; }}
        .card h3 {{ font-size: 18px; color: #444; margin: 24px 0 16px; }}
        .exec-summary {{ display: grid; grid-template-columns: repeat(4, 1fr); gap: 20px; margin-bottom: 30px; }}
        .summary-item {{ background: white; border-radius: 12px; padding: 24px; text-align: center; box-shadow: 0 2px 8px rgba(0,0,0,0.06); }}
        .summary-item .value {{ font-size: 36px; font-weight: 700; margin-bottom: 8px; }}
        .summary-item .label {{ font-size: 14px; color: #666; }}
        .summary-item.warning .value {{ color: #e74c3c; }}
        .summary-item.good .value {{ color: #27ae60; }}
        .summary-item.neutral .value {{ color: #3498db; }}
        table {{ width: 100%; border-collapse: collapse; margin: 16px 0; }}
        th, td {{ padding: 14px 16px; text-align: left; border-bottom: 1px solid #eee; }}
        th {{ background: #f8f9fa; font-weight: 600; color: #555; font-size: 13px; text-transform: uppercase; }}
        tr:hover {{ background: #fafbfc; }}
        .chart-container {{ margin: 20px 0; }}
        .bar-chart {{ display: flex; flex-direction: column; gap: 12px; }}
        .bar-row {{ display: flex; align-items: center; gap: 16px; }}
        .bar-label {{ width: 80px; text-align: right; font-size: 14px; color: #666; }}
        .bar-track {{ flex: 1; height: 32px; background: #f0f0f0; border-radius: 4px; position: relative; overflow: hidden; }}
        .bar-fill {{ height: 100%; border-radius: 4px; transition: width 0.3s; display: flex; align-items: center; justify-content: flex-end; padding-right: 8px; font-size: 12px; color: white; font-weight: 600; }}
        .bar-fill.positive {{ background: linear-gradient(90deg, #e74c3c 0%, #c0392b 100%); }}
        .bar-fill.negative {{ background: linear-gradient(90deg, #27ae60 0%, #1e8449 100%); }}
        .bar-fill.neutral {{ background: linear-gradient(90deg, #3498db 0%, #2980b9 100%); }}
        .bar-value {{ width: 80px; font-size: 14px; font-weight: 600; }}
        .timeline-chart {{ height: 200px; display: flex; align-items: flex-end; gap: 4px; padding: 20px 0; }}
        .timeline-bar {{ flex: 1; background: #3498db; border-radius: 3px 3px 0 0; transition: all 0.2s; cursor: pointer; position: relative; }}
        .timeline-bar:hover {{ background: #2980b9; }}
        .timeline-bar .tooltip {{ display: none; position: absolute; bottom: 100%; left: 50%; transform: translateX(-50%); background: #333; color: white; padding: 8px 12px; border-radius: 6px; font-size: 12px; white-space: nowrap; z-index: 100; }}
        .timeline-bar:hover .tooltip {{ display: block; }}
        .badge {{ display: inline-block; padding: 4px 12px; border-radius: 20px; font-size: 12px; font-weight: 600; }}
        .badge-danger {{ background: #ffebee; color: #c0392b; }}
        .badge-warning {{ background: #fff8e1; color: #f57c00; }}
        .badge-success {{ background: #e8f5e9; color: #27ae60; }}
        .badge-info {{ background: #e3f2fd; color: #1976d2; }}
        .highlight-box {{ background: #fff8e1; border-left: 4px solid #ffc107; padding: 20px; margin: 20px 0; border-radius: 0 8px 8px 0; }}
        .highlight-box.danger {{ background: #ffebee; border-color: #e74c3c; }}
        .highlight-box.success {{ background: #e8f5e9; border-color: #27ae60; }}
        .highlight-box h4 {{ color: #856404; margin-bottom: 8px; }}
        .highlight-box.danger h4 {{ color: #c0392b; }}
        .grid-2 {{ display: grid; grid-template-columns: 1fr 1fr; gap: 30px; }}
        .grid-3 {{ display: grid; grid-template-columns: repeat(3, 1fr); gap: 20px; }}
        .comparison {{ display: grid; grid-template-columns: 1fr 1fr; gap: 30px; margin: 20px 0; }}
        .comparison-item {{ padding: 20px; border-radius: 8px; }}
        .comparison-item.problem {{ background: #ffebee; }}
        .comparison-item.normal {{ background: #e8f5e9; }}
        .comparison-item h4 {{ margin-bottom: 12px; }}
        .comparison-item .stat {{ font-size: 24px; font-weight: 700; margin-bottom: 4px; }}
        .comparison-item .label {{ font-size: 13px; color: #666; }}
        .conclusion {{ background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); color: white; padding: 40px; border-radius: 12px; }}
        .conclusion h2 {{ color: white; border-bottom-color: rgba(255,255,255,0.2); }}
        .conclusion ul {{ margin-left: 20px; }}
        .conclusion li {{ margin-bottom: 16px; }}
        .conclusion .highlight {{ background: rgba(255,255,255,0.15); padding: 3px 10px; border-radius: 4px; }}
        .recommendations {{ display: grid; grid-template-columns: repeat(2, 1fr); gap: 20px; margin-top: 24px; }}
        .rec-item {{ background: rgba(255,255,255,0.1); padding: 20px; border-radius: 8px; }}
        .rec-item h4 {{ margin-bottom: 12px; font-size: 16px; }}
        .rec-item p {{ font-size: 14px; opacity: 0.9; }}
        .rec-item .tag {{ font-size: 12px; background: rgba(255,255,255,0.2); padding: 2px 8px; border-radius: 4px; margin-bottom: 12px; display: inline-block; }}
        @media (max-width: 768px) {{
            .exec-summary {{ grid-template-columns: repeat(2, 1fr); }}
            .grid-2, .comparison {{ grid-template-columns: 1fr; }}
            .recommendations {{ grid-template-columns: 1fr; }}
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>低VoV高曝光问题深度分析报告</h1>
            <p class="subtitle">V3 完整版 | 多维度分析 + 特征归因 + 趋势追踪</p>
            <div class="meta">
                <span>分析周期: {df['dt'].min()} ~ {df['dt'].max()}</span>
                <span>样本量: {len(df):,} 条</span>
                <span>视频数: {df['vid'].nunique():,} 个</span>
                <span>场景数: {df['page'].nunique()} 个</span>
            </div>
        </div>

        <div class="exec-summary">
            <div class="summary-item warning">
                <div class="value">{ros_copc:.2f}</div>
                <div class="label">ROS COPC<br><small>整体预估偏低</small></div>
            </div>
            <div class="summary-item neutral">
                <div class="value">{problem_pct:.1f}%</div>
                <div class="label">排序失准比例<br><small>预估Top25实际Bottom50</small></div>
            </div>
            <div class="summary-item warning">
                <div class="value">+{low_ros_bias:.0f}%</div>
                <div class="label">低ROS视频偏差<br><small>真实ROS&lt;2被高估</small></div>
            </div>
            <div class="summary-item good">
                <div class="value">{high_ros_bias:.0f}%</div>
                <div class="label">高ROS视频偏差<br><small>真实ROS&gt;6被低估</small></div>
            </div>
        </div>

        <div class="card">
            <h2><span class="num">1</span>核心问题:回归均值</h2>
            <div class="highlight-box danger">
                <h4>关键发现</h4>
                <p>模型预估存在严重的<strong>回归均值</strong>问题:低ROS视频被高估,高ROS视频被低估。这导致排序失准,低质量视频获得过多曝光。</p>
            </div>
            <h3>按真实ROS分段的COPC</h3>
            <table>
                <thead>
                    <tr><th>真实ROS区间</th><th>样本数</th><th>预估ROS</th><th>真实ROS</th><th>偏差</th><th>COPC</th></tr>
                </thead>
                <tbody>
                    {gen_segment_rows()}
                </tbody>
            </table>
            <h3>偏差分布可视化</h3>
            <div class="chart-container">
                <div class="bar-chart">
                    {gen_segment_bars()}
                </div>
            </div>
        </div>

        <div class="card">
            <h2><span class="num">2</span>分场景分析</h2>
            <p>不同场景(Page)的预估效果存在明显差异,说明模型泛化性不足。</p>
            <table>
                <thead>
                    <tr><th>场景</th><th>样本数</th><th>曝光占比</th><th>ROS COPC</th><th>ROS偏差</th><th>排序相关性</th><th>问题比例</th></tr>
                </thead>
                <tbody>
                    {gen_page_rows()}
                </tbody>
            </table>
            <div class="highlight-box">
                <h4>场景差异要点</h4>
                <ul style="margin-left: 20px;">
                    <li>「回流后沉浸页」占比最大(74%),COPC相对较好(1.79)</li>
                    <li>「详情后沉浸页」和「首页feed」问题比例较高(>10%)</li>
                    <li>不同场景的排序相关性差异明显(0.12~0.35),需要分场景优化</li>
                </ul>
            </div>
        </div>

        <div class="card">
            <h2><span class="num">3</span>时间趋势分析</h2>
            <h3>每日ROS COPC趋势</h3>
            <div class="chart-container">
                <div class="timeline-chart">
                    {gen_timeline_bars()}
                </div>
                <div style="display:flex; justify-content:space-between; font-size:12px; color:#999; margin-top:8px;">
                    <span>{daily_data[0]['dt']}</span>
                    <span>{daily_data[-1]['dt']}</span>
                </div>
            </div>
            <div class="grid-3" style="margin-top: 24px;">
                <div class="summary-item">
                    <div class="value" style="font-size:28px;">{copc_mean:.2f}</div>
                    <div class="label">COPC均值</div>
                </div>
                <div class="summary-item">
                    <div class="value" style="font-size:28px;">{copc_std:.2f}</div>
                    <div class="label">COPC标准差</div>
                </div>
                <div class="summary-item">
                    <div class="value" style="font-size:28px;">{avg_problem_pct:.1f}%</div>
                    <div class="label">平均问题比例</div>
                </div>
            </div>
            <div class="highlight-box success">
                <h4>趋势稳定性</h4>
                <p>时间趋势检验显示,COPC没有显著的上升或下降趋势(p=0.93),说明问题是系统性的,而非临时性。</p>
            </div>
        </div>

        <div class="card">
            <h2><span class="num">4</span>问题视频深入剖析</h2>
            <div class="comparison">
                <div class="comparison-item problem">
                    <h4>问题视频</h4>
                    <div class="stat">{len(problem)}</div>
                    <div class="label">条记录({len(problem)/len(df)*100:.1f}%)</div>
                    <ul style="margin-top:16px; margin-left:20px; font-size:14px;">
                        <li>预估ROS: {problem['pred_ros'].mean():.2f}</li>
                        <li>真实ROS: {problem['real_ros'].mean():.2f}</li>
                        <li>ROS偏差: <strong style="color:#c0392b;">{problem['ros_bias_pct'].mean():+.1f}%</strong></li>
                    </ul>
                </div>
                <div class="comparison-item normal">
                    <h4>正常视频</h4>
                    <div class="stat">{len(normal)}</div>
                    <div class="label">条记录({len(normal)/len(df)*100:.1f}%)</div>
                    <ul style="margin-top:16px; margin-left:20px; font-size:14px;">
                        <li>预估ROS: {normal['pred_ros'].mean():.2f}</li>
                        <li>真实ROS: {normal['real_ros'].mean():.2f}</li>
                        <li>ROS偏差: <strong style="color:#27ae60;">{normal['ros_bias_pct'].mean():+.1f}%</strong></li>
                    </ul>
                </div>
            </div>

            <h3>问题视频的真实ROS分布</h3>
            <div class="grid-2">
                <div>
                    <strong>问题视频</strong>
                    <div class="bar-chart" style="margin-top:12px;">
                        <div class="bar-row">
                            <div class="bar-label" style="width:50px;">ROS&lt;2</div>
                            <div class="bar-track">
                                <div class="bar-fill positive" style="width: {problem_ros_lt2_pct}%;">{problem_ros_lt2_pct:.0f}%</div>
                            </div>
                        </div>
                        <div class="bar-row">
                            <div class="bar-label" style="width:50px;">2-4</div>
                            <div class="bar-track">
                                <div class="bar-fill neutral" style="width: {problem_ros_24_pct}%;">{problem_ros_24_pct:.0f}%</div>
                            </div>
                        </div>
                        <div class="bar-row">
                            <div class="bar-label" style="width:50px;">&gt;6</div>
                            <div class="bar-track">
                                <div class="bar-fill negative" style="width: {problem_ros_gt6_pct}%;">{problem_ros_gt6_pct:.0f}%</div>
                            </div>
                        </div>
                    </div>
                </div>
                <div>
                    <strong>正常视频</strong>
                    <div class="bar-chart" style="margin-top:12px;">
                        <div class="bar-row">
                            <div class="bar-label" style="width:50px;">ROS&lt;2</div>
                            <div class="bar-track">
                                <div class="bar-fill positive" style="width: {normal_ros_lt2_pct}%;">{normal_ros_lt2_pct:.0f}%</div>
                            </div>
                        </div>
                        <div class="bar-row">
                            <div class="bar-label" style="width:50px;">2-4</div>
                            <div class="bar-track">
                                <div class="bar-fill neutral" style="width: {normal_ros_24_pct}%;">{normal_ros_24_pct:.0f}%</div>
                            </div>
                        </div>
                        <div class="bar-row">
                            <div class="bar-label" style="width:50px;">&gt;6</div>
                            <div class="bar-track">
                                <div class="bar-fill negative" style="width: {normal_ros_gt6_pct}%;">{normal_ros_gt6_pct:.0f}%</div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>

            <h3 style="margin-top:30px;">高频问题视频 Top10</h3>
            <table>
                <thead>
                    <tr><th>VID</th><th>问题次数</th><th>总天数</th><th>预估ROS</th><th>真实ROS</th><th>ROS偏差</th><th>主要场景</th><th>总曝光</th></tr>
                </thead>
                <tbody>
                    {gen_problem_vid_rows()}
                </tbody>
            </table>
        </div>

        <div class="card">
            <h2><span class="num">5</span>特征归因分析</h2>
            <p>分析什么因素与预估偏差相关,帮助定位问题根源。</p>
            <h3>曝光量 vs ROS偏差</h3>
            <div class="chart-container">
                <div class="bar-chart">
                    {gen_exp_bias_bars()}
                </div>
            </div>
            <div class="highlight-box">
                <h4>特征归因要点</h4>
                <ul style="margin-left: 20px;">
                    <li><strong>曝光量越低,偏差越大</strong>:极低曝光的视频偏差+48%,极高曝光的偏差-22%</li>
                    <li><strong>STR越高,ROS偏差越大</strong>:高STR视频的ROS偏差+47%</li>
                    <li>这说明模型对「低频+高互动」的视频预估不准</li>
                </ul>
            </div>
        </div>

        <div class="card conclusion">
            <h2><span class="num" style="background:rgba(255,255,255,0.2);">6</span>综合结论与建议</h2>
            <h3 style="color:white; margin-top:24px;">问题根因</h3>
            <ul>
                <li><strong>回归均值问题</strong>:模型对极端值预估不准确,低ROS被高估<span class="highlight">+{low_ros_bias:.0f}%</span>,高ROS被低估<span class="highlight">{high_ros_bias:.0f}%</span></li>
                <li><strong>场景泛化不足</strong>:不同Page的COPC差异显著(1.79~1.99)</li>
                <li><strong>系统性问题</strong>:问题在30天内持续存在,无改善趋势</li>
            </ul>
            <h3 style="color:white; margin-top:30px;">量化影响</h3>
            <ul>
                <li>排序失准比例:<span class="highlight">{problem_pct:.1f}%</span>的视频被错误排序到头部</li>
                <li>排序相关性:<span class="highlight">0.32</span>(理想值为1.0)</li>
                <li>问题视频占总曝光:<span class="highlight">{problem_exp_pct:.1f}%</span></li>
            </ul>
            <h3 style="color:white; margin-top:30px;">可执行建议</h3>
            <div class="recommendations">
                <div class="rec-item">
                    <span class="tag">短期</span>
                    <h4>分段COPC校准</h4>
                    <p>对不同预估ROS区间单独校准,降低回归均值问题</p>
                </div>
                <div class="rec-item">
                    <span class="tag">短期</span>
                    <h4>低曝光惩罚</h4>
                    <p>对低曝光视频的预估增加不确定性惩罚</p>
                </div>
                <div class="rec-item">
                    <span class="tag">中期</span>
                    <h4>分场景建模</h4>
                    <p>针对不同Page场景训练独立校准层或独立模型</p>
                </div>
                <div class="rec-item">
                    <span class="tag">中期</span>
                    <h4>极端值样本加权</h4>
                    <p>在模型训练时增加极端值样本权重,改善尾部预估</p>
                </div>
            </div>
            <h3 style="color:white; margin-top:30px;">监控指标</h3>
            <ul>
                <li>日常监控「预估排名 vs 真实排名相关系数」</li>
                <li>分段COPC监控(ROS&lt;2, 2-4, &gt;6)</li>
                <li>分场景问题比例监控</li>
            </ul>
        </div>
    </div>
</body>
</html>
'''
+
# Write the rendered report to disk and announce its location.
html_path = output_dir / "v3_完整报告.html"
html_path.write_text(html_content, encoding='utf-8')

print(f"V3 完整报告已生成: {html_path}")

+ 465 - 0
tmp/低vov高曝光分析/v3_综合分析.py

@@ -0,0 +1,465 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+V3 综合分析:多维度深入分析
+包含:分场景分析、特征归因、时间趋势、典型案例剖析
+"""
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import glob
+from datetime import datetime
+from scipy import stats
+
+output_dir = Path(__file__).parent / "output"
+
+# =============================================================================
+# 数据加载
+# =============================================================================
+data_dir = output_dir / "v3_扩展特征"
+all_files = sorted(glob.glob(str(data_dir / "*.csv")))
+dfs = []
+for f in all_files:
+    df_tmp = pd.read_csv(f)
+    df_tmp['dt'] = int(Path(f).stem)
+    dfs.append(df_tmp)
+df = pd.concat(dfs, ignore_index=True)
+
+print("=" * 100)
+print("V3 综合分析:多维度深入分析")
+print("=" * 100)
+
+print(f"\n【数据概览】")
+print(f"  日期范围: {df['dt'].min()} ~ {df['dt'].max()} ({df['dt'].nunique()}天)")
+print(f"  总记录数: {len(df)}")
+print(f"  去重视频数: {df['vid'].nunique()}")
+print(f"  Page场景数: {df['page'].nunique()}")
+
+# =============================================================================
+# Part 1: 整体指标基准
+# =============================================================================
+print("\n" + "=" * 100)
+print("Part 1: 整体指标基准")
+print("=" * 100)
+
+# COPC(加权)
+ros_copc = df['real_ros'].sum() / df['pred_ros'].sum()
+str_copc = df['real_str'].sum() / df['pred_str'].sum()
+
+# 偏差统计
+ros_bias_mean = df['ros_bias_pct'].mean()
+ros_bias_median = df['ros_bias_pct'].median()
+ros_bias_std = df['ros_bias_pct'].std()
+
+str_bias_mean = df['str_bias_pct'].mean()
+str_bias_median = df['str_bias_pct'].median()
+
+print(f"\n【1.1 整体COPC】")
+print(f"  ROS COPC = {ros_copc:.4f} ({'预估整体偏低' if ros_copc > 1 else '预估整体偏高'})")
+print(f"  STR COPC = {str_copc:.4f} ({'预估整体偏低' if str_copc > 1 else '预估整体偏高'})")
+
+print(f"\n【1.2 偏差分布】")
+print(f"  ROS偏差: 均值={ros_bias_mean:+.1f}%, 中位数={ros_bias_median:+.1f}%, std={ros_bias_std:.1f}%")
+print(f"  STR偏差: 均值={str_bias_mean:+.1f}%, 中位数={str_bias_median:+.1f}%")
+
+# 分位数
+print(f"\n【1.3 ROS偏差分位数】")
+for p in [0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95]:
+    val = df['ros_bias_pct'].quantile(p)
+    print(f"  P{int(p*100):02d}: {val:+.1f}%")
+
+# =============================================================================
+# Part 2: 分场景分析(Page维度)
+# =============================================================================
+print("\n" + "=" * 100)
+print("Part 2: 分场景分析(Page维度)")
+print("=" * 100)
+
+page_stats = df.groupby('page').agg({
+    'vid': 'count',
+    'total_exp': 'sum',
+    'pred_ros': 'sum',
+    'real_ros': 'sum',
+    'pred_str': 'sum',
+    'real_str': 'sum',
+    'ros_bias_pct': ['mean', 'median', 'std'],
+    'str_bias_pct': ['mean', 'median']
+}).round(4)
+
+page_stats.columns = ['记录数', '总曝光', 'pred_ros_sum', 'real_ros_sum',
+                      'pred_str_sum', 'real_str_sum',
+                      'ROS偏差均值', 'ROS偏差中位数', 'ROS偏差std',
+                      'STR偏差均值', 'STR偏差中位数']
+page_stats['ROS_COPC'] = page_stats['real_ros_sum'] / page_stats['pred_ros_sum']
+page_stats['STR_COPC'] = page_stats['real_str_sum'] / page_stats['pred_str_sum']
+
+print(f"\n【2.1 分场景COPC】")
+print(f"{'场景':<25} {'记录数':>8} {'曝光占比':>10} {'ROS COPC':>10} {'STR COPC':>10} {'ROS偏差%':>12}")
+print("-" * 80)
+total_exp = page_stats['总曝光'].sum()
+for page, row in page_stats.sort_values('总曝光', ascending=False).iterrows():
+    exp_pct = row['总曝光'] / total_exp * 100
+    print(f"{page:<25} {int(row['记录数']):>8} {exp_pct:>9.1f}% {row['ROS_COPC']:>10.3f} {row['STR_COPC']:>10.3f} {row['ROS偏差均值']:>+11.1f}%")
+
+# 各场景问题严重程度排序
+print(f"\n【2.2 场景问题严重程度】")
+page_problem = page_stats.copy()
+page_problem['问题得分'] = abs(page_problem['ROS_COPC'] - 1) * 100  # 离1越远问题越大
+print(page_problem[['记录数', 'ROS_COPC', 'ROS偏差std', '问题得分']].sort_values('问题得分', ascending=False).round(2).to_string())
+
+# =============================================================================
+# Part 3: 分段COPC(回归均值深入分析)
+# =============================================================================
+print("\n" + "=" * 100)
+print("Part 3: 回归均值问题深入分析")
+print("=" * 100)
+
+# 3.1 按真实ROS分段
+print(f"\n【3.1 按真实ROS分段】")
+df['real_ros_bin'] = pd.cut(df['real_ros'],
+                           bins=[0, 1, 2, 3, 4, 5, 6, 8, 10, 100],
+                           labels=['0-1', '1-2', '2-3', '3-4', '4-5', '5-6', '6-8', '8-10', '>10'])
+ros_segment = df.groupby('real_ros_bin', observed=True).agg({
+    'pred_ros': ['sum', 'mean'],
+    'real_ros': ['sum', 'mean'],
+    'ros_bias_pct': ['mean', 'median', 'count'],
+    'total_exp': 'sum'
+}).round(2)
+ros_segment.columns = ['pred_ros_sum', '预估ROS均值', 'real_ros_sum', '真实ROS均值',
+                       '偏差均值%', '偏差中位数%', '样本数', '总曝光']
+ros_segment['COPC'] = (ros_segment['real_ros_sum'] / ros_segment['pred_ros_sum']).round(3)
+
+print(f"{'真实ROS':<8} {'样本数':>8} {'预估均值':>10} {'真实均值':>10} {'偏差均值%':>12} {'COPC':>8}")
+print("-" * 65)
+for idx, row in ros_segment.iterrows():
+    print(f"{idx:<8} {int(row['样本数']):>8} {row['预估ROS均值']:>10.2f} {row['真实ROS均值']:>10.2f} {row['偏差均值%']:>+11.1f}% {row['COPC']:>8.3f}")
+
+# 3.2 按预估ROS分段
+print(f"\n【3.2 按预估ROS分段】")
+df['pred_ros_bin'] = pd.cut(df['pred_ros'],
+                           bins=[0, 2, 2.5, 3, 3.5, 4, 5, 100],
+                           labels=['<2', '2-2.5', '2.5-3', '3-3.5', '3.5-4', '4-5', '>5'])
+pred_segment = df.groupby('pred_ros_bin', observed=True).agg({
+    'pred_ros': 'mean',
+    'real_ros': 'mean',
+    'ros_bias_pct': ['mean', 'count'],
+    'total_exp': 'sum'
+}).round(2)
+pred_segment.columns = ['预估ROS均值', '真实ROS均值', '偏差均值%', '样本数', '总曝光']
+pred_segment['COPC'] = (pred_segment['真实ROS均值'] / pred_segment['预估ROS均值']).round(3)
+print(pred_segment.to_string())
+
+# 3.3 交叉分析:预估 vs 真实
+print(f"\n【3.3 预估ROS vs 真实ROS 交叉分析】")
+cross_tab = pd.crosstab(df['pred_ros_bin'], df['real_ros_bin'], margins=True)
+print(cross_tab.to_string())
+
+# =============================================================================
+# Part 4: 排序效果多维分析
+# =============================================================================
+print("\n" + "=" * 100)
+print("Part 4: 排序效果多维分析")
+print("=" * 100)
+
+# 4.1 按天计算排序相关性
+print(f"\n【4.1 每日排序相关性】")
+daily_metrics = []
+for dt in sorted(df['dt'].unique()):
+    df_day = df[df['dt'] == dt].copy()
+    if len(df_day) > 10:
+        df_day['pred_rank'] = df_day['pred_score'].rank(ascending=False, pct=True)
+        df_day['real_rank'] = df_day['real_score'].rank(ascending=False, pct=True)
+
+        # 相关系数
+        corr = df_day['pred_rank'].corr(df_day['real_rank'])
+
+        # 问题比例(预估Top25%,真实Bottom50%)
+        problem_cnt = ((df_day['pred_rank'] <= 0.25) & (df_day['real_rank'] > 0.5)).sum()
+        problem_pct = problem_cnt / len(df_day) * 100
+
+        # NDCG近似
+        df_day_sorted = df_day.sort_values('pred_score', ascending=False)
+        top10 = df_day_sorted.head(10)
+        avg_real_score_top10 = top10['real_score'].mean()
+        best_top10 = df_day.nlargest(10, 'real_score')['real_score'].mean()
+        ndcg_approx = avg_real_score_top10 / best_top10 if best_top10 > 0 else 0
+
+        daily_metrics.append({
+            'dt': dt,
+            'n': len(df_day),
+            'corr': corr,
+            'problem_pct': problem_pct,
+            'ndcg_approx': ndcg_approx
+        })
+
+metrics_df = pd.DataFrame(daily_metrics)
+print(f"  排序相关系数: 均值={metrics_df['corr'].mean():.3f}, std={metrics_df['corr'].std():.3f}")
+print(f"  问题比例: 均值={metrics_df['problem_pct'].mean():.1f}%, max={metrics_df['problem_pct'].max():.1f}%")
+print(f"  NDCG近似: 均值={metrics_df['ndcg_approx'].mean():.3f}")
+
+# 4.2 分场景排序效果
+print(f"\n【4.2 分场景排序效果】")
+page_corr = []
+for page in df['page'].unique():
+    df_page = df[df['page'] == page].copy()
+    if len(df_page) > 20:
+        df_page['pred_rank'] = df_page['pred_score'].rank(ascending=False, pct=True)
+        df_page['real_rank'] = df_page['real_score'].rank(ascending=False, pct=True)
+        corr = df_page['pred_rank'].corr(df_page['real_rank'])
+        problem_pct = ((df_page['pred_rank'] <= 0.25) & (df_page['real_rank'] > 0.5)).mean() * 100
+        page_corr.append({'page': page, 'n': len(df_page), 'corr': corr, 'problem_pct': problem_pct})
+
+page_corr_df = pd.DataFrame(page_corr)
+print(page_corr_df.sort_values('corr', ascending=False).round(3).to_string(index=False))
+
+# =============================================================================
+# Part 5: 问题视频深入剖析
+# =============================================================================
+print("\n" + "=" * 100)
+print("Part 5: 问题视频深入剖析")
+print("=" * 100)
+
+# 每日独立定义问题视频
+df['pred_rank_daily'] = df.groupby('dt')['pred_score'].rank(ascending=False, pct=True)
+df['real_rank_daily'] = df.groupby('dt')['real_score'].rank(ascending=False, pct=True)
+df['is_problem'] = (df['pred_rank_daily'] <= 0.25) & (df['real_rank_daily'] > 0.5)
+
+problem = df[df['is_problem']]
+normal = df[~df['is_problem']]
+
+print(f"\n【5.1 问题视频统计】")
+print(f"  问题视频记录: {len(problem)} ({len(problem)/len(df)*100:.1f}%)")
+print(f"  正常视频记录: {len(normal)} ({len(normal)/len(df)*100:.1f}%)")
+print(f"  问题视频去重: {problem['vid'].nunique()} 个")
+
+# 5.2 问题视频 vs 正常视频对比
+print(f"\n【5.2 问题视频 vs 正常视频对比】")
+metrics = ['pred_ros', 'real_ros', 'ros_bias_pct', 'pred_str', 'real_str', 'str_bias_pct',
+           'total_exp', 'pred_score', 'real_score']
+comparison = pd.DataFrame({
+    '问题视频': problem[metrics].mean(),
+    '正常视频': normal[metrics].mean(),
+})
+comparison['差异'] = comparison['问题视频'] - comparison['正常视频']
+comparison['差异%'] = (comparison['问题视频'] / comparison['正常视频'] - 1) * 100
+print(comparison.round(4).to_string())
+
+# 5.3 问题视频的特征分析
+print(f"\n【5.3 问题视频特征分析】")
+print("\n真实ROS分布对比:")
+for label, low, high in [('<2', 0, 2), ('2-4', 2, 4), ('4-6', 4, 6), ('>6', 6, 100)]:
+    problem_pct = ((problem['real_ros'] >= low) & (problem['real_ros'] < high)).mean() * 100
+    normal_pct = ((normal['real_ros'] >= low) & (normal['real_ros'] < high)).mean() * 100
+    diff = problem_pct - normal_pct
+    print(f"  {label:>5}: 问题视频 {problem_pct:5.1f}% vs 正常视频 {normal_pct:5.1f}% (差异{diff:+5.1f}%)")
+
+# 5.4 问题视频的场景分布
+print(f"\n问题视频场景分布:")
+problem_page_dist = problem.groupby('page').size() / len(problem) * 100
+normal_page_dist = normal.groupby('page').size() / len(normal) * 100
+page_compare = pd.DataFrame({
+    '问题视频占比': problem_page_dist,
+    '正常视频占比': normal_page_dist
+}).fillna(0)
+page_compare['差异'] = page_compare['问题视频占比'] - page_compare['正常视频占比']
+print(page_compare.round(1).to_string())
+
+# =============================================================================
+# Part 6: 高频问题视频分析
+# =============================================================================
+print("\n" + "=" * 100)
+print("Part 6: 高频问题视频分析")
+print("=" * 100)
+
+vid_problem_count = problem.groupby('vid').size().sort_values(ascending=False)
+print(f"\n问题视频出现频次分布:")
+print(f"  出现1次: {(vid_problem_count == 1).sum()} 个视频")
+print(f"  出现2-3次: {((vid_problem_count >= 2) & (vid_problem_count <= 3)).sum()} 个视频")
+print(f"  出现4-5次: {((vid_problem_count >= 4) & (vid_problem_count <= 5)).sum()} 个视频")
+print(f"  出现6次以上: {(vid_problem_count >= 6).sum()} 个视频")
+
+# 高频问题视频详情
+print(f"\n【高频问题视频Top10详情】")
+for vid, cnt in vid_problem_count.head(10).items():
+    vid_data = problem[problem['vid'] == vid]
+    vid_all = df[df['vid'] == vid]
+    print(f"\n  vid={vid} (出现{cnt}次/{len(vid_all)}天)")
+    print(f"    预估ROS: {vid_data['pred_ros'].mean():.2f} (范围: {vid_data['pred_ros'].min():.2f} ~ {vid_data['pred_ros'].max():.2f})")
+    print(f"    真实ROS: {vid_data['real_ros'].mean():.2f} (范围: {vid_data['real_ros'].min():.2f} ~ {vid_data['real_ros'].max():.2f})")
+    print(f"    ROS偏差: {vid_data['ros_bias_pct'].mean():+.1f}%")
+    print(f"    主要场景: {vid_data['page'].value_counts().index[0]}")
+    print(f"    总曝光: {vid_data['total_exp'].sum():,.0f}")
+
+# =============================================================================
+# Part 7: 时间趋势分析
+# =============================================================================
+print("\n" + "=" * 100)
+print("Part 7: 时间趋势分析")
+print("=" * 100)
+
+# 每日指标
+print(f"\n【7.1 每日COPC趋势】")
+daily_stats = df.groupby('dt').agg({
+    'pred_ros': 'sum',
+    'real_ros': 'sum',
+    'ros_bias_pct': 'mean',
+    'vid': 'count',
+    'total_exp': 'sum'
+}).round(4)
+daily_stats['ros_copc'] = daily_stats['real_ros'] / daily_stats['pred_ros']
+
+print(f"{'日期':<12} {'记录数':>8} {'ROS COPC':>10} {'偏差均值%':>12} {'曝光量':>12}")
+print("-" * 55)
+for dt, row in daily_stats.iterrows():
+    print(f"{dt:<12} {int(row['vid']):>8} {row['ros_copc']:>10.3f} {row['ros_bias_pct']:>+11.1f}% {row['total_exp']:>12,.0f}")
+
+# COPC趋势统计
+print(f"\nROS COPC趋势统计:")
+print(f"  均值: {daily_stats['ros_copc'].mean():.3f}")
+print(f"  标准差: {daily_stats['ros_copc'].std():.3f}")
+print(f"  最小: {daily_stats['ros_copc'].min():.3f} ({daily_stats['ros_copc'].idxmin()})")
+print(f"  最大: {daily_stats['ros_copc'].max():.3f} ({daily_stats['ros_copc'].idxmax()})")
+
+# 趋势检验
+slope, intercept, r_value, p_value, std_err = stats.linregress(
+    range(len(daily_stats)), daily_stats['ros_copc'].values)
+print(f"\n时间趋势检验:")
+print(f"  斜率: {slope:.6f} ({'上升' if slope > 0 else '下降'}趋势)")
+print(f"  R²: {r_value**2:.4f}")
+print(f"  P值: {p_value:.4f} ({'显著' if p_value < 0.05 else '不显著'})")
+
+# 周度汇总
+print(f"\n【7.2 周度汇总】")
+df['week'] = pd.to_datetime(df['dt'].astype(str)).dt.isocalendar().week
+weekly = df.groupby('week').agg({
+    'pred_ros': 'sum',
+    'real_ros': 'sum',
+    'ros_bias_pct': 'mean',
+    'vid': 'count'
+})
+weekly['ros_copc'] = (weekly['real_ros'] / weekly['pred_ros']).round(3)
+print(weekly[['vid', 'ros_copc', 'ros_bias_pct']].round(2).to_string())
+
+# =============================================================================
+# Part 8: 特征归因分析
+# =============================================================================
+print("\n" + "=" * 100)
+print("Part 8: 特征归因分析(什么因素导致预估不准)")
+print("=" * 100)
+
+# 8.1 曝光量 vs 偏差
+print(f"\n【8.1 曝光量 vs 偏差】")
+df['exp_bin'] = pd.qcut(df['total_exp'], q=5, labels=['极低', '低', '中', '高', '极高'])
+exp_bias = df.groupby('exp_bin', observed=True).agg({
+    'ros_bias_pct': ['mean', 'median', 'count'],
+    'total_exp': 'mean'
+}).round(2)
+exp_bias.columns = ['偏差均值%', '偏差中位数%', '样本数', '平均曝光']
+print(exp_bias.to_string())
+
+# 8.2 预估值高低 vs 偏差
+print(f"\n【8.2 预估值高低 vs 偏差】")
+df['pred_level'] = pd.qcut(df['pred_ros'], q=5, labels=['极低', '低', '中', '高', '极高'])
+pred_bias = df.groupby('pred_level', observed=True).agg({
+    'ros_bias_pct': ['mean', 'count'],
+    'pred_ros': 'mean',
+    'real_ros': 'mean'
+}).round(2)
+pred_bias.columns = ['偏差均值%', '样本数', '预估ROS均值', '真实ROS均值']
+print(pred_bias.to_string())
+
+# 8.3 STR高低 vs ROS偏差
+print(f"\n【8.3 STR高低 vs ROS偏差】")
+df['str_level'] = pd.qcut(df['real_str'], q=5, labels=['极低', '低', '中', '高', '极高'], duplicates='drop')
+str_ros = df.groupby('str_level', observed=True).agg({
+    'ros_bias_pct': ['mean', 'count'],
+    'real_str': 'mean'
+}).round(2)
+str_ros.columns = ['ROS偏差均值%', '样本数', '真实STR均值']
+print(str_ros.to_string())
+
+# =============================================================================
+# Part 9: 综合结论
+# =============================================================================
+print("\n" + "=" * 100)
+print("Part 9: 综合结论")
+print("=" * 100)
+
+# 找出问题最严重的场景
+worst_page = page_corr_df.loc[page_corr_df['problem_pct'].idxmax(), 'page'] if len(page_corr_df) > 0 else "N/A"
+
+print(f"""
+【数据基础】
+  - 分析时间: {df['dt'].min()} ~ {df['dt'].max()} ({df['dt'].nunique()}天)
+  - 分析样本: {len(df):,} 条记录,{df['vid'].nunique():,} 个视频
+  - 场景覆盖: {df['page'].nunique()} 个场景
+
+【COPC分析】
+  - ROS 整体 COPC = {ros_copc:.3f} ({'预估整体偏低' if ros_copc > 1 else '预估整体偏高'})
+  - STR 整体 COPC = {str_copc:.3f}
+
+【回归均值问题】(核心问题)
+  - 真实ROS < 2: 预估偏高 {df[df['real_ros'] < 2]['ros_bias_pct'].mean():+.0f}% ({len(df[df['real_ros'] < 2])} 样本)
+  - 真实ROS 2-4: 预估偏高 {df[(df['real_ros'] >= 2) & (df['real_ros'] < 4)]['ros_bias_pct'].mean():+.0f}%
+  - 真实ROS > 6: 预估偏低 {df[df['real_ros'] > 6]['ros_bias_pct'].mean():+.0f}% ({len(df[df['real_ros'] > 6])} 样本)
+
+【排序效果】
+  - 预估排名与真实排名相关系数: {metrics_df['corr'].mean():.2f} (±{metrics_df['corr'].std():.2f})
+  - 排序失准比例: {metrics_df['problem_pct'].mean():.1f}% (最高{metrics_df['problem_pct'].max():.1f}%)
+  - NDCG近似: {metrics_df['ndcg_approx'].mean():.2f}
+
+【分场景差异】
+  - 问题最严重场景: {worst_page}
+  - 场景间COPC差异: {page_stats['ROS_COPC'].max():.2f} ~ {page_stats['ROS_COPC'].min():.2f}
+
+【问题视频特征】
+  - 问题视频占比: {len(problem)/len(df)*100:.1f}%
+  - 问题视频ROS偏差: {problem['ros_bias_pct'].mean():+.1f}% (vs 正常视频 {normal['ros_bias_pct'].mean():+.1f}%)
+  - 问题视频中真实ROS<2占比: {(problem['real_ros'] < 2).mean()*100:.0f}% (vs 正常视频 {(normal['real_ros'] < 2).mean()*100:.0f}%)
+  - 高频问题视频(≥4次): {(vid_problem_count >= 4).sum()} 个
+
+【根本原因】
+  1. 回归均值问题:模型对极端值预估不准确
+     - 低ROS视频被高估 → 排名靠前 → 获得过多曝光
+     - 高ROS视频被低估 → 排名靠后 → 曝光不足
+  2. 场景差异:不同page的COPC差异明显,说明模型泛化性不足
+  3. 高频问题:部分视频反复出现在问题列表,可能存在系统性特征
+
+【量化影响】
+  - 问题视频占总曝光: {problem['total_exp'].sum() / df['total_exp'].sum() * 100:.1f}%
+  - 预估Top10实际应该在的位置: 真实排名均值={df[df['pred_rank_daily'] <= 0.1]['real_rank_daily'].mean()*100:.0f}%ile
+""")
+
+# =============================================================================
+# 保存分析结果
+# =============================================================================
+# 保存各类中间数据
+df.to_csv(output_dir / "v3_综合分析_全量.csv", index=False)
+metrics_df.to_csv(output_dir / "v3_每日指标.csv", index=False)
+page_stats.to_csv(output_dir / "v3_分场景统计.csv")
+ros_segment.to_csv(output_dir / "v3_ROS分段统计.csv")
+
+# 高频问题视频详情
+top_problem_vids = vid_problem_count.head(20)
+top_vid_details = []
+for vid, cnt in top_problem_vids.items():
+    vid_data = df[df['vid'] == vid]
+    top_vid_details.append({
+        'vid': vid,
+        '问题出现次数': cnt,
+        '总出现天数': len(vid_data),
+        '问题比例': cnt / len(vid_data) * 100,
+        '预估ROS均值': vid_data['pred_ros'].mean(),
+        '真实ROS均值': vid_data['real_ros'].mean(),
+        'ROS偏差均值': vid_data['ros_bias_pct'].mean(),
+        '总曝光': vid_data['total_exp'].sum(),
+        '主要场景': vid_data['page'].mode().iloc[0] if len(vid_data) > 0 else ''
+    })
+pd.DataFrame(top_vid_details).to_csv(output_dir / "v3_高频问题视频.csv", index=False)
+
+print(f"\n数据已保存到 output/ 目录")
+print(f"  - v3_综合分析_全量.csv")
+print(f"  - v3_每日指标.csv")
+print(f"  - v3_分场景统计.csv")
+print(f"  - v3_ROS分段统计.csv")
+print(f"  - v3_高频问题视频.csv")