Просмотр исходного кода

feat(人群品类曝光分析): 新增头部品类分析可视化

- 新增头部品类→推荐品类矩阵热力图
- 新增头部品类下钻表格,支持各人群Top N对比
- 支持按日期、人群、头部品类筛选
- 品类hover高亮跨人群相同品类
- 日期切换和自动播放功能
- run_sql.py输出文件名添加SQL文件前缀

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 2 месяцев назад
Родитель
Сommit
27ddc983ff
23 измененных файлов с 2384 добавлено и 4 удалено
  1. 7 4
      run_sql.py
  2. BIN
      tasks/人群品类曝光分析/.DS_Store
  3. 163 0
      tasks/人群品类曝光分析/query.sql
  4. BIN
      tasks/人群品类曝光分析/头部关联分析/.DS_Store
  5. 145 0
      tasks/人群品类曝光分析/头部关联分析/query.sql
  6. 141 0
      tasks/人群品类曝光分析/头部关联分析/query_v2_放宽条件.sql
  7. 81 0
      tasks/人群品类曝光分析/头部关联分析/query_关联率对比.sql
  8. BIN
      tasks/人群品类曝光分析/头部品类分析/.DS_Store
  9. 133 0
      tasks/人群品类曝光分析/头部品类分析/query.sql
  10. 829 0
      tasks/人群品类曝光分析/头部品类分析/visualize.py
  11. BIN
      tasks/人群品类曝光分析/头部品类分析_过滤小量/.DS_Store
  12. 135 0
      tasks/人群品类曝光分析/头部品类分析_过滤小量/query.sql
  13. 103 0
      tasks/人群品类曝光分析/数据膨胀排查/query.sql
  14. 70 0
      tasks/人群品类曝光分析/数据膨胀排查/query_v10_关联率排查.sql
  15. 59 0
      tasks/人群品类曝光分析/数据膨胀排查/query_v11_放宽条件.sql
  16. 86 0
      tasks/人群品类曝光分析/数据膨胀排查/query_v2.sql
  17. 17 0
      tasks/人群品类曝光分析/数据膨胀排查/query_v3.sql
  18. 53 0
      tasks/人群品类曝光分析/数据膨胀排查/query_v4.sql
  19. 48 0
      tasks/人群品类曝光分析/数据膨胀排查/query_v5.sql
  20. 72 0
      tasks/人群品类曝光分析/数据膨胀排查/query_v6.sql
  21. 80 0
      tasks/人群品类曝光分析/数据膨胀排查/query_v7.sql
  22. 83 0
      tasks/人群品类曝光分析/数据膨胀排查/query_v8.sql
  23. 79 0
      tasks/人群品类曝光分析/数据膨胀排查/query_v9.sql

+ 7 - 4
run_sql.py

@@ -64,16 +64,19 @@ def run_sql(sql_file: str, output_file: str = None, variables: dict = None,
     if end:
         variables['end'] = end
 
-    # 输出目录:SQL 同目录下的 output/;文件名:日期.csv
+    # 输出目录:SQL 同目录下的 output/;文件名:[sql前缀_]日期.csv
     if output_file is None:
         output_dir = sql_path.parent / "output"
         output_dir.mkdir(exist_ok=True)
+        # SQL 文件名作为前缀
+        sql_stem = sql_path.stem  # 去掉 .sql 后缀
+        prefix = f"{sql_stem}_"
         if start and end:
-            output_file = output_dir / f"{start}_{end}.csv"
+            output_file = output_dir / f"{prefix}{start}_{end}.csv"
         elif start:
-            output_file = output_dir / f"{start}.csv"
+            output_file = output_dir / f"{prefix}{start}.csv"
         else:
-            output_file = output_dir / "result.csv"
+            output_file = output_dir / f"{prefix}result.csv"
     else:
         output_file = Path(output_file)
 

BIN
tasks/人群品类曝光分析/.DS_Store


+ 163 - 0
tasks/人群品类曝光分析/query.sql

@@ -0,0 +1,163 @@
+-- Growth-crowd category exposure analysis.
+-- Compares how three crowds (internal / external layer 0 / external fission) respond to
+-- each category: grouped by dt, crowd and cate2, keeping the top 20 categories by exposure.
+WITH t_base AS
+(
+    -- Recommendation-page exposures, each tagged with a channel label derived from rootsourceid.
+    SELECT  dt
+            ,apptype
+            ,abcode
+            ,CASE   WHEN rootSourceId REGEXP 'longArticles_' AND b.root_source_id IS NULL THEN '公众号代运营-Daily'
+                    WHEN rootSourceId REGEXP 'longArticles_' AND b.root_source_id IS NOT NULL THEN '公众号买号'
+                    WHEN rootSourceId REGEXP 'dyyjs_' THEN '公众号代运营-即转'
+                    WHEN rootSourceId REGEXP 'touliu_tencent_' THEN '小程序投流'
+                    WHEN rootSourceId REGEXP 'touliu_tencentgzh_|touliu_tencentGzhArticle_|GzhTouLiu_Articles_gh' THEN '公众号投流'
+                    WHEN rootSourceId REGEXP 'touliu_tencentqw_|WeCom_' THEN '测-企微投放'
+                    WHEN rootSourceId REGEXP 'touliu_tencentwbqw_|dyyqw_' THEN '测-企微合作'
+                    WHEN rootSourceId REGEXP 'gzhhz_' THEN '停-公众号合作'
+                    WHEN rootSourceId REGEXP 'daitou_tencentgzh|DaiTou_gh' THEN '测-公众号完全代投放'
+                    WHEN rootSourceId = '' OR rootSourceId IS NULL THEN '内部'
+                    ELSE 'other'
+            END AS channel
+            ,page
+            ,page_rec
+            ,layer
+            ,mid
+            ,vid
+            ,is_share
+            ,share_cnt
+            ,is_return_1
+            ,is_return_n
+            ,return_1_uv
+            ,return_n_uv
+            ,return_n_uv_noself
+            ,new_exposure_cnt
+            ,flowpool
+    FROM    (
+                -- Only the columns the outer query reads are projected here.
+                SELECT  dt
+                        ,apptype
+                        ,rootsourceid
+                        ,mid
+                        ,vid
+                        ,is_share
+                        ,share_cnt
+                        ,is_return_1
+                        ,is_return_n
+                        ,return_1_uv
+                        ,return_n_uv
+                        ,return_n_uv_noself
+                        ,new_exposure_cnt
+                        -- share depth extracted from the extend JSON (GET_JSON_OBJECT returns a string)
+                        ,GET_JSON_OBJECT(extend,"$.extParams.userShareDepth") AS layer
+                        ,page
+                        ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                                WHEN page IN ("回流页","其他") THEN "非推荐"
+                                ELSE "其他"
+                        END AS page_rec
+                        ,abcode
+                        ,flowpool
+                FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+                WHERE   dt BETWEEN "${start}" AND "${end}"
+                AND     apptype IN ('4','0')
+                AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+                -- whitelist only: "ab100" is not in this list, so no separate NOT IN exclusion is needed
+                AND     abcode IN ("ab0","ab1","ab2","ab5","ab6","ab7","ab3","ab4","ab8","ab9")
+            ) a
+    LEFT JOIN   (
+                    -- root_source_ids in the '公众号买号' group, read from the latest partition and de-duplicated
+                    SELECT  root_source_id
+                    FROM    loghubods.changwen_rootsourceid_group_hour
+                    WHERE   dt = MAX_PT('loghubods.changwen_rootsourceid_group_hour')
+                    AND     group_name = '公众号买号'
+                    GROUP BY root_source_id
+                ) b
+    ON      a.rootsourceid = b.root_source_id
+    -- keep recommendation pages only
+    WHERE   a.page_rec = "推荐"
+)
+,t_base_data AS
+(
+    -- Assigns every exposure to a crowd and attaches the exposed video's latest category features.
+    SELECT  ta.dt
+            ,ta.in_out
+            ,ta.vid
+            ,ta.share_cnt
+            ,ta.return_n_uv
+            ,ta.new_exposure_cnt
+            -- a video with no feature row in the window lands in the same "unknown" bucket
+            -- as a video whose feature JSON lacks the key, instead of a NULL category
+            ,COALESCE(tb.merge_cate2,"unknown") AS merge_cate2
+            ,COALESCE(tb.title_time_w_h_unionid,"unknown") AS title_time_w_h_unionid
+    FROM    (
+                SELECT  dt
+                        ,CASE   WHEN channel = "内部" THEN "内部"
+                                -- layer is a JSON-extracted string: compare it as text and cast
+                                -- explicitly rather than relying on implicit string/number conversion
+                                WHEN layer = "0" THEN "外部0层"
+                                WHEN CAST(layer AS INT) > 0 THEN "外部裂变"
+                                ELSE "其他"
+                        END AS in_out
+                        ,vid
+                        ,share_cnt
+                        ,return_n_uv
+                        ,new_exposure_cnt
+                FROM    t_base
+            ) ta
+    LEFT JOIN   (
+                    SELECT  vid
+                            ,title_time_w_h_unionid
+                            ,merge_cate2
+                    FROM    (
+                                SELECT  vid
+                                        ,COALESCE(GET_JSON_OBJECT(feature,"$.title_time_w_h_unionid"),"unknown") AS title_time_w_h_unionid
+                                        ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS merge_cate2
+                                        -- rn = 1 is the most recent (dt, hh) snapshot of each video
+                                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC,hh DESC ) AS rn
+                                FROM    loghubods.alg_vid_feature_basic_info
+                                -- NOTE(review): filtering on CONCAT(dt,hh) may prevent partition pruning;
+                                -- if hh is always two digits this equals dt BETWEEN start AND end - confirm
+                                WHERE   CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${end}","23")
+                            )
+                    WHERE   rn = 1
+                ) tb
+    ON      ta.vid = tb.vid
+)
+,t_base_data_v1 AS
+(
+    -- Totals per day, crowd and second-level category; exp is the number of input rows.
+    SELECT  dt
+            ,in_out
+            ,merge_cate2
+            ,COUNT(1) AS exp
+            ,SUM(share_cnt) AS share_cnt
+            ,SUM(return_n_uv) AS return_n_uv
+            ,SUM(new_exposure_cnt) AS new_exposure_cnt
+    FROM    t_base_data
+    GROUP BY dt, in_out, merge_cate2
+)
+,t_base_data_v2 AS
+(
+    -- Ratio metrics per group plus an exposure rank inside each (dt, crowd).
+    SELECT  dt
+            ,in_out
+            ,merge_cate2
+            ,exp
+            ,share_cnt
+            ,return_n_uv
+            ,new_exposure_cnt
+            -- NULLIF turns a zero denominator into NULL so each ratio falls back to 0
+            -- without depending on the engine's divide-by-zero behaviour
+            ,round(COALESCE(share_cnt / NULLIF(exp,0),0),4) AS str
+            ,round(COALESCE(return_n_uv / NULLIF(share_cnt,0),0),4) AS ros
+            ,round(COALESCE(return_n_uv / NULLIF(exp,0),0),4) AS rovn
+            ,round(COALESCE(new_exposure_cnt / NULLIF(exp,0),0),4) AS vov
+            -- merge_cate2 breaks exposure ties so the top-20 cut is reproducible across runs
+            ,ROW_NUMBER() OVER (PARTITION BY dt,in_out ORDER BY exp DESC,merge_cate2 ) AS rn
+    FROM    t_base_data_v1
+)
+-- Top 20 categories by exposure per day and crowd; the catch-all crowd "其他" is excluded.
+-- Columns are listed explicitly so the CSV layout does not shift if the CTE changes.
+SELECT  dt
+        ,in_out
+        ,merge_cate2
+        ,exp
+        ,share_cnt
+        ,return_n_uv
+        ,new_exposure_cnt
+        ,str
+        ,ros
+        ,rovn
+        ,vov
+        ,rn
+FROM    t_base_data_v2
+WHERE   rn <= 20
+AND     in_out <> "其他"
+ORDER BY dt DESC,in_out,exp DESC
+;

BIN
tasks/人群品类曝光分析/头部关联分析/.DS_Store


+ 145 - 0
tasks/人群品类曝光分析/头部关联分析/query.sql

@@ -0,0 +1,145 @@
+-- Joins recommendation exposures to the head-video table to analyse how the head video's
+-- category relates to the recommended exposures.
+-- Matching on subsessionid + headvideoid + a time condition avoids row inflation.
+WITH t_head AS (
+    -- Head-video table: one row per head-video click, with its click time in epoch seconds
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,rootsourceid
+            -- an empty or missing rootsourceid marks internal traffic
+            ,CASE WHEN rootsourceid IS NULL OR rootsourceid = '' THEN '内部' ELSE '外部' END AS head_in_out
+            ,videoid AS head_vid
+            ,UNIX_TIMESTAMP(`点击时间`) AS click_ts
+            ,`merge一级品类` AS head_cate1
+            ,`merge二级品类` AS head_cate2
+            ,channel AS head_channel
+    FROM    loghubods.opengid_base_data
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+)
+,t_rec AS (
+    -- Exposure table: one row per exposure, flagged internal/external and recommendation/non-recommendation
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,headvideoid
+            ,ts
+            ,rootsourceid
+            -- an empty or missing rootsourceid marks internal traffic
+            ,CASE WHEN rootsourceid IS NULL OR rootsourceid = '' THEN '内部' ELSE '外部' END AS rec_in_out
+            ,vid AS rec_vid
+            ,page
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_rec
+            ,is_share
+            ,share_cnt
+            ,return_n_uv
+            ,new_exposure_cnt
+            -- share depth extracted from the extend JSON
+            ,GET_JSON_OBJECT(extend,"$.extParams.userShareDepth") AS layer
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+)
+,t_joined_raw AS (
+    -- Joins on subsessionid + headvideoid + a time condition.
+    -- Only head clicks at or before the exposure qualify; rn = 1 is the latest of them,
+    -- leaving one row per (dt, mid, subsessionid, rec_vid, ts).
+    SELECT  r.dt
+            ,r.mid
+            ,r.subsessionid
+            ,r.rec_vid
+            ,r.ts
+            ,r.rec_in_out
+            ,r.page
+            ,r.page_rec
+            ,r.share_cnt
+            ,r.return_n_uv
+            ,r.new_exposure_cnt
+            ,r.layer
+            ,h.head_in_out
+            ,h.head_vid
+            ,h.head_cate1
+            ,h.head_cate2
+            ,h.head_channel
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.dt, r.mid, r.subsessionid, r.rec_vid, r.ts
+                ORDER BY h.click_ts DESC  -- click closest to the exposure time first
+            ) AS rn
+    FROM    t_rec r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt
+    AND     r.mid = h.mid
+    AND     r.subsessionid = h.subsessionid
+    AND     r.headvideoid = h.head_vid
+    -- NOTE(review): click_ts is epoch seconds; assumes r.ts uses the same unit - confirm
+    AND     CAST(r.ts AS BIGINT) >= h.click_ts  -- click time <= exposure time
+    WHERE   r.page_rec = '推荐'
+)
+,t_joined AS (
+    -- One row per exposure (rn = 1) with its crowd label and normalised head category.
+    SELECT  dt
+            ,COALESCE(head_in_out, rec_in_out) AS in_out
+            ,CASE   WHEN COALESCE(head_in_out, rec_in_out) = '内部' THEN '内部'
+                    -- layer is a JSON-extracted string: compare as text and cast explicitly,
+                    -- the same way the v2 and 头部品类分析 queries classify crowds
+                    WHEN layer = '0' THEN '外部0层'
+                    WHEN CAST(layer AS INT) > 0 THEN '外部裂变'
+                    ELSE '其他'
+            END AS crowd
+            ,CASE   WHEN head_vid IS NULL THEN '未关联'
+                    ELSE '已关联'
+            END AS head_joined
+            ,head_vid
+            ,head_cate1
+            -- separate "no head video matched" from "matched but category missing"
+            ,CASE   WHEN head_vid IS NULL THEN '未关联头部'
+                    WHEN head_cate2 IS NULL OR head_cate2 = '' THEN 'unknown'
+                    ELSE head_cate2
+            END AS head_cate2
+            ,head_channel
+            ,rec_vid
+            ,page
+            ,page_rec
+            ,share_cnt
+            ,return_n_uv
+            ,new_exposure_cnt
+    FROM    t_joined_raw
+    WHERE   rn = 1
+)
+,t_vid_info AS (
+    -- Second-level category of each recommended video, taken from its latest (dt, hh) feature snapshot
+    SELECT  vid
+            ,rec_cate2
+    FROM    (
+                SELECT  vid
+                        ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS rec_cate2
+                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC,hh DESC ) AS rn
+                FROM    loghubods.alg_vid_feature_basic_info
+                WHERE   CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${end}","23")
+            )
+    WHERE   rn = 1
+)
+,t_final AS (
+    -- Exposure count and engagement sums per day, crowd, head category and recommended category.
+    SELECT  a.dt
+            ,a.crowd
+            ,a.head_cate2
+            -- a recommended video with no row in t_vid_info would otherwise yield a NULL category,
+            -- which is easily dropped by downstream group-bys; bucket it as "unknown" instead
+            ,COALESCE(b.rec_cate2,"unknown") AS rec_cate2
+            ,SUM(1) AS exp
+            ,SUM(a.share_cnt) AS share_cnt
+            ,SUM(a.return_n_uv) AS return_n_uv
+            ,SUM(a.new_exposure_cnt) AS new_exposure_cnt
+    FROM    t_joined a
+    LEFT JOIN t_vid_info b ON a.rec_vid = b.vid
+    GROUP BY a.dt, a.crowd, a.head_cate2, COALESCE(b.rec_cate2,"unknown")
+)
+-- Final report: rank every (head category, recommended category) pair by exposure within
+-- each day and crowd, and derive the ratio metrics. The catch-all crowd '其他' is excluded.
+SELECT  dt
+        ,crowd
+        -- category columns break exposure ties so rn is reproducible across runs
+        ,ROW_NUMBER() OVER (PARTITION BY dt, crowd ORDER BY exp DESC, head_cate2, rec_cate2) AS rn
+        ,head_cate2
+        ,rec_cate2
+        ,exp
+        ,share_cnt
+        ,return_n_uv
+        ,new_exposure_cnt
+        -- NULLIF turns a zero denominator into NULL so each ratio falls back to 0
+        ,round(COALESCE(share_cnt / NULLIF(exp,0),0),4) AS str
+        ,round(COALESCE(return_n_uv / NULLIF(share_cnt,0),0),4) AS ros
+        ,round(COALESCE(return_n_uv / NULLIF(exp,0),0),4) AS rovn
+        ,round(COALESCE(new_exposure_cnt / NULLIF(exp,0),0),4) AS vov
+FROM    t_final
+WHERE   crowd <> '其他'
+ORDER BY dt DESC, crowd, rn
+;

+ 141 - 0
tasks/人群品类曝光分析/头部关联分析/query_v2_放宽条件.sql

@@ -0,0 +1,141 @@
+-- Joins recommendation exposures to the head-video table (relaxed version).
+-- Matches on mid + subsessionid + a time condition; headvideoid is not required to match.
+WITH t_head AS (
+    -- Head-video table: one row per head-video click, with its click time in epoch seconds
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,rootsourceid
+            -- an empty or missing rootsourceid marks internal traffic
+            ,CASE WHEN rootsourceid IS NULL OR rootsourceid = '' THEN '内部' ELSE '外部' END AS head_in_out
+            ,videoid AS head_vid
+            ,UNIX_TIMESTAMP(`点击时间`) AS click_ts
+            ,`merge一级品类` AS head_cate1
+            ,`merge二级品类` AS head_cate2
+            ,channel AS head_channel
+    FROM    loghubods.opengid_base_data
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+)
+,t_rec AS (
+    -- Exposure table: one row per exposure, flagged internal/external and recommendation/non-recommendation
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,headvideoid
+            ,ts
+            ,rootsourceid
+            -- an empty or missing rootsourceid marks internal traffic
+            ,CASE WHEN rootsourceid IS NULL OR rootsourceid = '' THEN '内部' ELSE '外部' END AS rec_in_out
+            ,vid AS rec_vid
+            ,page
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_rec
+            ,is_share
+            ,share_cnt
+            ,return_n_uv
+            ,new_exposure_cnt
+            -- share depth extracted from the extend JSON
+            ,GET_JSON_OBJECT(extend,"$.extParams.userShareDepth") AS layer
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+)
+,t_joined_raw AS (
+    -- Relaxed match: mid + subsessionid + time, with no headvideoid = head_vid requirement.
+    -- rn = 1 is the latest qualifying head click, leaving one row per
+    -- (dt, mid, subsessionid, rec_vid, ts).
+    SELECT  r.dt
+            ,r.mid
+            ,r.subsessionid
+            ,r.rec_vid
+            ,r.ts
+            ,r.rec_in_out
+            ,r.page
+            ,r.page_rec
+            ,r.share_cnt
+            ,r.return_n_uv
+            ,r.new_exposure_cnt
+            ,r.layer
+            ,h.head_in_out
+            ,h.head_vid
+            ,h.head_cate1
+            ,h.head_cate2
+            ,h.head_channel
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.dt, r.mid, r.subsessionid, r.rec_vid, r.ts
+                ORDER BY h.click_ts DESC  -- click closest to the exposure time first
+            ) AS rn
+    FROM    t_rec r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt
+    AND     r.mid = h.mid
+    AND     r.subsessionid = h.subsessionid
+    -- NOTE(review): click_ts is epoch seconds; assumes r.ts uses the same unit - confirm
+    AND     CAST(r.ts AS BIGINT) >= h.click_ts  -- click time <= exposure time
+    WHERE   r.page_rec = '推荐'
+)
+,t_joined AS (
+    -- One row per exposure (rn = 1) with its crowd label and normalised head category.
+    SELECT  dt
+            ,COALESCE(head_in_out, rec_in_out) AS in_out
+            ,CASE   WHEN COALESCE(head_in_out, rec_in_out) = '内部' THEN '内部'
+                    WHEN layer = '0' THEN '外部0层'
+                    WHEN CAST(layer AS INT) > 0 THEN '外部裂变'
+                    ELSE '其他'
+            END AS crowd
+            ,CASE   WHEN head_vid IS NOT NULL THEN '已关联'
+                    ELSE '未关联'
+            END AS head_joined
+            ,head_vid
+            ,head_cate1
+            -- separate "no head video matched" from "matched but category missing"
+            ,CASE   WHEN head_vid IS NULL THEN '未关联头部'
+                    WHEN head_cate2 = '' OR head_cate2 IS NULL THEN 'unknown'
+                    ELSE head_cate2
+            END AS head_cate2
+            ,head_channel
+            ,rec_vid
+            ,page
+            ,page_rec
+            ,share_cnt
+            ,return_n_uv
+            ,new_exposure_cnt
+    FROM    t_joined_raw
+    WHERE   rn = 1
+)
+,t_vid_info AS (
+    -- Second-level category of each recommended video, taken from its latest (dt, hh) feature snapshot
+    SELECT  vid
+            ,rec_cate2
+    FROM    (
+                SELECT  vid
+                        ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS rec_cate2
+                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC,hh DESC ) AS rn
+                FROM    loghubods.alg_vid_feature_basic_info
+                WHERE   CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${end}","23")
+            )
+    WHERE   rn = 1
+)
+,t_final AS (
+    -- Exposure count and engagement sums per day, crowd, head category and recommended category.
+    SELECT  a.dt
+            ,a.crowd
+            ,a.head_cate2
+            -- a recommended video with no row in t_vid_info would otherwise yield a NULL category,
+            -- which is easily dropped by downstream group-bys; bucket it as "unknown" instead
+            ,COALESCE(b.rec_cate2,"unknown") AS rec_cate2
+            ,SUM(1) AS exp
+            ,SUM(a.share_cnt) AS share_cnt
+            ,SUM(a.return_n_uv) AS return_n_uv
+            ,SUM(a.new_exposure_cnt) AS new_exposure_cnt
+    FROM    t_joined a
+    LEFT JOIN t_vid_info b ON a.rec_vid = b.vid
+    GROUP BY a.dt, a.crowd, a.head_cate2, COALESCE(b.rec_cate2,"unknown")
+)
+-- Final report: rank every (head category, recommended category) pair by exposure within
+-- each day and crowd, and derive the ratio metrics. The catch-all crowd '其他' is excluded.
+SELECT  dt
+        ,crowd
+        -- category columns break exposure ties so rn is reproducible across runs
+        ,ROW_NUMBER() OVER (PARTITION BY dt, crowd ORDER BY exp DESC, head_cate2, rec_cate2) AS rn
+        ,head_cate2
+        ,rec_cate2
+        ,exp
+        ,share_cnt
+        ,return_n_uv
+        ,new_exposure_cnt
+        -- NULLIF turns a zero denominator into NULL so each ratio falls back to 0
+        ,round(COALESCE(share_cnt / NULLIF(exp,0),0),4) AS str
+        ,round(COALESCE(return_n_uv / NULLIF(share_cnt,0),0),4) AS ros
+        ,round(COALESCE(return_n_uv / NULLIF(exp,0),0),4) AS rovn
+        ,round(COALESCE(new_exposure_cnt / NULLIF(exp,0),0),4) AS vov
+FROM    t_final
+WHERE   crowd <> '其他'
+ORDER BY dt DESC, crowd, rn
+;

+ 81 - 0
tasks/人群品类曝光分析/头部关联分析/query_关联率对比.sql

@@ -0,0 +1,81 @@
+-- Compares the head-video match rate of v1 (strict) against v2 (relaxed).
+WITH t_head AS (
+    -- Head-video clicks with the click time converted to epoch seconds
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,videoid AS head_vid
+            ,UNIX_TIMESTAMP(`点击时间`) AS click_ts
+    FROM    loghubods.opengid_base_data
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+)
+,t_rec AS (
+    -- Exposures on the four recommendation pages, with internal/external flag and share depth
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,headvideoid
+            ,vid
+            ,ts
+            ,rootsourceid
+            ,CASE WHEN rootsourceid IS NULL OR rootsourceid = '' THEN '内部' ELSE '外部' END AS in_out
+            ,GET_JSON_OBJECT(extend,"$.extParams.userShareDepth") AS layer
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+)
+,t_rec_with_crowd AS (
+    -- Every t_rec column plus the crowd label derived from in_out and the share depth
+    SELECT  r.dt
+            ,r.mid
+            ,r.subsessionid
+            ,r.headvideoid
+            ,r.vid
+            ,r.ts
+            ,r.rootsourceid
+            ,r.in_out
+            ,r.layer
+            ,CASE   WHEN r.in_out = '内部' THEN '内部'
+                    WHEN r.layer = '0' THEN '外部0层'
+                    WHEN CAST(r.layer AS INT) > 0 THEN '外部裂变'
+                    ELSE '其他'
+            END AS crowd
+    FROM    t_rec r
+)
+-- v1: strict match (mid + subsessionid + headvideoid + time)
+,t_v1 AS (
+    -- rn = 1 keeps, per exposure, the latest head click at or before the exposure time.
+    -- dt is part of the dedup key so the key matches the one used by the head-join
+    -- analysis queries whose match rate this comparison measures.
+    SELECT  r.dt, r.crowd, r.mid, r.subsessionid, r.vid, r.ts
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.dt, r.mid, r.subsessionid, r.vid, r.ts
+                ORDER BY h.click_ts DESC
+            ) AS rn
+            ,h.head_vid
+    FROM    t_rec_with_crowd r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt AND r.mid = h.mid AND r.subsessionid = h.subsessionid
+    AND     r.headvideoid = h.head_vid
+    AND     h.click_ts <= CAST(r.ts AS BIGINT)
+    WHERE   r.crowd <> '其他'
+)
+,t_v1_stats AS (
+    -- Per crowd: exposures kept after dedup (rn = 1) and how many of them found a head video
+    SELECT  crowd
+            ,'v1_严格' AS version
+            ,COUNT(1) AS exp
+            ,COUNT(head_vid) AS matched  -- COUNT(col) counts only non-NULL head_vid
+    FROM    t_v1
+    WHERE   rn = 1
+    GROUP BY crowd
+)
+-- v2: relaxed match (mid + subsessionid + time, no headvideoid condition)
+,t_v2 AS (
+    -- rn = 1 keeps, per exposure, the latest head click at or before the exposure time.
+    -- dt is part of the dedup key so the key matches the one used by the head-join
+    -- analysis queries whose match rate this comparison measures.
+    SELECT  r.dt, r.crowd, r.mid, r.subsessionid, r.vid, r.ts
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.dt, r.mid, r.subsessionid, r.vid, r.ts
+                ORDER BY h.click_ts DESC
+            ) AS rn
+            ,h.head_vid
+    FROM    t_rec_with_crowd r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt AND r.mid = h.mid AND r.subsessionid = h.subsessionid
+    AND     h.click_ts <= CAST(r.ts AS BIGINT)
+    WHERE   r.crowd <> '其他'
+)
+,t_v2_stats AS (
+    -- Per crowd: exposures kept after dedup (rn = 1) and how many of them found a head video
+    SELECT  crowd
+            ,'v2_放宽' AS version
+            ,COUNT(1) AS exp
+            ,COUNT(head_vid) AS matched  -- COUNT(col) counts only non-NULL head_vid
+    FROM    t_v2
+    WHERE   rn = 1
+    GROUP BY crowd
+)
+
+-- Match rate per crowd and version. The UNION ALL is wrapped in a subquery so the
+-- ORDER BY always sorts the combined result rather than, depending on session
+-- settings, only the last branch of the union.
+SELECT  crowd, version, exp, matched
+        ,ROUND(matched / exp, 4) AS match_rate
+FROM    (
+            SELECT  crowd, version, exp, matched
+            FROM    t_v1_stats
+            UNION ALL
+            SELECT  crowd, version, exp, matched
+            FROM    t_v2_stats
+        ) t
+ORDER BY crowd, version
+;

BIN
tasks/人群品类曝光分析/头部品类分析/.DS_Store


+ 133 - 0
tasks/人群品类曝光分析/头部品类分析/query.sql

@@ -0,0 +1,133 @@
+-- Joins recommendation exposures to the head-video table to analyse how the head video's
+-- category relates to the recommended exposures.
+-- Matches on subsessionid + headvideoid + a time condition.
+WITH t_head AS (
+    -- Head-video table: one row per head-video click, with its click time in epoch seconds
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,rootsourceid
+            -- an empty or missing rootsourceid marks internal traffic
+            ,CASE WHEN rootsourceid IS NULL OR rootsourceid = '' THEN '内部' ELSE '外部' END AS head_in_out
+            ,videoid AS head_vid
+            ,UNIX_TIMESTAMP(`点击时间`) AS click_ts
+            ,`merge一级品类` AS head_cate1
+            ,`merge二级品类` AS head_cate2
+            ,channel AS head_channel
+    FROM    loghubods.opengid_base_data
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+)
+,t_rec AS (
+    -- Exposure table: one row per exposure, flagged internal/external and recommendation/non-recommendation
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,headvideoid
+            ,ts
+            ,rootsourceid
+            -- an empty or missing rootsourceid marks internal traffic
+            ,CASE WHEN rootsourceid IS NULL OR rootsourceid = '' THEN '内部' ELSE '外部' END AS rec_in_out
+            ,vid AS rec_vid
+            ,page
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_rec
+            ,share_cnt
+            ,return_n_uv
+            ,new_exposure_cnt
+            -- share depth extracted from the extend JSON
+            ,GET_JSON_OBJECT(extend,"$.extParams.userShareDepth") AS layer
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+)
+,t_joined_raw AS (
+    -- Attaches to each recommendation exposure the head clicks from the same dt, mid and
+    -- subsessionid whose video equals headvideoid and whose click is not after the exposure.
+    -- rn = 1 is the latest such click, leaving one row per (dt, mid, subsessionid, rec_vid, ts).
+    SELECT  r.dt
+            ,r.mid
+            ,r.subsessionid
+            ,r.rec_vid
+            ,r.ts
+            ,r.rec_in_out
+            ,r.page
+            ,r.page_rec
+            ,r.share_cnt
+            ,r.return_n_uv
+            ,r.new_exposure_cnt
+            ,r.layer
+            ,h.head_in_out
+            ,h.head_vid
+            ,h.head_cate1
+            ,h.head_cate2
+            ,h.head_channel
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.dt, r.mid, r.subsessionid, r.rec_vid, r.ts
+                ORDER BY h.click_ts DESC
+            ) AS rn
+    FROM    t_rec r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt
+    AND     r.mid = h.mid
+    AND     r.subsessionid = h.subsessionid
+    AND     r.headvideoid = h.head_vid
+    -- NOTE(review): click_ts is epoch seconds; assumes r.ts uses the same unit - confirm
+    AND     CAST(r.ts AS BIGINT) >= h.click_ts
+    WHERE   r.page_rec = '推荐'
+)
+,t_joined AS (
+    -- One row per exposure (rn = 1) with its crowd label and normalised head category.
+    SELECT  dt
+            ,COALESCE(head_in_out, rec_in_out) AS in_out
+            ,CASE   WHEN COALESCE(head_in_out, rec_in_out) = '内部' THEN '内部'
+                    WHEN layer = '0' THEN '外部0层'
+                    WHEN CAST(layer AS INT) > 0 THEN '外部裂变'
+                    ELSE '其他'
+            END AS crowd
+            ,head_vid
+            ,head_cate1
+            -- separate "no head video matched" from "matched but category missing"
+            ,CASE   WHEN head_vid IS NULL THEN '未关联头部'
+                    WHEN head_cate2 = '' OR head_cate2 IS NULL THEN 'unknown'
+                    ELSE head_cate2
+            END AS head_cate2
+            ,head_channel
+            ,rec_vid
+            ,share_cnt
+            ,return_n_uv
+            ,new_exposure_cnt
+    FROM    t_joined_raw
+    WHERE   rn = 1
+)
+,t_vid_info AS (
+    -- Second-level category of each recommended video, taken from its latest (dt, hh) feature snapshot
+    SELECT  vid
+            ,rec_cate2
+    FROM    (
+                SELECT  vid
+                        ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS rec_cate2
+                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC,hh DESC ) AS rn
+                FROM    loghubods.alg_vid_feature_basic_info
+                WHERE   CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${end}","23")
+            )
+    WHERE   rn = 1
+)
+,t_final AS (
+    -- Exposure count and engagement sums per day, crowd, head category and recommended category.
+    SELECT  a.dt
+            ,a.crowd
+            ,a.head_cate2
+            -- a recommended video with no row in t_vid_info would otherwise yield a NULL category,
+            -- which is easily dropped by downstream group-bys; bucket it as "unknown" instead
+            ,COALESCE(b.rec_cate2,"unknown") AS rec_cate2
+            ,SUM(1) AS exp
+            ,SUM(a.share_cnt) AS share_cnt
+            ,SUM(a.return_n_uv) AS return_n_uv
+            ,SUM(a.new_exposure_cnt) AS new_exposure_cnt
+    FROM    t_joined a
+    LEFT JOIN t_vid_info b ON a.rec_vid = b.vid
+    GROUP BY a.dt, a.crowd, a.head_cate2, COALESCE(b.rec_cate2,"unknown")
+)
+-- Final report: ratio metrics per day, crowd, head category and recommended category.
+-- The catch-all crowd '其他' is excluded.
+SELECT  dt
+        ,crowd
+        ,head_cate2
+        ,rec_cate2
+        ,exp
+        ,share_cnt
+        ,return_n_uv
+        ,new_exposure_cnt
+        -- NULLIF turns a zero denominator into NULL so each ratio falls back to 0
+        ,round(COALESCE(share_cnt / NULLIF(exp,0),0),4) AS str
+        ,round(COALESCE(return_n_uv / NULLIF(share_cnt,0),0),4) AS ros
+        ,round(COALESCE(return_n_uv / NULLIF(exp,0),0),4) AS rovn
+        ,round(COALESCE(new_exposure_cnt / NULLIF(exp,0),0),4) AS vov
+FROM    t_final
+WHERE   crowd <> '其他'
+-- category columns break exposure ties so the row order is reproducible across runs
+ORDER BY dt DESC, crowd, exp DESC, head_cate2, rec_cate2
+;

+ 829 - 0
tasks/人群品类曝光分析/头部品类分析/visualize.py

@@ -0,0 +1,829 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+头部品类分析可视化
+Tab 1: Matrix - 头部品类 × 推荐品类矩阵
+Tab 2: Compare - Top 10 品类人群对比
+"""
+import pandas as pd
+import json
+from pathlib import Path
+
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+
+# 找到最新的原始数据文件
+csv_files = [f for f in output_dir.glob("query_*.csv")]
+if not csv_files:
+    print("没有找到数据文件,请先运行 query.sql")
+    exit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+df = pd.read_csv(latest_file)
+
+print(f"分析文件: {latest_file.name}")
+print(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+
+# 日期列表
+all_dates = sorted([str(d) for d in df['dt'].unique()])
+date_options = ['all'] + all_dates
+latest_date = all_dates[-1] if all_dates else 'all'
+print(f"日期数: {len(all_dates)}")
+
+# 人群列表
+crowd_list = ['内部', '外部0层', '外部裂变']
+print(f"人群: {crowd_list}")
+
+# 曝光阈值
+EXP_THRESHOLD = 1000
+
+# 计算人群×日期的矩阵数据
+def calc_matrix_data(crowd, date=None):
+    ch_df = df[df['crowd'] == crowd].copy()
+    if date and date != 'all':
+        ch_df = ch_df[ch_df['dt'].astype(str) == str(date)]
+    if len(ch_df) == 0:
+        return None
+
+    row_col = 'head_cate2'
+    col_col = 'rec_cate2'
+
+    matrix = ch_df.groupby([row_col, col_col]).agg({
+        'exp': 'sum',
+        'share_cnt': 'sum',
+        'return_n_uv': 'sum',
+        'new_exposure_cnt': 'sum',
+    }).reset_index()
+
+    matrix = matrix[matrix['exp'] >= EXP_THRESHOLD]
+    if len(matrix) == 0:
+        return None
+
+    matrix['str'] = matrix['share_cnt'] / (matrix['exp'] + 1)
+    matrix['ros'] = matrix['return_n_uv'] / (matrix['share_cnt'] + 1)
+    matrix['rovn'] = matrix['return_n_uv'] / (matrix['exp'] + 1)
+    matrix['vov'] = matrix['new_exposure_cnt'] / (matrix['exp'] + 1)
+
+    exp_pivot = matrix.pivot(index=row_col, columns=col_col, values='exp').fillna(0)
+    str_pivot = matrix.pivot(index=row_col, columns=col_col, values='str').fillna(0)
+    ros_pivot = matrix.pivot(index=row_col, columns=col_col, values='ros').fillna(0)
+    rovn_pivot = matrix.pivot(index=row_col, columns=col_col, values='rovn').fillna(0)
+    vov_pivot = matrix.pivot(index=row_col, columns=col_col, values='vov').fillna(0)
+
+    row_order = exp_pivot.sum(axis=1).sort_values(ascending=False).index.tolist()
+    col_order = exp_pivot.sum(axis=0).sort_values(ascending=False).index.tolist()
+
+    def to_dict(pivot, is_int=False):
+        return {str(r): {str(c): int(pivot.loc[r, c]) if is_int else round(float(pivot.loc[r, c]), 4) if c in pivot.columns else 0 for c in col_order} for r in row_order}
+
+    total_exp = int(ch_df['exp'].sum())
+    total_share = int(ch_df['share_cnt'].sum())
+    total_return = int(ch_df['return_n_uv'].sum())
+
+    return {
+        'rows': row_order,
+        'cols': col_order,
+        'exp': to_dict(exp_pivot, is_int=True),
+        'str': to_dict(str_pivot),
+        'ros': to_dict(ros_pivot),
+        'rovn': to_dict(rovn_pivot),
+        'vov': to_dict(vov_pivot),
+        'total_exp': total_exp,
+        'total_str': round(total_share / (total_exp + 1), 4),
+        'total_rovn': round(total_return / (total_exp + 1), 4),
+    }
+
+# 计算头部品类下钻数据:head_cate2 -> crowd -> rec_cate2
+def calc_head_drill_data(date=None):
+    ch_df = df.copy()
+    if date and date != 'all':
+        ch_df = ch_df[ch_df['dt'].astype(str) == str(date)]
+    if len(ch_df) == 0:
+        return None
+
+    # 按 head_cate2 + crowd + rec_cate2 聚合
+    agg = ch_df.groupby(['head_cate2', 'crowd', 'rec_cate2']).agg({
+        'exp': 'sum',
+        'share_cnt': 'sum',
+        'return_n_uv': 'sum',
+        'new_exposure_cnt': 'sum',
+    }).reset_index()
+
+    agg['str'] = agg['share_cnt'] / (agg['exp'] + 1)
+    agg['ros'] = agg['return_n_uv'] / (agg['share_cnt'] + 1)
+    agg['rovn'] = agg['return_n_uv'] / (agg['exp'] + 1)
+    agg['vov'] = agg['new_exposure_cnt'] / (agg['exp'] + 1)
+
+    # 构建嵌套字典: head_cate2 -> crowd -> {rec_cate2: metrics}
+    result = {}
+
+    # 添加 "all" 选项:不区分头部品类,按 crowd + rec_cate2 聚合
+    agg_all = ch_df.groupby(['crowd', 'rec_cate2']).agg({
+        'exp': 'sum',
+        'share_cnt': 'sum',
+        'return_n_uv': 'sum',
+        'new_exposure_cnt': 'sum',
+    }).reset_index()
+    agg_all['str'] = agg_all['share_cnt'] / (agg_all['exp'] + 1)
+    agg_all['ros'] = agg_all['return_n_uv'] / (agg_all['share_cnt'] + 1)
+    agg_all['rovn'] = agg_all['return_n_uv'] / (agg_all['exp'] + 1)
+    agg_all['vov'] = agg_all['new_exposure_cnt'] / (agg_all['exp'] + 1)
+
+    result['all'] = {}
+    for crowd in crowd_list:
+        crowd_df = agg_all[agg_all['crowd'] == crowd]
+        result['all'][crowd] = {}
+        for _, row in crowd_df.iterrows():
+            result['all'][crowd][row['rec_cate2']] = {
+                'exp': int(row['exp']),
+                'str': round(row['str'], 4),
+                'ros': round(row['ros'], 4),
+                'rovn': round(row['rovn'], 4),
+                'vov': round(row['vov'], 4),
+            }
+
+    # 按头部品类聚合
+    for head_cate in agg['head_cate2'].unique():
+        result[head_cate] = {}
+        for crowd in crowd_list:
+            crowd_df = agg[(agg['head_cate2'] == head_cate) & (agg['crowd'] == crowd)]
+            result[head_cate][crowd] = {}
+            for _, row in crowd_df.iterrows():
+                result[head_cate][crowd][row['rec_cate2']] = {
+                    'exp': int(row['exp']),
+                    'str': round(row['str'], 4),
+                    'ros': round(row['ros'], 4),
+                    'rovn': round(row['rovn'], 4),
+                    'vov': round(row['vov'], 4),
+                }
+
+    # 获取所有头部品类列表(按总曝光排序)
+    head_exp = ch_df.groupby('head_cate2')['exp'].sum().sort_values(ascending=False)
+    head_list = head_exp.index.tolist()
+
+    return {
+        'heads': ['all'] + head_list,  # all 放在最前面
+        'data': result
+    }
+
+
+# 预计算所有数据
+all_data = {}
+for crowd in crowd_list:
+    all_data[crowd] = {}
+    for dt in date_options:
+        matrix = calc_matrix_data(crowd, dt)
+        if matrix:
+            all_data[crowd][dt] = matrix
+
+# 预计算头部品类下钻数据
+head_drill_data = {}
+for dt in date_options:
+    drill = calc_head_drill_data(dt)
+    if drill:
+        head_drill_data[dt] = drill
+
+# 转为JSON
+data_json = json.dumps(all_data, ensure_ascii=False)
+head_drill_json = json.dumps(head_drill_data, ensure_ascii=False)
+crowd_list_json = json.dumps(crowd_list, ensure_ascii=False)
+dates_json = json.dumps(date_options)
+
+# 日期选项HTML
+date_options_html = "".join([
+    f'<option value="{dt}" {"selected" if dt == latest_date else ""}>'
+    f'{"all" if dt == "all" else dt}</option>'
+    for dt in date_options
+])
+
+# 人群选项HTML
+crowd_options_html = "".join([
+    f'<option value="{c}">{c}</option>'
+    for c in crowd_list
+])
+
+html_content = f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>头部品类分析</title>
+    <style>
+        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+               background: #f5f5f5; padding: 20px; }}
+        .container {{ max-width: 1600px; margin: 0 auto; background: white;
+                     border-radius: 8px; padding: 20px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
+        h1 {{ font-size: 24px; margin-bottom: 20px; color: #333; }}
+        .controls {{ display: flex; gap: 20px; margin-bottom: 20px; align-items: center; flex-wrap: wrap; }}
+        .controls .date-switcher {{ margin-left: auto; }}
+        .play-btn {{ background: #4CAF50; color: white; border: none; border-radius: 4px; padding: 6px 12px; font-size: 14px; }}
+        .play-btn:hover {{ background: #45a049; }}
+        .play-btn.playing {{ background: #f44336; }}
+        .control-group {{ display: flex; align-items: center; gap: 8px; }}
+        .control-group label {{ font-weight: 500; color: #666; }}
+        select {{ padding: 8px 12px; border: 1px solid #ddd; border-radius: 4px; font-size: 14px; min-width: 120px; }}
+        .summary {{ display: flex; gap: 20px; margin-bottom: 20px; }}
+        .stat-card {{ background: #f8f9fa; padding: 15px 20px; border-radius: 6px; text-align: center; }}
+        .stat-card h4 {{ font-size: 24px; color: #28a745; margin-bottom: 5px; }}
+        .stat-card p {{ font-size: 12px; color: #666; }}
+        .matrix-container {{ overflow-x: auto; max-height: 600px; overflow-y: auto; }}
+        table {{ border-collapse: collapse; font-size: 11px; }}
+        th, td {{ border: 1px solid #e0e0e0; padding: 4px 6px; text-align: center; white-space: nowrap; }}
+        th {{ background: #f5f5f5; font-weight: 600; position: sticky; top: 0; z-index: 1; }}
+        th:first-child {{ position: sticky; left: 0; z-index: 3; }}
+        td:first-child {{ background: #f5f5f5; font-weight: 500; position: sticky; left: 0; z-index: 1; text-align: left; }}
+        .corner-cell {{
+            position: relative;
+            width: 100px;
+            height: 50px;
+            background: linear-gradient(to top right, #f5f5f5 49.5%, #ccc 49.5%, #ccc 50.5%, #f5f5f5 50.5%);
+        }}
+        .corner-cell .row-label {{
+            position: absolute;
+            bottom: 4px;
+            left: 4px;
+            font-size: 10px;
+            color: #666;
+        }}
+        .corner-cell .col-label {{
+            position: absolute;
+            top: 4px;
+            right: 4px;
+            font-size: 10px;
+            color: #666;
+        }}
+        .legend {{ font-size: 12px; color: #666; margin-bottom: 10px; }}
+        .date-switcher {{ display: flex; align-items: center; gap: 5px; }}
+        .date-switcher button {{ padding: 5px 10px; border: 1px solid #ddd; background: white;
+                                cursor: pointer; border-radius: 3px; }}
+        .date-switcher button:hover {{ background: #f0f0f0; }}
+        /* NOTE(review): .play-btn.playing is also declared earlier in this stylesheet with background #f44336; this later rule has the same selector and overrides it - confirm which colour is intended. */
+        .play-btn.playing {{ background: #28a745; color: white; }}
+        /* Compare tab styles */
+        .chart-container {{ width: 100%; overflow-x: auto; }}
+        .bar-chart {{ min-width: 800px; }}
+        .bar-group {{ display: flex; align-items: flex-end; gap: 4px; margin-bottom: 8px; }}
+        .bar {{ min-width: 60px; text-align: center; font-size: 10px; color: white;
+               border-radius: 3px 3px 0 0; transition: all 0.3s; cursor: pointer; }}
+        .bar:hover {{ opacity: 0.8; }}
+        .bar-label {{ font-size: 11px; color: #333; margin-bottom: 5px; font-weight: 500; }}
+        .chart-legend {{ display: flex; gap: 20px; margin-bottom: 15px; }}
+        .legend-item {{ display: flex; align-items: center; gap: 5px; font-size: 12px; }}
+        .legend-color {{ width: 16px; height: 16px; border-radius: 3px; }}
+        .compare-table {{ width: 100%; border-collapse: collapse; }}
+        .compare-table th {{ background: #f5f5f5; padding: 8px 10px; text-align: center; font-weight: 600; border: 1px solid #ddd; }}
+        .compare-table td {{ padding: 6px 8px; border: 1px solid #eee; text-align: center; }}
+        .compare-table .crowd-header {{ background: #e8e8e8; font-size: 14px; }}
+        .compare-table .cat-cell {{ text-align: left; padding-left: 10px; }}
+        .compare-section {{ display: flex; gap: 20px; }}
+        .crowd-block {{ flex: 1; min-width: 250px; }}
+        .crowd-block table {{ width: 100%; border-collapse: collapse; }}
+        .crowd-block th {{ background: #f0f0f0; padding: 8px; border: 1px solid #ddd; }}
+        .crowd-block td {{ padding: 6px 8px; border: 1px solid #eee; }}
+        .crowd-block .rn {{ width: 40px; text-align: center; color: #666; }}
+        .crowd-block .cat {{ text-align: left; cursor: pointer; transition: all 0.2s; }}
+        .crowd-block .val {{ text-align: right; font-family: monospace; }}
+        .crowd-block .cat.highlight {{
+            font-weight: bold;
+        }}
+        .crowd-block tr.row-highlight {{
+            outline: 2px solid #1565C0;
+            outline-offset: -1px;
+        }}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>头部品类 → 推荐品类</h1>
+
+        <!-- Matrix Tab -->
+        <div id="tab-matrix">
+            <div class="controls">
+                <div class="control-group">
+                    <label>人群:</label>
+                    <select id="crowd-select" onchange="updateMatrix()">
+                        {crowd_options_html}
+                    </select>
+                </div>
+                <div class="control-group">
+                    <label>指标:</label>
+                    <select id="metric-select" onchange="updateMatrix()">
+                        <option value="exp">exp</option>
+                        <option value="str">str</option>
+                        <option value="ros">ros</option>
+                        <option value="rovn">rovn</option>
+                        <option value="vov" selected>vov</option>
+                    </select>
+                </div>
+                <div class="control-group date-switcher">
+                    <label>日期:</label>
+                    <button onclick="switchDate(-1)">◀</button>
+                    <select id="date-select" onchange="updateMatrix()">
+                        {date_options_html}
+                    </select>
+                    <button onclick="switchDate(1)">▶</button>
+                    <button id="play-btn" class="play-btn" onclick="togglePlay()">▶</button>
+                </div>
+            </div>
+
+            <div class="summary" id="summary"></div>
+
+            <div class="legend">
+                行=头部品类,列=推荐品类 | 颜色越深=数值越高 | 点击表头排序
+                <button onclick="resetSort()" style="margin-left:15px;padding:3px 10px;cursor:pointer;">重置</button>
+            </div>
+
+            <div class="matrix-container">
+                <table id="matrix-table">
+                    <thead id="matrix-header"></thead>
+                    <tbody id="matrix-body"></tbody>
+                </table>
+            </div>
+
+            <!-- 头部品类下钻表格 -->
+            <div style="margin-top: 30px; border-top: 2px solid #e0e0e0; padding-top: 20px;">
+                <h3 style="margin-bottom: 15px; font-size: 16px; color: #333;">头部品类下钻:各人群推荐品类 Top N</h3>
+                <div class="controls">
+                    <div class="control-group">
+                        <label>头部品类:</label>
+                        <select id="drill-head" onchange="updateHeadDrill()">
+                        </select>
+                    </div>
+                    <div class="control-group">
+                        <label>排序:</label>
+                        <select id="drill-sort" onchange="updateHeadDrill()">
+                            <option value="exp" selected>exp</option>
+                            <option value="str">str</option>
+                            <option value="ros">ros</option>
+                            <option value="rovn">rovn</option>
+                            <option value="vov">vov</option>
+                        </select>
+                    </div>
+                    <div class="control-group">
+                        <label>展示:</label>
+                        <select id="drill-metric" onchange="updateHeadDrill()">
+                            <option value="exp">exp</option>
+                            <option value="str">str</option>
+                            <option value="ros">ros</option>
+                            <option value="rovn">rovn</option>
+                            <option value="vov" selected>vov</option>
+                        </select>
+                    </div>
+                    <div class="control-group">
+                        <label>Top:</label>
+                        <select id="drill-topn" onchange="updateHeadDrill()">
+                            <option value="5">5</option>
+                            <option value="10" selected>10</option>
+                            <option value="15">15</option>
+                            <option value="20">20</option>
+                        </select>
+                    </div>
+                    <div class="control-group date-switcher">
+                        <label>日期:</label>
+                        <button onclick="switchDrillDate(-1)">◀</button>
+                        <select id="drill-date" onchange="updateHeadDrill()">
+                            {date_options_html}
+                        </select>
+                        <button onclick="switchDrillDate(1)">▶</button>
+                        <button id="drill-play-btn" class="play-btn" onclick="toggleDrillPlay()">▶</button>
+                    </div>
+                </div>
+                <div class="compare-section" id="drill-section"></div>
+            </div>
+        </div>
+
+    </div>
+
+    <script>
+    const allData = {data_json};
+    const headDrillData = {head_drill_json};
+    const crowdList = {crowd_list_json};
+    const dates = {dates_json};
+    const crowdColors = {{ '内部': '#4CAF50', '外部0层': '#2196F3', '外部裂变': '#FF9800' }};
+    let playInterval = null;
+    let drillPlayInterval = null;
+    let currentRowOrder = null;
+    let currentColOrder = null;
+    let sortState = {{ row: null, col: null, asc: true }};
+    let lastCrowd = null;
+    let lastDate = null;
+
+    function getGradient(val, maxVal, minVal = 0) {{
+        if (val <= minVal || maxVal <= minVal) return '#f8f9fa';
+        const ratio = Math.min((val - minVal) / (maxVal - minVal), 1);
+        const r = Math.round(255 - ratio * 215);
+        const g = Math.round(255 - ratio * 88);
+        const b = Math.round(255 - ratio * 186);
+        return `rgb(${{r}},${{g}},${{b}})`;
+    }}
+
+    // Re-render the summary cards and the heat-map table for the crowd, metric
+    // and date currently chosen in the matrix controls.
+    function updateMatrix() {{
+        const crowd = document.getElementById('crowd-select').value;
+        const metric = document.getElementById('metric-select').value;
+        const date = document.getElementById('date-select').value;
+
+        // No precomputed matrix for this selection: show a placeholder and clear the table.
+        if (!allData[crowd] || !allData[crowd][date]) {{
+            document.getElementById('summary').innerHTML = '<div class="stat-card"><h4>-</h4><p>no data</p></div>';
+            document.getElementById('matrix-header').innerHTML = '';
+            document.getElementById('matrix-body').innerHTML = '';
+            return;
+        }}
+
+        const data = allData[crowd][date];
+
+        document.getElementById('summary').innerHTML = `
+            <div class="stat-card"><h4>${{data.total_exp.toLocaleString()}}</h4><p>总 exp</p></div>
+            <div class="stat-card"><h4>${{data.total_str.toFixed(4)}}</h4><p>总 str</p></div>
+            <div class="stat-card"><h4>${{data.total_rovn.toFixed(4)}}</h4><p>总 rovn</p></div>
+            <div class="stat-card"><h4>${{data.rows.length}}</h4><p>头部品类数</p></div>
+            <div class="stat-card"><h4>${{data.cols.length}}</h4><p>推荐品类数</p></div>
+        `;
+
+        // Collect the positive cell values of the selected metric, ascending.
+        const metricData = data[metric];
+        const allVals = [];
+        data.rows.forEach(r => data.cols.forEach(c => {{
+            const val = metricData[r]?.[c] || 0;
+            if (val > 0) allVals.push(val);
+        }}));
+        allVals.sort((a, b) => a - b);
+
+        // The colour scale tops out at the 95th percentile of those values,
+        // but never below a fixed per-metric floor.
+        const p95Idx = Math.floor(allVals.length * 0.95);
+        let maxVal = allVals.length > 0 ? allVals[Math.min(p95Idx, allVals.length - 1)] : 0;
+        const thresholds = {{ exp: 10000, str: 0.1, ros: 0.5, rovn: 0.05, vov: 0.3 }};
+        maxVal = Math.max(maxVal, thresholds[metric] || 0.1);
+
+        // When the crowd or the date changes, drop any custom sort so the new
+        // data is shown in its own default order.
+        if (crowd !== lastCrowd || date !== lastDate) {{
+            currentRowOrder = null;
+            currentColOrder = null;
+            sortState = {{ row: null, col: null, asc: true }};
+            lastCrowd = crowd;
+            lastDate = date;
+        }}
+
+        if (!currentRowOrder) currentRowOrder = [...data.rows];
+        if (!currentColOrder) currentColOrder = [...data.cols];
+
+        // Keep only the ordered names that exist in the current data.
+        const rows = currentRowOrder.filter(r => data.rows.includes(r));
+        const cols = currentColOrder.filter(c => data.cols.includes(c));
+
+        // Exposure totals per displayed row and column, used in tooltips and share percentages.
+        const expData = data.exp;
+        const rowExpTotals = {{}};
+        const colExpTotals = {{}};
+        rows.forEach(r => {{ rowExpTotals[r] = cols.reduce((sum, c) => sum + (expData[r]?.[c] || 0), 0); }});
+        cols.forEach(c => {{ colExpTotals[c] = rows.reduce((sum, r) => sum + (expData[r]?.[c] || 0), 0); }});
+
+        // Original rank of each row/column: its position in data.rows / data.cols
+        // (presumably sorted by exposure when the data was generated - verify upstream).
+        const origRowOrder = [...data.rows];
+        const origColOrder = [...data.cols];
+
+        document.getElementById('matrix-header').innerHTML = `
+            <tr>
+                <th class="corner-cell" style="cursor:pointer" onclick="sortByRowSum()">
+                    <span class="row-label">头部品类 ↓</span>
+                    <span class="col-label">推荐品类 →</span>
+                </th>
+                ${{cols.map((c, i) => {{
+                    const origRank = origColOrder.indexOf(c) + 1;
+                    return `<th style="cursor:pointer" onclick="sortByCol('${{c}}')" title="推荐品类: ${{c}}&#10;exp排名: #${{origRank}}&#10;exp: ${{colExpTotals[c].toLocaleString()}}">#${{origRank}} ${{c}}</th>`;
+                }}).join('')}}
+            </tr>
+        `;
+
+        // One table row per head category; each cell is coloured by the selected
+        // metric and its tooltip shows the cell's share of the row and column exposure.
+        document.getElementById('matrix-body').innerHTML = rows.map((r, ri) => {{
+            const origRowRank = origRowOrder.indexOf(r) + 1;
+            const cells = cols.map(c => {{
+                const val = metricData[r]?.[c] || 0;
+                const cellExp = expData[r]?.[c] || 0;
+                const bg = getGradient(val, maxVal);
+                const display = metric === 'exp' ? parseInt(val).toLocaleString() : val.toFixed(4);
+                const rowPct = rowExpTotals[r] > 0 ? (cellExp / rowExpTotals[r] * 100).toFixed(1) : '0.0';
+                const colPct = colExpTotals[c] > 0 ? (cellExp / colExpTotals[c] * 100).toFixed(1) : '0.0';
+                return `<td style="background:${{bg}}" title="头部: ${{r}}&#10;推荐: ${{c}}&#10;${{metric}}: ${{display}}&#10;exp: ${{cellExp.toLocaleString()}}&#10;横向占比: ${{rowPct}}%&#10;纵向占比: ${{colPct}}%">${{display}}</td>`;
+            }}).join('');
+            return `<tr><td style="cursor:pointer;background:#f5f5f5" onclick="sortByRow('${{r}}')" title="头部品类: ${{r}}&#10;exp排名: #${{origRowRank}}&#10;exp: ${{rowExpTotals[r].toLocaleString()}}">#${{origRowRank}} ${{r}}</td>${{cells}}</tr>`;
+        }}).join('');
+    }}
+
+    function switchDate(delta) {{
+        const select = document.getElementById('date-select');
+        const idx = dates.indexOf(select.value);
+        const newIdx = idx + delta;
+        if (newIdx >= 0 && newIdx < dates.length) {{
+            select.value = dates[newIdx];
+            updateMatrix();
+        }}
+    }}
+
+    function switchDrillDate(delta) {{
+        const select = document.getElementById('drill-date');
+        const idx = dates.indexOf(select.value);
+        const newIdx = idx + delta;
+        if (newIdx >= 0 && newIdx < dates.length) {{
+            select.value = dates[newIdx];
+            // 触发 change 事件以更新头部品类列表
+            select.dispatchEvent(new Event('change'));
+        }}
+    }}
+
+    function toggleDrillPlay() {{
+        const btn = document.getElementById('drill-play-btn');
+        if (drillPlayInterval) {{
+            clearInterval(drillPlayInterval);
+            drillPlayInterval = null;
+            btn.classList.remove('playing');
+            btn.textContent = '▶';
+        }} else {{
+            btn.classList.add('playing');
+            btn.textContent = '⏸';
+            let idx = 0;
+            const play = () => {{
+                if (idx >= dates.length) {{
+                    clearInterval(drillPlayInterval);
+                    drillPlayInterval = null;
+                    btn.classList.remove('playing');
+                    btn.textContent = '▶';
+                    return;
+                }}
+                document.getElementById('drill-date').value = dates[idx];
+                document.getElementById('drill-date').dispatchEvent(new Event('change'));
+                idx++;
+            }};
+            play();
+            drillPlayInterval = setInterval(play, 1500);
+        }}
+    }}
+
+    function togglePlay() {{
+        const btn = document.getElementById('play-btn');
+        if (playInterval) {{
+            clearInterval(playInterval);
+            playInterval = null;
+            btn.classList.remove('playing');
+            btn.textContent = '▶';
+        }} else {{
+            btn.classList.add('playing');
+            btn.textContent = '⏸';
+            let idx = 0;
+            const play = () => {{
+                if (idx >= dates.length) {{
+                    clearInterval(playInterval);
+                    playInterval = null;
+                    btn.classList.remove('playing');
+                    btn.textContent = '▶';
+                    return;
+                }}
+                document.getElementById('date-select').value = dates[idx];
+                updateMatrix();
+                idx++;
+            }};
+            play();
+            playInterval = setInterval(play, 1500);
+        }}
+    }}
+
+    function getCurrentData() {{
+        const crowd = document.getElementById('crowd-select').value;
+        const date = document.getElementById('date-select').value;
+        const metric = document.getElementById('metric-select').value;
+        if (!allData[crowd] || !allData[crowd][date]) return null;
+        return {{ data: allData[crowd][date], metric }};
+    }}
+
+    function sortByRowSum() {{
+        const result = getCurrentData();
+        if (!result) return;
+        const {{ data, metric }} = result;
+        const metricData = data[metric];
+        const rowSums = {{}};
+        data.rows.forEach(r => {{ rowSums[r] = data.cols.reduce((sum, c) => sum + (metricData[r]?.[c] || 0), 0); }});
+        sortState.asc = sortState.row === 'sum' ? !sortState.asc : false;
+        sortState.row = 'sum';
+        currentRowOrder = [...data.rows].sort((a, b) => sortState.asc ? rowSums[a] - rowSums[b] : rowSums[b] - rowSums[a]);
+        updateMatrix();
+    }}
+
+    function sortByCol(colName) {{
+        const result = getCurrentData();
+        if (!result) return;
+        const {{ data, metric }} = result;
+        const metricData = data[metric];
+        sortState.asc = sortState.col === colName ? !sortState.asc : false;
+        sortState.col = colName;
+        currentRowOrder = [...data.rows].sort((a, b) => {{
+            const va = metricData[a]?.[colName] || 0;
+            const vb = metricData[b]?.[colName] || 0;
+            return sortState.asc ? va - vb : vb - va;
+        }});
+        updateMatrix();
+    }}
+
+    function sortByRow(rowName) {{
+        const result = getCurrentData();
+        if (!result) return;
+        const {{ data, metric }} = result;
+        const metricData = data[metric];
+        sortState.asc = sortState.row === rowName ? !sortState.asc : false;
+        sortState.row = rowName;
+        currentColOrder = [...data.cols].sort((a, b) => {{
+            const va = metricData[rowName]?.[a] || 0;
+            const vb = metricData[rowName]?.[b] || 0;
+            return sortState.asc ? va - vb : vb - va;
+        }});
+        updateMatrix();
+    }}
+
+    function resetSort() {{
+        currentRowOrder = null;
+        currentColOrder = null;
+        sortState = {{ row: null, col: null, asc: true }};
+        updateMatrix();
+    }}
+
+    function highlightCat(el) {{
+        const cat = el.getAttribute('data-cat');
+        document.querySelectorAll('.cat[data-cat]').forEach(cell => {{
+            if (cell.getAttribute('data-cat') === cat) {{
+                cell.classList.add('highlight');
+                cell.closest('tr').classList.add('row-highlight');
+            }}
+        }});
+    }}
+
+    function unhighlightCat() {{
+        document.querySelectorAll('.cat.highlight').forEach(cell => {{
+            cell.classList.remove('highlight');
+            cell.closest('tr').classList.remove('row-highlight');
+        }});
+    }}
+
+    // 初始化头部品类下钻
+    function initHeadDrill() {{
+        const date = document.getElementById('drill-date').value;
+        const headSelect = document.getElementById('drill-head');
+
+        if (!headDrillData[date]) {{
+            headSelect.innerHTML = '<option value="">无数据</option>';
+            return;
+        }}
+
+        const heads = headDrillData[date].heads;
+        headSelect.innerHTML = heads.map((h, i) => {{
+            const label = h === 'all' ? '全部(不区分头部品类)' : `#${{i}} ${{h}}`;
+            return `<option value="${{h}}">${{label}}</option>`;
+        }}).join('');
+
+        updateHeadDrill();
+    }}
+
+    // Render the drill-down section: one table per crowd listing the top N
+    // recommended categories for the selected date and head category.
+    function updateHeadDrill() {{
+        const date = document.getElementById('drill-date').value;
+        const headCate = document.getElementById('drill-head').value;
+        const sortBy = document.getElementById('drill-sort').value;
+        const showMetric = document.getElementById('drill-metric').value;
+        const topN = parseInt(document.getElementById('drill-topn').value);
+
+        // Rebuild the head-category options when the list for this date starts
+        // with a different entry than the options currently shown.
+        // NOTE(review): only the first option is compared; if every list starts
+        // with 'all' this fires only after the placeholder option - confirm intent.
+        const headSelect = document.getElementById('drill-head');
+        if (headDrillData[date] && headSelect.options.length > 0) {{
+            const currentHeads = headDrillData[date].heads;
+            const firstOption = headSelect.options[0]?.value;
+            if (currentHeads[0] !== firstOption) {{
+                headSelect.innerHTML = currentHeads.map((h, i) => {{
+                    const label = h === 'all' ? '全部(不区分头部品类)' : `#${{i}} ${{h}}`;
+                    return `<option value="${{h}}" ${{h === headCate ? 'selected' : ''}}>${{label}}</option>`;
+                }}).join('');
+            }}
+        }}
+
+        if (!headDrillData[date] || !headCate) {{
+            document.getElementById('drill-section').innerHTML = '<p>无数据</p>';
+            return;
+        }}
+
+        const data = headDrillData[date].data[headCate];
+        if (!data) {{
+            document.getElementById('drill-section').innerHTML = '<p>该头部品类无数据</p>';
+            return;
+        }}
+
+        // Top N recommended categories per crowd, ordered by the sort metric (descending).
+        const crowdTopN = {{}};
+        crowdList.forEach(crowd => {{
+            const items = [];
+            if (data[crowd]) {{
+                for (const cat in data[crowd]) {{
+                    items.push({{
+                        cat: cat,
+                        sortVal: data[crowd][cat][sortBy] || 0,
+                        showVal: data[crowd][cat][showMetric] || 0,
+                        exp: data[crowd][cat].exp || 0
+                    }});
+                }}
+            }}
+            items.sort((a, b) => b.sortVal - a.sortVal);
+            crowdTopN[crowd] = items.slice(0, topN);
+        }});
+
+        // Collect every displayed category so the same category gets the same
+        // background colour in all crowd tables.
+        const allCats = new Set();
+        crowdList.forEach(crowd => {{
+            crowdTopN[crowd].forEach(item => allCats.add(item.cat));
+        }});
+        const catList = Array.from(allCats);
+
+        // Colours are assigned in order of first appearance and repeat once the palette is exhausted.
+        const catColors = {{}};
+        const colorPalette = [
+            '#FFCDD2', '#F8BBD0', '#E1BEE7', '#D1C4E9', '#C5CAE9',
+            '#BBDEFB', '#B3E5FC', '#B2EBF2', '#B2DFDB', '#C8E6C9',
+            '#DCEDC8', '#F0F4C3', '#FFF9C4', '#FFECB3', '#FFE0B2',
+            '#FFCCBC', '#D7CCC8', '#CFD8DC', '#BCAAA4', '#B0BEC5'
+        ];
+        catList.forEach((cat, i) => {{
+            catColors[cat] = colorPalette[i % colorPalette.length];
+        }});
+
+        // Range of the displayed metric across all crowds, used for the value gradient.
+        let maxVal = 0, minVal = Infinity;
+        crowdList.forEach(crowd => {{
+            crowdTopN[crowd].forEach(item => {{
+                if (item.showVal > maxVal) maxVal = item.showVal;
+                if (item.showVal < minVal) minVal = item.showVal;
+            }});
+        }});
+        if (minVal === Infinity) minVal = 0;
+
+        // Background colour for a metric value, interpolated between minVal and maxVal;
+        // a fixed colour is used when all values are equal.
+        function getValueColor(val) {{
+            if (maxVal === minVal) return '#C8E6C9';
+            const ratio = (val - minVal) / (maxVal - minVal);
+            const r = Math.round(200 - ratio * 120);
+            const g = Math.round(230 - ratio * 80);
+            const b = Math.round(201 - ratio * 120);
+            return `rgb(${{r}},${{g}},${{b}})`;
+        }}
+
+        // Build one table per crowd; the metric column is omitted when the displayed metric is exp.
+        let html = '';
+        crowdList.forEach(crowd => {{
+            const colSpan = showMetric === 'exp' ? 3 : 4;
+            html += `<div class="crowd-block">
+                <table>
+                    <thead>
+                        <tr><th colspan="${{colSpan}}" style="background:${{crowdColors[crowd]}};color:white">${{crowd}}</th></tr>
+                        <tr><th class="rn">rn</th><th>推荐品类</th><th>exp</th>${{showMetric !== 'exp' ? `<th>${{showMetric}}</th>` : ''}}</tr>
+                    </thead>
+                    <tbody>`;
+
+            if (crowdTopN[crowd].length === 0) {{
+                html += `<tr><td colspan="${{colSpan}}" style="color:#999">无数据</td></tr>`;
+            }} else {{
+                crowdTopN[crowd].forEach((item, i) => {{
+                    const expDisplay = parseInt(item.exp).toLocaleString();
+                    // The displayed metric is always formatted as a percentage with one decimal.
+                    const metricDisplay = (item.showVal * 100).toFixed(1) + '%';
+                    const valColor = getValueColor(item.showVal);
+                    const catColor = catColors[item.cat];
+                    // Only double quotes are escaped for the data-cat attribute value.
+                    const catAttr = item.cat.replace(/"/g, '&quot;');
+                    html += `<tr>
+                        <td class="rn">${{i + 1}}</td>
+                        <td class="cat" style="background:${{catColor}}" data-cat="${{catAttr}}" onmouseenter="highlightCat(this)" onmouseleave="unhighlightCat()">${{item.cat}}</td>
+                        <td class="val">${{expDisplay}}</td>
+                        ${{showMetric !== 'exp' ? `<td class="val" style="background:${{valColor}}">${{metricDisplay}}</td>` : ''}}
+                    </tr>`;
+                }});
+            }}
+
+            html += `</tbody></table></div>`;
+        }});
+
+        document.getElementById('drill-section').innerHTML = html;
+    }}
+
+    // When the drill-down date changes, rebuild the head-category options for
+    // the new date (keeping the current head selected if it still exists) and re-render.
+    // NOTE(review): the drill-date select also carries an inline onchange that calls
+    // updateHeadDrill, so the tables appear to be rendered twice per change - confirm.
+    document.getElementById('drill-date').addEventListener('change', function() {{
+        const date = this.value;
+        const headSelect = document.getElementById('drill-head');
+        const currentHead = headSelect.value;
+
+        if (headDrillData[date]) {{
+            const heads = headDrillData[date].heads;
+            headSelect.innerHTML = heads.map((h, i) => {{
+                const label = h === 'all' ? '全部(不区分头部品类)' : `#${{i}} ${{h}}`;
+                return `<option value="${{h}}" ${{h === currentHead ? 'selected' : ''}}>${{label}}</option>`;
+            }}).join('');
+        }} else {{
+            // No drill-down data for this date: show a single placeholder option.
+            headSelect.innerHTML = '<option value="">无数据</option>';
+        }}
+        updateHeadDrill();
+    }});
+
+    updateMatrix();
+    initHeadDrill();
+    </script>
+</body>
+</html>
+"""
+
+html_file = output_dir / f"{latest_file.stem}_头部品类分析.html"
+with open(html_file, 'w', encoding='utf-8') as f:
+    f.write(html_content)
+
+print(f"\nHTML 报告已生成: {html_file}")

BIN
tasks/人群品类曝光分析/头部品类分析_过滤小量/.DS_Store


+ 135 - 0
tasks/人群品类曝光分析/头部品类分析_过滤小量/query.sql

@@ -0,0 +1,135 @@
+-- Join recommendation exposures to the head-video table to analyze how the head
+-- video's category relates to the categories that get recommended.
+-- Join key: dt + mid + subsessionid + headvideoid, plus a time condition
+-- (head click time <= exposure ts), keeping the most recent qualifying click.
+-- Result rows with fewer than 1000 exposures are dropped.
+WITH t_head AS (
+    -- Head-video clicks in the date range, tagged internal/external by rootsourceid.
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,rootsourceid
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS head_in_out
+            ,videoid AS head_vid
+            ,UNIX_TIMESTAMP(`点击时间`) AS click_ts
+            ,`merge一级品类` AS head_cate1
+            ,`merge二级品类` AS head_cate2
+            ,channel AS head_channel
+    FROM    loghubods.opengid_base_data
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+)
+,t_rec AS (
+    -- Exposure rows in the date range with a page -> recommend / non-recommend
+    -- flag and the share depth (layer) extracted from the extend JSON.
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,headvideoid
+            ,ts
+            ,rootsourceid
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS rec_in_out
+            ,vid AS rec_vid
+            ,page
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_rec
+            ,share_cnt
+            ,return_n_uv
+            ,new_exposure_cnt
+            ,GET_JSON_OBJECT(extend,"$.extParams.userShareDepth") AS layer
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+)
+,t_joined_raw AS (
+    -- Recommend-page exposures left-joined to candidate head clicks.
+    -- rn ranks the candidates so that rn = 1 is the latest click at or before ts.
+    -- NOTE(review): the partition key is assumed to identify one exposure row.
+    -- Exposures sharing (dt, mid, subsessionid, rec_vid, ts) would collapse into
+    -- one after the rn = 1 filter - confirm against the table grain.
+    SELECT  r.dt
+            ,r.mid
+            ,r.subsessionid
+            ,r.rec_vid
+            ,r.ts
+            ,r.rec_in_out
+            ,r.page
+            ,r.page_rec
+            ,r.share_cnt
+            ,r.return_n_uv
+            ,r.new_exposure_cnt
+            ,r.layer
+            ,h.head_in_out
+            ,h.head_vid
+            ,h.head_cate1
+            ,h.head_cate2
+            ,h.head_channel
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.dt, r.mid, r.subsessionid, r.rec_vid, r.ts
+                ORDER BY h.click_ts DESC
+            ) AS rn
+    FROM    t_rec r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt
+    AND     r.mid = h.mid
+    AND     r.subsessionid = h.subsessionid
+    AND     r.headvideoid = h.head_vid
+    AND     h.click_ts <= CAST(r.ts AS BIGINT)
+    WHERE   r.page_rec = '推荐'
+)
+,t_joined AS (
+    -- One row per exposure with its crowd label. The head row's internal/external
+    -- flag wins over the exposure's own flag when a head row was matched.
+    SELECT  dt
+            ,COALESCE(head_in_out, rec_in_out) AS in_out
+            ,CASE   WHEN COALESCE(head_in_out, rec_in_out) = '内部' THEN '内部'
+                    WHEN layer = '0' THEN '外部0层'
+                    WHEN CAST(layer AS INT) > 0 THEN '外部裂变'
+                    ELSE '其他'
+            END AS crowd
+            ,head_vid
+            ,head_cate1
+            -- Unmatched exposures and blank categories get explicit labels.
+            ,CASE   WHEN head_vid IS NULL THEN '未关联头部'
+                    WHEN head_cate2 IS NULL OR head_cate2 = '' THEN 'unknown'
+                    ELSE head_cate2
+            END AS head_cate2
+            ,head_channel
+            ,rec_vid
+            ,share_cnt
+            ,return_n_uv
+            ,new_exposure_cnt
+    FROM    t_joined_raw
+    WHERE   rn = 1
+)
+,t_vid_info AS (
+    -- Latest feature snapshot per vid in the range, used for the recommended
+    -- video's second-level category.
+    SELECT  vid
+            ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS rec_cate2
+    FROM    (
+                SELECT  vid
+                        ,feature
+                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC,hh DESC ) AS rn
+                FROM    loghubods.alg_vid_feature_basic_info
+                -- Bare dt range added so the engine can prune partitions. It is implied
+                -- by the CONCAT(dt,hh) bound below (assuming dt has the same fixed-width
+                -- format as the start/end variables), so the selected rows are unchanged.
+                WHERE   dt BETWEEN "${start}" AND "${end}"
+                AND     CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${end}","23")
+            )
+    WHERE   rn = 1
+)
+,t_final AS (
+    -- Aggregate exposures and downstream counters by day, crowd, head category
+    -- and recommended category.
+    SELECT  a.dt
+            ,a.crowd
+            ,a.head_cate2
+            ,b.rec_cate2
+            ,SUM(1) AS exp
+            ,SUM(a.share_cnt) AS share_cnt
+            ,SUM(a.return_n_uv) AS return_n_uv
+            ,SUM(a.new_exposure_cnt) AS new_exposure_cnt
+    FROM    t_joined a
+    LEFT JOIN t_vid_info b ON a.rec_vid = b.vid
+    GROUP BY a.dt, a.crowd, a.head_cate2, b.rec_cate2
+)
+SELECT  dt
+        ,crowd
+        ,head_cate2
+        ,rec_cate2
+        ,exp
+        ,share_cnt
+        ,return_n_uv
+        ,new_exposure_cnt
+        ,round(COALESCE(share_cnt / exp,0),4) AS str
+        -- share_cnt can be 0 for a group. NULLIF makes the zero-divisor case
+        -- yield NULL explicitly, which COALESCE then maps to 0.
+        ,round(COALESCE(return_n_uv / NULLIF(share_cnt,0),0),4) AS ros
+        ,round(COALESCE(return_n_uv / exp,0),4) AS rovn
+        ,round(COALESCE(new_exposure_cnt / exp,0),4) AS vov
+FROM    t_final
+WHERE   crowd <> '其他'
+AND     exp >= 1000
+ORDER BY dt DESC, crowd, exp DESC
+;

+ 103 - 0
tasks/人群品类曝光分析/数据膨胀排查/query.sql

@@ -0,0 +1,103 @@
+-- Investigate row inflation after joining exposures to the head table:
+-- compare the raw exposure count with the count after the join.
+
+-- 1. Raw exposure table: exposures for internal traffic in category '人生忠告'
+WITH t_rec_origin AS (
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,vid
+            ,rootsourceid
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS in_out
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt = "${start}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+)
+,t_vid_info AS (
+    -- Latest feature snapshot per vid within the day (second-level category).
+    SELECT  vid
+            ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS rec_cate2
+    FROM    (
+                SELECT  vid, feature
+                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC,hh DESC ) AS rn
+                FROM    loghubods.alg_vid_feature_basic_info
+                -- Bare dt predicate added so the engine can prune partitions. It is
+                -- implied by the CONCAT(dt,hh) bound below (assuming fixed-width dt),
+                -- so the selected rows are unchanged.
+                WHERE   dt = "${start}"
+                AND     CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${start}","23")
+            )
+    WHERE   rn = 1
+)
+,t_rec_with_cate AS (
+    SELECT  r.*, v.rec_cate2
+    FROM    t_rec_origin r
+    LEFT JOIN t_vid_info v ON r.vid = v.vid
+)
+-- Raw exposure count
+,t_origin_cnt AS (
+    SELECT  '1_原始曝光' AS step
+            ,in_out
+            ,rec_cate2
+            ,COUNT(1) AS exp
+            -- The separator keeps (mid, subsessionid) pairs from colliding when
+            -- the two ids are concatenated into one distinct-count key.
+            ,COUNT(DISTINCT CONCAT(mid, '|', subsessionid)) AS subsession_cnt
+    FROM    t_rec_with_cate
+    WHERE   in_out = '内部'
+    AND     rec_cate2 = '人生忠告'
+    GROUP BY in_out, rec_cate2
+)
+
+-- 2. Head table: how many rows does each (mid, subsessionid) have?
+,t_head AS (
+    SELECT  dt, mid, subsessionid, videoid, rootsourceid
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS head_in_out
+    FROM    loghubods.opengid_base_data
+    WHERE   dt = "${start}"
+)
+,t_head_dup AS (
+    -- Keys that occur more than once in the head table.
+    SELECT  '2_头部表重复' AS step
+            ,mid
+            ,subsessionid
+            ,COUNT(1) AS head_cnt
+    FROM    t_head
+    GROUP BY mid, subsessionid
+    HAVING  COUNT(1) > 1
+)
+,t_head_dup_stat AS (
+    -- Shaped like the other steps so it can be unioned: the exp column carries
+    -- the number of duplicated keys, subsession_cnt the rows they cover.
+    SELECT  '2_头部表重复统计' AS step
+            ,CAST(NULL AS STRING) AS in_out
+            ,CAST(NULL AS STRING) AS rec_cate2
+            ,COUNT(1) AS exp  -- number of subsessions with duplicates
+            ,SUM(head_cnt) AS subsession_cnt  -- total rows under duplicated keys
+    FROM    t_head_dup
+)
+
+-- 3. Counts after the join (dt + mid + subsessionid only)
+,t_joined AS (
+    SELECT  r.mid
+            ,r.subsessionid
+            ,r.vid
+            ,r.in_out
+            ,r.rec_cate2
+            ,h.videoid AS head_vid
+    FROM    t_rec_with_cate r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt
+    AND     r.mid = h.mid
+    AND     r.subsessionid = h.subsessionid
+    WHERE   r.in_out = '内部'
+    AND     r.rec_cate2 = '人生忠告'
+)
+,t_joined_cnt AS (
+    SELECT  '3_Join后' AS step
+            ,in_out
+            ,rec_cate2
+            ,COUNT(1) AS exp
+            ,COUNT(DISTINCT CONCAT(mid, '|', subsessionid)) AS subsession_cnt
+    FROM    t_joined
+    GROUP BY in_out, rec_cate2
+)
+
+-- Combined output
+SELECT * FROM t_origin_cnt
+UNION ALL
+SELECT * FROM t_head_dup_stat
+UNION ALL
+SELECT * FROM t_joined_cnt
+;

+ 70 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v10_关联率排查.sql

@@ -0,0 +1,70 @@
+-- Investigate why internal traffic fails to match a head row:
+-- check whether the exposure's headvideoid exists in the head table at all,
+-- and at which key granularity it matches.
+WITH t_rec AS (
+    SELECT  dt, mid, subsessionid, headvideoid, vid, ts
+            ,rootsourceid
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS in_out
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt = "${start}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+    AND     (rootsourceid = '' OR rootsourceid IS NULL)  -- internal traffic only
+)
+,t_head AS (
+    SELECT  dt, mid, subsessionid, videoid
+            ,UNIX_TIMESTAMP(`点击时间`) AS click_ts
+    FROM    loghubods.opengid_base_data
+    WHERE   dt = "${start}"
+)
+-- One row per (dt, mid, subsessionid, videoid) with its earliest click time.
+-- A click with click_ts <= ts exists exactly when the earliest click_ts <= ts,
+-- so joining on this keeps the level-1 match test while guaranteeing at most
+-- one head row per exposure. Joining t_head directly would count an exposure
+-- once per matching click and inflate exp_cnt.
+,t_head_first AS (
+    SELECT  dt, mid, subsessionid, videoid
+            ,MIN(click_ts) AS first_click_ts
+    FROM    t_head
+    GROUP BY dt, mid, subsessionid, videoid
+)
+-- Distribution of empty vs populated headvideoid
+,t_headvid_analysis AS (
+    SELECT  CASE
+                WHEN headvideoid IS NULL OR headvideoid = '' THEN 'headvideoid为空'
+                ELSE 'headvideoid有值'
+            END AS headvid_status
+            ,COUNT(1) AS exp_cnt
+            ,COUNT(DISTINCT mid) AS uv
+    FROM    t_rec
+    GROUP BY CASE
+                WHEN headvideoid IS NULL OR headvideoid = '' THEN 'headvideoid为空'
+                ELSE 'headvideoid有值'
+            END
+)
+-- For exposures that do carry a headvideoid, classify how they match
+,t_rec_with_headvid AS (
+    SELECT  r.*
+    FROM    t_rec r
+    WHERE   r.headvideoid IS NOT NULL AND r.headvideoid <> ''
+)
+,t_join_analysis AS (
+    -- Each of the three joins is against a deduplicated key set, so every
+    -- exposure yields exactly one row and one match_status.
+    SELECT  r.mid, r.subsessionid, r.headvideoid, r.vid, r.ts
+            ,CASE
+                WHEN h1.videoid IS NOT NULL THEN '1_mid+subsession+vid匹配'
+                WHEN h2.videoid IS NOT NULL THEN '2_仅mid+vid匹配'
+                WHEN h3.videoid IS NOT NULL THEN '3_仅vid匹配'
+                ELSE '4_完全不匹配'
+            END AS match_status
+            ,h1.first_click_ts AS ts1
+    FROM    t_rec_with_headvid r
+    LEFT JOIN t_head_first h1
+    ON      r.dt = h1.dt AND r.mid = h1.mid AND r.subsessionid = h1.subsessionid
+    AND     r.headvideoid = h1.videoid
+    AND     h1.first_click_ts <= CAST(r.ts AS BIGINT)
+    LEFT JOIN (SELECT DISTINCT dt, mid, videoid FROM t_head) h2
+    ON      r.dt = h2.dt AND r.mid = h2.mid AND r.headvideoid = h2.videoid
+    LEFT JOIN (SELECT DISTINCT dt, videoid FROM t_head) h3
+    ON      r.dt = h3.dt AND r.headvideoid = h3.videoid
+)
+,t_match_summary AS (
+    SELECT  match_status, COUNT(1) AS exp_cnt
+    FROM    t_join_analysis
+    GROUP BY match_status
+)
+
+SELECT '0_headvideoid分布' AS analysis, headvid_status AS status, exp_cnt, uv
+FROM t_headvid_analysis
+UNION ALL
+-- uv is not computed for this section. The NULL is typed to match the BIGINT
+-- produced by COUNT in the first branch.
+SELECT '1_关联情况分析' AS analysis, match_status AS status, exp_cnt, CAST(NULL AS BIGINT) AS uv
+FROM t_match_summary
+ORDER BY analysis, status
+;

+ 59 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v11_放宽条件.sql

@@ -0,0 +1,59 @@
+-- Relaxed-join experiment for internal traffic: compare the current head join
+-- (mid + subsessionid + headvideoid + time) with a looser one that drops the
+-- headvideoid condition. Both keep the most recent qualifying head click.
+WITH rec_internal AS (
+    -- Recommend-page exposures for the day, internal traffic only
+    -- (NULL or empty rootsourceid).
+    SELECT
+        dt,
+        mid,
+        subsessionid,
+        headvideoid,
+        vid,
+        ts
+    FROM loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE dt = "${start}"
+      AND apptype IN ('4','0')
+      AND page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+      AND (rootsourceid IS NULL OR rootsourceid = '')
+),
+head_clicks AS (
+    -- Head clicks for the day with the click time as a unix timestamp.
+    SELECT
+        dt,
+        mid,
+        subsessionid,
+        videoid,
+        UNIX_TIMESTAMP(`点击时间`) AS click_ts
+    FROM loghubods.opengid_base_data
+    WHERE dt = "${start}"
+),
+-- Variant 1: current rule (mid + subsessionid + headvideoid + time)
+strict_ranked AS (
+    SELECT
+        ex.mid,
+        ex.subsessionid,
+        ex.vid,
+        ex.ts,
+        hd.videoid AS head_vid,
+        ROW_NUMBER() OVER (
+            PARTITION BY ex.mid, ex.subsessionid, ex.vid, ex.ts
+            ORDER BY hd.click_ts DESC
+        ) AS pick
+    FROM rec_internal ex
+    LEFT JOIN head_clicks hd
+        ON  hd.dt = ex.dt
+        AND hd.mid = ex.mid
+        AND hd.subsessionid = ex.subsessionid
+        AND hd.videoid = ex.headvideoid
+        AND hd.click_ts <= CAST(ex.ts AS BIGINT)
+),
+strict_stat AS (
+    SELECT
+        '1_当前(+headvideoid)' AS step,
+        COUNT(1) AS exp,
+        SUM(CASE WHEN head_vid IS NULL THEN 0 ELSE 1 END) AS matched
+    FROM strict_ranked
+    WHERE pick = 1
+),
+-- Variant 2: relaxed rule (mid + subsessionid + time, no headvideoid)
+relaxed_ranked AS (
+    SELECT
+        ex.mid,
+        ex.subsessionid,
+        ex.vid,
+        ex.ts,
+        hd.videoid AS head_vid,
+        ROW_NUMBER() OVER (
+            PARTITION BY ex.mid, ex.subsessionid, ex.vid, ex.ts
+            ORDER BY hd.click_ts DESC
+        ) AS pick
+    FROM rec_internal ex
+    LEFT JOIN head_clicks hd
+        ON  hd.dt = ex.dt
+        AND hd.mid = ex.mid
+        AND hd.subsessionid = ex.subsessionid
+        AND hd.click_ts <= CAST(ex.ts AS BIGINT)
+),
+relaxed_stat AS (
+    SELECT
+        '2_放宽(无headvideoid)' AS step,
+        COUNT(1) AS exp,
+        SUM(CASE WHEN head_vid IS NULL THEN 0 ELSE 1 END) AS matched
+    FROM relaxed_ranked
+    WHERE pick = 1
+)
+
+SELECT step, exp, matched FROM strict_stat
+UNION ALL
+SELECT step, exp, matched FROM relaxed_stat
+;

+ 86 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v2.sql

@@ -0,0 +1,86 @@
+-- Join on subsessionid + headvideoid and check whether that removes the
+-- row inflation seen with the subsessionid-only join.
+WITH t_rec_origin AS (
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,headvideoid
+            ,vid
+            ,rootsourceid
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS in_out
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt = "${start}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+)
+,t_vid_info AS (
+    -- Latest feature snapshot per vid within the day (second-level category).
+    SELECT  vid
+            ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS rec_cate2
+    FROM    (
+                SELECT  vid, feature
+                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC,hh DESC ) AS rn
+                FROM    loghubods.alg_vid_feature_basic_info
+                -- Bare dt predicate added so the engine can prune partitions. It is
+                -- implied by the CONCAT(dt,hh) bound below (assuming fixed-width dt),
+                -- so the selected rows are unchanged.
+                WHERE   dt = "${start}"
+                AND     CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${start}","23")
+            )
+    WHERE   rn = 1
+)
+,t_rec_with_cate AS (
+    SELECT  r.*, v.rec_cate2
+    FROM    t_rec_origin r
+    LEFT JOIN t_vid_info v ON r.vid = v.vid
+)
+-- Raw exposure count
+,t_origin_cnt AS (
+    SELECT  '1_原始曝光' AS step
+            ,in_out
+            ,rec_cate2
+            ,COUNT(1) AS exp
+            -- The separator keeps (mid, subsessionid) pairs from colliding when
+            -- the two ids are concatenated into one distinct-count key.
+            ,COUNT(DISTINCT CONCAT(mid, '|', subsessionid)) AS subsession_cnt
+    FROM    t_rec_with_cate
+    WHERE   in_out = '内部'
+    AND     rec_cate2 = '人生忠告'
+    GROUP BY in_out, rec_cate2
+)
+
+-- Head table
+,t_head AS (
+    SELECT  dt, mid, subsessionid, videoid, rootsourceid
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS head_in_out
+            ,`merge二级品类` AS head_cate2
+    FROM    loghubods.opengid_base_data
+    WHERE   dt = "${start}"
+)
+
+-- Join on subsessionid + headvideoid
+,t_joined_v2 AS (
+    SELECT  r.mid
+            ,r.subsessionid
+            ,r.headvideoid
+            ,r.vid
+            ,r.in_out
+            ,r.rec_cate2
+            ,h.videoid AS head_vid
+            ,h.head_cate2
+    FROM    t_rec_with_cate r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt
+    AND     r.mid = h.mid
+    AND     r.subsessionid = h.subsessionid
+    AND     r.headvideoid = h.videoid  -- extra join condition on headvideoid
+    WHERE   r.in_out = '内部'
+    AND     r.rec_cate2 = '人生忠告'
+)
+,t_joined_v2_cnt AS (
+    SELECT  '2_Join_v2(+headvideoid)' AS step
+            ,in_out
+            ,rec_cate2
+            ,COUNT(1) AS exp
+            ,COUNT(DISTINCT CONCAT(mid, '|', subsessionid)) AS subsession_cnt
+    FROM    t_joined_v2
+    GROUP BY in_out, rec_cate2
+)
+
+-- Combined output
+SELECT * FROM t_origin_cnt
+UNION ALL
+SELECT * FROM t_joined_v2_cnt
+;

+ 17 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v3.sql

@@ -0,0 +1,17 @@
+-- Check whether the head table holds more than one row per
+-- (mid, subsessionid, videoid) key for the given day.
+WITH head_rows AS (
+    SELECT
+        mid,
+        subsessionid,
+        videoid
+    FROM loghubods.opengid_base_data
+    WHERE dt = "${start}"
+),
+dup_keys AS (
+    -- One row per key that occurs more than once, with its row count.
+    SELECT
+        mid,
+        subsessionid,
+        videoid,
+        COUNT(1) AS row_cnt
+    FROM head_rows
+    GROUP BY mid, subsessionid, videoid
+    HAVING COUNT(1) > 1
+)
+SELECT
+    '头部表(subsessionid,videoid)重复' AS check_type,
+    COUNT(1) AS dup_key_cnt,
+    SUM(row_cnt) AS total_rows,
+    SUM(row_cnt - 1) AS extra_rows
+FROM dup_keys
+;

+ 53 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v4.sql

@@ -0,0 +1,53 @@
+-- Compare how often head-table rows repeat under different key combinations,
+-- alongside the total and de-duplicated row counts.
+WITH head_rows AS (
+    SELECT
+        mid,
+        sessionid,
+        subsessionid,
+        videoid
+    FROM loghubods.opengid_base_data
+    WHERE dt = "${start}"
+),
+dup_by_subsession AS (
+    -- Keys (mid, subsessionid, videoid) that occur more than once.
+    SELECT
+        mid,
+        subsessionid,
+        videoid,
+        COUNT(1) AS row_cnt
+    FROM head_rows
+    GROUP BY mid, subsessionid, videoid
+    HAVING COUNT(1) > 1
+),
+dup_by_session AS (
+    -- Keys (mid, sessionid, subsessionid, videoid) that occur more than once.
+    SELECT
+        mid,
+        sessionid,
+        subsessionid,
+        videoid,
+        COUNT(1) AS row_cnt
+    FROM head_rows
+    GROUP BY mid, sessionid, subsessionid, videoid
+    HAVING COUNT(1) > 1
+),
+distinct_keys AS (
+    SELECT DISTINCT
+        mid,
+        subsessionid,
+        videoid
+    FROM head_rows
+)
+-- 1. Duplicates on (mid, subsessionid, videoid)
+SELECT
+    '1_(mid,subsessionid,videoid)' AS key_combo,
+    COUNT(1) AS dup_key_cnt,
+    SUM(row_cnt) AS total_rows,
+    SUM(row_cnt - 1) AS extra_rows
+FROM dup_by_subsession
+
+UNION ALL
+
+-- 2. Duplicates on (mid, sessionid, subsessionid, videoid)
+SELECT
+    '2_(mid,sessionid,subsessionid,videoid)' AS key_combo,
+    COUNT(1) AS dup_key_cnt,
+    SUM(row_cnt) AS total_rows,
+    SUM(row_cnt - 1) AS extra_rows
+FROM dup_by_session
+
+UNION ALL
+
+-- 3. Total head-table rows
+SELECT
+    '3_头部表总量' AS key_combo,
+    COUNT(1) AS dup_key_cnt,
+    CAST(NULL AS BIGINT) AS total_rows,
+    CAST(NULL AS BIGINT) AS extra_rows
+FROM head_rows
+
+UNION ALL
+
+-- 4. Rows left after de-duplicating on (mid, subsessionid, videoid)
+SELECT
+    '4_头部表去重(mid,subsessionid,videoid)' AS key_combo,
+    COUNT(1) AS dup_key_cnt,
+    CAST(NULL AS BIGINT) AS total_rows,
+    CAST(NULL AS BIGINT) AS extra_rows
+FROM distinct_keys
+;

+ 48 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v5.sql

@@ -0,0 +1,48 @@
+-- Check head-table duplication once shareid is added to the key, plus how
+-- usable shareid is on its own (repeats and empty values).
+WITH head_rows AS (
+    SELECT
+        mid,
+        subsessionid,
+        videoid,
+        shareid
+    FROM loghubods.opengid_base_data
+    WHERE dt = "${start}"
+),
+dup_with_shareid AS (
+    -- Keys (mid, subsessionid, videoid, shareid) that occur more than once.
+    SELECT
+        mid,
+        subsessionid,
+        videoid,
+        shareid,
+        COUNT(1) AS row_cnt
+    FROM head_rows
+    GROUP BY mid, subsessionid, videoid, shareid
+    HAVING COUNT(1) > 1
+),
+dup_shareid_only AS (
+    -- Non-empty shareid values that occur more than once.
+    SELECT
+        shareid,
+        COUNT(1) AS row_cnt
+    FROM head_rows
+    WHERE shareid IS NOT NULL
+      AND shareid <> ''
+    GROUP BY shareid
+    HAVING COUNT(1) > 1
+)
+-- 1. Duplicates on (mid, subsessionid, videoid, shareid)
+SELECT
+    '1_(mid,subsessionid,videoid,shareid)' AS key_combo,
+    COUNT(1) AS dup_key_cnt,
+    SUM(row_cnt) AS total_rows
+FROM dup_with_shareid
+
+UNION ALL
+
+-- 2. Is shareid unique by itself?
+SELECT
+    '2_shareid单独' AS key_combo,
+    COUNT(1) AS dup_key_cnt,
+    SUM(row_cnt) AS total_rows
+FROM dup_shareid_only
+
+UNION ALL
+
+-- 3. Rows with an empty shareid
+SELECT
+    '3_shareid为空' AS key_combo,
+    COUNT(1) AS dup_key_cnt,
+    CAST(NULL AS BIGINT) AS total_rows
+FROM head_rows
+WHERE shareid IS NULL
+   OR shareid = ''
+
+UNION ALL
+
+-- 4. Total head-table rows
+SELECT
+    '4_头部表总量' AS key_combo,
+    COUNT(1) AS dup_key_cnt,
+    CAST(NULL AS BIGINT) AS total_rows
+FROM head_rows
+;

+ 72 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v6.sql

@@ -0,0 +1,72 @@
+-- Join with a time condition: head click time < exposure ts, keeping the
+-- closest (latest) click. Verifies whether this removes the row inflation.
+
+WITH t_rec AS (
+    SELECT  dt, mid, subsessionid, headvideoid, vid, ts
+            ,rootsourceid
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS in_out
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt = "${start}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+)
+,t_vid_info AS (
+    -- Latest feature snapshot per vid within the day (second-level category).
+    SELECT  vid
+            ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS rec_cate2
+    FROM    (
+                SELECT  vid, feature
+                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC,hh DESC ) AS rn
+                FROM    loghubods.alg_vid_feature_basic_info
+                -- Bare dt predicate added so the engine can prune partitions. It is
+                -- implied by the CONCAT(dt,hh) bound below (assuming fixed-width dt),
+                -- so the selected rows are unchanged.
+                WHERE   dt = "${start}"
+                AND     CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${start}","23")
+            )
+    WHERE   rn = 1
+)
+,t_rec_with_cate AS (
+    SELECT  r.*, v.rec_cate2
+    FROM    t_rec r
+    LEFT JOIN t_vid_info v ON r.vid = v.vid
+)
+,t_head AS (
+    SELECT  dt, mid, subsessionid, videoid
+            ,UNIX_TIMESTAMP(`点击时间`) AS click_ts
+            ,`merge二级品类` AS head_cate2
+    FROM    loghubods.opengid_base_data
+    WHERE   dt = "${start}"
+)
+-- Raw exposure count
+,t_origin_cnt AS (
+    SELECT  '1_原始曝光' AS step, COUNT(1) AS exp
+    FROM    t_rec_with_cate
+    WHERE   in_out = '内部' AND rec_cate2 = '人生忠告'
+)
+-- Join: click time < exposure ts, rank candidates so rn = 1 is the closest one
+,t_joined_with_time AS (
+    SELECT  r.*
+            ,h.videoid AS head_vid
+            ,h.head_cate2
+            ,h.click_ts
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.mid, r.subsessionid, r.headvideoid, r.vid, r.ts
+                ORDER BY h.click_ts DESC  -- closest = largest click_ts below ts
+            ) AS rn
+    FROM    t_rec_with_cate r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt
+    AND     r.mid = h.mid
+    AND     r.subsessionid = h.subsessionid
+    AND     r.headvideoid = h.videoid
+    AND     h.click_ts < CAST(r.ts AS BIGINT)  -- click time < exposure time
+    WHERE   r.in_out = '内部' AND r.rec_cate2 = '人生忠告'
+)
+,t_joined_dedup AS (
+    SELECT  * FROM t_joined_with_time WHERE rn = 1
+)
+,t_joined_cnt AS (
+    SELECT  '2_Join(点击时间<ts,取最近)' AS step, COUNT(1) AS exp
+    FROM    t_joined_dedup
+)
+
+SELECT * FROM t_origin_cnt
+UNION ALL
+SELECT * FROM t_joined_cnt
+;

+ 80 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v7.sql

@@ -0,0 +1,80 @@
+-- Compare the exposure counts produced by the strict (<) and inclusive (<=)
+-- forms of the click-time join condition.
+WITH t_rec AS (
+    SELECT  dt, mid, subsessionid, headvideoid, vid, ts
+            ,rootsourceid
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS in_out
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt = "${start}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+)
+,t_vid_info AS (
+    -- Latest feature snapshot per vid within the day (second-level category).
+    SELECT  vid
+            ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS rec_cate2
+    FROM    (
+                SELECT  vid, feature
+                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC,hh DESC ) AS rn
+                FROM    loghubods.alg_vid_feature_basic_info
+                -- Bare dt predicate added so the engine can prune partitions. It is
+                -- implied by the CONCAT(dt,hh) bound below (assuming fixed-width dt),
+                -- so the selected rows are unchanged.
+                WHERE   dt = "${start}"
+                AND     CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${start}","23")
+            )
+    WHERE   rn = 1
+)
+,t_rec_with_cate AS (
+    SELECT  r.*, v.rec_cate2
+    FROM    t_rec r
+    LEFT JOIN t_vid_info v ON r.vid = v.vid
+)
+,t_head AS (
+    SELECT  dt, mid, subsessionid, videoid
+            ,UNIX_TIMESTAMP(`点击时间`) AS click_ts
+            ,`merge二级品类` AS head_cate2
+    FROM    loghubods.opengid_base_data
+    WHERE   dt = "${start}"
+)
+-- Raw exposure count
+,t_origin AS (
+    SELECT  '1_原始曝光' AS step, COUNT(1) AS exp
+    FROM    t_rec_with_cate
+    WHERE   in_out = '内部' AND rec_cate2 = '人生忠告'
+)
+-- Strict variant: click_ts < ts
+,t_joined_lt AS (
+    SELECT  r.mid, r.subsessionid, r.headvideoid, r.vid, r.ts
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.mid, r.subsessionid, r.headvideoid, r.vid, r.ts
+                ORDER BY h.click_ts DESC
+            ) AS rn
+    FROM    t_rec_with_cate r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt AND r.mid = h.mid AND r.subsessionid = h.subsessionid
+    AND     r.headvideoid = h.videoid
+    AND     h.click_ts < CAST(r.ts AS BIGINT)
+    WHERE   r.in_out = '内部' AND r.rec_cate2 = '人生忠告'
+)
+,t_cnt_lt AS (
+    SELECT  '2_Join(click_ts < ts)' AS step, COUNT(1) AS exp
+    FROM    t_joined_lt WHERE rn = 1
+)
+-- Inclusive variant: click_ts <= ts
+,t_joined_le AS (
+    SELECT  r.mid, r.subsessionid, r.headvideoid, r.vid, r.ts
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.mid, r.subsessionid, r.headvideoid, r.vid, r.ts
+                ORDER BY h.click_ts DESC
+            ) AS rn
+    FROM    t_rec_with_cate r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt AND r.mid = h.mid AND r.subsessionid = h.subsessionid
+    AND     r.headvideoid = h.videoid
+    AND     h.click_ts <= CAST(r.ts AS BIGINT)
+    WHERE   r.in_out = '内部' AND r.rec_cate2 = '人生忠告'
+)
+,t_cnt_le AS (
+    SELECT  '3_Join(click_ts <= ts)' AS step, COUNT(1) AS exp
+    FROM    t_joined_le WHERE rn = 1
+)
+
+SELECT * FROM t_origin
+UNION ALL SELECT * FROM t_cnt_lt
+UNION ALL SELECT * FROM t_cnt_le
+;

+ 83 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v8.sql

@@ -0,0 +1,83 @@
+-- Test the effect of adding a layer = usersharedepth condition to the head join.
+WITH t_rec AS (
+    SELECT  dt, mid, subsessionid, headvideoid, vid, ts
+            ,rootsourceid
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS in_out
+            ,GET_JSON_OBJECT(extend,"$.extParams.userShareDepth") AS layer
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt = "${start}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+)
+,t_vid_info AS (
+    -- Latest feature snapshot per vid within the day (second-level category).
+    SELECT  vid
+            ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS rec_cate2
+    FROM    (
+                SELECT  vid, feature
+                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC,hh DESC ) AS rn
+                FROM    loghubods.alg_vid_feature_basic_info
+                -- Bare dt predicate added so the engine can prune partitions. It is
+                -- implied by the CONCAT(dt,hh) bound below (assuming fixed-width dt),
+                -- so the selected rows are unchanged.
+                WHERE   dt = "${start}"
+                AND     CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${start}","23")
+            )
+    WHERE   rn = 1
+)
+,t_rec_with_cate AS (
+    SELECT  r.*, v.rec_cate2
+    FROM    t_rec r
+    LEFT JOIN t_vid_info v ON r.vid = v.vid
+)
+,t_head AS (
+    SELECT  dt, mid, subsessionid, videoid
+            ,UNIX_TIMESTAMP(`点击时间`) AS click_ts
+            ,usersharedepth
+            ,`merge二级品类` AS head_cate2
+    FROM    loghubods.opengid_base_data
+    WHERE   dt = "${start}"
+)
+-- Raw exposure count
+,t_origin AS (
+    SELECT  '1_原始曝光' AS step, COUNT(1) AS exp
+    FROM    t_rec_with_cate
+    WHERE   in_out = '内部' AND rec_cate2 = '人生忠告'
+)
+-- Current rule: click_ts <= ts
+,t_joined_v1 AS (
+    SELECT  r.mid, r.subsessionid, r.headvideoid, r.vid, r.ts
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.mid, r.subsessionid, r.headvideoid, r.vid, r.ts
+                ORDER BY h.click_ts DESC
+            ) AS rn
+    FROM    t_rec_with_cate r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt AND r.mid = h.mid AND r.subsessionid = h.subsessionid
+    AND     r.headvideoid = h.videoid
+    AND     h.click_ts <= CAST(r.ts AS BIGINT)
+    WHERE   r.in_out = '内部' AND r.rec_cate2 = '人生忠告'
+)
+,t_cnt_v1 AS (
+    SELECT  '2_当前(click_ts<=ts)' AS step, COUNT(1) AS exp
+    FROM    t_joined_v1 WHERE rn = 1
+)
+-- New rule: click_ts <= ts plus layer = usersharedepth
+,t_joined_v2 AS (
+    SELECT  r.mid, r.subsessionid, r.headvideoid, r.vid, r.ts
+            ,ROW_NUMBER() OVER (
+                PARTITION BY r.mid, r.subsessionid, r.headvideoid, r.vid, r.ts
+                ORDER BY h.click_ts DESC
+            ) AS rn
+    FROM    t_rec_with_cate r
+    LEFT JOIN t_head h
+    ON      r.dt = h.dt AND r.mid = h.mid AND r.subsessionid = h.subsessionid
+    AND     r.headvideoid = h.videoid
+    AND     h.click_ts <= CAST(r.ts AS BIGINT)
+    AND     r.layer = h.usersharedepth  -- added condition
+    WHERE   r.in_out = '内部' AND r.rec_cate2 = '人生忠告'
+)
+,t_cnt_v2 AS (
+    SELECT  '3_新增(+layer=usersharedepth)' AS step, COUNT(1) AS exp
+    FROM    t_joined_v2 WHERE rn = 1
+)
+
+SELECT * FROM t_origin
+UNION ALL SELECT * FROM t_cnt_v1
+UNION ALL SELECT * FROM t_cnt_v2
+;

+ 79 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v9.sql

@@ -0,0 +1,79 @@
+-- Per-crowd check (internal / external layer 0 / external fission) of how adding
+-- the layer = usersharedepth condition to the head join changes exposure counts.
+WITH rec_base AS (
+    -- Recommend-page exposures for the day with the internal/external flag
+    -- and the share depth taken from the extend JSON.
+    SELECT
+        dt,
+        mid,
+        subsessionid,
+        headvideoid,
+        vid,
+        ts,
+        CASE WHEN rootsourceid IS NULL OR rootsourceid = '' THEN '内部' ELSE '外部' END AS in_out,
+        GET_JSON_OBJECT(extend,"$.extParams.userShareDepth") AS layer
+    FROM loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE dt = "${start}"
+      AND apptype IN ('4','0')
+      AND page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页")
+),
+rec_crowd AS (
+    -- Attach the crowd label. Branch order matters: internal wins first.
+    SELECT
+        dt,
+        mid,
+        subsessionid,
+        headvideoid,
+        vid,
+        ts,
+        layer,
+        CASE
+            WHEN in_out = '内部' THEN '内部'
+            WHEN layer = '0' THEN '外部0层'
+            WHEN CAST(layer AS INT) > 0 THEN '外部裂变'
+            ELSE '其他'
+        END AS crowd
+    FROM rec_base
+),
+rec_scoped AS (
+    -- Only the three crowds under study. Every step below reads from here.
+    SELECT
+        dt,
+        mid,
+        subsessionid,
+        headvideoid,
+        vid,
+        ts,
+        layer,
+        crowd
+    FROM rec_crowd
+    WHERE crowd <> '其他'
+),
+head_clicks AS (
+    SELECT
+        dt,
+        mid,
+        subsessionid,
+        videoid,
+        UNIX_TIMESTAMP(`点击时间`) AS click_ts,
+        usersharedepth
+    FROM loghubods.opengid_base_data
+    WHERE dt = "${start}"
+),
+-- Raw exposures per crowd
+origin_stat AS (
+    SELECT
+        crowd,
+        '1_原始曝光' AS step,
+        COUNT(1) AS exp
+    FROM rec_scoped
+    GROUP BY crowd
+),
+-- Current rule: click_ts <= ts
+current_ranked AS (
+    SELECT
+        ex.crowd,
+        ROW_NUMBER() OVER (
+            PARTITION BY ex.mid, ex.subsessionid, ex.headvideoid, ex.vid, ex.ts
+            ORDER BY hd.click_ts DESC
+        ) AS pick
+    FROM rec_scoped ex
+    LEFT JOIN head_clicks hd
+        ON  hd.dt = ex.dt
+        AND hd.mid = ex.mid
+        AND hd.subsessionid = ex.subsessionid
+        AND hd.videoid = ex.headvideoid
+        AND hd.click_ts <= CAST(ex.ts AS BIGINT)
+),
+current_stat AS (
+    SELECT
+        crowd,
+        '2_当前(click_ts<=ts)' AS step,
+        COUNT(1) AS exp
+    FROM current_ranked
+    WHERE pick = 1
+    GROUP BY crowd
+),
+-- New rule: click_ts <= ts plus layer = usersharedepth
+depth_ranked AS (
+    SELECT
+        ex.crowd,
+        ROW_NUMBER() OVER (
+            PARTITION BY ex.mid, ex.subsessionid, ex.headvideoid, ex.vid, ex.ts
+            ORDER BY hd.click_ts DESC
+        ) AS pick
+    FROM rec_scoped ex
+    LEFT JOIN head_clicks hd
+        ON  hd.dt = ex.dt
+        AND hd.mid = ex.mid
+        AND hd.subsessionid = ex.subsessionid
+        AND hd.videoid = ex.headvideoid
+        AND hd.click_ts <= CAST(ex.ts AS BIGINT)
+        AND hd.usersharedepth = ex.layer
+),
+depth_stat AS (
+    SELECT
+        crowd,
+        '3_新增(+layer=depth)' AS step,
+        COUNT(1) AS exp
+    FROM depth_ranked
+    WHERE pick = 1
+    GROUP BY crowd
+)
+
+SELECT crowd, step, exp FROM origin_stat
+UNION ALL
+SELECT crowd, step, exp FROM current_stat
+UNION ALL
+SELECT crowd, step, exp FROM depth_stat
+ORDER BY crowd, step
+;