Просмотр исходного кода

feat(table_gen): 添加曝光回流收益递推计算 SQL (优化版 k=3)

优化点:
- 去掉 seq,用 ts 直接判断先后顺序
- 级联匹配合并为单次 JOIN + 优先级选择(10轮→2轮)
- 递推计算到 k=3

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 1 месяц назад
Родитель
Сommit
91930e89a8
1 измененных файлов с 458 добавлено и 0 удалено
  1. 458 0
      table_gen/exposure_return_recursive.sql

+ 458 - 0
table_gen/exposure_return_recursive.sql

@@ -0,0 +1,458 @@
+--*********************
+-- 曝光回流收益递推计算 (优化版)
+-- 数学公式:
+--   B_i     = 不换视频回流人数(用 rootshareid 计算)
+--   D_i^(0) = Σ B_j,  j > i 且同 session(初始化)
+--   C_i^(0) = 0
+--   D_i^(k) = Σ (D_j^(k-1) + C_j^(k-1)),  j > i 且同 session
+--   C_i^(k) = Σ (D_j^(k) + C_j^(k-1)),    j ∈ S(i)
+--   其中 S(i) = Bn(E_i) 带来的回流用户的首曝光集合
+--   最终收益 V(E_i) = B_i + D_i^(k) + C_i^(k)
+--
+-- 优化点:
+--   1. 去掉 seq,用 ts 直接判断先后顺序
+--   2. 级联匹配合并为单次 JOIN + 优先级选择
+--   3. 递推只保留 k=1(k=2、k=3 增量贡献通常很小)
+--*********************
+
+WITH
+--========================================
+-- 1. 基础数据准备
+--========================================
+
+-- 1.1 回流数据(用户通过分享链接回流)
+-- 时间范围:${dt}${hh} 往后 24 小时
+t_return AS (
+    SELECT  *
+            ,CONCAT(dthh,":",shareid,":",vid,":",dthh_id) AS id
+    FROM    (
+                SELECT  CONCAT(year,month,day,hour) AS dthh
+                        ,apptype
+                        ,machinecode AS mid
+                        ,clickobjectid AS vid
+                        ,sessionid
+                        ,subsessionid
+                        ,shareid
+                        ,rootshareid
+                        ,CAST(clienttimestamp / 1000 AS BIGINT) AS ts
+                        ,ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),apptype,machinecode,clickobjectid,sessionid,subsessionid,shareid,rootshareid ORDER BY clienttimestamp DESC ) AS rn
+                        ,ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),shareid,clickobjectid ORDER BY clienttimestamp ) AS dthh_id
+                FROM    loghubods.user_share_log_flow
+                WHERE   CONCAT(year,month,day,hour) BETWEEN '${dt}${hh}' AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) + 3600 * 24),'YYYYMMDDHH')
+                AND     __topic__ = 'click'
+                AND     apptype IS NOT NULL
+                AND     apptype NOT IN ('12')
+                AND     machinecode IS NOT NULL
+                AND     clickobjectid IS NOT NULL
+                AND     pagesource REGEXP "-pages/user-videos-share$"
+            ) t
+    WHERE   rn = 1
+)
+
+-- 1.2 分享数据
+-- 时间范围:${dt}${hh} 往后 24 小时
+,t_share_from_sharelog AS (
+    SELECT  *
+    FROM    (
+                SELECT  CONCAT(year,month,day,hour) AS dthh
+                        ,apptype
+                        ,machinecode AS mid
+                        ,shareobjectid AS vid
+                        ,sessionid
+                        ,subsessionid
+                        ,pagesource
+                        ,shareid
+                        ,CAST(clienttimestamp / 1000 AS BIGINT) AS ts
+                        ,ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),apptype,machinecode,shareobjectid,sessionid,subsessionid,pagesource,shareid ORDER BY clienttimestamp DESC ) AS rn
+                FROM    loghubods.user_share_log_flow
+                WHERE   CONCAT(year,month,day,hour) BETWEEN '${dt}${hh}' AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) + 3600 * 24),'YYYYMMDDHH')
+                AND     __topic__ = 'share'
+                AND     apptype IS NOT NULL
+                AND     apptype NOT IN ('12')
+                AND     machinecode IS NOT NULL
+                AND     shareobjectid IS NOT NULL
+            ) t
+    WHERE   rn = 1
+)
+
+-- 1.3 曝光数据
+-- 时间范围:${dt}${hh} 单小时(曝光起点)
+-- 优化:去掉 seq,用 ts 直接判断先后顺序
+,t_exposure AS (
+    SELECT  dthh_id
+            ,dthh
+            ,apptype
+            ,uid
+            ,mid
+            ,vid
+            ,sessionid
+            ,subsessionid
+            ,pagesource
+            ,ts
+            ,id
+            ,dt
+            ,hh
+    FROM    loghubods.dwd_recsys_alg_exposure_base_view_20250402
+    WHERE   CONCAT(dt,hh) = '${dt}${hh}'
+)
+
+-- 1.4 详情页曝光(用于非常规分享关联)
+,t_exposure_detail AS (
+    SELECT  *
+    FROM    t_exposure
+    WHERE   pagesource REGEXP "-pages/user-videos-detail$|pages/detail-recommend$"
+)
+
+--========================================
+-- 2. 曝光关联分享(单次 JOIN + 优先级选择)
+-- 优化:合并 10 轮级联匹配为 1 轮
+--========================================
+
+-- 2.1 常规分享关联曝光(一次性匹配所有优先级)
+,t_normal_share_exposure AS (
+    SELECT  *
+    FROM    (
+        SELECT  s.dthh
+                ,s.apptype
+                ,s.mid
+                ,s.vid
+                ,s.sessionid
+                ,s.subsessionid
+                ,s.pagesource
+                ,s.shareid
+                ,s.ts
+                ,e.id AS exposure_id
+                ,e.ts AS exposure_ts
+                -- 优先级:subsession+pagesource+ts > session+pagesource+ts > ...
+                ,CASE
+                    WHEN s.subsessionid = e.subsessionid AND s.pagesource = e.pagesource AND s.ts >= e.ts THEN 1
+                    WHEN s.sessionid = e.sessionid AND s.pagesource = e.pagesource AND s.ts >= e.ts THEN 2
+                    WHEN s.subsessionid = e.subsessionid AND s.pagesource = e.pagesource THEN 3
+                    WHEN s.sessionid = e.sessionid AND s.pagesource = e.pagesource THEN 4
+                    WHEN s.subsessionid = e.subsessionid THEN 5
+                    WHEN s.sessionid = e.sessionid THEN 6
+                END AS match_priority
+                ,ROW_NUMBER() OVER (
+                    PARTITION BY s.dthh,s.apptype,s.mid,s.vid,s.sessionid,s.subsessionid,s.pagesource,s.shareid
+                    ORDER BY
+                        CASE
+                            WHEN s.subsessionid = e.subsessionid AND s.pagesource = e.pagesource AND s.ts >= e.ts THEN 1
+                            WHEN s.sessionid = e.sessionid AND s.pagesource = e.pagesource AND s.ts >= e.ts THEN 2
+                            WHEN s.subsessionid = e.subsessionid AND s.pagesource = e.pagesource THEN 3
+                            WHEN s.sessionid = e.sessionid AND s.pagesource = e.pagesource THEN 4
+                            WHEN s.subsessionid = e.subsessionid THEN 5
+                            WHEN s.sessionid = e.sessionid THEN 6
+                        END
+                        ,e.ts DESC
+                ) AS rn
+        FROM    t_share_from_sharelog s
+        LEFT JOIN t_exposure e
+        ON      s.apptype = e.apptype
+        AND     s.mid = e.mid
+        AND     s.vid = e.vid
+        AND     (s.subsessionid = e.subsessionid OR s.sessionid = e.sessionid)
+        WHERE   s.pagesource NOT REGEXP "pages/detail-user-videos-share-recommend$"
+    ) t
+    WHERE   rn = 1
+)
+
+-- 2.2 非常规分享关联曝光(detail页面,一次性匹配)
+,t_no_normal_share_exposure AS (
+    SELECT  *
+    FROM    (
+        SELECT  s.dthh
+                ,s.apptype
+                ,s.mid
+                ,s.vid
+                ,s.sessionid
+                ,s.subsessionid
+                ,s.pagesource
+                ,s.shareid
+                ,s.ts
+                ,e.id AS exposure_id
+                ,e.ts AS exposure_ts
+                ,CASE
+                    WHEN s.subsessionid = e.subsessionid AND s.ts >= e.ts THEN 1
+                    WHEN s.sessionid = e.sessionid AND s.ts >= e.ts THEN 2
+                    WHEN s.subsessionid = e.subsessionid THEN 3
+                    WHEN s.sessionid = e.sessionid THEN 4
+                END AS match_priority
+                ,ROW_NUMBER() OVER (
+                    PARTITION BY s.dthh,s.apptype,s.mid,s.vid,s.sessionid,s.subsessionid,s.pagesource,s.shareid
+                    ORDER BY
+                        CASE
+                            WHEN s.subsessionid = e.subsessionid AND s.ts >= e.ts THEN 1
+                            WHEN s.sessionid = e.sessionid AND s.ts >= e.ts THEN 2
+                            WHEN s.subsessionid = e.subsessionid THEN 3
+                            WHEN s.sessionid = e.sessionid THEN 4
+                        END
+                        ,e.ts DESC
+                ) AS rn
+        FROM    t_share_from_sharelog s
+        LEFT JOIN t_exposure_detail e
+        ON      s.apptype = e.apptype
+        AND     s.mid = e.mid
+        AND     s.vid = e.vid
+        AND     (s.subsessionid = e.subsessionid OR s.sessionid = e.sessionid)
+        WHERE   s.pagesource REGEXP "pages/detail-user-videos-share-recommend$"
+    ) t
+    WHERE   rn = 1
+)
+
+--========================================
+-- 3. 合并所有分享-曝光关联
+--========================================
+,t_share_exposure AS (
+    SELECT  dthh, apptype, mid, vid, sessionid, subsessionid, pagesource, shareid, ts, exposure_id, exposure_ts
+    FROM    t_normal_share_exposure
+    UNION ALL
+    SELECT  dthh, apptype, mid, vid, sessionid, subsessionid, pagesource, shareid, ts, exposure_id, exposure_ts
+    FROM    t_no_normal_share_exposure
+)
+
+--========================================
+-- 4. 计算 B_i(不换视频回流,用 rootshareid)
+--========================================
+
+-- 4.1 分享关联回流(用 rootshareid 追踪裂变)
+,t_share_return AS (
+    SELECT  se.exposure_id
+            ,se.shareid
+            ,se.vid
+            ,se.apptype
+            ,se.subsessionid
+            -- 回流用户的 subsession(用于构建 S(i))
+            ,r.subsessionid AS return_subsessionid
+            ,r.mid AS return_mid
+    FROM    t_share_exposure se
+    JOIN    t_return r
+    ON      se.shareid = r.rootshareid  -- 用 rootshareid 追踪所有裂变
+    AND     se.vid = r.vid              -- 同视频
+    AND     se.apptype = r.apptype
+)
+
+-- 4.2 每个曝光的 B_i 和 Bn_sessions
+,t_exposure_bn AS (
+    SELECT  e.id AS exposure_id
+            ,e.subsessionid
+            ,e.ts
+            ,e.vid
+            -- B_i: 不换视频回流人数(含裂变)
+            ,COALESCE(bn.B, 0) AS B
+            -- Bn 带来的回流用户的 subsessions(用于计算 S(i))
+            ,bn.bn_subsessions
+    FROM    t_exposure e
+    LEFT JOIN (
+        SELECT  exposure_id
+                ,COUNT(DISTINCT return_mid) AS B
+                ,COLLECT_SET(return_subsessionid) AS bn_subsessions
+        FROM    t_share_return
+        GROUP BY exposure_id
+    ) bn
+    ON      e.id = bn.exposure_id
+)
+
+--========================================
+-- 5. 构建 S(i): Bn 带来的回流用户的首曝光
+--========================================
+
+-- 每个 subsession 的首曝光(用 ts 最小的)
+,t_subsession_first_exposure AS (
+    SELECT  subsessionid
+            ,first_exposure_id
+    FROM    (
+        SELECT  subsessionid
+                ,id AS first_exposure_id
+                ,ROW_NUMBER() OVER (PARTITION BY subsessionid ORDER BY ts) AS rn
+        FROM    t_exposure
+    ) t
+    WHERE   rn = 1
+)
+
+,t_bn_first_exposure AS (
+    -- S(i): E_i 的 Bn 带来的回流用户的首曝光集合
+    SELECT  e.exposure_id AS source_exposure_id
+            ,f.first_exposure_id AS target_exposure_id
+    FROM    t_exposure_bn e
+    LATERAL VIEW EXPLODE(e.bn_subsessions) t AS bn_subsess
+    JOIN    t_subsession_first_exposure f
+    ON      t.bn_subsess = f.subsessionid
+    WHERE   e.bn_subsessions IS NOT NULL
+)
+
+--========================================
+-- 6. 初始化 k=0
+--    D_i^(0) = Σ B_j, j > i 且同 subsession(用 ts 判断)
+--    C_i^(0) = 0
+--========================================
+,t_layer_0 AS (
+    SELECT  e1.exposure_id
+            ,e1.subsessionid
+            ,e1.ts
+            ,e1.B
+            ,e1.bn_subsessions
+            -- D^(0) = Σ B_j, j > i 且同 subsession
+            ,COALESCE(d0.D_0, 0) AS D_0
+            -- C^(0) = 0
+            ,0 AS C_0
+    FROM    t_exposure_bn e1
+    LEFT JOIN (
+        SELECT  a.exposure_id
+                ,SUM(b.B) AS D_0
+        FROM    t_exposure_bn a
+        JOIN    t_exposure_bn b
+        ON      a.subsessionid = b.subsessionid
+        AND     b.ts > a.ts  -- j > i,用 ts 判断
+        GROUP BY a.exposure_id
+    ) d0
+    ON      e1.exposure_id = d0.exposure_id
+)
+
+--========================================
+-- 7. 递推 k=1
+--    D_i^(1) = Σ (D_j^(0) + C_j^(0)), j > i 且同 subsession
+--    C_i^(1) = Σ (D_j^(1) + C_j^(0)), j ∈ S(i)
+--========================================
+,t_D_1 AS (
+    SELECT  a.exposure_id
+            ,COALESCE(SUM(b.D_0 + b.C_0), 0) AS D_1
+    FROM    t_layer_0 a
+    JOIN    t_layer_0 b
+    ON      a.subsessionid = b.subsessionid
+    AND     b.ts > a.ts  -- j > i,用 ts 判断
+    GROUP BY a.exposure_id
+)
+
+,t_layer_1 AS (
+    SELECT  l0.exposure_id
+            ,l0.subsessionid
+            ,l0.ts
+            ,l0.B
+            ,l0.bn_subsessions
+            ,l0.D_0
+            ,l0.C_0
+            ,COALESCE(d1.D_1, 0) AS D_1
+            ,COALESCE(c1.C_1, 0) AS C_1
+    FROM    t_layer_0 l0
+    LEFT JOIN t_D_1 d1
+    ON      l0.exposure_id = d1.exposure_id
+    LEFT JOIN (
+        SELECT  s.source_exposure_id AS exposure_id
+                ,SUM(COALESCE(d.D_1, 0) + l.C_0) AS C_1
+        FROM    t_bn_first_exposure s
+        JOIN    t_layer_0 l
+        ON      s.target_exposure_id = l.exposure_id
+        LEFT JOIN t_D_1 d
+        ON      l.exposure_id = d.exposure_id
+        GROUP BY s.source_exposure_id
+    ) c1
+    ON      l0.exposure_id = c1.exposure_id
+)
+
+--========================================
+-- 8. 递推 k=2
+--    D_i^(2) = Σ (D_j^(1) + C_j^(1)), j > i 且同 subsession
+--    C_i^(2) = Σ (D_j^(2) + C_j^(1)), j ∈ S(i)
+--========================================
+,t_D_2 AS (
+    SELECT  a.exposure_id
+            ,COALESCE(SUM(b.D_1 + b.C_1), 0) AS D_2
+    FROM    t_layer_1 a
+    JOIN    t_layer_1 b
+    ON      a.subsessionid = b.subsessionid
+    AND     b.ts > a.ts
+    GROUP BY a.exposure_id
+)
+
+,t_layer_2 AS (
+    SELECT  l1.exposure_id
+            ,l1.subsessionid
+            ,l1.ts
+            ,l1.B
+            ,l1.bn_subsessions
+            ,l1.D_0, l1.C_0
+            ,l1.D_1, l1.C_1
+            ,COALESCE(d2.D_2, 0) AS D_2
+            ,COALESCE(c2.C_2, 0) AS C_2
+    FROM    t_layer_1 l1
+    LEFT JOIN t_D_2 d2
+    ON      l1.exposure_id = d2.exposure_id
+    LEFT JOIN (
+        SELECT  s.source_exposure_id AS exposure_id
+                ,SUM(COALESCE(d.D_2, 0) + l.C_1) AS C_2
+        FROM    t_bn_first_exposure s
+        JOIN    t_layer_1 l
+        ON      s.target_exposure_id = l.exposure_id
+        LEFT JOIN t_D_2 d
+        ON      l.exposure_id = d.exposure_id
+        GROUP BY s.source_exposure_id
+    ) c2
+    ON      l1.exposure_id = c2.exposure_id
+)
+
+--========================================
+-- 9. 递推 k=3
+--    D_i^(3) = Σ (D_j^(2) + C_j^(2)), j > i 且同 subsession
+--    C_i^(3) = Σ (D_j^(3) + C_j^(2)), j ∈ S(i)
+--========================================
+,t_D_3 AS (
+    SELECT  a.exposure_id
+            ,COALESCE(SUM(b.D_2 + b.C_2), 0) AS D_3
+    FROM    t_layer_2 a
+    JOIN    t_layer_2 b
+    ON      a.subsessionid = b.subsessionid
+    AND     b.ts > a.ts
+    GROUP BY a.exposure_id
+)
+
+,t_layer_3 AS (
+    SELECT  l2.exposure_id
+            ,l2.subsessionid
+            ,l2.ts
+            ,l2.B
+            ,l2.bn_subsessions
+            ,l2.D_0, l2.C_0
+            ,l2.D_1, l2.C_1
+            ,l2.D_2, l2.C_2
+            ,COALESCE(d3.D_3, 0) AS D_3
+            ,COALESCE(c3.C_3, 0) AS C_3
+    FROM    t_layer_2 l2
+    LEFT JOIN t_D_3 d3
+    ON      l2.exposure_id = d3.exposure_id
+    LEFT JOIN (
+        SELECT  s.source_exposure_id AS exposure_id
+                ,SUM(COALESCE(d.D_3, 0) + l.C_2) AS C_3
+        FROM    t_bn_first_exposure s
+        JOIN    t_layer_2 l
+        ON      s.target_exposure_id = l.exposure_id
+        LEFT JOIN t_D_3 d
+        ON      l.exposure_id = d.exposure_id
+        GROUP BY s.source_exposure_id
+    ) c3
+    ON      l2.exposure_id = c3.exposure_id
+)
+
+--========================================
+-- 10. 最终输出 (k=3)
+--========================================
+SELECT  exposure_id
+        ,subsessionid
+        ,ts
+        -- 基础收益
+        ,B
+        -- 第0轮(初始化)
+        ,D_0
+        ,C_0
+        -- 第1轮
+        ,D_1
+        ,C_1
+        -- 第2轮
+        ,D_2
+        ,C_2
+        -- 第3轮
+        ,D_3
+        ,C_3
+        -- 最终收益 = B + D^(k) + C^(k)
+        ,B + D_3 + C_3 AS V_total
+FROM    t_layer_3
+;