Browse Source

docs(table_gen): 添加上游曝光视图表 base_view_20250402 及头部注释

去重逻辑、输出拆分、与 base_20260209 的关系

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
yangxiaohui 3 weeks ago
parent
commit
3f1b909bf5
1 changed files with 202 additions and 0 deletions
  1. 202 0
      table_gen/loghubods.dwd_recsys_alg_exposure_base_view_20250402.sql

+ 202 - 0
table_gen/loghubods.dwd_recsys_alg_exposure_base_view_20250402.sql

@@ -0,0 +1,202 @@
+--@exclude_input=loghubods.video_action_log_flow_new
+-- =====================================================================
+-- 上游曝光去重视图表 (行级, 每行 = 一次去重后的曝光)
+-- 版本: 20250402 (最早版本, 纯曝光去重, 无分享/回流)
+-- LIFECYCLE 3 (仅保留 3 天, 作为下游 base 表的输入中间层)
+-- =====================================================================
+--
+-- 数据源: video_action_log_flow_new (businesstype=videoView)
+-- 过滤:   apptype NOT IN ('12'), mid/videoid IS NOT NULL
+--
+-- 去重逻辑:
+--   ROW_NUMBER OVER (PARTITION BY dthh,apptype,uid,mid,vid,sessionid,subsessionid,pagesource
+--                    ORDER BY logtimestamp) = 1
+--
+-- 输出拆分 (UNION ALL):
+--   非 share 页: pagesource NOT REGEXP "-pages/user-videos-share$", rn=1
+--   share 页:    pagesource REGEXP "-pages/user-videos-share$" (不过滤 rn, 保留全部)
+--
+-- id 生成: CONCAT(dthh, ":", subsessionid, ":", dthh_id)
+--   dthh_id = ROW_NUMBER OVER (PARTITION BY dthh,subsessionid ORDER BY clienttimestamp DESC)
+--
+-- 与 base_20260209 的关系:
+--   base_20260209.t_exposure CTE 复用了本表的去重逻辑
+--   本表仅输出曝光字段, base 表在此基础上 JOIN 分享/回流/B/C/D 链
+-- =====================================================================
+CREATE TABLE IF NOT EXISTS loghubods.dwd_recsys_alg_exposure_base_view_20250402
+(
+        dthh_id string 
+        ,dthh string
+        ,apptype string
+        ,uid string
+        ,mid string
+        ,vid string
+        ,sessionid string
+        ,subsessionid string
+        ,rootsessionid_new string
+        ,pagesource string
+        ,recommendlogvo  string-- 推荐算法的返回结果日志存在这个字段中
+        ,abcode  string-- 推荐算法的ab分组
+        ,recommendpagetype string -- 三种回流头部;两种下滑-沉浸页下滑和feed下滑
+        ,recomtraceid string
+        ,headvideoid string
+        ,rootsourceid string
+        ,hotsencetype string
+        ,animationSceneType string
+        ,extParams string  
+        ,flowpool string-- 14#68#3#1735262438476#2
+        ,level string
+        ,clientip string
+        ,machineinfo_brand string
+        ,machineinfo_model string 
+        ,machineinfo_system string 
+        ,machineinfo_wechatversion string
+        ,machineinfo_sdkversion string
+        ,province string
+        ,city string
+        ,versioncode string
+        , ts string
+        ,rn string
+        ,id  string
+)
+PARTITIONED BY 
+(
+    dt                         STRING COMMENT '日期:20240105'
+    ,hh                        STRING COMMENT '小时:04'
+)
+STORED AS ALIORC
+TBLPROPERTIES ('columnar.nested.type' = 'true','comment' = '推荐算法-labelmatch表-上游曝光表')
+LIFECYCLE 3
+;
+SET hive.exec.dynamic.partition = true
+;
+
+SET hive.exec.dynamic.partition.mode = nonstrict
+;
+
+
+INSERT OVERWRITE TABLE loghubods.dwd_recsys_alg_exposure_base_view_20250402 PARTITION (dt,hh)
+
+
+WITH t_exposure_base AS 
+(
+    SELECT  ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),subsessionid ORDER BY clienttimestamp DESC ) AS dthh_id
+            ,CONCAT(year,month,day,hour) AS dthh
+            ,apptype
+            ,uid
+            ,mid
+            ,videoid AS vid
+            ,sessionid
+            ,subsessionid
+            ,rootsessionid_new
+            ,pagesource
+            ,recommendlogvo -- 推荐算法的返回结果日志存在这个字段中
+            ,COALESCE(GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003'),"unknown") AS abcode -- 推荐算法的ab分组
+            ,GET_JSON_OBJECT(extparams,'$.recommendPageType') AS recommendpagetype -- 三种回流头部;两种下滑-沉浸页下滑和feed下滑
+            ,GET_JSON_OBJECT(extparams,'$.recomTraceId') AS recomtraceid
+            ,CASE   WHEN GET_JSON_OBJECT(extParams,'$.head_videoid') IS NOT NULL THEN GET_JSON_OBJECT(extParams,'$.head_videoid')
+                    ELSE GET_JSON_OBJECT(extParams,'$.head_videoId')
+            END AS headvideoid
+            ,GET_JSON_OBJECT(extParams,'$.rootSourceId') AS rootsourceid
+            ,COALESCE(hotsencetype,sencetype,"other") AS hotsencetype
+            ,GET_JSON_OBJECT(extParams,'$.animationSceneType') AS animationSceneType
+            ,extParams AS extParams
+            ,flowpool -- 14#68#3#1735262438476#2
+            ,SPLIT(flowpool,'#')[2] AS level
+            ,clientip
+            ,machineinfo_brand
+            ,machineinfo_model
+            ,machineinfo_system
+            ,machineinfo_wechatversion
+            ,machineinfo_sdkversion
+            ,ANALYSISIP(clientip,"region") AS province
+            ,ANALYSISIP(clientip,"city") AS city
+            ,versioncode
+            ,CAST(logtimestamp / 1000 AS BIGINT) AS ts
+            ,ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),apptype,uid,mid,videoid,sessionid,subsessionid,pagesource ORDER BY logtimestamp ) AS rn
+            ,CONCAT(year,month,day) AS dt
+            ,hour AS hh
+    FROM    loghubods.video_action_log_flow_new
+    WHERE   CONCAT(year,month,day,hour) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') --WHERE   CONCAT(year,month,day,hour) = TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 25),'YYYYMMDDHH')
+    AND     businesstype IN ('videoView')
+    AND     apptype IS NOT NULL
+    AND     apptype NOT IN ('12')
+    AND     mid IS NOT NULL
+    AND     videoid IS NOT NULL
+)
+SELECT  dthh_id
+        ,dthh
+        ,apptype
+        ,uid
+        ,mid
+        ,vid
+        ,sessionid
+        ,subsessionid
+        ,rootsessionid_new
+        ,pagesource
+        ,recommendlogvo
+        ,abcode
+        ,recommendpagetype
+        ,recomtraceid
+        ,headvideoid
+        ,rootsourceid
+        ,hotsencetype
+        ,animationscenetype
+        ,extparams
+        ,flowpool
+        ,level
+        ,clientip
+        ,machineinfo_brand
+        ,machineinfo_model
+        ,machineinfo_system
+        ,machineinfo_wechatversion
+        ,machineinfo_sdkversion
+        ,province
+        ,city
+        ,versioncode
+        ,ts
+        ,rn
+        ,CONCAT(dthh,":",subsessionid,":",dthh_id) AS id
+        ,dt
+        ,hh
+FROM    t_exposure_base
+WHERE   pagesource NOT REGEXP "-pages/user-videos-share$"
+AND     rn = 1
+UNION ALL
+SELECT  dthh_id
+        ,dthh
+        ,apptype
+        ,uid
+        ,mid
+        ,vid
+        ,sessionid
+        ,subsessionid
+        ,rootsessionid_new
+        ,pagesource
+        ,recommendlogvo
+        ,abcode
+        ,recommendpagetype
+        ,recomtraceid
+        ,headvideoid
+        ,rootsourceid
+        ,hotsencetype
+        ,animationscenetype
+        ,extparams
+        ,flowpool
+        ,level
+        ,clientip
+        ,machineinfo_brand
+        ,machineinfo_model
+        ,machineinfo_system
+        ,machineinfo_wechatversion
+        ,machineinfo_sdkversion
+        ,province
+        ,city
+        ,versioncode
+        ,ts
+        ,rn
+        ,CONCAT(dthh,":",subsessionid,":",dthh_id) AS id
+        ,dt
+        ,hh
+FROM    t_exposure_base
+WHERE   pagesource REGEXP "-pages/user-videos-share$"