| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202 |
- --@exclude_input=loghubods.video_action_log_flow_new
- -- =====================================================================
- -- 上游曝光去重视图表 (行级, 每行 = 一次去重后的曝光)
- -- 版本: 20250402 (最早版本, 纯曝光去重, 无分享/回流)
- -- LIFECYCLE 3 (仅保留 3 天, 作为下游 base 表的输入中间层)
- -- =====================================================================
- --
- -- 数据源: video_action_log_flow_new (businesstype=videoView)
- -- 过滤: apptype NOT IN ('12'), mid/videoid IS NOT NULL
- --
- -- 去重逻辑:
- -- ROW_NUMBER OVER (PARTITION BY dthh,apptype,uid,mid,vid,sessionid,subsessionid,pagesource
- -- ORDER BY logtimestamp) = 1
- --
- -- 输出拆分 (UNION ALL):
- -- 非 share 页: pagesource NOT REGEXP "-pages/user-videos-share$", rn=1
- -- share 页: pagesource REGEXP "-pages/user-videos-share$" (不过滤 rn, 保留全部)
- --
- -- id 生成: CONCAT(dthh, ":", subsessionid, ":", dthh_id)
- -- dthh_id = ROW_NUMBER OVER (PARTITION BY dthh,subsessionid ORDER BY clienttimestamp DESC)
- --
- -- 与 base_20260209 的关系:
- -- base_20260209.t_exposure CTE 复用了本表的去重逻辑
- -- 本表仅输出曝光字段, base 表在此基础上 JOIN 分享/回流/B/C/D 链
- -- =====================================================================
- CREATE TABLE IF NOT EXISTS loghubods.dwd_recsys_alg_exposure_base_view_20250402
- (
- dthh_id string
- ,dthh string
- ,apptype string
- ,uid string
- ,mid string
- ,vid string
- ,sessionid string
- ,subsessionid string
- ,rootsessionid_new string
- ,pagesource string
- ,recommendlogvo string-- 推荐算法的返回结果日志存在这个字段中
- ,abcode string-- 推荐算法的ab分组
- ,recommendpagetype string -- 三种回流头部;两种下滑-沉浸页下滑和feed下滑
- ,recomtraceid string
- ,headvideoid string
- ,rootsourceid string
- ,hotsencetype string
- ,animationSceneType string
- ,extParams string
- ,flowpool string-- 14#68#3#1735262438476#2
- ,level string
- ,clientip string
- ,machineinfo_brand string
- ,machineinfo_model string
- ,machineinfo_system string
- ,machineinfo_wechatversion string
- ,machineinfo_sdkversion string
- ,province string
- ,city string
- ,versioncode string
- , ts string
- ,rn string
- ,id string
- )
- PARTITIONED BY
- (
- dt STRING COMMENT '日期:20240105'
- ,hh STRING COMMENT '小时:04'
- )
- STORED AS ALIORC
- TBLPROPERTIES ('columnar.nested.type' = 'true','comment' = '推荐算法-labelmatch表-上游曝光表')
- LIFECYCLE 3
- ;
- SET hive.exec.dynamic.partition = true
- ;
- SET hive.exec.dynamic.partition.mode = nonstrict
- ;
- INSERT OVERWRITE TABLE loghubods.dwd_recsys_alg_exposure_base_view_20250402 PARTITION (dt,hh)
- WITH t_exposure_base AS
- (
- SELECT ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),subsessionid ORDER BY clienttimestamp DESC ) AS dthh_id
- ,CONCAT(year,month,day,hour) AS dthh
- ,apptype
- ,uid
- ,mid
- ,videoid AS vid
- ,sessionid
- ,subsessionid
- ,rootsessionid_new
- ,pagesource
- ,recommendlogvo -- 推荐算法的返回结果日志存在这个字段中
- ,COALESCE(GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003'),"unknown") AS abcode -- 推荐算法的ab分组
- ,GET_JSON_OBJECT(extparams,'$.recommendPageType') AS recommendpagetype -- 三种回流头部;两种下滑-沉浸页下滑和feed下滑
- ,GET_JSON_OBJECT(extparams,'$.recomTraceId') AS recomtraceid
- ,CASE WHEN GET_JSON_OBJECT(extParams,'$.head_videoid') IS NOT NULL THEN GET_JSON_OBJECT(extParams,'$.head_videoid')
- ELSE GET_JSON_OBJECT(extParams,'$.head_videoId')
- END AS headvideoid
- ,GET_JSON_OBJECT(extParams,'$.rootSourceId') AS rootsourceid
- ,COALESCE(hotsencetype,sencetype,"other") AS hotsencetype
- ,GET_JSON_OBJECT(extParams,'$.animationSceneType') AS animationSceneType
- ,extParams AS extParams
- ,flowpool -- 14#68#3#1735262438476#2
- ,SPLIT(flowpool,'#')[2] AS level
- ,clientip
- ,machineinfo_brand
- ,machineinfo_model
- ,machineinfo_system
- ,machineinfo_wechatversion
- ,machineinfo_sdkversion
- ,ANALYSISIP(clientip,"region") AS province
- ,ANALYSISIP(clientip,"city") AS city
- ,versioncode
- ,CAST(logtimestamp / 1000 AS BIGINT) AS ts
- ,ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),apptype,uid,mid,videoid,sessionid,subsessionid,pagesource ORDER BY logtimestamp ) AS rn
- ,CONCAT(year,month,day) AS dt
- ,hour AS hh
- FROM loghubods.video_action_log_flow_new
- WHERE CONCAT(year,month,day,hour) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') --WHERE CONCAT(year,month,day,hour) = TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 25),'YYYYMMDDHH')
- AND businesstype IN ('videoView')
- AND apptype IS NOT NULL
- AND apptype NOT IN ('12')
- AND mid IS NOT NULL
- AND videoid IS NOT NULL
- )
- SELECT dthh_id
- ,dthh
- ,apptype
- ,uid
- ,mid
- ,vid
- ,sessionid
- ,subsessionid
- ,rootsessionid_new
- ,pagesource
- ,recommendlogvo
- ,abcode
- ,recommendpagetype
- ,recomtraceid
- ,headvideoid
- ,rootsourceid
- ,hotsencetype
- ,animationscenetype
- ,extparams
- ,flowpool
- ,level
- ,clientip
- ,machineinfo_brand
- ,machineinfo_model
- ,machineinfo_system
- ,machineinfo_wechatversion
- ,machineinfo_sdkversion
- ,province
- ,city
- ,versioncode
- ,ts
- ,rn
- ,CONCAT(dthh,":",subsessionid,":",dthh_id) AS id
- ,dt
- ,hh
- FROM t_exposure_base
- WHERE pagesource NOT REGEXP "-pages/user-videos-share$"
- AND rn = 1
- UNION ALL
- SELECT dthh_id
- ,dthh
- ,apptype
- ,uid
- ,mid
- ,vid
- ,sessionid
- ,subsessionid
- ,rootsessionid_new
- ,pagesource
- ,recommendlogvo
- ,abcode
- ,recommendpagetype
- ,recomtraceid
- ,headvideoid
- ,rootsourceid
- ,hotsencetype
- ,animationscenetype
- ,extparams
- ,flowpool
- ,level
- ,clientip
- ,machineinfo_brand
- ,machineinfo_model
- ,machineinfo_system
- ,machineinfo_wechatversion
- ,machineinfo_sdkversion
- ,province
- ,city
- ,versioncode
- ,ts
- ,rn
- ,CONCAT(dthh,":",subsessionid,":",dthh_id) AS id
- ,dt
- ,hh
- FROM t_exposure_base
- WHERE pagesource REGEXP "-pages/user-videos-share$"
|