--@exclude_input=loghubods.video_action_log_flow_new -- ===================================================================== -- 上游曝光去重视图表 (行级, 每行 = 一次去重后的曝光) -- 版本: 20250402 (最早版本, 纯曝光去重, 无分享/回流) -- LIFECYCLE 3 (仅保留 3 天, 作为下游 base 表的输入中间层) -- ===================================================================== -- -- 数据源: video_action_log_flow_new (businesstype=videoView) -- 过滤: apptype NOT IN ('12'), mid/videoid IS NOT NULL -- -- 去重逻辑: -- ROW_NUMBER OVER (PARTITION BY dthh,apptype,uid,mid,vid,sessionid,subsessionid,pagesource -- ORDER BY logtimestamp) = 1 -- -- 输出拆分 (UNION ALL): -- 非 share 页: pagesource NOT REGEXP "-pages/user-videos-share$", rn=1 -- share 页: pagesource REGEXP "-pages/user-videos-share$" (不过滤 rn, 保留全部) -- -- id 生成: CONCAT(dthh, ":", subsessionid, ":", dthh_id) -- dthh_id = ROW_NUMBER OVER (PARTITION BY dthh,subsessionid ORDER BY clienttimestamp DESC) -- -- 与 base_20260209 的关系: -- base_20260209.t_exposure CTE 复用了本表的去重逻辑 -- 本表仅输出曝光字段, base 表在此基础上 JOIN 分享/回流/B/C/D 链 -- ===================================================================== CREATE TABLE IF NOT EXISTS loghubods.dwd_recsys_alg_exposure_base_view_20250402 ( dthh_id string ,dthh string ,apptype string ,uid string ,mid string ,vid string ,sessionid string ,subsessionid string ,rootsessionid_new string ,pagesource string ,recommendlogvo string-- 推荐算法的返回结果日志存在这个字段中 ,abcode string-- 推荐算法的ab分组 ,recommendpagetype string -- 三种回流头部;两种下滑-沉浸页下滑和feed下滑 ,recomtraceid string ,headvideoid string ,rootsourceid string ,hotsencetype string ,animationSceneType string ,extParams string ,flowpool string-- 14#68#3#1735262438476#2 ,level string ,clientip string ,machineinfo_brand string ,machineinfo_model string ,machineinfo_system string ,machineinfo_wechatversion string ,machineinfo_sdkversion string ,province string ,city string ,versioncode string , ts string ,rn string ,id string ) PARTITIONED BY ( dt STRING COMMENT '日期:20240105' ,hh STRING COMMENT '小时:04' ) STORED AS ALIORC TBLPROPERTIES ('columnar.nested.type' = 'true','comment' = '推荐算法-labelmatch表-上游曝光表') LIFECYCLE 3 ; SET hive.exec.dynamic.partition = true ; SET hive.exec.dynamic.partition.mode = nonstrict ; INSERT OVERWRITE TABLE loghubods.dwd_recsys_alg_exposure_base_view_20250402 PARTITION (dt,hh) WITH t_exposure_base AS ( SELECT ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),subsessionid ORDER BY clienttimestamp DESC ) AS dthh_id ,CONCAT(year,month,day,hour) AS dthh ,apptype ,uid ,mid ,videoid AS vid ,sessionid ,subsessionid ,rootsessionid_new ,pagesource ,recommendlogvo -- 推荐算法的返回结果日志存在这个字段中 ,COALESCE(GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003'),"unknown") AS abcode -- 推荐算法的ab分组 ,GET_JSON_OBJECT(extparams,'$.recommendPageType') AS recommendpagetype -- 三种回流头部;两种下滑-沉浸页下滑和feed下滑 ,GET_JSON_OBJECT(extparams,'$.recomTraceId') AS recomtraceid ,CASE WHEN GET_JSON_OBJECT(extParams,'$.head_videoid') IS NOT NULL THEN GET_JSON_OBJECT(extParams,'$.head_videoid') ELSE GET_JSON_OBJECT(extParams,'$.head_videoId') END AS headvideoid ,GET_JSON_OBJECT(extParams,'$.rootSourceId') AS rootsourceid ,COALESCE(hotsencetype,sencetype,"other") AS hotsencetype ,GET_JSON_OBJECT(extParams,'$.animationSceneType') AS animationSceneType ,extParams AS extParams ,flowpool -- 14#68#3#1735262438476#2 ,SPLIT(flowpool,'#')[2] AS level ,clientip ,machineinfo_brand ,machineinfo_model ,machineinfo_system ,machineinfo_wechatversion ,machineinfo_sdkversion ,ANALYSISIP(clientip,"region") AS province ,ANALYSISIP(clientip,"city") AS city ,versioncode ,CAST(logtimestamp / 1000 AS BIGINT) AS ts ,ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),apptype,uid,mid,videoid,sessionid,subsessionid,pagesource ORDER BY logtimestamp ) AS rn ,CONCAT(year,month,day) AS dt ,hour AS hh FROM loghubods.video_action_log_flow_new WHERE CONCAT(year,month,day,hour) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') --WHERE CONCAT(year,month,day,hour) = TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 25),'YYYYMMDDHH') AND businesstype IN ('videoView') AND apptype IS NOT NULL AND apptype NOT IN ('12') AND mid IS NOT NULL AND videoid IS NOT NULL ) SELECT dthh_id ,dthh ,apptype ,uid ,mid ,vid ,sessionid ,subsessionid ,rootsessionid_new ,pagesource ,recommendlogvo ,abcode ,recommendpagetype ,recomtraceid ,headvideoid ,rootsourceid ,hotsencetype ,animationscenetype ,extparams ,flowpool ,level ,clientip ,machineinfo_brand ,machineinfo_model ,machineinfo_system ,machineinfo_wechatversion ,machineinfo_sdkversion ,province ,city ,versioncode ,ts ,rn ,CONCAT(dthh,":",subsessionid,":",dthh_id) AS id ,dt ,hh FROM t_exposure_base WHERE pagesource NOT REGEXP "-pages/user-videos-share$" AND rn = 1 UNION ALL SELECT dthh_id ,dthh ,apptype ,uid ,mid ,vid ,sessionid ,subsessionid ,rootsessionid_new ,pagesource ,recommendlogvo ,abcode ,recommendpagetype ,recomtraceid ,headvideoid ,rootsourceid ,hotsencetype ,animationscenetype ,extparams ,flowpool ,level ,clientip ,machineinfo_brand ,machineinfo_model ,machineinfo_system ,machineinfo_wechatversion ,machineinfo_sdkversion ,province ,city ,versioncode ,ts ,rn ,CONCAT(dthh,":",subsessionid,":",dthh_id) AS id ,dt ,hh FROM t_exposure_base WHERE pagesource REGEXP "-pages/user-videos-share$"