-- Task: 08_基于共线的用户分享CF特征 ID: 1017835016 Type: ODPS_SQL --@exclude_input=loghubods.user_share_log_flow --odps sql --********************************************************************-- --author:张博 --create time:2024-06-07 18:59:55 --********************************************************************-- -- select * from loghubods.alg_mid_feature_sharecf where dt = "20240606" and hh = "16" limit 100; CREATE TABLE IF NOT EXISTS loghubods.alg_mid_feature_sharecf ( mid STRING COMMENT '视频id' ,feature JSON COMMENT 'json格式的特征组合' ) COMMENT '基于视频共线CF特征的用户“分享”对应的vids' PARTITIONED BY ( dt STRING COMMENT '天' ,hh STRING COMMENT '小时' ) LIFECYCLE 30 ; INSERT OVERWRITE TABLE loghubods.alg_mid_feature_sharecf PARTITION (dt = '${dt}',hh = '${hh}') WITH t_origin AS ( SELECT apptype ,mid ,vid ,pagesource ,subsessionid ,MIN(clienttimestamp) AS clienttimestamp FROM ( SELECT __topic__ ,eventinfos ,apptype ,clickobjectid ,shareobjectid AS vid ,machinecode AS mid ,clienttimestamp ,pagesource ,parentpagesource ,parentrootpagesource ,shareid ,rootshareid ,subsessionid FROM loghubods.user_share_log_flow WHERE CONCAT(year,month,day,hour) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * ${hours_early}),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') AND __topic__ IN ('share') AND apptype NOT IN ('12') AND apptype IS NOT NULL AND shareobjectid IS NOT NULL AND machinecode IS NOT NULL AND machinecode != "" AND clienttimestamp IS NOT NULL AND rootPageSource NOT REGEXP '-pages/home' -- todo 加限制,保证是头部回流----非自分享回流 ) GROUP BY apptype ,mid ,vid ,pagesource ,shareid ,subsessionid ) ,t_share_cf AS ( SELECT vid ,JSON_FORMAT(feature) AS feature1 FROM loghubods.alg_vid_feature_cfshare WHERE dt = '${dt}' AND hh = '${hh}' ) ,t_return_cf AS ( SELECT vid ,JSON_FORMAT(feature) AS feature2 FROM loghubods.alg_vid_feature_cfreturn WHERE dt = '${dt}' AND hh = '${hh}' ) ,t_group AS ( SELECT mid ,COLLECT_LIST(feature1) AS feature1_list ,COUNT(feature1) AS cnt1 ,COLLECT_LIST(feature2) AS feature2_list ,COUNT(feature2) AS cnt2 FROM ( SELECT mid ,feature1 ,feature2 FROM t_origin t1 LEFT JOIN t_share_cf t2 ON t1.vid = t2.vid LEFT JOIN t_return_cf t3 ON t1.vid = t3.vid ) GROUP BY mid ) ,t_group_ AS ( SELECT mid ,IF(feature1_list IS NULL OR SIZE(feature1_list) == 0,NULL,loghubods.feature_from_list_json(feature1_list)) AS feature1 ,IF(feature2_list IS NULL OR SIZE(feature2_list) == 0,NULL,loghubods.feature_from_list_json(feature2_list)) AS feature2 ,cnt1 ,cnt2 FROM t_group )SELECT mid ,CASE WHEN feature1 IS NULL THEN JSON_OBJECT("return",feature2) WHEN feature2 IS NULL THEN JSON_OBJECT("share",feature1) ELSE JSON_OBJECT("share",feature1,"return",feature2) END AS feature FROM t_group_ WHERE feature1 IS NOT NULL AND feature2 IS NOT NULL