| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- -- Task: 08_基于共线的用户分享CF特征 ID: 1017835016 Type: ODPS_SQL
- --@exclude_input=loghubods.user_share_log_flow
- --odps sql
- --********************************************************************--
- --author:张博
- --create time:2024-06-07 18:59:55
- --********************************************************************--
- -- select * from loghubods.alg_mid_feature_sharecf where dt = "20240606" and hh = "16" limit 100;
- CREATE TABLE IF NOT EXISTS loghubods.alg_mid_feature_sharecf
- (
- mid STRING COMMENT '视频id'
- ,feature JSON COMMENT 'json格式的特征组合'
- )
- COMMENT '基于视频共线CF特征的用户“分享”对应的vids'
- PARTITIONED BY
- (
- dt STRING COMMENT '天'
- ,hh STRING COMMENT '小时'
- )
- LIFECYCLE 30
- ;
- INSERT OVERWRITE TABLE loghubods.alg_mid_feature_sharecf PARTITION (dt = '${dt}',hh = '${hh}')
- WITH t_origin AS
- (
- SELECT apptype
- ,mid
- ,vid
- ,pagesource
- ,subsessionid
- ,MIN(clienttimestamp) AS clienttimestamp
- FROM (
- SELECT __topic__
- ,eventinfos
- ,apptype
- ,clickobjectid
- ,shareobjectid AS vid
- ,machinecode AS mid
- ,clienttimestamp
- ,pagesource
- ,parentpagesource
- ,parentrootpagesource
- ,shareid
- ,rootshareid
- ,subsessionid
- FROM loghubods.user_share_log_flow
- WHERE CONCAT(year,month,day,hour) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * ${hours_early}),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH')
- AND __topic__ IN ('share')
- AND apptype NOT IN ('12')
- AND apptype IS NOT NULL
- AND shareobjectid IS NOT NULL
- AND machinecode IS NOT NULL
- AND machinecode != ""
- AND clienttimestamp IS NOT NULL
- AND rootPageSource NOT REGEXP '-pages/home' -- todo 加限制,保证是头部回流----非自分享回流
- )
- GROUP BY apptype
- ,mid
- ,vid
- ,pagesource
- ,shareid
- ,subsessionid
- )
- ,t_share_cf AS
- (
- SELECT vid
- ,JSON_FORMAT(feature) AS feature1
- FROM loghubods.alg_vid_feature_cfshare
- WHERE dt = '${dt}'
- AND hh = '${hh}'
- )
- ,t_return_cf AS
- (
- SELECT vid
- ,JSON_FORMAT(feature) AS feature2
- FROM loghubods.alg_vid_feature_cfreturn
- WHERE dt = '${dt}'
- AND hh = '${hh}'
- )
- ,t_group AS
- (
- SELECT mid
- ,COLLECT_LIST(feature1) AS feature1_list
- ,COUNT(feature1) AS cnt1
- ,COLLECT_LIST(feature2) AS feature2_list
- ,COUNT(feature2) AS cnt2
- FROM (
- SELECT mid
- ,feature1
- ,feature2
- FROM t_origin t1
- LEFT JOIN t_share_cf t2
- ON t1.vid = t2.vid
- LEFT JOIN t_return_cf t3
- ON t1.vid = t3.vid
- )
- GROUP BY mid
- )
- ,t_group_ AS
- (
- SELECT mid
- ,IF(feature1_list IS NULL OR SIZE(feature1_list) == 0,NULL,loghubods.feature_from_list_json(feature1_list)) AS feature1
- ,IF(feature2_list IS NULL OR SIZE(feature2_list) == 0,NULL,loghubods.feature_from_list_json(feature2_list)) AS feature2
- ,cnt1
- ,cnt2
- FROM t_group
- )SELECT mid
- ,CASE WHEN feature1 IS NULL THEN JSON_OBJECT("return",feature2)
- WHEN feature2 IS NULL THEN JSON_OBJECT("share",feature1)
- ELSE JSON_OBJECT("share",feature1,"return",feature2)
- END AS feature
- FROM t_group_
- WHERE feature1 IS NOT NULL
- AND feature2 IS NOT NULL
|