| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- -- Task: 02_基于用户点击的CF特征_20250212 ID: 1023781551 Type: ODPS_SQL
- CREATE TABLE IF NOT EXISTS loghubods.vid_click_cf_feature_20250212
- (
- vid_a STRING COMMENT '视频ID'
- ,vid_b STRING COMMENT '视频ID'
- ,feature STRING COMMENT '特征JSON'
- )
- COMMENT '02_视频点击事件的CF特征,使用详情页曝光和回流页曝光当作点击事件'
- PARTITIONED BY
- (
- dt STRING COMMENT '天'
- ,hh STRING COMMENT '小时'
- )
- LIFECYCLE 30
- ;
- INSERT OVERWRITE TABLE loghubods.vid_click_cf_feature_20250212 PARTITION (dt = '${dt}',hh = '${hh}')
- WITH t_return AS
- (
- SELECT mid
- ,vid
- FROM loghubods.dwd_recsys_alg_exposure_base_20250108
- WHERE CONCAT(dt,hh) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 24 * 7),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH')
- AND apptype NOT IN ("5","19","22","23","12")
- AND page IN ("回流页","详情页")
- GROUP BY mid
- ,vid
- )
- ,t_vid_hot AS
- (
- SELECT vid
- ,COUNT(DISTINCT mid) AS hots
- FROM t_return
- GROUP BY vid
- )
- ,t_mid_hot AS
- (
- SELECT mid
- ,COUNT(DISTINCT vid) AS hots
- ,1 / LN(1 + COUNT(DISTINCT vid)) AS mid_hot_score
- FROM t_return
- GROUP BY mid
- )
- ,t_pair AS
- (
- SELECT ta.mid
- ,ta.vid AS vid_left
- ,tb.vid AS vid_right
- FROM t_return ta
- JOIN t_return tb
- ON ta.mid = tb.mid
- AND ta.vid <> tb.vid
- )
- ,t_pair_hot AS
- (
- SELECT vid_left
- ,vid_right
- ,COUNT(DISTINCT mid) AS onlines
- ,ROW_NUMBER() OVER (PARTITION BY vid_left ORDER BY COUNT(DISTINCT mid) + rand() / 10 DESC ) AS rn
- FROM t_pair
- GROUP BY vid_left
- ,vid_right
- )
- ,t_pair_candidate AS
- (
- SELECT ta.vid_left
- ,ta.vid_right
- ,ta.mid
- ,tb.onlines
- FROM t_pair ta
- JOIN (
- SELECT vid_left
- ,vid_right
- ,onlines
- FROM t_pair_hot
- WHERE rn <= 100
- ) tb
- ON ta.vid_left = tb.vid_left
- AND ta.vid_right = tb.vid_right
- )
- ,t_pair_candidate_score AS
- (
- SELECT ta.vid_left
- ,ta.vid_right
- ,ta.mid
- ,ta.onlines
- ,COALESCE(tb.mid_hot_score,0) AS mid_hot_score
- FROM t_pair_candidate ta
- LEFT JOIN t_mid_hot tb
- ON ta.mid = tb.mid
- )
- ,t_result AS
- (
- SELECT ta.vid_left
- ,ta.vid_right
- ,ta.onlines
- ,ta.mid_hot_score
- ,COALESCE(tb.hots,0) AS vid_left_hots
- ,COALESCE(tc.hots,0) AS vid_right_hots
- ,ta.mid_hot_score / (COALESCE(tb.hots,0) + COALESCE(tc.hots,0) - ta.onlines) AS score
- FROM (
- SELECT vid_left
- ,vid_right
- ,AVG(onlines) AS onlines
- ,SUM(mid_hot_score) AS mid_hot_score
- FROM t_pair_candidate_score
- GROUP BY vid_left
- ,vid_right
- ) ta
- LEFT JOIN t_vid_hot tb
- ON ta.vid_left = tb.vid
- LEFT JOIN t_vid_hot tc
- ON ta.vid_right = tc.vid
- )
- ,t_recall AS
- (
- SELECT vid_left AS vid_a
- ,vid_right AS vid_b
- ,JSON_FORMAT(
- JSON_OBJECT("score",CAST(ROUND(score,8) AS STRING),"rank",CAST(rn AS STRING),"onlines",CAST(onlines AS STRING))
- )
- FROM (
- SELECT *
- ,ROW_NUMBER() OVER (PARTITION BY vid_left ORDER BY score DESC ) AS rn
- FROM t_result
- )
- )SELECT *
- FROM t_recall
- ;
|