-- Task: 02_基于用户点击的CF特征_20250212 ID: 1023781551 Type: ODPS_SQL CREATE TABLE IF NOT EXISTS loghubods.vid_click_cf_feature_20250212 ( vid_a STRING COMMENT '视频ID' ,vid_b STRING COMMENT '视频ID' ,feature STRING COMMENT '特征JSON' ) COMMENT '02_视频点击事件的CF特征,使用详情页曝光和回流页曝光当作点击事件' PARTITIONED BY ( dt STRING COMMENT '天' ,hh STRING COMMENT '小时' ) LIFECYCLE 30 ; INSERT OVERWRITE TABLE loghubods.vid_click_cf_feature_20250212 PARTITION (dt = '${dt}',hh = '${hh}') WITH t_return AS ( SELECT mid ,vid FROM loghubods.dwd_recsys_alg_exposure_base_20250108 WHERE CONCAT(dt,hh) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 24 * 7),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') AND apptype NOT IN ("5","19","22","23","12") AND page IN ("回流页","详情页") GROUP BY mid ,vid ) ,t_vid_hot AS ( SELECT vid ,COUNT(DISTINCT mid) AS hots FROM t_return GROUP BY vid ) ,t_mid_hot AS ( SELECT mid ,COUNT(DISTINCT vid) AS hots ,1 / LN(1 + COUNT(DISTINCT vid)) AS mid_hot_score FROM t_return GROUP BY mid ) ,t_pair AS ( SELECT ta.mid ,ta.vid AS vid_left ,tb.vid AS vid_right FROM t_return ta JOIN t_return tb ON ta.mid = tb.mid AND ta.vid <> tb.vid ) ,t_pair_hot AS ( SELECT vid_left ,vid_right ,COUNT(DISTINCT mid) AS onlines ,ROW_NUMBER() OVER (PARTITION BY vid_left ORDER BY COUNT(DISTINCT mid) + rand() / 10 DESC ) AS rn FROM t_pair GROUP BY vid_left ,vid_right ) ,t_pair_candidate AS ( SELECT ta.vid_left ,ta.vid_right ,ta.mid ,tb.onlines FROM t_pair ta JOIN ( SELECT vid_left ,vid_right ,onlines FROM t_pair_hot WHERE rn <= 100 ) tb ON ta.vid_left = tb.vid_left AND ta.vid_right = tb.vid_right ) ,t_pair_candidate_score AS ( SELECT ta.vid_left ,ta.vid_right ,ta.mid ,ta.onlines ,COALESCE(tb.mid_hot_score,0) AS mid_hot_score FROM t_pair_candidate ta LEFT JOIN t_mid_hot tb ON ta.mid = tb.mid ) ,t_result AS ( SELECT ta.vid_left ,ta.vid_right ,ta.onlines ,ta.mid_hot_score ,COALESCE(tb.hots,0) AS vid_left_hots ,COALESCE(tc.hots,0) AS vid_right_hots ,ta.mid_hot_score / (COALESCE(tb.hots,0) + COALESCE(tc.hots,0) - ta.onlines) AS score FROM ( SELECT vid_left ,vid_right ,AVG(onlines) AS onlines ,SUM(mid_hot_score) AS mid_hot_score FROM t_pair_candidate_score GROUP BY vid_left ,vid_right ) ta LEFT JOIN t_vid_hot tb ON ta.vid_left = tb.vid LEFT JOIN t_vid_hot tc ON ta.vid_right = tc.vid ) ,t_recall AS ( SELECT vid_left AS vid_a ,vid_right AS vid_b ,JSON_FORMAT( JSON_OBJECT("score",CAST(ROUND(score,8) AS STRING),"rank",CAST(rn AS STRING),"onlines",CAST(onlines AS STRING)) ) FROM ( SELECT * ,ROW_NUMBER() OVER (PARTITION BY vid_left ORDER BY score DESC ) AS rn FROM t_result ) )SELECT * FROM t_recall ;