loghubods.vid_click_cf_feature_20250212.sql 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. -- Task: 02_基于用户点击的CF特征_20250212 ID: 1023781551 Type: ODPS_SQL
  2. CREATE TABLE IF NOT EXISTS loghubods.vid_click_cf_feature_20250212
  3. (
  4. vid_a STRING COMMENT '视频ID'
  5. ,vid_b STRING COMMENT '视频ID'
  6. ,feature STRING COMMENT '特征JSON'
  7. )
  8. COMMENT '02_视频点击事件的CF特征,使用详情页曝光和回流页曝光当作点击事件'
  9. PARTITIONED BY
  10. (
  11. dt STRING COMMENT '天'
  12. ,hh STRING COMMENT '小时'
  13. )
  14. LIFECYCLE 30
  15. ;
  16. INSERT OVERWRITE TABLE loghubods.vid_click_cf_feature_20250212 PARTITION (dt = '${dt}',hh = '${hh}')
  17. WITH t_return AS
  18. (
  19. SELECT mid
  20. ,vid
  21. FROM loghubods.dwd_recsys_alg_exposure_base_20250108
  22. WHERE CONCAT(dt,hh) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 24 * 7),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH')
  23. AND apptype NOT IN ("5","19","22","23","12")
  24. AND page IN ("回流页","详情页")
  25. GROUP BY mid
  26. ,vid
  27. )
  28. ,t_vid_hot AS
  29. (
  30. SELECT vid
  31. ,COUNT(DISTINCT mid) AS hots
  32. FROM t_return
  33. GROUP BY vid
  34. )
  35. ,t_mid_hot AS
  36. (
  37. SELECT mid
  38. ,COUNT(DISTINCT vid) AS hots
  39. ,1 / LN(1 + COUNT(DISTINCT vid)) AS mid_hot_score
  40. FROM t_return
  41. GROUP BY mid
  42. )
  43. ,t_pair AS
  44. (
  45. SELECT ta.mid
  46. ,ta.vid AS vid_left
  47. ,tb.vid AS vid_right
  48. FROM t_return ta
  49. JOIN t_return tb
  50. ON ta.mid = tb.mid
  51. AND ta.vid <> tb.vid
  52. )
  53. ,t_pair_hot AS
  54. (
  55. SELECT vid_left
  56. ,vid_right
  57. ,COUNT(DISTINCT mid) AS onlines
  58. ,ROW_NUMBER() OVER (PARTITION BY vid_left ORDER BY COUNT(DISTINCT mid) + rand() / 10 DESC ) AS rn
  59. FROM t_pair
  60. GROUP BY vid_left
  61. ,vid_right
  62. )
  63. ,t_pair_candidate AS
  64. (
  65. SELECT ta.vid_left
  66. ,ta.vid_right
  67. ,ta.mid
  68. ,tb.onlines
  69. FROM t_pair ta
  70. JOIN (
  71. SELECT vid_left
  72. ,vid_right
  73. ,onlines
  74. FROM t_pair_hot
  75. WHERE rn <= 100
  76. ) tb
  77. ON ta.vid_left = tb.vid_left
  78. AND ta.vid_right = tb.vid_right
  79. )
  80. ,t_pair_candidate_score AS
  81. (
  82. SELECT ta.vid_left
  83. ,ta.vid_right
  84. ,ta.mid
  85. ,ta.onlines
  86. ,COALESCE(tb.mid_hot_score,0) AS mid_hot_score
  87. FROM t_pair_candidate ta
  88. LEFT JOIN t_mid_hot tb
  89. ON ta.mid = tb.mid
  90. )
  91. ,t_result AS
  92. (
  93. SELECT ta.vid_left
  94. ,ta.vid_right
  95. ,ta.onlines
  96. ,ta.mid_hot_score
  97. ,COALESCE(tb.hots,0) AS vid_left_hots
  98. ,COALESCE(tc.hots,0) AS vid_right_hots
  99. ,ta.mid_hot_score / (COALESCE(tb.hots,0) + COALESCE(tc.hots,0) - ta.onlines) AS score
  100. FROM (
  101. SELECT vid_left
  102. ,vid_right
  103. ,AVG(onlines) AS onlines
  104. ,SUM(mid_hot_score) AS mid_hot_score
  105. FROM t_pair_candidate_score
  106. GROUP BY vid_left
  107. ,vid_right
  108. ) ta
  109. LEFT JOIN t_vid_hot tb
  110. ON ta.vid_left = tb.vid
  111. LEFT JOIN t_vid_hot tc
  112. ON ta.vid_right = tc.vid
  113. )
  114. ,t_recall AS
  115. (
  116. SELECT vid_left AS vid_a
  117. ,vid_right AS vid_b
  118. ,JSON_FORMAT(
  119. JSON_OBJECT("score",CAST(ROUND(score,8) AS STRING),"rank",CAST(rn AS STRING),"onlines",CAST(onlines AS STRING))
  120. )
  121. FROM (
  122. SELECT *
  123. ,ROW_NUMBER() OVER (PARTITION BY vid_left ORDER BY score DESC ) AS rn
  124. FROM t_result
  125. )
  126. )SELECT *
  127. FROM t_recall
  128. ;