loghubods.alg_vid_feature_cfshare.sql 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. -- Task: 02_用户连续分享CF ID: 1017832518 Type: ODPS_SQL
  2. --@exclude_input=loghubods.user_share_log_flow
  3. --odps sql
  4. --********************************************************************--
  5. --author:张博
  6. --create time:2024-06-06 13:27:20
  7. --********************************************************************--
  8. -- select * from loghubods.alg_vid_feature_cfshare where dt = "20240606" and hh = "16";
  9. CREATE TABLE IF NOT EXISTS loghubods.alg_vid_feature_cfshare
  10. (
  11. vid STRING COMMENT '视频id'
  12. ,feature JSON COMMENT 'json格式的特征组合'
  13. )
  14. COMMENT '视频特征-用户连续分享CF'
  15. PARTITIONED BY
  16. (
  17. dt STRING COMMENT '天'
  18. ,hh STRING COMMENT '小时'
  19. )
  20. LIFECYCLE 30
  21. ;
  22. INSERT OVERWRITE TABLE loghubods.alg_vid_feature_cfshare PARTITION (dt = '${dt}',hh = '${hh}')
  23. WITH t_origin AS
  24. (
  25. -- 一次曝光的多次分享,只保留最早的一次。
  26. SELECT apptype
  27. ,mid
  28. ,vid
  29. ,pagesource
  30. ,subsessionid
  31. ,shareid
  32. ,MIN(clienttimestamp) AS ts
  33. ,UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) AS ts_now
  34. ,UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - MIN(clienttimestamp) / 1000 AS ts_diff
  35. FROM (
  36. SELECT __topic__
  37. ,eventinfos
  38. ,apptype
  39. ,clickobjectid
  40. ,shareobjectid AS vid
  41. ,machinecode AS mid
  42. ,clienttimestamp
  43. ,pagesource
  44. ,parentpagesource
  45. ,parentrootpagesource
  46. ,shareid
  47. ,rootshareid
  48. ,subsessionid
  49. FROM loghubods.user_share_log_flow
  50. WHERE CONCAT(year,month,day,hour) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * ${hours_early}),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH')
  51. AND __topic__ IN ('share')
  52. AND apptype NOT IN ('12')
  53. AND apptype IS NOT NULL
  54. AND shareobjectid IS NOT NULL
  55. AND machinecode IS NOT NULL
  56. AND machinecode != ""
  57. AND clienttimestamp IS NOT NULL
  58. )
  59. GROUP BY apptype
  60. ,mid
  61. ,vid
  62. ,pagesource
  63. ,subsessionid
  64. ,shareid
  65. ) -- SELECT *
  66. -- FROM t_share_
  67. -- limit 999
  68. -- ;
  69. -- SELECT vid2vid_cf_py("14824141:1,21175605:2,21175605:3,21256335:0")
  70. ,t_cf AS
  71. (
  72. SELECT loghubods.vid2vid_cf_py(vid_ts) AS (cf_a,cf_b)
  73. FROM (
  74. SELECT mid
  75. ,CONCAT_WS(',',COLLECT_SET(CONCAT(vid,":",CAST(ts / 1000 AS BIGINT)))) AS vid_ts
  76. ,COUNT(1) AS cnt
  77. FROM (
  78. SELECT mid
  79. ,vid
  80. ,ts
  81. FROM t_origin
  82. WHERE mid IS NOT NULL
  83. AND mid <> ""
  84. AND vid IS NOT NULL
  85. AND vid <> "0"
  86. AND vid <> ""
  87. AND ts IS NOT NULL
  88. )
  89. GROUP BY mid
  90. HAVING COUNT(1) > 1
  91. AND SIZE(COLLECT_SET(vid)) > 1
  92. )
  93. ) -- SELECT *
  94. -- FROM t_cf
  95. -- ;
  96. ,t_score AS
  97. (
  98. SELECT a.cf_a
  99. ,a.cf_b
  100. ,a.cnt AS cnt_ab
  101. ,b.cnt AS cnt_a
  102. ,ROUND(a.cnt / b.cnt,6) AS score
  103. ,ROW_NUMBER() OVER (PARTITION BY a.cf_a ORDER BY a.cnt / b.cnt DESC ) AS rank
  104. FROM (
  105. SELECT cf_a
  106. ,cf_b
  107. ,COUNT(1) AS cnt
  108. FROM t_cf
  109. GROUP BY cf_a
  110. ,cf_b
  111. ) a
  112. JOIN (
  113. SELECT cf_a
  114. ,COUNT(1) AS cnt
  115. FROM t_cf
  116. GROUP BY cf_a
  117. ORDER BY COUNT(1) DESC
  118. LIMIT ${key_limit}
  119. ) b
  120. ON a.cf_a = b.cf_a
  121. )
  122. ,t_recall AS
  123. (
  124. SELECT cf_a AS vid -- ,CONCAT_WS(',',COLLECT_LIST(cf_b)) AS videoid_arr
  125. -- ,CONCAT_WS(',',COLLECT_LIST(CAST(score AS STRING))) AS score_arr
  126. -- ,CONCAT_WS(',',COLLECT_LIST(CAST(cnt_ab AS STRING))) AS cnt_ab_arr
  127. -- ,CONCAT_WS(',',COLLECT_LIST(CAST(cnt_a AS STRING))) AS cnt_a_arr
  128. ,JSON_OBJECT("videoid_arr",CONCAT_WS(',',COLLECT_LIST(cf_b)),"score_arr",CONCAT_WS(',',COLLECT_LIST(CAST(score AS STRING))),"cnt_ab_arr",CONCAT_WS(',',COLLECT_LIST(CAST(cnt_ab AS STRING))),"cnt_a_arr",CONCAT_WS(',',COLLECT_LIST(CAST(cnt_a AS STRING)))) AS feature
  129. FROM (
  130. SELECT cf_a
  131. ,cf_b
  132. ,cnt_ab
  133. ,cnt_a
  134. ,score
  135. ,rank
  136. FROM t_score
  137. WHERE rank <= ${rank_limit}
  138. ORDER BY rank ASC,CAST(cf_b AS BIGINT) DESC
  139. )
  140. GROUP BY cf_a
  141. ORDER BY SUM(cnt_a) DESC
  142. )
  143. SELECT *
  144. FROM t_recall