loghubods.alg_mid_feature_sharecf.sql 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. -- Task: 08_基于共线的用户分享CF特征 ID: 1017835016 Type: ODPS_SQL
  2. --@exclude_input=loghubods.user_share_log_flow
  3. --odps sql
  4. --********************************************************************--
  5. --author:张博
  6. --create time:2024-06-07 18:59:55
  7. --********************************************************************--
  8. -- select * from loghubods.alg_mid_feature_sharecf where dt = "20240606" and hh = "16" limit 100;
  9. CREATE TABLE IF NOT EXISTS loghubods.alg_mid_feature_sharecf
  10. (
  11. mid STRING COMMENT '视频id'
  12. ,feature JSON COMMENT 'json格式的特征组合'
  13. )
  14. COMMENT '基于视频共线CF特征的用户“分享”对应的vids'
  15. PARTITIONED BY
  16. (
  17. dt STRING COMMENT '天'
  18. ,hh STRING COMMENT '小时'
  19. )
  20. LIFECYCLE 30
  21. ;
  22. INSERT OVERWRITE TABLE loghubods.alg_mid_feature_sharecf PARTITION (dt = '${dt}',hh = '${hh}')
  23. WITH t_origin AS
  24. (
  25. SELECT apptype
  26. ,mid
  27. ,vid
  28. ,pagesource
  29. ,subsessionid
  30. ,MIN(clienttimestamp) AS clienttimestamp
  31. FROM (
  32. SELECT __topic__
  33. ,eventinfos
  34. ,apptype
  35. ,clickobjectid
  36. ,shareobjectid AS vid
  37. ,machinecode AS mid
  38. ,clienttimestamp
  39. ,pagesource
  40. ,parentpagesource
  41. ,parentrootpagesource
  42. ,shareid
  43. ,rootshareid
  44. ,subsessionid
  45. FROM loghubods.user_share_log_flow
  46. WHERE CONCAT(year,month,day,hour) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * ${hours_early}),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH')
  47. AND __topic__ IN ('share')
  48. AND apptype NOT IN ('12')
  49. AND apptype IS NOT NULL
  50. AND shareobjectid IS NOT NULL
  51. AND machinecode IS NOT NULL
  52. AND machinecode != ""
  53. AND clienttimestamp IS NOT NULL
  54. AND rootPageSource NOT REGEXP '-pages/home' -- todo 加限制,保证是头部回流----非自分享回流
  55. )
  56. GROUP BY apptype
  57. ,mid
  58. ,vid
  59. ,pagesource
  60. ,shareid
  61. ,subsessionid
  62. )
  63. ,t_share_cf AS
  64. (
  65. SELECT vid
  66. ,JSON_FORMAT(feature) AS feature1
  67. FROM loghubods.alg_vid_feature_cfshare
  68. WHERE dt = '${dt}'
  69. AND hh = '${hh}'
  70. )
  71. ,t_return_cf AS
  72. (
  73. SELECT vid
  74. ,JSON_FORMAT(feature) AS feature2
  75. FROM loghubods.alg_vid_feature_cfreturn
  76. WHERE dt = '${dt}'
  77. AND hh = '${hh}'
  78. )
  79. ,t_group AS
  80. (
  81. SELECT mid
  82. ,COLLECT_LIST(feature1) AS feature1_list
  83. ,COUNT(feature1) AS cnt1
  84. ,COLLECT_LIST(feature2) AS feature2_list
  85. ,COUNT(feature2) AS cnt2
  86. FROM (
  87. SELECT mid
  88. ,feature1
  89. ,feature2
  90. FROM t_origin t1
  91. LEFT JOIN t_share_cf t2
  92. ON t1.vid = t2.vid
  93. LEFT JOIN t_return_cf t3
  94. ON t1.vid = t3.vid
  95. )
  96. GROUP BY mid
  97. )
  98. ,t_group_ AS
  99. (
  100. SELECT mid
  101. ,IF(feature1_list IS NULL OR SIZE(feature1_list) == 0,NULL,loghubods.feature_from_list_json(feature1_list)) AS feature1
  102. ,IF(feature2_list IS NULL OR SIZE(feature2_list) == 0,NULL,loghubods.feature_from_list_json(feature2_list)) AS feature2
  103. ,cnt1
  104. ,cnt2
  105. FROM t_group
  106. )SELECT mid
  107. ,CASE WHEN feature1 IS NULL THEN JSON_OBJECT("return",feature2)
  108. WHEN feature2 IS NULL THEN JSON_OBJECT("share",feature1)
  109. ELSE JSON_OBJECT("share",feature1,"return",feature2)
  110. END AS feature
  111. FROM t_group_
  112. WHERE feature1 IS NOT NULL
  113. AND feature2 IS NOT NULL