loghubods.dwd_recsys_alg_exposure_base_view_20250402.sql 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202
  1. --@exclude_input=loghubods.video_action_log_flow_new
  2. -- =====================================================================
  3. -- 上游曝光去重视图表 (行级, 每行 = 一次去重后的曝光)
  4. -- 版本: 20250402 (最早版本, 纯曝光去重, 无分享/回流)
  5. -- LIFECYCLE 3 (仅保留 3 天, 作为下游 base 表的输入中间层)
  6. -- =====================================================================
  7. --
  8. -- 数据源: video_action_log_flow_new (businesstype=videoView)
  9. -- 过滤: apptype NOT IN ('12'), mid/videoid IS NOT NULL
  10. --
  11. -- 去重逻辑:
  12. -- ROW_NUMBER OVER (PARTITION BY dthh,apptype,uid,mid,vid,sessionid,subsessionid,pagesource
  13. -- ORDER BY logtimestamp) = 1
  14. --
  15. -- 输出拆分 (UNION ALL):
  16. -- 非 share 页: pagesource NOT REGEXP "-pages/user-videos-share$", rn=1
  17. -- share 页: pagesource REGEXP "-pages/user-videos-share$" (不过滤 rn, 保留全部)
  18. --
  19. -- id 生成: CONCAT(dthh, ":", subsessionid, ":", dthh_id)
  20. -- dthh_id = ROW_NUMBER OVER (PARTITION BY dthh,subsessionid ORDER BY clienttimestamp DESC)
  21. --
  22. -- 与 base_20260209 的关系:
  23. -- base_20260209.t_exposure CTE 复用了本表的去重逻辑
  24. -- 本表仅输出曝光字段, base 表在此基础上 JOIN 分享/回流/B/C/D 链
  25. -- =====================================================================
  26. CREATE TABLE IF NOT EXISTS loghubods.dwd_recsys_alg_exposure_base_view_20250402
  27. (
  28. dthh_id string
  29. ,dthh string
  30. ,apptype string
  31. ,uid string
  32. ,mid string
  33. ,vid string
  34. ,sessionid string
  35. ,subsessionid string
  36. ,rootsessionid_new string
  37. ,pagesource string
  38. ,recommendlogvo string-- 推荐算法的返回结果日志存在这个字段中
  39. ,abcode string-- 推荐算法的ab分组
  40. ,recommendpagetype string -- 三种回流头部;两种下滑-沉浸页下滑和feed下滑
  41. ,recomtraceid string
  42. ,headvideoid string
  43. ,rootsourceid string
  44. ,hotsencetype string
  45. ,animationSceneType string
  46. ,extParams string
  47. ,flowpool string-- 14#68#3#1735262438476#2
  48. ,level string
  49. ,clientip string
  50. ,machineinfo_brand string
  51. ,machineinfo_model string
  52. ,machineinfo_system string
  53. ,machineinfo_wechatversion string
  54. ,machineinfo_sdkversion string
  55. ,province string
  56. ,city string
  57. ,versioncode string
  58. , ts string
  59. ,rn string
  60. ,id string
  61. )
  62. PARTITIONED BY
  63. (
  64. dt STRING COMMENT '日期:20240105'
  65. ,hh STRING COMMENT '小时:04'
  66. )
  67. STORED AS ALIORC
  68. TBLPROPERTIES ('columnar.nested.type' = 'true','comment' = '推荐算法-labelmatch表-上游曝光表')
  69. LIFECYCLE 3
  70. ;
  71. SET hive.exec.dynamic.partition = true
  72. ;
  73. SET hive.exec.dynamic.partition.mode = nonstrict
  74. ;
  75. INSERT OVERWRITE TABLE loghubods.dwd_recsys_alg_exposure_base_view_20250402 PARTITION (dt,hh)
  76. WITH t_exposure_base AS
  77. (
  78. SELECT ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),subsessionid ORDER BY clienttimestamp DESC ) AS dthh_id
  79. ,CONCAT(year,month,day,hour) AS dthh
  80. ,apptype
  81. ,uid
  82. ,mid
  83. ,videoid AS vid
  84. ,sessionid
  85. ,subsessionid
  86. ,rootsessionid_new
  87. ,pagesource
  88. ,recommendlogvo -- 推荐算法的返回结果日志存在这个字段中
  89. ,COALESCE(GET_JSON_OBJECT(extparams,'$.eventInfos.ab_test003'),"unknown") AS abcode -- 推荐算法的ab分组
  90. ,GET_JSON_OBJECT(extparams,'$.recommendPageType') AS recommendpagetype -- 三种回流头部;两种下滑-沉浸页下滑和feed下滑
  91. ,GET_JSON_OBJECT(extparams,'$.recomTraceId') AS recomtraceid
  92. ,CASE WHEN GET_JSON_OBJECT(extParams,'$.head_videoid') IS NOT NULL THEN GET_JSON_OBJECT(extParams,'$.head_videoid')
  93. ELSE GET_JSON_OBJECT(extParams,'$.head_videoId')
  94. END AS headvideoid
  95. ,GET_JSON_OBJECT(extParams,'$.rootSourceId') AS rootsourceid
  96. ,COALESCE(hotsencetype,sencetype,"other") AS hotsencetype
  97. ,GET_JSON_OBJECT(extParams,'$.animationSceneType') AS animationSceneType
  98. ,extParams AS extParams
  99. ,flowpool -- 14#68#3#1735262438476#2
  100. ,SPLIT(flowpool,'#')[2] AS level
  101. ,clientip
  102. ,machineinfo_brand
  103. ,machineinfo_model
  104. ,machineinfo_system
  105. ,machineinfo_wechatversion
  106. ,machineinfo_sdkversion
  107. ,ANALYSISIP(clientip,"region") AS province
  108. ,ANALYSISIP(clientip,"city") AS city
  109. ,versioncode
  110. ,CAST(logtimestamp / 1000 AS BIGINT) AS ts
  111. ,ROW_NUMBER() OVER (PARTITION BY CONCAT(year,month,day,hour),apptype,uid,mid,videoid,sessionid,subsessionid,pagesource ORDER BY logtimestamp ) AS rn
  112. ,CONCAT(year,month,day) AS dt
  113. ,hour AS hh
  114. FROM loghubods.video_action_log_flow_new
  115. WHERE CONCAT(year,month,day,hour) BETWEEN TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') AND TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 1),'YYYYMMDDHH') --WHERE CONCAT(year,month,day,hour) = TO_CHAR(FROM_UNIXTIME(UNIX_TIMESTAMP(TO_DATE('${dt}${hh}','YYYYMMDDHH')) - 3600 * 25),'YYYYMMDDHH')
  116. AND businesstype IN ('videoView')
  117. AND apptype IS NOT NULL
  118. AND apptype NOT IN ('12')
  119. AND mid IS NOT NULL
  120. AND videoid IS NOT NULL
  121. )
  122. SELECT dthh_id
  123. ,dthh
  124. ,apptype
  125. ,uid
  126. ,mid
  127. ,vid
  128. ,sessionid
  129. ,subsessionid
  130. ,rootsessionid_new
  131. ,pagesource
  132. ,recommendlogvo
  133. ,abcode
  134. ,recommendpagetype
  135. ,recomtraceid
  136. ,headvideoid
  137. ,rootsourceid
  138. ,hotsencetype
  139. ,animationscenetype
  140. ,extparams
  141. ,flowpool
  142. ,level
  143. ,clientip
  144. ,machineinfo_brand
  145. ,machineinfo_model
  146. ,machineinfo_system
  147. ,machineinfo_wechatversion
  148. ,machineinfo_sdkversion
  149. ,province
  150. ,city
  151. ,versioncode
  152. ,ts
  153. ,rn
  154. ,CONCAT(dthh,":",subsessionid,":",dthh_id) AS id
  155. ,dt
  156. ,hh
  157. FROM t_exposure_base
  158. WHERE pagesource NOT REGEXP "-pages/user-videos-share$"
  159. AND rn = 1
  160. UNION ALL
  161. SELECT dthh_id
  162. ,dthh
  163. ,apptype
  164. ,uid
  165. ,mid
  166. ,vid
  167. ,sessionid
  168. ,subsessionid
  169. ,rootsessionid_new
  170. ,pagesource
  171. ,recommendlogvo
  172. ,abcode
  173. ,recommendpagetype
  174. ,recomtraceid
  175. ,headvideoid
  176. ,rootsourceid
  177. ,hotsencetype
  178. ,animationscenetype
  179. ,extparams
  180. ,flowpool
  181. ,level
  182. ,clientip
  183. ,machineinfo_brand
  184. ,machineinfo_model
  185. ,machineinfo_system
  186. ,machineinfo_wechatversion
  187. ,machineinfo_sdkversion
  188. ,province
  189. ,city
  190. ,versioncode
  191. ,ts
  192. ,rn
  193. ,CONCAT(dthh,":",subsessionid,":",dthh_id) AS id
  194. ,dt
  195. ,hh
  196. FROM t_exposure_base
  197. WHERE pagesource REGEXP "-pages/user-videos-share$"