de.sql 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722
  1. -- =====================================================================
  2. -- 用户拉活-曝光-拉回分析表 (recommend_distribution_data_total)
  3. -- 维度: 用户拉活量分层 × 进入小时 × 进入场景 × 进入内容品类 × 推荐内容品类 × 内容id
  4. -- 粒度: GROUP BY CUBE 全维度组合 + SUM 汇总
  5. -- =====================================================================
  6. -- DROP TABLE IF EXISTS loghubods.recommend_distribution_data_total;
  7. CREATE TABLE IF NOT EXISTS loghubods.recommend_distribution_data_total (
  8. `用户拉活量分层` STRING COMMENT '用户拉活量分层(R0&新用户/R1-50等,汇总为SUM)',
  9. `进入小时` STRING COMMENT '进入小时(1-4/5-8等,汇总为SUM)',
  10. `进入场景` STRING COMMENT '进入场景(业务场景描述,汇总为SUM)',
  11. `进入内容品类` STRING COMMENT '进入内容品类(merge_leve2维度,汇总为SUM)',
  12. `推荐内容品类` STRING COMMENT '推荐内容品类(merge_leve2维度,汇总为SUM)',
  13. `内容id` STRING COMMENT '内容id(videoid,其他为兜底,汇总为SUM)',
  14. -- ===== 基础流量指标 =====
  15. `访问人数` BIGINT COMMENT '去重访问人数(machinecode维度)',
  16. `曝光人数` BIGINT COMMENT '去重曝光人数(mid维度)',
  17. `曝光人数比访问人数` DOUBLE COMMENT '曝光人数/访问人数 比值',
  18. `曝光次数` BIGINT COMMENT '曝光总次数(mid计数)',
  19. `曝光次数人均` DOUBLE COMMENT '曝光次数/去重曝光人数 人均曝光次数',
  20. `视频个数` BIGINT COMMENT '去重曝光视频个数(vid维度)',
  21. `单视频曝光次数` DOUBLE COMMENT '曝光次数/去重视频个数 单视频平均曝光次数',
  22. -- ===== 全链路拉回 =====
  23. `ALL拉回n层` BIGINT COMMENT '全量拉回n层人数(多来源拉回人数求和)',
  24. `ALL拉回n层比曝光次数` DOUBLE COMMENT '全量拉回n层人数/曝光次数 比值,保留6位小数',
  25. -- ===== 原生拉回 & rovn =====
  26. `return_n_uv_noself` BIGINT COMMENT '原生n层拉回人数(无自返)',
  27. `rovn` DOUBLE COMMENT '原生n层拉回人数/曝光次数 比值,保留6位小数',
  28. `rovn_pred` DOUBLE COMMENT 'rovn预估值(str_pred*rosn_pred平均值),保留6位小数',
  29. `rovn_copc` DOUBLE COMMENT 'rovn实际/rovn预估 比值,保留4位小数',
  30. -- ===== STR 指标 =====
  31. `str_real` DOUBLE COMMENT 'str实际值(is_return_noself求和/曝光次数),保留6位小数',
  32. `str_pred` DOUBLE COMMENT 'str预估值(str_pred求和/曝光次数),保留6位小数',
  33. `str_copc` DOUBLE COMMENT 'str实际/str预估 比值,保留4位小数',
  34. `str_MAE` DOUBLE COMMENT 'str预估与实际的平均绝对误差,保留6位小数',
  35. `STR_VAR` DOUBLE COMMENT 'str预估与实际差值的方差,保留6位小数',
  36. `strauc` DOUBLE COMMENT 'str AUC,保留6位小数',
  37. `str多维加权copc` DOUBLE COMMENT 'str多维加权copc,保留6位小数',
  38. -- ===== ROSN 指标 =====
  39. `rosn_real` DOUBLE COMMENT 'rosn实际值(原生n层拉回人数/is_return_noself求和),保留6位小数',
  40. `rosn_pred` DOUBLE COMMENT 'rosn预估值(rosn_pred求和/曝光次数),保留6位小数',
  41. `rosn_copc` DOUBLE COMMENT 'rosn实际/rosn预估 比值,保留4位小数',
  42. `rosn_实际预估diff绝对距离` DOUBLE COMMENT '拉回时rosn预估与实际的平均绝对距离,保留6位小数',
  43. `rosn_实际预估高低估分布比例` DOUBLE COMMENT '拉回时rosn预估与实际差值的方差,保留6位小数',
  44. -- ===== B链 (分享→点击) =====
  45. `Buv1层拉回人数` BIGINT COMMENT 'B端uv1层拉回人数(is_return_noself求和)',
  46. `rov0` DOUBLE COMMENT 'B端uv1层拉回人数/曝光次数 比值,保留6位小数',
  47. `rov0预估` BIGINT COMMENT 'rov0预估值(固定为0)',
  48. `rov0copc` BIGINT COMMENT 'rov0copc值(固定为0)',
  49. `ros0` DOUBLE COMMENT 'ros0实际值(B端uv1层拉回人数/分享次数),保留6位小数',
  50. `ros0预估` BIGINT COMMENT 'ros0预估值(固定为0)',
  51. `ros0copc` BIGINT COMMENT 'ros0copc值(固定为0)',
  52. `Bnuvn层拉回人数` BIGINT COMMENT 'B端nuvn层拉回人数(原生n层拉回人数一致)',
  53. `rorn_b` DOUBLE COMMENT 'B端n层拉回率(Bnuvn/uv1层拉回人数),保留6位小数',
  54. `ror1单层_b` DOUBLE COMMENT 'B端1层拉回率(b1/uv1层拉回人数),保留6位小数',
  55. `ror2单层_b` DOUBLE COMMENT 'B端2层拉回率(b2/b1),保留6位小数',
  56. -- ===== B链 T+1 =====
  57. `Bnuvn层拉回人数_t1` DOUBLE COMMENT 'T1维度B端nuvn层拉回人数,保留6位小数',
  58. `rorn_b_t1` DOUBLE COMMENT 'T1维度B端n层拉回率,保留6位小数',
  59. `ror1单层_b_t1` DOUBLE COMMENT 'T1维度B端1层拉回率,保留6位小数',
  60. `ror2单层_b_t1` DOUBLE COMMENT 'T1维度B端2层拉回率,保留6位小数',
  61. -- ===== C链 (二次分享→点击) =====
  62. `cn` BIGINT COMMENT 'C端拉回n层人数',
  63. `rorn_c` DOUBLE COMMENT 'C端n层拉回率(cn/原生n层拉回人数)',
  64. `ror1_c` DOUBLE COMMENT 'C端1层拉回率(c/原生n层拉回人数)',
  65. `vor单层_c` DOUBLE COMMENT 'C端单层视频访问率(cviews/原生n层拉回人数)',
  66. `vor单层预估` BIGINT COMMENT 'C端单层视频访问率预估(固定为0)',
  67. `vor单层copc` BIGINT COMMENT 'C端单层视频访问率copc(固定为0)',
  68. -- ===== D链 (session内后续曝光传播) =====
  69. `a` BIGINT COMMENT 'D端去重访问人数(machinecode维度)',
  70. `dn` BIGINT COMMENT 'D端拉回n层人数',
  71. `rorn_d` DOUBLE COMMENT 'D端n层拉回率(dn/D端去重访问人数),保留6位小数',
  72. `ror1_d` DOUBLE COMMENT 'D端1层拉回率(d1/D端去重访问人数),保留6位小数',
  73. `vor_d` DOUBLE COMMENT 'D端视频访问率(g.mid计数/D端去重访问人数)'
  74. )
  75. COMMENT '用户拉活-曝光-拉回分析表'
  76. PARTITIONED BY (
  77. dt STRING COMMENT '统计日期,格式yyyyMMdd'
  78. );
  79. -- =====================================================================
  80. -- CTE: 基础样本提取 + 特征工程
  81. -- =====================================================================
  82. WITH
  83. -- [1] 原始样本: 从推荐算法样本表取数,解析 scoresMap
  84. t_raw AS (
  85. SELECT *
  86. ,REPLACE(GET_JSON_OBJECT(extend_alg, '$.scoresMap'), "\\", "") AS scoresmap
  87. ,CASE
  88. WHEN page IN ("回流后沉浸页&内页feed", "详情后沉浸页", "首页feed", "详情页") THEN "推荐"
  89. WHEN page IN ("回流页", "其他") THEN "非推荐"
  90. ELSE "其他"
  91. END AS page_type
  92. FROM loghubods.dwd_recsys_alg_sample_all_20250212
  93. WHERE dt = '${bizdate}'
  94. -- AND apptype IN ("0","4")
  95. AND abcode IN ("ab0","ab1","ab2","ab3","ab4","ab5","ab6","ab7","ab8","ab9")
  96. AND abcode NOT IN ("ab100")
  97. AND extend_alg IS NOT NULL
  98. AND GET_JSON_OBJECT(extend_alg, '$.scoresMap') IS NOT NULL
  99. )
  100. -- [2] 过滤: 只保留推荐页面
  101. ,t_filtered AS (
  102. SELECT *
  103. FROM t_raw
  104. WHERE page_type = "推荐"
  105. )
  106. -- [3] 特征提取: 维度映射 + 模型分数解析
  107. ,t_base AS (
  108. SELECT dt
  109. ,apptype
  110. ,CASE
  111. WHEN apptype IN ("4") AND abcode IN ("ab0","ab1") THEN "实验组-先验地域降权"
  112. WHEN apptype IN ("4") AND abcode IN ("ab6","ab7") THEN "实验组-str+校准&ros-统计量"
  113. WHEN apptype IN ("4") AND abcode IN ("ab8","ab9") THEN "实验组-str+校准"
  114. WHEN apptype IN ("4") AND abcode IN ("ab2","ab3") THEN "对照组"
  115. WHEN apptype IN ("4") AND abcode IN ("ab4","ab5") THEN "ab4-5"
  116. ELSE "其他"
  117. END AS abcode
  118. ,page_type AS page
  119. ,mid
  120. ,vid
  121. ,is_share
  122. ,share_cnt
  123. ,is_return_1
  124. ,is_return_n
  125. ,is_return_noself
  126. ,return_1_uv
  127. ,return_n_uv
  128. ,return_n_uv_noself
  129. ,new_exposure_cnt
  130. ,flowpool
  131. ,scoresmap
  132. ,subsessionid
  133. ,CAST(GET_JSON_OBJECT(scoresmap, '$.fmRov') AS DOUBLE) AS str_pred
  134. ,1.22 * POW(CAST(GET_JSON_OBJECT(scoresmap, '$.NorXGBScore') AS DOUBLE), 1.15) AS rosn_pred
  135. ,CAST(GET_JSON_OBJECT(scoresmap, '$.hasReturnRovScore') AS DOUBLE) AS rosn_stat
  136. ,GET_JSON_OBJECT(v1_feature, '$.title') AS vid_title
  137. FROM t_filtered
  138. )
  139. -- =====================================================================
  140. -- 主查询: INSERT OVERWRITE → 多维度 CUBE 聚合
  141. -- =====================================================================
  142. -- SELECT * FROM loghubods.recommend_distribution_data_total WHERE dt = 20260204 ORDER BY 访问人数 DESC
  143. INSERT OVERWRITE TABLE loghubods.recommend_distribution_data_total PARTITION (dt = '${bizdate}')
  144. SELECT
  145. -- ==================== 维度列 ====================
  146. -- 用户拉活量分层
  147. CASE WHEN grouping(
  148. COALESCE(
  149. CASE
  150. WHEN e.type IS NULL OR e.type = 'R_0' THEN 'R0&新用户'
  151. WHEN e.type IN ('R_1','R_2_10','R_10_50') THEN 'R1-50'
  152. WHEN e.type IN ('R_50_100','R_100_180','R_180_330') THEN 'R_180_330'
  153. ELSE e.type
  154. END
  155. , '-')
  156. ) = 1
  157. THEN 'SUM'
  158. ELSE NVL(
  159. COALESCE(
  160. CASE
  161. WHEN e.type IS NULL OR e.type = 'R_0' THEN 'R0&新用户'
  162. WHEN e.type IN ('R_1','R_2_10','R_10_50') THEN 'R1-50'
  163. WHEN e.type IN ('R_50_100','R_100_180','R_180_330') THEN 'R_180_330'
  164. ELSE e.type
  165. END
  166. , '-')
  167. , 'SUM')
  168. END AS 用户拉活量分层
  169. -- 进入小时
  170. ,CASE WHEN grouping(
  171. COALESCE(
  172. CASE
  173. WHEN in_hour >= 1 AND in_hour <= 4 THEN '1-4'
  174. WHEN in_hour >= 5 AND in_hour <= 8 THEN '5-8'
  175. WHEN in_hour >= 9 AND in_hour <= 12 THEN '9-12'
  176. WHEN in_hour >= 13 AND in_hour <= 16 THEN '13-16'
  177. WHEN in_hour >= 17 AND in_hour <= 20 THEN '17-20'
  178. WHEN in_hour >= 21 AND in_hour <= 24 THEN '21-24'
  179. ELSE '-'
  180. END
  181. , '-')
  182. ) = 1
  183. THEN 'SUM'
  184. ELSE NVL(
  185. COALESCE(
  186. CASE
  187. WHEN in_hour >= 1 AND in_hour <= 4 THEN '1-4'
  188. WHEN in_hour >= 5 AND in_hour <= 8 THEN '5-8'
  189. WHEN in_hour >= 9 AND in_hour <= 12 THEN '9-12'
  190. WHEN in_hour >= 13 AND in_hour <= 16 THEN '13-16'
  191. WHEN in_hour >= 17 AND in_hour <= 20 THEN '17-20'
  192. WHEN in_hour >= 21 AND in_hour <= 24 THEN '21-24'
  193. ELSE '-'
  194. END
  195. , '-')
  196. , 'SUM')
  197. END AS 进入小时
  198. -- 进入场景
  199. ,CASE WHEN grouping(CASE WHEN f.scene_id IS NOT NULL THEN f.scene_desc ELSE '其他' END) = 1
  200. THEN 'SUM'
  201. ELSE NVL(CASE WHEN f.scene_id IS NOT NULL THEN f.scene_desc ELSE '其他' END, 'SUM')
  202. END AS 进入场景
  203. -- 进入内容品类
  204. ,CASE WHEN grouping(CASE WHEN m.merge_leve2 IS NOT NULL THEN d.merge_leve2 ELSE '其他' END) = 1
  205. THEN 'SUM'
  206. ELSE NVL(CASE WHEN m.merge_leve2 IS NOT NULL THEN d.merge_leve2 ELSE '其他' END, 'SUM')
  207. END AS 进入内容品类
  208. -- 推荐内容品类
  209. ,CASE WHEN grouping(CASE WHEN l.merge_leve2 IS NOT NULL THEN h.merge_leve2 ELSE '其他' END) = 1
  210. THEN 'SUM'
  211. ELSE NVL(CASE WHEN l.merge_leve2 IS NOT NULL THEN h.merge_leve2 ELSE '其他' END, 'SUM')
  212. END AS 推荐内容品类
  213. -- 内容id
  214. ,CASE WHEN grouping(COALESCE(CASE WHEN i.merge_leve2 IS NOT NULL THEN i.videoid ELSE '其他' END, '-')) = 1
  215. THEN 'SUM'
  216. ELSE NVL(COALESCE(CASE WHEN i.merge_leve2 IS NOT NULL THEN i.videoid ELSE '其他' END, '-'), 'SUM')
  217. END AS 内容id
  218. -- ==================== 基础流量指标 ====================
  219. ,COUNT(DISTINCT a.machinecode) AS 访问人数
  220. ,COUNT(DISTINCT j.mid) AS 曝光人数
  221. ,COUNT(DISTINCT j.mid) / COUNT(DISTINCT a.machinecode) AS 曝光人数比访问人数
  222. ,COUNT(j.mid) AS 曝光次数
  223. ,COUNT(j.mid) / COUNT(DISTINCT j.mid) AS 曝光次数人均
  224. ,COUNT(DISTINCT j.vid) AS 视频个数
  225. ,COUNT(j.mid) / COUNT(DISTINCT j.vid) AS 单视频曝光次数
  226. -- ==================== 全链路拉回 ====================
  227. ,SUM(return_n_uv_noself) + SUM(cc.cn) + SUM(dd.dn) AS ALL拉回n层
  228. ,ROUND(COALESCE(
  229. (SUM(return_n_uv_noself) + SUM(cc.cn) + SUM(dd.dn)) / COUNT(j.mid)
  230. , 0), 6) AS ALL拉回n层比曝光次数
  231. -- ==================== 原生拉回 & rovn ====================
  232. ,SUM(return_n_uv_noself) AS return_n_uv_noself
  233. ,ROUND(SUM(return_n_uv_noself) / COUNT(j.mid), 6) AS rovn
  234. ,ROUND(AVG(str_pred * rosn_pred), 6) AS rovn_pred
  235. ,ROUND(
  236. (SUM(return_n_uv_noself) / COUNT(j.mid))
  237. / NULLIF(AVG(str_pred * rosn_pred), 0)
  238. , 4) AS rovn_copc
  239. -- ==================== STR 指标 ====================
  240. ,ROUND(COALESCE(SUM(is_return_noself) / COUNT(j.mid), 0), 6) AS str_real
  241. ,ROUND(COALESCE(SUM(str_pred) / COUNT(j.mid), 0), 6) AS str_pred
  242. ,ROUND(
  243. (SUM(is_return_noself) / COUNT(j.mid))
  244. / NULLIF(SUM(str_pred) / COUNT(j.mid), 0)
  245. , 4) AS str_copc
  246. ,ROUND(AVG(ABS(str_pred - is_return_noself)), 6) AS str_MAE
  247. ,ROUND(VARIANCE(str_pred - is_return_noself), 6) AS STR_VAR
  248. ,0 AS strauc
  249. ,0 AS str多维加权copc
  250. -- ==================== ROSN 指标 ====================
  251. ,ROUND(COALESCE(
  252. SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0)
  253. , 0), 6) AS rosn_real
  254. ,ROUND(COALESCE(SUM(rosn_pred) / COUNT(j.mid), 0), 6) AS rosn_pred
  255. ,ROUND(
  256. (SUM(return_n_uv_noself) / NULLIF(SUM(is_return_noself), 0))
  257. / NULLIF(SUM(rosn_pred) / COUNT(1), 0)
  258. , 4) AS rosn_copc
  259. ,ROUND(AVG(
  260. CASE WHEN is_return_noself = 1
  261. THEN ABS(rosn_pred - return_n_uv_noself)
  262. END
  263. ), 6) AS rosn_实际预估diff绝对距离
  264. ,ROUND(VARIANCE(
  265. CASE WHEN is_return_noself = 1
  266. THEN rosn_pred - return_n_uv_noself
  267. END
  268. ), 6) AS rosn_实际预估高低估分布比例
  269. -- ==================== B链: 分享→点击 ====================
  270. ,SUM(is_return_noself) AS Buv1层拉回人数
  271. ,ROUND(SUM(is_return_noself) / COUNT(j.mid), 6) AS rov0
  272. ,0 AS rov0预估
  273. ,0 AS rov0copc
  274. ,ROUND(COALESCE(
  275. SUM(is_return_noself) / NULLIF(SUM(share_cnt), 0)
  276. , 0), 6) AS ros0
  277. ,0 AS ros0预估
  278. ,0 AS ros0copc
  279. ,SUM(return_n_uv_noself) AS Bnuvn层拉回人数
  280. ,ROUND(COALESCE(SUM(return_n_uv_noself) / SUM(is_return_noself), 0), 6) AS rorn_b
  281. ,ROUND(COALESCE(SUM(k.b1) / SUM(is_return_noself), 0), 6) AS ror1单层_b
  282. ,ROUND(COALESCE(SUM(k.b2) / SUM(k.b1), 0), 6) AS ror2单层_b
  283. -- ==================== B链 T+1 ====================
  284. ,ROUND(COALESCE(SUM(k1.bn), 0), 6) AS Bnuvn层拉回人数_t1
  285. ,ROUND(COALESCE(SUM(k1.bn) / SUM(k1.b), 0), 6) AS rorn_b_t1
  286. ,ROUND(COALESCE(SUM(k1.b1) / SUM(k1.b), 0), 6) AS ror1单层_b_t1
  287. ,ROUND(COALESCE(SUM(k1.b2) / SUM(k1.b1), 0), 6) AS ror2单层_b_t1
  288. -- ==================== C链: 二次分享 ====================
  289. ,SUM(cc.cn) AS cn
  290. ,SUM(cc.cn) / SUM(return_n_uv_noself) AS rorn_c
  291. ,SUM(cc.c) / SUM(return_n_uv_noself) AS ror1_c
  292. ,SUM(cc.cviews) / SUM(return_n_uv_noself) AS vor单层_c
  293. ,0 AS vor单层预估
  294. ,0 AS vor单层copc
  295. -- ==================== D链: session内后续曝光传播 ====================
  296. ,COUNT(DISTINCT b.machinecode) AS a
  297. ,SUM(dd.dn) AS dn
  298. ,ROUND(COALESCE(SUM(dd.dn) / COUNT(DISTINCT b.machinecode), 0), 6) AS rorn_d
  299. ,ROUND(COALESCE(SUM(dd.d1) / COUNT(DISTINCT b.machinecode), 0), 6) AS ror1_d
  300. ,COUNT(g.mid) / COUNT(DISTINCT b.machinecode) AS vor_d
  301. -- =====================================================================
  302. -- FROM + JOINs
  303. -- =====================================================================
  304. FROM (
  305. -- [a] 用户活跃日志 + 进入小时
  306. SELECT *
  307. ,TO_CHAR(FROM_UNIXTIME(a.clienttimestamp / 1000), 'HH') + 1 AS in_hour
  308. FROM loghubods.useractive_log a
  309. WHERE a.dt = '${bizdate}'
  310. ) a
  311. -- [b] 分享点击日志 (topic=click)
  312. LEFT JOIN (
  313. SELECT DISTINCT
  314. machinecode
  315. ,sessionid
  316. ,subsessionid
  317. ,pagesource
  318. ,topic
  319. ,shareid
  320. ,clickobjectid
  321. ,shareobjectid
  322. ,clientip
  323. ,CASE
  324. WHEN topic = 'click' THEN clickobjectid
  325. WHEN topic = 'share' THEN shareobjectid
  326. END AS videoid
  327. ,sharetitleid
  328. ,sharetitle
  329. ,shareimageurl
  330. ,rootsharemid
  331. ,clienttimestamp
  332. ,FROM_UNIXTIME(clienttimestamp / 1000) AS click_time
  333. ,GET_JSON_OBJECT(videocdm.Ip2RegionAnalyse(clientip), "$.province") AS click_province
  334. ,GET_JSON_OBJECT(videocdm.Ip2RegionAnalyse(clientip), "$.city") AS click_city
  335. ,TO_CHAR(FROM_UNIXTIME(clienttimestamp / 1000), 'HH') + 1 AS click_hour
  336. FROM loghubods.user_share_log
  337. WHERE dt = '${bizdate}'
  338. AND topic = 'click'
  339. ) b
  340. ON a.machinecode = b.machinecode
  341. AND a.subsessionid = b.subsessionid
  342. -- [c] 视频元信息
  343. LEFT JOIN videoods.wx_video c
  344. ON b.videoid = c.id
  345. -- [d] 点击视频品类标签
  346. LEFT JOIN loghubods.video_merge_tag d
  347. ON b.videoid = d.videoid
  348. -- [e] 用户拉活量分层 (R0/R1-50/R_180_330/...)
  349. LEFT JOIN (
  350. SELECT DISTINCT type, openid
  351. FROM loghubods.mid_share_return_people_1year
  352. WHERE dt = '${bizdate-1}'
  353. AND type IS NOT NULL
  354. AND type != 'S_ALL'
  355. AND type NOT REGEXP 'R50'
  356. ) e
  357. ON SUBSTRING_INDEX(a.machinecode, 'weixin_openid_', -1) = e.openid
  358. -- [f] TOP5 进入场景
  359. LEFT JOIN (
  360. SELECT a.scene_id
  361. ,a.scene_desc
  362. ,COUNT(DISTINCT b.machinecode) AS cnt
  363. FROM loghubods.wechat_miniprogram_scene_simple a
  364. LEFT JOIN loghubods.useractive_log b
  365. ON a.scene_id = b.sencetype
  366. AND b.dt = '${bizdate}'
  367. GROUP BY a.scene_id, a.scene_desc
  368. ORDER BY cnt DESC
  369. LIMIT 5
  370. ) f
  371. ON a.sencetype = f.scene_id
  372. -- [g] 推荐页视频播放
  373. LEFT JOIN (
  374. SELECT DISTINCT mid, subsessionid, videoid
  375. FROM loghubods.video_action_log_rp
  376. WHERE dt = '${bizdate}'
  377. AND businesstype = 'videoView'
  378. AND pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
  379. ) g
  380. ON b.machinecode = g.mid
  381. AND b.subsessionid = g.subsessionid
  382. -- [h] 推荐视频品类标签
  383. LEFT JOIN loghubods.video_merge_tag h
  384. ON g.videoid = h.videoid
  385. -- [i] 热门视频 (按品类 TOP1, 曝光>10w)
  386. LEFT JOIN (
  387. SELECT merge_leve2, videoid
  388. FROM (
  389. SELECT b.merge_leve2
  390. ,a.videoid
  391. ,COUNT(1) AS view
  392. ,ROW_NUMBER() OVER (PARTITION BY b.merge_leve2 ORDER BY COUNT(1) DESC) AS rank
  393. FROM loghubods.video_action_log_rp a
  394. LEFT JOIN loghubods.video_merge_tag b
  395. ON a.videoid = b.videoid
  396. WHERE dt = '${bizdate}'
  397. AND businesstype = 'videoView'
  398. AND pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
  399. GROUP BY b.merge_leve2, a.videoid
  400. HAVING view > 100000
  401. )
  402. WHERE rank <= 1
  403. ) i
  404. ON g.videoid = i.videoid
  405. -- [j] 曝光样本 (t_base CTE)
  406. LEFT JOIN t_base j
  407. ON g.mid = j.mid
  408. AND g.subsessionid = j.subsessionid
  409. AND g.videoid = j.vid
  410. -- [k] B链: 当天分享→点击裂变 (sharedepth 1~4)
  411. LEFT JOIN (
  412. SELECT a.machinecode AS mid
  413. ,a.subsessionid
  414. ,a.videoid AS vid
  415. ,COUNT(DISTINCT CASE WHEN a.machinecode <> b.machinecode THEN b.machinecode END) AS bn
  416. ,COUNT(DISTINCT CASE WHEN b.sharedepth = 1 AND a.machinecode <> b.machinecode THEN b.machinecode END) AS b
  417. ,COUNT(DISTINCT CASE WHEN b.sharedepth = 2 AND a.machinecode <> b.machinecode THEN b.machinecode END) AS b1
  418. ,COUNT(DISTINCT CASE WHEN b.sharedepth = 3 AND a.machinecode <> b.machinecode THEN b.machinecode END) AS b2
  419. ,COUNT(DISTINCT CASE WHEN b.sharedepth = 4 AND a.machinecode <> b.machinecode THEN b.machinecode END) AS b3
  420. FROM (
  421. SELECT DISTINCT machinecode, shareobjectid AS videoid, recomTraceId,
  422. subsessionid, sharedepth, shareid
  423. FROM loghubods.user_share_log
  424. WHERE dt = '${bizdate}'
  425. AND topic = 'share'
  426. AND pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
  427. ) a
  428. LEFT JOIN (
  429. SELECT DISTINCT machinecode, clickobjectid, recomTraceId,
  430. subsessionid, sharedepth, rootshareid
  431. FROM loghubods.user_share_log
  432. WHERE dt = '${bizdate}'
  433. AND topic = 'click'
  434. ) b
  435. ON a.shareid = b.rootshareid
  436. GROUP BY a.machinecode, a.subsessionid, a.videoid
  437. ) k
  438. ON g.mid = k.mid
  439. AND g.subsessionid = k.subsessionid
  440. AND g.videoid = k.vid
  441. -- [k1] B链 T+1: 跨天分享→点击裂变
  442. LEFT JOIN (
  443. SELECT a.machinecode AS mid
  444. ,a.subsessionid
  445. ,a.videoid AS vid
  446. ,COUNT(DISTINCT CASE WHEN a.machinecode <> b.machinecode THEN b.machinecode END) AS bn
  447. ,COUNT(DISTINCT CASE WHEN b.sharedepth = 1 AND a.machinecode <> b.machinecode THEN b.machinecode END) AS b
  448. ,COUNT(DISTINCT CASE WHEN b.sharedepth = 2 AND a.machinecode <> b.machinecode THEN b.machinecode END) AS b1
  449. ,COUNT(DISTINCT CASE WHEN b.sharedepth = 3 AND a.machinecode <> b.machinecode THEN b.machinecode END) AS b2
  450. ,COUNT(DISTINCT CASE WHEN b.sharedepth = 4 AND a.machinecode <> b.machinecode THEN b.machinecode END) AS b3
  451. FROM (
  452. SELECT DISTINCT machinecode, shareobjectid AS videoid, recomTraceId,
  453. subsessionid, sharedepth, shareid
  454. FROM loghubods.user_share_log
  455. WHERE dt = '${bizdate}'
  456. AND topic = 'share'
  457. AND pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
  458. ) a
  459. LEFT JOIN (
  460. SELECT DISTINCT machinecode, clickobjectid, recomTraceId,
  461. subsessionid, sharedepth, rootshareid
  462. FROM loghubods.user_share_log
  463. WHERE dt >= '${bizdate}'
  464. AND dt <= '${bizdate+1}' -- T+1 跨天
  465. AND topic = 'click'
  466. ) b
  467. ON a.shareid = b.rootshareid
  468. GROUP BY a.machinecode, a.subsessionid, a.videoid
  469. ) k1
  470. ON g.mid = k1.mid
  471. AND g.subsessionid = k1.subsessionid
  472. AND g.videoid = k1.vid
  473. -- [l] TOP10 推荐品类 (按播放量)
  474. LEFT JOIN (
  475. SELECT b.merge_leve2
  476. ,COUNT(1) AS view
  477. FROM loghubods.video_action_log_rp a
  478. LEFT JOIN loghubods.video_merge_tag b
  479. ON a.videoid = b.videoid
  480. WHERE dt = '${bizdate}'
  481. AND businesstype = 'videoView'
  482. AND pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
  483. AND b.merge_leve2 IS NOT NULL
  484. GROUP BY b.merge_leve2
  485. ORDER BY view DESC
  486. LIMIT 10
  487. ) l
  488. ON h.merge_leve2 = l.merge_leve2
  489. -- [m] TOP1 点击品类 (按点击人数)
  490. LEFT JOIN (
  491. SELECT b.merge_leve2
  492. ,COUNT(DISTINCT machinecode) AS click
  493. FROM loghubods.user_share_log a
  494. LEFT JOIN loghubods.video_merge_tag b
  495. ON a.clickobjectid = b.videoid
  496. WHERE dt = '${bizdate}'
  497. AND topic = 'click'
  498. AND b.merge_leve2 IS NOT NULL
  499. GROUP BY b.merge_leve2
  500. ORDER BY click DESC
  501. LIMIT 1
  502. ) m
  503. ON d.merge_leve2 = m.merge_leve2
  504. -- [cc] C链: 二次分享 (分享→点击→再分享→再点击 + 视频播放)
  505. LEFT JOIN (
  506. SELECT a.machinecode AS mid
  507. ,a.subsessionid
  508. ,a.videoid AS vid
  509. ,COUNT(DISTINCT CASE WHEN b1.machinecode <> b2.machinecode THEN b2.machinecode END) AS cn
  510. ,COUNT(DISTINCT
  511. CASE WHEN b2.sharedepth = 1 AND b1.machinecode <> b2.machinecode
  512. THEN b2.machinecode
  513. END
  514. ) AS c
  515. ,COUNT(c.mid) AS cviews
  516. FROM (
  517. -- 一次分享
  518. SELECT DISTINCT machinecode, shareobjectid AS videoid, recomTraceId,
  519. subsessionid, sharedepth, shareid
  520. FROM loghubods.user_share_log
  521. WHERE dt = '${bizdate}'
  522. AND topic = 'share'
  523. AND pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
  524. ) a
  525. LEFT JOIN (
  526. -- 一次点击
  527. SELECT DISTINCT machinecode, clickobjectid, recomTraceId,
  528. subsessionid, sharedepth, rootshareid
  529. FROM loghubods.user_share_log
  530. WHERE dt = '${bizdate}'
  531. AND topic = 'click'
  532. ) b
  533. ON a.shareid = b.rootshareid
  534. LEFT JOIN (
  535. -- 二次分享 (点击者再分享)
  536. SELECT DISTINCT machinecode, shareobjectid, recomTraceId,
  537. subsessionid, sharedepth, shareid
  538. FROM loghubods.user_share_log
  539. WHERE dt = '${bizdate}'
  540. AND topic = 'share'
  541. AND pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
  542. ) b1
  543. ON b.machinecode = b1.machinecode
  544. AND b.subsessionid = b1.subsessionid
  545. LEFT JOIN (
  546. -- 二次点击
  547. SELECT DISTINCT machinecode, clickobjectid, recomTraceId,
  548. subsessionid, sharedepth, shareid, rootshareid
  549. FROM loghubods.user_share_log
  550. WHERE dt = '${bizdate}'
  551. AND topic = 'click'
  552. ) b2
  553. ON b1.shareid = b2.rootshareid
  554. LEFT JOIN (
  555. -- 二次点击者的视频播放
  556. SELECT DISTINCT mid, subsessionid, videoid
  557. FROM loghubods.video_action_log_rp
  558. WHERE dt = '${bizdate}'
  559. AND businesstype = 'videoView'
  560. AND pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
  561. ) c
  562. ON b2.machinecode = c.mid
  563. AND b2.subsessionid = c.subsessionid
  564. GROUP BY a.machinecode, a.subsessionid, a.videoid
  565. ) cc
  566. ON g.mid = cc.mid
  567. AND g.subsessionid = cc.subsessionid
  568. AND g.videoid = cc.vid
  569. -- [dd] D链: session内后续曝光传播 (窗口函数累计后续拉回)
  570. LEFT JOIN (
  571. SELECT *
  572. ,COALESCE(
  573. SUM(回流) OVER (PARTITION BY mid, subsessionid ORDER BY rn ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
  574. , 0) AS dn
  575. ,COALESCE(
  576. SUM(回流1) OVER (PARTITION BY mid, subsessionid ORDER BY rn ROWS BETWEEN 1 FOLLOWING AND UNBOUNDED FOLLOWING)
  577. , 0) AS d1
  578. FROM (
  579. SELECT a.mid AS mid
  580. ,a.subsessionid
  581. ,a.videoid AS vid
  582. ,COUNT(DISTINCT b.shareid) AS 分享次数
  583. ,COUNT(DISTINCT
  584. CASE WHEN c.machinecode <> b.machinecode
  585. THEN c.machinecode
  586. END
  587. ) AS 回流
  588. ,COUNT(DISTINCT
  589. CASE WHEN c.machinecode <> b.machinecode AND c.sharedepth = 1
  590. THEN c.machinecode
  591. END
  592. ) AS 回流1
  593. ,ROW_NUMBER() OVER (PARTITION BY a.subsessionid ORDER BY a.logtimestamp DESC) AS rn
  594. FROM (
  595. SELECT DISTINCT mid, subsessionid, videoid, logtimestamp
  596. FROM loghubods.video_action_log_rp
  597. WHERE dt = '${bizdate}'
  598. AND businesstype = 'videoView'
  599. AND pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
  600. ) a
  601. LEFT JOIN (
  602. SELECT DISTINCT machinecode, shareobjectid AS videoid, recomTraceId,
  603. subsessionid, sharedepth, shareid, clienttimestamp
  604. FROM loghubods.user_share_log
  605. WHERE dt = '${bizdate}'
  606. AND topic = 'share'
  607. AND pagesource REGEXP 'category$|recommend$|-pages/user-videos-detail$'
  608. ) b
  609. ON a.mid = b.machinecode
  610. AND a.subsessionid = b.subsessionid
  611. AND a.videoid = b.videoid
  612. LEFT JOIN (
  613. SELECT DISTINCT machinecode, clickobjectid, recomTraceId,
  614. subsessionid, sharedepth, rootshareid
  615. FROM loghubods.user_share_log
  616. WHERE dt = '${bizdate}'
  617. AND topic = 'click'
  618. ) c
  619. ON b.shareid = c.rootshareid
  620. GROUP BY a.mid, a.subsessionid, a.videoid, a.logtimestamp
  621. )
  622. ) dd
  623. ON g.mid = dd.mid
  624. AND g.subsessionid = dd.subsessionid
  625. AND g.videoid = dd.vid
  626. -- =====================================================================
  627. -- GROUP BY CUBE: 全维度组合聚合
  628. -- =====================================================================
  629. GROUP BY CUBE(
  630. -- 用户拉活量分层
  631. COALESCE(
  632. CASE
  633. WHEN e.type IS NULL OR e.type = 'R_0' THEN 'R0&新用户'
  634. WHEN e.type IN ('R_1','R_2_10','R_10_50') THEN 'R1-50'
  635. WHEN e.type IN ('R_50_100','R_100_180','R_180_330') THEN 'R_180_330'
  636. ELSE e.type
  637. END
  638. , '-')
  639. -- 进入小时
  640. ,COALESCE(
  641. CASE
  642. WHEN in_hour >= 1 AND in_hour <= 4 THEN '1-4'
  643. WHEN in_hour >= 5 AND in_hour <= 8 THEN '5-8'
  644. WHEN in_hour >= 9 AND in_hour <= 12 THEN '9-12'
  645. WHEN in_hour >= 13 AND in_hour <= 16 THEN '13-16'
  646. WHEN in_hour >= 17 AND in_hour <= 20 THEN '17-20'
  647. WHEN in_hour >= 21 AND in_hour <= 24 THEN '21-24'
  648. ELSE '-'
  649. END
  650. , '-')
  651. -- 进入场景
  652. ,CASE WHEN f.scene_id IS NOT NULL THEN f.scene_desc ELSE '其他' END
  653. -- 进入内容品类
  654. ,CASE WHEN m.merge_leve2 IS NOT NULL THEN d.merge_leve2 ELSE '其他' END
  655. -- 推荐内容品类
  656. ,CASE WHEN l.merge_leve2 IS NOT NULL THEN h.merge_leve2 ELSE '其他' END
  657. -- 内容id
  658. ,COALESCE(CASE WHEN i.merge_leve2 IS NOT NULL THEN i.videoid ELSE '其他' END, '-')
  659. )
  660. ORDER BY 访问人数 DESC
  661. ;