Explorar o código

feat(品类命中分析): 增加明细查询,输出用户历史品类和频次

- query_detail.sql: 输出每条记录的用户历史品类列表和频次
- 格式: 品类列表逗号分隔,频次用 品类:次数|品类:次数

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui hai 2 meses
pai
achega
7a61a8860c

+ 40 - 4
tasks/品类命中分析/query.sql

@@ -35,16 +35,31 @@ WITH user_cate1_exploded AS (
     WHERE json_obj IS NOT NULL AND json_obj != ''
 )
 
--- Step 2: 用户一级品类去重列表
+-- Step 2: 用户一级品类去重列表(含频次)
 ,user_cate1_list AS (
     SELECT
         mid,
-        collect_set(history_cat1) AS history_cat1_list
+        collect_set(history_cat1) AS history_cat1_list,
+        concat_ws(',', collect_set(history_cat1)) AS history_cat1_str
     FROM user_cate1_exploded
     WHERE history_cat1 IS NOT NULL
     GROUP BY mid
 )
 
+-- Step 2.1: 用户一级品类频次
+,user_cate1_freq AS (
+    SELECT
+        mid,
+        concat_ws('|', collect_list(concat(history_cat1, ':', cast(cnt as string)))) AS history_cat1_freq
+    FROM (
+        SELECT mid, history_cat1, count(*) AS cnt
+        FROM user_cate1_exploded
+        WHERE history_cat1 IS NOT NULL
+        GROUP BY mid, history_cat1
+    ) t
+    GROUP BY mid
+)
+
 -- Step 3: 解析用户二级品类历史
 ,user_cate2_exploded AS (
     SELECT
@@ -77,16 +92,31 @@ WITH user_cate1_exploded AS (
     WHERE json_obj IS NOT NULL AND json_obj != ''
 )
 
--- Step 4: 用户二级品类去重列表
+-- Step 4: 用户二级品类去重列表(含频次)
 ,user_cate2_list AS (
     SELECT
         mid,
-        collect_set(history_cat2) AS history_cat2_list
+        collect_set(history_cat2) AS history_cat2_list,
+        concat_ws(',', collect_set(history_cat2)) AS history_cat2_str
     FROM user_cate2_exploded
     WHERE history_cat2 IS NOT NULL
     GROUP BY mid
 )
 
+-- Step 4.1: 用户二级品类频次
+,user_cate2_freq AS (
+    SELECT
+        mid,
+        concat_ws('|', collect_list(concat(history_cat2, ':', cast(cnt as string)))) AS history_cat2_freq
+    FROM (
+        SELECT mid, history_cat2, count(*) AS cnt
+        FROM user_cate2_exploded
+        WHERE history_cat2 IS NOT NULL
+        GROUP BY mid, history_cat2
+    ) t
+    GROUP BY mid
+)
+
 -- Step 5: 基础数据
 ,base_data AS (
     SELECT
@@ -117,7 +147,11 @@ WITH user_cate1_exploded AS (
         a.再分享单聊回流uv,
         a.是否原视频,
         b.history_cat1_list,
+        b.history_cat1_str,
         c.history_cat2_list,
+        c.history_cat2_str,
+        d.history_cat1_freq,
+        e.history_cat2_freq,
         -- 一级品类命中情况
         CASE
             WHEN b.history_cat1_list IS NULL THEN '无历史'
@@ -133,6 +167,8 @@ WITH user_cate1_exploded AS (
     FROM base_data a
     LEFT JOIN user_cate1_list b ON a.mid = b.mid
     LEFT JOIN user_cate2_list c ON a.mid = c.mid
+    LEFT JOIN user_cate1_freq d ON a.mid = d.mid
+    LEFT JOIN user_cate2_freq e ON a.mid = e.mid
 )
 
 -- Step 7: 输出明细(保留中间结果)

+ 83 - 0
tasks/品类命中分析/query_debug.sql

@@ -0,0 +1,83 @@
+-- 品类命中分析 - Debug版本 (category-hit analysis, debug variant)
+-- 输出中间结果,方便检查命中逻辑 (exposes intermediate results for inspecting the hit logic)
+
+-- Step 1: 解析用户一级品类历史 -- parse user level-1 category history from the c1_s JSON feature
+WITH user_cate1_exploded AS (
+    SELECT
+        mid,
+        get_json_object(json_obj, '$.na') AS history_cat1
+    FROM (
+        SELECT
+            mid,
+            json_piece AS json_obj
+        FROM loghubods.alg_recsys_feature_user_share_return_stat
+        LATERAL VIEW explode(
+            split(
+                regexp_replace(
+                    regexp_replace(
+                        regexp_replace(
+                            get_json_object(feature, '$.c1_s'),
+                            '\\\\\"', '\"'
+                        ),
+                        '^\\[|\\]$', ''
+                    ),
+                    '\\},\\{', '}|{'
+                ),
+                '\\|'
+            )
+        ) t AS json_piece
+        WHERE dt = '${end}'
+        AND get_json_object(feature, '$.c1_s') IS NOT NULL
+        AND get_json_object(feature, '$.c1_s') != '[]'
+    ) exploded
+    WHERE json_obj IS NOT NULL AND json_obj != ''
+)
+
+-- Step 2: 用户一级品类去重列表(转为字符串方便查看) -- dedup list, plus string form for readability
+,user_cate1_list AS (
+    SELECT
+        mid,
+        collect_set(history_cat1) AS history_cat1_array,
+        concat_ws(',', collect_set(history_cat1)) AS history_cat1_str
+    FROM user_cate1_exploded
+    WHERE history_cat1 IS NOT NULL
+    GROUP BY mid
+)
+
+-- Step 3: 基础数据(只取样本) -- base share records (sample only)
+,base_data AS (
+    SELECT
+        dt,
+        channel,
+        mid,
+        再分享merge一级品类 AS 再分享一级品类,
+        再分享merge二级品类 AS 再分享二级品类,
+        再分享群聊回流uv,
+        再分享单聊回流uv
+    FROM loghubods.opengid_base_data
+    WHERE dt = '${end}'  -- quoted to match Step 1's dt = '${end}'; unquoted ${end} compares the partition key against a number
+    AND usersharedepth = 0
+    AND videoid IS NOT NULL
+    AND 再分享merge一级品类 IS NOT NULL
+)
+
+-- Step 4: Join 并输出详细信息 -- join history onto base records and emit per-row debug detail
+SELECT
+    a.dt,
+    a.channel,
+    a.mid,
+    a.再分享一级品类,
+    b.history_cat1_str AS 用户历史一级品类,
+    CASE
+        WHEN b.history_cat1_array IS NULL THEN '无历史'
+        WHEN array_contains(b.history_cat1_array, a.再分享一级品类) THEN '命中'
+        ELSE '未命中'
+    END AS 一级品类命中,
+    -- 调试信息 -- debug info: size of the user's distinct history category set
+    CASE WHEN b.history_cat1_array IS NULL THEN 0 ELSE size(b.history_cat1_array) END AS 历史品类数量,
+    a.再分享群聊回流uv + a.再分享单聊回流uv AS 裂变uv
+FROM base_data a
+LEFT JOIN user_cate1_list b ON a.mid = b.mid
+ORDER BY a.channel, a.mid
+LIMIT 1000
+;

+ 163 - 0
tasks/品类命中分析/query_detail.sql

@@ -0,0 +1,163 @@
+-- 品类命中分析 - 明细版本 (category-hit analysis, per-row detail variant)
+-- 输出每条记录的用户历史品类和频次,方便debug (per-record history categories + frequencies for debugging)
+
+-- Step 1: 解析用户一级品类历史 -- parse user level-1 category history from the c1_s JSON feature
+WITH user_cate1_exploded AS (
+    SELECT
+        mid,
+        get_json_object(json_obj, '$.na') AS history_cat1
+    FROM (
+        SELECT
+            mid,
+            json_piece AS json_obj
+        FROM loghubods.alg_recsys_feature_user_share_return_stat
+        LATERAL VIEW explode(
+            split(
+                regexp_replace(
+                    regexp_replace(
+                        regexp_replace(
+                            get_json_object(feature, '$.c1_s'),
+                            '\\\\\"', '\"'
+                        ),
+                        '^\\[|\\]$', ''
+                    ),
+                    '\\},\\{', '}|{'
+                ),
+                '\\|'
+            )
+        ) t AS json_piece
+        WHERE dt = '${end}'
+        AND get_json_object(feature, '$.c1_s') IS NOT NULL
+        AND get_json_object(feature, '$.c1_s') != '[]'
+    ) exploded
+    WHERE json_obj IS NOT NULL AND json_obj != ''
+)
+
+-- Step 2: 用户一级品类列表和频次 -- distinct level-1 categories per user, plus a display string
+,user_cate1_agg AS (
+    SELECT
+        mid,
+        collect_set(history_cat1) AS history_cat1_list,
+        concat_ws(',', collect_set(history_cat1)) AS 用户历史一级品类
+    FROM user_cate1_exploded
+    WHERE history_cat1 IS NOT NULL
+    GROUP BY mid
+)
+
+-- 频次字符串,格式 品类:次数|品类:次数 -- frequency string, format cat:count|cat:count
+,user_cate1_freq AS (
+    SELECT
+        mid,
+        concat_ws('|', collect_list(cat_freq)) AS 用户历史一级品类频次
+    FROM (
+        SELECT mid, concat(history_cat1, ':', cast(count(*) as string)) AS cat_freq
+        FROM user_cate1_exploded
+        WHERE history_cat1 IS NOT NULL
+        GROUP BY mid, history_cat1
+    ) t
+    GROUP BY mid
+)
+
+-- Step 3: 解析用户二级品类历史 -- parse user level-2 category history from the c2_s JSON feature
+,user_cate2_exploded AS (
+    SELECT
+        mid,
+        get_json_object(json_obj, '$.na') AS history_cat2
+    FROM (
+        SELECT
+            mid,
+            json_piece AS json_obj
+        FROM loghubods.alg_recsys_feature_user_share_return_stat
+        LATERAL VIEW explode(
+            split(
+                regexp_replace(
+                    regexp_replace(
+                        regexp_replace(
+                            get_json_object(feature, '$.c2_s'),
+                            '\\\\\"', '\"'
+                        ),
+                        '^\\[|\\]$', ''
+                    ),
+                    '\\},\\{', '}|{'
+                ),
+                '\\|'
+            )
+        ) t AS json_piece
+        WHERE dt = '${end}'
+        AND get_json_object(feature, '$.c2_s') IS NOT NULL
+        AND get_json_object(feature, '$.c2_s') != '[]'
+    ) exploded
+    WHERE json_obj IS NOT NULL AND json_obj != ''
+)
+
+-- Step 4: 用户二级品类列表和频次 -- distinct level-2 categories per user, plus a display string
+,user_cate2_agg AS (
+    SELECT
+        mid,
+        collect_set(history_cat2) AS history_cat2_list,
+        concat_ws(',', collect_set(history_cat2)) AS 用户历史二级品类
+    FROM user_cate2_exploded
+    WHERE history_cat2 IS NOT NULL
+    GROUP BY mid
+)
+
+-- 频次字符串,格式 品类:次数|品类:次数 -- frequency string, format cat:count|cat:count
+,user_cate2_freq AS (
+    SELECT
+        mid,
+        concat_ws('|', collect_list(cat_freq)) AS 用户历史二级品类频次
+    FROM (
+        SELECT mid, concat(history_cat2, ':', cast(count(*) as string)) AS cat_freq
+        FROM user_cate2_exploded
+        WHERE history_cat2 IS NOT NULL
+        GROUP BY mid, history_cat2
+    ) t
+    GROUP BY mid
+)
+
+-- Step 5: 基础数据 -- base share records over the [start, end] date range
+,base_data AS (
+    SELECT
+        dt,
+        channel,
+        mid,
+        再分享merge一级品类 AS 再分享一级品类,
+        再分享merge二级品类 AS 再分享二级品类,
+        再分享群聊回流uv,
+        再分享单聊回流uv,
+        是否原视频
+    FROM loghubods.opengid_base_data
+    WHERE dt >= '${start}'  -- quoted to match Steps 1/3 (dt = '${end}'); unquoted compares the partition key against a number
+    AND dt <= '${end}'
+    AND usersharedepth = 0
+    AND videoid IS NOT NULL
+)
+
+-- Step 6: 输出明细(含用户历史品类) -- per-row detail output incl. user history categories & frequencies
+SELECT
+    a.dt,
+    a.channel,
+    a.mid,
+    a.再分享一级品类,
+    a.再分享二级品类,
+    b.用户历史一级品类,
+    c.用户历史一级品类频次,
+    d.用户历史二级品类,
+    e.用户历史二级品类频次,
+    CASE
+        WHEN b.history_cat1_list IS NULL THEN '无历史'
+        WHEN array_contains(b.history_cat1_list, a.再分享一级品类) THEN '命中'
+        ELSE '未命中'
+    END AS 一级品类命中,
+    CASE
+        WHEN d.history_cat2_list IS NULL THEN '无历史'
+        WHEN array_contains(d.history_cat2_list, a.再分享二级品类) THEN '命中'
+        ELSE '未命中'
+    END AS 二级品类命中,
+    a.再分享群聊回流uv + a.再分享单聊回流uv AS 裂变uv,
+    a.是否原视频
+FROM base_data a
+LEFT JOIN user_cate1_agg b ON a.mid = b.mid
+LEFT JOIN user_cate1_freq c ON a.mid = c.mid
+LEFT JOIN user_cate2_agg d ON a.mid = d.mid
+LEFT JOIN user_cate2_freq e ON a.mid = e.mid
+ORDER BY a.dt, a.channel, a.mid
+;