Explorar o código

feat(品类命中分析): 增加明细查询,输出用户历史品类和频次

- query_detail.sql: 输出每条记录的用户历史品类列表和频次
- 格式: 品类列表逗号分隔,频次用 品类:次数|品类:次数

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui hai 2 meses
pai
achega
7a61a8860c

+ 40 - 4
tasks/品类命中分析/query.sql

@@ -35,16 +35,31 @@ WITH user_cate1_exploded AS (
     WHERE json_obj IS NOT NULL AND json_obj != ''
 )
 
--- Step 2: 用户一级品类去重列表
+-- Step 2: 用户一级品类去重列表(含频次)
 ,user_cate1_list AS (
     SELECT
         mid,
-        collect_set(history_cat1) AS history_cat1_list
+        collect_set(history_cat1) AS history_cat1_list,
+        concat_ws(',', collect_set(history_cat1)) AS history_cat1_str
     FROM user_cate1_exploded
     WHERE history_cat1 IS NOT NULL
     GROUP BY mid
 )
 
+-- Step 2.1: 用户一级品类频次
+,user_cate1_freq AS (
+    SELECT
+        mid,
+        concat_ws('|', collect_list(concat(history_cat1, ':', cast(cnt as string)))) AS history_cat1_freq
+    FROM (
+        SELECT mid, history_cat1, count(*) AS cnt
+        FROM user_cate1_exploded
+        WHERE history_cat1 IS NOT NULL
+        GROUP BY mid, history_cat1
+    ) t
+    GROUP BY mid
+)
+
 -- Step 3: 解析用户二级品类历史
 ,user_cate2_exploded AS (
     SELECT
@@ -77,16 +92,31 @@ WITH user_cate1_exploded AS (
     WHERE json_obj IS NOT NULL AND json_obj != ''
 )
 
--- Step 4: 用户二级品类去重列表
+-- Step 4: 用户二级品类去重列表(含频次)
 ,user_cate2_list AS (
     SELECT
         mid,
-        collect_set(history_cat2) AS history_cat2_list
+        collect_set(history_cat2) AS history_cat2_list,
+        concat_ws(',', collect_set(history_cat2)) AS history_cat2_str
     FROM user_cate2_exploded
     WHERE history_cat2 IS NOT NULL
     GROUP BY mid
 )
 
+-- Step 4.1: 用户二级品类频次
+,user_cate2_freq AS (
+    SELECT
+        mid,
+        concat_ws('|', collect_list(concat(history_cat2, ':', cast(cnt as string)))) AS history_cat2_freq
+    FROM (
+        SELECT mid, history_cat2, count(*) AS cnt
+        FROM user_cate2_exploded
+        WHERE history_cat2 IS NOT NULL
+        GROUP BY mid, history_cat2
+    ) t
+    GROUP BY mid
+)
+
 -- Step 5: 基础数据
 ,base_data AS (
     SELECT
@@ -117,7 +147,11 @@ WITH user_cate1_exploded AS (
         a.再分享单聊回流uv,
         a.是否原视频,
         b.history_cat1_list,
+        b.history_cat1_str,
         c.history_cat2_list,
+        c.history_cat2_str,
+        d.history_cat1_freq,
+        e.history_cat2_freq,
         -- 一级品类命中情况
         CASE
             WHEN b.history_cat1_list IS NULL THEN '无历史'
@@ -133,6 +167,8 @@ WITH user_cate1_exploded AS (
     FROM base_data a
     LEFT JOIN user_cate1_list b ON a.mid = b.mid
     LEFT JOIN user_cate2_list c ON a.mid = c.mid
+    LEFT JOIN user_cate1_freq d ON a.mid = d.mid
+    LEFT JOIN user_cate2_freq e ON a.mid = e.mid
 )
 
 -- Step 7: 输出明细(保留中间结果)

+ 83 - 0
tasks/品类命中分析/query_debug.sql

@@ -0,0 +1,83 @@
+-- 品类命中分析 - Debug版本 (category-hit analysis, debug variant)
+-- 输出中间结果,方便检查命中逻辑 (exposes intermediate results for inspecting the hit logic)
+
+-- Step 1: 解析用户一级品类历史 -- parse user level-1 category history from the c1_s JSON feature
+WITH user_cate1_exploded AS (
+    SELECT
+        mid,
+        get_json_object(json_obj, '$.na') AS history_cat1
+    FROM (
+        SELECT
+            mid,
+            json_piece AS json_obj
+        FROM loghubods.alg_recsys_feature_user_share_return_stat
+        LATERAL VIEW explode(
+            split(
+                regexp_replace(
+                    regexp_replace(
+                        regexp_replace(
+                            get_json_object(feature, '$.c1_s'),
+                            '\\\\\"', '\"'
+                        ),
+                        '^\\[|\\]$', ''
+                    ),
+                    '\\},\\{', '}|{'
+                ),
+                '\\|'
+            )
+        ) t AS json_piece
+        WHERE dt = '${end}'
+        AND get_json_object(feature, '$.c1_s') IS NOT NULL
+        AND get_json_object(feature, '$.c1_s') != '[]'
+    ) exploded
+    WHERE json_obj IS NOT NULL AND json_obj != ''
+)
+
+-- Step 2: 用户一级品类去重列表(转为字符串方便查看) -- dedup list, plus string form for readability
+,user_cate1_list AS (
+    SELECT
+        mid,
+        collect_set(history_cat1) AS history_cat1_array,
+        concat_ws(',', collect_set(history_cat1)) AS history_cat1_str
+    FROM user_cate1_exploded
+    WHERE history_cat1 IS NOT NULL
+    GROUP BY mid
+)
+
+-- Step 3: 基础数据(只取样本) -- base share records (sample only)
+,base_data AS (
+    SELECT
+        dt,
+        channel,
+        mid,
+        再分享merge一级品类 AS 再分享一级品类,
+        再分享merge二级品类 AS 再分享二级品类,
+        再分享群聊回流uv,
+        再分享单聊回流uv
+    FROM loghubods.opengid_base_data
+    WHERE dt = '${end}'  -- quoted to match Step 1's dt = '${end}'; unquoted ${end} compares the partition key against a number
+    AND usersharedepth = 0
+    AND videoid IS NOT NULL
+    AND 再分享merge一级品类 IS NOT NULL
+)
+
+-- Step 4: Join 并输出详细信息 -- join history onto base records and emit per-row debug detail
+SELECT
+    a.dt,
+    a.channel,
+    a.mid,
+    a.再分享一级品类,
+    b.history_cat1_str AS 用户历史一级品类,
+    CASE
+        WHEN b.history_cat1_array IS NULL THEN '无历史'
+        WHEN array_contains(b.history_cat1_array, a.再分享一级品类) THEN '命中'
+        ELSE '未命中'
+    END AS 一级品类命中,
+    -- 调试信息 -- debug info: size of the user's distinct history category set
+    CASE WHEN b.history_cat1_array IS NULL THEN 0 ELSE size(b.history_cat1_array) END AS 历史品类数量,
+    a.再分享群聊回流uv + a.再分享单聊回流uv AS 裂变uv
+FROM base_data a
+LEFT JOIN user_cate1_list b ON a.mid = b.mid
+ORDER BY a.channel, a.mid
+LIMIT 1000
+;

+ 163 - 0
tasks/品类命中分析/query_detail.sql

@@ -0,0 +1,163 @@
+-- 品类命中分析 - 明细版本 (category-hit analysis, per-row detail variant)
+-- 输出每条记录的用户历史品类和频次,方便debug (per-record history categories + frequencies for debugging)
+
+-- Step 1: 解析用户一级品类历史 -- parse user level-1 category history from the c1_s JSON feature
+WITH user_cate1_exploded AS (
+    SELECT
+        mid,
+        get_json_object(json_obj, '$.na') AS history_cat1
+    FROM (
+        SELECT
+            mid,
+            json_piece AS json_obj
+        FROM loghubods.alg_recsys_feature_user_share_return_stat
+        LATERAL VIEW explode(
+            split(
+                regexp_replace(
+                    regexp_replace(
+                        regexp_replace(
+                            get_json_object(feature, '$.c1_s'),
+                            '\\\\\"', '\"'
+                        ),
+                        '^\\[|\\]$', ''
+                    ),
+                    '\\},\\{', '}|{'
+                ),
+                '\\|'
+            )
+        ) t AS json_piece
+        WHERE dt = '${end}'
+        AND get_json_object(feature, '$.c1_s') IS NOT NULL
+        AND get_json_object(feature, '$.c1_s') != '[]'
+    ) exploded
+    WHERE json_obj IS NOT NULL AND json_obj != ''
+)
+
+-- Step 2: 用户一级品类列表和频次 -- distinct level-1 categories per user, plus a display string
+,user_cate1_agg AS (
+    SELECT
+        mid,
+        collect_set(history_cat1) AS history_cat1_list,
+        concat_ws(',', collect_set(history_cat1)) AS 用户历史一级品类
+    FROM user_cate1_exploded
+    WHERE history_cat1 IS NOT NULL
+    GROUP BY mid
+)
+
+-- 频次字符串,格式 品类:次数|品类:次数 -- frequency string, format cat:count|cat:count
+,user_cate1_freq AS (
+    SELECT
+        mid,
+        concat_ws('|', collect_list(cat_freq)) AS 用户历史一级品类频次
+    FROM (
+        SELECT mid, concat(history_cat1, ':', cast(count(*) as string)) AS cat_freq
+        FROM user_cate1_exploded
+        WHERE history_cat1 IS NOT NULL
+        GROUP BY mid, history_cat1
+    ) t
+    GROUP BY mid
+)
+
+-- Step 3: 解析用户二级品类历史 -- parse user level-2 category history from the c2_s JSON feature
+,user_cate2_exploded AS (
+    SELECT
+        mid,
+        get_json_object(json_obj, '$.na') AS history_cat2
+    FROM (
+        SELECT
+            mid,
+            json_piece AS json_obj
+        FROM loghubods.alg_recsys_feature_user_share_return_stat
+        LATERAL VIEW explode(
+            split(
+                regexp_replace(
+                    regexp_replace(
+                        regexp_replace(
+                            get_json_object(feature, '$.c2_s'),
+                            '\\\\\"', '\"'
+                        ),
+                        '^\\[|\\]$', ''
+                    ),
+                    '\\},\\{', '}|{'
+                ),
+                '\\|'
+            )
+        ) t AS json_piece
+        WHERE dt = '${end}'
+        AND get_json_object(feature, '$.c2_s') IS NOT NULL
+        AND get_json_object(feature, '$.c2_s') != '[]'
+    ) exploded
+    WHERE json_obj IS NOT NULL AND json_obj != ''
+)
+
+-- Step 4: 用户二级品类列表和频次 -- distinct level-2 categories per user, plus a display string
+,user_cate2_agg AS (
+    SELECT
+        mid,
+        collect_set(history_cat2) AS history_cat2_list,
+        concat_ws(',', collect_set(history_cat2)) AS 用户历史二级品类
+    FROM user_cate2_exploded
+    WHERE history_cat2 IS NOT NULL
+    GROUP BY mid
+)
+
+-- 频次字符串,格式 品类:次数|品类:次数 -- frequency string, format cat:count|cat:count
+,user_cate2_freq AS (
+    SELECT
+        mid,
+        concat_ws('|', collect_list(cat_freq)) AS 用户历史二级品类频次
+    FROM (
+        SELECT mid, concat(history_cat2, ':', cast(count(*) as string)) AS cat_freq
+        FROM user_cate2_exploded
+        WHERE history_cat2 IS NOT NULL
+        GROUP BY mid, history_cat2
+    ) t
+    GROUP BY mid
+)
+
+-- Step 5: 基础数据 -- base share records over the [start, end] date range
+,base_data AS (
+    SELECT
+        dt,
+        channel,
+        mid,
+        再分享merge一级品类 AS 再分享一级品类,
+        再分享merge二级品类 AS 再分享二级品类,
+        再分享群聊回流uv,
+        再分享单聊回流uv,
+        是否原视频
+    FROM loghubods.opengid_base_data
+    WHERE dt >= '${start}'  -- quoted to match Steps 1/3 (dt = '${end}'); unquoted compares the partition key against a number
+    AND dt <= '${end}'
+    AND usersharedepth = 0
+    AND videoid IS NOT NULL
+)
+
+-- Step 6: 输出明细(含用户历史品类) -- per-row detail output incl. user history categories & frequencies
+SELECT
+    a.dt,
+    a.channel,
+    a.mid,
+    a.再分享一级品类,
+    a.再分享二级品类,
+    b.用户历史一级品类,
+    c.用户历史一级品类频次,
+    d.用户历史二级品类,
+    e.用户历史二级品类频次,
+    CASE
+        WHEN b.history_cat1_list IS NULL THEN '无历史'
+        WHEN array_contains(b.history_cat1_list, a.再分享一级品类) THEN '命中'
+        ELSE '未命中'
+    END AS 一级品类命中,
+    CASE
+        WHEN d.history_cat2_list IS NULL THEN '无历史'
+        WHEN array_contains(d.history_cat2_list, a.再分享二级品类) THEN '命中'
+        ELSE '未命中'
+    END AS 二级品类命中,
+    a.再分享群聊回流uv + a.再分享单聊回流uv AS 裂变uv,
+    a.是否原视频
+FROM base_data a
+LEFT JOIN user_cate1_agg b ON a.mid = b.mid
+LEFT JOIN user_cate1_freq c ON a.mid = c.mid
+LEFT JOIN user_cate2_agg d ON a.mid = d.mid
+LEFT JOIN user_cate2_freq e ON a.mid = e.mid
+ORDER BY a.dt, a.channel, a.mid
+;