소스 검색

feat(品类再分享分析): 新增头部品类×再分享品类分析任务

- 支持渠道选择、日期切换、品类级别(一级/二级)切换
- 支持多种指标:推荐裂变率、整体裂变率、头部裂变率、点击UV
- 矩阵支持点击排序(按行/列/总和)
- 渐变色用95分位数避免极端值影响

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 2 달 전
부모
커밋
0a29a0760c
2개의 변경된 파일과 506개의 추가작업 그리고 0개의 파일을 삭제
  1. 30 0
      tasks/品类再分享分析/query.sql
  2. 476 0
      tasks/品类再分享分析/visualize.py

+ 30 - 0
tasks/品类再分享分析/query.sql

@@ -0,0 +1,30 @@
+-- Category reshare analysis
+-- For each dt and channel, relates the head video's category to the category
+-- of the video that was reshared.
+-- ${start} / ${end} are placeholders; presumably the task runner substitutes
+-- them before execution (confirm the expected dt format and quoting).
+
+SELECT  dt
+        ,channel
+        ,merge一级品类 AS 头部一级品类
+        ,merge二级品类 AS 头部二级品类
+        ,再分享merge一级品类 AS 再分享一级品类
+        ,再分享merge二级品类 AS 再分享二级品类
+        ,COUNT(DISTINCT mid) AS 点击uv
+        -- Every rate below adds 10 to the click-UV denominator, which damps the
+        -- ratio for groups with very little traffic.
+        -- Overall rate: group-chat + single-chat return UV, positive values only.
+        ,(SUM(CASE WHEN 再分享群聊回流uv > 0 THEN 再分享群聊回流uv ELSE 0 END)
+          + SUM(CASE WHEN 再分享单聊回流uv > 0 THEN 再分享单聊回流uv ELSE 0 END)
+         ) / (COUNT(DISTINCT mid) + 10) AS 整体裂变率
+        -- Head rate: rows where 是否原视频 = '是'.
+        -- NOTE(review): unlike the overall rate there is no "> 0" guard here, so
+        -- the two only agree if the return-UV columns are never negative.
+        ,(SUM(CASE WHEN 是否原视频 = '是' THEN 再分享群聊回流uv ELSE 0 END)
+          + SUM(CASE WHEN 是否原视频 = '是' THEN 再分享单聊回流uv ELSE 0 END)
+         ) / (COUNT(DISTINCT mid) + 10) AS 头部裂变率
+        -- Recommended rate: rows where 是否原视频 = '否' (same caveat as above).
+        ,(SUM(CASE WHEN 是否原视频 = '否' THEN 再分享群聊回流uv ELSE 0 END)
+          + SUM(CASE WHEN 是否原视频 = '否' THEN 再分享单聊回流uv ELSE 0 END)
+         ) / (COUNT(DISTINCT mid) + 10) AS 推荐裂变率
+        -- Un-smoothed numerator of the overall rate.
+        ,SUM(CASE WHEN 再分享群聊回流uv > 0 THEN 再分享群聊回流uv ELSE 0 END)
+         + SUM(CASE WHEN 再分享单聊回流uv > 0 THEN 再分享单聊回流uv ELSE 0 END) AS 裂变uv
+FROM    loghubods.opengid_base_data
+WHERE   dt >= ${start}
+AND     dt <= ${end}
+-- Only rows at share depth 0 that have both a video id and a reshare category.
+AND     usersharedepth = 0
+AND     videoid IS NOT NULL
+AND     再分享merge一级品类 IS NOT NULL
+GROUP BY dt, channel, merge一级品类, merge二级品类, 再分享merge一级品类, 再分享merge二级品类
+ORDER BY dt, channel, 点击uv DESC
+;

+ 476 - 0
tasks/品类再分享分析/visualize.py

@@ -0,0 +1,476 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+品类再分享分析可视化
+分析头部视频品类 × 再分享视频品类的关系
+支持一级/二级品类切换,多种裂变率指标
+"""
+import pandas as pd
+import json
+from pathlib import Path
+
+# Query exports are read from ./output next to this script; the report is
+# written to the same directory.
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+
+# Pick the most recently modified raw export. A raw export is recognised by
+# having exactly one underscore in its file stem (presumably
+# <name>_<timestamp>.csv — confirm against the exporter).
+csv_files = [f for f in output_dir.glob("*.csv") if f.stem.count('_') == 1]
+if not csv_files:
+    print("没有找到数据文件,请先运行 query.sql")
+    # The bare `exit` helper is injected by the `site` module and is missing
+    # under `python -S` or in frozen builds; SystemExit always works.
+    raise SystemExit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+df = pd.read_csv(latest_file)
+
+print(f"分析文件: {latest_file.name}")
+print(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+
+# Date options: every distinct dt as a string, preceded by the 'all' roll-up.
+all_dates = sorted(str(d) for d in df['dt'].unique())
+date_options = ['all'] + all_dates
+# The newest date is preselected in the page; fall back to 'all' when empty.
+latest_date = all_dates[-1] if all_dates else 'all'
+print(f"日期数: {len(all_dates)}")
+
+# Channels ordered by total click UV, largest first.
+channel_uv = df.groupby('channel')['点击uv'].sum().sort_values(ascending=False)
+channel_list = channel_uv.index.tolist()
+print(f"渠道数: {len(channel_list)}")
+# Build the display label for a second-level category
+def get_cat2_label(cat1, cat2):
+    """Return "<cat1>/<cat2>", or just the first-level label when cat2 is blank.
+
+    A value counts as blank when it is NA or only whitespace. A blank cat1 is
+    shown as '未知'.
+    """
+    def _has_text(value):
+        return pd.notna(value) and bool(str(value).strip())
+
+    parent = str(cat1) if _has_text(cat1) else '未知'
+    return f"{parent}/{cat2}" if _has_text(cat2) else parent
+
+# Build the matrix payload for one channel x date x category level
+def calc_matrix_data(channel, date=None, level='cat1'):
+    """Build the head-category x reshare-category matrices for one channel.
+
+    Reads the module-level ``df`` loaded from the query export.
+
+    Args:
+        channel: Value matched against the ``channel`` column.
+        date: ``dt`` value to keep (compared as strings). ``None`` or
+            ``'all'`` keeps every date.
+        level: ``'cat1'`` groups by first-level category; any other value
+            groups by the "first/second" label from ``get_cat2_label``.
+
+    Returns:
+        ``None`` when no rows match. Otherwise a dict with ``rows`` and
+        ``cols`` (labels ordered by descending click UV), the nested
+        ``{row: {col: value}}`` dicts ``uv``, ``ror``, ``orig`` and ``rec``,
+        plus the selection-wide ``total_uv`` and ``total_ror``.
+    """
+    # Same smoothing constant that query.sql adds to every rate denominator.
+    smoothing = 10
+
+    # Filter first and copy once, so the column assignments below work on an
+    # independent frame instead of a slice of a copy.
+    selected = df['channel'] == channel
+    if date and date != 'all':
+        selected &= df['dt'].astype(str) == str(date)
+    ch_df = df[selected].copy()
+    if ch_df.empty:
+        return None
+
+    if level == 'cat1':
+        row_col, col_col = '头部一级品类', '再分享一级品类'
+        # groupby drops NaN keys, so missing categories get an explicit label.
+        ch_df[row_col] = ch_df[row_col].fillna('未知')
+        ch_df[col_col] = ch_df[col_col].fillna('未知')
+    else:
+        row_col, col_col = '头部二级标签', '再分享二级标签'
+        ch_df[row_col] = [
+            get_cat2_label(c1, c2)
+            for c1, c2 in zip(ch_df['头部一级品类'], ch_df['头部二级品类'])
+        ]
+        ch_df[col_col] = [
+            get_cat2_label(c1, c2)
+            for c1, c2 in zip(ch_df['再分享一级品类'], ch_df['再分享二级品类'])
+        ]
+
+    # query.sql stores each rate as numerator / (click_uv + 10), so the
+    # numerator is recovered by multiplying with that same denominator.
+    # Weighting by click_uv alone under-counts it, most visibly on low-UV rows.
+    row_denominator = ch_df['点击uv'] + smoothing
+    ch_df['_head_numerator'] = ch_df['头部裂变率'] * row_denominator
+    ch_df['_rec_numerator'] = ch_df['推荐裂变率'] * row_denominator
+
+    # One vectorised sum per group replaces the per-group Python callback.
+    matrix = ch_df.groupby([row_col, col_col], as_index=False)[
+        ['点击uv', '裂变uv', '_head_numerator', '_rec_numerator']
+    ].sum()
+    group_denominator = matrix['点击uv'] + smoothing
+    has_clicks = matrix['点击uv'] > 0
+    matrix['整体裂变率'] = matrix['裂变uv'] / group_denominator
+    matrix['头部裂变率'] = (matrix['_head_numerator'] / group_denominator).where(has_clicks, 0)
+    matrix['推荐裂变率'] = (matrix['_rec_numerator'] / group_denominator).where(has_clicks, 0)
+
+    def pivot_of(value_col):
+        return matrix.pivot(index=row_col, columns=col_col, values=value_col).fillna(0)
+
+    # Rows and columns are both ordered by total click UV, largest first.
+    uv_pivot = pivot_of('点击uv')
+    row_order = uv_pivot.sum(axis=1).sort_values(ascending=False).index.tolist()
+    col_order = uv_pivot.sum(axis=0).sort_values(ascending=False).index.tolist()
+
+    def to_dict(pivot, cast=float):
+        # Reindexing guarantees that every (row, col) pair exists, with 0 for
+        # combinations that never occurred, whichever cast is used.
+        aligned = pivot.reindex(index=row_order, columns=col_order, fill_value=0)
+        return {
+            str(r): {str(c): cast(v) for c, v in zip(col_order, values)}
+            for r, values in zip(row_order, aligned.to_numpy())
+        }
+
+    total_uv = ch_df['点击uv'].sum()
+    return {
+        # Labels are stringified so they always match the nested dict keys.
+        'rows': [str(r) for r in row_order],
+        'cols': [str(c) for c in col_order],
+        'uv': to_dict(uv_pivot, cast=int),
+        'ror': to_dict(pivot_of('整体裂变率')),
+        'orig': to_dict(pivot_of('头部裂变率')),
+        'rec': to_dict(pivot_of('推荐裂变率')),
+        'total_uv': int(total_uv),
+        'total_ror': float(ch_df['裂变uv'].sum() / (total_uv + smoothing)) if total_uv > 0 else 0,
+    }
+
+# Precompute one matrix per channel x date option x category level so the page
+# can switch views without recomputing anything.
+all_data = {}
+for ch in channel_list:
+    all_data[ch] = {'cat1': {}, 'cat2': {}}
+    for dt in date_options:
+        for level in ['cat1', 'cat2']:
+            matrix = calc_matrix_data(ch, dt, level)
+            if matrix:
+                all_data[ch][level][dt] = matrix
+
+# Only this section needs the HTML escaper, so it is imported here.
+from html import escape as _html_escape
+
+
+def _script_safe(json_text):
+    """Make serialised JSON safe to inline inside a <script> element."""
+    # A value containing "</script>" would otherwise close the script block
+    # early; "<\/" is an equivalent JSON/JavaScript spelling of "</".
+    return json_text.replace("</", "<\\/")
+
+
+# Serialise for embedding in the page.
+data_json = _script_safe(json.dumps(all_data, ensure_ascii=False))
+channel_list_json = _script_safe(json.dumps(channel_list, ensure_ascii=False))
+dates_json = _script_safe(json.dumps(date_options))
+
+# <option> markup for the date selector; the newest date is preselected.
+# Values are escaped so quotes or angle brackets in the data cannot break the
+# markup. The browser decodes them back to the original key.
+date_options_html = "".join([
+    f'<option value="{_html_escape(str(dt))}" {"selected" if dt == latest_date else ""}>'
+    f'{"汇总" if dt == "all" else _html_escape(str(dt))}</option>'
+    for dt in date_options
+])
+
+# <option> markup for the channel selector.
+channel_options_html = "".join([
+    f'<option value="{_html_escape(str(ch))}">{_html_escape(str(ch))}</option>'
+    for ch in channel_list
+])
+
+# Page template. This is an f-string, so literal CSS/JS braces are doubled
+# ({{ and }}); the only interpolated values are the option markup and the three
+# JSON payloads.
+html_content = f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>品类再分享分析</title>
+    <style>
+        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+               background: #f5f5f5; padding: 20px; }}
+        .container {{ max-width: 1600px; margin: 0 auto; background: white;
+                     border-radius: 8px; padding: 20px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
+        h1 {{ font-size: 24px; margin-bottom: 20px; color: #333; }}
+        .controls {{ display: flex; gap: 20px; margin-bottom: 20px; align-items: center; flex-wrap: wrap; }}
+        .control-group {{ display: flex; align-items: center; gap: 8px; }}
+        .control-group label {{ font-weight: 500; color: #666; }}
+        select {{ padding: 8px 12px; border: 1px solid #ddd; border-radius: 4px; font-size: 14px; min-width: 150px; }}
+        .summary {{ display: flex; gap: 20px; margin-bottom: 20px; }}
+        .stat-card {{ background: #f8f9fa; padding: 15px 20px; border-radius: 6px; text-align: center; }}
+        .stat-card h4 {{ font-size: 24px; color: #28a745; margin-bottom: 5px; }}
+        .stat-card p {{ font-size: 12px; color: #666; }}
+        .matrix-container {{ overflow-x: auto; max-height: 600px; overflow-y: auto; }}
+        table {{ border-collapse: collapse; font-size: 11px; }}
+        th, td {{ border: 1px solid #e0e0e0; padding: 4px 6px; text-align: center; white-space: nowrap; }}
+        th {{ background: #f5f5f5; font-weight: 600; position: sticky; top: 0; z-index: 1; }}
+        th:first-child {{ position: sticky; left: 0; z-index: 3; }}
+        td:first-child {{ background: #f5f5f5; font-weight: 500; position: sticky; left: 0; z-index: 1; }}
+        .legend {{ font-size: 12px; color: #666; margin-bottom: 10px; }}
+        .date-switcher {{ display: flex; align-items: center; gap: 5px; }}
+        .date-switcher button {{ padding: 5px 10px; border: 1px solid #ddd; background: white;
+                                cursor: pointer; border-radius: 3px; }}
+        .date-switcher button:hover {{ background: #f0f0f0; }}
+        #play-btn.playing {{ background: #28a745; color: white; }}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>品类再分享分析</h1>
+        <p style="margin-bottom:20px;color:#666;">分析头部视频品类与再分享视频品类的关系</p>
+
+        <div class="controls">
+            <div class="control-group">
+                <label>渠道:</label>
+                <select id="channel-select" onchange="updateMatrix()">
+                    {channel_options_html}
+                </select>
+            </div>
+            <div class="control-group">
+                <label>品类:</label>
+                <select id="level-select" onchange="updateMatrix()">
+                    <option value="cat1">一级品类</option>
+                    <option value="cat2">二级品类</option>
+                </select>
+            </div>
+            <div class="control-group">
+                <label>指标:</label>
+                <select id="metric-select" onchange="updateMatrix()">
+                    <option value="rec" selected>推荐裂变率</option>
+                    <option value="ror">整体裂变率</option>
+                    <option value="orig">头部裂变率</option>
+                    <option value="uv">点击UV</option>
+                </select>
+            </div>
+            <div class="control-group date-switcher">
+                <label>日期:</label>
+                <button onclick="switchDate(-1)">◀</button>
+                <select id="date-select" onchange="updateMatrix()">
+                    {date_options_html}
+                </select>
+                <button onclick="switchDate(1)">▶</button>
+                <button id="play-btn" onclick="togglePlay()">▶</button>
+            </div>
+        </div>
+
+        <div class="summary" id="summary">
+            <!-- 由JS填充 -->
+        </div>
+
+        <div class="legend">
+            行=头部视频品类,列=再分享视频品类 | 颜色越深=数值越高 | 点击表头排序
+            <button onclick="resetSort()" style="margin-left:15px;padding:3px 10px;cursor:pointer;">重置排序</button>
+        </div>
+
+        <div class="matrix-container">
+            <table id="matrix-table">
+                <thead id="matrix-header"></thead>
+                <tbody id="matrix-body"></tbody>
+            </table>
+        </div>
+    </div>
+
+    <script>
+    const allData = {data_json};
+    const channelList = {channel_list_json};
+    const dates = {dates_json};
+    let playInterval = null;
+    let currentRowOrder = null;
+    let currentColOrder = null;
+    let sortState = {{ row: null, col: null, asc: true }};
+    let lastChannel = null;
+    let lastLevel = null;
+
+    // Escape text for use in HTML content and quoted attribute values
+    function esc(text) {{
+        const map = {{ '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' }};
+        return String(text).replace(/[&<>"']/g, ch => map[ch]);
+    }}
+
+    // Keep the remembered order, then append keys it does not know yet
+    function mergeOrder(order, available) {{
+        const kept = order.filter(k => available.includes(k));
+        return kept.concat(available.filter(k => !order.includes(k)));
+    }}
+
+    function getGradient(val, maxVal, minVal = 0) {{
+        if (val <= minVal || maxVal <= minVal) return '#f8f9fa';
+        const ratio = Math.min((val - minVal) / (maxVal - minVal), 1);
+        const r = Math.round(255 - ratio * 215);
+        const g = Math.round(255 - ratio * 88);
+        const b = Math.round(255 - ratio * 186);
+        return `rgb(${{r}},${{g}},${{b}})`;
+    }}
+
+    function updateMatrix() {{
+        const channel = document.getElementById('channel-select').value;
+        const level = document.getElementById('level-select').value;
+        const metric = document.getElementById('metric-select').value;
+        const date = document.getElementById('date-select').value;
+
+        if (!allData[channel] || !allData[channel][level] || !allData[channel][level][date]) {{
+            document.getElementById('summary').innerHTML = '<div class="stat-card"><h4>无数据</h4><p>该渠道/日期无数据</p></div>';
+            document.getElementById('matrix-header').innerHTML = '';
+            document.getElementById('matrix-body').innerHTML = '';
+            return;
+        }}
+
+        const data = allData[channel][level][date];
+        const levelLabel = level === 'cat1' ? '一级' : '二级';
+
+        // 更新汇总
+        document.getElementById('summary').innerHTML = `
+            <div class="stat-card">
+                <h4>${{data.total_uv.toLocaleString()}}</h4>
+                <p>总点击UV</p>
+            </div>
+            <div class="stat-card">
+                <h4>${{data.total_ror.toFixed(4)}}</h4>
+                <p>整体裂变率</p>
+            </div>
+            <div class="stat-card">
+                <h4>${{data.rows.length}}</h4>
+                <p>头部${{levelLabel}}品类数</p>
+            </div>
+            <div class="stat-card">
+                <h4>${{data.cols.length}}</h4>
+                <p>再分享${{levelLabel}}品类数</p>
+            </div>
+        `;
+
+        // 收集所有非零值,计算 95 分位数作为渐变上限
+        const metricData = data[metric];
+        const allVals = [];
+        data.rows.forEach(r => {{
+            data.cols.forEach(c => {{
+                const val = metricData[r]?.[c] || 0;
+                if (val > 0) allVals.push(val);
+            }});
+        }});
+        allVals.sort((a, b) => a - b);
+
+        // 用 95 分位数作为上限,避免极端值影响
+        const p95Idx = Math.floor(allVals.length * 0.95);
+        let maxVal = allVals.length > 0 ? allVals[Math.min(p95Idx, allVals.length - 1)] : 0;
+
+        // 设置最小阈值
+        const thresholds = {{ uv: 1000, ror: 0.3, orig: 0.1, rec: 0.2 }};
+        maxVal = Math.max(maxVal, thresholds[metric] || 0.3);
+
+        // 渠道或品类级别变化时重置排序
+        if (channel !== lastChannel || level !== lastLevel) {{
+            currentRowOrder = null;
+            currentColOrder = null;
+            sortState = {{ row: null, col: null, asc: true }};
+            lastChannel = channel;
+            lastLevel = level;
+        }}
+
+        // 初始化排序顺序
+        if (!currentRowOrder) currentRowOrder = [...data.rows];
+        if (!currentColOrder) currentColOrder = [...data.cols];
+
+        // 使用当前排序后的顺序
+        // Categories that exist only in the newly selected date are appended,
+        // so switching dates never hides rows or columns.
+        const rows = mergeOrder(currentRowOrder, data.rows);
+        const cols = mergeOrder(currentColOrder, data.cols);
+
+        // 生成表头(可点击排序)
+        const metricLabels = {{ uv: '点击UV', ror: '整体裂变率', orig: '头部裂变率', rec: '推荐裂变率' }};
+        document.getElementById('matrix-header').innerHTML = `
+            <tr>
+                <th style="cursor:pointer" onclick="sortByRowSum()">头部品类 ↕</th>
+                ${{cols.map(c => `<th style="cursor:pointer" onclick="sortByCol(${{esc(JSON.stringify(c))}})">${{esc(c)}}</th>`).join('')}}
+            </tr>
+        `;
+
+        // 生成数据行(行头可点击排序)
+        document.getElementById('matrix-body').innerHTML = rows.map(r => {{
+            const cells = cols.map(c => {{
+                const val = metricData[r]?.[c] || 0;
+                const bg = getGradient(val, maxVal);
+                const display = metric === 'uv' ? parseInt(val).toLocaleString() : val.toFixed(4);
+                return `<td style="background:${{bg}}">${{display}}</td>`;
+            }}).join('');
+            return `<tr><td style="cursor:pointer;background:#f5f5f5" onclick="sortByRow(${{esc(JSON.stringify(r))}})">${{esc(r)}}</td>${{cells}}</tr>`;
+        }}).join('');
+    }}
+
+    function switchDate(delta) {{
+        const select = document.getElementById('date-select');
+        const idx = dates.indexOf(select.value);
+        const newIdx = idx + delta;
+        if (newIdx >= 0 && newIdx < dates.length) {{
+            select.value = dates[newIdx];
+            updateMatrix();
+        }}
+    }}
+
+    function togglePlay() {{
+        const btn = document.getElementById('play-btn');
+        if (playInterval) {{
+            clearInterval(playInterval);
+            playInterval = null;
+            btn.classList.remove('playing');
+            btn.textContent = '▶';
+        }} else {{
+            btn.classList.add('playing');
+            btn.textContent = '⏸';
+            let idx = 0;
+            const play = () => {{
+                if (idx >= dates.length) {{
+                    clearInterval(playInterval);
+                    playInterval = null;
+                    btn.classList.remove('playing');
+                    btn.textContent = '▶';
+                    return;
+                }}
+                document.getElementById('date-select').value = dates[idx];
+                updateMatrix();
+                idx++;
+            }};
+            play();
+            playInterval = setInterval(play, 1500);
+        }}
+    }}
+
+    // 排序函数
+    function getCurrentData() {{
+        const channel = document.getElementById('channel-select').value;
+        const level = document.getElementById('level-select').value;
+        const date = document.getElementById('date-select').value;
+        const metric = document.getElementById('metric-select').value;
+        if (!allData[channel] || !allData[channel][level] || !allData[channel][level][date]) return null;
+        return {{ data: allData[channel][level][date], metric }};
+    }}
+
+    function sortByRowSum() {{
+        const result = getCurrentData();
+        if (!result) return;
+        const {{ data, metric }} = result;
+        const metricData = data[metric];
+
+        // 计算每行的总和
+        const rowSums = {{}};
+        data.rows.forEach(r => {{
+            rowSums[r] = data.cols.reduce((sum, c) => sum + (metricData[r]?.[c] || 0), 0);
+        }});
+
+        // 切换排序方向
+        sortState.asc = sortState.row === 'sum' ? !sortState.asc : false;
+        sortState.row = 'sum';
+        sortState.col = null;
+
+        // 排序
+        currentRowOrder = [...data.rows].sort((a, b) => {{
+            return sortState.asc ? rowSums[a] - rowSums[b] : rowSums[b] - rowSums[a];
+        }});
+
+        updateMatrix();
+    }}
+
+    function sortByCol(colName) {{
+        const result = getCurrentData();
+        if (!result) return;
+        const {{ data, metric }} = result;
+        const metricData = data[metric];
+
+        // 切换排序方向
+        sortState.asc = sortState.col === colName ? !sortState.asc : false;
+        sortState.col = colName;
+        sortState.row = null;
+
+        // 按该列的值排序行
+        currentRowOrder = [...data.rows].sort((a, b) => {{
+            const va = metricData[a]?.[colName] || 0;
+            const vb = metricData[b]?.[colName] || 0;
+            return sortState.asc ? va - vb : vb - va;
+        }});
+
+        updateMatrix();
+    }}
+
+    function sortByRow(rowName) {{
+        const result = getCurrentData();
+        if (!result) return;
+        const {{ data, metric }} = result;
+        const metricData = data[metric];
+
+        // 切换排序方向
+        sortState.asc = sortState.row === rowName ? !sortState.asc : false;
+        sortState.row = rowName;
+        sortState.col = null;
+
+        // 按该行的值排序列
+        currentColOrder = [...data.cols].sort((a, b) => {{
+            const va = metricData[rowName]?.[a] || 0;
+            const vb = metricData[rowName]?.[b] || 0;
+            return sortState.asc ? va - vb : vb - va;
+        }});
+
+        updateMatrix();
+    }}
+
+    function resetSort() {{
+        currentRowOrder = null;
+        currentColOrder = null;
+        sortState = {{ row: null, col: null, asc: true }};
+        updateMatrix();
+    }}
+
+    // 初始化
+    updateMatrix();
+    </script>
+</body>
+</html>
+"""
+
+# Write the report into the output directory, reusing the CSV's stem in the
+# file name.
+html_file = output_dir / f"{latest_file.stem}_品类再分享.html"
+html_file.write_text(html_content, encoding='utf-8')
+
+print(f"\nHTML 报告已生成: {html_file}")