ソースを参照

feat(标题相关性分析): 新增三种回流率指标显示

- SQL 新增整体/头部/推荐回流率及 UV 字段
- 可视化各层级显示三种回流率(整体、头部、推荐)
- 渠道筛选时动态重算三种回流率

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 2 ヶ月 前
コミット
8d8f2bab5d

+ 68 - 59
tasks/头部/进入前的I与头部I的相关性分析_v2/visualize.py

@@ -64,6 +64,21 @@ if sim_col not in df.columns:
     print(f"缺少相似度列: {sim_col}")
     exit(1)
 
+# Helper for computing the three return rates (overall / head / recommended)
+def calc_return_rates(group, click):
+    """Sum the overall, head and recommended return-UV columns of `group`, divide each by `click`, and return a dict of *_uv and *_rate entries."""
+    overall_uv = group['整体回流uv'].sum() if '整体回流uv' in group.columns else 0  # a missing column counts as 0
+    head_uv = group['头部回流uv'].sum() if '头部回流uv' in group.columns else 0
+    rec_uv = group['推荐回流uv'].sum() if '推荐回流uv' in group.columns else 0
+    return {
+        'overall_uv': int(overall_uv),
+        'overall_rate': round(float(overall_uv / click) if click > 0 else 0, 4),  # rate is 0 when click is not positive; rounded to 4 decimals
+        'head_uv': int(head_uv),
+        'head_rate': round(float(head_uv / click) if click > 0 else 0, 4),
+        'rec_uv': int(rec_uv),
+        'rec_rate': round(float(rec_uv / click) if click > 0 else 0, 4),
+    }
+
 # ========== 视角1:分享标题 → 视频 ==========
 def build_share_title_view(input_df, min_video_titles=2, top_n=1000):
     """
@@ -102,20 +117,17 @@ def build_share_title_view(input_df, min_video_titles=2, top_n=1000):
                 channels_data = []
                 for channel, ch_group in vid_group.groupby('channel'):
                     click = int(ch_group['点击uv'].sum())
-                    return_uv = ch_group['原视频回流uv'].sum() if '原视频回流uv' in ch_group.columns else 0
-                    return_rate = return_uv / click if click > 0 else 0
-                        # 记录该渠道涉及的日期
+                    rates = calc_return_rates(ch_group, click)
                     ch_dates = ch_group['dt'].unique().tolist() if 'dt' in ch_group.columns else []
                     channels_data.append({
                         'channel': str(channel) if pd.notna(channel) else '未知',
                         'click': click,
-                        'return_uv': int(return_uv),
-                        'return_rate': round(float(return_rate), 4),
-                        'dates': ch_dates
+                        'dates': ch_dates,
+                        **rates
                     })
 
                 vid_click = int(vid_group['点击uv'].sum())
-                vid_return = vid_group['原视频回流uv'].sum() if '原视频回流uv' in vid_group.columns else 0
+                vid_rates = calc_return_rates(vid_group, vid_click)
                 # 获取二级品类(取第一条)
                 category = ''
                 if 'merge二级品类' in vid_group.columns:
@@ -125,15 +137,14 @@ def build_share_title_view(input_df, min_video_titles=2, top_n=1000):
                     'videoid': str(int(videoid)) if pd.notna(videoid) else '',
                     'category': category,
                     'total_click': vid_click,
-                    'return_uv': int(vid_return),
-                    'return_rate': round(float(vid_return / vid_click) if vid_click > 0 else 0, 4),
+                    **vid_rates,
                     'channels': sorted(channels_data, key=lambda x: x['click'], reverse=True)
                 })
 
             # 计算该视频标题的相似度(取平均)和回流率
             sim = vt_group[sim_col].mean()
             vt_click = int(vt_group['点击uv'].sum())
-            vt_return = vt_group['原视频回流uv'].sum() if '原视频回流uv' in vt_group.columns else 0
+            vt_rates = calc_return_rates(vt_group, vt_click)
 
             # 统计品类分布(每个品类有多少个不同视频ID)
             categories_dist = []
@@ -149,20 +160,18 @@ def build_share_title_view(input_df, min_video_titles=2, top_n=1000):
                 'sim': round(float(sim), 2),
                 'categories': categories_dist,
                 'total_click': vt_click,
-                'return_uv': int(vt_return),
-                'return_rate': round(float(vt_return / vt_click) if vt_click > 0 else 0, 4),
+                **vt_rates,
                 'videos': sorted(videos_data, key=lambda x: x['total_click'], reverse=True)
             })
 
         st_click = int(title_data['点击uv'].sum())
-        st_return = title_data['原视频回流uv'].sum() if '原视频回流uv' in title_data.columns else 0
+        st_rates = calc_return_rates(title_data, st_click)
 
         result.append({
             'share_title': str(share_title)[:80] if pd.notna(share_title) else '',
             'cover': cover,
             'total_click': st_click,
-            'return_uv': int(st_return),
-            'return_rate': round(float(st_return / st_click) if st_click > 0 else 0, 4),
+            **st_rates,
             'video_titles': sorted(video_titles_data, key=lambda x: x['sim'], reverse=True)
         })
 
@@ -209,29 +218,26 @@ def build_video_view(input_df, min_share_titles=2, top_n=1000):
             channels_data = []
             for channel, ch_group in st_group.groupby('channel'):
                 click = int(ch_group['点击uv'].sum())
-                return_uv = ch_group['原视频回流uv'].sum() if '原视频回流uv' in ch_group.columns else 0
-                return_rate = return_uv / click if click > 0 else 0
+                ch_rates = calc_return_rates(ch_group, click)
                 channels_data.append({
                     'channel': str(channel) if pd.notna(channel) else '未知',
                     'click': click,
-                    'return_uv': int(return_uv),
-                    'return_rate': round(float(return_rate), 4)
+                    **ch_rates
                 })
 
             st_click = int(st_group['点击uv'].sum())
-            st_return = st_group['原视频回流uv'].sum() if '原视频回流uv' in st_group.columns else 0
+            st_rates = calc_return_rates(st_group, st_click)
             share_titles_data.append({
                 'share_title': str(share_title)[:80] if pd.notna(share_title) else '',
                 'cover': cover,
                 'sim': round(float(sim), 2),
                 'total_click': st_click,
-                'return_uv': int(st_return),
-                'return_rate': round(float(st_return / st_click) if st_click > 0 else 0, 4),
+                **st_rates,
                 'channels': sorted(channels_data, key=lambda x: x['click'], reverse=True)
             })
 
         v_click = int(video_data['点击uv'].sum())
-        v_return = video_data['原视频回流uv'].sum() if '原视频回流uv' in video_data.columns else 0
+        v_rates = calc_return_rates(video_data, v_click)
         # 获取二级品类(取第一条)
         category = ''
         if 'merge二级品类' in video_data.columns:
@@ -242,8 +248,7 @@ def build_video_view(input_df, min_share_titles=2, top_n=1000):
             'video_title': str(video_title)[:60] if pd.notna(video_title) else '',
             'category': category,
             'total_click': v_click,
-            'return_uv': int(v_return),
-            'return_rate': round(float(v_return / v_click) if v_click > 0 else 0, 4),
+            **v_rates,
             'share_titles': sorted(share_titles_data, key=lambda x: x['sim'], reverse=True)
         })
 
@@ -472,6 +477,17 @@ html_content = f"""<!DOCTYPE html>
         }};
     }}
 
+    // Recompute the three return UVs and rates of item by summing over its children
+    function recalcRates(item, children, clickKey = 'click') {{  // NOTE(review): clickKey is never read in this body
+        item.overall_uv = children.reduce((s, c) => s + (c.overall_uv || 0), 0);  // a missing or falsy child value adds 0
+        item.head_uv = children.reduce((s, c) => s + (c.head_uv || 0), 0);
+        item.rec_uv = children.reduce((s, c) => s + (c.rec_uv || 0), 0);
+        const click = item.total_click || 0;  // denominator is item.total_click as it stands when this function is called
+        item.overall_rate = click > 0 ? item.overall_uv / click : 0;  // rate is 0 when click is not positive
+        item.head_rate = click > 0 ? item.head_uv / click : 0;
+        item.rec_rate = click > 0 ? item.rec_uv / click : 0;
+    }}
+
     // 按渠道过滤数据并重新计算统计(递归)
     function filterByChannel(data, channel) {{
         if (channel === 'all') return JSON.parse(JSON.stringify(data));
@@ -488,20 +504,17 @@ html_content = f"""<!DOCTYPE html>
                         v.channels = v.channels.filter(ch => ch.channel === channel);
                         // 重算视频层统计
                         v.total_click = v.channels.reduce((s, c) => s + c.click, 0);
-                        v.return_uv = v.channels.reduce((s, c) => s + c.return_uv, 0);
-                        v.return_rate = v.total_click > 0 ? v.return_uv / v.total_click : 0;
+                        recalcRates(v, v.channels);
                         return v;
                     }}).filter(v => v.channels.length > 0);
                     // 重算视频标题层统计
                     vt.total_click = vt.videos.reduce((s, v) => s + v.total_click, 0);
-                    vt.return_uv = vt.videos.reduce((s, v) => s + v.return_uv, 0);
-                    vt.return_rate = vt.total_click > 0 ? vt.return_uv / vt.total_click : 0;
+                    recalcRates(vt, vt.videos);
                     return vt;
                 }}).filter(vt => vt.videos.length > 0);
                 // 重算分享标题层统计
                 st.total_click = st.video_titles.reduce((s, vt) => s + vt.total_click, 0);
-                st.return_uv = st.video_titles.reduce((s, vt) => s + vt.return_uv, 0);
-                st.return_rate = st.total_click > 0 ? st.return_uv / st.total_click : 0;
+                recalcRates(st, st.video_titles);
                 return st;
             }}).filter(st => st.video_titles.length > 0);
             // 按点击重新排序
@@ -517,14 +530,12 @@ html_content = f"""<!DOCTYPE html>
                     st.channels = st.channels.filter(ch => ch.channel === channel);
                     // 重算分享标题层统计
                     st.total_click = st.channels.reduce((s, c) => s + c.click, 0);
-                    st.return_uv = st.channels.reduce((s, c) => s + c.return_uv, 0);
-                    st.return_rate = st.total_click > 0 ? st.return_uv / st.total_click : 0;
+                    recalcRates(st, st.channels);
                     return st;
                 }}).filter(st => st.channels.length > 0);
                 // 重算视频层统计
                 v.total_click = v.share_titles.reduce((s, st) => s + st.total_click, 0);
-                v.return_uv = v.share_titles.reduce((s, st) => s + st.return_uv, 0);
-                v.return_rate = v.total_click > 0 ? v.return_uv / v.total_click : 0;
+                recalcRates(v, v.share_titles);
                 return v;
             }}).filter(v => v.share_titles.length > 0);
             // 按点击重新排序
@@ -545,24 +556,14 @@ html_content = f"""<!DOCTYPE html>
         let filteredShare = JSON.parse(JSON.stringify(shareData));
         let filteredVideo = JSON.parse(JSON.stringify(videoData));
 
-        // 视角1:筛选包含该品类视频的分享标题
+        // 视角1:筛选包含该品类视频的分享标题(但保留所有内容不过滤)
         if (filteredShare.share_titles) {{
-            filteredShare.share_titles = filteredShare.share_titles.map(st => {{
-                st.video_titles = st.video_titles.map(vt => {{
-                    // 筛选该品类的视频
-                    vt.videos = vt.videos.filter(v => v.category === category);
-                    // 重算视频标题层统计
-                    vt.total_click = vt.videos.reduce((s, v) => s + v.total_click, 0);
-                    vt.return_uv = vt.videos.reduce((s, v) => s + v.return_uv, 0);
-                    vt.return_rate = vt.total_click > 0 ? vt.return_uv / vt.total_click : 0;
-                    return vt;
-                }}).filter(vt => vt.videos.length > 0);
-                // 重算分享标题层统计
-                st.total_click = st.video_titles.reduce((s, vt) => s + vt.total_click, 0);
-                st.return_uv = st.video_titles.reduce((s, vt) => s + vt.return_uv, 0);
-                st.return_rate = st.total_click > 0 ? st.return_uv / st.total_click : 0;
-                return st;
-            }}).filter(st => st.video_titles.length > 0);
+            filteredShare.share_titles = filteredShare.share_titles.filter(st => {{
+                // 检查是否包含该品类的视频
+                return st.video_titles.some(vt =>
+                    vt.videos.some(v => v.category === category)
+                );
+            }});
             filteredShare.share_titles.sort((a, b) => b.total_click - a.total_click);
             filteredShare.count = filteredShare.share_titles.length;
         }}
@@ -727,6 +728,15 @@ html_content = f"""<!DOCTYPE html>
         return rate >= q66 ? 'good' : (rate <= q33 ? 'bad' : '');
     }}
 
+    // Render the three return rates (overall, head, recommended) of item
+    function renderRates(item) {{  // returns an HTML string with one span per rate, each shown as a percentage with one decimal
+        return `
+            <span class="rate" style="${{rateGradient(item.overall_rate)}}">整体 ${{(item.overall_rate * 100).toFixed(1)}}%</span>
+            <span class="rate" style="${{rateGradient(item.head_rate)}}">头部 ${{(item.head_rate * 100).toFixed(1)}}%</span>
+            <span class="rate" style="${{rateGradient(item.rec_rate)}}">推荐 ${{(item.rec_rate * 100).toFixed(1)}}%</span>
+        `;
+    }}
+
     // 渲染渠道明细
     function renderChannels(channels) {{
         return channels.map(ch => `
@@ -734,8 +744,7 @@ html_content = f"""<!DOCTYPE html>
                 <span class="channel-name">${{ch.channel}}</span>
                 <div class="channel-metrics">
                     <span>点击 ${{ch.click.toLocaleString()}}</span>
-                    <span>回流 ${{ch.return_uv.toLocaleString()}}</span>
-                    <span class="rate" style="${{rateGradient(ch.return_rate)}}">回流率 ${{(ch.return_rate * 100).toFixed(1)}}%</span>
+                    ${{renderRates(ch)}}
                 </div>
             </div>
         `).join('');
@@ -761,7 +770,7 @@ html_content = f"""<!DOCTYPE html>
                         <div class="item-meta">
                             <span class="count">·${{st.video_titles.length}}</span>
                             <span>点击 ${{st.total_click.toLocaleString()}}</span>
-                            <span class="rate" style="${{rateGradient(st.return_rate)}}">回流率 ${{(st.return_rate * 100).toFixed(1)}}%</span>
+                            ${{renderRates(st)}}
                         </div>
                     </div>
                     <div class="item-children">
@@ -778,7 +787,7 @@ html_content = f"""<!DOCTYPE html>
                                             <span class="count">·${{vt.videos.length}}</span>
                                             <span class="sim ${{simClass(vt.sim)}}">相似度 ${{vt.sim.toFixed(2)}}</span>
                                             <span>点击 ${{vt.total_click.toLocaleString()}}</span>
-                                            <span class="rate" style="${{rateGradient(vt.return_rate)}}">回流率 ${{(vt.return_rate * 100).toFixed(1)}}%</span>
+                                            ${{renderRates(vt)}}
                                         </div>
                                     </div>
                                     <div class="item-children">
@@ -794,7 +803,7 @@ html_content = f"""<!DOCTYPE html>
                                                         <div class="item-meta">
                                                             <span class="count">·${{v.channels.length}}</span>
                                                             <span>点击 ${{v.total_click.toLocaleString()}}</span>
-                                                            <span class="rate" style="${{rateGradient(v.return_rate)}}">回流率 ${{(v.return_rate * 100).toFixed(1)}}%</span>
+                                                            ${{renderRates(v)}}
                                                         </div>
                                                     </div>
                                                     <div class="item-children">
@@ -837,7 +846,7 @@ html_content = f"""<!DOCTYPE html>
                         <div class="item-meta">
                             <span class="count">·${{v.share_titles.length}}</span>
                             <span>点击 ${{v.total_click.toLocaleString()}}</span>
-                            <span class="rate" style="${{rateGradient(v.return_rate)}}">回流率 ${{(v.return_rate * 100).toFixed(1)}}%</span>
+                            ${{renderRates(v)}}
                         </div>
                     </div>
                     <div class="item-children">
@@ -852,7 +861,7 @@ html_content = f"""<!DOCTYPE html>
                                             <span class="count">·${{st.channels.length}}</span>
                                             <span class="sim ${{simClass(st.sim)}}">相似度 ${{st.sim.toFixed(2)}}</span>
                                             <span>点击 ${{st.total_click.toLocaleString()}}</span>
-                                            <span class="rate" style="${{rateGradient(st.return_rate)}}">回流率 ${{(st.return_rate * 100).toFixed(1)}}%</span>
+                                            ${{renderRates(st)}}
                                         </div>
                                     </div>
                                     <div class="item-children">

+ 16 - 3
tasks/头部/进入前的I与头部I的相关性分析_v2/标题相关性分析.sql

@@ -17,13 +17,26 @@ SELECT  dt
         ,title
         ,merge一级品类
         ,merge二级品类
-        -- 核心指标(只关注原视频回流)
+        -- 核心指标
         ,COUNT(DISTINCT mid) AS 点击uv
+        -- 整体回流(原视频+推荐)
+        ,(SUM(CASE WHEN 再分享群聊回流uv > 0 THEN 再分享群聊回流uv ELSE 0 END)
+          + SUM(CASE WHEN 再分享单聊回流uv > 0 THEN 再分享单聊回流uv ELSE 0 END)
+         ) / (COUNT(DISTINCT mid) + 10) AS 整体回流率
+        ,SUM(CASE WHEN 再分享群聊回流uv > 0 THEN 再分享群聊回流uv ELSE 0 END)
+         + SUM(CASE WHEN 再分享单聊回流uv > 0 THEN 再分享单聊回流uv ELSE 0 END) AS 整体回流uv
+        -- 头部回流(原视频)
         ,(SUM(CASE WHEN 是否原视频 = '是' THEN 再分享群聊回流uv ELSE 0 END)
           + SUM(CASE WHEN 是否原视频 = '是' THEN 再分享单聊回流uv ELSE 0 END)
-         ) / (COUNT(DISTINCT mid) + 10) AS 原视频回流率
+         ) / (COUNT(DISTINCT mid) + 10) AS 头部回流率
         ,SUM(CASE WHEN 是否原视频 = '是' THEN 再分享群聊回流uv ELSE 0 END)
-         + SUM(CASE WHEN 是否原视频 = '是' THEN 再分享单聊回流uv ELSE 0 END) AS 原视频回流uv
+         + SUM(CASE WHEN 是否原视频 = '是' THEN 再分享单聊回流uv ELSE 0 END) AS 头部回流uv
+        -- 推荐回流(非原视频)
+        ,(SUM(CASE WHEN 是否原视频 = '否' THEN 再分享群聊回流uv ELSE 0 END)
+          + SUM(CASE WHEN 是否原视频 = '否' THEN 再分享单聊回流uv ELSE 0 END)
+         ) / (COUNT(DISTINCT mid) + 10) AS 推荐回流率
+        ,SUM(CASE WHEN 是否原视频 = '否' THEN 再分享群聊回流uv ELSE 0 END)
+         + SUM(CASE WHEN 是否原视频 = '否' THEN 再分享单聊回流uv ELSE 0 END) AS 推荐回流uv
 FROM    loghubods.opengid_base_data
 WHERE   dt = '${dt}'
 AND     usersharedepth = 0