فهرست منبع

feat(标题相关性分析): 新增 v2 分层下钻可视化

- 两个视角:分享标题→视频、视频→分享标题
- 四层下钻结构:标题→视频标题→视频ID→渠道
- 支持多维筛选:日期、渠道、品类、最小点击、Top数量
- 动态计算统计数据,渐变显示回流率
- 封面懒加载,序号标识

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 2 ماه پیش
والد
کامیت
b07036597d

+ 96 - 0
tasks/头部/进入前的I与头部I的相关性分析_v2/analyze.py

@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+标题相关性分析(v2 简化版)
+批量处理多天数据,计算相似度
+"""
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+from lib.text_embedding_api import compare_phrases_batch
+
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+data_dir = output_dir / "标题相关性分析"
+
+# 找到所有需要处理的数据文件
+if data_dir.exists():
+    csv_files = sorted(data_dir.glob("*.csv"))
+else:
+    csv_files = sorted(output_dir.glob("*.csv"))
+    csv_files = [f for f in csv_files if '_含相似度' not in f.name]
+
+if not csv_files:
+    print("没有找到数据文件")
+    exit(1)
+
+print(f"找到 {len(csv_files)} 个数据文件")
+
+BATCH_SIZE = 500
+
+def calc_similarity(df):
+    """计算标题相似度"""
+    similarity_configs = [
+        ('文章标题', 'title', '文章标题_视频标题_相似度'),
+        ('分享标题', 'title', '分享标题_视频标题_相似度'),
+    ]
+
+    for col1, col2, result_col in similarity_configs:
+        if result_col in df.columns and df[result_col].notna().any():
+            # 已有相似度数据,跳过
+            continue
+
+        df[result_col] = np.nan
+
+        pairs = []
+        valid_indices = []
+
+        for idx, row in df.iterrows():
+            text1 = str(row[col1]) if pd.notna(row[col1]) and row[col1] != '' else ''
+            text2 = str(row[col2]) if pd.notna(row[col2]) and row[col2] != '' else ''
+
+            if text1 and text2:
+                pairs.append((text1, text2))
+                valid_indices.append(idx)
+
+        if not pairs:
+            continue
+
+        print(f"  计算 {result_col}: {len(pairs)} 对")
+
+        scores = []
+        for i in range(0, len(pairs), BATCH_SIZE):
+            batch = pairs[i:i+BATCH_SIZE]
+            results = compare_phrases_batch(batch)
+            scores.extend([r['相似度'] for r in results])
+            if (i + BATCH_SIZE) % 10000 == 0:
+                print(f"    已处理 {min(i+BATCH_SIZE, len(pairs))}/{len(pairs)}")
+
+        for idx, score in zip(valid_indices, scores):
+            df.at[idx, result_col] = score
+
+    return df
+
+
+# 处理每个文件
+for csv_file in csv_files:
+    dt = csv_file.stem  # 日期如 20260107
+    output_file = output_dir / f"{dt}_含相似度.csv"
+
+    # 检查是否已处理
+    if output_file.exists():
+        print(f"[{dt}] 已存在,跳过")
+        continue
+
+    print(f"[{dt}] 处理中...")
+    df = pd.read_csv(csv_file)
+    print(f"  记录数: {len(df):,}")
+
+    df = calc_similarity(df)
+
+    df.to_csv(output_file, index=False)
+    print(f"  已保存: {output_file.name}")
+
+print("\n全部完成!")

+ 885 - 0
tasks/头部/进入前的I与头部I的相关性分析_v2/visualize.py

@@ -0,0 +1,885 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+标题相关性分析可视化(v2 分层下钻版)
+两个视角:
+1. 分享标题 → 视频标题 → 视频ID → 渠道
+2. 视频 → 分享标题 → 视频ID → 渠道
+支持多天数据和日期筛选
+"""
+import pandas as pd
+import numpy as np
+import json
+from pathlib import Path
+
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+
+# 读取所有含相似度的数据文件
+sim_files = sorted(output_dir.glob("*_含相似度.csv"))
+if not sim_files:
+    print("没有找到数据文件,请先运行 analyze.py")
+    exit(1)
+
+print(f"找到 {len(sim_files)} 个数据文件")
+
+# 合并所有数据(跳过空文件)
+dfs = []
+dates = []
+for f in sim_files:
+    dt = f.stem.replace('_含相似度', '')
+    df_tmp = pd.read_csv(f)
+    if len(df_tmp) == 0:
+        continue
+    dates.append(dt)
+    df_tmp['dt'] = dt
+    dfs.append(df_tmp)
+    print(f"  {dt}: {len(df_tmp):,} 条")
+
+df = pd.concat(dfs, ignore_index=True)
+print(f"合并后总记录数: {len(df):,}")
+
+# 日期列表(用于前端筛选)
+dates_json = json.dumps(sorted(dates), ensure_ascii=False)
+
+# 渠道列表(按数量排序)
+channel_counts = df['channel'].value_counts()
+channels = channel_counts.index.tolist()
+channels_json = json.dumps(channels, ensure_ascii=False)
+print(f"渠道数: {len(channels)}")
+
+# 品类列表(按数量排序)
+if 'merge二级品类' in df.columns:
+    category_counts = df['merge二级品类'].value_counts()
+    categories = [c for c in category_counts.index.tolist() if pd.notna(c) and c]
+    categories_json = json.dumps(categories, ensure_ascii=False)
+    print(f"品类数: {len(categories)}")
+else:
+    categories = []
+    categories_json = '[]'
+
+# 检查必要列
+sim_col = '分享标题_视频标题_相似度'
+if sim_col not in df.columns:
+    print(f"缺少相似度列: {sim_col}")
+    exit(1)
+
+# ========== 视角1:分享标题 → 视频 ==========
+def build_share_title_view(input_df, min_video_titles=2, top_n=1000):
+    """
+    构建正向视角数据:分享标题 → 视频标题 → 视频ID → 渠道
+    筛选:同一分享标题链接到至少 min_video_titles 个不同视频标题
+    """
+    # 筛选有相似度数据的记录
+    data = input_df[input_df[sim_col].notna()].copy()
+
+    # 按分享标题统计视频标题数(而非视频ID)
+    title_video_count = data.groupby('分享标题')['title'].nunique()
+    valid_titles = title_video_count[title_video_count >= min_video_titles].index
+    data = data[data['分享标题'].isin(valid_titles)]
+
+    # 按总点击排序,取 top_n
+    title_clicks = data.groupby('分享标题')['点击uv'].sum().nlargest(top_n)
+    valid_titles = title_clicks.index.tolist()
+
+    result = []
+    for share_title in valid_titles:
+        title_data = data[data['分享标题'] == share_title]
+
+        # 获取封面(取第一条)
+        cover = ''
+        if '分享封面' in title_data.columns:
+            first_cover = title_data['分享封面'].dropna().iloc[0] if not title_data['分享封面'].dropna().empty else ''
+            cover = str(first_cover) if first_cover else ''
+
+        # 按视频标题分组
+        video_titles_data = []
+        for video_title, vt_group in title_data.groupby('title'):
+            # 按视频ID分组
+            videos_data = []
+            for videoid, vid_group in vt_group.groupby('videoid'):
+                # 按渠道分组
+                channels_data = []
+                for channel, ch_group in vid_group.groupby('channel'):
+                    click = int(ch_group['点击uv'].sum())
+                    return_uv = ch_group['原视频回流uv'].sum() if '原视频回流uv' in ch_group.columns else 0
+                    return_rate = return_uv / click if click > 0 else 0
+                        # 记录该渠道涉及的日期
+                    ch_dates = ch_group['dt'].unique().tolist() if 'dt' in ch_group.columns else []
+                    channels_data.append({
+                        'channel': str(channel) if pd.notna(channel) else '未知',
+                        'click': click,
+                        'return_uv': int(return_uv),
+                        'return_rate': round(float(return_rate), 4),
+                        'dates': ch_dates
+                    })
+
+                vid_click = int(vid_group['点击uv'].sum())
+                vid_return = vid_group['原视频回流uv'].sum() if '原视频回流uv' in vid_group.columns else 0
+                # 获取二级品类(取第一条)
+                category = ''
+                if 'merge二级品类' in vid_group.columns:
+                    cat_val = vid_group['merge二级品类'].dropna().iloc[0] if not vid_group['merge二级品类'].dropna().empty else ''
+                    category = str(cat_val) if cat_val else ''
+                videos_data.append({
+                    'videoid': str(int(videoid)) if pd.notna(videoid) else '',
+                    'category': category,
+                    'total_click': vid_click,
+                    'return_uv': int(vid_return),
+                    'return_rate': round(float(vid_return / vid_click) if vid_click > 0 else 0, 4),
+                    'channels': sorted(channels_data, key=lambda x: x['click'], reverse=True)
+                })
+
+            # 计算该视频标题的相似度(取平均)和回流率
+            sim = vt_group[sim_col].mean()
+            vt_click = int(vt_group['点击uv'].sum())
+            vt_return = vt_group['原视频回流uv'].sum() if '原视频回流uv' in vt_group.columns else 0
+
+            # 统计品类分布(每个品类有多少个不同视频ID)
+            categories_dist = []
+            if 'merge二级品类' in vt_group.columns:
+                cat_video_counts = vt_group.groupby('merge二级品类')['videoid'].nunique()
+                for cat, cnt in cat_video_counts.items():
+                    if pd.notna(cat) and cat:
+                        categories_dist.append({'name': str(cat), 'count': int(cnt)})
+                categories_dist.sort(key=lambda x: x['count'], reverse=True)
+
+            video_titles_data.append({
+                'video_title': str(video_title)[:60] if pd.notna(video_title) else '',
+                'sim': round(float(sim), 2),
+                'categories': categories_dist,
+                'total_click': vt_click,
+                'return_uv': int(vt_return),
+                'return_rate': round(float(vt_return / vt_click) if vt_click > 0 else 0, 4),
+                'videos': sorted(videos_data, key=lambda x: x['total_click'], reverse=True)
+            })
+
+        st_click = int(title_data['点击uv'].sum())
+        st_return = title_data['原视频回流uv'].sum() if '原视频回流uv' in title_data.columns else 0
+
+        result.append({
+            'share_title': str(share_title)[:80] if pd.notna(share_title) else '',
+            'cover': cover,
+            'total_click': st_click,
+            'return_uv': int(st_return),
+            'return_rate': round(float(st_return / st_click) if st_click > 0 else 0, 4),
+            'video_titles': sorted(video_titles_data, key=lambda x: x['sim'], reverse=True)
+        })
+
+    return {'share_titles': result, 'count': len(result)}
+
+
+# ========== 视角2:视频 → 分享标题 ==========
+def build_video_view(input_df, min_share_titles=2, top_n=1000):
+    """
+    构建反向视角数据:视频 → 分享标题 → 视频ID → 渠道
+    筛选:同一视频被至少 min_share_titles 个不同分享标题引用
+    """
+    data = input_df[input_df[sim_col].notna()].copy()
+
+    # 按视频统计分享标题数
+    video_title_count = data.groupby('videoid')['分享标题'].nunique()
+    valid_videos = video_title_count[video_title_count >= min_share_titles].index
+    data = data[data['videoid'].isin(valid_videos)]
+
+    # 按总点击排序,取 top_n
+    video_clicks = data.groupby('videoid')['点击uv'].sum().nlargest(top_n)
+    valid_videos = video_clicks.index.tolist()
+
+    result = []
+    for videoid in valid_videos:
+        video_data = data[data['videoid'] == videoid]
+
+        # 获取视频标题(取第一条)
+        video_title = video_data['title'].iloc[0] if not video_data.empty else ''
+
+        # 按分享标题分组
+        share_titles_data = []
+        for share_title, st_group in video_data.groupby('分享标题'):
+            # 获取封面
+            cover = ''
+            if '分享封面' in st_group.columns:
+                first_cover = st_group['分享封面'].dropna().iloc[0] if not st_group['分享封面'].dropna().empty else ''
+                cover = str(first_cover) if first_cover else ''
+
+            # 计算相似度
+            sim = st_group[sim_col].mean()
+
+            # 按渠道分组(这里简化,不再按videoid分,因为videoid已固定)
+            channels_data = []
+            for channel, ch_group in st_group.groupby('channel'):
+                click = int(ch_group['点击uv'].sum())
+                return_uv = ch_group['原视频回流uv'].sum() if '原视频回流uv' in ch_group.columns else 0
+                return_rate = return_uv / click if click > 0 else 0
+                channels_data.append({
+                    'channel': str(channel) if pd.notna(channel) else '未知',
+                    'click': click,
+                    'return_uv': int(return_uv),
+                    'return_rate': round(float(return_rate), 4)
+                })
+
+            st_click = int(st_group['点击uv'].sum())
+            st_return = st_group['原视频回流uv'].sum() if '原视频回流uv' in st_group.columns else 0
+            share_titles_data.append({
+                'share_title': str(share_title)[:80] if pd.notna(share_title) else '',
+                'cover': cover,
+                'sim': round(float(sim), 2),
+                'total_click': st_click,
+                'return_uv': int(st_return),
+                'return_rate': round(float(st_return / st_click) if st_click > 0 else 0, 4),
+                'channels': sorted(channels_data, key=lambda x: x['click'], reverse=True)
+            })
+
+        v_click = int(video_data['点击uv'].sum())
+        v_return = video_data['原视频回流uv'].sum() if '原视频回流uv' in video_data.columns else 0
+        # 获取二级品类(取第一条)
+        category = ''
+        if 'merge二级品类' in video_data.columns:
+            cat_val = video_data['merge二级品类'].dropna().iloc[0] if not video_data['merge二级品类'].dropna().empty else ''
+            category = str(cat_val) if cat_val else ''
+        result.append({
+            'videoid': str(int(videoid)) if pd.notna(videoid) else '',
+            'video_title': str(video_title)[:60] if pd.notna(video_title) else '',
+            'category': category,
+            'total_click': v_click,
+            'return_uv': int(v_return),
+            'return_rate': round(float(v_return / v_click) if v_click > 0 else 0, 4),
+            'share_titles': sorted(share_titles_data, key=lambda x: x['sim'], reverse=True)
+        })
+
+    return {'videos': result, 'count': len(result)}
+
+
+# 构建数据(按日期 + 全部)
+print("构建数据...")
+share_data_by_date = {}
+video_data_by_date = {}
+
+# 全部数据
+print("  [全部] 构建中...")
+share_data_by_date['all'] = build_share_title_view(df)
+video_data_by_date['all'] = build_video_view(df)
+print(f"    分享标题: {share_data_by_date['all']['count']}, 视频: {video_data_by_date['all']['count']}")
+
+# 按日期构建
+for dt in sorted(dates):
+    df_dt = df[df['dt'] == dt]
+    share_view = build_share_title_view(df_dt)
+    video_view = build_video_view(df_dt)
+    if share_view['count'] > 0 or video_view['count'] > 0:
+        share_data_by_date[dt] = share_view
+        video_data_by_date[dt] = video_view
+        print(f"  [{dt}] 分享标题: {share_view['count']}, 视频: {video_view['count']}")
+
+# 转 JSON
+share_data_json = json.dumps(share_data_by_date, ensure_ascii=False)
+video_data_json = json.dumps(video_data_by_date, ensure_ascii=False)
+
+# 生成日期选项 HTML
+date_options = ['<option value="all" selected>全部</option>']
+for dt in sorted(dates):
+    label = f"{dt[4:6]}-{dt[6:8]}"
+    date_options.append(f'<option value="{dt}">{label}</option>')
+date_options_html = '\n'.join(date_options)
+
+html_content = f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>标题相关性分析 - 分层下钻</title>
+    <style>
+        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+               background: #f5f5f5; padding: 20px; }}
+        .container {{ max-width: 1200px; margin: 0 auto; }}
+        h1 {{ font-size: 24px; margin-bottom: 10px; color: #333; }}
+        .subtitle {{ color: #666; margin-bottom: 20px; font-size: 14px; }}
+
+        /* Tab 切换 */
+        .tabs {{ display: flex; gap: 10px; margin-bottom: 20px; }}
+        .tab {{ padding: 10px 20px; background: #e0e0e0; border: none; border-radius: 8px 8px 0 0;
+               cursor: pointer; font-size: 14px; color: #666; }}
+        .tab.active {{ background: white; color: #333; font-weight: bold; }}
+
+        .tab-content {{ display: none; background: white; border-radius: 0 8px 8px 8px;
+                       padding: 20px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
+        .tab-content.active {{ display: block; }}
+
+        /* 层级样式 */
+        .level {{ margin-left: 20px; border-left: 2px solid #e0e0e0; padding-left: 15px; }}
+        .level-0 {{ margin-left: 0; border-left: none; padding-left: 0; }}
+
+        .item {{ margin: 8px 0; }}
+        .item-header {{ display: flex; align-items: center; gap: 10px; padding: 10px;
+                       background: #fafafa; border-radius: 6px; cursor: pointer;
+                       border: 1px solid #e8e8e8; transition: background 0.2s; }}
+        .item-header:hover {{ background: #f0f0f0; }}
+        .item-header.expanded {{ background: #e8f4fc; border-color: #b3d9f2; }}
+
+        .toggle {{ width: 20px; color: #999; font-size: 12px; flex-shrink: 0; }}
+        .cover-thumb {{ width: 50px; height: 38px; object-fit: cover; border-radius: 4px;
+                       flex-shrink: 0; background: #eee; cursor: pointer; }}
+        .item-title {{ flex: 1; font-size: 13px; color: #333;
+                      overflow: hidden; text-overflow: ellipsis; white-space: nowrap; }}
+        .item-meta {{ display: flex; gap: 12px; font-size: 12px; color: #666; flex-shrink: 0; }}
+        .count {{ color: #999; font-weight: 500; }}
+        .sim {{ font-weight: bold; }}
+        .sim.high {{ color: #2e7d32; }}
+        .sim.mid {{ color: #f57c00; }}
+        .sim.low {{ color: #c62828; }}
+        .rate {{ font-weight: bold; padding: 2px 6px; border-radius: 3px; }}
+
+        .item-children {{ display: none; margin-top: 5px; }}
+        .item-children.show {{ display: block; }}
+
+        /* 渠道明细(最内层)*/
+        .channel-row {{ display: flex; gap: 15px; padding: 6px 10px; font-size: 12px;
+                       background: #fafafa; margin: 4px 0; border-radius: 4px; }}
+        .channel-name {{ width: 100px; color: #666; }}
+        .channel-metrics {{ display: flex; gap: 15px; }}
+
+        /* 视频链接 */
+        .video-link {{ color: #667eea; text-decoration: none; font-size: 12px; }}
+        .video-link:hover {{ text-decoration: underline; }}
+        .category {{ font-size: 11px; color: #666; background: #f0f0f0; padding: 2px 6px;
+                    border-radius: 3px; margin-left: 6px; }}
+        .categories-dist {{ margin-left: 8px; }}
+        .cat-tag {{ font-size: 10px; color: #fff; background: #78909c; padding: 1px 5px;
+                   border-radius: 2px; margin-right: 4px; white-space: nowrap; }}
+        .rank {{ font-size: 11px; color: #999; font-weight: bold; min-width: 28px; flex-shrink: 0; }}
+        .controls input[type="number"] {{ padding: 6px 8px; border: 1px solid #ddd; border-radius: 4px; font-size: 14px; }}
+
+        /* 图片模态框 */
+        .img-modal {{ display: none; position: fixed; top: 0; left: 0; width: 100%; height: 100%;
+                     background: rgba(0,0,0,0.8); z-index: 1000; justify-content: center; align-items: center; }}
+        .img-modal img {{ max-width: 90%; max-height: 90%; border-radius: 8px; }}
+        .img-modal.show {{ display: flex; }}
+
+        /* 统计信息 */
+        .stats {{ font-size: 13px; color: #666; margin-bottom: 15px; padding: 10px;
+                 background: #fff3e0; border-radius: 6px; }}
+
+        /* 日期筛选 */
+        .controls {{ display: flex; gap: 20px; margin-bottom: 20px; padding: 15px; background: white;
+                    border-radius: 8px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); align-items: center; }}
+        .control-group {{ display: flex; align-items: center; gap: 8px; }}
+        .control-group label {{ font-weight: 500; color: #666; font-size: 13px; }}
+        .controls select {{ padding: 8px 12px; border: 1px solid #ddd; border-radius: 4px; font-size: 14px; min-width: 100px; }}
+        .date-switcher {{ display: flex; align-items: center; gap: 5px; }}
+        .date-switcher button {{ padding: 5px 10px; border: 1px solid #ddd; background: white;
+                                cursor: pointer; border-radius: 3px; }}
+        .date-switcher button:hover {{ background: #f0f0f0; }}
+        .play-btn {{ background: #4CAF50; color: white; border: none; border-radius: 4px;
+                    padding: 6px 12px; font-size: 13px; cursor: pointer; }}
+        .play-btn:hover {{ background: #45a049; }}
+        .play-btn.playing {{ background: #f44336; }}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>标题相关性分析 - 分层下钻</h1>
+        <p class="subtitle">点击展开/收起各层级,分析分享标题与视频标题的相似度对回流率的影响</p>
+
+        <div class="controls">
+            <div class="control-group date-switcher">
+                <label>日期:</label>
+                <button onclick="switchDate(-1)">◀</button>
+                <select id="date-select" onchange="onFilterChange()">{date_options_html}</select>
+                <button onclick="switchDate(1)">▶</button>
+                <button id="play-btn" class="play-btn" onclick="togglePlay()">▶ 播放</button>
+            </div>
+            <div class="control-group">
+                <label>渠道:</label>
+                <select id="channel-select" onchange="onFilterChange()">
+                    <option value="all" selected>全部</option>
+                </select>
+            </div>
+            <div class="control-group">
+                <label>品类:</label>
+                <select id="category-select" onchange="onFilterChange()">
+                    <option value="all" selected>全部</option>
+                </select>
+            </div>
+            <div class="control-group">
+                <label>最小点击:</label>
+                <input type="number" id="min-click-input" value="100" min="0" step="100" style="width:70px" onchange="onFilterChange()">
+            </div>
+            <div class="control-group">
+                <label>Top:</label>
+                <input type="number" id="top-input" value="50" min="1" max="500" style="width:60px" onchange="onFilterChange()">
+            </div>
+        </div>
+
+        <div class="tabs">
+            <button class="tab active" onclick="switchTab('view1')">分享标题 → 视频</button>
+            <button class="tab" onclick="switchTab('view2')">视频 → 分享标题</button>
+        </div>
+
+        <div id="view1" class="tab-content active">
+            <div class="stats">视角1:同一分享标题配不同视频的效果对比</div>
+            <div id="share-title-list" class="level level-0"></div>
+        </div>
+
+        <div id="view2" class="tab-content">
+            <div class="stats">视角2:同一视频用不同分享标题推广的效果对比</div>
+            <div id="video-list" class="level level-0"></div>
+        </div>
+
+        <div id="imgModal" class="img-modal" onclick="this.classList.remove('show')">
+            <img id="modalImg" src="" alt="封面预览">
+        </div>
+    </div>
+
+    <script>
+    const allDates = {dates_json};
+    const allChannels = {channels_json};
+    const allCategories = {categories_json};
+    const shareDataByDate = {share_data_json};
+    const videoDataByDate = {video_data_json};
+
+    let playInterval = null;
+
+    // 初始化渠道下拉框
+    function initChannelSelect() {{
+        const select = document.getElementById('channel-select');
+        allChannels.forEach(ch => {{
+            const opt = document.createElement('option');
+            opt.value = ch;
+            opt.textContent = ch;
+            select.appendChild(opt);
+        }});
+    }}
+
+    // 初始化品类下拉框
+    function initCategorySelect() {{
+        const select = document.getElementById('category-select');
+        allCategories.forEach(cat => {{
+            const opt = document.createElement('option');
+            opt.value = cat;
+            opt.textContent = cat;
+            select.appendChild(opt);
+        }});
+    }}
+
+    // 获取当前筛选条件
+    function getFilters() {{
+        return {{
+            date: document.getElementById('date-select').value,
+            channel: document.getElementById('channel-select').value,
+            category: document.getElementById('category-select').value,
+            minClick: parseInt(document.getElementById('min-click-input').value) || 0,
+            top: parseInt(document.getElementById('top-input').value) || 50
+        }};
+    }}
+
+    // 按渠道过滤数据并重新计算统计(递归)
+    function filterByChannel(data, channel) {{
+        if (channel === 'all') return JSON.parse(JSON.stringify(data));
+
+        // 深拷贝
+        const filtered = JSON.parse(JSON.stringify(data));
+
+        // 视角1: share_titles -> video_titles -> videos -> channels
+        if (filtered.share_titles) {{
+            filtered.share_titles = filtered.share_titles.map(st => {{
+                st.video_titles = st.video_titles.map(vt => {{
+                    vt.videos = vt.videos.map(v => {{
+                        // 过滤渠道
+                        v.channels = v.channels.filter(ch => ch.channel === channel);
+                        // 重算视频层统计
+                        v.total_click = v.channels.reduce((s, c) => s + c.click, 0);
+                        v.return_uv = v.channels.reduce((s, c) => s + c.return_uv, 0);
+                        v.return_rate = v.total_click > 0 ? v.return_uv / v.total_click : 0;
+                        return v;
+                    }}).filter(v => v.channels.length > 0);
+                    // 重算视频标题层统计
+                    vt.total_click = vt.videos.reduce((s, v) => s + v.total_click, 0);
+                    vt.return_uv = vt.videos.reduce((s, v) => s + v.return_uv, 0);
+                    vt.return_rate = vt.total_click > 0 ? vt.return_uv / vt.total_click : 0;
+                    return vt;
+                }}).filter(vt => vt.videos.length > 0);
+                // 重算分享标题层统计
+                st.total_click = st.video_titles.reduce((s, vt) => s + vt.total_click, 0);
+                st.return_uv = st.video_titles.reduce((s, vt) => s + vt.return_uv, 0);
+                st.return_rate = st.total_click > 0 ? st.return_uv / st.total_click : 0;
+                return st;
+            }}).filter(st => st.video_titles.length > 0);
+            // 按点击重新排序
+            filtered.share_titles.sort((a, b) => b.total_click - a.total_click);
+            filtered.count = filtered.share_titles.length;
+        }}
+
+        // 视角2: videos -> share_titles -> channels
+        if (filtered.videos) {{
+            filtered.videos = filtered.videos.map(v => {{
+                v.share_titles = v.share_titles.map(st => {{
+                    // 过滤渠道
+                    st.channels = st.channels.filter(ch => ch.channel === channel);
+                    // 重算分享标题层统计
+                    st.total_click = st.channels.reduce((s, c) => s + c.click, 0);
+                    st.return_uv = st.channels.reduce((s, c) => s + c.return_uv, 0);
+                    st.return_rate = st.total_click > 0 ? st.return_uv / st.total_click : 0;
+                    return st;
+                }}).filter(st => st.channels.length > 0);
+                // 重算视频层统计
+                v.total_click = v.share_titles.reduce((s, st) => s + st.total_click, 0);
+                v.return_uv = v.share_titles.reduce((s, st) => s + st.return_uv, 0);
+                v.return_rate = v.total_click > 0 ? v.return_uv / v.total_click : 0;
+                return v;
+            }}).filter(v => v.share_titles.length > 0);
+            // 按点击重新排序
+            filtered.videos.sort((a, b) => b.total_click - a.total_click);
+            filtered.count = filtered.videos.length;
+        }}
+
+        return filtered;
+    }}
+
+    // 按品类过滤数据(两个视角逻辑不同)
+    function filterByCategory(shareData, videoData, category) {{
+        if (category === 'all') {{
+            return {{ share: shareData, video: videoData }};
+        }}
+
+        // 深拷贝
+        let filteredShare = JSON.parse(JSON.stringify(shareData));
+        let filteredVideo = JSON.parse(JSON.stringify(videoData));
+
+        // 视角1:筛选包含该品类视频的分享标题
+        if (filteredShare.share_titles) {{
+            filteredShare.share_titles = filteredShare.share_titles.map(st => {{
+                st.video_titles = st.video_titles.map(vt => {{
+                    // 筛选该品类的视频
+                    vt.videos = vt.videos.filter(v => v.category === category);
+                    // 重算视频标题层统计
+                    vt.total_click = vt.videos.reduce((s, v) => s + v.total_click, 0);
+                    vt.return_uv = vt.videos.reduce((s, v) => s + v.return_uv, 0);
+                    vt.return_rate = vt.total_click > 0 ? vt.return_uv / vt.total_click : 0;
+                    return vt;
+                }}).filter(vt => vt.videos.length > 0);
+                // 重算分享标题层统计
+                st.total_click = st.video_titles.reduce((s, vt) => s + vt.total_click, 0);
+                st.return_uv = st.video_titles.reduce((s, vt) => s + vt.return_uv, 0);
+                st.return_rate = st.total_click > 0 ? st.return_uv / st.total_click : 0;
+                return st;
+            }}).filter(st => st.video_titles.length > 0);
+            filteredShare.share_titles.sort((a, b) => b.total_click - a.total_click);
+            filteredShare.count = filteredShare.share_titles.length;
+        }}
+
+        // 视角2:直接筛选该品类的视频
+        if (filteredVideo.videos) {{
+            filteredVideo.videos = filteredVideo.videos.filter(v => v.category === category);
+            filteredVideo.videos.sort((a, b) => b.total_click - a.total_click);
+            filteredVideo.count = filteredVideo.videos.length;
+        }}
+
+        return {{ share: filteredShare, video: filteredVideo }};
+    }}
+
+    // 获取当前选中日期的数据(带筛选)
+    function getCurrentData() {{
+        const filters = getFilters();
+        let shareData = shareDataByDate[filters.date] || {{ share_titles: [], count: 0 }};
+        let videoData = videoDataByDate[filters.date] || {{ videos: [], count: 0 }};
+
+        // 按渠道过滤(会重新计算统计)
+        shareData = filterByChannel(shareData, filters.channel);
+        videoData = filterByChannel(videoData, filters.channel);
+
+        // 按品类过滤(两个视角逻辑不同)
+        const categoryFiltered = filterByCategory(shareData, videoData, filters.category);
+        shareData = categoryFiltered.share;
+        videoData = categoryFiltered.video;
+
+        // 按最小点击UV过滤(分享标题层 + 视频标题层)
+        if (shareData.share_titles) {{
+            shareData.share_titles = shareData.share_titles
+                .filter(st => st.total_click >= filters.minClick)
+                .map(st => {{
+                    st.video_titles = st.video_titles.filter(vt => vt.total_click >= filters.minClick);
+                    return st;
+                }})
+                .filter(st => st.video_titles.length > 0);  // 移除没有视频标题的
+        }}
+        if (videoData.videos) {{
+            videoData.videos = videoData.videos.filter(v => v.total_click >= filters.minClick);
+        }}
+
+        // 记录真实数量(过滤后、截取前)
+        const realShareCount = shareData.share_titles ? shareData.share_titles.length : 0;
+        const realVideoCount = videoData.videos ? videoData.videos.length : 0;
+
+        // 按 top 截取
+        if (shareData.share_titles) {{
+            shareData.share_titles = shareData.share_titles.slice(0, filters.top);
+        }}
+        if (videoData.videos) {{
+            videoData.videos = videoData.videos.slice(0, filters.top);
+        }}
+
+        return {{
+            share: {{ ...shareData, realCount: realShareCount, showCount: shareData.share_titles ? shareData.share_titles.length : 0 }},
+            video: {{ ...videoData, realCount: realVideoCount, showCount: videoData.videos ? videoData.videos.length : 0 }}
+        }};
+    }}
+
+    // 日期切换
+    function switchDate(delta) {{
+        const select = document.getElementById('date-select');
+        const newIndex = select.selectedIndex + delta;
+        if (newIndex >= 0 && newIndex < select.options.length) {{
+            select.selectedIndex = newIndex;
+            onDateChange();
+        }}
+    }}
+
+    function onDateChange() {{
+        renderShareTitleView();
+        renderVideoView();
+    }}
+
+    // 筛选条件变更(日期/渠道/数量)
+    function onFilterChange() {{
+        renderShareTitleView();
+        renderVideoView();
+    }}
+
+    // 播放功能
+    function togglePlay() {{
+        const btn = document.getElementById('play-btn');
+        if (playInterval) {{
+            clearInterval(playInterval);
+            playInterval = null;
+            btn.classList.remove('playing');
+            btn.textContent = '▶ 播放';
+        }} else {{
+            btn.classList.add('playing');
+            btn.textContent = '■ 停止';
+            playInterval = setInterval(() => {{
+                const select = document.getElementById('date-select');
+                if (select.selectedIndex < select.options.length - 1) {{
+                    select.selectedIndex++;
+                    onDateChange();
+                }} else {{
+                    togglePlay();  // 停止
+                }}
+            }}, 2000);
+        }}
+    }}
+
+    function switchTab(tabId) {{
+        document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
+        document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
+        document.querySelector(`[onclick="switchTab('${{tabId}}')"]`).classList.add('active');
+        document.getElementById(tabId).classList.add('active');
+    }}
+
+    function showImgModal(url) {{
+        if (!url) return;
+        document.getElementById('modalImg').src = url;
+        document.getElementById('imgModal').classList.add('show');
+    }}
+
+    function toggleItem(el) {{
+        const header = el;
+        const children = header.nextElementSibling;
+        if (children && children.classList.contains('item-children')) {{
+            const isExpanded = children.classList.contains('show');
+            children.classList.toggle('show');
+            header.classList.toggle('expanded');
+            header.querySelector('.toggle').textContent = isExpanded ? '▶' : '▼';
+        }}
+    }}
+
+    function simClass(sim) {{
+        return sim >= 0.8 ? 'high' : (sim >= 0.5 ? 'mid' : 'low');
+    }}
+
+    // 回流率渐变色(0% -> 白色, 30%+ -> 深绿)
+    function rateGradient(rate) {{
+        const maxRate = 0.30;  // 30% 为最大值
+        const ratio = Math.min(rate / maxRate, 1);
+        // 从白色 (255,255,255) 渐变到绿色 (200,230,201) 再到深绿 (46,125,50)
+        let r, g, b;
+        if (ratio < 0.5) {{
+            // 白色到浅绿
+            const t = ratio * 2;
+            r = Math.round(255 - t * 55);
+            g = Math.round(255 - t * 25);
+            b = Math.round(255 - t * 54);
+        }} else {{
+            // 浅绿到深绿
+            const t = (ratio - 0.5) * 2;
+            r = Math.round(200 - t * 154);
+            g = Math.round(230 - t * 105);
+            b = Math.round(201 - t * 151);
+        }}
+        const textColor = ratio > 0.6 ? 'white' : '#333';
+        return `background:rgb(${{r}},${{g}},${{b}});color:${{textColor}}`;
+    }}
+
+    function rateClass(rate, allRates) {{
+        if (!allRates || allRates.length === 0) return '';
+        const sorted = [...allRates].sort((a, b) => a - b);
+        const q33 = sorted[Math.floor(sorted.length * 0.33)];
+        const q66 = sorted[Math.floor(sorted.length * 0.66)];
+        return rate >= q66 ? 'good' : (rate <= q33 ? 'bad' : '');
+    }}
+
+    // Render the per-channel breakdown rows.
+    // Takes an array of channel records and returns a single HTML string
+    // made of one .channel-row block per record, joined without separator.
+    // Fields read from each record: channel, click, return_uv, return_rate.
+    // return_rate is shown as a percentage with one decimal and coloured
+    // through rateGradient.
+    // NOTE(review): ch.channel is interpolated without HTML escaping;
+    // confirm the upstream data cannot contain markup characters.
+    function renderChannels(channels) {{
+        return channels.map(ch => `
+            <div class="channel-row">
+                <span class="channel-name">${{ch.channel}}</span>
+                <div class="channel-metrics">
+                    <span>点击 ${{ch.click.toLocaleString()}}</span>
+                    <span>回流 ${{ch.return_uv.toLocaleString()}}</span>
+                    <span class="rate" style="${{rateGradient(ch.return_rate)}}">回流率 ${{(ch.return_rate * 100).toFixed(1)}}%</span>
+                </div>
+            </div>
+        `).join('');
+    }}
+
+    // ========== View 1: share title -> video ==========
+    // Rebuilds the contents of #share-title-list from getCurrentData().share.
+    // Drill-down levels, outermost first:
+    //   1. share title  (cover, share_title, total_click, return_rate)
+    //   2. video title  (video_title, categories, sim, total_click, return_rate)
+    //   3. video id     (videoid, category, total_click, return_rate)
+    //   4. channel rows (delegated to renderChannels)
+    // Every header toggles its following .item-children block via toggleItem.
+    // NOTE(review): titles, cover URLs and category names are interpolated
+    // into markup and attribute values without escaping; a title containing
+    // a double quote or an angle bracket would presumably break the layout.
+    // Confirm whether the data is sanitised before it is embedded.
+    function renderShareTitleView() {{
+        const container = document.getElementById('share-title-list');
+        const data = getCurrentData().share;
+        // Nothing to show for this date/filter combination: placeholder only.
+        if (!data.share_titles || data.share_titles.length === 0) {{
+            container.innerHTML = '<p style="color:#999;">该日期数据不足</p>';
+            return;
+        }}
+
+        // Summary line (total count vs. displayed count) followed by one
+        // collapsible item per share title; idx + 1 is the displayed rank.
+        container.innerHTML = `<p style="margin-bottom:10px;">共 ${{data.realCount}} 个可对比分享标题,当前显示 ${{data.showCount}} 个</p>` +
+            data.share_titles.map((st, idx) => `
+                <div class="item">
+                    <div class="item-header" onclick="toggleItem(this)">
+                        <span class="toggle">▶</span>
+                        <span class="rank">#${{idx + 1}}</span>
+                        ${{st.cover ? `<img class="cover-thumb" src="${{st.cover}}" loading="lazy" onclick="event.stopPropagation();showImgModal('${{st.cover}}')" onerror="this.style.display='none'">` : ''}}
+                        <span class="item-title" title="${{st.share_title}}">${{st.share_title}}</span>
+                        <div class="item-meta">
+                            <span class="count">·${{st.video_titles.length}}</span>
+                            <span>点击 ${{st.total_click.toLocaleString()}}</span>
+                            <span class="rate" style="${{rateGradient(st.return_rate)}}">回流率 ${{(st.return_rate * 100).toFixed(1)}}%</span>
+                        </div>
+                    </div>
+                    <div class="item-children">
+                        <div class="level">
+                            ${{st.video_titles.map(vt => `
+                                <div class="item">
+                                    <div class="item-header" onclick="toggleItem(this)">
+                                        <span class="toggle">▶</span>
+                                        <span class="item-title">
+                                            ${{vt.video_title}}
+                                            ${{vt.categories && vt.categories.length > 0 ? `<span class="categories-dist">${{vt.categories.map(c => `<span class="cat-tag">${{c.name}}:${{c.count}}</span>`).join('')}}</span>` : ''}}
+                                        </span>
+                                        <div class="item-meta">
+                                            <span class="count">·${{vt.videos.length}}</span>
+                                            <span class="sim ${{simClass(vt.sim)}}">相似度 ${{vt.sim.toFixed(2)}}</span>
+                                            <span>点击 ${{vt.total_click.toLocaleString()}}</span>
+                                            <span class="rate" style="${{rateGradient(vt.return_rate)}}">回流率 ${{(vt.return_rate * 100).toFixed(1)}}%</span>
+                                        </div>
+                                    </div>
+                                    <div class="item-children">
+                                        <div class="level">
+                                            ${{vt.videos.map(v => `
+                                                <div class="item">
+                                                    <div class="item-header" onclick="toggleItem(this)">
+                                                        <span class="toggle">▶</span>
+                                                        <span class="item-title">
+                                                            <a class="video-link" href="https://admin.piaoquantv.com/cms/post-detail/${{v.videoid}}/detail" target="_blank" onclick="event.stopPropagation()">[${{v.videoid}}]</a>
+                                                            ${{v.category ? `<span class="category">${{v.category}}</span>` : ''}}
+                                                        </span>
+                                                        <div class="item-meta">
+                                                            <span class="count">·${{v.channels.length}}</span>
+                                                            <span>点击 ${{v.total_click.toLocaleString()}}</span>
+                                                            <span class="rate" style="${{rateGradient(v.return_rate)}}">回流率 ${{(v.return_rate * 100).toFixed(1)}}%</span>
+                                                        </div>
+                                                    </div>
+                                                    <div class="item-children">
+                                                        <div class="level">
+                                                            ${{renderChannels(v.channels)}}
+                                                        </div>
+                                                    </div>
+                                                </div>
+                                            `).join('')}}
+                                        </div>
+                                    </div>
+                                </div>
+                            `).join('')}}
+                        </div>
+                    </div>
+                </div>
+            `).join('');
+    }}
+
+    // ========== View 2: video -> share title ==========
+    // Rebuilds the contents of #video-list from getCurrentData().video.
+    // Drill-down levels, outermost first:
+    //   1. video        (videoid, category, video_title, total_click, return_rate)
+    //   2. share title  (cover, share_title, sim, total_click, return_rate)
+    //   3. channel rows (delegated to renderChannels)
+    // Every header toggles its following .item-children block via toggleItem.
+    // NOTE(review): titles, cover URLs and category names are interpolated
+    // into markup and attribute values without escaping; confirm whether the
+    // data is sanitised before it is embedded.
+    function renderVideoView() {{
+        const container = document.getElementById('video-list');
+        const data = getCurrentData().video;
+        // Nothing to show for this date/filter combination: placeholder only.
+        if (!data.videos || data.videos.length === 0) {{
+            container.innerHTML = '<p style="color:#999;">该日期数据不足</p>';
+            return;
+        }}
+
+        // Summary line (total count vs. displayed count) followed by one
+        // collapsible item per video; idx + 1 is the displayed rank.
+        container.innerHTML = `<p style="margin-bottom:10px;">共 ${{data.realCount}} 个可对比视频,当前显示 ${{data.showCount}} 个</p>` +
+            data.videos.map((v, idx) => `
+                <div class="item">
+                    <div class="item-header" onclick="toggleItem(this)">
+                        <span class="toggle">▶</span>
+                        <span class="rank">#${{idx + 1}}</span>
+                        <span class="item-title">
+                            <a class="video-link" href="https://admin.piaoquantv.com/cms/post-detail/${{v.videoid}}/detail" target="_blank" onclick="event.stopPropagation()">[${{v.videoid}}]</a>
+                            ${{v.category ? `<span class="category">${{v.category}}</span>` : ''}}
+                            ${{v.video_title}}
+                        </span>
+                        <div class="item-meta">
+                            <span class="count">·${{v.share_titles.length}}</span>
+                            <span>点击 ${{v.total_click.toLocaleString()}}</span>
+                            <span class="rate" style="${{rateGradient(v.return_rate)}}">回流率 ${{(v.return_rate * 100).toFixed(1)}}%</span>
+                        </div>
+                    </div>
+                    <div class="item-children">
+                        <div class="level">
+                            ${{v.share_titles.map(st => `
+                                <div class="item">
+                                    <div class="item-header" onclick="toggleItem(this)">
+                                        <span class="toggle">▶</span>
+                                        ${{st.cover ? `<img class="cover-thumb" src="${{st.cover}}" loading="lazy" onclick="event.stopPropagation();showImgModal('${{st.cover}}')" onerror="this.style.display='none'">` : ''}}
+                                        <span class="item-title" title="${{st.share_title}}">${{st.share_title}}</span>
+                                        <div class="item-meta">
+                                            <span class="count">·${{st.channels.length}}</span>
+                                            <span class="sim ${{simClass(st.sim)}}">相似度 ${{st.sim.toFixed(2)}}</span>
+                                            <span>点击 ${{st.total_click.toLocaleString()}}</span>
+                                            <span class="rate" style="${{rateGradient(st.return_rate)}}">回流率 ${{(st.return_rate * 100).toFixed(1)}}%</span>
+                                        </div>
+                                    </div>
+                                    <div class="item-children">
+                                        <div class="level">
+                                            ${{renderChannels(st.channels)}}
+                                        </div>
+                                    </div>
+                                </div>
+                            `).join('')}}
+                        </div>
+                    </div>
+                </div>
+            `).join('');
+    }}
+
+    // 初始化
+    initChannelSelect();
+    initCategorySelect();
+    renderShareTitleView();
+    renderVideoView();
+    </script>
+</body>
+</html>
+"""
+
+html_file = output_dir / "标题相关性分析.html"
+with open(html_file, 'w', encoding='utf-8') as f:
+    f.write(html_content)
+
+print(f"\nHTML 报告已生成: {html_file}")

+ 47 - 0
tasks/头部/进入前的I与头部I的相关性分析_v2/标题相关性分析.sql

@@ -0,0 +1,47 @@
+-- Title relevance analysis (simplified version)
+-- Parameter: ${dt} - date, format YYYYMMDD
+-- Scope: only the relevance between share title / article title and video title
+-- Output: one row per combination of the grouping columns below, ordered by
+-- click UV descending and capped at 50000 rows.
+
+SELECT  dt
+        ,channel
+        ,hotsencetype
+        ,合作方名
+        ,公众号名
+        -- Material (creative) dimensions
+        ,rootsourceid
+        ,文章标题
+        ,分享标题
+        ,分享封面
+        -- Video dimensions
+        ,videoid
+        ,title
+        ,merge一级品类
+        ,merge二级品类
+        -- Core metrics (original-video return flow only)
+        -- Click UV: number of distinct mid values in the group.
+        ,COUNT(DISTINCT mid) AS 点击uv
+        -- Return rate: (group-chat + single-chat return UV of rows flagged as
+        -- original video) divided by (click UV + 10).
+        -- NOTE(review): the + 10 in the denominator is presumably additive
+        -- smoothing for low-traffic groups; confirm the intended constant.
+        ,(SUM(CASE WHEN 是否原视频 = '是' THEN 再分享群聊回流uv ELSE 0 END)
+          + SUM(CASE WHEN 是否原视频 = '是' THEN 再分享单聊回流uv ELSE 0 END)
+         ) / (COUNT(DISTINCT mid) + 10) AS 原视频回流率
+        -- Return UV: the numerator of the rate above, exposed on its own.
+        ,SUM(CASE WHEN 是否原视频 = '是' THEN 再分享群聊回流uv ELSE 0 END)
+         + SUM(CASE WHEN 是否原视频 = '是' THEN 再分享单聊回流uv ELSE 0 END) AS 原视频回流uv
+FROM    loghubods.opengid_base_data
+WHERE   dt = '${dt}'
+AND     usersharedepth = 0
+AND     videoid IS NOT NULL
+-- AND binds tighter than OR, so this keeps rows where the article title is
+-- non-empty OR the share title is non-empty.
+AND     (文章标题 IS NOT NULL AND 文章标题 != '' OR 分享标题 IS NOT NULL AND 分享标题 != '')
+GROUP BY dt
+         ,channel
+         ,hotsencetype
+         ,合作方名
+         ,公众号名
+         ,rootsourceid
+         ,文章标题
+         ,分享标题
+         ,分享封面
+         ,videoid
+         ,title
+         ,merge一级品类
+         ,merge二级品类
+ORDER BY 点击uv DESC
+LIMIT   50000
+;