Browse Source

refactor: 重组 tasks 目录结构

- 将旧的分析任务文件(人群品类曝光分析、渠道效果分析、素材分析等)从 tasks 根目录移入 archive 归档目录
- 新增 archive 归档目录
- 新增 00_表的洞察、头部、承接 等新分类目录
- 新增 fetch_daily.py 脚本

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 1 month ago
parent
commit
8a9f206654
100 changed files with 3517 additions and 0 deletions
  1. 182 0
      fetch_daily.py
  2. 7 0
      tasks/00_表的洞察/loghubods.alg_vid_feature_basic_info/01_基本数据.sql
  3. 7 0
      tasks/00_表的洞察/loghubods.dwd_recsys_alg_exposure_base/01_基本数据.sql
  4. 7 0
      tasks/00_表的洞察/loghubods.dwd_recsys_alg_sample_all/01_基本数据.sql
  5. 0 0
      tasks/archive/opengid原始分享数据探索/analyze.py
  6. 0 0
      tasks/archive/opengid原始分享数据探索/query.sql
  7. 0 0
      tasks/archive/opengid数据探索/analyze.py
  8. 0 0
      tasks/archive/opengid数据探索/query.sql
  9. 0 0
      tasks/archive/人群品类曝光分析/.DS_Store
  10. 0 0
      tasks/archive/人群品类曝光分析/query.sql
  11. 0 0
      tasks/archive/人群品类曝光分析/头部关联分析/.DS_Store
  12. 0 0
      tasks/archive/人群品类曝光分析/头部关联分析/query.sql
  13. 0 0
      tasks/archive/人群品类曝光分析/头部关联分析/query_v2_放宽条件.sql
  14. 0 0
      tasks/archive/人群品类曝光分析/头部关联分析/query_关联率对比.sql
  15. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析/.DS_Store
  16. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析/headvideoid分布/query.sql
  17. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析/query.sql
  18. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析/visualize.py
  19. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析_简化版/analyze_category_correlation.py
  20. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析_简化版/query.sql
  21. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析_简化版/visualize.py
  22. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析_简化版/visualize_combined.py
  23. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析_简化版/visualize_correlation.py
  24. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析_过滤小量/.DS_Store
  25. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析_过滤小量/query.sql
  26. 0 0
      tasks/archive/人群品类曝光分析/头部品类分析_过滤小量/visualize.py
  27. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query.sql
  28. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query_v10_关联率排查.sql
  29. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query_v11_放宽条件.sql
  30. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query_v2.sql
  31. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query_v3.sql
  32. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query_v4.sql
  33. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query_v5.sql
  34. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query_v6.sql
  35. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query_v7.sql
  36. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query_v8.sql
  37. 0 0
      tasks/archive/人群品类曝光分析/数据膨胀排查/query_v9.sql
  38. 0 0
      tasks/archive/公众号投流素材缺失排查/analyze.py
  39. 0 0
      tasks/archive/公众号投流素材缺失排查/query.sql
  40. 0 0
      tasks/archive/品类再分享分析/.DS_Store
  41. 0 0
      tasks/archive/品类再分享分析/README.md
  42. 0 0
      tasks/archive/品类再分享分析/query.sql
  43. 0 0
      tasks/archive/品类再分享分析/visualize.py
  44. 0 0
      tasks/archive/品类命中分析/.DS_Store
  45. 0 0
      tasks/archive/品类命中分析/query.sql
  46. 0 0
      tasks/archive/品类命中分析/query_debug.sql
  47. 0 0
      tasks/archive/品类命中分析/query_detail.sql
  48. 0 0
      tasks/archive/品类命中分析/visualize.py
  49. 0 0
      tasks/archive/推荐样本表探索/query.sql
  50. 0 0
      tasks/archive/曝光样本表探索/daily_stats.sql
  51. 0 0
      tasks/archive/曝光样本表探索/query.sql
  52. 0 0
      tasks/archive/渠道再分享回流/query.sql
  53. 0 0
      tasks/archive/渠道场景分布/analyze.py
  54. 0 0
      tasks/archive/渠道场景分布/query.sql
  55. 0 0
      tasks/archive/渠道场景效果分析/README.md
  56. 0 0
      tasks/archive/渠道场景效果分析/query.sql
  57. 0 0
      tasks/archive/渠道场景效果分析/visualize.py
  58. 0 0
      tasks/archive/渠道效果分析/.DS_Store
  59. 0 0
      tasks/archive/渠道效果分析/README.md
  60. 0 0
      tasks/archive/渠道效果分析/analyze.py
  61. 0 0
      tasks/archive/渠道效果分析/query.sql
  62. 0 0
      tasks/archive/渠道效果分析/visualize.py
  63. 0 0
      tasks/archive/渠道用户量统计/analyze.py
  64. 0 0
      tasks/archive/渠道用户量统计/query.sql
  65. 0 0
      tasks/archive/素材字段分析/analyze.py
  66. 0 0
      tasks/archive/素材字段分析/query.sql
  67. 0 0
      tasks/archive/素材视频内容分析/.DS_Store
  68. 0 0
      tasks/archive/素材视频内容分析/README.md
  69. 0 0
      tasks/archive/素材视频内容分析/analyze.py
  70. 0 0
      tasks/archive/素材视频内容分析/query.sql
  71. 0 0
      tasks/archive/素材视频内容分析/visualize.py
  72. 0 0
      tasks/archive/素材视频内容分析/visualize_html.py
  73. 0 0
      tasks/archive/素材视频匹配分析/analyze.py
  74. 0 0
      tasks/archive/素材视频匹配分析/query.sql
  75. 0 0
      tasks/archive/素材视频维度分析/analyze.py
  76. 0 0
      tasks/archive/素材视频维度分析/analyze_match.py
  77. 0 0
      tasks/archive/素材视频维度分析/analyze_material_fields.py
  78. 0 0
      tasks/archive/素材视频维度分析/query.sql
  79. 0 0
      tasks/archive/表关联验证/query.sql
  80. 0 0
      tasks/archive/表关联验证/query_overall.sql
  81. 0 0
      tasks/archive/表关联验证/内外部UV_subsession/query.sql
  82. 0 0
      tasks/archive/表关联验证/内外部验证_subsession/query.sql
  83. 0 0
      tasks/archive/表关联验证/冲突排查/query.sql
  84. 0 0
      tasks/archive/表结构查询_video_dimension_detail_add_column.csv
  85. 0 0
      tasks/archive/视频二级品类分析/README.md
  86. 0 0
      tasks/archive/视频二级品类分析/analyze.py
  87. 0 0
      tasks/archive/视频二级品类分析/query.sql
  88. 0 0
      tasks/archive/视频维度分析/query.sql
  89. 0 0
      tasks/archive/视频维度详情分析/README.md
  90. 0 0
      tasks/archive/视频维度详情分析/analyze.py
  91. 0 0
      tasks/archive/视频维度详情分析/query.sql
  92. 364 0
      tasks/头部/进入前的I与头部I的相关性分析/analyze.py
  93. 184 0
      tasks/头部/进入前的I与头部I的相关性分析/fetch_data.py
  94. 649 0
      tasks/头部/进入前的I与头部I的相关性分析/visualize.py
  95. 101 0
      tasks/头部/进入前的I与头部I的相关性分析/进入前的I与头部I的相关性分析.sql
  96. BIN
      tasks/承接/头部品类与承接品类分析/.DS_Store
  97. 288 0
      tasks/承接/头部品类与承接品类分析/_archive/analyze_category_correlation.py
  98. 86 0
      tasks/承接/头部品类与承接品类分析/_archive/query_range.sql
  99. 874 0
      tasks/承接/头部品类与承接品类分析/_archive/visualize.py
  100. 768 0
      tasks/承接/头部品类与承接品类分析/_archive/visualize_correlation.py

+ 182 - 0
fetch_daily.py

@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+按天增量获取数据 - 通用版本
+支持并发获取,自动跳过已有数据
+
+用法:
+    python fetch_daily.py tasks/xxx/query.sql                    # 获取最近7天
+    python fetch_daily.py tasks/xxx/query.sql --days 30          # 获取最近30天
+    python fetch_daily.py tasks/xxx/query.sql --start 20260101 --end 20260107
+    python fetch_daily.py tasks/xxx/query.sql --date 20260105    # 单天
+    python fetch_daily.py tasks/xxx/query.sql --force            # 强制重新获取
+    python fetch_daily.py tasks/xxx/query.sql --workers 10       # 设置并发数
+"""
+import argparse
+import sys
+from datetime import datetime, timedelta
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+sys.path.insert(0, str(Path(__file__).parent / "lib"))
+from odps_module import ODPSClient
+
+# 线程安全的计数器
+counter_lock = threading.Lock()
+success_count = 0
+fail_count = 0
+
+
+def get_existing_dates(daily_dir):
+    """获取已下载的日期列表"""
+    existing = set()
+    if not daily_dir.exists():
+        return existing
+    for f in daily_dir.glob("*.csv"):
+        try:
+            dt = f.stem
+            if len(dt) == 8 and dt.isdigit():
+                existing.add(dt)
+        except:
+            pass
+    return existing
+
+
+def get_date_range(start_str, end_str):
+    """生成日期范围列表"""
+    start = datetime.strptime(start_str, "%Y%m%d")
+    end = datetime.strptime(end_str, "%Y%m%d")
+    dates = []
+    current = start
+    while current <= end:
+        dates.append(current.strftime("%Y%m%d"))
+        current += timedelta(days=1)
+    return dates
+
+
+def fetch_single_day(dt, sql_template, daily_dir):
+    """获取单天数据"""
+    global success_count, fail_count
+
+    try:
+        client = ODPSClient()
+        sql = sql_template.replace("${dt}", dt)
+        df = client.execute_sql(sql)
+
+        output_file = daily_dir / f"{dt}.csv"
+
+        if df is not None and len(df) > 0:
+            df.to_csv(output_file, index=False)
+            with counter_lock:
+                success_count += 1
+            return (dt, "success", len(df))
+        elif df is not None:
+            df.to_csv(output_file, index=False)
+            with counter_lock:
+                success_count += 1
+            return (dt, "empty", 0)
+        else:
+            with counter_lock:
+                fail_count += 1
+            return (dt, "fail", 0)
+
+    except Exception as e:
+        with counter_lock:
+            fail_count += 1
+        return (dt, "error", str(e))
+
+
+def main():
+    global success_count, fail_count
+
+    parser = argparse.ArgumentParser(description="按天增量获取数据")
+    parser.add_argument("sql_file", type=str, help="SQL文件路径")
+    parser.add_argument("--days", type=int, default=7, help="获取最近N天 (默认7)")
+    parser.add_argument("--start", type=str, help="开始日期 YYYYMMDD")
+    parser.add_argument("--end", type=str, help="结束日期 YYYYMMDD")
+    parser.add_argument("--date", type=str, help="单天日期 YYYYMMDD")
+    parser.add_argument("--force", action="store_true", help="强制重新获取")
+    parser.add_argument("--workers", type=int, default=5, help="并发数 (默认5)")
+    args = parser.parse_args()
+
+    # 解析 SQL 文件路径
+    sql_file = Path(args.sql_file).resolve()
+    if not sql_file.exists():
+        print(f"错误: 找不到 {sql_file}")
+        return
+
+    # 输出目录:SQL 同目录下的 output/SQL文件名/
+    output_dir = sql_file.parent / "output"
+    daily_dir = output_dir / sql_file.stem
+    daily_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"SQL文件: {sql_file}")
+    print(f"数据目录: {daily_dir}")
+
+    # 确定日期范围
+    if args.date:
+        target_dates = [args.date]
+    elif args.start and args.end:
+        target_dates = get_date_range(args.start, args.end)
+    else:
+        today = datetime.now()
+        end_date = (today - timedelta(days=1)).strftime("%Y%m%d")
+        start_date = (today - timedelta(days=args.days)).strftime("%Y%m%d")
+        target_dates = get_date_range(start_date, end_date)
+
+    print(f"目标日期: {target_dates[0]} ~ {target_dates[-1]} ({len(target_dates)}天)")
+
+    # 检查已有数据
+    existing_dates = get_existing_dates(daily_dir)
+    print(f"已有数据: {len(existing_dates)}天")
+
+    # 确定需要获取的日期
+    if args.force:
+        missing_dates = target_dates
+        print(f"强制模式: 重新获取所有 {len(missing_dates)} 天")
+    else:
+        missing_dates = [d for d in target_dates if d not in existing_dates]
+        print(f"需要获取: {len(missing_dates)}天")
+
+    if not missing_dates:
+        print("没有需要获取的数据,退出")
+        return
+
+    # 读取 SQL 模板
+    sql_template = sql_file.read_text(encoding="utf-8")
+
+    # 重置计数器
+    success_count = 0
+    fail_count = 0
+
+    # 并发获取
+    workers = min(args.workers, len(missing_dates))
+    print(f"\n开始获取 (并发数: {workers})...")
+
+    with ThreadPoolExecutor(max_workers=workers) as executor:
+        futures = {
+            executor.submit(fetch_single_day, dt, sql_template, daily_dir): dt
+            for dt in missing_dates
+        }
+
+        completed = 0
+        for future in as_completed(futures):
+            completed += 1
+            dt, status, info = future.result()
+
+            if status == "success":
+                print(f"  [{completed}/{len(missing_dates)}] ✓ {dt}: {info} 行")
+            elif status == "empty":
+                print(f"  [{completed}/{len(missing_dates)}] ⚠ {dt}: 无数据")
+            elif status == "error":
+                print(f"  [{completed}/{len(missing_dates)}] ✗ {dt}: {info}")
+            else:
+                print(f"  [{completed}/{len(missing_dates)}] ✗ {dt}: 失败")
+
+    print(f"\n完成! 成功: {success_count}, 失败: {fail_count}")
+    print(f"数据目录: {daily_dir}")
+
+
+if __name__ == "__main__":
+    main()

+ 7 - 0
tasks/00_表的洞察/loghubods.alg_vid_feature_basic_info/01_基本数据.sql

@@ -0,0 +1,7 @@
+-- 视频特征表样本数据查看
+-- 使用: python fetch_daily.py tasks/00_表的洞察/loghubods.alg_vid_feature_basic_info/01_基本数据.sql --date 20260107
+
+SELECT *
+FROM loghubods.alg_vid_feature_basic_info
+WHERE dt = '${dt}'
+LIMIT 100

+ 7 - 0
tasks/00_表的洞察/loghubods.dwd_recsys_alg_exposure_base/01_基本数据.sql

@@ -0,0 +1,7 @@
+-- 曝光表样本数据查看
+-- 使用: python fetch_daily.py tasks/00_表的洞察/loghubods.dwd_recsys_alg_exposure_base/01_基本数据.sql --date 20260107
+
+SELECT *
+FROM loghubods.dwd_recsys_alg_exposure_base_20250108
+WHERE dt = '${dt}'
+LIMIT 100

+ 7 - 0
tasks/00_表的洞察/loghubods.dwd_recsys_alg_sample_all/01_基本数据.sql

@@ -0,0 +1,7 @@
+-- 推荐全量样本表数据查看
+-- 使用: python fetch_daily.py tasks/00_表的洞察/loghubods.dwd_recsys_alg_sample_all/01_基本数据.sql --date 20260106
+
+SELECT *
+FROM loghubods.dwd_recsys_alg_sample_all_20250212
+WHERE dt = '${dt}'
+LIMIT 100

+ 0 - 0
tasks/opengid原始分享数据探索/analyze.py → tasks/archive/opengid原始分享数据探索/analyze.py


+ 0 - 0
tasks/opengid原始分享数据探索/query.sql → tasks/archive/opengid原始分享数据探索/query.sql


+ 0 - 0
tasks/opengid数据探索/analyze.py → tasks/archive/opengid数据探索/analyze.py


+ 0 - 0
tasks/opengid数据探索/query.sql → tasks/archive/opengid数据探索/query.sql


+ 0 - 0
tasks/人群品类曝光分析/.DS_Store → tasks/archive/人群品类曝光分析/.DS_Store


+ 0 - 0
tasks/人群品类曝光分析/query.sql → tasks/archive/人群品类曝光分析/query.sql


+ 0 - 0
tasks/人群品类曝光分析/头部关联分析/.DS_Store → tasks/archive/人群品类曝光分析/头部关联分析/.DS_Store


+ 0 - 0
tasks/人群品类曝光分析/头部关联分析/query.sql → tasks/archive/人群品类曝光分析/头部关联分析/query.sql


+ 0 - 0
tasks/人群品类曝光分析/头部关联分析/query_v2_放宽条件.sql → tasks/archive/人群品类曝光分析/头部关联分析/query_v2_放宽条件.sql


+ 0 - 0
tasks/人群品类曝光分析/头部关联分析/query_关联率对比.sql → tasks/archive/人群品类曝光分析/头部关联分析/query_关联率对比.sql


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析/.DS_Store → tasks/archive/人群品类曝光分析/头部品类分析/.DS_Store


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析/headvideoid分布/query.sql → tasks/archive/人群品类曝光分析/头部品类分析/headvideoid分布/query.sql


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析/query.sql → tasks/archive/人群品类曝光分析/头部品类分析/query.sql


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析/visualize.py → tasks/archive/人群品类曝光分析/头部品类分析/visualize.py


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析_简化版/analyze_category_correlation.py → tasks/archive/人群品类曝光分析/头部品类分析_简化版/analyze_category_correlation.py


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析_简化版/query.sql → tasks/archive/人群品类曝光分析/头部品类分析_简化版/query.sql


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析_简化版/visualize.py → tasks/archive/人群品类曝光分析/头部品类分析_简化版/visualize.py


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析_简化版/visualize_combined.py → tasks/archive/人群品类曝光分析/头部品类分析_简化版/visualize_combined.py


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析_简化版/visualize_correlation.py → tasks/archive/人群品类曝光分析/头部品类分析_简化版/visualize_correlation.py


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析_过滤小量/.DS_Store → tasks/archive/人群品类曝光分析/头部品类分析_过滤小量/.DS_Store


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析_过滤小量/query.sql → tasks/archive/人群品类曝光分析/头部品类分析_过滤小量/query.sql


+ 0 - 0
tasks/人群品类曝光分析/头部品类分析_过滤小量/visualize.py → tasks/archive/人群品类曝光分析/头部品类分析_过滤小量/visualize.py


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query.sql


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v10_关联率排查.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query_v10_关联率排查.sql


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v11_放宽条件.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query_v11_放宽条件.sql


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v2.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query_v2.sql


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v3.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query_v3.sql


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v4.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query_v4.sql


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v5.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query_v5.sql


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v6.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query_v6.sql


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v7.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query_v7.sql


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v8.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query_v8.sql


+ 0 - 0
tasks/人群品类曝光分析/数据膨胀排查/query_v9.sql → tasks/archive/人群品类曝光分析/数据膨胀排查/query_v9.sql


+ 0 - 0
tasks/公众号投流素材缺失排查/analyze.py → tasks/archive/公众号投流素材缺失排查/analyze.py


+ 0 - 0
tasks/公众号投流素材缺失排查/query.sql → tasks/archive/公众号投流素材缺失排查/query.sql


+ 0 - 0
tasks/品类再分享分析/.DS_Store → tasks/archive/品类再分享分析/.DS_Store


+ 0 - 0
tasks/品类再分享分析/README.md → tasks/archive/品类再分享分析/README.md


+ 0 - 0
tasks/品类再分享分析/query.sql → tasks/archive/品类再分享分析/query.sql


+ 0 - 0
tasks/品类再分享分析/visualize.py → tasks/archive/品类再分享分析/visualize.py


+ 0 - 0
tasks/品类命中分析/.DS_Store → tasks/archive/品类命中分析/.DS_Store


+ 0 - 0
tasks/品类命中分析/query.sql → tasks/archive/品类命中分析/query.sql


+ 0 - 0
tasks/品类命中分析/query_debug.sql → tasks/archive/品类命中分析/query_debug.sql


+ 0 - 0
tasks/品类命中分析/query_detail.sql → tasks/archive/品类命中分析/query_detail.sql


+ 0 - 0
tasks/品类命中分析/visualize.py → tasks/archive/品类命中分析/visualize.py


+ 0 - 0
tasks/推荐样本表探索/query.sql → tasks/archive/推荐样本表探索/query.sql


+ 0 - 0
tasks/曝光样本表探索/daily_stats.sql → tasks/archive/曝光样本表探索/daily_stats.sql


+ 0 - 0
tasks/曝光样本表探索/query.sql → tasks/archive/曝光样本表探索/query.sql


+ 0 - 0
tasks/渠道再分享回流/query.sql → tasks/archive/渠道再分享回流/query.sql


+ 0 - 0
tasks/渠道场景分布/analyze.py → tasks/archive/渠道场景分布/analyze.py


+ 0 - 0
tasks/渠道场景分布/query.sql → tasks/archive/渠道场景分布/query.sql


+ 0 - 0
tasks/渠道场景效果分析/README.md → tasks/archive/渠道场景效果分析/README.md


+ 0 - 0
tasks/渠道场景效果分析/query.sql → tasks/archive/渠道场景效果分析/query.sql


+ 0 - 0
tasks/渠道场景效果分析/visualize.py → tasks/archive/渠道场景效果分析/visualize.py


+ 0 - 0
tasks/渠道效果分析/.DS_Store → tasks/archive/渠道效果分析/.DS_Store


+ 0 - 0
tasks/渠道效果分析/README.md → tasks/archive/渠道效果分析/README.md


+ 0 - 0
tasks/渠道效果分析/analyze.py → tasks/archive/渠道效果分析/analyze.py


+ 0 - 0
tasks/渠道效果分析/query.sql → tasks/archive/渠道效果分析/query.sql


+ 0 - 0
tasks/渠道效果分析/visualize.py → tasks/archive/渠道效果分析/visualize.py


+ 0 - 0
tasks/渠道用户量统计/analyze.py → tasks/archive/渠道用户量统计/analyze.py


+ 0 - 0
tasks/渠道用户量统计/query.sql → tasks/archive/渠道用户量统计/query.sql


+ 0 - 0
tasks/素材字段分析/analyze.py → tasks/archive/素材字段分析/analyze.py


+ 0 - 0
tasks/素材字段分析/query.sql → tasks/archive/素材字段分析/query.sql


+ 0 - 0
tasks/素材视频内容分析/.DS_Store → tasks/archive/素材视频内容分析/.DS_Store


+ 0 - 0
tasks/素材视频内容分析/README.md → tasks/archive/素材视频内容分析/README.md


+ 0 - 0
tasks/素材视频内容分析/analyze.py → tasks/archive/素材视频内容分析/analyze.py


+ 0 - 0
tasks/素材视频内容分析/query.sql → tasks/archive/素材视频内容分析/query.sql


+ 0 - 0
tasks/素材视频内容分析/visualize.py → tasks/archive/素材视频内容分析/visualize.py


+ 0 - 0
tasks/素材视频内容分析/visualize_html.py → tasks/archive/素材视频内容分析/visualize_html.py


+ 0 - 0
tasks/素材视频匹配分析/analyze.py → tasks/archive/素材视频匹配分析/analyze.py


+ 0 - 0
tasks/素材视频匹配分析/query.sql → tasks/archive/素材视频匹配分析/query.sql


+ 0 - 0
tasks/素材视频维度分析/analyze.py → tasks/archive/素材视频维度分析/analyze.py


+ 0 - 0
tasks/素材视频维度分析/analyze_match.py → tasks/archive/素材视频维度分析/analyze_match.py


+ 0 - 0
tasks/素材视频维度分析/analyze_material_fields.py → tasks/archive/素材视频维度分析/analyze_material_fields.py


+ 0 - 0
tasks/素材视频维度分析/query.sql → tasks/archive/素材视频维度分析/query.sql


+ 0 - 0
tasks/表关联验证/query.sql → tasks/archive/表关联验证/query.sql


+ 0 - 0
tasks/表关联验证/query_overall.sql → tasks/archive/表关联验证/query_overall.sql


+ 0 - 0
tasks/表关联验证/内外部UV_subsession/query.sql → tasks/archive/表关联验证/内外部UV_subsession/query.sql


+ 0 - 0
tasks/表关联验证/内外部验证_subsession/query.sql → tasks/archive/表关联验证/内外部验证_subsession/query.sql


+ 0 - 0
tasks/表关联验证/冲突排查/query.sql → tasks/archive/表关联验证/冲突排查/query.sql


+ 0 - 0
tasks/表结构查询_video_dimension_detail_add_column.csv → tasks/archive/表结构查询_video_dimension_detail_add_column.csv


+ 0 - 0
tasks/视频二级品类分析/README.md → tasks/archive/视频二级品类分析/README.md


+ 0 - 0
tasks/视频二级品类分析/analyze.py → tasks/archive/视频二级品类分析/analyze.py


+ 0 - 0
tasks/视频二级品类分析/query.sql → tasks/archive/视频二级品类分析/query.sql


+ 0 - 0
tasks/视频维度分析/query.sql → tasks/archive/视频维度分析/query.sql


+ 0 - 0
tasks/视频维度详情分析/README.md → tasks/archive/视频维度详情分析/README.md


+ 0 - 0
tasks/视频维度详情分析/analyze.py → tasks/archive/视频维度详情分析/analyze.py


+ 0 - 0
tasks/视频维度详情分析/query.sql → tasks/archive/视频维度详情分析/query.sql


+ 364 - 0
tasks/头部/进入前的I与头部I的相关性分析/analyze.py

@@ -0,0 +1,364 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+素材视频内容分析
+分析视频内容特征(关键词、口播、引导)对传播效果的影响
+包含:文章标题/分享标题与视频标题的相似度计算
+"""
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
+from lib.text_embedding_api import compare_phrases_batch
+
+# 找到最新的输出文件(支持子目录)
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+csv_files = list(output_dir.glob("**/*.csv"))
+if not csv_files:
+    print("没有找到数据文件,请先运行 fetch_data.py")
+    exit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+print(f"读取文件: {latest_file}")
+df = pd.read_csv(latest_file)
+
+# Collected report lines; the script writes them to a text file at the end.
+lines = []
+
+
+def log(text=""):
+    """Print ``text`` and append it to the module-level ``lines`` list."""
+    print(text)
+    lines.append(text)
+
+
+# Report header: name of the CSV being analysed, followed by a blank line.
+log(f"分析文件: {latest_file.name}")
+log()
+
+# ============================================================
+# 计算标题相似度
+# ============================================================
+log("=" * 70)
+log("计算标题相似度...")
+log("=" * 70)
+
+# 准备所有相似度计算对
+# 1. 文章标题 vs 视频标题
+# 2. 分享标题 vs 视频标题
+# 3. 文章标题 vs 视频口播
+# 4. 分享标题 vs 视频口播
+# 5. 文章标题 vs 一级品类
+# 6. 文章标题 vs 二级品类
+# 7. 分享标题 vs 一级品类
+# 8. 分享标题 vs 二级品类
+
+similarity_configs = [
+    ('文章标题', 'title', '文章标题_视频标题_相似度'),
+    ('分享标题', 'title', '分享标题_视频标题_相似度'),
+    ('文章标题', '视频口播', '文章标题_口播_相似度'),
+    ('分享标题', '视频口播', '分享标题_口播_相似度'),
+    ('文章标题', 'merge一级品类', '文章标题_一级品类_相似度'),
+    ('文章标题', 'merge二级品类', '文章标题_二级品类_相似度'),
+    ('分享标题', 'merge一级品类', '分享标题_一级品类_相似度'),
+    ('分享标题', 'merge二级品类', '分享标题_二级品类_相似度'),
+]
+
+BATCH_SIZE = 500
+
+for col1, col2, result_col in similarity_configs:
+    # 初始化结果列
+    df[result_col] = np.nan
+
+    # 准备配对数据
+    pairs = []
+    valid_indices = []
+
+    for idx, row in df.iterrows():
+        text1 = str(row[col1]) if pd.notna(row[col1]) and row[col1] != '' else ''
+        text2 = str(row[col2]) if pd.notna(row[col2]) and row[col2] != '' else ''
+
+        if text1 and text2:
+            pairs.append((text1, text2))
+            valid_indices.append(idx)
+
+    if not pairs:
+        log(f"{result_col}: 无有效数据")
+        continue
+
+    log(f"计算 {result_col}: {len(pairs)} 对")
+
+    # 批量计算
+    scores = []
+    for i in range(0, len(pairs), BATCH_SIZE):
+        batch = pairs[i:i+BATCH_SIZE]
+        results = compare_phrases_batch(batch)
+        scores.extend([r['相似度'] for r in results])
+        if (i + BATCH_SIZE) % 5000 == 0:
+            log(f"  已处理 {min(i+BATCH_SIZE, len(pairs))}/{len(pairs)}")
+
+    # 写入结果
+    for idx, score in zip(valid_indices, scores):
+        df.at[idx, result_col] = score
+
+    log(f"  覆盖率: {df[result_col].notna().mean():.1%}")
+log()
+
+# ============================================================
+# 基本信息
+# ============================================================
+log("=" * 70)
+log("基本信息")
+log("=" * 70)
+log(f"记录数: {len(df):,}")
+log(f"视频数: {df['videoid'].nunique():,}")
+log(f"总点击uv: {df['点击uv'].sum():,}")
+log(f"总回流uv: {df['再分享回流uv'].sum():,}")
+log()
+
+# 字段覆盖率
+log("新增字段覆盖率:")
+for col in ['视频关键词', '视频口播', '视频主题', '传播性判断', '是否有片尾引导']:
+    if col in df.columns:
+        coverage = df[col].notna().sum() / len(df)
+        log(f"  {col}: {coverage:.1%}")
+log()
+
+# ============================================================
+# 传播性判断 vs 实际效果
+# ============================================================
+log("=" * 70)
+log("AI 传播性判断 vs 实际效果")
+log("=" * 70)
+if '传播性判断' in df.columns:
+    spread_stats = df.groupby('传播性判断').agg({
+        'videoid': 'nunique',
+        '点击uv': 'sum',
+        '再分享回流uv': 'sum'
+    }).rename(columns={'videoid': '视频数'})
+    spread_stats['回流率'] = spread_stats['再分享回流uv'] / (spread_stats['点击uv'] + 10)
+    spread_stats = spread_stats.sort_values('点击uv', ascending=False)
+
+    log(f"{'传播性判断':<15} {'视频数':>8} {'点击uv':>12} {'回流uv':>12} {'回流率':>10}")
+    log("-" * 65)
+    for spread, row in spread_stats.iterrows():
+        spread_name = str(spread)[:13] if pd.notna(spread) else '(空)'
+        log(f"{spread_name:<15} {int(row['视频数']):>8,} {int(row['点击uv']):>12,} {int(row['再分享回流uv']):>12,} {row['回流率']:>10.2%}")
+log()
+
+# ============================================================
+# 片尾引导效果分析
+# ============================================================
+log("=" * 70)
+log("片尾引导效果分析")
+log("=" * 70)
+if '是否有片尾引导' in df.columns:
+    guide_stats = df.groupby('是否有片尾引导').agg({
+        'videoid': 'nunique',
+        '点击uv': 'sum',
+        '再分享回流uv': 'sum'
+    }).rename(columns={'videoid': '视频数'})
+    guide_stats['回流率'] = guide_stats['再分享回流uv'] / (guide_stats['点击uv'] + 10)
+
+    log(f"{'是否有引导':<15} {'视频数':>8} {'点击uv':>12} {'回流率':>10}")
+    log("-" * 50)
+    for guide, row in guide_stats.iterrows():
+        guide_name = str(guide)[:13] if pd.notna(guide) else '(空)'
+        log(f"{guide_name:<15} {int(row['视频数']):>8,} {int(row['点击uv']):>12,} {row['回流率']:>10.2%}")
+log()
+
+# 引导强度分析
+if '引导强度' in df.columns:
+    log("引导强度细分:")
+    strength_stats = df.groupby('引导强度').agg({
+        'videoid': 'nunique',
+        '点击uv': 'sum',
+        '再分享回流uv': 'sum'
+    }).rename(columns={'videoid': '视频数'})
+    strength_stats['回流率'] = strength_stats['再分享回流uv'] / (strength_stats['点击uv'] + 10)
+    strength_stats = strength_stats.sort_values('点击uv', ascending=False)
+
+    for strength, row in strength_stats.iterrows():
+        strength_name = str(strength)[:20] if pd.notna(strength) else '(空)'
+        log(f"  {strength_name:<22} 视频数={int(row['视频数']):>5}, 回流率={row['回流率']:.2%}")
+log()
+
+# ============================================================
+# 情感倾向分析
+# ============================================================
+log("=" * 70)
+log("情感倾向效果分析")
+log("=" * 70)
+if '情感倾向' in df.columns:
+    emotion_stats = df.groupby('情感倾向').agg({
+        'videoid': 'nunique',
+        '点击uv': 'sum',
+        '再分享回流uv': 'sum'
+    }).rename(columns={'videoid': '视频数'})
+    emotion_stats['回流率'] = emotion_stats['再分享回流uv'] / (emotion_stats['点击uv'] + 10)
+    emotion_stats = emotion_stats.sort_values('点击uv', ascending=False).head(10)
+
+    log(f"{'情感倾向':<20} {'视频数':>8} {'点击uv':>12} {'回流率':>10}")
+    log("-" * 55)
+    for emotion, row in emotion_stats.iterrows():
+        emotion_name = str(emotion)[:18] if pd.notna(emotion) else '(空)'
+        log(f"{emotion_name:<20} {int(row['视频数']):>8,} {int(row['点击uv']):>12,} {row['回流率']:>10.2%}")
+log()
+
+# ============================================================
+# 视频风格分析
+# ============================================================
+log("=" * 70)
+log("视频风格效果分析(Top 15)")
+log("=" * 70)
+if '视频风格' in df.columns:
+    style_stats = df.groupby('视频风格').agg({
+        'videoid': 'nunique',
+        '点击uv': 'sum',
+        '再分享回流uv': 'sum'
+    }).rename(columns={'videoid': '视频数'})
+    style_stats['回流率'] = style_stats['再分享回流uv'] / (style_stats['点击uv'] + 10)
+    style_stats = style_stats.sort_values('点击uv', ascending=False).head(15)
+
+    log(f"{'视频风格':<25} {'视频数':>8} {'点击uv':>12} {'回流率':>10}")
+    log("-" * 60)
+    for style, row in style_stats.iterrows():
+        style_name = str(style)[:23] if pd.notna(style) else '(空)'
+        log(f"{style_name:<25} {int(row['视频数']):>8,} {int(row['点击uv']):>12,} {row['回流率']:>10.2%}")
+log()
+
+# ============================================================
+# 高回流视频内容特征
+# ============================================================
+log("=" * 70)
+log("高回流视频内容特征(回流率≥30%,点击≥1000)")
+log("=" * 70)
+high_return = df[(df['再分享回流率'] >= 0.3) & (df['点击uv'] >= 1000)]
+log(f"符合条件视频数: {len(high_return)}")
+log()
+
+if len(high_return) > 0:
+    # 传播性分布
+    if '传播性判断' in high_return.columns:
+        spread_dist = high_return['传播性判断'].value_counts(normalize=True)
+        log("传播性判断分布:")
+        for spread, pct in spread_dist.items():
+            log(f"  {spread}: {pct:.1%}")
+        log()
+
+    # 引导分布
+    if '是否有片尾引导' in high_return.columns:
+        guide_dist = high_return['是否有片尾引导'].value_counts(normalize=True)
+        log("片尾引导分布:")
+        for guide, pct in guide_dist.items():
+            log(f"  {guide}: {pct:.1%}")
+        log()
+
+    # Top 视频样例
+    log("Top 10 高回流视频:")
+    log("-" * 70)
+    top_return = high_return.nlargest(10, '再分享回流uv')
+    for _, row in top_return.iterrows():
+        title = str(row['title'])[:40] if pd.notna(row['title']) else '(无标题)'
+        keywords = str(row['视频关键词'])[:50] if pd.notna(row['视频关键词']) else ''
+        log(f"  {title}")
+        log(f"    关键词: {keywords}")
+        log(f"    点击uv={int(row['点击uv'])}, 回流率={row['再分享回流率']:.1%}, 传播性={row['传播性判断']}")
+log()
+
+# ============================================================
+# 关键词词频分析
+# ============================================================
+log("=" * 70)
+log("视频关键词词频(Top 30)")
+log("=" * 70)
+if '视频关键词' in df.columns:
+    # 提取所有关键词
+    all_keywords = []
+    for kw in df['视频关键词'].dropna():
+        if isinstance(kw, str):
+            # 按常见分隔符拆分
+            for sep in [',', ',', '、', ';', ';']:
+                kw = kw.replace(sep, ',')
+            all_keywords.extend([k.strip() for k in kw.split(',') if k.strip()])
+
+    from collections import Counter
+    kw_counts = Counter(all_keywords).most_common(30)
+    for kw, cnt in kw_counts:
+        log(f"  {kw}: {cnt}")
+log()
+
+# ============================================================
+# 标题相似度效果分析
+# ============================================================
+log("=" * 70)
+log("标题相似度效果分析")
+log("=" * 70)
+
+# 所有相似度指标
+similarity_cols = [
+    ('文章标题_视频标题_相似度', '文章标题 vs 视频标题'),
+    ('分享标题_视频标题_相似度', '分享标题 vs 视频标题'),
+    ('文章标题_口播_相似度', '文章标题 vs 视频口播'),
+    ('分享标题_口播_相似度', '分享标题 vs 视频口播'),
+    ('文章标题_一级品类_相似度', '文章标题 vs 一级品类'),
+    ('文章标题_二级品类_相似度', '文章标题 vs 二级品类'),
+    ('分享标题_一级品类_相似度', '分享标题 vs 一级品类'),
+    ('分享标题_二级品类_相似度', '分享标题 vs 二级品类'),
+]
+
+# 相似度汇总统计
+log("\n相似度汇总统计:")
+log(f"{'指标':<30} {'均值':>8} {'中位数':>8} {'标准差':>8} {'覆盖率':>8}")
+log("-" * 70)
+for col, label in similarity_cols:
+    if col in df.columns and df[col].notna().any():
+        mean_val = df[col].mean()
+        median_val = df[col].median()
+        std_val = df[col].std()
+        coverage = df[col].notna().mean()
+        log(f"{label:<30} {mean_val:>8.3f} {median_val:>8.3f} {std_val:>8.3f} {coverage:>8.1%}")
+
+# 逐个分析相似度与回流率的关系
+for col, label in similarity_cols:
+    if col not in df.columns or not df[col].notna().any():
+        continue
+
+    log(f"\n{label} vs 回流率:")
+
+    # 按相似度分组
+    group_col = f'{col}_分组'
+    df[group_col] = pd.cut(
+        df[col],
+        bins=[0, 0.3, 0.5, 0.7, 0.9, 1.0],
+        labels=['低(0-0.3)', '较低(0.3-0.5)', '中等(0.5-0.7)', '较高(0.7-0.9)', '高(0.9-1)']
+    )
+
+    sim_effect = df.groupby(group_col, observed=True).agg({
+        'videoid': 'count',
+        '点击uv': 'sum',
+        '再分享回流uv': 'sum'
+    }).rename(columns={'videoid': '记录数'})
+    sim_effect['回流率'] = sim_effect['再分享回流uv'] / (sim_effect['点击uv'] + 10)
+
+    log(f"{'相似度分组':<20} {'记录数':>8} {'点击uv':>12} {'回流率':>10}")
+    log("-" * 55)
+    for group, row in sim_effect.iterrows():
+        log(f"{str(group):<20} {int(row['记录数']):>8,} {int(row['点击uv']):>12,} {row['回流率']:>10.2%}")
+
+    # 清理临时列
+    df.drop(columns=[group_col], inplace=True)
+
+log()
+
+# 保存带相似度的数据
+output_with_sim = output_dir / f"{latest_file.stem}_含相似度.csv"
+df.to_csv(output_with_sim, index=False)
+log(f"含相似度数据已保存到: {output_with_sim}")
+
+# 保存分析结果
+result_file = output_dir / f"{latest_file.stem}_分析.txt"
+with open(result_file, 'w', encoding='utf-8') as f:
+    f.write("\n".join(lines))
+
+log(f"分析结果已保存到: {result_file}")

+ 184 - 0
tasks/头部/进入前的I与头部I的相关性分析/fetch_data.py

@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
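+Incremental data fetch script.
+Fetches data one day at a time, with incremental updates and concurrent fetching.
+
+Usage:
+    python fetch_data.py 进入前的I与头部I的相关性分析.sql              # fetch the last 7 days
+    python fetch_data.py 进入前的I与头部I的相关性分析.sql --days 30    # fetch the last 30 days
+    python fetch_data.py 进入前的I与头部I的相关性分析.sql --date 20260105
+    python fetch_data.py 进入前的I与头部I的相关性分析.sql --force      # force a re-fetch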
+"""
+import argparse
+import sys
+from datetime import datetime, timedelta
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+
+# Add the shared lib directory to sys.path: four levels up from this file
+# (tasks/头部/进入前的I与头部I的相关性分析 -> data_analysis/lib)
+sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "lib"))
+from odps_module import ODPSClient
+
+# Directory containing this script, and the output directory beneath it
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+
+# Counters updated by the worker threads; modify them only while holding counter_lock
+counter_lock = threading.Lock()
+success_count = 0
+fail_count = 0
+
+
+def get_existing_dates(daily_dir):
+    """Return the set of dates that already have a CSV file in ``daily_dir``.
+
+    Args:
+        daily_dir: ``pathlib.Path`` of the per-day output directory.
+
+    Returns:
+        Set of ``YYYYMMDD`` strings, one for each file named
+        ``<8 digits>.csv``; an empty set when the directory does not exist.
+    """
+    if not daily_dir.exists():
+        return set()
+    # No try/except here: reading a stem and testing its length/digits cannot
+    # fail, and a bare ``except`` would also swallow KeyboardInterrupt.
+    return {
+        f.stem
+        for f in daily_dir.glob("*.csv")
+        if len(f.stem) == 8 and f.stem.isdigit()
+    }
+
+
+def get_date_range(start_str, end_str):
+    """Return every day from ``start_str`` to ``end_str`` inclusive.
+
+    Both arguments and every returned item are ``YYYYMMDD`` strings. The
+    result is an empty list when the start date is after the end date.
+    """
+    first_day = datetime.strptime(start_str, "%Y%m%d")
+    last_day = datetime.strptime(end_str, "%Y%m%d")
+    # Whole days between the two endpoints; negative when the range is reversed
+    span = (last_day - first_day).days
+    return [
+        (first_day + timedelta(days=offset)).strftime("%Y%m%d")
+        for offset in range(span + 1)
+    ]
+
+
+def fetch_single_day(dt, sql_template, daily_dir):
+    """Run the SQL for one day and store the result as ``<dt>.csv``.
+
+    Each call creates its own ODPSClient, so worker threads never share one.
+
+    Args:
+        dt: Day to fetch, substituted for every ``${dt}`` in the template.
+        sql_template: SQL text containing ``${dt}`` placeholders.
+        daily_dir: ``pathlib.Path`` of the directory receiving ``<dt>.csv``.
+
+    Returns:
+        A ``(dt, status, info)`` tuple:
+        ``"success"`` – rows were saved, ``info`` is the row count;
+        ``"empty"`` – the query returned zero rows (a CSV is still written,
+        so the day counts as fetched on later runs), ``info`` is 0;
+        ``"fail"`` – ``execute_sql`` returned ``None``, ``info`` is 0;
+        ``"error"`` – an exception was raised, ``info`` is its message.
+        The module-level success/fail counters are updated under
+        ``counter_lock``.
+    """
+    global success_count, fail_count
+
+    try:
+        client = ODPSClient()
+        sql = sql_template.replace("${dt}", dt)
+        df = client.execute_sql(sql)
+
+        if df is None:
+            with counter_lock:
+                fail_count += 1
+            return (dt, "fail", 0)
+
+        output_file = daily_dir / f"{dt}.csv"
+        # Write to a temporary name and rename afterwards: a crash or error
+        # in the middle of the write must not leave a partial <dt>.csv that
+        # the incremental logic would treat as an already-fetched day.
+        # The ".csv.tmp" name is not matched by the "*.csv" glob.
+        tmp_file = daily_dir / f"{dt}.csv.tmp"
+        try:
+            df.to_csv(tmp_file, index=False)
+            tmp_file.replace(output_file)
+        except Exception:
+            if tmp_file.exists():
+                tmp_file.unlink()
+            raise
+
+        with counter_lock:
+            success_count += 1
+        row_count = len(df)
+        if row_count > 0:
+            return (dt, "success", row_count)
+        return (dt, "empty", 0)
+
+    except Exception as e:
+        # Report the failure to the caller instead of killing the worker thread
+        with counter_lock:
+            fail_count += 1
+        return (dt, "error", str(e))
+
+
+def _is_valid_date(value):
+    """Return True when ``value`` is an 8-digit string naming a real calendar date."""
+    # strptime alone is lenient about field widths, so check the shape first
+    if len(value) != 8 or not value.isdigit():
+        return False
+    try:
+        datetime.strptime(value, "%Y%m%d")
+    except ValueError:
+        return False
+    return True
+
+
+def main():
+    """Command-line entry point: fetch the missing days concurrently.
+
+    Resolves the SQL file relative to this script's directory, stores results
+    under ``output/<sql file stem>/`` and, unless ``--force`` is given, skips
+    days that already have a CSV there. The target days come from ``--date``,
+    else ``--start``/``--end``, else the last ``--days`` days ending yesterday.
+
+    Exits through ``parser.error`` when a date option is malformed or only
+    one of ``--start``/``--end`` is supplied.
+    """
+    global success_count, fail_count
+
+    parser = argparse.ArgumentParser(description="增量获取品类数据")
+    parser.add_argument("sql_file", type=str, help="SQL文件名 (如: 品类组合_按天.sql)")
+    parser.add_argument("--days", type=int, default=7, help="获取最近N天 (默认7)")
+    parser.add_argument("--start", type=str, help="开始日期 YYYYMMDD")
+    parser.add_argument("--end", type=str, help="结束日期 YYYYMMDD")
+    parser.add_argument("--date", type=str, help="单天日期 YYYYMMDD")
+    parser.add_argument("--force", action="store_true", help="强制重新获取")
+    parser.add_argument("--workers", type=int, default=5, help="并发数 (默认5)")
+    args = parser.parse_args()
+
+    # Reject malformed dates up front; otherwise they would be substituted
+    # into the SQL as-is or surface later as an unhandled ValueError.
+    for option in ("date", "start", "end"):
+        value = getattr(args, option)
+        if value is not None and not _is_valid_date(value):
+            parser.error(f"--{option} 需要 YYYYMMDD 格式的有效日期: {value}")
+    # A lone --start or --end used to fall back silently to --days
+    if args.date is None and (args.start is None) != (args.end is None):
+        parser.error("--start 和 --end 必须同时指定")
+
+    # Resolve the SQL file path
+    sql_file = task_dir / args.sql_file
+    if not sql_file.exists():
+        print(f"错误: 找不到 {sql_file}")
+        return
+
+    # Data directory = output / SQL file name without the .sql suffix
+    data_dir_name = sql_file.stem
+    daily_dir = output_dir / data_dir_name
+    daily_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"SQL文件: {sql_file.name}")
+    print(f"数据目录: {daily_dir}")
+
+    # Work out the target date range
+    if args.date:
+        target_dates = [args.date]
+    elif args.start and args.end:
+        target_dates = get_date_range(args.start, args.end)
+    else:
+        today = datetime.now()
+        end_date = (today - timedelta(days=1)).strftime("%Y%m%d")
+        start_date = (today - timedelta(days=args.days)).strftime("%Y%m%d")
+        target_dates = get_date_range(start_date, end_date)
+
+    # --days <= 0 or start > end yields no dates; stop before indexing the list
+    if not target_dates:
+        print("目标日期为空 (请检查 --days 或 --start/--end),退出")
+        return
+
+    print(f"目标日期: {target_dates[0]} ~ {target_dates[-1]} ({len(target_dates)}天)")
+
+    # Check which days are already on disk
+    existing_dates = get_existing_dates(daily_dir)
+    print(f"已有数据: {len(existing_dates)}天")
+
+    # Decide which days still need fetching
+    if args.force:
+        missing_dates = target_dates
+        print(f"强制模式: 重新获取所有 {len(missing_dates)} 天")
+    else:
+        missing_dates = [d for d in target_dates if d not in existing_dates]
+        print(f"需要获取: {len(missing_dates)}天")
+
+    if not missing_dates:
+        print("没有需要获取的数据,退出")
+        return
+
+    # Read the SQL template
+    sql_template = sql_file.read_text(encoding="utf-8")
+
+    # Reset the shared counters
+    success_count = 0
+    fail_count = 0
+
+    # Never start more threads than there are days, and never fewer than one
+    # (ThreadPoolExecutor raises ValueError for max_workers <= 0)
+    workers = max(1, min(args.workers, len(missing_dates)))
+    print(f"\n开始获取 (并发数: {workers})...")
+
+    with ThreadPoolExecutor(max_workers=workers) as executor:
+        futures = {
+            executor.submit(fetch_single_day, dt, sql_template, daily_dir): dt
+            for dt in missing_dates
+        }
+
+        completed = 0
+        for future in as_completed(futures):
+            completed += 1
+            dt, status, info = future.result()
+
+            if status == "success":
+                print(f"  [{completed}/{len(missing_dates)}] ✓ {dt}: {info} 行")
+            elif status == "empty":
+                print(f"  [{completed}/{len(missing_dates)}] ⚠ {dt}: 无数据")
+            elif status == "error":
+                print(f"  [{completed}/{len(missing_dates)}] ✗ {dt}: {info}")
+            else:
+                print(f"  [{completed}/{len(missing_dates)}] ✗ {dt}: 失败")
+
+    print(f"\n完成! 成功: {success_count}, 失败: {fail_count}")
+    print(f"数据目录: {daily_dir}")
+
+
+if __name__ == "__main__":
+    main()

+ 649 - 0
tasks/头部/进入前的I与头部I的相关性分析/visualize.py

@@ -0,0 +1,649 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+素材视频内容分析 - HTML 可视化(简化版:相似度 vs 回流率)
+"""
+import pandas as pd
+import numpy as np
+from pathlib import Path
+import json
+
+# Find the most recently modified CSV in output/, preferring files whose name
+# marks them as containing similarity data and falling back to any CSV
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+csv_files = list(output_dir.glob("*含相似度*.csv"))
+if not csv_files:
+    csv_files = list(output_dir.glob("*.csv"))
+if not csv_files:
+    print("没有找到数据文件")
+    # Raise SystemExit directly: the bare exit() helper is injected by the
+    # site module for interactive use and is missing under `python -S`.
+    raise SystemExit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+print(f"读取文件: {latest_file.name}")
+df = pd.read_csv(latest_file)
+
+# Similarity columns paired with the label shown in the report
+similarity_cols = [
+    ('分享标题_视频标题_相似度', '分享标题与视频标题'),
+    ('分享标题_口播_相似度', '分享标题与口播内容'),
+    ('分享标题_一级品类_相似度', '分享标题与一级品类'),
+    ('分享标题_二级品类_相似度', '分享标题与二级品类'),
+    ('文章标题_视频标题_相似度', '文章标题与视频标题'),
+    ('文章标题_口播_相似度', '文章标题与口播内容'),
+]
+
+
+def _has_enough_data(column):
+    """True when ``column`` exists in df and has more than 100 non-null values."""
+    return column in df.columns and df[column].notna().sum() > 100
+
+
+# Keep only the similarity metrics that are usable for this data file
+valid_cols = [pair for pair in similarity_cols if _has_enough_data(pair[0])]
+
+print(f"有效相似度指标: {len(valid_cols)} 个")
+
+# Return-rate columns, restricted to those present in the data
+rate_cols = [
+    name
+    for name in ('再分享回流率', '原视频再分享回流率', '推荐再分享回流率')
+    if name in df.columns
+]
+
+# Columns of the raw-data table in display order; absent columns are dropped
+candidate_cols = [
+    # date, channel, scene, partner and account come first
+    'dt', 'channel', 'hotsencetype', '合作方名', '公众号名',
+    # titles, cover image and video id
+    '文章标题', '分享封面', '分享标题', 'title', 'videoid',
+    # the return-rate columns
+    *rate_cols,
+    # click volume
+    '点击uv',
+    # similarity metrics
+    *(col for col, _ in valid_cols),
+    # categories go last
+    'merge一级品类', 'merge二级品类',
+]
+table_cols = [c for c in candidate_cols if c in df.columns]
+
+# Keep rows that have a value in at least one of the first two valid similarity columns
+leading_sim_cols = [col for col, _ in valid_cols[:2]]
+has_similarity = df[leading_sim_cols].notna().any(axis=1)
+raw_df = df[has_similarity].copy()
+
+# Total click UV of all rows sharing the same share title
+raw_df['分享标题聚合UV'] = raw_df.groupby('分享标题')['点击uv'].transform('sum')
+
+# Order by aggregated UV, then by the row's own click UV, and keep the top 2000
+raw_df = raw_df.sort_values(['分享标题聚合UV', '点击uv'], ascending=[False, False]).head(2000)
+
+# Place the aggregated-UV column immediately before 点击uv in the table
+if '点击uv' in table_cols:
+    pos = table_cols.index('点击uv')
+    table_cols = table_cols[:pos] + ['分享标题聚合UV'] + table_cols[pos:]
+
+# Missing values become '' so the records serialize cleanly for the page
+raw_data = raw_df[table_cols].fillna('').to_dict('records')
+
+# Similarity buckets shared by every metric
+bins = [0, 0.3, 0.5, 0.7, 0.9, 1.0]
+labels_bin = ['0-0.3', '0.3-0.5', '0.5-0.7', '0.7-0.9', '0.9-1.0']
+
+group_stats = []
+for col, label in valid_cols:
+    # include_lowest=True closes the first interval on the left, so a
+    # similarity of exactly 0 is counted in '0-0.3' instead of becoming NaN.
+    # The buckets live in a local Series, so df never gains a temporary column.
+    bucket = pd.cut(df[col], bins=bins, labels=labels_bin, include_lowest=True)
+
+    stats = []
+    for grp in labels_bin:
+        grp_df = df[bucket == grp]
+        if len(grp_df) == 0:
+            continue
+
+        row = {
+            'group': grp,
+            'count': len(grp_df),
+            'click_uv': int(grp_df['点击uv'].sum()),
+        }
+
+        # Click-weighted mean of each return rate (kept as a raw fraction).
+        # Only rows that have a rate contribute, to numerator and denominator
+        # alike, and the divisor is the plain weight sum: the `> 0` guard
+        # already rules out division by zero, so no +1 is needed.
+        for rate_col in rate_cols:
+            has_rate = grp_df[rate_col].notna()
+            weights = grp_df.loc[has_rate, '点击uv']
+            total_click = weights.sum()
+            weighted = (grp_df.loc[has_rate, rate_col] * weights).sum()
+            row[rate_col] = round(float(weighted / total_click), 4) if total_click > 0 else 0
+
+        stats.append(row)
+
+    group_stats.append({
+        'label': label,
+        'col': col,
+        'stats': stats
+    })
+
+# Header labels: similarity columns use their configured label, the fixed
+# columns are mapped explicitly
+col_labels = {
+    **dict(valid_cols),
+    'dt': '日期',
+    'channel': '渠道',
+    'hotsencetype': '场景类型',
+    '合作方名': '合作方',
+    '公众号名': '公众号',
+    '文章标题': '文章标题',
+    '分享标题': '分享标题',
+    '分享封面': '分享封面',
+    'title': '视频标题',
+    'videoid': '视频ID',
+    'merge一级品类': '一级品类',
+    'merge二级品类': '二级品类',
+    '分享标题聚合UV': '分享标题聚合UV',
+    '点击uv': '点击UV',
+    '再分享回流率': '再分享回流率',
+    '原视频再分享回流率': '原视频回流率',
+    '推荐再分享回流率': '推荐回流率',
+}
+
+
+def _sorted_unique(column):
+    """Sorted distinct non-null values of ``column``; [] when df lacks the column."""
+    if column not in df.columns:
+        return []
+    return sorted(df[column].dropna().unique().tolist())
+
+
+# Option values for the filter drop-downs
+date_list = _sorted_unique('dt')
+channel_list = _sorted_unique('channel')
+hotsencetype_list = _sorted_unique('hotsencetype')
+partner_list = _sorted_unique('合作方名')
+account_list = _sorted_unique('公众号名')
+
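+# Build the HTML report: one f-string template with the data embedded as JSON
+# (literal CSS/JS braces are doubled so the f-string leaves them untouched).
+# NOTE(review): cell values and <option> values are interpolated without HTML
+# escaping, so a title containing <, " or ' can break the markup — consider escaping.
+# NOTE(review): the chart y-axis is named '回流率(%)' but the plotted rates are raw fractions.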
+html_content = f'''<!DOCTYPE html>
+<html lang="zh-CN">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>相似度 vs 回流率分析</title>
+    <script src="https://cdn.jsdelivr.net/npm/echarts@5.4.3/dist/echarts.min.js"></script>
+    <style>
+        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+        body {{
+            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
+            background: #f5f7fa;
+            padding: 20px;
+        }}
+        .container {{ max-width: 1600px; margin: 0 auto; }}
+        h1 {{ text-align: center; color: #333; margin-bottom: 20px; }}
+        .section {{
+            background: white;
+            border-radius: 12px;
+            padding: 20px;
+            margin-bottom: 20px;
+            box-shadow: 0 2px 8px rgba(0,0,0,0.08);
+        }}
+        .section h2 {{
+            color: #333;
+            margin-bottom: 15px;
+            padding-bottom: 10px;
+            border-bottom: 2px solid #667eea;
+            display: inline-block;
+        }}
+
+        /* 图表网格 */
+        .chart-grid {{
+            display: grid;
+            grid-template-columns: repeat(3, 1fr);
+            gap: 15px;
+        }}
+        .chart-item {{ height: 300px; }}
+
+        /* 可排序表格 */
+        .table-controls {{
+            display: flex;
+            gap: 15px;
+            margin-bottom: 15px;
+            flex-wrap: wrap;
+            align-items: center;
+        }}
+        .table-controls input {{
+            padding: 8px 12px;
+            border: 1px solid #ddd;
+            border-radius: 6px;
+            width: 300px;
+        }}
+        .table-controls select {{
+            padding: 8px 12px;
+            border: 1px solid #ddd;
+            border-radius: 6px;
+        }}
+        .table-wrapper {{
+            overflow-x: auto;
+            max-height: 600px;
+            overflow-y: auto;
+        }}
+        table {{
+            width: 100%;
+            border-collapse: collapse;
+            font-size: 13px;
+        }}
+        th {{
+            background: #667eea;
+            color: white;
+            padding: 10px 8px;
+            text-align: left;
+            cursor: pointer;
+            user-select: none;
+            white-space: nowrap;
+            position: sticky;
+            top: 0;
+            z-index: 10;
+        }}
+        th:hover {{ background: #5a6fd6; }}
+        th .sort-icon {{ margin-left: 5px; opacity: 0.5; }}
+        th.sorted .sort-icon {{ opacity: 1; }}
+        td {{
+            padding: 8px;
+            border-bottom: 1px solid #eee;
+            max-width: 250px;
+            overflow: hidden;
+            text-overflow: ellipsis;
+            white-space: nowrap;
+        }}
+        td.wrap {{
+            white-space: normal;
+            word-break: break-word;
+            min-width: 180px;
+            max-width: 220px;
+        }}
+        tr:hover {{ background: #f8f9fa; }}
+        tr:nth-child(even) {{ background: #fafbfc; }}
+        tr:nth-child(even):hover {{ background: #f0f1f2; }}
+        .num {{ text-align: right; font-family: monospace; }}
+        .highlight {{ background: #fff3cd !important; }}
+
+        /* 图片预览模态框 */
+        .img-modal {{
+            display: none;
+            position: fixed;
+            top: 0;
+            left: 0;
+            width: 100%;
+            height: 100%;
+            background: rgba(0,0,0,0.8);
+            z-index: 1000;
+            cursor: pointer;
+            justify-content: center;
+            align-items: center;
+        }}
+        .img-modal img {{
+            max-width: 90%;
+            max-height: 90%;
+            border-radius: 8px;
+            box-shadow: 0 4px 20px rgba(0,0,0,0.3);
+        }}
+        .img-modal.show {{ display: flex; }}
+
+        /* 统计表格 */
+        .stats-table {{ margin-top: 10px; }}
+        .stats-table th {{ background: #5a6fd6; font-size: 12px; }}
+        .stats-table td {{ font-size: 12px; padding: 6px 8px; }}
+
+        @media (max-width: 1200px) {{
+            .chart-grid {{ grid-template-columns: repeat(2, 1fr); }}
+        }}
+        @media (max-width: 768px) {{
+            .chart-grid {{ grid-template-columns: 1fr; }}
+        }}
+    </style>
+</head>
+<body>
+    <!-- 图片预览模态框 -->
+    <div id="imgModal" class="img-modal" onclick="closeImgModal()">
+        <img id="modalImg" src="" alt="预览图片">
+    </div>
+
+    <div class="container">
+        <h1>相似度 vs 回流率分析</h1>
+
+        <!-- 分组统计图表 -->
+        <div class="section">
+            <h2>相似度分组 vs 回流率</h2>
+            <div class="chart-grid">
+                {' '.join(f'<div id="chart_{i}" class="chart-item"></div>' for i in range(len(group_stats)))}
+            </div>
+        </div>
+
+        <!-- 分组统计表格 -->
+        <div class="section">
+            <h2>分组详细数据</h2>
+            <div id="statsTablesContainer"></div>
+        </div>
+
+        <!-- 原始数据表 -->
+        <div class="section">
+            <h2>原始数据(Top 2000 by 点击UV)</h2>
+            <div class="table-controls">
+                <select id="dateFilter">
+                    <option value="">全部日期</option>
+                    {' '.join(f'<option value="{d}">{d}</option>' for d in date_list)}
+                </select>
+                <select id="channelFilter">
+                    <option value="">全部渠道</option>
+                    {' '.join(f'<option value="{c}">{c}</option>' for c in channel_list)}
+                </select>
+                <select id="hotsencetypeFilter">
+                    <option value="">全部场景</option>
+                    {' '.join(f'<option value="{h}">{h}</option>' for h in hotsencetype_list)}
+                </select>
+                <select id="partnerFilter">
+                    <option value="">全部合作方</option>
+                    {' '.join(f'<option value="{p}">{p}</option>' for p in partner_list)}
+                </select>
+                <select id="accountFilter">
+                    <option value="">全部公众号</option>
+                    {' '.join(f'<option value="{a}">{a}</option>' for a in account_list)}
+                </select>
+                <input type="text" id="searchInput" placeholder="搜索标题...">
+                <select id="simFilter">
+                    <option value="">全部相似度</option>
+                    <option value="high">高相似度 (≥0.7)</option>
+                    <option value="mid">中相似度 (0.3-0.7)</option>
+                    <option value="low">低相似度 (<0.3)</option>
+                </select>
+                <label>点击UV ≥ <input type="number" id="minUvInput" value="100" min="0" style="width:80px;padding:8px;border:1px solid #ddd;border-radius:6px;"></label>
+                <span id="rowCount" style="color:#666;"></span>
+            </div>
+            <div class="table-wrapper">
+                <table id="dataTable">
+                    <thead>
+                        <tr id="headerRow"></tr>
+                    </thead>
+                    <tbody id="tableBody"></tbody>
+                </table>
+            </div>
+        </div>
+    </div>
+
+    <script>
+        // 数据
+        const rawData = {json.dumps(raw_data, ensure_ascii=False, default=str)};
+        const groupStats = {json.dumps(group_stats, ensure_ascii=False)};
+        const tableCols = {json.dumps(table_cols, ensure_ascii=False)};
+        const colLabels = {json.dumps(col_labels, ensure_ascii=False)};
+        const rateCols = {json.dumps(rate_cols, ensure_ascii=False)};
+        const validCols = {json.dumps([col for col, _ in valid_cols], ensure_ascii=False)};
+
+        // 当前排序状态
+        let currentSort = {{ col: '分享标题聚合UV', dir: 'desc' }};
+        let filteredData = [...rawData];
+
+        // 渲染图表
+        function renderCharts() {{
+            groupStats.forEach((gs, idx) => {{
+                const chart = echarts.init(document.getElementById('chart_' + idx));
+                const groups = gs.stats.map(s => s.group);
+
+                const series = rateCols.map((rc, i) => ({{
+                    name: colLabels[rc] || rc,
+                    type: 'bar',
+                    data: gs.stats.map(s => s[rc] || 0),
+                    itemStyle: {{ color: ['#667eea', '#f093fb', '#43e97b'][i] }}
+                }}));
+
+                chart.setOption({{
+                    title: {{ text: gs.label, left: 'center', textStyle: {{ fontSize: 14 }} }},
+                    tooltip: {{ trigger: 'axis' }},
+                    legend: {{ bottom: 0, textStyle: {{ fontSize: 10 }} }},
+                    xAxis: {{ type: 'category', data: groups, axisLabel: {{ fontSize: 11 }} }},
+                    yAxis: {{ type: 'value', name: '回流率(%)', axisLabel: {{ fontSize: 10 }} }},
+                    series: series,
+                    grid: {{ top: 50, bottom: 60, left: 50, right: 20 }}
+                }});
+            }});
+        }}
+
+        // 渲染统计表格
+        function renderStatsTables() {{
+            const container = document.getElementById('statsTablesContainer');
+            let html = '<div style="display:grid;grid-template-columns:repeat(2,1fr);gap:20px;">';
+
+            groupStats.forEach(gs => {{
+                html += `<div>
+                    <h4 style="margin-bottom:10px;color:#333;">${{gs.label}}</h4>
+                    <table class="stats-table">
+                        <tr>
+                            <th>相似度区间</th>
+                            <th>记录数</th>
+                            <th>点击UV</th>
+                            ${{rateCols.map(rc => `<th>${{colLabels[rc] || rc}}</th>`).join('')}}
+                        </tr>`;
+
+                gs.stats.forEach(row => {{
+                    html += `<tr>
+                        <td>${{row.group}}</td>
+                        <td class="num">${{row.count.toLocaleString()}}</td>
+                        <td class="num">${{row.click_uv.toLocaleString()}}</td>
+                        ${{rateCols.map(rc => `<td class="num">${{(row[rc] || 0).toFixed(4)}}</td>`).join('')}}
+                    </tr>`;
+                }});
+
+                html += '</table></div>';
+            }});
+
+            html += '</div>';
+            container.innerHTML = html;
+        }}
+
+        // 渲染表头
+        function renderHeader() {{
+            const headerRow = document.getElementById('headerRow');
+            headerRow.innerHTML = tableCols.map(col => {{
+                const label = colLabels[col] || col;
+                const isSorted = currentSort.col === col;
+                const icon = isSorted ? (currentSort.dir === 'asc' ? '▲' : '▼') : '▼';
+                return `<th class="${{isSorted ? 'sorted' : ''}}" onclick="sortBy('${{col}}')">
+                    ${{label}}<span class="sort-icon">${{icon}}</span>
+                </th>`;
+            }}).join('');
+        }}
+
+        // 计算每列的最大最小值(用于渐变)
+        function getColumnRange(data, col) {{
+            const vals = data.map(r => r[col]).filter(v => typeof v === 'number' && !isNaN(v));
+            if (vals.length === 0) return {{ min: 0, max: 1 }};
+            return {{ min: Math.min(...vals), max: Math.max(...vals) }};
+        }}
+
+        // 根据值获取渐变背景色(绿色系)
+        function getGradientColor(val, min, max) {{
+            if (typeof val !== 'number' || isNaN(val)) return '';
+            const ratio = max > min ? (val - min) / (max - min) : 0;
+            // 绿色系,alpha 从 0.05 到 0.6
+            const alpha = 0.05 + ratio * 0.55;
+            return `rgba(34, 197, 94, ${{alpha}})`;
+        }}
+
+        // 渲染表格数据
+        function renderTable() {{
+            const tbody = document.getElementById('tableBody');
+            const search = document.getElementById('searchInput').value.toLowerCase();
+            const simFilter = document.getElementById('simFilter').value;
+
+            // 筛选条件
+            const minUv = parseInt(document.getElementById('minUvInput').value) || 0;
+            const dateFilter = document.getElementById('dateFilter').value;
+            const channelFilter = document.getElementById('channelFilter').value;
+            const hotsencetypeFilter = document.getElementById('hotsencetypeFilter').value;
+            const partnerFilter = document.getElementById('partnerFilter').value;
+            const accountFilter = document.getElementById('accountFilter').value;
+
+            // 过滤
+            filteredData = rawData.filter(row => {{
+                // 日期过滤(转字符串比较)
+                if (dateFilter && String(row['dt']) !== dateFilter) return false;
+
+                // 渠道过滤
+                if (channelFilter && row['channel'] !== channelFilter) return false;
+
+                // 场景类型过滤(转字符串比较)
+                if (hotsencetypeFilter && String(row['hotsencetype']) !== hotsencetypeFilter) return false;
+
+                // 合作方过滤
+                if (partnerFilter && row['合作方名'] !== partnerFilter) return false;
+
+                // 公众号过滤
+                if (accountFilter && row['公众号名'] !== accountFilter) return false;
+
+                // 点击UV过滤
+                if (row['点击uv'] < minUv) return false;
+
+                // 搜索过滤
+                if (search) {{
+                    const title1 = (row['分享标题'] || '').toLowerCase();
+                    const title2 = (row['title'] || '').toLowerCase();
+                    if (!title1.includes(search) && !title2.includes(search)) return false;
+                }}
+
+                // 相似度过滤
+                if (simFilter) {{
+                    const simVal = validCols.map(c => row[c]).find(v => v !== '' && v !== null);
+                    if (simVal === undefined) return false;
+                    if (simFilter === 'high' && simVal < 0.7) return false;
+                    if (simFilter === 'mid' && (simVal < 0.3 || simVal >= 0.7)) return false;
+                    if (simFilter === 'low' && simVal >= 0.3) return false;
+                }}
+
+                return true;
+            }});
+
+            // 排序
+            filteredData.sort((a, b) => {{
+                let va = a[currentSort.col];
+                let vb = b[currentSort.col];
+
+                // 处理空值
+                if (va === '' || va === null) va = currentSort.dir === 'asc' ? Infinity : -Infinity;
+                if (vb === '' || vb === null) vb = currentSort.dir === 'asc' ? Infinity : -Infinity;
+
+                // 数值比较
+                if (typeof va === 'number' && typeof vb === 'number') {{
+                    return currentSort.dir === 'asc' ? va - vb : vb - va;
+                }}
+
+                // 字符串比较
+                va = String(va);
+                vb = String(vb);
+                return currentSort.dir === 'asc' ? va.localeCompare(vb) : vb.localeCompare(va);
+            }});
+
+            // 计算全局列范围(用于渐变)
+            const globalRanges = {{}};
+            tableCols.forEach(col => {{
+                globalRanges[col] = getColumnRange(filteredData, col);
+            }});
+
+            // 渲染行
+            tbody.innerHTML = filteredData.map(row => {{
+                return '<tr>' + tableCols.map(col => {{
+                    let val = row[col];
+                    const isNum = typeof val === 'number';
+
+                    if (val === '' || val === null || val === undefined) {{
+                        return '<td>-</td>';
+                    }}
+
+                    // 分享封面 - 显示为图片缩略图,点击放大预览
+                    if (col === '分享封面') {{
+                        return `<td><img src="${{val}}" style="max-width:80px;max-height:60px;cursor:pointer;border-radius:4px;" onclick="showImgModal('${{val}}')" onerror="this.style.display='none'"></td>`;
+                    }}
+
+                    // videoid - 显示为超链接(优先处理,避免被数字判断拦截)
+                    if (col === 'videoid') {{
+                        return `<td><a href="https://admin.piaoquantv.com/cms/post-detail/${{val}}/detail" target="_blank" style="color:#667eea;text-decoration:none;">${{val}}</a></td>`;
+                    }}
+
+                    // 日期、场景类型 - 强制显示为字符串
+                    if (col === 'dt' || col === 'hotsencetype') {{
+                        return `<td>${{val}}</td>`;
+                    }}
+
+                    if (isNum) {{
+                        const range = globalRanges[col] || {{ min: 0, max: 1 }};
+                        let displayVal = '';
+                        let needGradient = false;
+
+                        // 相似度列 - 需要渐变
+                        if (col.includes('相似度')) {{
+                            displayVal = val.toFixed(3);
+                            needGradient = true;
+                        }}
+                        // 回流率列 - 需要渐变
+                        else if (col.includes('回流率')) {{
+                            displayVal = val.toFixed(4);
+                            needGradient = true;
+                        }}
+                        // UV列 - 不需要渐变
+                        else if (col.includes('UV') || col.includes('uv')) {{
+                            displayVal = Math.round(val).toLocaleString();
+                        }}
+                        else {{
+                            displayVal = val.toFixed(2);
+                        }}
+
+                        const bgColor = needGradient ? getGradientColor(val, range.min, range.max) : '';
+                        const style = bgColor ? `style="background:${{bgColor}}"` : '';
+                        return `<td class="num" ${{style}}>${{displayVal}}</td>`;
+                    }}
+
+                    // 标题列 - 允许换行不截断
+                    if (col === 'title' || col === '分享标题' || col === '文章标题') {{
+                        return `<td class="wrap">${{val}}</td>`;
+                    }}
+
+                    // 其他文本列,截断显示
+                    const displayVal = String(val).substring(0, 40) + (String(val).length > 40 ? '...' : '');
+                    return `<td title="${{val}}">${{displayVal}}</td>`;
+                }}).join('') + '</tr>';
+            }}).join('');
+
+            document.getElementById('rowCount').textContent = `显示 ${{filteredData.length}} 条`;
+        }}
+
+        // 排序
+        function sortBy(col) {{
+            if (currentSort.col === col) {{
+                currentSort.dir = currentSort.dir === 'asc' ? 'desc' : 'asc';
+            }} else {{
+                currentSort.col = col;
+                currentSort.dir = 'desc';
+            }}
+            renderHeader();
+            renderTable();
+        }}
+
+        // 图片预览功能
+        function showImgModal(src) {{
+            document.getElementById('modalImg').src = src;
+            document.getElementById('imgModal').classList.add('show');
+        }}
+        function closeImgModal() {{
+            document.getElementById('imgModal').classList.remove('show');
+        }}
+        // ESC 关闭预览
+        document.addEventListener('keydown', (e) => {{
+            if (e.key === 'Escape') closeImgModal();
+        }});
+
+        // 事件绑定
+        document.getElementById('searchInput').addEventListener('input', renderTable);
+        document.getElementById('simFilter').addEventListener('change', renderTable);
+        document.getElementById('minUvInput').addEventListener('input', renderTable);
+        document.getElementById('dateFilter').addEventListener('change', renderTable);
+        document.getElementById('channelFilter').addEventListener('change', renderTable);
+        document.getElementById('hotsencetypeFilter').addEventListener('change', renderTable);
+        document.getElementById('partnerFilter').addEventListener('change', renderTable);
+        document.getElementById('accountFilter').addEventListener('change', renderTable);
+
+        // 初始化
+        renderCharts();
+        renderStatsTables();
+        renderHeader();
+        renderTable();
+
+        // 响应式
+        window.addEventListener('resize', () => {{
+            groupStats.forEach((_, idx) => {{
+                echarts.getInstanceByDom(document.getElementById('chart_' + idx))?.resize();
+            }});
+        }});
+    </script>
+</body>
+</html>
+'''
+
+# Write the rendered report into the output directory
+output_path = output_dir / '素材视频内容分析.html'
+output_path.write_text(html_content, encoding='utf-8')
+
+print(f"\n已保存: {output_path}")

+ 101 - 0
tasks/头部/进入前的I与头部I的相关性分析/进入前的I与头部I的相关性分析.sql

@@ -0,0 +1,101 @@
+-- Material vs. head-video correlation analysis (single-day query)
+-- Parameter: ${dt} - date, format YYYYMMDD
+-- LEFT JOINs per-video content attributes (keywords, voice-over, etc.) so the
+-- match between the material and the video content can be analysed.
+
+SELECT  a.dt
+        ,a.channel
+        ,a.hotsencetype
+        ,a.合作方名
+        ,a.公众号名
+        -- Material dimensions
+        ,a.rootsourceid
+        ,a.文章标题
+        ,a.分享标题
+        ,a.分享封面
+        -- Basic video information
+        ,a.videoid
+        ,a.title
+        ,a.merge一级品类
+        ,a.merge二级品类
+        -- Video content attributes (from loghubods.video_dimension_detail_add_column)
+        ,b.视频关键词
+        ,b.视频口播
+        ,b.视频主题
+        ,b.视频场景
+        ,b.情感倾向
+        ,b.视频风格
+        ,b.传播性判断
+        ,b.推测观众年龄段
+        ,b.是否有片尾引导
+        ,b.引导强度
+        -- Core metrics
+        -- NOTE(review): the "+ 10" in the rate denominators looks like smoothing
+        -- for low-UV groups - confirm with the metric owner.
+        ,COUNT(DISTINCT a.mid) AS 点击uv
+        ,COUNT(DISTINCT CASE WHEN a.是否进入推荐 = '1' THEN a.mid END) / COUNT(DISTINCT a.mid) AS 进入推荐率
+        ,(SUM(CASE WHEN a.再分享群聊回流uv > 0 THEN a.再分享群聊回流uv ELSE 0 END)
+          + SUM(CASE WHEN a.再分享单聊回流uv > 0 THEN a.再分享单聊回流uv ELSE 0 END)
+         ) / (COUNT(DISTINCT a.mid) + 10) AS 再分享回流率
+        -- The two split rates below use the same "> 0 ... ELSE 0" guard as the
+        -- overall rate above. Without ELSE 0 a SUM over no matching rows is NULL,
+        -- and NULL + x is NULL, so the whole rate was lost whenever either the
+        -- group-chat or the single-chat term had no matching row.
+        ,(SUM(CASE WHEN a.是否原视频 = '是' AND a.再分享群聊回流uv > 0 THEN a.再分享群聊回流uv ELSE 0 END)
+          + SUM(CASE WHEN a.是否原视频 = '是' AND a.再分享单聊回流uv > 0 THEN a.再分享单聊回流uv ELSE 0 END)
+         ) / (COUNT(DISTINCT a.mid) + 10) AS 原视频再分享回流率
+        ,(SUM(CASE WHEN a.是否原视频 = '否' AND a.再分享群聊回流uv > 0 THEN a.再分享群聊回流uv ELSE 0 END)
+          + SUM(CASE WHEN a.是否原视频 = '否' AND a.再分享单聊回流uv > 0 THEN a.再分享单聊回流uv ELSE 0 END)
+         ) / (COUNT(DISTINCT a.mid) + 10) AS 推荐再分享回流率
+        ,SUM(CASE WHEN a.再分享群聊回流uv > 0 THEN a.再分享群聊回流uv ELSE 0 END)
+         + SUM(CASE WHEN a.再分享单聊回流uv > 0 THEN a.再分享单聊回流uv ELSE 0 END) AS 再分享回流uv
+FROM    loghubods.opengid_base_data a
+LEFT JOIN (
+    -- De-duplicate the attribute rows of the same partition; GROUP BY on every
+    -- selected column acts as DISTINCT.
+    SELECT  视频id
+            ,视频关键词
+            ,视频口播
+            ,视频主题
+            ,视频场景
+            ,情感倾向
+            ,视频风格
+            ,传播性判断
+            ,推测观众年龄段
+            ,是否有片尾引导
+            ,引导强度
+    FROM    loghubods.video_dimension_detail_add_column
+    WHERE   dt = '${dt}'
+    GROUP BY 视频id
+             ,视频关键词
+             ,视频口播
+             ,视频主题
+             ,视频场景
+             ,情感倾向
+             ,视频风格
+             ,传播性判断
+             ,推测观众年龄段
+             ,是否有片尾引导
+             ,引导强度
+) b ON a.videoid = b.视频id
+WHERE   a.dt = '${dt}'
+AND     a.usersharedepth = 0
+AND     a.videoid IS NOT NULL
+-- Keep rows that carry at least one non-empty title (article or share title).
+AND     (
+            (a.文章标题 IS NOT NULL AND a.文章标题 != '')
+            OR (a.分享标题 IS NOT NULL AND a.分享标题 != '')
+        )
+GROUP BY a.dt
+         ,a.channel
+         ,a.hotsencetype
+         ,a.合作方名
+         ,a.公众号名
+         ,a.rootsourceid
+         ,a.文章标题
+         ,a.分享标题
+         ,a.分享封面
+         ,a.videoid
+         ,a.title
+         ,a.merge一级品类
+         ,a.merge二级品类
+         ,b.视频关键词
+         ,b.视频口播
+         ,b.视频主题
+         ,b.视频场景
+         ,b.情感倾向
+         ,b.视频风格
+         ,b.传播性判断
+         ,b.推测观众年龄段
+         ,b.是否有片尾引导
+         ,b.引导强度
+ORDER BY 点击uv DESC
+LIMIT   50000
+;

BIN
tasks/承接/头部品类与承接品类分析/.DS_Store


+ 288 - 0
tasks/承接/头部品类与承接品类分析/_archive/analyze_category_correlation.py

@@ -0,0 +1,288 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+品类承接裂变率分析
+分析目标:
+1. 进入/承接品类一致时,承接裂变率(vov)是否更高
+2. 不同品类组合间的承接裂变率是否存在稳定的相关性
+"""
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from scipy import stats
+import matplotlib.pyplot as plt
+import matplotlib
+matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'SimHei', 'PingFang SC']
+matplotlib.rcParams['axes.unicode_minus'] = False
+
+# ========== 数据加载 ==========
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+
+csv_files = [f for f in output_dir.glob("query_*.csv") if not f.name.endswith('.html')]
+if not csv_files:
+    print("没有找到数据文件,请先运行 query.sql")
+    exit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+df = pd.read_csv(latest_file)
+
+print("=" * 60)
+print("品类承接裂变率分析")
+print("=" * 60)
+print(f"数据文件: {latest_file.name}")
+print(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+print(f"记录数: {len(df)}")
+print()
+
+# 过滤掉 headvideoid为空 的记录(无法判断进入品类)
+df_valid = df[~df['head_cate2'].isin(['headvideoid为空', '未匹配品类'])].copy()
+print(f"有效记录数(排除headvideoid为空/未匹配): {len(df_valid)}")
+
+# ========== 分析1: 品类一致性分析 ==========
+print("\n" + "=" * 60)
+print("分析1: 进入/承接品类一致性 vs 承接裂变率(vov)")
+print("=" * 60)
+
+# 标记是否为同品类
+df_valid['is_same_cate'] = df_valid['head_cate2'] == df_valid['rec_cate2']
+
+# 按人群分组分析
+for crowd in ['内部', '外部0层', '外部裂变']:
+    crowd_df = df_valid[df_valid['crowd'] == crowd]
+    if len(crowd_df) == 0:
+        continue
+
+    # 同品类 vs 跨品类
+    same_cate = crowd_df[crowd_df['is_same_cate']]
+    diff_cate = crowd_df[~crowd_df['is_same_cate']]
+
+    # 加权平均 vov (按曝光量加权)
+    same_vov = (same_cate['new_exposure_cnt'].sum() / same_cate['exp'].sum()) if same_cate['exp'].sum() > 0 else 0
+    diff_vov = (diff_cate['new_exposure_cnt'].sum() / diff_cate['exp'].sum()) if diff_cate['exp'].sum() > 0 else 0
+
+    print(f"\n【{crowd}】")
+    print(f"  同品类承接: 曝光 {same_cate['exp'].sum():,.0f}, vov = {same_vov:.4f}")
+    print(f"  跨品类承接: 曝光 {diff_cate['exp'].sum():,.0f}, vov = {diff_vov:.4f}")
+    print(f"  同品类/跨品类 vov比值: {same_vov/diff_vov:.2f}x" if diff_vov > 0 else "  跨品类无数据")
+
+    # 统计检验: Mann-Whitney U检验 (非参数检验)
+    if len(same_cate) >= 5 and len(diff_cate) >= 5:
+        stat, pvalue = stats.mannwhitneyu(same_cate['vov'], diff_cate['vov'], alternative='greater')
+        print(f"  Mann-Whitney U检验 (同品类vov > 跨品类vov): p-value = {pvalue:.4f}")
+        print(f"  结论: {'显著' if pvalue < 0.05 else '不显著'} (α=0.05)")
+
+# 整体汇总
+print("\n【整体汇总】")
+same_all = df_valid[df_valid['is_same_cate']]
+diff_all = df_valid[~df_valid['is_same_cate']]
+same_vov_all = same_all['new_exposure_cnt'].sum() / same_all['exp'].sum()
+diff_vov_all = diff_all['new_exposure_cnt'].sum() / diff_all['exp'].sum()
+print(f"  同品类承接: 曝光 {same_all['exp'].sum():,.0f}, vov = {same_vov_all:.4f}")
+print(f"  跨品类承接: 曝光 {diff_all['exp'].sum():,.0f}, vov = {diff_vov_all:.4f}")
+print(f"  同品类/跨品类 vov比值: {same_vov_all/diff_vov_all:.2f}x")
+
+# ========== 分析2: 品类组合稳定性分析 ==========
+print("\n" + "=" * 60)
+print("分析2: 品类组合间的承接裂变率稳定性相关性")
+print("=" * 60)
+
+# 2.1 跨日期稳定性: 同一品类组合在不同日期的vov相关性
+print("\n【2.1 跨日期稳定性】")
+print("分析同一品类组合在不同日期的vov是否稳定")
+
+dates = sorted(df_valid['dt'].unique())
+if len(dates) >= 2:
+    # 创建品类组合 pivot table
+    df_valid['cate_pair'] = df_valid['head_cate2'] + ' → ' + df_valid['rec_cate2']
+
+    # 按日期和品类组合汇总
+    daily_vov = df_valid.groupby(['dt', 'cate_pair']).apply(
+        lambda x: x['new_exposure_cnt'].sum() / x['exp'].sum()
+    ).unstack(level=0)
+
+    # 计算相邻日期间的相关性
+    date_correlations = []
+    for i in range(len(dates) - 1):
+        d1, d2 = dates[i], dates[i+1]
+        if d1 in daily_vov.columns and d2 in daily_vov.columns:
+            valid_pairs = daily_vov[[d1, d2]].dropna()
+            if len(valid_pairs) >= 10:
+                corr, pval = stats.pearsonr(valid_pairs[d1], valid_pairs[d2])
+                date_correlations.append({'date1': d1, 'date2': d2, 'corr': corr, 'pval': pval, 'n': len(valid_pairs)})
+
+    if date_correlations:
+        corr_df = pd.DataFrame(date_correlations)
+        print(f"  相邻日期vov相关性:")
+        for _, row in corr_df.iterrows():
+            print(f"    {row['date1']} vs {row['date2']}: r={row['corr']:.3f}, p={row['pval']:.4f}, n={row['n']}")
+        print(f"  平均相关系数: {corr_df['corr'].mean():.3f}")
+        print(f"  结论: 品类组合的vov在跨日期间{'高度稳定' if corr_df['corr'].mean() > 0.7 else '较为稳定' if corr_df['corr'].mean() > 0.5 else '不太稳定'}")
+
+# 2.2 跨人群稳定性: 同一品类组合在不同人群的vov相关性
+print("\n【2.2 跨人群稳定性】")
+print("分析同一品类组合在不同人群的vov排序是否一致")
+
+crowd_vov = df_valid.groupby(['crowd', 'cate_pair']).apply(
+    lambda x: x['new_exposure_cnt'].sum() / x['exp'].sum()
+).unstack(level=0)
+
+crowds = ['内部', '外部0层', '外部裂变']
+crowd_pairs = [(crowds[i], crowds[j]) for i in range(len(crowds)) for j in range(i+1, len(crowds))]
+
+for c1, c2 in crowd_pairs:
+    if c1 in crowd_vov.columns and c2 in crowd_vov.columns:
+        valid = crowd_vov[[c1, c2]].dropna()
+        if len(valid) >= 10:
+            corr, pval = stats.pearsonr(valid[c1], valid[c2])
+            spearman_corr, spearman_pval = stats.spearmanr(valid[c1], valid[c2])
+            print(f"  {c1} vs {c2}:")
+            print(f"    Pearson r = {corr:.3f} (p={pval:.4f})")
+            print(f"    Spearman ρ = {spearman_corr:.3f} (p={spearman_pval:.4f})")
+            print(f"    样本数: {len(valid)} 品类组合")
+
+# 2.3 高/低裂变品类组合识别
+print("\n【2.3 稳定的高/低裂变品类组合】")
+print("识别在所有人群中都表现稳定的品类组合")
+
+# 计算每个品类组合在所有人群的平均vov
+overall_vov = df_valid.groupby('cate_pair').apply(
+    lambda x: pd.Series({
+        'vov': x['new_exposure_cnt'].sum() / x['exp'].sum(),
+        'exp': x['exp'].sum(),
+        'crowd_count': x['crowd'].nunique()
+    })
+)
+
+# 只看在多个人群都有数据的组合
+stable_pairs = overall_vov[overall_vov['crowd_count'] >= 2].copy()
+stable_pairs = stable_pairs.sort_values('vov', ascending=False)
+
+print(f"\n  Top 10 高裂变品类组合 (vov最高):")
+for i, (pair, row) in enumerate(stable_pairs.head(10).iterrows(), 1):
+    print(f"    {i}. {pair}: vov={row['vov']:.4f}, 曝光={row['exp']:,.0f}")
+
+print(f"\n  Top 10 低裂变品类组合 (vov最低):")
+for i, (pair, row) in enumerate(stable_pairs.tail(10).iloc[::-1].iterrows(), 1):
+    print(f"    {i}. {pair}: vov={row['vov']:.4f}, 曝光={row['exp']:,.0f}")
+
+# ========== 分析3: 品类亲和性矩阵 ==========
+print("\n" + "=" * 60)
+print("分析3: 品类亲和性矩阵 (进入品类 → 承接品类)")
+print("=" * 60)
+
+# 计算每个head_cate2的基准vov
+head_baseline = df_valid.groupby('head_cate2').apply(
+    lambda x: x['new_exposure_cnt'].sum() / x['exp'].sum()
+).to_dict()
+
+# 计算亲和性: 特定组合vov / 进入品类基准vov
+affinity_data = []
+for (head, rec), grp in df_valid.groupby(['head_cate2', 'rec_cate2']):
+    if grp['exp'].sum() >= 10000:  # 只看曝光量足够的组合
+        pair_vov = grp['new_exposure_cnt'].sum() / grp['exp'].sum()
+        baseline = head_baseline.get(head, 1)
+        affinity = pair_vov / baseline if baseline > 0 else 0
+        affinity_data.append({
+            'head_cate2': head,
+            'rec_cate2': rec,
+            'vov': pair_vov,
+            'baseline_vov': baseline,
+            'affinity': affinity,
+            'exp': grp['exp'].sum()
+        })
+
+affinity_df = pd.DataFrame(affinity_data)
+
+print("\n  高亲和组合 (affinity > 1.2, 即vov比基准高20%):")
+high_affinity = affinity_df[affinity_df['affinity'] > 1.2].sort_values('affinity', ascending=False).head(15)
+for _, row in high_affinity.iterrows():
+    print(f"    {row['head_cate2']} → {row['rec_cate2']}: affinity={row['affinity']:.2f}, vov={row['vov']:.4f}")
+
+print("\n  低亲和组合 (affinity < 0.8, 即vov比基准低20%):")
+low_affinity = affinity_df[affinity_df['affinity'] < 0.8].sort_values('affinity').head(15)
+for _, row in low_affinity.iterrows():
+    print(f"    {row['head_cate2']} → {row['rec_cate2']}: affinity={row['affinity']:.2f}, vov={row['vov']:.4f}")
+
+# ========== 可视化 ==========
+print("\n" + "=" * 60)
+print("生成可视化图表...")
+print("=" * 60)
+
+fig, axes = plt.subplots(2, 2, figsize=(14, 12))
+
+# 图1: 同品类 vs 跨品类 vov对比
+ax1 = axes[0, 0]
+crowds = ['内部', '外部0层', '外部裂变']
+same_vovs = []
+diff_vovs = []
+for crowd in crowds:
+    crowd_df = df_valid[df_valid['crowd'] == crowd]
+    same = crowd_df[crowd_df['is_same_cate']]
+    diff = crowd_df[~crowd_df['is_same_cate']]
+    same_vovs.append(same['new_exposure_cnt'].sum() / same['exp'].sum() if same['exp'].sum() > 0 else 0)
+    diff_vovs.append(diff['new_exposure_cnt'].sum() / diff['exp'].sum() if diff['exp'].sum() > 0 else 0)
+
+x = np.arange(len(crowds))
+width = 0.35
+ax1.bar(x - width/2, same_vovs, width, label='同品类承接', color='#4CAF50')
+ax1.bar(x + width/2, diff_vovs, width, label='跨品类承接', color='#2196F3')
+ax1.set_ylabel('承接裂变率 (vov)')
+ax1.set_title('同品类 vs 跨品类 承接裂变率对比')
+ax1.set_xticks(x)
+ax1.set_xticklabels(crowds)
+ax1.legend()
+ax1.grid(axis='y', alpha=0.3)
+
+# 图2: 品类组合vov分布
+ax2 = axes[0, 1]
+ax2.hist(stable_pairs['vov'], bins=30, edgecolor='black', alpha=0.7, color='#FF9800')
+ax2.axvline(stable_pairs['vov'].median(), color='red', linestyle='--', label=f'中位数: {stable_pairs["vov"].median():.4f}')
+ax2.axvline(stable_pairs['vov'].mean(), color='blue', linestyle='--', label=f'均值: {stable_pairs["vov"].mean():.4f}')
+ax2.set_xlabel('承接裂变率 (vov)')
+ax2.set_ylabel('品类组合数')
+ax2.set_title('品类组合vov分布')
+ax2.legend()
+
+# 图3: 跨人群vov相关性散点图 (内部 vs 外部0层)
+ax3 = axes[1, 0]
+if '内部' in crowd_vov.columns and '外部0层' in crowd_vov.columns:
+    valid = crowd_vov[['内部', '外部0层']].dropna()
+    ax3.scatter(valid['内部'], valid['外部0层'], alpha=0.5, s=30)
+    # 添加对角线
+    max_val = max(valid['内部'].max(), valid['外部0层'].max())
+    ax3.plot([0, max_val], [0, max_val], 'r--', alpha=0.5, label='y=x')
+    ax3.set_xlabel('内部 vov')
+    ax3.set_ylabel('外部0层 vov')
+    ax3.set_title('跨人群vov相关性 (内部 vs 外部0层)')
+    corr, _ = stats.pearsonr(valid['内部'], valid['外部0层'])
+    ax3.text(0.05, 0.95, f'r = {corr:.3f}', transform=ax3.transAxes, fontsize=12, verticalalignment='top')
+    ax3.legend()
+
+# 图4: 亲和性分布
+ax4 = axes[1, 1]
+ax4.hist(affinity_df['affinity'], bins=30, edgecolor='black', alpha=0.7, color='#9C27B0')
+ax4.axvline(1.0, color='red', linestyle='--', label='基准线 (affinity=1)')
+ax4.set_xlabel('亲和性 (vov / 基准vov)')
+ax4.set_ylabel('品类组合数')
+ax4.set_title('品类亲和性分布')
+ax4.legend()
+
+plt.tight_layout()
+plt.savefig(output_dir / 'category_correlation_analysis.png', dpi=150, bbox_inches='tight')
+print(f"图表已保存: {output_dir / 'category_correlation_analysis.png'}")
+
+# ========== 导出分析结果 ==========
+print("\n导出分析结果...")
+
+# 导出品类组合vov排名
+stable_pairs.to_csv(output_dir / 'category_pair_vov_ranking.csv')
+print(f"品类组合vov排名: {output_dir / 'category_pair_vov_ranking.csv'}")
+
+# 导出亲和性矩阵
+affinity_df.to_csv(output_dir / 'category_affinity_matrix.csv', index=False)
+print(f"品类亲和性矩阵: {output_dir / 'category_affinity_matrix.csv'}")
+
+print("\n" + "=" * 60)
+print("分析完成!")
+print("=" * 60)

+ 86 - 0
tasks/承接/头部品类与承接品类分析/_archive/query_range.sql

@@ -0,0 +1,86 @@
+-- Simplified version: join headvideoid and vid directly against the category
+-- feature table to obtain their categories.
+-- No join against the head-video table is needed.
+-- Parameters: ${start} / ${end} - inclusive bounds of the dt range.
+
+-- t_rec: exposure rows in the date range for apptype '4' / '0' on the listed
+-- pages, tagged internal/external by whether rootsourceid is empty.
+WITH t_rec AS (
+    SELECT  dt
+            ,mid
+            ,subsessionid
+            ,headvideoid
+            ,vid AS rec_vid
+            ,ts
+            ,CASE WHEN rootsourceid = '' OR rootsourceid IS NULL THEN '内部' ELSE '外部' END AS in_out
+            ,page
+            ,CASE   WHEN page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页") THEN "推荐"
+                    WHEN page IN ("回流页","其他") THEN "非推荐"
+                    ELSE "其他"
+            END AS page_rec
+            ,share_cnt
+            ,return_n_uv
+            ,new_exposure_cnt
+            ,GET_JSON_OBJECT(extend,"$.extParams.userShareDepth") AS layer
+    FROM    loghubods.dwd_recsys_alg_exposure_base_20250108
+    WHERE   dt BETWEEN "${start}" AND "${end}"
+    AND     apptype IN ('4','0')
+    AND     page IN ("回流后沉浸页&内页feed","详情后沉浸页","首页feed","详情页","回流页","其他")
+)
+,t_vid_info AS (
+    -- Video category lookup: one row per vid, taken from its latest (dt, hh)
+    -- feature row inside the range; cate2 falls back to "unknown" when the
+    -- JSON field is missing.
+    SELECT  vid
+            ,COALESCE(GET_JSON_OBJECT(feature,"$.merge_second_level_cate"),"unknown") AS cate2
+    FROM    (
+                SELECT  vid
+                        ,feature
+                        ,ROW_NUMBER() OVER (PARTITION BY vid ORDER BY dt DESC, hh DESC) AS rn
+                FROM    loghubods.alg_vid_feature_basic_info
+                WHERE   CONCAT(dt,hh) BETWEEN CONCAT("${start}","00") AND CONCAT("${end}","23")
+            )
+    WHERE   rn = 1
+)
+-- t_joined: recommendation-page exposures only, labelled with the crowd and
+-- with the categories of the head video (h) and the recommended video (v).
+,t_joined AS (
+    SELECT  r.dt
+            ,CASE   WHEN r.in_out = '内部' THEN '内部'
+                    WHEN r.layer = '0' THEN '外部0层'
+                    WHEN CAST(r.layer AS INT) > 0 THEN '外部裂变'
+                    ELSE '其他'
+            END AS crowd
+            ,CASE   WHEN r.headvideoid IS NULL OR r.headvideoid = '' THEN 'headvideoid为空'
+                    WHEN h.cate2 IS NULL THEN '未匹配品类'
+                    ELSE h.cate2
+            END AS head_cate2
+            ,COALESCE(v.cate2, 'unknown') AS rec_cate2
+            ,r.share_cnt
+            ,r.return_n_uv
+            ,r.new_exposure_cnt
+    FROM    t_rec r
+    LEFT JOIN t_vid_info h ON r.headvideoid = h.vid
+    LEFT JOIN t_vid_info v ON r.rec_vid = v.vid
+    WHERE   r.page_rec = '推荐'
+)
+-- t_final: totals per (dt, crowd, head category, rec category); exp is the
+-- number of exposure rows in the group.
+,t_final AS (
+    SELECT  dt
+            ,crowd
+            ,head_cate2
+            ,rec_cate2
+            ,SUM(1) AS exp
+            ,SUM(share_cnt) AS share_cnt
+            ,SUM(return_n_uv) AS return_n_uv
+            ,SUM(new_exposure_cnt) AS new_exposure_cnt
+    FROM    t_joined
+    GROUP BY dt, crowd, head_cate2, rec_cate2
+)
+-- Final output: ratios rounded to 4 decimals, with COALESCE turning a NULL
+-- ratio into 0:
+--   str  = share_cnt / exp
+--   ros  = return_n_uv / share_cnt
+--   rovn = return_n_uv / exp
+--   vov  = new_exposure_cnt / exp
+-- Groups in the '其他' crowd or with fewer than 1000 exposures are dropped.
+SELECT  dt
+        ,crowd
+        ,head_cate2
+        ,rec_cate2
+        ,exp
+        ,share_cnt
+        ,return_n_uv
+        ,new_exposure_cnt
+        ,round(COALESCE(share_cnt / exp, 0), 4) AS str
+        ,round(COALESCE(return_n_uv / share_cnt, 0), 4) AS ros
+        ,round(COALESCE(return_n_uv / exp, 0), 4) AS rovn
+        ,round(COALESCE(new_exposure_cnt / exp, 0), 4) AS vov
+FROM    t_final
+WHERE   crowd <> '其他'
+AND     exp >= 1000
+ORDER BY dt DESC, crowd, exp DESC
+;

+ 874 - 0
tasks/承接/头部品类与承接品类分析/_archive/visualize.py

@@ -0,0 +1,874 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+头部品类分析可视化
+Tab 1: Matrix - 头部品类 × 推荐品类矩阵
+Tab 2: Compare - Top 10 品类人群对比
+"""
+import pandas as pd
+import json
+from pathlib import Path
+
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+
+# 找到最新的原始数据文件
+csv_files = [f for f in output_dir.glob("query_*.csv")]
+if not csv_files:
+    print("没有找到数据文件,请先运行 query.sql")
+    exit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+df = pd.read_csv(latest_file)
+
+print(f"分析文件: {latest_file.name}")
+print(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+
+# 日期列表
+all_dates = sorted([str(d) for d in df['dt'].unique()])
+date_options = ['all'] + all_dates
+latest_date = all_dates[-1] if all_dates else 'all'
+print(f"日期数: {len(all_dates)}")
+
+# 人群列表
+crowd_list = ['内部', '外部0层', '外部裂变']
+print(f"人群: {crowd_list}")
+
+# 曝光阈值
+EXP_THRESHOLD = 1000
+
+# 计算人群×日期的矩阵数据
+def calc_matrix_data(crowd, date=None):
+    ch_df = df[df['crowd'] == crowd].copy()
+    if date and date != 'all':
+        ch_df = ch_df[ch_df['dt'].astype(str) == str(date)]
+    if len(ch_df) == 0:
+        return None
+
+    row_col = 'head_cate2'
+    col_col = 'rec_cate2'
+
+    matrix = ch_df.groupby([row_col, col_col]).agg({
+        'exp': 'sum',
+        'share_cnt': 'sum',
+        'return_n_uv': 'sum',
+        'new_exposure_cnt': 'sum',
+    }).reset_index()
+
+    matrix = matrix[matrix['exp'] >= EXP_THRESHOLD]
+    if len(matrix) == 0:
+        return None
+
+    matrix['str'] = matrix['share_cnt'] / (matrix['exp'] + 1)
+    matrix['ros'] = matrix['return_n_uv'] / (matrix['share_cnt'] + 1)
+    matrix['rovn'] = matrix['return_n_uv'] / (matrix['exp'] + 1)
+    matrix['vov'] = matrix['new_exposure_cnt'] / (matrix['exp'] + 1)
+
+    exp_pivot = matrix.pivot(index=row_col, columns=col_col, values='exp').fillna(0)
+    str_pivot = matrix.pivot(index=row_col, columns=col_col, values='str').fillna(0)
+    ros_pivot = matrix.pivot(index=row_col, columns=col_col, values='ros').fillna(0)
+    rovn_pivot = matrix.pivot(index=row_col, columns=col_col, values='rovn').fillna(0)
+    vov_pivot = matrix.pivot(index=row_col, columns=col_col, values='vov').fillna(0)
+
+    row_order = exp_pivot.sum(axis=1).sort_values(ascending=False).index.tolist()
+    col_order = exp_pivot.sum(axis=0).sort_values(ascending=False).index.tolist()
+
+    def to_dict(pivot, is_int=False):
+        return {str(r): {str(c): int(pivot.loc[r, c]) if is_int else round(float(pivot.loc[r, c]), 4) if c in pivot.columns else 0 for c in col_order} for r in row_order}
+
+    total_exp = int(ch_df['exp'].sum())
+    total_share = int(ch_df['share_cnt'].sum())
+    total_return = int(ch_df['return_n_uv'].sum())
+
+    return {
+        'rows': row_order,
+        'cols': col_order,
+        'exp': to_dict(exp_pivot, is_int=True),
+        'str': to_dict(str_pivot),
+        'ros': to_dict(ros_pivot),
+        'rovn': to_dict(rovn_pivot),
+        'vov': to_dict(vov_pivot),
+        'total_exp': total_exp,
+        'total_str': round(total_share / (total_exp + 1), 4),
+        'total_rovn': round(total_return / (total_exp + 1), 4),
+    }
+
+# 计算头部品类下钻数据:head_cate2 -> crowd -> rec_cate2
+def calc_head_drill_data(date=None):
+    ch_df = df.copy()
+    if date and date != 'all':
+        ch_df = ch_df[ch_df['dt'].astype(str) == str(date)]
+    if len(ch_df) == 0:
+        return None
+
+    # 按 head_cate2 + crowd + rec_cate2 聚合
+    agg = ch_df.groupby(['head_cate2', 'crowd', 'rec_cate2']).agg({
+        'exp': 'sum',
+        'share_cnt': 'sum',
+        'return_n_uv': 'sum',
+        'new_exposure_cnt': 'sum',
+    }).reset_index()
+
+    agg['str'] = agg['share_cnt'] / (agg['exp'] + 1)
+    agg['ros'] = agg['return_n_uv'] / (agg['share_cnt'] + 1)
+    agg['rovn'] = agg['return_n_uv'] / (agg['exp'] + 1)
+    agg['vov'] = agg['new_exposure_cnt'] / (agg['exp'] + 1)
+
+    # 构建嵌套字典: head_cate2 -> crowd -> {rec_cate2: metrics}
+    result = {}
+
+    # 添加 "all" 选项:不区分头部品类,按 crowd + rec_cate2 聚合
+    agg_all = ch_df.groupby(['crowd', 'rec_cate2']).agg({
+        'exp': 'sum',
+        'share_cnt': 'sum',
+        'return_n_uv': 'sum',
+        'new_exposure_cnt': 'sum',
+    }).reset_index()
+    agg_all['str'] = agg_all['share_cnt'] / (agg_all['exp'] + 1)
+    agg_all['ros'] = agg_all['return_n_uv'] / (agg_all['share_cnt'] + 1)
+    agg_all['rovn'] = agg_all['return_n_uv'] / (agg_all['exp'] + 1)
+    agg_all['vov'] = agg_all['new_exposure_cnt'] / (agg_all['exp'] + 1)
+
+    result['all'] = {}
+    for crowd in crowd_list:
+        crowd_df = agg_all[agg_all['crowd'] == crowd]
+        result['all'][crowd] = {}
+        # 计算整体汇总
+        total_exp = int(crowd_df['exp'].sum())
+        total_share = crowd_df['share_cnt'].sum()
+        total_return = crowd_df['return_n_uv'].sum()
+        total_new_exp = crowd_df['new_exposure_cnt'].sum()
+        result['all'][crowd]['_total'] = {
+            'exp': total_exp,
+            'str': round(total_share / (total_exp + 1), 4),
+            'ros': round(total_return / (total_share + 1), 4),
+            'rovn': round(total_return / (total_exp + 1), 4),
+            'vov': round(total_new_exp / (total_exp + 1), 4),
+        }
+        for _, row in crowd_df.iterrows():
+            result['all'][crowd][row['rec_cate2']] = {
+                'exp': int(row['exp']),
+                'str': round(row['str'], 4),
+                'ros': round(row['ros'], 4),
+                'rovn': round(row['rovn'], 4),
+                'vov': round(row['vov'], 4),
+            }
+
+    # 按头部品类聚合
+    for head_cate in agg['head_cate2'].unique():
+        result[head_cate] = {}
+        for crowd in crowd_list:
+            crowd_df = agg[(agg['head_cate2'] == head_cate) & (agg['crowd'] == crowd)]
+            result[head_cate][crowd] = {}
+            # 计算该头部品类下的整体汇总
+            total_exp = int(crowd_df['exp'].sum())
+            total_share = crowd_df['share_cnt'].sum()
+            total_return = crowd_df['return_n_uv'].sum()
+            total_new_exp = crowd_df['new_exposure_cnt'].sum()
+            result[head_cate][crowd]['_total'] = {
+                'exp': total_exp,
+                'str': round(total_share / (total_exp + 1), 4),
+                'ros': round(total_return / (total_share + 1), 4),
+                'rovn': round(total_return / (total_exp + 1), 4),
+                'vov': round(total_new_exp / (total_exp + 1), 4),
+            }
+            for _, row in crowd_df.iterrows():
+                result[head_cate][crowd][row['rec_cate2']] = {
+                    'exp': int(row['exp']),
+                    'str': round(row['str'], 4),
+                    'ros': round(row['ros'], 4),
+                    'rovn': round(row['rovn'], 4),
+                    'vov': round(row['vov'], 4),
+                }
+
+    # 获取所有头部品类列表(按总曝光排序)
+    head_exp = ch_df.groupby('head_cate2')['exp'].sum().sort_values(ascending=False)
+    head_list = head_exp.index.tolist()
+
+    return {
+        'heads': ['all'] + head_list,  # all 放在最前面
+        'data': result
+    }
+
+
+# 预计算所有数据
+all_data = {}
+for crowd in crowd_list:
+    all_data[crowd] = {}
+    for dt in date_options:
+        matrix = calc_matrix_data(crowd, dt)
+        if matrix:
+            all_data[crowd][dt] = matrix
+
+# 预计算头部品类下钻数据
+head_drill_data = {}
+for dt in date_options:
+    drill = calc_head_drill_data(dt)
+    if drill:
+        head_drill_data[dt] = drill
+
+# 转为JSON
+data_json = json.dumps(all_data, ensure_ascii=False)
+head_drill_json = json.dumps(head_drill_data, ensure_ascii=False)
+crowd_list_json = json.dumps(crowd_list, ensure_ascii=False)
+dates_json = json.dumps(date_options)
+
+# 日期选项HTML
+date_options_html = "".join([
+    f'<option value="{dt}" {"selected" if dt == latest_date else ""}>'
+    f'{"all" if dt == "all" else dt}</option>'
+    for dt in date_options
+])
+
+# 人群选项HTML
+crowd_options_html = "".join([
+    f'<option value="{c}">{c}</option>'
+    for c in crowd_list
+])
+
+html_content = f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>头部品类分析</title>
+    <style>
+        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+               background: #f5f5f5; padding: 20px; }}
+        .container {{ max-width: 1600px; margin: 0 auto; background: white;
+                     border-radius: 8px; padding: 20px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
+        h1 {{ font-size: 24px; margin-bottom: 20px; color: #333; }}
+        .controls {{ display: flex; gap: 20px; margin-bottom: 20px; align-items: center; flex-wrap: wrap; }}
+        .controls .date-switcher {{ margin-left: auto; }}
+        .play-btn {{ background: #4CAF50; color: white; border: none; border-radius: 4px; padding: 6px 12px; font-size: 14px; }}
+        .play-btn:hover {{ background: #45a049; }}
+        .play-btn.playing {{ background: #f44336; }}
+        .control-group {{ display: flex; align-items: center; gap: 8px; }}
+        .control-group label {{ font-weight: 500; color: #666; }}
+        select {{ padding: 8px 12px; border: 1px solid #ddd; border-radius: 4px; font-size: 14px; min-width: 120px; }}
+        .summary {{ display: flex; gap: 20px; margin-bottom: 20px; }}
+        .stat-card {{ background: #f8f9fa; padding: 15px 20px; border-radius: 6px; text-align: center; }}
+        .stat-card h4 {{ font-size: 24px; color: #28a745; margin-bottom: 5px; }}
+        .stat-card p {{ font-size: 12px; color: #666; }}
+        .matrix-container {{ overflow-x: auto; max-height: 600px; overflow-y: auto; }}
+        table {{ border-collapse: collapse; font-size: 11px; }}
+        th, td {{ border: 1px solid #e0e0e0; padding: 4px 6px; text-align: center; white-space: nowrap; }}
+        th {{ background: #f5f5f5; font-weight: 600; position: sticky; top: 0; z-index: 1; }}
+        th:first-child {{ position: sticky; left: 0; z-index: 3; }}
+        td:first-child {{ background: #f5f5f5; font-weight: 500; position: sticky; left: 0; z-index: 1; text-align: left; }}
+        .corner-cell {{
+            position: relative;
+            width: 100px;
+            height: 50px;
+            background: linear-gradient(to top right, #f5f5f5 49.5%, #ccc 49.5%, #ccc 50.5%, #f5f5f5 50.5%);
+        }}
+        .corner-cell .row-label {{
+            position: absolute;
+            bottom: 4px;
+            left: 4px;
+            font-size: 10px;
+            color: #666;
+        }}
+        .corner-cell .col-label {{
+            position: absolute;
+            top: 4px;
+            right: 4px;
+            font-size: 10px;
+            color: #666;
+        }}
+        .legend {{ font-size: 12px; color: #666; margin-bottom: 10px; }}
+        .date-switcher {{ display: flex; align-items: center; gap: 5px; }}
+        .date-switcher button {{ padding: 5px 10px; border: 1px solid #ddd; background: white;
+                                cursor: pointer; border-radius: 3px; }}
+        .date-switcher button:hover {{ background: #f0f0f0; }}
+        .play-btn.playing {{ background: #28a745; color: white; }}
+        /* Compare tab styles */
+        .chart-container {{ width: 100%; overflow-x: auto; }}
+        .bar-chart {{ min-width: 800px; }}
+        .bar-group {{ display: flex; align-items: flex-end; gap: 4px; margin-bottom: 8px; }}
+        .bar {{ min-width: 60px; text-align: center; font-size: 10px; color: white;
+               border-radius: 3px 3px 0 0; transition: all 0.3s; cursor: pointer; }}
+        .bar:hover {{ opacity: 0.8; }}
+        .bar-label {{ font-size: 11px; color: #333; margin-bottom: 5px; font-weight: 500; }}
+        .chart-legend {{ display: flex; gap: 20px; margin-bottom: 15px; }}
+        .legend-item {{ display: flex; align-items: center; gap: 5px; font-size: 12px; }}
+        .legend-color {{ width: 16px; height: 16px; border-radius: 3px; }}
+        .compare-table {{ width: 100%; border-collapse: collapse; }}
+        .compare-table th {{ background: #f5f5f5; padding: 8px 10px; text-align: center; font-weight: 600; border: 1px solid #ddd; }}
+        .compare-table td {{ padding: 6px 8px; border: 1px solid #eee; text-align: center; }}
+        .compare-table .crowd-header {{ background: #e8e8e8; font-size: 14px; }}
+        .compare-table .cat-cell {{ text-align: left; padding-left: 10px; }}
+        .compare-section {{ display: flex; gap: 20px; }}
+        .crowd-block {{ flex: 1; min-width: 250px; }}
+        .crowd-block table {{ width: 100%; border-collapse: collapse; }}
+        .crowd-block th {{ background: #f0f0f0; padding: 8px; border: 1px solid #ddd; }}
+        .crowd-block td {{ padding: 6px 8px; border: 1px solid #eee; }}
+        .crowd-block .rn {{ width: 40px; text-align: center; color: #666; }}
+        .crowd-block .cat {{ text-align: left; cursor: pointer; transition: all 0.2s; }}
+        .crowd-block .val {{ text-align: right; font-family: monospace; }}
+        .crowd-block .cat.highlight {{
+            font-weight: bold;
+        }}
+        .crowd-block tr.row-highlight {{
+            outline: 2px solid #1565C0;
+            outline-offset: -1px;
+        }}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>头部品类 → 推荐品类</h1>
+
+        <!-- Matrix Tab -->
+        <div id="tab-matrix">
+            <div class="controls">
+                <div class="control-group">
+                    <label>人群:</label>
+                    <select id="crowd-select" onchange="updateMatrix()">
+                        {crowd_options_html}
+                    </select>
+                </div>
+                <div class="control-group">
+                    <label>指标:</label>
+                    <select id="metric-select" onchange="updateMatrix()">
+                        <option value="exp">exp</option>
+                        <option value="str">str</option>
+                        <option value="ros">ros</option>
+                        <option value="rovn">rovn</option>
+                        <option value="vov" selected>vov</option>
+                    </select>
+                </div>
+                <div class="control-group date-switcher">
+                    <label>日期:</label>
+                    <button onclick="switchDate(-1)">◀</button>
+                    <select id="date-select" onchange="updateMatrix()">
+                        {date_options_html}
+                    </select>
+                    <button onclick="switchDate(1)">▶</button>
+                    <button id="play-btn" class="play-btn" onclick="togglePlay()">▶</button>
+                </div>
+            </div>
+
+            <div class="summary" id="summary"></div>
+
+            <div class="legend">
+                行=头部品类,列=推荐品类 | 颜色越深=数值越高 | 点击表头排序
+                <button onclick="resetSort()" style="margin-left:15px;padding:3px 10px;cursor:pointer;">重置</button>
+            </div>
+
+            <div class="matrix-container">
+                <table id="matrix-table">
+                    <thead id="matrix-header"></thead>
+                    <tbody id="matrix-body"></tbody>
+                </table>
+            </div>
+
+            <!-- 头部品类下钻表格 -->
+            <div style="margin-top: 30px; border-top: 2px solid #e0e0e0; padding-top: 20px;">
+                <h3 style="margin-bottom: 15px; font-size: 16px; color: #333;">头部品类下钻:各人群推荐品类 Top N</h3>
+                <div class="controls">
+                    <div class="control-group">
+                        <label>头部品类:</label>
+                        <select id="drill-head" onchange="updateHeadDrill()">
+                        </select>
+                    </div>
+                    <div class="control-group">
+                        <label>排序:</label>
+                        <select id="drill-sort" onchange="updateHeadDrill()">
+                            <option value="exp" selected>exp</option>
+                            <option value="str">str</option>
+                            <option value="ros">ros</option>
+                            <option value="rovn">rovn</option>
+                            <option value="vov">vov</option>
+                        </select>
+                    </div>
+                    <div class="control-group">
+                        <label>展示:</label>
+                        <select id="drill-metric" onchange="updateHeadDrill()">
+                            <option value="exp">exp</option>
+                            <option value="str">str</option>
+                            <option value="ros">ros</option>
+                            <option value="rovn">rovn</option>
+                            <option value="vov" selected>vov</option>
+                        </select>
+                    </div>
+                    <div class="control-group">
+                        <label>Top:</label>
+                        <select id="drill-topn" onchange="updateHeadDrill()">
+                            <option value="5">5</option>
+                            <option value="10" selected>10</option>
+                            <option value="15">15</option>
+                            <option value="20">20</option>
+                        </select>
+                    </div>
+                    <div class="control-group date-switcher">
+                        <label>日期:</label>
+                        <button onclick="switchDrillDate(-1)">◀</button>
+                        <select id="drill-date" onchange="updateHeadDrill()">
+                            {date_options_html}
+                        </select>
+                        <button onclick="switchDrillDate(1)">▶</button>
+                        <button id="drill-play-btn" class="play-btn" onclick="toggleDrillPlay()">▶</button>
+                    </div>
+                </div>
+                <div class="compare-section" id="drill-section"></div>
+            </div>
+        </div>
+
+    </div>
+
+    <script>
+    const allData = {data_json};
+    const headDrillData = {head_drill_json};
+    const crowdList = {crowd_list_json};
+    const dates = {dates_json};
+    const crowdColors = {{ '内部': '#4CAF50', '外部0层': '#2196F3', '外部裂变': '#FF9800' }};
+    let playInterval = null;
+    let drillPlayInterval = null;
+    let currentRowOrder = null;
+    let currentColOrder = null;
+    let sortState = {{ row: null, col: null, asc: true }};
+    let lastCrowd = null;
+    let lastDate = null;
+
+    function getGradient(val, maxVal, minVal = 0) {{
+        if (val <= minVal || maxVal <= minVal) return '#f8f9fa';
+        const ratio = Math.min((val - minVal) / (maxVal - minVal), 1);
+        const r = Math.round(255 - ratio * 215);
+        const g = Math.round(255 - ratio * 88);
+        const b = Math.round(255 - ratio * 186);
+        return `rgb(${{r}},${{g}},${{b}})`;
+    }}
+
+    function updateMatrix() {{
+        const crowd = document.getElementById('crowd-select').value;
+        const metric = document.getElementById('metric-select').value;
+        const date = document.getElementById('date-select').value;
+
+        if (!allData[crowd] || !allData[crowd][date]) {{
+            document.getElementById('summary').innerHTML = '<div class="stat-card"><h4>-</h4><p>no data</p></div>';
+            document.getElementById('matrix-header').innerHTML = '';
+            document.getElementById('matrix-body').innerHTML = '';
+            return;
+        }}
+
+        const data = allData[crowd][date];
+
+        document.getElementById('summary').innerHTML = `
+            <div class="stat-card"><h4>${{data.total_exp.toLocaleString()}}</h4><p>总 exp</p></div>
+            <div class="stat-card"><h4>${{data.total_str.toFixed(4)}}</h4><p>总 str</p></div>
+            <div class="stat-card"><h4>${{data.total_rovn.toFixed(4)}}</h4><p>总 rovn</p></div>
+            <div class="stat-card"><h4>${{data.rows.length}}</h4><p>头部品类数</p></div>
+            <div class="stat-card"><h4>${{data.cols.length}}</h4><p>推荐品类数</p></div>
+        `;
+
+        const metricData = data[metric];
+        const allVals = [];
+        data.rows.forEach(r => data.cols.forEach(c => {{
+            const val = metricData[r]?.[c] || 0;
+            if (val > 0) allVals.push(val);
+        }}));
+        allVals.sort((a, b) => a - b);
+
+        const p95Idx = Math.floor(allVals.length * 0.95);
+        let maxVal = allVals.length > 0 ? allVals[Math.min(p95Idx, allVals.length - 1)] : 0;
+        const thresholds = {{ exp: 10000, str: 0.1, ros: 0.5, rovn: 0.05, vov: 0.3 }};
+        maxVal = Math.max(maxVal, thresholds[metric] || 0.1);
+
+        // 切换人群或日期时,重置排序,使用新数据的 exp 排序
+        if (crowd !== lastCrowd || date !== lastDate) {{
+            currentRowOrder = null;
+            currentColOrder = null;
+            sortState = {{ row: null, col: null, asc: true }};
+            lastCrowd = crowd;
+            lastDate = date;
+        }}
+
+        if (!currentRowOrder) currentRowOrder = [...data.rows];
+        if (!currentColOrder) currentColOrder = [...data.cols];
+
+        const rows = currentRowOrder.filter(r => data.rows.includes(r));
+        const cols = currentColOrder.filter(c => data.cols.includes(c));
+
+        const expData = data.exp;
+        const rowExpTotals = {{}};
+        const colExpTotals = {{}};
+        rows.forEach(r => {{ rowExpTotals[r] = cols.reduce((sum, c) => sum + (expData[r]?.[c] || 0), 0); }});
+        cols.forEach(c => {{ colExpTotals[c] = rows.reduce((sum, r) => sum + (expData[r]?.[c] || 0), 0); }});
+
+        // 计算原始排名(按exp排序)
+        const origRowOrder = [...data.rows];
+        const origColOrder = [...data.cols];
+
+        document.getElementById('matrix-header').innerHTML = `
+            <tr>
+                <th class="corner-cell" style="cursor:pointer" onclick="sortByRowSum()">
+                    <span class="row-label">头部品类 ↓</span>
+                    <span class="col-label">推荐品类 →</span>
+                </th>
+                ${{cols.map((c, i) => {{
+                    const origRank = origColOrder.indexOf(c) + 1;
+                    return `<th style="cursor:pointer" onclick="sortByCol('${{c}}')" title="推荐品类: ${{c}}&#10;exp排名: #${{origRank}}&#10;exp: ${{colExpTotals[c].toLocaleString()}}">#${{origRank}} ${{c}}</th>`;
+                }}).join('')}}
+            </tr>
+        `;
+
+        document.getElementById('matrix-body').innerHTML = rows.map((r, ri) => {{
+            const origRowRank = origRowOrder.indexOf(r) + 1;
+            const cells = cols.map(c => {{
+                const val = metricData[r]?.[c] || 0;
+                const cellExp = expData[r]?.[c] || 0;
+                const bg = getGradient(val, maxVal);
+                const display = metric === 'exp' ? parseInt(val).toLocaleString() : val.toFixed(4);
+                const rowPct = rowExpTotals[r] > 0 ? (cellExp / rowExpTotals[r] * 100).toFixed(1) : '0.0';
+                const colPct = colExpTotals[c] > 0 ? (cellExp / colExpTotals[c] * 100).toFixed(1) : '0.0';
+                return `<td style="background:${{bg}}" title="头部: ${{r}}&#10;推荐: ${{c}}&#10;${{metric}}: ${{display}}&#10;exp: ${{cellExp.toLocaleString()}}&#10;横向占比: ${{rowPct}}%&#10;纵向占比: ${{colPct}}%">${{display}}</td>`;
+            }}).join('');
+            return `<tr><td style="cursor:pointer;background:#f5f5f5" onclick="sortByRow('${{r}}')" title="头部品类: ${{r}}&#10;exp排名: #${{origRowRank}}&#10;exp: ${{rowExpTotals[r].toLocaleString()}}">#${{origRowRank}} ${{r}}</td>${{cells}}</tr>`;
+        }}).join('');
+    }}
+
+    function switchDate(delta) {{
+        const select = document.getElementById('date-select');
+        const idx = dates.indexOf(select.value);
+        const newIdx = idx + delta;
+        if (newIdx >= 0 && newIdx < dates.length) {{
+            select.value = dates[newIdx];
+            updateMatrix();
+        }}
+    }}
+
+    function switchDrillDate(delta) {{
+        const select = document.getElementById('drill-date');
+        const idx = dates.indexOf(select.value);
+        const newIdx = idx + delta;
+        if (newIdx >= 0 && newIdx < dates.length) {{
+            select.value = dates[newIdx];
+            // 触发 change 事件以更新头部品类列表
+            select.dispatchEvent(new Event('change'));
+        }}
+    }}
+
+    function toggleDrillPlay() {{
+        const btn = document.getElementById('drill-play-btn');
+        if (drillPlayInterval) {{
+            clearInterval(drillPlayInterval);
+            drillPlayInterval = null;
+            btn.classList.remove('playing');
+            btn.textContent = '▶';
+        }} else {{
+            btn.classList.add('playing');
+            btn.textContent = '⏸';
+            let idx = 0;
+            const play = () => {{
+                if (idx >= dates.length) {{
+                    clearInterval(drillPlayInterval);
+                    drillPlayInterval = null;
+                    btn.classList.remove('playing');
+                    btn.textContent = '▶';
+                    return;
+                }}
+                document.getElementById('drill-date').value = dates[idx];
+                document.getElementById('drill-date').dispatchEvent(new Event('change'));
+                idx++;
+            }};
+            play();
+            drillPlayInterval = setInterval(play, 1500);
+        }}
+    }}
+
+    function togglePlay() {{
+        const btn = document.getElementById('play-btn');
+        if (playInterval) {{
+            clearInterval(playInterval);
+            playInterval = null;
+            btn.classList.remove('playing');
+            btn.textContent = '▶';
+        }} else {{
+            btn.classList.add('playing');
+            btn.textContent = '⏸';
+            let idx = 0;
+            const play = () => {{
+                if (idx >= dates.length) {{
+                    clearInterval(playInterval);
+                    playInterval = null;
+                    btn.classList.remove('playing');
+                    btn.textContent = '▶';
+                    return;
+                }}
+                document.getElementById('date-select').value = dates[idx];
+                updateMatrix();
+                idx++;
+            }};
+            play();
+            playInterval = setInterval(play, 1500);
+        }}
+    }}
+
+    function getCurrentData() {{
+        const crowd = document.getElementById('crowd-select').value;
+        const date = document.getElementById('date-select').value;
+        const metric = document.getElementById('metric-select').value;
+        if (!allData[crowd] || !allData[crowd][date]) return null;
+        return {{ data: allData[crowd][date], metric }};
+    }}
+
+    function sortByRowSum() {{
+        const result = getCurrentData();
+        if (!result) return;
+        const {{ data, metric }} = result;
+        const metricData = data[metric];
+        const rowSums = {{}};
+        data.rows.forEach(r => {{ rowSums[r] = data.cols.reduce((sum, c) => sum + (metricData[r]?.[c] || 0), 0); }});
+        sortState.asc = sortState.row === 'sum' ? !sortState.asc : false;
+        sortState.row = 'sum';
+        currentRowOrder = [...data.rows].sort((a, b) => sortState.asc ? rowSums[a] - rowSums[b] : rowSums[b] - rowSums[a]);
+        updateMatrix();
+    }}
+
+    function sortByCol(colName) {{
+        const result = getCurrentData();
+        if (!result) return;
+        const {{ data, metric }} = result;
+        const metricData = data[metric];
+        sortState.asc = sortState.col === colName ? !sortState.asc : false;
+        sortState.col = colName;
+        currentRowOrder = [...data.rows].sort((a, b) => {{
+            const va = metricData[a]?.[colName] || 0;
+            const vb = metricData[b]?.[colName] || 0;
+            return sortState.asc ? va - vb : vb - va;
+        }});
+        updateMatrix();
+    }}
+
+    function sortByRow(rowName) {{
+        const result = getCurrentData();
+        if (!result) return;
+        const {{ data, metric }} = result;
+        const metricData = data[metric];
+        sortState.asc = sortState.row === rowName ? !sortState.asc : false;
+        sortState.row = rowName;
+        currentColOrder = [...data.cols].sort((a, b) => {{
+            const va = metricData[rowName]?.[a] || 0;
+            const vb = metricData[rowName]?.[b] || 0;
+            return sortState.asc ? va - vb : vb - va;
+        }});
+        updateMatrix();
+    }}
+
+    function resetSort() {{
+        currentRowOrder = null;
+        currentColOrder = null;
+        sortState = {{ row: null, col: null, asc: true }};
+        updateMatrix();
+    }}
+
+    function highlightCat(el) {{
+        const cat = el.getAttribute('data-cat');
+        document.querySelectorAll('.cat[data-cat]').forEach(cell => {{
+            if (cell.getAttribute('data-cat') === cat) {{
+                cell.classList.add('highlight');
+                cell.closest('tr').classList.add('row-highlight');
+            }}
+        }});
+    }}
+
+    function unhighlightCat() {{
+        document.querySelectorAll('.cat.highlight').forEach(cell => {{
+            cell.classList.remove('highlight');
+            cell.closest('tr').classList.remove('row-highlight');
+        }});
+    }}
+
+    // 初始化头部品类下钻
+    function initHeadDrill() {{
+        const date = document.getElementById('drill-date').value;
+        const headSelect = document.getElementById('drill-head');
+
+        if (!headDrillData[date]) {{
+            headSelect.innerHTML = '<option value="">无数据</option>';
+            return;
+        }}
+
+        const heads = headDrillData[date].heads;
+        headSelect.innerHTML = heads.map((h, i) => {{
+            const label = h === 'all' ? '全部(不区分头部品类)' : `#${{i}} ${{h}}`;
+            return `<option value="${{h}}">${{label}}</option>`;
+        }}).join('');
+
+        updateHeadDrill();
+    }}
+
+    function updateHeadDrill() {{
+        const date = document.getElementById('drill-date').value;
+        const headCate = document.getElementById('drill-head').value;
+        const sortBy = document.getElementById('drill-sort').value;
+        const showMetric = document.getElementById('drill-metric').value;
+        const topN = parseInt(document.getElementById('drill-topn').value);
+
+        // 检查日期变化,更新头部品类列表
+        const headSelect = document.getElementById('drill-head');
+        if (headDrillData[date] && headSelect.options.length > 0) {{
+            const currentHeads = headDrillData[date].heads;
+            const firstOption = headSelect.options[0]?.value;
+            if (currentHeads[0] !== firstOption) {{
+                headSelect.innerHTML = currentHeads.map((h, i) => {{
+                    const label = h === 'all' ? '全部(不区分头部品类)' : `#${{i}} ${{h}}`;
+                    return `<option value="${{h}}" ${{h === headCate ? 'selected' : ''}}>${{label}}</option>`;
+                }}).join('');
+            }}
+        }}
+
+        if (!headDrillData[date] || !headCate) {{
+            document.getElementById('drill-section').innerHTML = '<p>无数据</p>';
+            return;
+        }}
+
+        const data = headDrillData[date].data[headCate];
+        if (!data) {{
+            document.getElementById('drill-section').innerHTML = '<p>该头部品类无数据</p>';
+            return;
+        }}
+
+        // 为每个人群计算 Top N 和整体汇总
+        const crowdTopN = {{}};
+        const crowdTotal = {{}};
+        crowdList.forEach(crowd => {{
+            const items = [];
+            if (data[crowd]) {{
+                for (const cat in data[crowd]) {{
+                    if (cat === '_total') {{
+                        // 保存整体汇总
+                        crowdTotal[crowd] = {{
+                            exp: data[crowd][cat].exp || 0,
+                            showVal: data[crowd][cat][showMetric] || 0
+                        }};
+                    }} else {{
+                        items.push({{
+                            cat: cat,
+                            sortVal: data[crowd][cat][sortBy] || 0,
+                            showVal: data[crowd][cat][showMetric] || 0,
+                            exp: data[crowd][cat].exp || 0
+                        }});
+                    }}
+                }}
+            }}
+            items.sort((a, b) => b.sortVal - a.sortVal);
+            crowdTopN[crowd] = items.slice(0, topN);
+        }});
+
+        // 收集所有品类用于颜色映射
+        const allCats = new Set();
+        crowdList.forEach(crowd => {{
+            crowdTopN[crowd].forEach(item => allCats.add(item.cat));
+        }});
+        const catList = Array.from(allCats);
+
+        const catColors = {{}};
+        const colorPalette = [
+            '#FFCDD2', '#F8BBD0', '#E1BEE7', '#D1C4E9', '#C5CAE9',
+            '#BBDEFB', '#B3E5FC', '#B2EBF2', '#B2DFDB', '#C8E6C9',
+            '#DCEDC8', '#F0F4C3', '#FFF9C4', '#FFECB3', '#FFE0B2',
+            '#FFCCBC', '#D7CCC8', '#CFD8DC', '#BCAAA4', '#B0BEC5'
+        ];
+        catList.forEach((cat, i) => {{
+            catColors[cat] = colorPalette[i % colorPalette.length];
+        }});
+
+        // 计算指标渐变范围
+        let maxVal = 0, minVal = Infinity;
+        crowdList.forEach(crowd => {{
+            crowdTopN[crowd].forEach(item => {{
+                if (item.showVal > maxVal) maxVal = item.showVal;
+                if (item.showVal < minVal) minVal = item.showVal;
+            }});
+        }});
+        if (minVal === Infinity) minVal = 0;
+
+        function getValueColor(val) {{
+            if (maxVal === minVal) return '#C8E6C9';
+            const ratio = (val - minVal) / (maxVal - minVal);
+            const r = Math.round(200 - ratio * 120);
+            const g = Math.round(230 - ratio * 80);
+            const b = Math.round(201 - ratio * 120);
+            return `rgb(${{r}},${{g}},${{b}})`;
+        }}
+
+        // 生成表格
+        let html = '';
+        crowdList.forEach(crowd => {{
+            const colSpan = showMetric === 'exp' ? 3 : 4;
+            html += `<div class="crowd-block">
+                <table>
+                    <thead>
+                        <tr><th colspan="${{colSpan}}" style="background:${{crowdColors[crowd]}};color:white">${{crowd}}</th></tr>
+                        <tr><th class="rn">rn</th><th>推荐品类</th><th>exp</th>${{showMetric !== 'exp' ? `<th>${{showMetric}}</th>` : ''}}</tr>
+                    </thead>
+                    <tbody>`;
+
+            if (crowdTopN[crowd].length === 0) {{
+                html += `<tr><td colspan="${{colSpan}}" style="color:#999">无数据</td></tr>`;
+            }} else {{
+                // 先添加整体汇总行 (rn=0)
+                if (crowdTotal[crowd]) {{
+                    const totalExp = parseInt(crowdTotal[crowd].exp).toLocaleString();
+                    const totalMetric = (crowdTotal[crowd].showVal * 100).toFixed(1) + '%';
+                    html += `<tr style="background:#f5f5f5;font-weight:bold">
+                        <td class="rn">0</td>
+                        <td class="cat" style="background:#e0e0e0">整体</td>
+                        <td class="val">${{totalExp}}</td>
+                        ${{showMetric !== 'exp' ? `<td class="val">${{totalMetric}}</td>` : ''}}
+                    </tr>`;
+                }}
+                // 添加 Top N 品类
+                crowdTopN[crowd].forEach((item, i) => {{
+                    const expDisplay = parseInt(item.exp).toLocaleString();
+                    const metricDisplay = (item.showVal * 100).toFixed(1) + '%';
+                    const valColor = getValueColor(item.showVal);
+                    const catColor = catColors[item.cat];
+                    const catAttr = item.cat.replace(/"/g, '&quot;');
+                    html += `<tr>
+                        <td class="rn">${{i + 1}}</td>
+                        <td class="cat" style="background:${{catColor}}" data-cat="${{catAttr}}" onmouseenter="highlightCat(this)" onmouseleave="unhighlightCat()">${{item.cat}}</td>
+                        <td class="val">${{expDisplay}}</td>
+                        ${{showMetric !== 'exp' ? `<td class="val" style="background:${{valColor}}">${{metricDisplay}}</td>` : ''}}
+                    </tr>`;
+                }});
+            }}
+
+            html += `</tbody></table></div>`;
+        }});
+
+        document.getElementById('drill-section').innerHTML = html;
+    }}
+
+    // 监听日期变化,更新头部品类列表
+    document.getElementById('drill-date').addEventListener('change', function() {{
+        const date = this.value;
+        const headSelect = document.getElementById('drill-head');
+        const currentHead = headSelect.value;
+
+        if (headDrillData[date]) {{
+            const heads = headDrillData[date].heads;
+            headSelect.innerHTML = heads.map((h, i) => {{
+                const label = h === 'all' ? '全部(不区分头部品类)' : `#${{i}} ${{h}}`;
+                return `<option value="${{h}}" ${{h === currentHead ? 'selected' : ''}}>${{label}}</option>`;
+            }}).join('');
+        }} else {{
+            headSelect.innerHTML = '<option value="">无数据</option>';
+        }}
+        updateHeadDrill();
+    }});
+
+    updateMatrix();
+    initHeadDrill();
+    </script>
+</body>
+</html>
+"""
+
+html_file = output_dir / f"{latest_file.stem}_头部品类分析.html"
+with open(html_file, 'w', encoding='utf-8') as f:
+    f.write(html_content)
+
+print(f"\nHTML 报告已生成: {html_file}")

+ 768 - 0
tasks/承接/头部品类与承接品类分析/_archive/visualize_correlation.py

@@ -0,0 +1,768 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+品类承接裂变率相关性分析 - HTML可视化
+Tab 1: 品类一致性分析 - 同品类vs跨品类vov对比
+Tab 2: 品类组合稳定性 - 跨人群相关性散点图
+Tab 3: 品类亲和性矩阵 - 热力图
+"""
+import pandas as pd
+import numpy as np
+import json
+from pathlib import Path
+
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+
+# 找到最新的原始数据文件
+csv_files = [f for f in output_dir.glob("query_*.csv") if not f.name.endswith('.html')]
+if not csv_files:
+    print("没有找到数据文件,请先运行 query.sql")
+    exit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+df = pd.read_csv(latest_file)
+
+print(f"分析文件: {latest_file.name}")
+print(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+
+# 过滤掉 headvideoid为空 的记录
+df_valid = df[~df['head_cate2'].isin(['headvideoid为空', '未匹配品类'])].copy()
+df_valid['is_same_cate'] = df_valid['head_cate2'] == df_valid['rec_cate2']
+df_valid['cate_pair'] = df_valid['head_cate2'] + ' → ' + df_valid['rec_cate2']
+
+crowd_list = ['内部', '外部0层', '外部裂变']
+date_list = ['全部'] + sorted([str(d) for d in df_valid['dt'].unique()])
+EXP_THRESHOLD = 10000  # 亲和性矩阵的曝光阈值(全部天数)
+EXP_THRESHOLD_DAILY = 1000  # 单日曝光阈值
+
+# ========== 1. 品类一致性数据 ==========
+consistency_data = {'crowds': crowd_list, 'same': [], 'diff': [], 'ratio': []}
+for crowd in crowd_list:
+    crowd_df = df_valid[df_valid['crowd'] == crowd]
+    same = crowd_df[crowd_df['is_same_cate']]
+    diff = crowd_df[~crowd_df['is_same_cate']]
+    same_vov = same['new_exposure_cnt'].sum() / same['exp'].sum() if same['exp'].sum() > 0 else 0
+    diff_vov = diff['new_exposure_cnt'].sum() / diff['exp'].sum() if diff['exp'].sum() > 0 else 0
+    consistency_data['same'].append(round(same_vov, 4))
+    consistency_data['diff'].append(round(diff_vov, 4))
+    consistency_data['ratio'].append(round(same_vov / diff_vov, 2) if diff_vov > 0 else 0)
+
+# 整体
+same_all = df_valid[df_valid['is_same_cate']]
+diff_all = df_valid[~df_valid['is_same_cate']]
+consistency_data['total_same'] = round(same_all['new_exposure_cnt'].sum() / same_all['exp'].sum(), 4)
+consistency_data['total_diff'] = round(diff_all['new_exposure_cnt'].sum() / diff_all['exp'].sum(), 4)
+consistency_data['total_ratio'] = round(consistency_data['total_same'] / consistency_data['total_diff'], 2)
+
+# 同品类曝光占比
+consistency_data['same_exp'] = [int(df_valid[(df_valid['crowd'] == c) & df_valid['is_same_cate']]['exp'].sum()) for c in crowd_list]
+consistency_data['diff_exp'] = [int(df_valid[(df_valid['crowd'] == c) & ~df_valid['is_same_cate']]['exp'].sum()) for c in crowd_list]
+
+# ========== 2. 品类亲和性矩阵(按人群分开 + 整体) ==========
+def calc_affinity_matrix(data_df, exp_threshold=EXP_THRESHOLD):
+    """计算亲和性矩阵数据"""
+    # 计算每个head_cate2的基准vov
+    head_baseline = data_df.groupby('head_cate2').apply(
+        lambda x: x['new_exposure_cnt'].sum() / x['exp'].sum(), include_groups=False
+    ).to_dict()
+
+    affinity_list = []
+    for (head, rec), grp in data_df.groupby(['head_cate2', 'rec_cate2']):
+        if grp['exp'].sum() >= exp_threshold:
+            pair_vov = grp['new_exposure_cnt'].sum() / grp['exp'].sum()
+            baseline = head_baseline.get(head, 1)
+            affinity = pair_vov / baseline if baseline > 0 else 0
+            affinity_list.append({
+                'head': head, 'rec': rec,
+                'vov': round(pair_vov, 4),
+                'baseline': round(baseline, 4),
+                'affinity': round(affinity, 2),
+                'exp': int(grp['exp'].sum())
+            })
+
+    if not affinity_list:
+        return None
+
+    aff_df = pd.DataFrame(affinity_list)
+
+    # 构建矩阵数据 - 行列使用相同品类列表,方便看对角线(同品类承接)
+    # 合并 head 和 rec 的曝光量,按总曝光排序
+    head_exp = aff_df.groupby('head')['exp'].sum()
+    rec_exp = aff_df.groupby('rec')['exp'].sum()
+    all_cates = set(head_exp.index) | set(rec_exp.index)
+    cate_total_exp = {c: head_exp.get(c, 0) + rec_exp.get(c, 0) for c in all_cates}
+    cate_list = sorted(cate_total_exp.keys(), key=lambda x: cate_total_exp[x], reverse=True)[:30]
+
+    # 行列使用相同顺序
+    head_list = cate_list
+    rec_list = cate_list
+
+    result = {'rows': head_list, 'cols': rec_list, 'affinity': {}, 'vov': {}, 'exp': {}}
+    for head in head_list:
+        result['affinity'][head] = {}
+        result['vov'][head] = {}
+        result['exp'][head] = {}
+        for rec in rec_list:
+            row = aff_df[(aff_df['head'] == head) & (aff_df['rec'] == rec)]
+            if len(row) > 0:
+                result['affinity'][head][rec] = float(row.iloc[0]['affinity'])
+                result['vov'][head][rec] = float(row.iloc[0]['vov'])
+                result['exp'][head][rec] = int(row.iloc[0]['exp'])
+            else:
+                result['affinity'][head][rec] = 0
+                result['vov'][head][rec] = 0
+                result['exp'][head][rec] = 0
+    return result
+
+# 先计算全部+整体的矩阵,获取固定的行列顺序
+base_matrix = calc_affinity_matrix(df_valid, EXP_THRESHOLD)
+fixed_cate_list = base_matrix['rows'] if base_matrix else []
+
+def calc_affinity_matrix_fixed(data_df, exp_threshold, fixed_list):
+    """计算亲和性矩阵数据,使用固定的行列顺序"""
+    head_baseline = data_df.groupby('head_cate2').apply(
+        lambda x: x['new_exposure_cnt'].sum() / x['exp'].sum(), include_groups=False
+    ).to_dict()
+
+    affinity_dict = {}
+    for (head, rec), grp in data_df.groupby(['head_cate2', 'rec_cate2']):
+        if grp['exp'].sum() >= exp_threshold:
+            pair_vov = grp['new_exposure_cnt'].sum() / grp['exp'].sum()
+            baseline = head_baseline.get(head, 1)
+            affinity = pair_vov / baseline if baseline > 0 else 0
+            affinity_dict[(head, rec)] = {
+                'vov': round(pair_vov, 4),
+                'affinity': round(affinity, 2),
+                'exp': int(grp['exp'].sum())
+            }
+
+    # 使用固定的行列顺序
+    result = {'rows': fixed_list, 'cols': fixed_list, 'affinity': {}, 'vov': {}, 'exp': {}}
+    for head in fixed_list:
+        result['affinity'][head] = {}
+        result['vov'][head] = {}
+        result['exp'][head] = {}
+        for rec in fixed_list:
+            if (head, rec) in affinity_dict:
+                result['affinity'][head][rec] = float(affinity_dict[(head, rec)]['affinity'])
+                result['vov'][head][rec] = float(affinity_dict[(head, rec)]['vov'])
+                result['exp'][head][rec] = int(affinity_dict[(head, rec)]['exp'])
+            else:
+                result['affinity'][head][rec] = 0
+                result['vov'][head][rec] = 0
+                result['exp'][head][rec] = 0
+    return result
+
+# 计算各日期×人群的矩阵(使用固定行列顺序)
+matrix_data = {}
+for date in date_list:
+    matrix_data[date] = {}
+    if date == '全部':
+        date_df = df_valid
+        threshold = EXP_THRESHOLD
+    else:
+        date_df = df_valid[df_valid['dt'].astype(str) == date]
+        threshold = EXP_THRESHOLD_DAILY
+
+    # 整体
+    matrix_data[date]['整体'] = calc_affinity_matrix_fixed(date_df, threshold, fixed_cate_list)
+    # 各人群
+    for crowd in crowd_list:
+        matrix_data[date][crowd] = calc_affinity_matrix_fixed(
+            date_df[date_df['crowd'] == crowd], threshold, fixed_cate_list
+        )
+
+# ========== 4. Top品类组合排名(按人群分开 + 整体) ==========
+def calc_ranking(data_df, min_exp=1000):
+    """计算品类组合排名"""
+    pair_vov = data_df.groupby('cate_pair').apply(
+        lambda x: pd.Series({
+            'vov': x['new_exposure_cnt'].sum() / x['exp'].sum(),
+            'exp': int(x['exp'].sum()),
+        }), include_groups=False
+    )
+    pair_vov = pair_vov[pair_vov['exp'] >= min_exp]
+    if len(pair_vov) == 0:
+        return {'high': [], 'low': []}
+
+    all_high = pair_vov.sort_values('vov', ascending=False).head(100)
+    all_low = pair_vov.sort_values('vov', ascending=True).head(100)
+
+    return {
+        'high': [{'pair': idx, 'vov': float(round(row['vov'], 4)), 'exp': int(row['exp'])} for idx, row in all_high.iterrows()],
+        'low': [{'pair': idx, 'vov': float(round(row['vov'], 4)), 'exp': int(row['exp'])} for idx, row in all_low.iterrows()]
+    }
+
+# 计算各日期×人群的排名
+ranking_data = {}
+for date in date_list:
+    ranking_data[date] = {}
+    if date == '全部':
+        date_df = df_valid
+        min_exp = 1000
+    else:
+        date_df = df_valid[df_valid['dt'].astype(str) == date]
+        min_exp = 100  # 单日阈值更低
+
+    ranking_data[date]['整体'] = calc_ranking(date_df, min_exp)
+    for crowd in crowd_list:
+        ranking_data[date][crowd] = calc_ranking(date_df[date_df['crowd'] == crowd], min_exp)
+
+# 转为JSON
+consistency_json = json.dumps(consistency_data, ensure_ascii=False)
+matrix_json = json.dumps(matrix_data, ensure_ascii=False)
+ranking_json = json.dumps(ranking_data, ensure_ascii=False)
+dates_json = json.dumps(date_list, ensure_ascii=False)
+
+# 日期选项HTML
+date_options_html = "".join([f'<option value="{d}" {"selected" if d == "全部" else ""}>{d}</option>' for d in date_list])
+
+html_content = f"""<!DOCTYPE html>
+<html>
+<head>
+    <meta charset="utf-8">
+    <title>品类承接裂变率相关性分析</title>
+    <style>
+        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+               background: #f5f5f5; padding: 20px; }}
+        .container {{ max-width: 1600px; margin: 0 auto; background: white;
+                     border-radius: 8px; padding: 20px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }}
+        h1 {{ font-size: 24px; margin-bottom: 10px; color: #333; }}
+        .subtitle {{ color: #666; margin-bottom: 20px; font-size: 14px; }}
+
+        /* Tabs */
+        .tabs {{ display: flex; gap: 5px; margin-bottom: 20px; border-bottom: 2px solid #e0e0e0; }}
+        .tab {{ padding: 10px 20px; cursor: pointer; border: none; background: none;
+               font-size: 14px; color: #666; border-bottom: 2px solid transparent; margin-bottom: -2px; }}
+        .tab:hover {{ color: #333; }}
+        .tab.active {{ color: #1976D2; border-bottom-color: #1976D2; font-weight: 500; }}
+        .tab-content {{ display: none; }}
+        .tab-content.active {{ display: block; }}
+
+        /* Summary cards */
+        .summary {{ display: flex; gap: 15px; margin-bottom: 25px; flex-wrap: wrap; }}
+        .stat-card {{ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+                     padding: 15px 20px; border-radius: 8px; text-align: center; color: white; min-width: 140px; }}
+        .stat-card.green {{ background: linear-gradient(135deg, #11998e 0%, #38ef7d 100%); }}
+        .stat-card.orange {{ background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); }}
+        .stat-card.blue {{ background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%); }}
+        .stat-card h4 {{ font-size: 28px; margin-bottom: 5px; }}
+        .stat-card p {{ font-size: 12px; opacity: 0.9; }}
+
+        /* Bar chart */
+        .chart-section {{ margin-bottom: 30px; }}
+        .chart-title {{ font-size: 16px; font-weight: 500; margin-bottom: 15px; color: #333; }}
+        .bar-chart {{ display: flex; gap: 30px; align-items: flex-end; justify-content: center; padding: 20px; }}
+        .bar-group {{ text-align: center; }}
+        .bar-pair {{ display: flex; gap: 8px; align-items: flex-end; height: 200px; }}
+        .bar {{ width: 50px; border-radius: 4px 4px 0 0; transition: all 0.3s; cursor: pointer; position: relative; }}
+        .bar:hover {{ opacity: 0.8; }}
+        .bar-value {{ position: absolute; top: -25px; left: 50%; transform: translateX(-50%); font-size: 12px; font-weight: 500; white-space: nowrap; }}
+        .bar-label {{ margin-top: 10px; font-size: 13px; color: #333; }}
+        .bar-ratio {{ font-size: 11px; color: #666; margin-top: 3px; }}
+        .legend {{ display: flex; gap: 20px; justify-content: center; margin-bottom: 15px; }}
+        .legend-item {{ display: flex; align-items: center; gap: 6px; font-size: 13px; }}
+        .legend-color {{ width: 16px; height: 16px; border-radius: 3px; }}
+
+        /* Scatter plot */
+        .scatter-container {{ display: flex; gap: 20px; flex-wrap: wrap; }}
+        .scatter-box {{ flex: 1; min-width: 350px; background: #f8f9fa; border-radius: 8px; padding: 15px; }}
+        .scatter-title {{ font-size: 14px; font-weight: 500; margin-bottom: 10px; }}
+        .scatter-stats {{ font-size: 12px; color: #666; margin-bottom: 10px; }}
+        .scatter-canvas {{ width: 100%; height: 300px; position: relative; background: white; border: 1px solid #e0e0e0; border-radius: 4px; }}
+
+        /* Matrix */
+        .matrix-container {{ overflow-x: auto; max-height: 600px; overflow-y: auto; }}
+        table {{ border-collapse: collapse; font-size: 11px; }}
+        th, td {{ border: 1px solid #e0e0e0; padding: 4px 6px; text-align: center; white-space: nowrap; }}
+        th {{ background: #f5f5f5; font-weight: 600; position: sticky; top: 0; z-index: 1; }}
+        th:first-child {{ position: sticky; left: 0; z-index: 3; }}
+        td:first-child {{ background: #f5f5f5; font-weight: 500; position: sticky; left: 0; z-index: 1; text-align: left; }}
+        .corner-cell {{ background: linear-gradient(to top right, #f5f5f5 49.5%, #ccc 49.5%, #ccc 50.5%, #f5f5f5 50.5%); }}
+
+        /* Controls */
+        .controls {{ display: flex; gap: 15px; margin-bottom: 15px; align-items: center; flex-wrap: wrap; }}
+        .control-group {{ display: flex; align-items: center; gap: 6px; }}
+        .control-group label {{ font-size: 13px; color: #666; }}
+        select {{ padding: 6px 10px; border: 1px solid #ddd; border-radius: 4px; font-size: 13px; }}
+        .date-switcher button {{ padding: 5px 10px; border: 1px solid #ddd; background: white; cursor: pointer; border-radius: 3px; }}
+        .date-switcher button:hover {{ background: #f0f0f0; }}
+        .play-btn {{ background: #4CAF50; color: white; border: none; border-radius: 4px; padding: 5px 12px; font-size: 12px; cursor: pointer; }}
+        .play-btn:hover {{ background: #45a049; }}
+        .play-btn.playing {{ background: #f44336; }}
+        /* Matrix highlight */
+        th.highlight, td.row-header.highlight {{ background: #bbdefb !important; }}
+        td.cell-highlight {{ outline: 2px solid #1565C0; outline-offset: -1px; }}
+
+        /* Ranking table */
+        .ranking-section {{ display: flex; gap: 30px; }}
+        .ranking-box {{ flex: 1; }}
+        .ranking-box h4 {{ font-size: 14px; margin-bottom: 10px; padding: 8px; border-radius: 4px; }}
+        .ranking-box.high h4 {{ background: #e8f5e9; color: #2e7d32; }}
+        .ranking-box.low h4 {{ background: #ffebee; color: #c62828; }}
+        .ranking-table {{ width: 100%; border-collapse: collapse; }}
+        .ranking-table th {{ background: #f5f5f5; padding: 8px; text-align: left; font-size: 12px; }}
+        .ranking-table td {{ padding: 6px 8px; border-bottom: 1px solid #eee; font-size: 12px; }}
+        .ranking-table .rn {{ width: 30px; color: #999; }}
+        .ranking-table .vov {{ font-family: monospace; text-align: right; }}
+        .ranking-table .exp {{ color: #666; text-align: right; }}
+
+        /* Insight box */
+        .insight-box {{ background: #e3f2fd; border-left: 4px solid #1976D2; padding: 15px; margin: 20px 0; border-radius: 0 8px 8px 0; }}
+        .insight-box h5 {{ color: #1565C0; margin-bottom: 8px; font-size: 14px; }}
+        .insight-box p {{ color: #333; font-size: 13px; line-height: 1.6; }}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>品类承接裂变率相关性分析</h1>
+        <p class="subtitle">分析进入品类与承接品类的关系对裂变效果的影响</p>
+
+        <div class="tabs">
+            <button class="tab active" onclick="switchTab('consistency')">品类一致性</button>
+            <button class="tab" onclick="switchTab('affinity')">品类亲和性矩阵</button>
+            <button class="tab" onclick="switchTab('ranking')">品类组合排名</button>
+        </div>
+
+        <!-- Tab 1: 品类一致性 -->
+        <div id="tab-consistency" class="tab-content active">
+            <div class="summary">
+                <div class="stat-card green">
+                    <h4 id="same-vov">-</h4>
+                    <p>同品类承接 vov</p>
+                </div>
+                <div class="stat-card orange">
+                    <h4 id="diff-vov">-</h4>
+                    <p>跨品类承接 vov</p>
+                </div>
+                <div class="stat-card blue">
+                    <h4 id="vov-ratio">-</h4>
+                    <p>同/跨品类比值</p>
+                </div>
+            </div>
+
+            <div class="insight-box">
+                <h5>核心发现</h5>
+                <p>同品类承接(进入品类=承接品类)的裂变率显著高于跨品类承接,约为 <strong id="insight-ratio">-</strong> 倍。
+                这说明用户对同类内容有更强的分享意愿,推荐系统在品类匹配上有优化空间。</p>
+            </div>
+
+            <div class="chart-section">
+                <div class="chart-title">各人群同品类 vs 跨品类 vov 对比</div>
+                <div class="legend">
+                    <div class="legend-item"><div class="legend-color" style="background:#4CAF50"></div>同品类承接</div>
+                    <div class="legend-item"><div class="legend-color" style="background:#2196F3"></div>跨品类承接</div>
+                </div>
+                <div class="bar-chart" id="consistency-chart"></div>
+            </div>
+
+            <div class="chart-section">
+                <div class="chart-title">同品类曝光占比</div>
+                <div id="exp-ratio-chart" style="display:flex;gap:20px;justify-content:center;"></div>
+            </div>
+        </div>
+
+        <!-- Tab 2: 品类亲和性矩阵 -->
+        <div id="tab-affinity" class="tab-content">
+            <div class="insight-box">
+                <h5>亲和性 = 这个组合的表现 / 进入品类的平均表现</h5>
+                <p>
+                <strong>举例</strong>:用户从「搞笑段子」进入,平均裂变率 0.4<br>
+                • 推荐「搞笑段子→搞笑段子」裂变率 0.8,亲和性 = 0.8/0.4 = <span style="color:#2e7d32;font-weight:bold">2.0 ✓ 更对味</span><br>
+                • 推荐「搞笑段子→历史名人」裂变率 0.2,亲和性 = 0.2/0.4 = <span style="color:#c62828;font-weight:bold">0.5 ✗ 不对味</span><br><br>
+                <strong>颜色</strong>:<span style="background:#c8e6c9;padding:2px 6px;border-radius:3px">绿色=高亲和</span>
+                <span style="background:#ffcdd2;padding:2px 6px;border-radius:3px;margin-left:10px">红色=低亲和</span>
+                </p>
+            </div>
+
+            <div class="controls">
+                <div class="control-group date-switcher">
+                    <label>日期:</label>
+                    <button onclick="switchMatrixDate(-1)">◀</button>
+                    <select id="matrix-date" onchange="updateMatrix()">
+                        {date_options_html}
+                    </select>
+                    <button onclick="switchMatrixDate(1)">▶</button>
+                    <button id="matrix-play-btn" class="play-btn" onclick="toggleMatrixPlay()">▶ 播放</button>
+                </div>
+                <div class="control-group">
+                    <label>人群:</label>
+                    <select id="matrix-crowd" onchange="updateMatrix()">
+                        <option value="整体" selected>整体</option>
+                        <option value="内部">内部</option>
+                        <option value="外部0层">外部0层</option>
+                        <option value="外部裂变">外部裂变</option>
+                    </select>
+                </div>
+                <div class="control-group">
+                    <label>显示指标:</label>
+                    <select id="matrix-metric" onchange="updateMatrix()">
+                        <option value="affinity" selected>亲和性 (affinity)</option>
+                        <option value="vov">裂变率 (vov)</option>
+                        <option value="exp">曝光量 (exp)</option>
+                    </select>
+                </div>
+            </div>
+
+            <div class="matrix-container">
+                <table id="affinity-table">
+                    <thead id="affinity-header"></thead>
+                    <tbody id="affinity-body"></tbody>
+                </table>
+            </div>
+        </div>
+
+        <!-- Tab 4: 品类组合排名 -->
+        <div id="tab-ranking" class="tab-content">
+            <div class="insight-box">
+                <h5>筛选条件</h5>
+                <p>仅展示在 ≥2 个人群中都有数据且曝光量 ≥1000 的品类组合,确保结果稳定可靠。</p>
+            </div>
+
+            <div class="controls">
+                <div class="control-group date-switcher">
+                    <label>日期:</label>
+                    <button onclick="switchRankingDate(-1)">◀</button>
+                    <select id="ranking-date" onchange="initRanking()">
+                        {date_options_html}
+                    </select>
+                    <button onclick="switchRankingDate(1)">▶</button>
+                    <button id="ranking-play-btn" class="play-btn" onclick="toggleRankingPlay()">▶ 播放</button>
+                </div>
+                <div class="control-group">
+                    <label>人群:</label>
+                    <select id="ranking-crowd" onchange="initRanking()">
+                        <option value="整体" selected>整体</option>
+                        <option value="内部">内部</option>
+                        <option value="外部0层">外部0层</option>
+                        <option value="外部裂变">外部裂变</option>
+                    </select>
+                </div>
+                <div class="control-group">
+                    <label>展示数量:</label>
+                    <select id="ranking-topn" onchange="initRanking()">
+                        <option value="20">Top 20</option>
+                        <option value="50">Top 50</option>
+                        <option value="100">Top 100</option>
+                    </select>
+                </div>
+            </div>
+
+            <div class="ranking-section">
+                <div class="ranking-box high">
+                    <h4>Top 20 高裂变品类组合</h4>
+                    <table class="ranking-table" id="high-ranking"></table>
+                </div>
+                <div class="ranking-box low">
+                    <h4>Top 20 低裂变品类组合</h4>
+                    <table class="ranking-table" id="low-ranking"></table>
+                </div>
+            </div>
+        </div>
+    </div>
+
+    <script>
+    const consistencyData = {consistency_json};
+    const matrixData = {matrix_json};
+    const rankingData = {ranking_json};
+    const dateList = {dates_json};
+
+    let matrixPlayInterval = null;
+    let rankingPlayInterval = null;
+
+    // Tab switching
+    function switchTab(tabId) {{
+        document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
+        document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
+        document.querySelector(`[onclick="switchTab('${{tabId}}')"]`).classList.add('active');
+        document.getElementById('tab-' + tabId).classList.add('active');
+    }}
+
+    // Initialize consistency chart
+    function initConsistency() {{
+        const data = consistencyData;
+        document.getElementById('same-vov').textContent = data.total_same.toFixed(4);
+        document.getElementById('diff-vov').textContent = data.total_diff.toFixed(4);
+        document.getElementById('vov-ratio').textContent = data.total_ratio.toFixed(2) + 'x';
+        document.getElementById('insight-ratio').textContent = data.total_ratio.toFixed(2);
+
+        const maxVov = Math.max(...data.same, ...data.diff);
+        const chartHtml = data.crowds.map((crowd, i) => {{
+            const sameH = Math.round(data.same[i] / maxVov * 180);
+            const diffH = Math.round(data.diff[i] / maxVov * 180);
+            return `
+                <div class="bar-group">
+                    <div class="bar-pair">
+                        <div class="bar" style="height:${{sameH}}px;background:#4CAF50">
+                            <span class="bar-value">${{data.same[i].toFixed(4)}}</span>
+                        </div>
+                        <div class="bar" style="height:${{diffH}}px;background:#2196F3">
+                            <span class="bar-value">${{data.diff[i].toFixed(4)}}</span>
+                        </div>
+                    </div>
+                    <div class="bar-label">${{crowd}}</div>
+                    <div class="bar-ratio">${{data.ratio[i]}}x</div>
+                </div>
+            `;
+        }}).join('');
+        document.getElementById('consistency-chart').innerHTML = chartHtml;
+
+        // Exp ratio
+        const expHtml = data.crowds.map((crowd, i) => {{
+            const total = data.same_exp[i] + data.diff_exp[i];
+            const sameRatio = total > 0 ? (data.same_exp[i] / total * 100).toFixed(1) : 0;
+            return `
+                <div style="text-align:center">
+                    <div style="font-size:13px;margin-bottom:5px">${{crowd}}</div>
+                    <div style="width:150px;height:20px;background:#e0e0e0;border-radius:10px;overflow:hidden">
+                        <div style="width:${{sameRatio}}%;height:100%;background:#4CAF50"></div>
+                    </div>
+                    <div style="font-size:11px;color:#666;margin-top:3px">同品类占比: ${{sameRatio}}%</div>
+                </div>
+            `;
+        }}).join('');
+        document.getElementById('exp-ratio-chart').innerHTML = expHtml;
+    }}
+
+    // Matrix
+    function updateMatrix() {{
+        const date = document.getElementById('matrix-date').value;
+        const crowd = document.getElementById('matrix-crowd').value;
+        const metric = document.getElementById('matrix-metric').value;
+
+        if (!matrixData[date] || !matrixData[date][crowd]) {{
+            document.getElementById('affinity-header').innerHTML = '<tr><th>无数据</th></tr>';
+            document.getElementById('affinity-body').innerHTML = '';
+            return;
+        }}
+
+        const data = matrixData[date][crowd];
+        const metricData = data[metric];
+
+        // Calculate color range
+        const allVals = [];
+        data.rows.forEach(r => data.cols.forEach(c => {{
+            const val = metricData[r]?.[c] || 0;
+            if (val > 0) allVals.push(val);
+        }}));
+
+        let maxVal, minVal = 0;
+        if (metric === 'affinity') {{
+            maxVal = 2; minVal = 0.5;
+        }} else if (metric === 'vov') {{
+            allVals.sort((a, b) => a - b);
+            maxVal = allVals[Math.floor(allVals.length * 0.95)] || 1;
+        }} else {{
+            allVals.sort((a, b) => a - b);
+            maxVal = allVals[Math.floor(allVals.length * 0.9)] || 100000;
+        }}
+
+        function getColor(val) {{
+            if (metric === 'affinity') {{
+                if (val >= 1) {{
+                    const ratio = Math.min((val - 1) / (maxVal - 1), 1);
+                    return `rgb(${{Math.round(200 - ratio * 200)}}, ${{Math.round(230 - ratio * 30)}}, ${{Math.round(200 - ratio * 200)}})`;
+                }} else {{
+                    const ratio = Math.min((1 - val) / (1 - minVal), 1);
+                    return `rgb(${{Math.round(230 - ratio * 30)}}, ${{Math.round(200 - ratio * 200)}}, ${{Math.round(200 - ratio * 200)}})`;
+                }}
+            }} else {{
+                const ratio = Math.min(val / maxVal, 1);
+                return `rgb(${{Math.round(255 - ratio * 215)}}, ${{Math.round(255 - ratio * 88)}}, ${{Math.round(255 - ratio * 186)}})`;
+            }}
+        }}
+
+        const expData = data.exp;
+
+        // 计算每行和每列的总曝光量
+        const rowTotals = {{}};
+        const colTotals = {{}};
+        data.rows.forEach(r => {{
+            rowTotals[r] = data.cols.reduce((sum, c) => sum + (expData[r]?.[c] || 0), 0);
+        }});
+        data.cols.forEach(c => {{
+            colTotals[c] = data.rows.reduce((sum, r) => sum + (expData[r]?.[c] || 0), 0);
+        }});
+
+        document.getElementById('affinity-header').innerHTML = `
+            <tr>
+                <th class="corner-cell" style="width:120px">进入↓ 承接→</th>
+                ${{data.cols.map((c, ci) => `<th data-col="${{ci}}" title="${{c}}\\nexp: ${{colTotals[c].toLocaleString()}}">${{c.length > 6 ? c.substring(0,6) + '..' : c}}</th>`).join('')}}
+            </tr>
+        `;
+
+        document.getElementById('affinity-body').innerHTML = data.rows.map((r, ri) => {{
+            const cells = data.cols.map((c, ci) => {{
+                const val = metricData[r]?.[c] || 0;
+                const exp = expData[r]?.[c] || 0;
+                const bg = val > 0 ? getColor(val) : '#f8f9fa';
+                const isDiagonal = (r === c);  // 对角线:同品类承接
+                let display;
+                if (metric === 'exp') {{
+                    display = val > 0 ? (val >= 10000 ? Math.round(val/1000) + 'k' : val) : '-';
+                }} else {{
+                    display = val > 0 ? val.toFixed(2) : '-';
+                }}
+                // 计算横向和纵向占比
+                const rowPct = rowTotals[r] > 0 ? (exp / rowTotals[r] * 100).toFixed(1) : '0.0';
+                const colPct = colTotals[c] > 0 ? (exp / colTotals[c] * 100).toFixed(1) : '0.0';
+                const tooltip = `进入: ${{r}}\\n承接: ${{c}}\\n${{metric}}: ${{val}}\\nexp: ${{exp.toLocaleString()}}\\n横向占比: ${{rowPct}}%\\n纵向占比: ${{colPct}}%${{isDiagonal ? '\\n★ 同品类承接' : ''}}`;
+                const border = isDiagonal ? 'border:2px solid #1565C0;' : '';
+                return `<td data-row="${{ri}}" data-col="${{ci}}" style="background:${{bg}};${{border}}" title="${{tooltip}}" onmouseenter="highlightCell(${{ri}},${{ci}})" onmouseleave="unhighlightCell()">${{display}}</td>`;
+            }}).join('');
+            return `<tr><td class="row-header" data-row="${{ri}}" title="${{r}}\\nexp: ${{rowTotals[r].toLocaleString()}}">${{r.length > 10 ? r.substring(0,10) + '..' : r}}</td>${{cells}}</tr>`;
+        }}).join('');
+    }}
+
+    // Highlight row/col headers on cell hover
+    function highlightCell(row, col) {{
+        // Highlight column header
+        document.querySelectorAll('#affinity-header th[data-col]').forEach(th => {{
+            if (parseInt(th.dataset.col) === col) th.classList.add('highlight');
+        }});
+        // Highlight row header
+        document.querySelectorAll('#affinity-body .row-header').forEach(td => {{
+            if (parseInt(td.dataset.row) === row) td.classList.add('highlight');
+        }});
+    }}
+
+    function unhighlightCell() {{
+        document.querySelectorAll('.highlight').forEach(el => el.classList.remove('highlight'));
+    }}
+
+    // Ranking
+    function initRanking() {{
+        const date = document.getElementById('ranking-date').value;
+        const crowd = document.getElementById('ranking-crowd').value;
+        const topN = parseInt(document.getElementById('ranking-topn').value);
+
+        if (!rankingData[date] || !rankingData[date][crowd]) {{
+            document.getElementById('high-ranking').innerHTML = '<tbody><tr><td>无数据</td></tr></tbody>';
+            document.getElementById('low-ranking').innerHTML = '<tbody><tr><td>无数据</td></tr></tbody>';
+            return;
+        }}
+
+        const data = rankingData[date][crowd];
+
+        function renderTable(items, tableId) {{
+            const sliced = items.slice(0, topN);
+            const html = `
+                <thead><tr><th class="rn">#</th><th>品类组合</th><th class="vov">vov</th><th class="exp">曝光</th></tr></thead>
+                <tbody>
+                    ${{sliced.map((item, i) => `
+                        <tr>
+                            <td class="rn">${{i + 1}}</td>
+                            <td>${{item.pair}}</td>
+                            <td class="vov">${{item.vov.toFixed(4)}}</td>
+                            <td class="exp">${{item.exp.toLocaleString()}}</td>
+                        </tr>
+                    `).join('')}}
+                </tbody>
+            `;
+            document.getElementById(tableId).innerHTML = html;
+        }}
+
+        // 更新标题
+        const dateLabel = date === '全部' ? '' : ` [${{date}}]`;
+        const crowdLabel = crowd === '整体' ? '' : ` (${{crowd}})`;
+        document.querySelector('.ranking-box.high h4').textContent = `Top ${{topN}} 高裂变品类组合${{crowdLabel}}${{dateLabel}}`;
+        document.querySelector('.ranking-box.low h4').textContent = `Top ${{topN}} 低裂变品类组合${{crowdLabel}}${{dateLabel}}`;
+
+        renderTable(data.high, 'high-ranking');
+        renderTable(data.low, 'low-ranking');
+    }}
+
+    // Matrix date switching
+    function switchMatrixDate(delta) {{
+        const select = document.getElementById('matrix-date');
+        const idx = dateList.indexOf(select.value);
+        const newIdx = idx + delta;
+        if (newIdx >= 0 && newIdx < dateList.length) {{
+            select.value = dateList[newIdx];
+            updateMatrix();
+        }}
+    }}
+
+    function toggleMatrixPlay() {{
+        const btn = document.getElementById('matrix-play-btn');
+        if (matrixPlayInterval) {{
+            clearInterval(matrixPlayInterval);
+            matrixPlayInterval = null;
+            btn.classList.remove('playing');
+            btn.textContent = '▶ 播放';
+        }} else {{
+            btn.classList.add('playing');
+            btn.textContent = '⏸ 停止';
+            let idx = 1;  // 从第一个日期开始(跳过"全部")
+            const play = () => {{
+                if (idx >= dateList.length) {{
+                    clearInterval(matrixPlayInterval);
+                    matrixPlayInterval = null;
+                    btn.classList.remove('playing');
+                    btn.textContent = '▶ 播放';
+                    return;
+                }}
+                document.getElementById('matrix-date').value = dateList[idx];
+                updateMatrix();
+                idx++;
+            }};
+            play();
+            matrixPlayInterval = setInterval(play, 1500);
+        }}
+    }}
+
+    // Ranking date switching
+    function switchRankingDate(delta) {{
+        const select = document.getElementById('ranking-date');
+        const idx = dateList.indexOf(select.value);
+        const newIdx = idx + delta;
+        if (newIdx >= 0 && newIdx < dateList.length) {{
+            select.value = dateList[newIdx];
+            initRanking();
+        }}
+    }}
+
+    function toggleRankingPlay() {{
+        const btn = document.getElementById('ranking-play-btn');
+        if (rankingPlayInterval) {{
+            clearInterval(rankingPlayInterval);
+            rankingPlayInterval = null;
+            btn.classList.remove('playing');
+            btn.textContent = '▶ 播放';
+        }} else {{
+            btn.classList.add('playing');
+            btn.textContent = '⏸ 停止';
+            let idx = 1;
+            const play = () => {{
+                if (idx >= dateList.length) {{
+                    clearInterval(rankingPlayInterval);
+                    rankingPlayInterval = null;
+                    btn.classList.remove('playing');
+                    btn.textContent = '▶ 播放';
+                    return;
+                }}
+                document.getElementById('ranking-date').value = dateList[idx];
+                initRanking();
+                idx++;
+            }};
+            play();
+            rankingPlayInterval = setInterval(play, 1500);
+        }}
+    }}
+
+    // Initialize
+    initConsistency();
+    updateMatrix();
+    initRanking();
+    </script>
+</body>
+</html>
+"""
+
+html_file = output_dir / f"{latest_file.stem}_品类相关性分析.html"
+with open(html_file, 'w', encoding='utf-8') as f:
+    f.write(html_content)
+
+print(f"\nHTML 报告已生成: {html_file}")

Some files were not shown because too many files changed in this diff