Sfoglia il codice sorgente

feat: 新增素材视频匹配分析任务

- 分析素材与视频的搭配效果
- 只取有素材信息的记录(文章标题或卡片标题非空)
- 输出 Top 组合、素材搭配对比、品类×渠道矩阵
- 导出 CSV 供 Excel 筛选

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 2 mesi fa
parent
commit
2e4fccce5e

+ 204 - 0
tasks/素材视频匹配分析/analyze.py

@@ -0,0 +1,204 @@
#!/usr/bin/env python
# coding=utf-8
"""
Material-video matching analysis.

Analyzes how well promo materials pair with videos (only records that
carry material info), then writes a text report and a CSV export for
Excel filtering.
"""
import pandas as pd
import numpy as np  # NOTE(review): currently unused in this script; kept for ad-hoc use
from pathlib import Path

# Task-local layout: the SQL export and all outputs live next to this script.
task_dir = Path(__file__).parent
output_dir = task_dir / "output"

# Pick up raw query exports only; skip files this script itself produces
# (report suffix '_分析', export suffix '_匹配').
csv_files = [f for f in output_dir.glob("*.csv")
             if '_分析' not in f.name and '_匹配' not in f.name]
if not csv_files:
    print("没有找到数据文件,请先运行 query.sql")
    # Fix: raise SystemExit instead of the site-module-injected exit().
    # exit() is only guaranteed in interactive sessions (absent under
    # `python -S`); SystemExit is the documented way to end a script
    # with a status code.
    raise SystemExit(1)

# The most recently modified export is assumed to be the current data.
latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
df = pd.read_csv(latest_file)
+
# Report buffer: every log() call is echoed to stdout and also collected
# here so the full transcript can be written to the *_分析.txt file at the end.
lines = []


def log(text: str = "") -> None:
    """Print *text* and append it to the module-level report buffer."""
    print(text)
    lines.append(text)
+
+
+log(f"分析文件: {latest_file.name}")
+log(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+log()
+
+log("=" * 70)
+log("数据概览")
+log("=" * 70)
+log(f"总记录数: {len(df):,}")
+log(f"渠道数: {df['channel'].nunique()}")
+log(f"素材数: {df['rootsourceid'].nunique()}")
+log(f"视频数: {df['videoid'].nunique()}")
+log()
+
+# 过滤小样本
+df_filtered = df[df['点击uv'] >= 100].copy()
+log(f"UV >= 100 的记录数: {len(df_filtered):,}")
+log()
+
+# ============================================================
+# 一、高效素材-视频组合 Top 30
+# ============================================================
+log("=" * 70)
+log("一、高效素材-视频组合 Top 30(UV >= 100,按回流率排序)")
+log("=" * 70)
+log()
+
+top_by_ror = df_filtered.nlargest(30, '再分享回流率')
+for i, (_, row) in enumerate(top_by_ror.iterrows(), 1):
+    channel = row['channel']
+    gzh = f"({row['公众号名']})" if pd.notna(row['公众号名']) else ''
+    article = str(row['文章标题'])[:35] if pd.notna(row['文章标题']) else ''
+    card = str(row['分享标题'])[:35] if pd.notna(row['分享标题']) else ''
+    video = str(row['title'])[:35] if pd.notna(row['title']) else '(无)'
+    category = row['merge一级品类'] if pd.notna(row['merge一级品类']) else ''
+    ror = row['再分享回流率']
+    ror_orig = row['原视频再分享回流率'] if pd.notna(row['原视频再分享回流率']) else 0
+    ror_rec = row['推荐再分享回流率'] if pd.notna(row['推荐再分享回流率']) else 0
+
+    log(f"{i:2}. UV={int(row['点击uv']):>5}")
+    log(f"    渠道: {channel} {gzh}")
+    if article:
+        log(f"    素材_文章: {article}")
+    if card:
+        log(f"    素材_卡片: {card}")
+    log(f"    视频: {video} [{category}]")
+    log(f"    回流率: 总={ror:.1%}, 原视频={ror_orig:.1%}, 推荐={ror_rec:.1%}")
+log()
+
+# ============================================================
+# 二、素材搭配分析
+# ============================================================
+log("=" * 70)
+log("二、同一素材的最佳 vs 最差视频搭配")
+log("=" * 70)
+log()
+
+source_video_count = df.groupby('rootsourceid').agg({
+    'videoid': 'nunique',
+    '点击uv': 'sum'
+}).rename(columns={'videoid': '视频数'})
+multi_video_sources = source_video_count[
+    (source_video_count['视频数'] >= 3) & (source_video_count['点击uv'] >= 500)
+].index.tolist()
+
+log(f"有3+视频且UV>=500的素材数: {len(multi_video_sources)}")
+log()
+
+source_analysis = []
+for source in multi_video_sources[:30]:
+    source_df = df[df['rootsourceid'] == source]
+    if len(source_df) < 3:
+        continue
+
+    total_uv = source_df['点击uv'].sum()
+    best = source_df.nlargest(1, '再分享回流率').iloc[0]
+    worst = source_df.nsmallest(1, '再分享回流率').iloc[0]
+    diff = best['再分享回流率'] - worst['再分享回流率']
+
+    source_analysis.append({
+        '文章标题': source_df['文章标题'].iloc[0],
+        '分享标题': source_df['分享标题'].iloc[0],
+        'channel': source_df['channel'].iloc[0],
+        '视频数': len(source_df),
+        '总UV': total_uv,
+        '最佳视频': best['title'],
+        '最佳回流率': best['再分享回流率'],
+        '最差视频': worst['title'],
+        '最差回流率': worst['再分享回流率'],
+        '效果差异': diff
+    })
+
+if source_analysis:
+    source_df = pd.DataFrame(source_analysis).sort_values('效果差异', ascending=False)
+
+    for i, (_, row) in enumerate(source_df.head(15).iterrows(), 1):
+        article = str(row['文章标题'])[:35] if pd.notna(row['文章标题']) else ''
+        card = str(row['分享标题'])[:35] if pd.notna(row['分享标题']) else ''
+        best_v = str(row['最佳视频'])[:28] if pd.notna(row['最佳视频']) else '(无)'
+        worst_v = str(row['最差视频'])[:28] if pd.notna(row['最差视频']) else '(无)'
+
+        log(f"{i:2}. 渠道={row['channel']}, 视频数={row['视频数']}, UV={int(row['总UV']):,}")
+        if article:
+            log(f"    素材_文章: {article}")
+        if card:
+            log(f"    素材_卡片: {card}")
+        log(f"    最佳: {best_v} → {row['最佳回流率']:.1%}")
+        log(f"    最差: {worst_v} → {row['最差回流率']:.1%}")
+        log(f"    差异: {row['效果差异']:.1%}")
+        log()
+
+# ============================================================
+# 三、品类×渠道矩阵
+# ============================================================
+log("=" * 70)
+log("三、品类×渠道效果矩阵")
+log("=" * 70)
+log()
+
+pivot_data = df.groupby(['merge一级品类', 'channel']).agg({
+    '点击uv': 'sum',
+    '再分享回流uv': 'sum'
+}).reset_index()
+pivot_data['回流率'] = pivot_data['再分享回流uv'] / (pivot_data['点击uv'] + 10)
+
+pivot_uv = pivot_data.pivot(index='merge一级品类', columns='channel', values='点击uv')
+pivot_ror = pivot_data.pivot(index='merge一级品类', columns='channel', values='回流率')
+
+min_uv = 500
+valid_categories = pivot_uv[pivot_uv.sum(axis=1) >= min_uv].index
+valid_channels = pivot_uv.columns[pivot_uv.sum(axis=0) >= min_uv]
+
+if len(valid_categories) > 0 and len(valid_channels) > 0:
+    log(f"品类数: {len(valid_categories)}, 渠道数: {len(valid_channels)}")
+    log()
+    log("【各品类最佳渠道】")
+    log("-" * 70)
+    for category in valid_categories:
+        row = pivot_ror.loc[category, valid_channels].dropna()
+        if len(row) == 0:
+            continue
+        best_ch = row.idxmax()
+        best_ror = row.max()
+        cat_name = str(category)[:15] if pd.notna(category) else '(空)'
+        log(f"  {cat_name:<17} → {best_ch:<25} 回流率={best_ror:.1%}")
+log()
+
+# ============================================================
+# 保存结果
+# ============================================================
+result_file = output_dir / f"{latest_file.stem}_分析.txt"
+with open(result_file, 'w', encoding='utf-8') as f:
+    f.write("\n".join(lines))
+log(f"分析结果已保存: {result_file}")
+
+# 导出 CSV
+export_df = df_filtered.copy()
+export_cols = ['channel', '公众号名', '文章标题', '分享标题', 'title',
+               'merge一级品类', 'merge二级品类', '点击uv', '进入推荐率',
+               '再分享回流率', '原视频再分享回流率', '推荐再分享回流率',
+               '再分享回流uv', 'rootsourceid', 'videoid']
+export_cols = [c for c in export_cols if c in export_df.columns]
+export_df = export_df[export_cols].rename(columns={
+    'channel': '渠道',
+    '公众号名': '渠道_公众号名',
+    '文章标题': '素材_文章标题',
+    '分享标题': '素材_卡片标题',
+    'title': '视频_标题',
+    'merge一级品类': '视频_一级品类',
+    'merge二级品类': '视频_二级品类'
+}).sort_values('点击uv', ascending=False)
+
+csv_file = output_dir / f"{latest_file.stem}_匹配数据.csv"
+export_df.to_csv(csv_file, index=False, encoding='utf-8-sig')
+log(f"CSV 已保存: {csv_file} ({len(export_df):,} 条)")

+ 48 - 0
tasks/素材视频匹配分析/query.sql

@@ -0,0 +1,48 @@
-- Material-video matching analysis.
-- Measures how well each promo material pairs with each video.
-- Only keeps records that carry material info (article title OR card title non-empty).

SELECT  dt
        ,channel
        ,公众号名
        -- Material dimensions
        ,rootsourceid
        ,文章标题
        ,分享标题
        -- Video dimensions
        ,videoid
        ,title
        ,merge一级品类
        ,merge二级品类
        -- Core metrics
        ,COUNT(DISTINCT mid) AS 点击uv
        -- NOTE(review): no smoothing here; if a group's mid values were all
        -- NULL this would divide by zero — confirm mid is always populated.
        ,COUNT(DISTINCT CASE WHEN 是否进入推荐 = '1' THEN mid END) / COUNT(DISTINCT mid) AS 进入推荐率
        -- +10 smoothing keeps tiny groups from producing extreme rates.
        ,(SUM(CASE WHEN 再分享群聊回流uv > 0 THEN 再分享群聊回流uv ELSE 0 END)
          + SUM(CASE WHEN 再分享单聊回流uv > 0 THEN 再分享单聊回流uv ELSE 0 END)
         ) / (COUNT(DISTINCT mid) + 10) AS 再分享回流率
        -- Fix: clamp negatives and default to 0, matching the total rate above.
        -- The old expressions summed raw values, so negative uv leaked in and
        -- a group with no matching rows yielded NULL (SUM over zero rows is
        -- NULL, and NULL + x = NULL).
        ,(SUM(CASE WHEN 是否原视频 = '是' AND 再分享群聊回流uv > 0 THEN 再分享群聊回流uv ELSE 0 END)
          + SUM(CASE WHEN 是否原视频 = '是' AND 再分享单聊回流uv > 0 THEN 再分享单聊回流uv ELSE 0 END)
         ) / (COUNT(DISTINCT mid) + 10) AS 原视频再分享回流率
        ,(SUM(CASE WHEN 是否原视频 = '否' AND 再分享群聊回流uv > 0 THEN 再分享群聊回流uv ELSE 0 END)
          + SUM(CASE WHEN 是否原视频 = '否' AND 再分享单聊回流uv > 0 THEN 再分享单聊回流uv ELSE 0 END)
         ) / (COUNT(DISTINCT mid) + 10) AS 推荐再分享回流率
        ,SUM(CASE WHEN 再分享群聊回流uv > 0 THEN 再分享群聊回流uv ELSE 0 END)
         + SUM(CASE WHEN 再分享单聊回流uv > 0 THEN 再分享单聊回流uv ELSE 0 END) AS 再分享回流uv
FROM    loghubods.opengid_base_data
WHERE   dt >= ${start}
AND     dt <= ${end}
AND     usersharedepth = 0          -- first-hop shares only
AND     videoid IS NOT NULL
AND     (文章标题 IS NOT NULL AND 文章标题 != '' OR 分享标题 IS NOT NULL AND 分享标题 != '')
GROUP BY dt
         ,channel
         ,公众号名
         ,rootsourceid
         ,文章标题
         ,分享标题
         ,videoid
         ,title
         ,merge一级品类
         ,merge二级品类
ORDER BY 点击uv DESC
;