
feat(素材视频分析): add material-video match analysis scripts

- analyze_match.py: analyse how well materials pair with videos
  - only records with material info are analysed (article title or card title non-empty)
  - outputs the Top 30 high-performing combinations, a per-material pairing analysis, and a category × channel matrix
  - exports a CSV for filtering in Excel

- analyze_material_fields.py: analyse material-field fill rates per channel
  - shows the material type classification for each channel
  - outputs a summary table of field fill rates (see the usage note below)
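
Usage note (assumes Python 3 with pandas installed, and that query.sql has already exported a CSV into the task's output/ directory; both scripts exit early otherwise):

    python tasks/素材视频维度分析/analyze_match.py
    python tasks/素材视频维度分析/analyze_material_fields.py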

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui, 2 months ago
Parent
Commit 6bfb304135

+ 233 - 0
tasks/素材视频维度分析/analyze_match.py

@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+素材-视频匹配效果分析
+只分析有素材信息的记录(文章标题或卡片标题非空)
+"""
+import sys
+import pandas as pd
+from pathlib import Path
+
+# Locate the latest raw data file (excluding previously generated analysis outputs)
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+csv_files = [f for f in output_dir.glob("*.csv")
+             if '_匹配' not in f.name and '_分析' not in f.name and '_素材' not in f.name]
+if not csv_files:
+    print("没有找到数据文件,请先运行 query.sql")
+    sys.exit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+df = pd.read_csv(latest_file)
+
+lines = []
+
+
+def log(text=""):
+    print(text)
+    lines.append(text)
+
+
+log(f"分析文件: {latest_file.name}")
+log(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+log()
+
+# Keep only records that carry material info (article title or share/card title non-empty)
+df_with_material = df[
+    (df['文章标题'].notna()) | (df['分享标题'].notna())
+].copy()
+
+log("=" * 70)
+log("数据概览(仅有素材的记录)")
+log("=" * 70)
+log(f"总记录数: {len(df)}")
+log(f"有素材的记录: {len(df_with_material)} ({len(df_with_material)/len(df):.1%})")
+log(f"渠道数: {df_with_material['channel'].nunique()}")
+log(f"视频数: {df_with_material['videoid'].nunique()}")
+log()
+
+# Filter out small samples (based on the records that have material)
+df_filtered = df_with_material[df_with_material['点击uv'] >= 100].copy()
+log(f"UV >= 100 的记录数: {len(df_filtered)}")
+log()
+
+# ============================================================
+# 1. Top 30 high-performing material-video combinations
+# ============================================================
+log("=" * 70)
+log("一、高效素材-视频组合 Top 30(UV >= 100)")
+log("=" * 70)
+log()
+
+log("【按再分享回流率排序】")
+log("-" * 70)
+top_by_ror = df_filtered.nlargest(30, '再分享回流率')
+for i, (_, row) in enumerate(top_by_ror.iterrows(), 1):
+    channel = row['channel']
+    gzh_name = f"({row['公众号名']})" if pd.notna(row['公众号名']) else ''
+    article_title = str(row['文章标题'])[:30] if pd.notna(row['文章标题']) else ''
+    card_title = str(row['分享标题'])[:30] if pd.notna(row['分享标题']) else ''
+    video_title = str(row['title'])[:30] if pd.notna(row['title']) else '(无)'
+    category = row['merge一级品类'] if pd.notna(row['merge一级品类']) else ''
+    ror = row['再分享回流率']
+    ror_orig = row['原视频再分享回流率'] if pd.notna(row['原视频再分享回流率']) else 0
+    ror_rec = row['推荐再分享回流率'] if pd.notna(row['推荐再分享回流率']) else 0
+
+    log(f"{i:2}. UV={int(row['点击uv']):>5}")
+    log(f"    渠道: {channel} {gzh_name}")
+    if article_title:
+        log(f"    素材_文章: {article_title}")
+    if card_title:
+        log(f"    素材_卡片: {card_title}")
+    log(f"    视频: {video_title} [{category}]")
+    log(f"    回流率: 总={ror:.1%}, 原视频={ror_orig:.1%}, 推荐={ror_rec:.1%}")
+log()
+
+# ============================================================
+# 2. Material pairing analysis
+# ============================================================
+log("=" * 70)
+log("二、素材搭配分析(素材内最佳 vs 最差视频)")
+log("=" * 70)
+log()
+
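+# Materials (rootsourceid) that were paired with 3+ distinct videos and accumulated 500+ click UV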
+source_video_count = df_with_material.groupby('rootsourceid').agg({
+    'videoid': 'nunique',
+    '点击uv': 'sum'
+}).rename(columns={'videoid': '视频数'})
+multi_video_sources = source_video_count[
+    (source_video_count['视频数'] >= 3) & (source_video_count['点击uv'] >= 500)
+].index.tolist()
+
+log(f"有3+视频且UV>=500的素材数: {len(multi_video_sources)}")
+log()
+
+source_analysis = []
+for source in multi_video_sources[:50]:
+    source_df = df_with_material[df_with_material['rootsourceid'] == source]
+    if len(source_df) < 3:
+        continue
+
+    total_uv = source_df['点击uv'].sum()
+    best = source_df.nlargest(1, '再分享回流率').iloc[0]
+    worst = source_df.nsmallest(1, '再分享回流率').iloc[0]
+    diff = best['再分享回流率'] - worst['再分享回流率']
+
+    source_analysis.append({
+        '文章标题': source_df['文章标题'].iloc[0],
+        '分享标题': source_df['分享标题'].iloc[0],
+        'channel': source_df['channel'].iloc[0],
+        '视频数': len(source_df),
+        '总UV': total_uv,
+        '最佳视频': best['title'],
+        '最佳回流率': best['再分享回流率'],
+        '最差视频': worst['title'],
+        '最差回流率': worst['再分享回流率'],
+        '效果差异': diff
+    })
+
+if source_analysis:
+    source_analysis_df = pd.DataFrame(source_analysis).sort_values('效果差异', ascending=False)
+
+    log("【视频效果差异最大的素材 Top 15】")
+    log("-" * 70)
+
+    for i, (_, row) in enumerate(source_analysis_df.head(15).iterrows(), 1):
+        article_title = str(row['文章标题'])[:35] if pd.notna(row['文章标题']) else ''
+        card_title = str(row['分享标题'])[:35] if pd.notna(row['分享标题']) else ''
+        best_title = str(row['最佳视频'])[:28] if pd.notna(row['最佳视频']) else '(无)'
+        worst_title = str(row['最差视频'])[:28] if pd.notna(row['最差视频']) else '(无)'
+
+        log(f"{i:2}. 渠道: {row['channel']}, 视频数={row['视频数']}, 总UV={int(row['总UV'])}")
+        if article_title:
+            log(f"    素材_文章: {article_title}")
+        if card_title:
+            log(f"    素材_卡片: {card_title}")
+        log(f"    最佳视频: {best_title} → 回流率={row['最佳回流率']:.1%}")
+        log(f"    最差视频: {worst_title} → 回流率={row['最差回流率']:.1%}")
+        log(f"    效果差异: {row['效果差异']:.1%}")
+        log()
+log()
+
+# ============================================================
+# 3. Category × channel performance matrix
+# ============================================================
+log("=" * 70)
+log("三、品类×渠道效果矩阵(仅有素材的记录)")
+log("=" * 70)
+
+pivot_data = df_with_material.groupby(['merge一级品类', 'channel']).agg({
+    '点击uv': 'sum',
+    '再分享回流uv': 'sum'
+}).reset_index()
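+# The +10 added to the denominator acts as a small smoothing term so low-UV cells don't yield inflated ratios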
+pivot_data['回流率'] = pivot_data['再分享回流uv'] / (pivot_data['点击uv'] + 10)
+
+pivot_ror = pivot_data.pivot(index='merge一级品类', columns='channel', values='回流率')
+pivot_uv = pivot_data.pivot(index='merge一级品类', columns='channel', values='点击uv')
+
+min_uv = 500
+valid_categories = pivot_uv[pivot_uv.sum(axis=1) >= min_uv].index
+valid_channels = pivot_uv.columns[pivot_uv.sum(axis=0) >= min_uv]
+
+if len(valid_categories) > 0 and len(valid_channels) > 0:
+    pivot_ror_filtered = pivot_ror.loc[valid_categories, valid_channels]
+    log(f"品类数: {len(valid_categories)}, 渠道数: {len(valid_channels)}")
+    log()
+
+    log("【各品类最佳渠道】")
+    log("-" * 70)
+    for category in valid_categories:
+        row = pivot_ror_filtered.loc[category].dropna()
+        if len(row) == 0:
+            continue
+        best_channel = row.idxmax()
+        best_ror = row.max()
+        cat_name = str(category)[:15] if pd.notna(category) else '(空)'
+        log(f"  {cat_name:<17} → {best_channel:<25} 回流率={best_ror:.1%}")
+log()
+
+# ============================================================
+# Save results
+# ============================================================
+result_file = output_dir / f"{latest_file.stem}_匹配分析.txt"
+with open(result_file, 'w', encoding='utf-8') as f:
+    f.write("\n".join(lines))
+log(f"分析结果已保存到: {result_file}")
+
+# ============================================================
+# Export CSV (records with material only)
+# ============================================================
+log()
+log("=" * 70)
+log("导出 CSV 文件")
+log("=" * 70)
+
+export_df = df_filtered.copy()
+
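+# Desired export columns; narrowed below to the ones actually present in the data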
+output_cols = [
+    'channel', '公众号名',
+    '文章标题', '分享标题',
+    'title', 'merge一级品类', 'merge二级品类',
+    '点击uv', '进入推荐率', '再分享回流率', '原视频再分享回流率', '推荐再分享回流率', '再分享回流uv',
+    'rootsourceid', 'videoid'
+]
+output_cols = [c for c in output_cols if c in export_df.columns]
+export_df = export_df[output_cols]
+
+export_df = export_df.rename(columns={
+    'channel': '渠道',
+    '公众号名': '渠道_公众号名',
+    '文章标题': '素材_文章标题',
+    '分享标题': '素材_卡片标题',
+    'title': '视频_标题',
+    'merge一级品类': '视频_一级品类',
+    'merge二级品类': '视频_二级品类'
+})
+
+export_df = export_df.sort_values('点击uv', ascending=False)
+
+csv_file = output_dir / f"{latest_file.stem}_素材视频匹配.csv"
+export_df.to_csv(csv_file, index=False, encoding='utf-8-sig')
+
+log(f"CSV 已保存: {csv_file}")
+log(f"共 {len(export_df)} 条记录(有素材且 UV >= 100)")

+ 218 - 0
tasks/素材视频维度分析/analyze_material_fields.py

@@ -0,0 +1,218 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+分析各渠道素材字段填充情况
+素材字段:文章标题、分享标题、分享封面、公众号名、wx_sn、contenturl
+"""
+import sys
+import pandas as pd
+from pathlib import Path
+
+# Locate the latest raw data file (excluding previously generated analysis outputs)
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+csv_files = [f for f in output_dir.glob("*.csv")
+             if '_匹配' not in f.name and '_分析' not in f.name and '_素材' not in f.name]
+if not csv_files:
+    print("没有找到数据文件,请先运行 query.sql")
+    sys.exit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+df = pd.read_csv(latest_file)
+
+# Collect output lines for the result file
+lines = []
+
+
+def log(text=""):
+    print(text)
+    lines.append(text)
+
+
+log(f"分析文件: {latest_file.name}")
+log(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+log()
+
+# Material-related fields as (column name, description) pairs
+material_fields = [
+    ('文章标题', '长文维度'),
+    ('分享标题', '卡片维度'),
+    ('分享封面', '卡片维度'),
+    ('公众号名', '渠道细分'),
+    ('wx_sn', '长文ID'),
+    ('contenturl', '长文链接'),
+    ('rootsourceid', '素材ID'),
+    ('shareid', '卡片ID'),
+]
+
+# Keep only the fields that actually exist in the data
+existing_fields = [(f, desc) for f, desc in material_fields if f in df.columns]
+
+log("=" * 80)
+log("一、各渠道素材字段填充率总览")
+log("=" * 80)
+log()
+
+# Build the header row
+header = f"{'渠道':<28}"
+for field, desc in existing_fields:
+    header += f"{field[:6]:>8}"
+log(header)
+log("-" * 80)
+
+# Per-channel statistics
+channels = df['channel'].unique()
+channel_stats = []
+
+for ch in channels:
+    ch_df = df[df['channel'] == ch]
+    n = len(ch_df)
+
+    row_data = {'渠道': ch, '记录数': n}
+    row_str = f"{ch:<28}"
+
+    for field, desc in existing_fields:
+        rate = ch_df[field].notna().mean()
+        row_data[field] = rate
+        row_str += f"{rate:>8.0%}"
+
+    log(row_str)
+    channel_stats.append(row_data)
+
+log()
+
+# ============================================================
+# 2. Material type classification
+# ============================================================
+log("=" * 80)
+log("二、渠道素材类型分类")
+log("=" * 80)
+log()
+
+# Classification rule: a channel counts as having a field when its fill rate is >= 30%
+log("【有文章标题的渠道】(公众号系,可分析长文素材)")
+log("-" * 60)
+for stat in channel_stats:
+    if stat.get('文章标题', 0) >= 0.3:
+        log(f"  {stat['渠道']:<30} 文章标题填充率={stat['文章标题']:.0%}")
+log()
+
+log("【有卡片标题的渠道】(群/企微系,可分析卡片素材)")
+log("-" * 60)
+for stat in channel_stats:
+    if stat.get('分享标题', 0) >= 0.3:
+        log(f"  {stat['渠道']:<30} 卡片标题填充率={stat['分享标题']:.0%}")
+log()
+
+log("【素材信息缺失的渠道】(只能按视频维度分析)")
+log("-" * 60)
+for stat in channel_stats:
+    art = stat.get('文章标题', 0)
+    card = stat.get('分享标题', 0)
+    if art < 0.3 and card < 0.3:
+        log(f"  {stat['渠道']:<30} 文章={art:.0%}, 卡片={card:.0%}")
+log()
+
+# ============================================================
+# 3. Detailed per-channel field analysis
+# ============================================================
+log("=" * 80)
+log("三、各渠道素材字段详细分析")
+log("=" * 80)
+
+for ch in channels:
+    ch_df = df[df['channel'] == ch]
+    n = len(ch_df)
+
+    log()
+    log(f"【{ch}】({n} 条记录)")
+    log("-" * 60)
+
+    for field, desc in existing_fields:
+        rate = ch_df[field].notna().mean()
+        count = ch_df[field].notna().sum()
+
+        # Status flag by fill rate: >=80% complete, >=30% partial, >0% sparse, 0 none
+        if rate >= 0.8:
+            status = "✓ 完整"
+        elif rate >= 0.3:
+            status = "△ 部分"
+        elif rate > 0:
+            status = "○ 稀少"
+        else:
+            status = "✗ 无"
+
+        log(f"  {field:<12} ({desc:<6}): {rate:>6.1%} ({count}/{n}) {status}")
+
+    # Recommendation based on article/card title fill rates
+    art = ch_df['文章标题'].notna().mean() if '文章标题' in ch_df.columns else 0
+    card = ch_df['分享标题'].notna().mean() if '分享标题' in ch_df.columns else 0
+
+    log()
+    if art >= 0.5:
+        log(f"  → 建议:按「文章标题」分析素材效果")
+    elif card >= 0.5:
+        log(f"  → 建议:按「卡片标题」分析素材效果")
+    elif art >= 0.1 or card >= 0.1:
+        log(f"  → 建议:素材信息不完整,可结合 rootsourceid 分析")
+    else:
+        log(f"  → 建议:无素材信息,只能按视频维度分析")
+
+log()
+
+# ============================================================
+# 4. Material samples per channel
+# ============================================================
+log("=" * 80)
+log("四、各渠道素材示例(有素材的渠道)")
+log("=" * 80)
+
+for ch in channels:
+    ch_df = df[df['channel'] == ch]
+
+    # Does this channel carry any material at all?
+    has_article = ch_df['文章标题'].notna().any() if '文章标题' in ch_df.columns else False
+    has_card = ch_df['分享标题'].notna().any() if '分享标题' in ch_df.columns else False
+
+    if not has_article and not has_card:
+        continue
+
+    log()
+    log(f"【{ch}】")
+    log("-" * 60)
+
+    # Show a few rows that do have material
+    if has_article:
+        sample = ch_df[ch_df['文章标题'].notna()].head(3)
+        log("  文章标题示例:")
+        for _, row in sample.iterrows():
+            art = str(row['文章标题'])[:50]
+            log(f"    - {art}")
+
+    if has_card:
+        sample = ch_df[ch_df['分享标题'].notna()].head(3)
+        log("  卡片标题示例:")
+        for _, row in sample.iterrows():
+            card = str(row['分享标题'])[:50]
+            log(f"    - {card}")
+
+log()
+
+# ============================================================
+# Save results
+# ============================================================
+result_file = output_dir / f"{latest_file.stem}_素材字段分析.txt"
+with open(result_file, 'w', encoding='utf-8') as f:
+    f.write("\n".join(lines))
+
+log(f"分析结果已保存到: {result_file}")
+
+# Also export the per-channel summary table as CSV
+summary_df = pd.DataFrame(channel_stats)
+summary_file = output_dir / f"{latest_file.stem}_素材字段汇总.csv"
+summary_df.to_csv(summary_file, index=False, encoding='utf-8-sig')
+log(f"汇总表已保存到: {summary_file}")