
feat(素材视频分析): add material-video match analysis scripts

- analyze_match.py: analyse how well materials pair with videos
  - only records with material info are analysed (article title or card title non-empty)
  - outputs the Top 30 high-performing combinations, a per-material pairing analysis, and a category × channel matrix
  - exports a CSV for filtering in Excel

- analyze_material_fields.py: analyse material-field fill rates per channel
  - shows the material type classification for each channel
  - outputs a summary table of field fill rates (see the usage note below)
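
Usage note (assumes Python 3 with pandas installed, and that query.sql has already exported a CSV into the task's output/ directory; both scripts exit early otherwise):

    python tasks/素材视频维度分析/analyze_match.py
    python tasks/素材视频维度分析/analyze_material_fields.py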

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui, 2 months ago
Parent
Commit 6bfb304135

+ 233 - 0
tasks/素材视频维度分析/analyze_match.py

@@ -0,0 +1,233 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+素材-视频匹配效果分析
+只分析有素材信息的记录(文章标题或卡片标题非空)
+"""
+import sys
+import pandas as pd
+from pathlib import Path
+
+# Locate the latest raw data file (excluding previously generated analysis outputs)
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+csv_files = [f for f in output_dir.glob("*.csv")
+             if '_匹配' not in f.name and '_分析' not in f.name and '_素材' not in f.name]
+if not csv_files:
+    print("没有找到数据文件,请先运行 query.sql")
+    sys.exit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+df = pd.read_csv(latest_file)
+
+lines = []
+
+
+def log(text=""):
+    print(text)
+    lines.append(text)
+
+
+log(f"分析文件: {latest_file.name}")
+log(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+log()
+
+# Keep only records that carry material info (article title or share/card title non-empty)
+df_with_material = df[
+    (df['文章标题'].notna()) | (df['分享标题'].notna())
+].copy()
+
+log("=" * 70)
+log("数据概览(仅有素材的记录)")
+log("=" * 70)
+log(f"总记录数: {len(df)}")
+log(f"有素材的记录: {len(df_with_material)} ({len(df_with_material)/len(df):.1%})")
+log(f"渠道数: {df_with_material['channel'].nunique()}")
+log(f"视频数: {df_with_material['videoid'].nunique()}")
+log()
+
+# Filter out small samples (based on the records that have material)
+df_filtered = df_with_material[df_with_material['点击uv'] >= 100].copy()
+log(f"UV >= 100 的记录数: {len(df_filtered)}")
+log()
+
+# ============================================================
+# 1. Top 30 high-performing material-video combinations
+# ============================================================
+log("=" * 70)
+log("一、高效素材-视频组合 Top 30(UV >= 100)")
+log("=" * 70)
+log()
+
+log("【按再分享回流率排序】")
+log("-" * 70)
+top_by_ror = df_filtered.nlargest(30, '再分享回流率')
+for i, (_, row) in enumerate(top_by_ror.iterrows(), 1):
+    channel = row['channel']
+    gzh_name = f"({row['公众号名']})" if pd.notna(row['公众号名']) else ''
+    article_title = str(row['文章标题'])[:30] if pd.notna(row['文章标题']) else ''
+    card_title = str(row['分享标题'])[:30] if pd.notna(row['分享标题']) else ''
+    video_title = str(row['title'])[:30] if pd.notna(row['title']) else '(无)'
+    category = row['merge一级品类'] if pd.notna(row['merge一级品类']) else ''
+    ror = row['再分享回流率']
+    ror_orig = row['原视频再分享回流率'] if pd.notna(row['原视频再分享回流率']) else 0
+    ror_rec = row['推荐再分享回流率'] if pd.notna(row['推荐再分享回流率']) else 0
+
+    log(f"{i:2}. UV={int(row['点击uv']):>5}")
+    log(f"    渠道: {channel} {gzh_name}")
+    if article_title:
+        log(f"    素材_文章: {article_title}")
+    if card_title:
+        log(f"    素材_卡片: {card_title}")
+    log(f"    视频: {video_title} [{category}]")
+    log(f"    回流率: 总={ror:.1%}, 原视频={ror_orig:.1%}, 推荐={ror_rec:.1%}")
+log()
+
+# ============================================================
+# 2. Material pairing analysis
+# ============================================================
+log("=" * 70)
+log("二、素材搭配分析(素材内最佳 vs 最差视频)")
+log("=" * 70)
+log()
+
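+# Materials (rootsourceid) that were paired with 3+ distinct videos and accumulated 500+ click UV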
+source_video_count = df_with_material.groupby('rootsourceid').agg({
+    'videoid': 'nunique',
+    '点击uv': 'sum'
+}).rename(columns={'videoid': '视频数'})
+multi_video_sources = source_video_count[
+    (source_video_count['视频数'] >= 3) & (source_video_count['点击uv'] >= 500)
+].index.tolist()
+
+log(f"有3+视频且UV>=500的素材数: {len(multi_video_sources)}")
+log()
+
+source_analysis = []
+for source in multi_video_sources[:50]:
+    source_df = df_with_material[df_with_material['rootsourceid'] == source]
+    if len(source_df) < 3:
+        continue
+
+    total_uv = source_df['点击uv'].sum()
+    best = source_df.nlargest(1, '再分享回流率').iloc[0]
+    worst = source_df.nsmallest(1, '再分享回流率').iloc[0]
+    diff = best['再分享回流率'] - worst['再分享回流率']
+
+    source_analysis.append({
+        '文章标题': source_df['文章标题'].iloc[0],
+        '分享标题': source_df['分享标题'].iloc[0],
+        'channel': source_df['channel'].iloc[0],
+        '视频数': len(source_df),
+        '总UV': total_uv,
+        '最佳视频': best['title'],
+        '最佳回流率': best['再分享回流率'],
+        '最差视频': worst['title'],
+        '最差回流率': worst['再分享回流率'],
+        '效果差异': diff
+    })
+
+if source_analysis:
+    source_analysis_df = pd.DataFrame(source_analysis).sort_values('效果差异', ascending=False)
+
+    log("【视频效果差异最大的素材 Top 15】")
+    log("-" * 70)
+
+    for i, (_, row) in enumerate(source_analysis_df.head(15).iterrows(), 1):
+        article_title = str(row['文章标题'])[:35] if pd.notna(row['文章标题']) else ''
+        card_title = str(row['分享标题'])[:35] if pd.notna(row['分享标题']) else ''
+        best_title = str(row['最佳视频'])[:28] if pd.notna(row['最佳视频']) else '(无)'
+        worst_title = str(row['最差视频'])[:28] if pd.notna(row['最差视频']) else '(无)'
+
+        log(f"{i:2}. 渠道: {row['channel']}, 视频数={row['视频数']}, 总UV={int(row['总UV'])}")
+        if article_title:
+            log(f"    素材_文章: {article_title}")
+        if card_title:
+            log(f"    素材_卡片: {card_title}")
+        log(f"    最佳视频: {best_title} → 回流率={row['最佳回流率']:.1%}")
+        log(f"    最差视频: {worst_title} → 回流率={row['最差回流率']:.1%}")
+        log(f"    效果差异: {row['效果差异']:.1%}")
+        log()
+log()
+
+# ============================================================
+# 3. Category × channel performance matrix
+# ============================================================
+log("=" * 70)
+log("三、品类×渠道效果矩阵(仅有素材的记录)")
+log("=" * 70)
+
+pivot_data = df_with_material.groupby(['merge一级品类', 'channel']).agg({
+    '点击uv': 'sum',
+    '再分享回流uv': 'sum'
+}).reset_index()
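+# The +10 added to the denominator acts as a small smoothing term so low-UV cells don't yield inflated ratios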
+pivot_data['回流率'] = pivot_data['再分享回流uv'] / (pivot_data['点击uv'] + 10)
+
+pivot_ror = pivot_data.pivot(index='merge一级品类', columns='channel', values='回流率')
+pivot_uv = pivot_data.pivot(index='merge一级品类', columns='channel', values='点击uv')
+
+min_uv = 500
+valid_categories = pivot_uv[pivot_uv.sum(axis=1) >= min_uv].index
+valid_channels = pivot_uv.columns[pivot_uv.sum(axis=0) >= min_uv]
+
+if len(valid_categories) > 0 and len(valid_channels) > 0:
+    pivot_ror_filtered = pivot_ror.loc[valid_categories, valid_channels]
+    log(f"品类数: {len(valid_categories)}, 渠道数: {len(valid_channels)}")
+    log()
+
+    log("【各品类最佳渠道】")
+    log("-" * 70)
+    for category in valid_categories:
+        row = pivot_ror_filtered.loc[category].dropna()
+        if len(row) == 0:
+            continue
+        best_channel = row.idxmax()
+        best_ror = row.max()
+        cat_name = str(category)[:15] if pd.notna(category) else '(空)'
+        log(f"  {cat_name:<17} → {best_channel:<25} 回流率={best_ror:.1%}")
+log()
+
+# ============================================================
+# Save results
+# ============================================================
+result_file = output_dir / f"{latest_file.stem}_匹配分析.txt"
+with open(result_file, 'w', encoding='utf-8') as f:
+    f.write("\n".join(lines))
+log(f"分析结果已保存到: {result_file}")
+
+# ============================================================
+# Export CSV (records with material only)
+# ============================================================
+log()
+log("=" * 70)
+log("导出 CSV 文件")
+log("=" * 70)
+
+export_df = df_filtered.copy()
+
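+# Desired export columns; narrowed below to the ones actually present in the data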
+output_cols = [
+    'channel', '公众号名',
+    '文章标题', '分享标题',
+    'title', 'merge一级品类', 'merge二级品类',
+    '点击uv', '进入推荐率', '再分享回流率', '原视频再分享回流率', '推荐再分享回流率', '再分享回流uv',
+    'rootsourceid', 'videoid'
+]
+output_cols = [c for c in output_cols if c in export_df.columns]
+export_df = export_df[output_cols]
+
+export_df = export_df.rename(columns={
+    'channel': '渠道',
+    '公众号名': '渠道_公众号名',
+    '文章标题': '素材_文章标题',
+    '分享标题': '素材_卡片标题',
+    'title': '视频_标题',
+    'merge一级品类': '视频_一级品类',
+    'merge二级品类': '视频_二级品类'
+})
+
+export_df = export_df.sort_values('点击uv', ascending=False)
+
+csv_file = output_dir / f"{latest_file.stem}_素材视频匹配.csv"
+export_df.to_csv(csv_file, index=False, encoding='utf-8-sig')
+
+log(f"CSV 已保存: {csv_file}")
+log(f"共 {len(export_df)} 条记录(有素材且 UV >= 100)")

+ 218 - 0
tasks/素材视频维度分析/analyze_material_fields.py

@@ -0,0 +1,218 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+分析各渠道素材字段填充情况
+素材字段:文章标题、分享标题、分享封面、公众号名、wx_sn、contenturl
+"""
+import sys
+import pandas as pd
+from pathlib import Path
+
+# Locate the latest raw data file (excluding previously generated analysis outputs)
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+csv_files = [f for f in output_dir.glob("*.csv")
+             if '_匹配' not in f.name and '_分析' not in f.name and '_素材' not in f.name]
+if not csv_files:
+    print("没有找到数据文件,请先运行 query.sql")
+    sys.exit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+df = pd.read_csv(latest_file)
+
+# Collect output lines for the result file
+lines = []
+
+
+def log(text=""):
+    print(text)
+    lines.append(text)
+
+
+log(f"分析文件: {latest_file.name}")
+log(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+log()
+
+# Material-related fields as (column name, description) pairs
+material_fields = [
+    ('文章标题', '长文维度'),
+    ('分享标题', '卡片维度'),
+    ('分享封面', '卡片维度'),
+    ('公众号名', '渠道细分'),
+    ('wx_sn', '长文ID'),
+    ('contenturl', '长文链接'),
+    ('rootsourceid', '素材ID'),
+    ('shareid', '卡片ID'),
+]
+
+# Keep only the fields that actually exist in the data
+existing_fields = [(f, desc) for f, desc in material_fields if f in df.columns]
+
+log("=" * 80)
+log("一、各渠道素材字段填充率总览")
+log("=" * 80)
+log()
+
+# Build the header row
+header = f"{'渠道':<28}"
+for field, desc in existing_fields:
+    header += f"{field[:6]:>8}"
+log(header)
+log("-" * 80)
+
+# Per-channel statistics
+channels = df['channel'].unique()
+channel_stats = []
+
+for ch in channels:
+    ch_df = df[df['channel'] == ch]
+    n = len(ch_df)
+
+    row_data = {'渠道': ch, '记录数': n}
+    row_str = f"{ch:<28}"
+
+    for field, desc in existing_fields:
+        rate = ch_df[field].notna().mean()
+        row_data[field] = rate
+        row_str += f"{rate:>8.0%}"
+
+    log(row_str)
+    channel_stats.append(row_data)
+
+log()
+
+# ============================================================
+# 2. Material type classification
+# ============================================================
+log("=" * 80)
+log("二、渠道素材类型分类")
+log("=" * 80)
+log()
+
+# Classification rule: a channel counts as having a field when its fill rate is >= 30%
+log("【有文章标题的渠道】(公众号系,可分析长文素材)")
+log("-" * 60)
+for stat in channel_stats:
+    if stat.get('文章标题', 0) >= 0.3:
+        log(f"  {stat['渠道']:<30} 文章标题填充率={stat['文章标题']:.0%}")
+log()
+
+log("【有卡片标题的渠道】(群/企微系,可分析卡片素材)")
+log("-" * 60)
+for stat in channel_stats:
+    if stat.get('分享标题', 0) >= 0.3:
+        log(f"  {stat['渠道']:<30} 卡片标题填充率={stat['分享标题']:.0%}")
+log()
+
+log("【素材信息缺失的渠道】(只能按视频维度分析)")
+log("-" * 60)
+for stat in channel_stats:
+    art = stat.get('文章标题', 0)
+    card = stat.get('分享标题', 0)
+    if art < 0.3 and card < 0.3:
+        log(f"  {stat['渠道']:<30} 文章={art:.0%}, 卡片={card:.0%}")
+log()
+
+# ============================================================
+# 3. Detailed per-channel field analysis
+# ============================================================
+log("=" * 80)
+log("三、各渠道素材字段详细分析")
+log("=" * 80)
+
+for ch in channels:
+    ch_df = df[df['channel'] == ch]
+    n = len(ch_df)
+
+    log()
+    log(f"【{ch}】({n} 条记录)")
+    log("-" * 60)
+
+    for field, desc in existing_fields:
+        rate = ch_df[field].notna().mean()
+        count = ch_df[field].notna().sum()
+
+        # Status flag by fill rate: >=80% complete, >=30% partial, >0% sparse, 0 none
+        if rate >= 0.8:
+            status = "✓ 完整"
+        elif rate >= 0.3:
+            status = "△ 部分"
+        elif rate > 0:
+            status = "○ 稀少"
+        else:
+            status = "✗ 无"
+
+        log(f"  {field:<12} ({desc:<6}): {rate:>6.1%} ({count}/{n}) {status}")
+
+    # Recommendation based on article/card title fill rates
+    art = ch_df['文章标题'].notna().mean() if '文章标题' in ch_df.columns else 0
+    card = ch_df['分享标题'].notna().mean() if '分享标题' in ch_df.columns else 0
+
+    log()
+    if art >= 0.5:
+        log(f"  → 建议:按「文章标题」分析素材效果")
+    elif card >= 0.5:
+        log(f"  → 建议:按「卡片标题」分析素材效果")
+    elif art >= 0.1 or card >= 0.1:
+        log(f"  → 建议:素材信息不完整,可结合 rootsourceid 分析")
+    else:
+        log(f"  → 建议:无素材信息,只能按视频维度分析")
+
+log()
+
+# ============================================================
+# 4. Material samples per channel
+# ============================================================
+log("=" * 80)
+log("四、各渠道素材示例(有素材的渠道)")
+log("=" * 80)
+
+for ch in channels:
+    ch_df = df[df['channel'] == ch]
+
+    # Does this channel carry any material at all?
+    has_article = ch_df['文章标题'].notna().any() if '文章标题' in ch_df.columns else False
+    has_card = ch_df['分享标题'].notna().any() if '分享标题' in ch_df.columns else False
+
+    if not has_article and not has_card:
+        continue
+
+    log()
+    log(f"【{ch}】")
+    log("-" * 60)
+
+    # Show a few rows that do have material
+    if has_article:
+        sample = ch_df[ch_df['文章标题'].notna()].head(3)
+        log("  文章标题示例:")
+        for _, row in sample.iterrows():
+            art = str(row['文章标题'])[:50]
+            log(f"    - {art}")
+
+    if has_card:
+        sample = ch_df[ch_df['分享标题'].notna()].head(3)
+        log("  卡片标题示例:")
+        for _, row in sample.iterrows():
+            card = str(row['分享标题'])[:50]
+            log(f"    - {card}")
+
+log()
+
+# ============================================================
+# Save results
+# ============================================================
+result_file = output_dir / f"{latest_file.stem}_素材字段分析.txt"
+with open(result_file, 'w', encoding='utf-8') as f:
+    f.write("\n".join(lines))
+
+log(f"分析结果已保存到: {result_file}")
+
+# Also export the per-channel summary table as CSV
+summary_df = pd.DataFrame(channel_stats)
+summary_file = output_dir / f"{latest_file.stem}_素材字段汇总.csv"
+summary_df.to_csv(summary_file, index=False, encoding='utf-8-sig')
+log(f"汇总表已保存到: {summary_file}")