|
|
@@ -0,0 +1,233 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# coding=utf-8
|
|
|
+"""
|
|
|
+素材-视频匹配效果分析
|
|
|
+只分析有素材信息的记录(文章标题或卡片标题非空)
|
|
|
+"""
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
# Locate the newest raw data export under ./output, skipping files produced
# by previous analysis runs (their names contain the markers filtered below).
task_dir = Path(__file__).parent
output_dir = task_dir / "output"
csv_files = [f for f in output_dir.glob("*.csv")
             if '_匹配' not in f.name and '_分析' not in f.name and '_素材' not in f.name]
if not csv_files:
    print("没有找到数据文件,请先运行 query.sql")
    # raise SystemExit instead of exit(): exit() is injected by the `site`
    # module and is not guaranteed to be available in every interpreter
    # invocation (e.g. `python -S`); SystemExit(1) is the portable form.
    raise SystemExit(1)

# The most recently modified CSV is taken to be the latest export.
latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
df = pd.read_csv(latest_file)
|
|
|
+
|
|
|
# Report buffer: every logged line is kept here and flushed to a .txt file
# at the end of the script.
lines = []


def log(text=""):
    """Record *text* in the report buffer and echo it to stdout."""
    lines.append(text)
    print(text)
|
|
|
+
|
|
|
+
|
|
|
log(f"分析文件: {latest_file.name}")
log(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
log()

# Keep only rows that carry creative/material info: either the article
# title or the share-card title is non-null.
df_with_material = df[
    (df['文章标题'].notna()) | (df['分享标题'].notna())
].copy()

log("=" * 70)
log("数据概览(仅有素材的记录)")
log("=" * 70)
log(f"总记录数: {len(df)}")
# NOTE(review): assumes df is non-empty — an empty export would raise
# ZeroDivisionError here; confirm query.sql always returns rows.
log(f"有素材的记录: {len(df_with_material)} ({len(df_with_material)/len(df):.1%})")
log(f"渠道数: {df_with_material['channel'].nunique()}")
log(f"视频数: {df_with_material['videoid'].nunique()}")
log()

# Drop small samples (threshold applied to the with-material subset only).
df_filtered = df_with_material[df_with_material['点击uv'] >= 100].copy()
log(f"UV >= 100 的记录数: {len(df_filtered)}")
log()
|
|
|
+
|
|
|
# ============================================================
# 1. Top 30 material-video combinations (sorted by re-share return rate)
# ============================================================
log("=" * 70)
log("一、高效素材-视频组合 Top 30(UV >= 100)")
log("=" * 70)
log()

log("【按再分享回流率排序】")
log("-" * 70)
top_by_ror = df_filtered.nlargest(30, '再分享回流率')
for i, (_, row) in enumerate(top_by_ror.iterrows(), 1):
    channel = row['channel']
    # Titles are truncated for display; missing values render as '' so the
    # corresponding line is skipped below.
    gzh_name = f"({row['公众号名']})" if pd.notna(row['公众号名']) else ''
    article_title = str(row['文章标题'])[:30] if pd.notna(row['文章标题']) else ''
    card_title = str(row['分享标题'])[:30] if pd.notna(row['分享标题']) else ''
    video_title = str(row['title'])[:30] if pd.notna(row['title']) else '(无)'
    category = row['merge一级品类'] if pd.notna(row['merge一级品类']) else ''
    ror = row['再分享回流率']
    # Component return rates default to 0 when missing so the summary
    # line below always formats.
    ror_orig = row['原视频再分享回流率'] if pd.notna(row['原视频再分享回流率']) else 0
    ror_rec = row['推荐再分享回流率'] if pd.notna(row['推荐再分享回流率']) else 0

    log(f"{i:2}. UV={int(row['点击uv']):>5}")
    log(f"    渠道: {channel} {gzh_name}")
    if article_title:
        log(f"    素材_文章: {article_title}")
    if card_title:
        log(f"    素材_卡片: {card_title}")
    log(f"    视频: {video_title} [{category}]")
    log(f"    回流率: 总={ror:.1%}, 原视频={ror_orig:.1%}, 推荐={ror_rec:.1%}")
log()
|
|
|
+
|
|
|
# ============================================================
# 2. Material pairing analysis: best vs worst video within each material
# ============================================================
log("=" * 70)
log("二、素材搭配分析(素材内最佳 vs 最差视频)")
log("=" * 70)
log()

# Per material (rootsourceid): count of distinct videos and total click UV.
source_video_count = df_with_material.groupby('rootsourceid').agg({
    'videoid': 'nunique',
    '点击uv': 'sum'
}).rename(columns={'videoid': '视频数'})
# Materials paired with 3+ distinct videos and at least 500 total clicks.
multi_video_sources = source_video_count[
    (source_video_count['视频数'] >= 3) & (source_video_count['点击uv'] >= 500)
].index.tolist()

log(f"有3+视频且UV>=500的素材数: {len(multi_video_sources)}")
log()

source_analysis = []
# Cap at 50 materials to bound runtime and report size.
for source in multi_video_sources[:50]:
    source_df = df_with_material[df_with_material['rootsourceid'] == source]
    if len(source_df) < 3:
        continue

    total_uv = source_df['点击uv'].sum()
    # Single best/worst row by re-share return rate within this material.
    best = source_df.nlargest(1, '再分享回流率').iloc[0]
    worst = source_df.nsmallest(1, '再分享回流率').iloc[0]
    diff = best['再分享回流率'] - worst['再分享回流率']

    source_analysis.append({
        '文章标题': source_df['文章标题'].iloc[0],
        '分享标题': source_df['分享标题'].iloc[0],
        'channel': source_df['channel'].iloc[0],
        '视频数': len(source_df),
        '总UV': total_uv,
        '最佳视频': best['title'],
        '最佳回流率': best['再分享回流率'],
        '最差视频': worst['title'],
        '最差回流率': worst['再分享回流率'],
        '效果差异': diff
    })
|
|
|
+
|
|
|
if source_analysis:
    # Rank materials by the spread between their best and worst video.
    source_analysis_df = pd.DataFrame(source_analysis).sort_values('效果差异', ascending=False)

    log("【视频效果差异最大的素材 Top 15】")
    log("-" * 70)

    for i, (_, row) in enumerate(source_analysis_df.head(15).iterrows(), 1):
        # Truncated display titles; missing values render as '' / '(无)'.
        article_title = str(row['文章标题'])[:35] if pd.notna(row['文章标题']) else ''
        card_title = str(row['分享标题'])[:35] if pd.notna(row['分享标题']) else ''
        best_title = str(row['最佳视频'])[:28] if pd.notna(row['最佳视频']) else '(无)'
        worst_title = str(row['最差视频'])[:28] if pd.notna(row['最差视频']) else '(无)'

        log(f"{i:2}. 渠道: {row['channel']}, 视频数={row['视频数']}, 总UV={int(row['总UV'])}")
        if article_title:
            log(f"    素材_文章: {article_title}")
        if card_title:
            log(f"    素材_卡片: {card_title}")
        log(f"    最佳视频: {best_title} → 回流率={row['最佳回流率']:.1%}")
        log(f"    最差视频: {worst_title} → 回流率={row['最差回流率']:.1%}")
        log(f"    效果差异: {row['效果差异']:.1%}")
        log()
log()
|
|
|
+
|
|
|
# ============================================================
# 3. Category × channel effectiveness matrix
# ============================================================
log("=" * 70)
log("三、品类×渠道效果矩阵(仅有素材的记录)")
log("=" * 70)

pivot_data = df_with_material.groupby(['merge一级品类', 'channel']).agg({
    '点击uv': 'sum',
    '再分享回流uv': 'sum'
}).reset_index()
# +10 in the denominator smooths tiny-sample cells so a handful of clicks
# cannot produce an inflated return rate.
pivot_data['回流率'] = pivot_data['再分享回流uv'] / (pivot_data['点击uv'] + 10)

pivot_ror = pivot_data.pivot(index='merge一级品类', columns='channel', values='回流率')
pivot_uv = pivot_data.pivot(index='merge一级品类', columns='channel', values='点击uv')

# Keep only categories (rows) and channels (columns) with enough total UV.
min_uv = 500
valid_categories = pivot_uv[pivot_uv.sum(axis=1) >= min_uv].index
valid_channels = pivot_uv.columns[pivot_uv.sum(axis=0) >= min_uv]

if len(valid_categories) > 0 and len(valid_channels) > 0:
    pivot_ror_filtered = pivot_ror.loc[valid_categories, valid_channels]
    log(f"品类数: {len(valid_categories)}, 渠道数: {len(valid_channels)}")
    log()

    log("【各品类最佳渠道】")
    log("-" * 70)
    for category in valid_categories:
        # Channels with no data for this category are dropped before idxmax.
        row = pivot_ror_filtered.loc[category].dropna()
        if len(row) == 0:
            continue
        best_channel = row.idxmax()
        best_ror = row.max()
        cat_name = str(category)[:15] if pd.notna(category) else '(空)'
        log(f"  {cat_name:<17} → {best_channel:<25} 回流率={best_ror:.1%}")
log()
|
|
|
+
|
|
|
# ============================================================
# Persist the accumulated textual report
# ============================================================
result_file = output_dir / f"{latest_file.stem}_匹配分析.txt"
# Path.write_text opens, writes and closes in one call (mode 'w', utf-8).
result_file.write_text("\n".join(lines), encoding='utf-8')
log(f"分析结果已保存到: {result_file}")
|
|
|
+
|
|
|
# ============================================================
# Export CSV (rows with material info and UV >= 100 only)
# ============================================================
log()
log("=" * 70)
log("导出 CSV 文件")
log("=" * 70)

export_df = df_filtered.copy()

# Desired column order for the export file.
output_cols = [
    'channel', '公众号名',
    '文章标题', '分享标题',
    'title', 'merge一级品类', 'merge二级品类',
    '点击uv', '进入推荐率', '再分享回流率', '原视频再分享回流率', '推荐再分享回流率', '再分享回流uv',
    'rootsourceid', 'videoid'
]
# Keep only columns that actually exist in this export's schema.
output_cols = [c for c in output_cols if c in export_df.columns]
export_df = export_df[output_cols]

# Prefixed Chinese headers make the source of each column explicit.
export_df = export_df.rename(columns={
    'channel': '渠道',
    '公众号名': '渠道_公众号名',
    '文章标题': '素材_文章标题',
    '分享标题': '素材_卡片标题',
    'title': '视频_标题',
    'merge一级品类': '视频_一级品类',
    'merge二级品类': '视频_二级品类'
})

export_df = export_df.sort_values('点击uv', ascending=False)

csv_file = output_dir / f"{latest_file.stem}_素材视频匹配.csv"
# utf-8-sig writes a BOM so Excel detects the encoding of Chinese headers.
export_df.to_csv(csv_file, index=False, encoding='utf-8-sig')

log(f"CSV 已保存: {csv_file}")
log(f"共 {len(export_df)} 条记录(有素材且 UV >= 100)")
|