|
|
@@ -0,0 +1,155 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# coding=utf-8
|
|
|
+"""
|
|
|
+素材+视频维度分析
|
|
|
+"""
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
# Locate the most recently modified CSV export in ./output (next to this
# script) and load it into `df`, which is the input for every section below.
task_dir = Path(__file__).parent
output_dir = task_dir / "output"
csv_files = list(output_dir.glob("*.csv"))
if not csv_files:
    print("没有找到数据文件,请先运行 query.sql")
    # The bare `exit` builtin is injected by the `site` module for interactive
    # sessions and is missing under `python -S` or in frozen builds; raising
    # SystemExit terminates with the same status code in every environment.
    raise SystemExit(1)

# "Latest" means the file with the newest modification time.
latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
df = pd.read_csv(latest_file)
|
|
|
+
|
|
|
# Every reported line is collected here so the full report can be written
# to a text file at the end of the script.
lines = []


def log(text: str = "") -> None:
    """Print *text* to stdout and append it to the module-level ``lines`` list.

    Calling ``log()`` without an argument emits (and records) an empty line.
    """
    print(text)
    lines.append(text)
|
|
|
+
|
|
|
+
|
|
|
# Report header: name of the analysed file and the min/max of its `dt` column.
log(f"分析文件: {latest_file.name}")
dt_column = df['dt']
log(f"时间范围: {dt_column.min()} ~ {dt_column.max()}")
log()
|
|
|
+
|
|
|
# Basic information: total row count plus the number of distinct channels,
# videos and materials (rootsourceid) in the export.
section_rule = "=" * 70
log(section_rule)
log("基本信息")
log(section_rule)
row_count = len(df)
log(f"记录数: {row_count}")
channel_count = df['channel'].nunique()
log(f"渠道数: {channel_count}")
video_count = df['videoid'].nunique()
log(f"视频数: {video_count}")
source_count = df['rootsourceid'].nunique()
log(f"素材数(rootsourceid): {source_count}")
log()
|
|
|
+
|
|
|
# Data volume per channel: non-null videoid count, summed click UV and summed
# re-share reflow UV, ordered by click UV (descending). `channel_stats.index`
# is reused by the per-channel sections that follow.
log("=" * 70)
log("各渠道数据量")
log("=" * 70)
channel_stats = (
    df.groupby('channel')
    .agg({
        'videoid': 'count',
        '点击uv': 'sum',
        '再分享回流uv': 'sum',
    })
    .rename(columns={'videoid': '记录数'})
    .sort_values('点击uv', ascending=False)
)
for ch, row in channel_stats.iterrows():
    record_count = int(row['记录数'])
    click_uv = int(row['点击uv'])
    reflow_uv = int(row['再分享回流uv'])
    log(f" {ch}: {record_count}条, 点击uv={click_uv}, 回流uv={reflow_uv}")
log()
|
|
|
+
|
|
|
# Core metrics per channel, in the same order as `channel_stats`.
log("=" * 70)
log("各渠道核心指标(加权平均)")
log("=" * 70)
log(f"{'渠道':<25} {'进入推荐率':>10} {'再分享回流率':>12} {'原视频质量':>10}")
log("-" * 70)

for channel in channel_stats.index:
    channel_rows = df.loc[df['channel'] == channel]
    click_uv = channel_rows['点击uv']
    total_uv = click_uv.sum()

    # Click-UV-weighted mean of the per-row recommendation-entry rate (the
    # per-row rate itself is already present in the input data).
    if total_uv > 0:
        recommend_rate = (channel_rows['进入推荐率'] * click_uv).sum() / total_uv
    else:
        recommend_rate = 0
    # Reflow rate: summed reflow UV over summed click UV plus a constant 10.
    reflow_rate = channel_rows['再分享回流uv'].sum() / (total_uv + 10)

    # Original-video quality is summarised by its median, with +/-inf values
    # treated as missing.
    quality = channel_rows['原视频质量'].replace([np.inf, -np.inf], np.nan).median()
    if pd.isna(quality):
        quality_text = "N/A"
    else:
        quality_text = f"{quality:.2f}"

    log(f" {channel:<23} {recommend_rate:>10.1%} {reflow_rate:>12.2%} {quality_text:>10}")
log()
|
|
|
+
|
|
|
# Top videos: for each channel, the five rows with the largest click UV.
log("=" * 70)
log("各渠道 Top5 视频(按点击uv)")
log("=" * 70)

for channel in channel_stats.index:
    top_rows = df[df['channel'] == channel].nlargest(5, '点击uv')
    log(f"\n【{channel}】")
    log("-" * 60)
    for _, row in top_rows.iterrows():
        raw_title = row['title']
        # Titles are cut to 30 characters; a missing title gets a placeholder.
        if pd.isna(raw_title):
            title = '(无标题)'
        else:
            title = str(raw_title)[:30]
        log(f" {title:<32} uv={int(row['点击uv']):>6}, 回流率={row['再分享回流率']:.2%}")
log()
|
|
|
+
|
|
|
# Level-1 category distribution: the ten categories with the largest summed
# click UV, each with its reflow rate.
log("=" * 70)
log("一级品类分布(Top 10)")
log("=" * 70)
# dropna=False keeps rows whose category is missing as a group of their own,
# so they are reported under the '(空)' label in the loop below. With
# groupby's default (dropna=True) those rows were silently discarded and the
# '(空)' fallback could never be reached. Requires pandas >= 1.1.
category_stats = df.groupby('merge一级品类', dropna=False).agg({
    'videoid': 'count',
    '点击uv': 'sum',
    '再分享回流uv': 'sum'
}).rename(columns={'videoid': '记录数'})
# Reflow rate per category: summed reflow UV over summed click UV + 10. The
# constant keeps the denominator non-zero for groups without clicks
# (presumably also meant as smoothing for small groups - confirm with owner).
category_stats['回流率'] = category_stats['再分享回流uv'] / (category_stats['点击uv'] + 10)
category_stats = category_stats.sort_values('点击uv', ascending=False).head(10)

for cat, row in category_stats.iterrows():
    # Category names are cut to 20 characters; the NaN group key (missing
    # category) is shown with a placeholder.
    cat_name = str(cat)[:20] if pd.notna(cat) else '(空)'
    log(f" {cat_name:<22} 点击uv={int(row['点击uv']):>8}, 回流率={row['回流率']:.2%}")
log()
|
|
|
+
|
|
|
# Card performance: aggregate per shareid and list the 20 cards with the
# largest summed click UV.
log("=" * 70)
log("卡片效果(shareid 维度,Top 20)")
log("=" * 70)
card_aggregations = {
    'videoid': 'nunique',
    '点击uv': 'sum',
    '再分享回流uv': 'sum',
    '分享标题': 'first',
    'channel': 'first',
}
card_stats = df.groupby('shareid').agg(card_aggregations)
card_stats = card_stats.rename(columns={'videoid': '视频数'})
# Reflow rate per card: summed reflow UV over summed click UV plus 10.
card_stats['回流率'] = card_stats['再分享回流uv'] / (card_stats['点击uv'] + 10)
card_stats = card_stats.sort_values('点击uv', ascending=False).head(20)

for card, row in card_stats.iterrows():
    share_title = row['分享标题']
    # Share titles are cut to 30 characters; a missing one gets a placeholder.
    if pd.isna(share_title):
        title = '(无标题)'
    else:
        title = str(share_title)[:30]
    log(f" {title:<32}")
    log(f" 渠道={row['channel']}, 视频数={int(row['视频数'])}, 点击uv={int(row['点击uv'])}, 回流率={row['回流率']:.2%}")
log()
|
|
|
+
|
|
|
# Material performance: aggregate per rootsourceid and list the 20 materials
# with the largest summed click UV.
log("=" * 70)
log("素材效果(rootsourceid 维度,Top 20)")
log("=" * 70)
# dropna=False keeps rows without a rootsourceid as a group of their own, so
# they are reported under the '(空)' label in the loop below. With groupby's
# default (dropna=True) those rows were silently discarded and the '(空)'
# fallback could never be reached. Requires pandas >= 1.1.
source_stats = df.groupby('rootsourceid', dropna=False).agg({
    'videoid': 'nunique',
    '点击uv': 'sum',
    '再分享回流uv': 'sum',
    'channel': 'first'
}).rename(columns={'videoid': '视频数'})
# Reflow rate per material: summed reflow UV over summed click UV + 10. The
# constant keeps the denominator non-zero for groups without clicks
# (presumably also meant as smoothing for small groups - confirm with owner).
source_stats['回流率'] = source_stats['再分享回流uv'] / (source_stats['点击uv'] + 10)
source_stats = source_stats.sort_values('点击uv', ascending=False).head(20)

for src, row in source_stats.iterrows():
    # Ids are cut to 40 characters; the NaN group key (missing id) is shown
    # with a placeholder.
    src_short = str(src)[:40] if pd.notna(src) else '(空)'
    log(f" {src_short:<42}")
    log(f" 渠道={row['channel']}, 视频数={int(row['视频数'])}, 点击uv={int(row['点击uv'])}, 回流率={row['回流率']:.2%}")
log()
|
|
|
+
|
|
|
# Persist the collected report next to the input CSV as "<csv stem>_分析.txt".
# The confirmation line below is logged after the write, so it is printed but
# not part of the saved file.
result_file = output_dir / f"{latest_file.stem}_分析.txt"
report_text = "\n".join(lines)
result_file.write_text(report_text, encoding='utf-8')

log(f"分析结果已保存到: {result_file}")
|