Pārlūkot izejas kodu

添加素材+视频维度分析任务

链路:长文 → 卡片 → 点击视频 → 进入推荐 → 再分享 → 回流
- query.sql: 按渠道取 Top 500,包含长文/卡片/视频三个维度
- analyze.py: 分析各渠道指标、品类分布、卡片效果、素材效果

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 2 mēneši atpakaļ
vecāks
revīzija
598ec4cffb

+ 155 - 0
tasks/素材视频维度分析/analyze.py

@@ -0,0 +1,155 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+素材+视频维度分析
+"""
+import pandas as pd
+import numpy as np
+from pathlib import Path
+
+# 找到最新的输出文件
+task_dir = Path(__file__).parent
+output_dir = task_dir / "output"
+csv_files = list(output_dir.glob("*.csv"))
+if not csv_files:
+    print("没有找到数据文件,请先运行 query.sql")
+    exit(1)
+
+latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
+df = pd.read_csv(latest_file)
+
+# 输出结果收集
+lines = []
+
+
+def log(text=""):
+    print(text)
+    lines.append(text)
+
+
+log(f"分析文件: {latest_file.name}")
+log(f"时间范围: {df['dt'].min()} ~ {df['dt'].max()}")
+log()
+
+# 基本信息
+log("=" * 70)
+log("基本信息")
+log("=" * 70)
+log(f"记录数: {len(df)}")
+log(f"渠道数: {df['channel'].nunique()}")
+log(f"视频数: {df['videoid'].nunique()}")
+log(f"素材数(rootsourceid): {df['rootsourceid'].nunique()}")
+log()
+
+# 各渠道数据量
+log("=" * 70)
+log("各渠道数据量")
+log("=" * 70)
+channel_stats = df.groupby('channel').agg({
+    'videoid': 'count',
+    '点击uv': 'sum',
+    '再分享回流uv': 'sum'
+}).rename(columns={'videoid': '记录数'})
+channel_stats = channel_stats.sort_values('点击uv', ascending=False)
+for ch, row in channel_stats.iterrows():
+    log(f"  {ch}: {int(row['记录数'])}条, 点击uv={int(row['点击uv'])}, 回流uv={int(row['再分享回流uv'])}")
+log()
+
+# 各渠道核心指标
+log("=" * 70)
+log("各渠道核心指标(加权平均)")
+log("=" * 70)
+log(f"{'渠道':<25} {'进入推荐率':>10} {'再分享回流率':>12} {'原视频质量':>10}")
+log("-" * 70)
+
+for channel in channel_stats.index:
+    ch_df = df[df['channel'] == channel]
+    total_uv = ch_df['点击uv'].sum()
+
+    # 加权平均(进入推荐率已在SQL中计算)
+    进入推荐率 = (ch_df['进入推荐率'] * ch_df['点击uv']).sum() / total_uv if total_uv > 0 else 0
+    再分享回流率 = ch_df['再分享回流uv'].sum() / (total_uv + 10)
+
+    # 原视频质量用中位数
+    原视频质量 = ch_df['原视频质量'].replace([np.inf, -np.inf], np.nan).median()
+    原视频质量_str = f"{原视频质量:.2f}" if pd.notna(原视频质量) else "N/A"
+
+    log(f"  {channel:<23} {进入推荐率:>10.1%} {再分享回流率:>12.2%} {原视频质量_str:>10}")
+log()
+
+# Top 视频
+log("=" * 70)
+log("各渠道 Top5 视频(按点击uv)")
+log("=" * 70)
+
+for channel in channel_stats.index:
+    ch_df = df[df['channel'] == channel].nlargest(5, '点击uv')
+    log(f"\n【{channel}】")
+    log("-" * 60)
+    for _, row in ch_df.iterrows():
+        title = str(row['title'])[:30] if pd.notna(row['title']) else '(无标题)'
+        log(f"  {title:<32} uv={int(row['点击uv']):>6}, 回流率={row['再分享回流率']:.2%}")
+log()
+
+# 品类分布
+log("=" * 70)
+log("一级品类分布(Top 10)")
+log("=" * 70)
+category_stats = df.groupby('merge一级品类').agg({
+    'videoid': 'count',
+    '点击uv': 'sum',
+    '再分享回流uv': 'sum'
+}).rename(columns={'videoid': '记录数'})
+category_stats['回流率'] = category_stats['再分享回流uv'] / (category_stats['点击uv'] + 10)
+category_stats = category_stats.sort_values('点击uv', ascending=False).head(10)
+
+for cat, row in category_stats.iterrows():
+    cat_name = str(cat)[:20] if pd.notna(cat) else '(空)'
+    log(f"  {cat_name:<22} 点击uv={int(row['点击uv']):>8}, 回流率={row['回流率']:.2%}")
+log()
+
+# 卡片效果分析
+log("=" * 70)
+log("卡片效果(shareid 维度,Top 20)")
+log("=" * 70)
+card_stats = df.groupby('shareid').agg({
+    'videoid': 'nunique',
+    '点击uv': 'sum',
+    '再分享回流uv': 'sum',
+    '分享标题': 'first',
+    'channel': 'first'
+}).rename(columns={'videoid': '视频数'})
+card_stats['回流率'] = card_stats['再分享回流uv'] / (card_stats['点击uv'] + 10)
+card_stats = card_stats.sort_values('点击uv', ascending=False).head(20)
+
+for card, row in card_stats.iterrows():
+    title = str(row['分享标题'])[:30] if pd.notna(row['分享标题']) else '(无标题)'
+    log(f"  {title:<32}")
+    log(f"    渠道={row['channel']}, 视频数={int(row['视频数'])}, 点击uv={int(row['点击uv'])}, 回流率={row['回流率']:.2%}")
+log()
+
+# 素材效果分析
+log("=" * 70)
+log("素材效果(rootsourceid 维度,Top 20)")
+log("=" * 70)
+source_stats = df.groupby('rootsourceid').agg({
+    'videoid': 'nunique',
+    '点击uv': 'sum',
+    '再分享回流uv': 'sum',
+    'channel': 'first'
+}).rename(columns={'videoid': '视频数'})
+source_stats['回流率'] = source_stats['再分享回流uv'] / (source_stats['点击uv'] + 10)
+source_stats = source_stats.sort_values('点击uv', ascending=False).head(20)
+
+for src, row in source_stats.iterrows():
+    src_short = str(src)[:40] if pd.notna(src) else '(空)'
+    log(f"  {src_short:<42}")
+    log(f"    渠道={row['channel']}, 视频数={int(row['视频数'])}, 点击uv={int(row['点击uv'])}, 回流率={row['回流率']:.2%}")
+log()
+
+# 保存分析结果
+result_file = output_dir / f"{latest_file.stem}_分析.txt"
+with open(result_file, 'w', encoding='utf-8') as f:
+    f.write("\n".join(lines))
+
+log(f"分析结果已保存到: {result_file}")

+ 68 - 0
tasks/素材视频维度分析/query.sql

@@ -0,0 +1,68 @@
+-- Source material + video dimension analysis
+-- Funnel: long article -> card -> video click -> enter recommendation -> re-share -> return flow
+-- Keeps the top 500 rows per channel (ranked by click UV)
+-- NOTE(review): ${start} / ${end} look like template placeholders substituted
+-- before execution; confirm their expected format against the runner.
+
+SELECT  *
+FROM    (
+            SELECT  dt
+                    ,channel
+                    -- Long-article dimension
+                    ,rootsourceid
+                    ,wx_sn
+                    ,公众号名
+                    ,文章标题
+                    ,contenturl
+                    -- Card dimension
+                    ,shareid
+                    ,分享标题
+                    ,分享封面
+                    -- Video dimension
+                    ,videoid
+                    ,title
+                    ,merge一级品类
+                    ,merge二级品类
+                    -- Core metrics
+                    -- Click UV: distinct mid count within the group
+                    ,COUNT(DISTINCT mid) AS 点击uv
+                    -- Share of distinct mids flagged 是否进入推荐 = '1'
+                    ,COUNT(DISTINCT CASE WHEN 是否进入推荐 = '1' THEN mid END) / COUNT(DISTINCT mid) AS 进入推荐率
+                    -- Re-share return rate: positive group-chat + single-chat return UV
+                    -- over (click UV + 10); the +10 is presumably smoothing for low-traffic rows
+                    ,(SUM(CASE WHEN 再分享群聊回流uv > 0 THEN 再分享群聊回流uv ELSE 0 END)
+                      + SUM(CASE WHEN 再分享单聊回流uv > 0 THEN 再分享单聊回流uv ELSE 0 END)
+                     ) / (COUNT(DISTINCT mid) + 10) AS 再分享回流率
+                    -- Same rate restricted to rows with 是否原视频 = '是'.
+                    -- Unlike the rate above there is no ELSE 0 and no > 0 guard, so a
+                    -- group with no matching rows sums to NULL instead of 0.
+                    ,(SUM(CASE WHEN 是否原视频 = '是' THEN 再分享群聊回流uv END)
+                      + SUM(CASE WHEN 是否原视频 = '是' THEN 再分享单聊回流uv END)
+                     ) / (COUNT(DISTINCT mid) + 10) AS 原视频再分享回流率
+                    -- Same rate restricted to rows with 是否原视频 = '否'
+                    ,(SUM(CASE WHEN 是否原视频 = '否' THEN 再分享群聊回流uv END)
+                      + SUM(CASE WHEN 是否原视频 = '否' THEN 再分享单聊回流uv END)
+                     ) / (COUNT(DISTINCT mid) + 10) AS 推荐再分享回流率
+                    -- Original-video quality: the '是' rate divided by the '否' rate.
+                    -- Both rates share the same (click UV + 10) denominator.
+                    -- NOTE(review): when the '否' sum is 0 this divides by zero; the result
+                    -- is engine-dependent (analyze.py treats +/-inf values as missing).
+                    ,((SUM(CASE WHEN 是否原视频 = '是' THEN 再分享群聊回流uv END)
+                       + SUM(CASE WHEN 是否原视频 = '是' THEN 再分享单聊回流uv END)
+                      ) / (COUNT(DISTINCT mid) + 10)
+                     ) / ((SUM(CASE WHEN 是否原视频 = '否' THEN 再分享群聊回流uv END)
+                           + SUM(CASE WHEN 是否原视频 = '否' THEN 再分享单聊回流uv END)
+                          ) / (COUNT(DISTINCT mid) + 10)
+                     ) AS 原视频质量
+                    -- Absolute re-share return UV (numerator of 再分享回流率)
+                    ,SUM(CASE WHEN 再分享群聊回流uv > 0 THEN 再分享群聊回流uv ELSE 0 END)
+                     + SUM(CASE WHEN 再分享单聊回流uv > 0 THEN 再分享单聊回流uv ELSE 0 END) AS 再分享回流uv
+                    -- Rank within each channel by click UV; the partition is by channel only,
+                    -- so rows from every dt in the range compete for the same 500 slots
+                    ,ROW_NUMBER() OVER (PARTITION BY channel ORDER BY COUNT(DISTINCT mid) DESC) AS rn
+            FROM    loghubods.opengid_base_data
+            WHERE   dt >= ${start}
+            AND     dt <= ${end}
+            AND     usersharedepth = 0
+            AND     videoid IS NOT NULL
+            GROUP BY dt
+                     ,channel
+                     ,rootsourceid
+                     ,wx_sn
+                     ,公众号名
+                     ,文章标题
+                     ,contenturl
+                     ,shareid
+                     ,分享标题
+                     ,分享封面
+                     ,videoid
+                     ,title
+                     ,merge一级品类
+                     ,merge二级品类
+        ) t
+WHERE   rn <= 500
+ORDER BY channel, 点击uv DESC
+;