Просмотр исходного кода

feat(表洞察): 新增 opengid_base_data 素材字段填充率分析

分析文章标题、卡片标题、卡片封面、长文封面四个字段的填充情况

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
yangxiaohui 1 месяц назад
Родитель
Коммит
deab9057c1

+ 24 - 0
tasks/00_表的洞察/loghubods.opengid_base_data/01_素材字段填充率.sql

@@ -0,0 +1,24 @@
+-- Material-field fill-rate analysis
+-- Reports, per channel, how often the material fields are filled
+-- (article title, card title, card cover, long-article cover).
+-- Usage: python fetch_daily.py "tasks/00_表的洞察/loghubods.opengid_base_data/01_素材字段填充率.sql"
+
+-- Produces one row per (dt, channel). A field counts as "filled" when it is
+-- neither NULL nor the empty string.
+-- Note: the source columns 分享标题 / 分享封面 are reported under the
+-- aliases 卡片标题 / 卡片封面.
+SELECT  dt
+        ,channel
+        ,COUNT(*) AS 记录数
+        -- Coverage: percentage of rows in the group with the field filled, rounded to 2 decimals
+        ,ROUND(SUM(CASE WHEN 文章标题 IS NOT NULL AND 文章标题 != '' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS 文章标题覆盖率
+        ,ROUND(SUM(CASE WHEN 分享标题 IS NOT NULL AND 分享标题 != '' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS 卡片标题覆盖率
+        ,ROUND(SUM(CASE WHEN 分享封面 IS NOT NULL AND 分享封面 != '' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS 卡片封面覆盖率
+        ,ROUND(SUM(CASE WHEN 长文封面 IS NOT NULL AND 长文封面 != '' THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS 长文封面覆盖率
+        -- Counts: number of rows in the group with the field filled
+        ,SUM(CASE WHEN 文章标题 IS NOT NULL AND 文章标题 != '' THEN 1 ELSE 0 END) AS 有文章标题
+        ,SUM(CASE WHEN 分享标题 IS NOT NULL AND 分享标题 != '' THEN 1 ELSE 0 END) AS 有卡片标题
+        ,SUM(CASE WHEN 分享封面 IS NOT NULL AND 分享封面 != '' THEN 1 ELSE 0 END) AS 有卡片封面
+        ,SUM(CASE WHEN 长文封面 IS NOT NULL AND 长文封面 != '' THEN 1 ELSE 0 END) AS 有长文封面
+FROM    loghubods.opengid_base_data
+-- ${dt} is a placeholder; presumably substituted by fetch_daily.py before execution (TODO confirm)
+WHERE   dt = '${dt}'
+-- Only rows with usersharedepth = 0 and a non-NULL videoid are counted
+AND     usersharedepth = 0
+AND     videoid IS NOT NULL
+GROUP BY dt, channel
+-- Largest groups first
+ORDER BY 记录数 DESC
+;

+ 172 - 0
tasks/00_表的洞察/loghubods.opengid_base_data/analyze.py

@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+# coding=utf-8
+"""
+素材字段填充率分析
+分析各渠道的素材字段填充情况(文章标题、卡片标题、卡片封面、长文封面)
+"""
+import pandas as pd
+from pathlib import Path
+
+task_dir = Path(__file__).parent
+data_dir = task_dir / "output" / "01_素材字段填充率"
+
+csv_files = list(data_dir.glob("*.csv"))
+if not csv_files:
+    print("没有找到数据文件,请先运行 SQL 获取数据")
+    exit(1)
+
+# 读取所有数据
+dfs = [pd.read_csv(f) for f in sorted(csv_files)]
+df = pd.concat(dfs, ignore_index=True)
+
+lines = []
+
+
+def log(text=""):
+    print(text)
+    lines.append(text)
+
+
+log(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}")
+log(f"总记录数: {df['记录数'].sum():,}")
+log()
+
+# ============================================================
+# 一、整体填充率
+# ============================================================
+log("=" * 80)
+log("一、整体素材字段填充率")
+log("=" * 80)
+log()
+
+total = df['记录数'].sum()
+log(f"{'字段':<12} {'有值数量':>15} {'填充率':>10}")
+log("-" * 40)
+log(f"{'文章标题':<12} {int(df['有文章标题'].sum()):>15,} {df['有文章标题'].sum()/total:>10.2%}")
+log(f"{'卡片标题':<12} {int(df['有卡片标题'].sum()):>15,} {df['有卡片标题'].sum()/total:>10.2%}")
+log(f"{'卡片封面':<12} {int(df['有卡片封面'].sum()):>15,} {df['有卡片封面'].sum()/total:>10.2%}")
+log(f"{'长文封面':<12} {int(df['有长文封面'].sum()):>15,} {df['有长文封面'].sum()/total:>10.2%}")
+log()
+
+# ============================================================
+# 二、按渠道汇总
+# ============================================================
+log("=" * 80)
+log("二、各渠道素材字段填充率")
+log("=" * 80)
+log()
+
+by_ch = df.groupby('channel').agg({
+    '记录数': 'sum',
+    '有文章标题': 'sum',
+    '有卡片标题': 'sum',
+    '有卡片封面': 'sum',
+    '有长文封面': 'sum'
+}).sort_values('记录数', ascending=False)
+
+log(f"{'渠道':<26} {'记录数':>12} {'占比':>8} {'文章标题':>10} {'卡片标题':>10} {'卡片封面':>10} {'长文封面':>10}")
+log("-" * 98)
+
+for ch, row in by_ch.iterrows():
+    n = row['记录数']
+    pct = n / total
+    art = row['有文章标题'] / n if n > 0 else 0
+    card = row['有卡片标题'] / n if n > 0 else 0
+    cover = row['有卡片封面'] / n if n > 0 else 0
+    long_cover = row['有长文封面'] / n if n > 0 else 0
+    log(f"{ch:<26} {int(n):>12,} {pct:>8.1%} {art:>10.1%} {card:>10.1%} {cover:>10.1%} {long_cover:>10.1%}")
+
+log()
+
+# ============================================================
+# 三、每日趋势
+# ============================================================
+log("=" * 80)
+log("三、每日填充率趋势")
+log("=" * 80)
+log()
+
+daily = df.groupby('dt').agg({
+    '记录数': 'sum',
+    '有文章标题': 'sum',
+    '有卡片标题': 'sum',
+    '有卡片封面': 'sum',
+    '有长文封面': 'sum'
+}).reset_index()
+
+log(f"{'日期':<12} {'记录数':>12} {'文章标题':>10} {'卡片标题':>10} {'卡片封面':>10} {'长文封面':>10}")
+log("-" * 70)
+
+for _, row in daily.iterrows():
+    n = row['记录数']
+    art = row['有文章标题'] / n if n > 0 else 0
+    card = row['有卡片标题'] / n if n > 0 else 0
+    cover = row['有卡片封面'] / n if n > 0 else 0
+    long_cover = row['有长文封面'] / n if n > 0 else 0
+    log(f"{row['dt']:<12} {int(n):>12,} {art:>10.1%} {card:>10.1%} {cover:>10.1%} {long_cover:>10.1%}")
+
+log()
+
+# ============================================================
+# 四、各渠道每日文章标题填充率(填充率>=10%的渠道)
+# ============================================================
+log("=" * 80)
+log("四、各渠道每日文章标题填充率(填充率>=10%的渠道)")
+log("=" * 80)
+log()
+
+channels_with_article = by_ch[by_ch['有文章标题'] / by_ch['记录数'] >= 0.1].index.tolist()
+
+if channels_with_article:
+    dates = sorted(df['dt'].unique())
+    header = f"{'渠道':<26}" + "".join([f"{str(d)[-4:]:>10}" for d in dates])
+    log(header)
+    log("-" * (26 + len(dates) * 10))
+
+    for ch in channels_with_article:
+        ch_data = df[df['channel'] == ch].set_index('dt')
+        row_str = f"{ch:<26}"
+        for d in dates:
+            if d in ch_data.index:
+                r = ch_data.loc[d]
+                rate = r['有文章标题'] / r['记录数'] if r['记录数'] > 0 else 0
+                row_str += f"{rate:>10.1%}"
+            else:
+                row_str += f"{'--':>10}"
+        log(row_str)
+    log()
+
+# ============================================================
+# 五、各渠道每日卡片标题填充率(填充率>=10%的渠道)
+# ============================================================
+log("=" * 80)
+log("五、各渠道每日卡片标题填充率(填充率>=10%的渠道)")
+log("=" * 80)
+log()
+
+channels_with_card = by_ch[by_ch['有卡片标题'] / by_ch['记录数'] >= 0.1].index.tolist()
+
+if channels_with_card:
+    dates = sorted(df['dt'].unique())
+    header = f"{'渠道':<26}" + "".join([f"{str(d)[-4:]:>10}" for d in dates])
+    log(header)
+    log("-" * (26 + len(dates) * 10))
+
+    for ch in channels_with_card:
+        ch_data = df[df['channel'] == ch].set_index('dt')
+        row_str = f"{ch:<26}"
+        for d in dates:
+            if d in ch_data.index:
+                r = ch_data.loc[d]
+                rate = r['有卡片标题'] / r['记录数'] if r['记录数'] > 0 else 0
+                row_str += f"{rate:>10.1%}"
+            else:
+                row_str += f"{'--':>10}"
+        log(row_str)
+    log()
+
+# 保存结果
+result_file = task_dir / "output" / "素材字段填充率_分析.txt"
+with open(result_file, 'w', encoding='utf-8') as f:
+    f.write("\n".join(lines))
+log(f"结果已保存: {result_file}")