#!/usr/bin/env python
# coding=utf-8
"""
素材字段填充率分析 (material field fill-rate analysis)

分析各渠道的素材字段填充情况(文章标题、卡片标题、卡片封面、长文封面)

Analyzes per-channel fill rates of the material fields: article title,
card title, card cover, and long-article cover.
"""
from pathlib import Path

import pandas as pd

# --- Locate and load the input data --------------------------------------
# The upstream SQL export drops one or more CSV files into this directory.
task_dir = Path(__file__).parent
data_dir = task_dir / "output" / "01_素材字段填充率"

csv_files = list(data_dir.glob("*.csv"))
if not csv_files:
    # Fail fast with a clear message when the data has not been exported yet.
    print("没有找到数据文件,请先运行 SQL 获取数据")
    # `raise SystemExit(1)` instead of `exit(1)`: the bare `exit()` helper is
    # injected by the `site` module and is not guaranteed to exist in every
    # runtime (e.g. when run with `python -S`); SystemExit is always available.
    raise SystemExit(1)

# 读取所有数据 -- read every CSV (sorted for a deterministic concat order)
# and combine into a single frame.
dfs = [pd.read_csv(f) for f in sorted(csv_files)]
df = pd.concat(dfs, ignore_index=True)
# Every logged line is also collected here so the whole report can be
# written to a text file at the end of the script.
lines = []


def log(text=""):
    """Record *text* for the final report file and echo it to stdout."""
    lines.append(text)
    print(text)
# Report the covered date range and the grand total before the sections.
first_day, last_day = df['dt'].min(), df['dt'].max()
record_total = df['记录数'].sum()
log(f"日期范围: {first_day} ~ {last_day}")
log(f"总记录数: {record_total:,}")
log()
# ============================================================
# 一、整体填充率 -- Section 1: overall fill rate per field
# ============================================================
log("=" * 80)
log("一、整体素材字段填充率")
log("=" * 80)
log()

# Grand total of records; also reused by the per-channel share column below.
total = df['记录数'].sum()
log(f"{'字段':<12} {'有值数量':>15} {'填充率':>10}")
log("-" * 40)
# The four count columns all follow the "有<label>" naming convention, so one
# loop replaces four copy-pasted format lines; the display label is the column
# name with the leading "有" ("has") character stripped.
for _count_col in ('有文章标题', '有卡片标题', '有卡片封面', '有长文封面'):
    _filled = df[_count_col].sum()
    log(f"{_count_col[1:]:<12} {int(_filled):>15,} {_filled / total:>10.2%}")
log()
# ============================================================
# 二、按渠道汇总 -- Section 2: per-channel summary
# ============================================================
log("=" * 80)
log("二、各渠道素材字段填充率")
log("=" * 80)
log()

# Sum every count column per channel, largest channels first.
# `by_ch` is also reused by sections 4 and 5 to pick eligible channels.
by_ch = df.groupby('channel').agg({
    '记录数': 'sum',
    '有文章标题': 'sum',
    '有卡片标题': 'sum',
    '有卡片封面': 'sum',
    '有长文封面': 'sum',
}).sort_values('记录数', ascending=False)

log(f"{'渠道':<26} {'记录数':>12} {'占比':>8} {'文章标题':>10} {'卡片标题':>10} {'卡片封面':>10} {'长文封面':>10}")
log("-" * 98)

for channel_name, channel_row in by_ch.iterrows():
    count = channel_row['记录数']
    share = count / total
    # Guard against a zero-record channel to avoid division by zero.
    if count > 0:
        rates = [channel_row[c] / count
                 for c in ('有文章标题', '有卡片标题', '有卡片封面', '有长文封面')]
    else:
        rates = [0, 0, 0, 0]
    log(f"{channel_name:<26} {int(count):>12,} {share:>8.1%} "
        f"{rates[0]:>10.1%} {rates[1]:>10.1%} {rates[2]:>10.1%} {rates[3]:>10.1%}")

log()
# ============================================================
# 三、每日趋势 -- Section 3: daily fill-rate trend
# ============================================================
log("=" * 80)
log("三、每日填充率趋势")
log("=" * 80)
log()

# Sum every count column per day.
daily = (
    df.groupby('dt')
      .agg({'记录数': 'sum',
            '有文章标题': 'sum',
            '有卡片标题': 'sum',
            '有卡片封面': 'sum',
            '有长文封面': 'sum'})
      .reset_index()
)

log(f"{'日期':<12} {'记录数':>12} {'文章标题':>10} {'卡片标题':>10} {'卡片封面':>10} {'长文封面':>10}")
log("-" * 70)

for _, day_row in daily.iterrows():
    day_total = day_row['记录数']
    # Guard against a zero-record day to avoid division by zero.
    if day_total > 0:
        fill = [day_row[c] / day_total
                for c in ('有文章标题', '有卡片标题', '有卡片封面', '有长文封面')]
    else:
        fill = [0, 0, 0, 0]
    log(f"{day_row['dt']:<12} {int(day_total):>12,} "
        f"{fill[0]:>10.1%} {fill[1]:>10.1%} {fill[2]:>10.1%} {fill[3]:>10.1%}")

log()
# ============================================================
# 四/五、各渠道每日填充率 -- Sections 4 & 5: per-channel daily fill
# rate of one field, restricted to channels whose overall fill rate
# for that field is at least 10%. The two sections were copy-pasted
# duplicates differing only in the banner and count column, so they
# now share one helper.
# ============================================================


def _log_channel_daily_rates(section_title, count_col):
    """Log one channel x date matrix of daily fill rates.

    section_title -- the banner line for the section (logged verbatim).
    count_col     -- the "有..." count column whose fill rate is shown.

    Channels whose overall fill rate for ``count_col`` is below 10% are
    excluded; dates a channel has no row for are rendered as '--'. When
    no channel qualifies, only the banner is printed (matching the
    original behavior).
    """
    log("=" * 80)
    log(section_title)
    log("=" * 80)
    log()

    eligible = by_ch[by_ch[count_col] / by_ch['记录数'] >= 0.1].index.tolist()
    if not eligible:
        return

    dates = sorted(df['dt'].unique())
    # Column headers show the last four characters of each date (MMDD).
    log(f"{'渠道':<26}" + "".join(f"{str(d)[-4:]:>10}" for d in dates))
    log("-" * (26 + len(dates) * 10))

    for ch in eligible:
        ch_data = df[df['channel'] == ch].set_index('dt')
        cells = [f"{ch:<26}"]
        for d in dates:
            if d in ch_data.index:
                r = ch_data.loc[d]
                rate = r[count_col] / r['记录数'] if r['记录数'] > 0 else 0
                cells.append(f"{rate:>10.1%}")
            else:
                cells.append(f"{'--':>10}")
        log("".join(cells))
    log()


_log_channel_daily_rates("四、各渠道每日文章标题填充率(填充率>=10%的渠道)", '有文章标题')
_log_channel_daily_rates("五、各渠道每日卡片标题填充率(填充率>=10%的渠道)", '有卡片标题')
# 保存结果 -- persist the accumulated report lines to a text file.
result_file = task_dir / "output" / "素材字段填充率_分析.txt"
result_file.write_text("\n".join(lines), encoding='utf-8')
log(f"结果已保存: {result_file}")