|
|
@@ -0,0 +1,150 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# coding=utf-8
|
|
|
+"""
|
|
|
+素材字段分析
|
|
|
+分析各渠道的素材字段填充情况(按天)
|
|
|
+素材字段:文章标题、卡片标题、卡片封面
|
|
|
+"""
|
|
|
+import pandas as pd
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
# Directory layout: this script lives next to an "output" folder that
# query.sql fills with CSV exports.
task_dir = Path(__file__).parent
output_dir = task_dir / "output"

# Candidate data files: raw exports only — skip previously generated
# "*_分析*" result files so we never re-analyse our own output.
csv_files = [f for f in output_dir.glob("*.csv") if '_分析' not in f.name]
if not csv_files:
    print("没有找到数据文件,请先运行 query.sql")
    # raise SystemExit rather than the site-provided exit() builtin,
    # which is not guaranteed to exist outside interactive sessions.
    raise SystemExit(1)

# The most recently modified export is analysed.
latest_file = max(csv_files, key=lambda x: x.stat().st_mtime)
df = pd.read_csv(latest_file)

# Buffer of every reported line; flushed to "<stem>_分析.txt" at the end.
lines = []
|
|
|
+
|
|
|
+
|
|
|
def log(text=""):
    """Record *text* in the report buffer and echo it to stdout."""
    lines.append(text)
    print(text)
|
|
|
+
|
|
|
+
|
|
|
log(f"分析文件: {latest_file.name}")
log(f"日期范围: {df['dt'].min()} ~ {df['dt'].max()}")
log()

# ============================================================
# Section 1: daily fill rate of the three material fields,
# summed across all channels.
# ============================================================
log("=" * 70)
log("一、每日素材字段填充率")
log("=" * 70)
log()

daily_totals = df.groupby('dt').agg({
    '记录数': 'sum',
    '有文章标题': 'sum',
    '有卡片标题': 'sum',
    '有卡片封面': 'sum',
}).reset_index()

log(f"{'日期':<12} {'记录数':>12} {'文章标题':>10} {'卡片标题':>10} {'卡片封面':>10}")
log("-" * 70)

for _, day_row in daily_totals.iterrows():
    record_count = day_row['记录数']
    # Guard each ratio so an empty day renders 0.0% instead of dividing
    # by zero.
    art_rate, card_rate, cover_rate = (
        (day_row[col] / record_count if record_count > 0 else 0)
        for col in ('有文章标题', '有卡片标题', '有卡片封面')
    )
    log(f"{day_row['dt']:<12} {int(record_count):>12,} "
        f"{art_rate:>10.1%} {card_rate:>10.1%} {cover_rate:>10.1%}")

log()
|
|
|
+
|
|
|
# ============================================================
# Sections 2 & 3: per-channel daily fill rate for one field.
# The original code duplicated this table verbatim for article title
# and card title; a single helper now renders both.
# ============================================================

def _log_channel_daily_rates(field, min_fill_rate=0.1):
    """Log a channel x date table of daily fill rates for *field*.

    Only channels whose overall fill rate for *field* is at least
    *min_fill_rate* (default 10%) appear; dates with no data for a
    channel print '--'. Emits nothing when no channel qualifies.
    """
    overall = df.groupby('channel').agg({'记录数': 'sum', field: 'sum'})
    overall['填充率'] = overall[field] / overall['记录数']
    channels = overall[overall['填充率'] >= min_fill_rate].index.tolist()
    if not channels:
        return

    dates = sorted(df['dt'].unique())
    # Column headers show only the last 4 characters of the date (MMDD).
    log(f"{'渠道':<25}" + "".join(f"{str(d)[-4:]:>8}" for d in dates))
    log("-" * 70)

    for channel in channels:
        # NOTE(review): assumes one row per (channel, dt); if the query
        # ever returns duplicates, .loc[d] yields a DataFrame and the
        # formatting below breaks — confirm against query.sql.
        by_date = df[df['channel'] == channel].set_index('dt')
        cells = f"{channel:<25}"
        for d in dates:
            if d in by_date.index:
                day = by_date.loc[d]
                rate = day[field] / day['记录数'] if day['记录数'] > 0 else 0
                cells += f"{rate:>8.0%}"
            else:
                cells += f"{'--':>8}"
        log(cells)
    log()


log("=" * 70)
log("二、各渠道每日文章标题填充率")
log("=" * 70)
log()
_log_channel_daily_rates('有文章标题')

log("=" * 70)
log("三、各渠道每日卡片标题填充率")
log("=" * 70)
log()
_log_channel_daily_rates('有卡片标题')
|
|
|
+
|
|
|
# ============================================================
# Section 4: overall summary across all days and channels
# ============================================================
log("=" * 70)
log("四、整体汇总")
log("=" * 70)
log()

total = int(df['记录数'].sum())
log(f"总记录数: {total:,}")
for col in ('有文章标题', '有卡片标题', '有卡片封面'):
    filled = int(df[col].sum())
    # Guard the ratio so an empty dataset reports 0.0% instead of a
    # silent numpy divide-by-zero (the per-day loops guard the same way).
    share = filled / total if total > 0 else 0
    log(f"{col}: {filled:,} ({share:.1%})")
log()

# Persist the report next to the source CSV. The trailing confirmation
# line is printed but intentionally not part of the saved file.
result_file = output_dir / f"{latest_file.stem}_分析.txt"
result_file.write_text("\n".join(lines), encoding='utf-8')
log(f"结果已保存: {result_file}")
|