|
@@ -0,0 +1,214 @@
|
|
|
|
|
"""ODPS queries for the month-by-month (逐月) strategy."""
|
|
|
|
|
+
|
|
|
|
|
+import re
|
|
|
|
|
+from datetime import datetime
|
|
|
|
|
+from zoneinfo import ZoneInfo
|
|
|
|
|
+
|
|
|
|
|
+from app.odps.client import get_odps_client
|
|
|
|
|
+from app.strategies.odps._utils import normalize_scalar, parse_video_list
|
|
|
|
|
+
|
|
|
|
|
# Time zone used to derive the default business date when none is supplied.
SHANGHAI_TZ = ZoneInfo("Asia/Shanghai")

# A date partition is exactly eight digits (YYYYMMDD). re.ASCII stops ``\d``
# from matching non-ASCII digits (e.g. full-width digits), which would
# otherwise pass validation and be spliced into the SQL text verbatim.
_DATE_PARTITION_RE = re.compile(r"^\d{8}$", re.ASCII)
|
|
|
|
|
+
|
|
|
|
|
# Element names removed from the final result set: the demand query filters
# them out with ``NOT IN (...)``. Order is kept as originally declared.
_EXCLUDED_ELEMENTS = (
    # Festival and holiday names.
    "元旦", "腊八节", "小年", "除夕", "春节", "正月初一", "正月初二", "正月初三",
    "正月初四", "正月初五", "情人节", "元宵节", "龙抬头", "妇女节", "植树节", "劳动节",
    "母亲节", "儿童节", "端午节", "父亲节", "建党节", "建军节", "七夕节", "中元节",
    "中秋节", "国庆节", "重阳节", "感恩节", "公祭日", "平安夜", "圣诞节",
    # Solar terms.
    "小寒",
    "大寒", "立春", "雨水", "惊蛰", "春分", "清明", "谷雨", "立夏", "小满", "芒种",
    "夏至", "小暑", "大暑", "立秋", "处暑", "白露", "秋分", "寒露", "霜降", "立冬",
    "小雪", "大雪", "冬至",
    # Greetings and blessing phrases.
    "早上好", "中午好", "下午好", "晚上好", "晚安", "祝福",
    "祝愿", "祝你", "祝贺", "祝大家", "祝您", "祝好运", "祝群主", "祝朋友",
)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _validate_bizdate(bizdate: str) -> str:
    """Return *bizdate* with surrounding whitespace removed, after validation.

    The value is later interpolated directly into SQL text, so it must be
    exactly eight ASCII digits that also form a real calendar date
    (``YYYYMMDD``). A pure shape check would let impossible dates such as
    ``20241345`` or non-ASCII digits through.

    Args:
        bizdate: Business date in ``YYYYMMDD`` form; leading and trailing
            whitespace is ignored.

    Returns:
        The stripped date string.

    Raises:
        ValueError: If the stripped value is not eight ASCII digits or does
            not denote an existing calendar date.
    """
    value = bizdate.strip()
    # The length check matters: strptime alone would accept shorter inputs
    # such as "2024011" by reading one-digit month/day fields.
    is_valid = len(value) == 8 and value.isascii() and value.isdigit()
    if is_valid:
        try:
            datetime.strptime(value, "%Y%m%d")
        except ValueError:
            is_valid = False
    if not is_valid:
        raise ValueError(f"bizdate 须为 YYYYMMDD 格式,当前为 {value!r}")
    return value
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _sql_string_list(values: tuple[str, ...]) -> str:
    """Render *values* as comma-separated, single-quoted SQL string literals.

    Backslashes and single quotes inside an item are backslash-escaped so
    that an item can never terminate its own literal and change the
    surrounding SQL. Items without those characters are rendered exactly as
    ``'item'``.

    Args:
        values: Strings to embed, in output order.

    Returns:
        A string such as ``'a', 'b'``; the empty string when *values* is
        empty.
    """

    def quote(item: str) -> str:
        # Escape the backslash first so the escapes added for quotes are not
        # themselves doubled.
        escaped = item.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    return ", ".join(quote(item) for item in values)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def build_monthly_demands_sql(
    *,
    bizdate: str,
    strategy_label: str,
    view_pv_count: int,
    month_total_pv_threshold: float,
    min_contribution_score: float,
    rov_avg: float,
    min_frequency: int,
) -> str:
    """Build the ODPS SQL that ranks elements by their per-bucket average ROV.

    The query looks at the 360 days ending the day before *bizdate*, splits
    them into 30-day buckets (``ym``), averages non-zero ``rov_t0`` per video
    and bucket, joins videos to their elements, and sums each element's
    qualifying bucket averages into its ``weight``.

    Args:
        bizdate: Business date, ``YYYYMMDD``; validated before use.
        strategy_label: Value emitted in the ``strategy`` column and mixed
            into ``demand_id``; must not be blank.
        view_pv_count: Minimum daily distribution-exposure pv a source row
            needs to be considered.
        month_total_pv_threshold: A (bucket, video) pair is kept only when
            its summed pv is strictly greater than this.
        min_contribution_score: Minimum contribution score for an
            element-to-video tag to be used.
        rov_avg: Minimum per-bucket average ROV for an (element, bucket)
            pair to be kept.
        min_frequency: Minimum number of kept buckets an element must have
            to appear in the result.

    Returns:
        The SQL text.

    Raises:
        ValueError: If *bizdate* is malformed or *strategy_label* is blank.
    """
    bizdate_value = _validate_bizdate(bizdate)
    label = strategy_label.strip()
    if not label:
        raise ValueError("strategy_label cannot be empty")
    # The label lands inside single-quoted SQL literals; escape backslashes
    # and quotes so it cannot terminate the literal. Done before the f-string
    # because f-string expressions cannot contain backslashes on older
    # Python versions.
    label_sql = label.replace("\\", "\\\\").replace("'", "\\'")
    excluded_sql = _sql_string_list(_EXCLUDED_ELEMENTS)

    # Numeric parameters are coerced with int()/float() at interpolation
    # time, so only numeric text can reach the SQL.
    return f"""
WITH biz_day AS (
    SELECT TO_DATE(
        CONCAT(
            SUBSTR('{bizdate_value}', 1, 4), '-',
            SUBSTR('{bizdate_value}', 5, 2), '-',
            SUBSTR('{bizdate_value}', 7, 2)
        )
    ) AS biz_dt
),
yesterday AS (
    SELECT DATE_SUB((SELECT biz_dt FROM biz_day), 1) AS yest
),
window_bounds AS (
    SELECT
        CAST((SELECT yest FROM yesterday) AS DATETIME) AS end_dt,
        CAST(DATE_SUB((SELECT yest FROM yesterday), 359) AS DATETIME) AS start_dt
),
cleaned_video_metrics AS (
    SELECT
        CAST(视频id AS STRING) AS vid,
        CAST(FLOOR(DATEDIFF(
            (SELECT yest FROM yesterday),
            TO_DATE(REGEXP_REPLACE(CAST(dt AS STRING), '-', ''), 'yyyyMMdd')
        ) / 30) AS STRING) AS ym,
        rov_t0,
        COALESCE(`当日分发曝光pv`, 0) AS day_dist_pv
    FROM loghubods.video_dimension_detail_add_column
    WHERE TO_DATE(REGEXP_REPLACE(CAST(dt AS STRING), '-', ''), 'yyyyMMdd')
        BETWEEN (SELECT start_dt FROM window_bounds) AND (SELECT end_dt FROM window_bounds)
        AND COALESCE(`当日分发曝光pv`, 0) >= {int(view_pv_count)}
),
video_monthly_avg_metrics AS (
    SELECT
        ym,
        vid,
        AVG(CASE WHEN rov_t0 = 0 THEN NULL ELSE rov_t0 END) AS vid_avg_rov,
        SUM(day_dist_pv) AS month_total_pv
    FROM cleaned_video_metrics
    GROUP BY ym, vid
    HAVING SUM(day_dist_pv) > {float(month_total_pv_threshold)}
),
tag_vid_dedup AS (
    SELECT DISTINCT
        CAST(vid AS STRING) AS vid,
        原始元素
    FROM loghubods.dwd_topic_decode_result_detail_di
    WHERE dt = MAX_PT('loghubods.dwd_topic_decode_result_detail_di')
        AND 元素维度 = '实质'
        AND 贡献分 >= {float(min_contribution_score)}
),
element_monthly_metrics AS (
    SELECT
        t1.原始元素,
        t2.ym,
        COALESCE(ROUND(AVG(t2.vid_avg_rov), 6), 0) AS month_avg_rov
    FROM tag_vid_dedup t1
    JOIN video_monthly_avg_metrics t2
        ON t1.vid = t2.vid
    GROUP BY t1.原始元素, t2.ym
    HAVING COALESCE(ROUND(AVG(t2.vid_avg_rov), 6), 0) >= {float(rov_avg)}
),
element_total_rov AS (
    SELECT
        原始元素,
        ROUND(SUM(month_avg_rov), 6) AS avg_rov
    FROM element_monthly_metrics
    GROUP BY 原始元素
),
element_vid_dedup AS (
    SELECT DISTINCT
        em.原始元素,
        vm.vid
    FROM element_monthly_metrics em
    JOIN tag_vid_dedup tv
        ON em.原始元素 = tv.原始元素
    JOIN video_monthly_avg_metrics vm
        ON tv.vid = vm.vid
        AND em.ym = vm.ym
),
element_vid_stats AS (
    SELECT
        原始元素,
        COUNT(DISTINCT vid) AS vid_count,
        COLLECT_SET(vid) AS vid_list
    FROM element_vid_dedup
    GROUP BY 原始元素
),
element_freq AS (
    SELECT
        原始元素,
        COUNT(1) AS 频次
    FROM element_monthly_metrics
    GROUP BY 原始元素
)
SELECT
    '{label_sql}' AS strategy,
    md5(CONCAT('{label_sql}', r.原始元素, '{bizdate_value}')) AS demand_id,
    r.原始元素 AS demand_name,
    r.avg_rov AS weight,
    '特征点' AS type,
    COALESCE(v.vid_count, 0) AS video_count,
    v.vid_list AS video_list,
    '{{}}' AS extend
FROM element_total_rov r
LEFT JOIN element_vid_stats v
    ON r.原始元素 = v.原始元素
LEFT JOIN element_freq f
    ON r.原始元素 = f.原始元素
WHERE r.原始元素 NOT IN ({excluded_sql})
    AND COALESCE(f.频次, 0) >= {int(min_frequency)}
ORDER BY weight DESC
"""
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def query_monthly_demands(
    *,
    bizdate: str | None = None,
    strategy_label: str = "逐月",
    view_pv_count: int,
    month_total_pv_threshold: float,
    min_contribution_score: float,
    rov_avg: float,
    min_frequency: int,
) -> list[dict[str, object]]:
    """Run the monthly-demands query on ODPS and return its rows as dicts.

    Args:
        bizdate: Business date, ``YYYYMMDD``. Defaults to the current date
            in the Asia/Shanghai time zone.
        strategy_label: Label passed through to the SQL builder.
        view_pv_count: Passed through to the SQL builder.
        month_total_pv_threshold: Passed through to the SQL builder.
        min_contribution_score: Passed through to the SQL builder.
        rov_avg: Passed through to the SQL builder.
        min_frequency: Passed through to the SQL builder.

    Returns:
        One dict per result record with the keys ``strategy``,
        ``demand_id``, ``demand_name``, ``weight``, ``type``,
        ``video_count``, ``video_list`` and ``extend``.

    Raises:
        ValueError: Propagated from the SQL builder for a malformed
            *bizdate* or a blank *strategy_label*.
    """
    if bizdate is None:
        bizdate = datetime.now(SHANGHAI_TZ).strftime("%Y%m%d")

    sql = build_monthly_demands_sql(
        bizdate=bizdate,
        strategy_label=strategy_label,
        view_pv_count=view_pv_count,
        month_total_pv_threshold=month_total_pv_threshold,
        min_contribution_score=min_contribution_score,
        rov_avg=rov_avg,
        min_frequency=min_frequency,
    )

    instance = get_odps_client().execute_sql(
        sql,
        hints={
            "odps.sql.submit.mode": "script",
            "odps.sql.decimal.odps2": "true",
        },
    )

    with instance.open_reader(tunnel=True) as reader:
        # weight and video_list go through project helpers; presumably they
        # convert ODPS values into plain Python types -- confirm in _utils.
        return [
            {
                "strategy": record["strategy"],
                "demand_id": record["demand_id"],
                "demand_name": record["demand_name"],
                "weight": normalize_scalar(record["weight"]),
                "type": record["type"],
                "video_count": record["video_count"],
                "video_list": parse_video_list(record["video_list"]),
                "extend": record["extend"],
            }
            for record in reader
        ]
|