Jelajahi Sumber

增加每日上限

xueyiming 3 minggu lalu
induk
melakukan
627c7932cc

+ 5 - 0
app/core/config.py

@@ -146,6 +146,7 @@ class Settings:
     demand_pool_top_n: int = 200
     hot_demand_pool_strategy: str = "新热事件"
     wxindex_score_threshold: float = 1_000_000.0
+    odps_daily_write_limit: int = 100
 
     postprocess_batch_size: int = 20
     contribution_match_llm_model: str = ""
@@ -285,6 +286,10 @@ class Settings:
                     ),
                 ),
             ),
+            odps_daily_write_limit=_env_int(
+                "ODPS_DAILY_WRITE_LIMIT",
+                defaults.odps_daily_write_limit,
+            ),
             postprocess_batch_size=_env_int(
                 "POSTPROCESS_BATCH_SIZE",
                 defaults.postprocess_batch_size,

+ 4 - 0
app/hot_content/config.py

@@ -198,6 +198,10 @@ def load_flow_config(interval_override: int | None = None) -> FlowConfig:
                 ),
             ),
         ),
+        odps_daily_write_limit=_get_env_int(
+            "ODPS_DAILY_WRITE_LIMIT",
+            settings.odps_daily_write_limit,
+        ),
         postprocess_batch_size=_get_env_int(
             "POSTPROCESS_BATCH_SIZE",
             settings.postprocess_batch_size,

+ 68 - 15
app/hot_content/demand_pool_writer.py

@@ -27,12 +27,36 @@ def _escape_sql_string(value: str) -> str:
     return value.replace("'", "''")
 
 
+def apply_odps_daily_write_limit(
+    pending_rows: list[dict[str, Any]],
+    *,
+    existing_count: int,
+    daily_limit: int,
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]:
+    """Cap pending rows at the remaining daily quota; return (rows to write, rows cut by the limit, limit metadata). daily_limit <= 0 means no limit."""
+    limit_meta: dict[str, Any] = {
+        "daily_write_limit": daily_limit if daily_limit > 0 else None,  # None is reported when no limit applies
+        "daily_written_count": existing_count,
+    }
+    if daily_limit <= 0:
+        limit_meta["daily_remaining_quota"] = None
+        return pending_rows, [], limit_meta  # unlimited: every pending row is kept, nothing is cut
+
+    remaining_quota = daily_limit - existing_count  # negative when existing_count already exceeds the limit
+    limit_meta["daily_remaining_quota"] = max(remaining_quota, 0)  # the reported quota is clamped at 0
+    if remaining_quota <= 0:
+        return [], list(pending_rows), limit_meta  # quota exhausted: all pending rows are cut (returned as a copy)
+    if len(pending_rows) <= remaining_quota:
+        return pending_rows, [], limit_meta  # everything fits within the remaining quota
+    return pending_rows[:remaining_quota], pending_rows[remaining_quota:], limit_meta  # keep the first remaining_quota rows, cut the rest
+
+
 class HotDemandPoolWriter:
     def __init__(self, config: FlowConfig, repository: HotContentRepository):
         self.config = config
         self.repository = repository
 
-    def sync_today(self) -> dict[str, Any]:
+    def plan_today(self) -> dict[str, Any]:
         partition_dt = datetime.now(SHANGHAI_TZ).date().strftime("%Y%m%d")
         strategy = self.config.hot_demand_pool_strategy
         # 仅同步主表 hot_content_records.created_at 为当天的 record,写入当天 ODPS 分区。
@@ -52,6 +76,7 @@ class HotDemandPoolWriter:
             strategy=strategy,
         )
         skip_demand_ids = synced_demand_ids | odps_existing_demand_ids
+        daily_written_count = len(skip_demand_ids)
 
         pending_rows: list[dict[str, Any]] = []
         skipped_rows: list[dict[str, Any]] = []
@@ -62,50 +87,78 @@ class HotDemandPoolWriter:
                 continue
             pending_rows.append(row)
 
+        rows_to_write, limit_skipped_rows, limit_meta = apply_odps_daily_write_limit(
+            pending_rows,
+            existing_count=daily_written_count,
+            daily_limit=self.config.odps_daily_write_limit,
+        )
+
+        return {
+            "partition_dt": partition_dt,
+            "strategy": strategy,
+            "source_record_count": len(export_groups),
+            "candidate_row_count": len(hive_rows),
+            "pending_row_count": len(rows_to_write),
+            "skipped_row_count": len(skipped_rows),
+            "limit_skipped_row_count": len(limit_skipped_rows),
+            "rows_to_write": rows_to_write,
+            "skipped_rows": skipped_rows,
+            "limit_skipped_rows": limit_skipped_rows,
+            "target_table": self.config.demand_pool_source_table,
+            **limit_meta,
+        }
+
+    def sync_today(self) -> dict[str, Any]:
+        plan = self.plan_today()
+        rows_to_write = plan["rows_to_write"]
         written_count = self._insert_partition_rows(
-            hive_rows=pending_rows,
-            partition_dt=partition_dt,
+            hive_rows=rows_to_write,
+            partition_dt=str(plan["partition_dt"]),
         )
         if written_count:
             self.repository.save_odps_sync_logs(
                 [
                     {
-                        "partition_dt": partition_dt,
-                        "strategy": strategy,
+                        "partition_dt": plan["partition_dt"],
+                        "strategy": plan["strategy"],
                         "demand_id": row["demand_id"],
                         "demand_name": row["demand_name"],
                         "demand_type": row["type"],
                         "record_id": row.get("record_id") or 0,
                     }
-                    for row in pending_rows
+                    for row in rows_to_write
                 ]
             )
 
         pending_record_ids = sorted(
             {
                 int(row.get("record_id") or 0)
-                for row in pending_rows
+                for row in rows_to_write
                 if int(row.get("record_id") or 0) > 0
             }
         )
         skipped_record_ids = sorted(
             {
                 int(row.get("record_id") or 0)
-                for row in skipped_rows
+                for row in plan["skipped_rows"] + plan["limit_skipped_rows"]
                 if int(row.get("record_id") or 0) > 0
             }
         )
         return {
-            "partition_dt": partition_dt,
-            "strategy": strategy,
-            "source_record_count": len(export_groups),
-            "candidate_row_count": len(hive_rows),
-            "pending_row_count": len(pending_rows),
-            "skipped_row_count": len(skipped_rows),
+            "partition_dt": plan["partition_dt"],
+            "strategy": plan["strategy"],
+            "source_record_count": plan["source_record_count"],
+            "candidate_row_count": plan["candidate_row_count"],
+            "pending_row_count": plan["pending_row_count"],
+            "skipped_row_count": plan["skipped_row_count"],
+            "limit_skipped_row_count": plan["limit_skipped_row_count"],
+            "daily_write_limit": plan["daily_write_limit"],
+            "daily_written_count": plan["daily_written_count"],
+            "daily_remaining_quota": plan["daily_remaining_quota"],
             "written_count": written_count,
             "pending_record_ids": pending_record_ids,
             "skipped_record_ids": skipped_record_ids,
-            "target_table": self.config.demand_pool_source_table,
+            "target_table": plan["target_table"],
         }
 
     def _list_odps_partition_demand_ids(

+ 1 - 1
app/hot_content/types.py

@@ -42,6 +42,7 @@ class FlowConfig:
     demand_pool_top_n: int
     hot_demand_pool_strategy: str
     wxindex_score_threshold: float
+    odps_daily_write_limit: int
     postprocess_batch_size: int
     contribution_match_llm_model: str
     contribution_match_llm_max_attempts: int
@@ -52,6 +53,5 @@ class FlowConfig:
     wxindex_llm_max_tokens: int
     wxindex_api_url: str
     wxindex_lookback_days: int
-    wxindex_score_threshold: float
     sources: list[HotSourceConfig]
     mysql: MysqlConfig

+ 1 - 0
docker-compose.yml

@@ -30,6 +30,7 @@ services:
       WXINDEX_SCORE_THRESHOLD: ${WXINDEX_SCORE_THRESHOLD:-1000000}
       DEMAND_POOL_SOURCE_TABLE: ${DEMAND_POOL_SOURCE_TABLE:-dwd_multi_demand_pool_di}
       HOT_DEMAND_POOL_STRATEGY: ${HOT_DEMAND_POOL_STRATEGY:-新热事件}
+      ODPS_DAILY_WRITE_LIMIT: ${ODPS_DAILY_WRITE_LIMIT:-100}
     # 如需挂载本地 .env,可取消注释:
     # env_file:
     #   - .env