|
|
@@ -0,0 +1,143 @@
|
|
|
+import datetime
|
|
|
+import hashlib
|
|
|
+import json
|
|
|
+from typing import Dict
|
|
|
+
|
|
|
+from app.core.config import GlobalConfigSettings
|
|
|
+from app.core.database import DatabaseManager
|
|
|
+from app.core.observability import LogService
|
|
|
+from app.infra.internal import delete_illegal_gzh_articles
|
|
|
+from app.infra.shared import run_tasks_with_asyncio_task_group
|
|
|
+
|
|
|
+from ._mapper import RateLimitedArticleMapper
|
|
|
+from ._utils import RateLimitedArticleUtils
|
|
|
+
|
|
|
+
|
|
|
class RateLimitedArticleFilter(RateLimitedArticleMapper):
    """Detect rate-limited (low-read) articles and shield/delete them.

    Workflow per stat duration (``deal`` -> ``process_single_task`` ->
    ``_process_single_article``):

    1. Query candidate articles for the duration via the inherited mapper.
    2. Filter out records belonging to disabled accounts and aggregate the
       remaining rows into per-title records (delegated to
       ``RateLimitedArticleUtils``).
    3. For each hit title, persist a shield record and, on first insert,
       delete the offending article, tracing every step through
       ``trace_log``.
    """

    # Marker written as ``delete_flag`` when an article is removed because
    # it was rate-limited (as opposed to other deletion reasons).
    RATE_LIMITED = 2

    def __init__(
        self,
        pool: DatabaseManager,
        config: GlobalConfigSettings,
        log_service: LogService,
    ):
        """Wire up the DB pool, the rate-limit config section, and utils.

        Args:
            pool: Database connection manager, passed through to the
                ``RateLimitedArticleMapper`` base class.
            config: Global settings; only the ``read_rate_limit`` section
                is kept on this instance.
            log_service: Observability sink used by the utils helper for
                structured trace logging.
        """
        super().__init__(pool=pool)
        # Keep only the sub-config this filter actually consumes.
        self.config = config.read_rate_limit
        self.tool = RateLimitedArticleUtils(log_service=log_service)

    async def _process_single_article(self, data: Dict) -> None:
        """Shield a single aggregated title (async per-item task).

        Saves a shield record keyed by the title's MD5; if that insert
        actually created a new row, deletes the article and emits a
        ``title_shielded`` trace event.

        Args:
            data: Aggregated per-title record. Keys read here: ``title``,
                ``publish_count``, ``low_read_count``, ``low_read_ratio``,
                ``days``, ``trigger_rules``, ``gh_ids`` (non-empty list;
                only the first id is used — TODO confirm with aggregator).

        Raises:
            Exception: any failure is trace-logged with event
                ``process_single_article_failed`` and then re-raised so the
                task-group runner can count it as an error.
        """
        title = data["title"]
        # MD5 of the title serves as a stable dedup key for the shield
        # record (not used for security).
        title_md5 = hashlib.md5(title.encode("utf-8")).hexdigest()
        # Human-readable audit remark (Chinese keys are part of the stored
        # payload — do not translate). NOTE(review): today() is naive local
        # time; confirm the server timezone matches reporting expectations.
        remark = json.dumps(
            {
                "发文数量": data["publish_count"],
                "限流数量": data["low_read_count"],
                "限流比例": data["low_read_ratio"],
                "周期": data["days"],
                "触发规则": data["trigger_rules"],
                "执行日期": datetime.datetime.today().strftime("%Y-%m-%d"),
            },
            ensure_ascii=False,
        )
        try:
            insert_rows = await self.save_record(
                article_tuple=(title_md5, title, remark)
            )
            # Only act when a NEW shield record was inserted; a zero row
            # count means this title was already shielded earlier.
            if insert_rows:
                # presumably the first gh_id is the canonical publishing
                # account for this title — verify against aggregation logic
                gh_id = data["gh_ids"][0]
                await delete_illegal_gzh_articles(
                    gh_id=gh_id, title=title, delete_flag=self.RATE_LIMITED
                )
                await self.tool.trace_log(
                    {
                        "event": "title_shielded",
                        "title": title,
                        "days": data["days"],
                        "trigger_rules": data["trigger_rules"],
                        "gh_id": gh_id,
                        "low_read_count": data["low_read_count"],
                        "publish_count": data["publish_count"],
                    }
                )
        except Exception as e:
            # Trace the failure with context, then re-raise so the caller's
            # task group records it in its error list.
            await self.tool.trace_log(
                {
                    "event": "process_single_article_failed",
                    "title": title,
                    "days": data.get("days"),
                    "status": "error",
                    "message": str(e),
                }
            )
            raise

    async def process_single_task(self, days: int, max_concurrent: int = 5) -> None:
        """Run one full detection/shielding pass for a stat duration.

        Fetches raw candidates, filters disabled accounts, aggregates
        per-title hits, then processes every hit concurrently through
        ``_process_single_article``. Start/complete and per-item errors are
        trace-logged; errors do not abort the pass.

        Args:
            days: Statistics window (in days) used by the mapper query and
                attached to each per-title record.
            max_concurrent: Upper bound on concurrently running per-title
                tasks inside the task group.
        """
        await self.tool.trace_log(
            {
                "event": "period_start",
                "days": days,
            }
        )
        raw_records = await self.find_rate_limited_articles(
            days_duration=days,
        )
        # Drop rows from accounts that are considered disabled per the
        # configured read-average threshold over CONSIST_DAYS.
        effective_records = self.tool.filter_account_disabled_records(
            records=raw_records,
            read_on_avg_threshold=self.config.read_on_avg_threshold,
            consist_days=self.config.CONSIST_DAYS,
        )

        # Collapse row-level records into one entry per offending title,
        # applying the configured thresholds and hit cap.
        aggregated = self.tool.aggregate_rate_limited_titles(
            records=effective_records,
            read_on_avg_threshold=self.config.read_on_avg_threshold,
            base_discover_time=self.config.base_discover_time,
            low_read_rate_threshold=self.config.low_read_rate_threshold,
            max_rate_limited_articles=self.config.MAX_RATE_LIMITED_ARTICLES,
        )
        # Tag each record with the current window so downstream logging and
        # the remark payload know which period triggered it.
        data_list = [{**item, "days": days} for item in aggregated]

        result = await run_tasks_with_asyncio_task_group(
            task_list=data_list,
            handler=self._process_single_article,
            description="执行限流删文处理",
            max_concurrency=max_concurrent,
            unit="per_title",
        )

        await self.tool.trace_log(
            {
                "event": "period_complete",
                "days": days,
                "raw_row_count": len(raw_records),
                "effective_row_count": len(effective_records),
                "hit_title_count": len(data_list),
                "total_task": result["total_task"],
                "processed_task": result["processed_task"],
                "error_count": len(result["errors"]),
            }
        )
        # Surface each per-item failure collected by the task group as its
        # own trace event; expected tuple shape: (index, task_dict, error).
        for _idx, task_obj, err in result["errors"]:
            await self.tool.trace_log(
                {
                    "event": "period_item_error",
                    "days": days,
                    "title": task_obj.get("title"),
                    "status": "error",
                    "message": str(err),
                }
            )

    async def deal(self) -> None:
        """Entry point: run one pass per configured stat duration, serially."""
        await self.tool.trace_log(
            {
                "event": "deal_start",
                "stat_durations": list(self.config.stat_durations),
            }
        )
        # Durations run sequentially; concurrency only exists within a
        # single duration's task group.
        for _day in self.config.stat_durations:
            await self.process_single_task(_day)

        await self.tool.trace_log({"event": "deal_complete"})
|