|
|
@@ -1,10 +1,10 @@
|
|
|
import asyncio
|
|
|
import json
|
|
|
+import logging
|
|
|
import time
|
|
|
from datetime import datetime, timedelta
|
|
|
from typing import Optional, Dict, Any, List
|
|
|
|
|
|
-from app.infra.external import feishu_robot
|
|
|
from app.infra.shared import task_schedule_response
|
|
|
from app.jobs.task_handler import TaskHandler
|
|
|
from app.jobs.task_config import (
|
|
|
@@ -18,9 +18,12 @@ from app.jobs.task_utils import (
|
|
|
TaskConcurrencyError,
|
|
|
TaskUtils,
|
|
|
)
|
|
|
+from app.jobs.task_lifecycle import TaskLifecycleManager
|
|
|
from app.core.config import GlobalConfigSettings
|
|
|
from app.core.database import DatabaseManager
|
|
|
-from app.core.observability import LogService
|
|
|
+from app.core.observability import LogService, AlertService
|
|
|
+
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
class TaskScheduler(TaskHandler):
|
|
|
@@ -43,6 +46,12 @@ class TaskScheduler(TaskHandler):
|
|
|
super().__init__(data, log_service, db_client, trace_id, config)
|
|
|
self.table = TaskUtils.validate_table_name(TaskConstants.TASK_TABLE)
|
|
|
|
|
|
+ async def _send_alert(self, title: str, detail: dict, dedup_key: str = None):
|
|
|
+ """Send an alert through AlertService when an instance is available (no-op otherwise). NOTE(review): the original note says this is decoupled and does not block the main path, yet send_alert is awaited here - confirm."""
|
|
|
+ alert = AlertService.get_instance()
|
|
|
+ if alert:
|
|
|
+ await alert.send_alert(title=title, detail=detail, dedup_key=dedup_key)
|
|
|
+
|
|
|
# ==================== 数据库操作 ====================
|
|
|
|
|
|
async def _insert_or_ignore_task(self, task_name: str, date_str: str) -> None:
|
|
|
@@ -85,7 +94,7 @@ class TaskScheduler(TaskHandler):
|
|
|
query = f"""
|
|
|
UPDATE {self.table}
|
|
|
SET task_status = %s, finish_timestamp = %s
|
|
|
- WHERE trace_id = %s AND task_status = %s
|
|
|
+ WHERE trace_id = %s AND task_status IN (%s, %s)
|
|
|
"""
|
|
|
await self.db_client.async_save(
|
|
|
query=query,
|
|
|
@@ -94,6 +103,7 @@ class TaskScheduler(TaskHandler):
|
|
|
int(time.time()),
|
|
|
self.trace_id,
|
|
|
TaskStatus.PROCESSING,
|
|
|
+ TaskStatus.CANCEL_REQUESTED,
|
|
|
),
|
|
|
)
|
|
|
|
|
|
@@ -148,7 +158,7 @@ class TaskScheduler(TaskHandler):
|
|
|
timeout_tasks=[t["trace_id"] for t in timeout_tasks],
|
|
|
)
|
|
|
|
|
|
- await feishu_robot.bot(
|
|
|
+ await self._send_alert(
|
|
|
title=f"Task Timeout Alert: {task_name}",
|
|
|
detail={
|
|
|
"task_name": task_name,
|
|
|
@@ -162,6 +172,7 @@ class TaskScheduler(TaskHandler):
|
|
|
for t in timeout_tasks
|
|
|
],
|
|
|
},
|
|
|
+ dedup_key=f"timeout_{task_name}",
|
|
|
)
|
|
|
|
|
|
# 可选:自动释放超时任务(需要谨慎使用)
|
|
|
@@ -183,7 +194,7 @@ class TaskScheduler(TaskHandler):
|
|
|
max_concurrent=config.max_concurrent,
|
|
|
)
|
|
|
|
|
|
- await feishu_robot.bot(
|
|
|
+ await self._send_alert(
|
|
|
title=f"Task Concurrency Limit: {task_name}",
|
|
|
detail={
|
|
|
"task_name": task_name,
|
|
|
@@ -191,6 +202,7 @@ class TaskScheduler(TaskHandler):
|
|
|
"max_concurrent": config.max_concurrent,
|
|
|
"active_tasks": [t["trace_id"] for t in active_tasks],
|
|
|
},
|
|
|
+ dedup_key=f"concurrency_{task_name}",
|
|
|
)
|
|
|
|
|
|
raise TaskConcurrencyError(
|
|
|
@@ -266,7 +278,7 @@ class TaskScheduler(TaskHandler):
|
|
|
|
|
|
# 根据错误类型决定是否告警
|
|
|
if config.alert_on_failure:
|
|
|
- await feishu_robot.bot(
|
|
|
+ await self._send_alert(
|
|
|
title=f"Task Failed: {task_name}",
|
|
|
detail={
|
|
|
"task_name": task_name,
|
|
|
@@ -275,12 +287,24 @@ class TaskScheduler(TaskHandler):
|
|
|
"duration": duration,
|
|
|
"retryable": e.retryable,
|
|
|
},
|
|
|
+ dedup_key=f"task_failed_{task_name}_{self.trace_id}",
|
|
|
)
|
|
|
|
|
|
# TODO: 实现重试逻辑
|
|
|
# if e.retryable and retry_count < config.retry_times:
|
|
|
# await self._schedule_retry(task_name, retry_count + 1)
|
|
|
|
|
|
+ except asyncio.CancelledError:
|
|
|
+ # 任务被取消
|
|
|
+ status = TaskStatus.CANCELLED
|
|
|
+ duration = time.time() - start_time
|
|
|
+ await self._log_task_event(
|
|
|
+ "task_cancelled",
|
|
|
+ task_name=task_name,
|
|
|
+ duration=duration,
|
|
|
+ )
|
|
|
+ raise
|
|
|
+
|
|
|
except Exception as e:
|
|
|
# 未知错误
|
|
|
duration = time.time() - start_time
|
|
|
@@ -293,7 +317,7 @@ class TaskScheduler(TaskHandler):
|
|
|
duration=duration,
|
|
|
)
|
|
|
|
|
|
- await feishu_robot.bot(
|
|
|
+ await self._send_alert(
|
|
|
title=f"Task Error: {task_name}",
|
|
|
detail={
|
|
|
"task_name": task_name,
|
|
|
@@ -301,13 +325,22 @@ class TaskScheduler(TaskHandler):
|
|
|
"error": error_detail,
|
|
|
"duration": duration,
|
|
|
},
|
|
|
+ dedup_key=f"task_error_{task_name}_{self.trace_id}",
|
|
|
)
|
|
|
|
|
|
finally:
|
|
|
await self._release_task(status)
|
|
|
+ lifecycle = TaskLifecycleManager.get_instance()
|
|
|
+ if lifecycle:
|
|
|
+ await lifecycle.unregister(self.trace_id)
|
|
|
|
|
|
# 创建后台任务
|
|
|
- asyncio.create_task(_task_wrapper(), name=f"{task_name}_{self.trace_id}")
|
|
|
+ task = asyncio.create_task(
|
|
|
+ _task_wrapper(), name=f"{task_name}_{self.trace_id}"
|
|
|
+ )
|
|
|
+ lifecycle = TaskLifecycleManager.get_instance()
|
|
|
+ if lifecycle:
|
|
|
+ await lifecycle.register(self.trace_id, task)
|
|
|
|
|
|
return await task_schedule_response.success_response(
|
|
|
task_name=task_name,
|
|
|
@@ -334,12 +367,15 @@ class TaskScheduler(TaskHandler):
|
|
|
"""
|
|
|
trace_id = trace_id or self.trace_id
|
|
|
query = f"SELECT * FROM {self.table} WHERE trace_id = %s"
|
|
|
- result = await self.db_client.async_fetch_one(query, (trace_id,))
|
|
|
+ result = await self.db_client.async_fetch_one(query, params=(trace_id,))
|
|
|
return result
|
|
|
|
|
|
async def cancel_task(self, trace_id: Optional[str] = None) -> bool:
|
|
|
"""
|
|
|
- 取消任务(将状态设置为失败)
|
|
|
+ 取消任务
|
|
|
+
|
|
|
+ INIT 状态直接设为 CANCELLED,PROCESSING 状态设为 CANCEL_REQUESTED
|
|
|
+ 等待轮询器检测到信号后取消本地协程
|
|
|
|
|
|
Args:
|
|
|
trace_id: 任务追踪 ID,默认使用当前实例的 trace_id
|
|
|
@@ -350,13 +386,24 @@ class TaskScheduler(TaskHandler):
|
|
|
trace_id = trace_id or self.trace_id
|
|
|
query = f"""
|
|
|
UPDATE {self.table}
|
|
|
- SET task_status = %s, finish_timestamp = %s
|
|
|
+ SET task_status = CASE
|
|
|
+ WHEN task_status = %s THEN %s
|
|
|
+ WHEN task_status = %s THEN %s
|
|
|
+ END,
|
|
|
+ finish_timestamp = CASE
|
|
|
+ WHEN task_status = %s THEN %s
|
|
|
+ ELSE finish_timestamp
|
|
|
+ END
|
|
|
WHERE trace_id = %s AND task_status IN (%s, %s)
|
|
|
"""
|
|
|
result = await self.db_client.async_save(
|
|
|
query,
|
|
|
(
|
|
|
- TaskStatus.FAILED,
|
|
|
+ TaskStatus.INIT,
|
|
|
+ TaskStatus.CANCELLED,
|
|
|
+ TaskStatus.PROCESSING,
|
|
|
+ TaskStatus.CANCEL_REQUESTED,
|
|
|
+ TaskStatus.INIT,
|
|
|
int(time.time()),
|
|
|
trace_id,
|
|
|
TaskStatus.INIT,
|
|
|
@@ -365,7 +412,7 @@ class TaskScheduler(TaskHandler):
|
|
|
)
|
|
|
|
|
|
if result:
|
|
|
- await self._log_task_event("task_cancelled", trace_id=trace_id)
|
|
|
+ await self._log_task_event("task_cancel_requested", trace_id=trace_id)
|
|
|
|
|
|
return bool(result)
|
|
|
|