task_lifecycle.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. """
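Task lifecycle manager.
Provides coroutine lifecycle management in a distributed environment, supporting:
- An in-process task registry
- Cross-process cancellation signals backed by MySQL
- A polling mechanism that detects cancellation requests
- Cancellation of all tasks on graceful shutdown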
  8. """
import asyncio
import logging
from typing import Dict, Optional

from app.core.database import DatabaseManager
from app.jobs.task_config import TaskStatus

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
  15. class TaskLifecycleManager:
  16. """任务生命周期管理器(单例)"""
  17. _instance: Optional["TaskLifecycleManager"] = None
  18. def __init__(
  19. self,
  20. db_client: DatabaseManager,
  21. poll_interval: float = 5.0,
  22. force_kill_timeout: float = 10.0,
  23. ):
  24. """
  25. 初始化生命周期管理器
  26. Args:
  27. db_client: 数据库客户端
  28. poll_interval: 轮询间隔(秒)
  29. force_kill_timeout: 强制终止超时(秒)
  30. """
  31. self._registry: Dict[str, asyncio.Task] = {}
  32. self._lock = asyncio.Lock()
  33. self._db = db_client
  34. self._poll_interval = poll_interval
  35. self._force_kill_timeout = force_kill_timeout
  36. self._poll_task: Optional[asyncio.Task] = None
  37. self._shutting_down = False
  38. @classmethod
  39. def initialize(
  40. cls,
  41. db_client: DatabaseManager,
  42. poll_interval: float = 5.0,
  43. force_kill_timeout: float = 10.0,
  44. ) -> "TaskLifecycleManager":
  45. """
  46. 初始化单例实例
  47. Args:
  48. db_client: 数据库客户端
  49. poll_interval: 轮询间隔(秒)
  50. force_kill_timeout: 强制终止超时(秒)
  51. Returns:
  52. TaskLifecycleManager 实例
  53. """
  54. if cls._instance is None:
  55. cls._instance = cls(db_client, poll_interval, force_kill_timeout)
  56. logger.info(
  57. f"TaskLifecycleManager initialized with poll_interval={poll_interval}s"
  58. )
  59. return cls._instance
  60. @classmethod
  61. def get_instance(cls) -> Optional["TaskLifecycleManager"]:
  62. """获取单例实例"""
  63. return cls._instance
  64. async def register(self, trace_id: str, task: asyncio.Task) -> None:
  65. """
  66. 注册任务到生命周期管理器
  67. Args:
  68. trace_id: 任务追踪 ID
  69. task: asyncio.Task 对象
  70. """
  71. async with self._lock:
  72. self._registry[trace_id] = task
  73. logger.debug(f"Task registered: {trace_id}, total={len(self._registry)}")
  74. async def unregister(self, trace_id: str) -> None:
  75. """
  76. 从生命周期管理器注销任务
  77. Args:
  78. trace_id: 任务追踪 ID
  79. """
  80. async with self._lock:
  81. if trace_id in self._registry:
  82. del self._registry[trace_id]
  83. logger.debug(
  84. f"Task unregistered: {trace_id}, total={len(self._registry)}"
  85. )
  86. async def cancel_local(self, trace_id: str) -> bool:
  87. """
  88. 取消本地协程
  89. Args:
  90. trace_id: 任务追踪 ID
  91. Returns:
  92. 是否成功取消
  93. """
  94. async with self._lock:
  95. task = self._registry.get(trace_id)
  96. if not task:
  97. logger.debug(f"Task not found in local registry: {trace_id}")
  98. return False
  99. if task.done():
  100. logger.debug(f"Task already done: {trace_id}")
  101. return False
  102. logger.info(f"Cancelling task: {trace_id}")
  103. task.cancel()
  104. # 等待任务响应取消(带超时)
  105. try:
  106. await asyncio.wait_for(task, timeout=self._force_kill_timeout)
  107. except asyncio.CancelledError:
  108. logger.info(f"Task cancelled successfully: {trace_id}")
  109. except asyncio.TimeoutError:
  110. logger.warning(
  111. f"Task did not respond to cancellation within {self._force_kill_timeout}s: {trace_id}"
  112. )
  113. except Exception as e:
  114. logger.error(f"Error while waiting for task cancellation: {trace_id}, {e}")
  115. return True
  116. async def _poll_loop(self) -> None:
  117. """轮询循环:检查数据库中的取消请求"""
  118. logger.info("Task lifecycle polling loop started")
  119. while not self._shutting_down:
  120. try:
  121. # 查询 CANCEL_REQUESTED 状态的任务
  122. rows = await self._db.async_fetch(
  123. "SELECT trace_id FROM long_articles_task_manager "
  124. "WHERE task_status = %s",
  125. params=(TaskStatus.CANCEL_REQUESTED,),
  126. )
  127. if rows:
  128. # 获取本地注册表的快照
  129. async with self._lock:
  130. local_trace_ids = set(self._registry.keys())
  131. # 取消本地存在的任务
  132. for row in rows:
  133. trace_id = row["trace_id"]
  134. if trace_id in local_trace_ids:
  135. logger.info(
  136. f"Cancel signal detected for task: {trace_id}"
  137. )
  138. await self.cancel_local(trace_id)
  139. except Exception as e:
  140. logger.exception(f"Error in poll loop: {e}")
  141. # 等待下一次轮询
  142. await asyncio.sleep(self._poll_interval)
  143. logger.info("Task lifecycle polling loop stopped")
  144. async def start_polling(self) -> None:
  145. """启动轮询协程"""
  146. if self._poll_task is not None:
  147. logger.warning("Polling already started")
  148. return
  149. self._poll_task = asyncio.create_task(
  150. self._poll_loop(), name="task_lifecycle_poll"
  151. )
  152. logger.info("Task lifecycle polling started")
  153. async def stop_polling(self) -> None:
  154. """停止轮询协程"""
  155. if self._poll_task is None:
  156. return
  157. self._shutting_down = True
  158. self._poll_task.cancel()
  159. try:
  160. await self._poll_task
  161. except asyncio.CancelledError:
  162. pass
  163. self._poll_task = None
  164. logger.info("Task lifecycle polling stopped")
  165. async def shutdown(self, timeout: float = 30.0) -> None:
  166. """
  167. 优雅关闭:取消所有任务并等待完成
  168. Args:
  169. timeout: 等待任务完成的超时时间(秒)
  170. """
  171. logger.info("TaskLifecycleManager shutting down...")
  172. # 获取所有任务的快照
  173. async with self._lock:
  174. tasks = list(self._registry.values())
  175. trace_ids = list(self._registry.keys())
  176. if tasks:
  177. logger.info(f"Cancelling {len(tasks)} running tasks: {trace_ids}")
  178. # 取消所有任务
  179. for task in tasks:
  180. if not task.done():
  181. task.cancel()
  182. # 等待所有任务完成(带超时)
  183. try:
  184. await asyncio.wait_for(
  185. asyncio.gather(*tasks, return_exceptions=True),
  186. timeout=timeout,
  187. )
  188. logger.info("All tasks cancelled successfully")
  189. except asyncio.TimeoutError:
  190. logger.warning(
  191. f"Some tasks did not finish within {timeout}s timeout"
  192. )
  193. else:
  194. logger.info("No running tasks to cancel")
  195. # 停止轮询
  196. await self.stop_polling()
  197. logger.info("TaskLifecycleManager shutdown complete")
# Public API of this module.
__all__ = ["TaskLifecycleManager"]