99
1010import asyncio
1111from collections .abc import Callable
12- from contextlib import asynccontextmanager
12+ from contextlib import asynccontextmanager , suppress
1313from enum import Enum
1414from http import HTTPStatus
1515from queue import Queue
@@ -71,19 +71,18 @@ def __init__(
7171 ):
7272 @asynccontextmanager
7373 async def lifespan (_ : FastAPI ):
74- self ._monitor_task = asyncio .create_task (
75- LoopMonitor (
76- state_manager = self .state_manager ,
77- loop_manager = self .loop_manager ,
78- restart_callback = self .restart_loop ,
79- wake_queue = self .wake_queue ,
80- fastloop_instance = self ,
81- ).run ()
82- )
74+ self ._stopping = False
75+ self ._start_monitor (reason = "lifespan" )
8376
8477 yield
8578
86- self ._monitor_task .cancel ()
79+ self ._stopping = True
80+ if self ._monitor_restart_task :
81+ self ._monitor_restart_task .cancel ()
82+ if self ._monitor_task :
83+ self ._monitor_task .cancel ()
84+ with suppress (asyncio .CancelledError ):
85+ await self ._monitor_task
8786 await self .loop_manager .stop_all ()
8887 await self .workflow_manager .stop_all ()
8988 await self .task_manager .stop_all ()
@@ -108,6 +107,9 @@ async def lifespan(_: FastAPI):
108107 self .workflow_manager : WorkflowManager = WorkflowManager (self .state_manager )
109108 self .task_manager : TaskManager = TaskManager (self .state_manager )
110109 self ._monitor_task : asyncio .Task [None ] | None = None
110+ self ._monitor_restart_task : asyncio .Task [None ] | None = None
111+ self ._monitor_restart_delay_s : float = 0.5
112+ self ._stopping : bool = False
111113 self ._loop_start_func : Callable [[LoopContext ], None ] | None = None
112114 self ._loop_metadata : dict [str , dict [str , Any ]] = {}
113115 self ._workflow_metadata : dict [str , dict [str , Any ]] = {}
@@ -137,6 +139,58 @@ async def events_history_endpoint(entity_id: str): # type: ignore
137139 async def events_sse_endpoint (entity_id : str ): # type: ignore
138140 return await self .loop_manager .events_sse (entity_id )
139141
@self.middleware("http")
async def _ensure_monitor_running(request, call_next):  # type: ignore
    """HTTP middleware: make sure a monitor task exists before serving.

    If no monitor task has been created yet, or the last one has
    finished, ask ``_start_monitor`` to launch one; then pass the
    request on unchanged.
    """
    monitor = self._monitor_task
    monitor_alive = monitor is not None and not monitor.done()
    if not monitor_alive:
        self._start_monitor(reason="middleware_safety_net")
    response = await call_next(request)
    return response
147+
def _start_monitor(self, *, reason: str) -> None:
    """Launch the LoopMonitor task unless stopping or one is already live.

    Args:
        reason: Short tag describing who requested the start; attached
            to the log record under the ``reason`` key.
    """
    current = self._monitor_task
    already_running = current is not None and not current.done()
    if self._stopping or already_running:
        return

    logger.info("Starting LoopMonitor", extra={"reason": reason})
    monitor = LoopMonitor(
        state_manager=self.state_manager,
        loop_manager=self.loop_manager,
        restart_callback=self.restart_loop,
        wake_queue=self.wake_queue,
        fastloop_instance=self,
    )
    task = asyncio.create_task(monitor.run())
    # The callback lets us notice when the monitor ends for any reason.
    task.add_done_callback(self._on_monitor_done)
    self._monitor_task = task
164+
def _on_monitor_done(self, task: asyncio.Task[Any]) -> None:
    """Done-callback for the monitor task: log why it ended, then restart.

    Does nothing while ``self._stopping`` is set. Otherwise the outcome
    of ``task`` is logged (cancelled, returned normally, or raised) and
    ``_schedule_monitor_restart`` is called.

    Args:
        task: The finished monitor task handed in by asyncio.
    """
    if self._stopping:
        return

    # ``Task.exception()`` raises CancelledError for a cancelled task, so
    # check for cancellation explicitly instead of suppressing around the
    # whole logging section.
    if task.cancelled():
        logger.warning("LoopMonitor cancelled unexpectedly; restarting")
    else:
        exc = task.exception()
        if exc is None:
            logger.warning("LoopMonitor stopped unexpectedly; restarting")
        else:
            # exc_info keeps the traceback; str(exc) alone is often empty
            # or too terse to diagnose the crash.
            logger.error(
                "LoopMonitor crashed; restarting",
                extra={"error": str(exc)},
                exc_info=exc,
            )
    self._schedule_monitor_restart()
175+
def _schedule_monitor_restart(self) -> None:
    """Schedule a delayed monitor restart with exponential backoff.

    No-op while ``self._stopping`` is set or while a previously scheduled
    restart has not finished. Each scheduled restart sleeps for the
    current ``_monitor_restart_delay_s`` and then calls
    ``_start_monitor``; the stored delay is doubled (capped at 10 s) for
    the next failure.

    The backoff is reset to its starting value when the previous restart
    was scheduled long enough ago that the monitor must have been running
    healthily in between. Without this, the delay would stay at the cap
    for the rest of the process lifetime after a handful of crashes.
    """
    if self._stopping:
        return
    pending = self._monitor_restart_task
    if pending is not None and not pending.done():
        return

    max_delay_s = 10.0
    # Must be comfortably larger than max_delay_s so that a tight crash
    # loop (crash -> wait <= 10 s -> crash) never counts as "healthy".
    healthy_after_s = 60.0

    # Remember the delay seen on the first call as the backoff baseline.
    base_delay_s = getattr(self, "_monitor_restart_base_delay_s", None)
    if base_delay_s is None:
        base_delay_s = self._monitor_restart_delay_s
        self._monitor_restart_base_delay_s = base_delay_s

    # Monotonic loop clock; like create_task below, needs a running loop.
    now = asyncio.get_running_loop().time()
    previous = getattr(self, "_monitor_last_restart_at", None)
    if previous is not None and now - previous >= healthy_after_s:
        self._monitor_restart_delay_s = base_delay_s
    self._monitor_last_restart_at = now

    delay = self._monitor_restart_delay_s
    self._monitor_restart_delay_s = min(delay * 2, max_delay_s)

    async def _restart() -> None:
        await asyncio.sleep(delay)
        self._start_monitor(reason="restart_after_crash")

    self._monitor_restart_task = asyncio.create_task(_restart())
193+
@property
def config(self) -> BaseConfig:
    """Return whatever configuration the config manager currently reports."""
    current: BaseConfig = self.config_manager.get_config()
    return current
0 commit comments