@@ -100,9 +100,10 @@ async def _check_orphaned_loops(self) -> None:
100100 for loop in running_loops :
101101 if await self .state_manager .has_claim (loop .loop_id ):
102102 continue
103- logger .info (
104- "Loop has no claim, restarting" , extra = {"loop_id" : loop .loop_id }
105- )
103+ if not await self .state_manager .try_claim_loop_recovery (loop .loop_id ):
104+ continue
105+
106+ logger .info ("Orphaned loop recovered" , extra = {"loop_id" : loop .loop_id })
106107 if not await self .restart_callback (loop .loop_id ):
107108 await self .state_manager .update_loop_status (
108109 loop .loop_id , LoopStatus .STOPPED
@@ -115,12 +116,14 @@ async def _check_orphaned_workflows(self) -> None:
115116 for workflow in running_workflows :
116117 if await self .state_manager .workflow_has_claim (workflow .workflow_run_id ):
117118 continue
119+ if not await self .state_manager .try_claim_workflow_recovery (
120+ workflow .workflow_run_id
121+ ):
122+ continue
123+
118124 logger .info (
119- "Workflow has no claim, restarting" ,
120- extra = {
121- "workflow_run_id" : workflow .workflow_run_id ,
122- "block_index" : workflow .current_block_index ,
123- },
125+ "Orphaned workflow recovered" ,
126+ extra = {"workflow_run_id" : workflow .workflow_run_id },
124127 )
125128 if not await self .fastloop_instance .restart_workflow (
126129 workflow .workflow_run_id
@@ -136,16 +139,17 @@ async def _check_orphaned_tasks(self) -> None:
136139 for task in running_tasks :
137140 if await self .state_manager .task_has_claim (task .task_id ):
138141 continue
142+ if not await self .state_manager .try_claim_task_recovery (task .task_id ):
143+ continue
144+
145+ await self .state_manager .update_task_status (task .task_id , TaskStatus .FAILED )
139146
140147 metadata = self .fastloop_instance ._task_metadata .get (task .task_name )
141148 if not metadata :
142- await self .state_manager .update_task_status (
143- task .task_id , TaskStatus .FAILED
144- )
145149 continue
146150
147151 logger .info (
148- "Task has no claim, restarting " ,
152+ "Orphaned task recovered " ,
149153 extra = {"task_id" : task .task_id , "task_name" : task .task_name },
150154 )
151155 await self .fastloop_instance .task_manager .submit (
@@ -157,12 +161,6 @@ async def _check_orphaned_tasks(self) -> None:
157161 )
158162
159163 async def _check_scheduled_workflows (self ) -> None :
160- """Check for IDLE workflows with past-due scheduled wake times.
161-
162- This is a backup mechanism that catches workflows that may have been
163- removed from the ZSET but not yet processed (e.g., if the wake queue
164- consumer failed or the wake monitoring thread died).
165- """
166164 now = time .time ()
167165 idle_workflows = await self .state_manager .get_all_workflows (
168166 status = LoopStatus .IDLE
@@ -174,17 +172,14 @@ async def _check_scheduled_workflows(self) -> None:
174172 continue
175173 if await self .state_manager .workflow_has_claim (workflow .workflow_run_id ):
176174 continue
177- claimed_from_zset = await self .state_manager .try_claim_workflow_wake (
175+ if not await self .state_manager .try_claim_workflow_wake (
178176 workflow .workflow_run_id
179- )
177+ ):
178+ continue
179+
180180 logger .info (
181- "IDLE workflow has past-due wake time, restarting" ,
182- extra = {
183- "workflow_run_id" : workflow .workflow_run_id ,
184- "scheduled_wake_time" : workflow .scheduled_wake_time ,
185- "block_index" : workflow .current_block_index ,
186- "claimed_from_zset" : claimed_from_zset ,
187- },
181+ "IDLE workflow past-due, restarting" ,
182+ extra = {"workflow_run_id" : workflow .workflow_run_id },
188183 )
189184 if await self .fastloop_instance .restart_workflow (workflow .workflow_run_id ):
190185 await self .state_manager .clear_workflow_wake_time (
@@ -200,8 +195,12 @@ async def _check_scheduled_workflows(self) -> None:
200195
201196 async def _check_scheduled_tasks (self ) -> None :
202197 for schedule_id , schedule in await self .state_manager .get_due_schedules ():
198+ if not await self .state_manager .try_claim_schedule (schedule_id ):
199+ continue
200+
203201 metadata = self .fastloop_instance ._task_metadata .get (schedule .task_name )
204202 if not metadata :
203+ await self .state_manager .advance_schedule (schedule_id , schedule )
205204 continue
206205
207206 try :
@@ -212,12 +211,13 @@ async def _check_scheduled_tasks(self) -> None:
212211 retry_policy = metadata .get ("retry" ),
213212 executor_type = metadata .get ("executor" ),
214213 )
215- await self .state_manager .advance_schedule (schedule_id , schedule )
216214 except Exception as e :
217215 logger .error (
218216 "Scheduled task failed" ,
219217 extra = {"schedule_id" : schedule_id , "error" : str (e )},
220218 )
219+ finally :
220+ await self .state_manager .advance_schedule (schedule_id , schedule )
221221
222222 async def _check_disconnect_stops (self ) -> None :
223223 active_ids = await self .loop_manager .active_loop_ids ()
0 commit comments