@@ -232,16 +232,21 @@ def _drain(pool: ProcessPoolExecutor) -> None:
232232 futures : dict = {}
233233 submit_times : dict [str , float ] = {}
234234 submit_failed : list [tuple [BenchmarkInstance , list [RunParams ]]] = []
235- pool_broken = False
236235 for inst , params_list in pending :
237- try :
238- submit_times [inst .instance_id ] = _time .monotonic ()
239- futures [pool .submit (eval_all_cells_fn , inst , params_list )] = (inst , params_list )
240- except BrokenProcessPool :
241- idx = pending .index ((inst , params_list ))
242- submit_failed .extend (pending [idx :])
243- pool_broken = True
244- break
236+ for attempt in range (3 ):
237+ try :
238+ submit_times [inst .instance_id ] = _time .monotonic ()
239+ futures [pool .submit (eval_all_cells_fn , inst , params_list )] = (inst , params_list )
240+ break
241+ except BrokenProcessPool :
242+ # Retry the submit a few times with a brief sleep, hoping to avoid
243+ # a full pool rebuild after the kill switch takes out one worker.
244+ # NOTE(review): per the concurrent.futures docs a broken executor
245+ # cannot accept new tasks, so these retries may never succeed; verify.
246+ if attempt == 2 :
247+ submit_failed .append ((inst , params_list ))
248+ break
249+ _time .sleep (0.1 )
245250 outer_deadline = _time .monotonic () + timeout_per_instance * len (points ) * max (1 , (len (pending ) + workers - 1 ) // workers )
246251 completed : set [str ] = set ()
247252 try :
@@ -253,6 +258,11 @@ def _drain(pool: ProcessPoolExecutor) -> None:
253258 try :
254259 per_cell = future .result (timeout = 0 )
255260 except BrokenProcessPool :
261+ # Worker death (typically from our os._exit kill switch).
262+ # Treat elapsed-near-deadline as timeout so a pathological
263+ # instance is checkpointed and not retried forever.
264+ # Shorter elapsed => transient (submitted just before
265+ # an earlier task killed the worker); record as error.
256266 elapsed = _time .monotonic () - submit_times .get (inst .instance_id , 0.0 )
257267 if elapsed >= timeout_per_instance * 0.9 :
258268 per_cell = [
@@ -268,41 +278,28 @@ def _drain(pool: ProcessPoolExecutor) -> None:
268278 for p in params_list
269279 ]
270280 else :
271- pool_broken = True
272281 per_cell = [(p , _failure_eval (inst , p , "error" , "BrokenProcessPool: worker died" )) for p in params_list ]
273282 except Exception as e :
274283 per_cell = [(p , _failure_eval (inst , p , "error" , f"{ type (e ).__name__ } : { e } " )) for p in params_list ]
275284 completed .add (inst .instance_id )
276285 _record_per_cell (per_cell )
277286 except BrokenProcessPool :
278- pool_broken = True
287+ # Log nothing here; the per-future BrokenProcessPool handler above
288+ # already recorded the dead instance. NOTE(review): this assumes the
289+ # pool recovers by itself, but a broken executor stays broken; verify.
290+ pass
279291 for inst , params_list in submit_failed :
280292 _record_per_cell ([(p , _failure_eval (inst , p , "error" , "BrokenProcessPool: submit failed" )) for p in params_list ])
281- if pool_broken :
282- raise BrokenProcessPool ("pool degraded mid-grid" )
283293
284294 pool : ProcessPoolExecutor | None = _make_pool () if workers > 1 else None
285295 try :
286296 if pending and pool is not None :
287- while True :
288- try :
289- _drain (pool )
290- break
291- except BrokenProcessPool :
292- try :
293- pool .shutdown (wait = False , cancel_futures = True )
294- except Exception :
295- pass
296- pool = _make_pool ()
297- # Recompute pending for the rebuild from current
298- # checkpoint state — instances completed since last
299- # rebuild should be skipped.
300- done_ids_now = {lbl : read_checkpoint (c ) if c is not None else set () for lbl , c in ckpts .items ()}
301- pending [:] = [
302- (inst , [p for p in points if inst .instance_id not in done_ids_now [p .label ()]])
303- for inst , _ in pending
304- if any (inst .instance_id not in done_ids_now [p .label ()] for p in points )
305- ]
297+ # Single drain pass: we no longer rebuild the whole pool every
298+ # time the kill switch fires (that rebuild was 30-60s of dead time
299+ # per BrokenProcessPool and turned every legitimate per-instance
300+ # timeout into a cell-wide cascade). NOTE(review): this relies on the
301+ # pool recovering after a worker death; a broken executor does not. Verify.
302+ _drain (pool )
306303 elif pending and pool is None :
307304 # workers == 1: serial fallback
308305 for inst , params_list in pending :
0 commit comments