@@ -212,6 +212,38 @@ def stop_shell(self, shell_id):
212212 proc .terminate ()
213213 proc .join (timeout = 2.0 )
214214
def _finish_restart(self, shell_id):
    """Spawn a replacement shell for `shell_id` and flag it for session restore.

    Calls ``self.start_shell(shell_id)`` and then, while holding
    ``self.manager_lock``, sets ``restart_pending`` to True on the registry
    entry for that id. If no entry exists after the start attempt, nothing
    is flagged.

    IMPORTANT: invoke this only OUTSIDE of ``except`` handlers.
    start_shell() goes through multiprocessing.Process.start() -> os.fork().
    A child forked while an exception is being handled inherits the
    parent's sys.exc_info(), so Python's implicit exception chaining would
    attach the parent's exception context to every later exception raised
    in the child process.
    """
    self.start_shell(shell_id)

    with self.manager_lock:
        registry = self.shells
        if shell_id not in registry:
            # No entry for this id after the start attempt: nothing to flag.
            return
        registry[shell_id]["restart_pending"] = True
228+
def _cleanup_shell_resources(self, proc, conn):
    """Best-effort teardown of a shell's process and its pipe connection.

    Steps, none of which is allowed to raise:
      1. Ask the process to stop via ``proc.terminate()``; if that call
         itself fails, fall back to SIGKILL on ``proc.pid``.
      2. Wait up to 2 seconds for the process to exit.
      3. If the process is still alive after the wait (for example because
         it blocks or ignores the terminate signal), escalate to SIGKILL
         and wait up to 2 more seconds, so the old shell is not left
         running next to its replacement.
      4. Close ``conn``.

    Args:
        proc: Process-like object exposing ``terminate()``, ``join()``,
            ``is_alive()`` and ``pid``.
        conn: Connection-like object exposing ``close()``.
    """

    def _force_kill():
        # Last resort; swallow failures (process already gone, no pid,
        # SIGKILL unavailable on this platform, ...).
        try:
            os.kill(proc.pid, signal.SIGKILL)
        except Exception:
            pass

    try:
        proc.terminate()
    except Exception:
        _force_kill()

    try:
        proc.join(timeout=2.0)
        still_alive = proc.is_alive()
    except Exception:
        # Cannot determine the state (e.g. process never started); do not
        # escalate on a guess.
        still_alive = False

    if still_alive:
        # terminate() was delivered but the process did not exit in time.
        _force_kill()
        try:
            proc.join(timeout=2.0)
        except Exception:
            pass

    try:
        conn.close()
    except Exception:
        pass
246+
215247 def run_cell (self , shell_id , code , timeout = 1.0 , grace = 2.0 , traceback_verbosity = "Plain" ):
216248 """
217249 Execute `code` on shell `shell_id`.
@@ -252,18 +284,35 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
252284
253285 exec_id = time .time_ns ()
254286 with lock :
287+ _need_restart = False
255288 # send execution request
256289 try :
257290 conn .send ({"cmd" : "exec" , "id" : exec_id , "code" : code , "traceback_verbosity" : traceback_verbosity })
258291 except Exception as exc :
259- return {
292+ logging .warning (f"Shell process for { shell_id } failed before execution request was sent, restarting" )
293+ # clean up the shell process and connection - best-effort
294+ self ._cleanup_shell_resources (proc , conn )
295+
296+ # remove the old shell entry
297+ with self .manager_lock :
298+ self .shells .pop (shell_id , None )
299+
300+ _need_restart = True
301+ _exc_msg = str (exc )
302+ _restart_result = {
260303 "status" : "error" ,
261- "msg" : f"send failed: { exc } " ,
304+ "msg" : f"send failed: { _exc_msg } " ,
262305 "shell_was_created" : shell_was_created ,
306+ "shell_was_restarted" : True ,
263307 "shell_was_recently_restarted" : shell_was_recently_restarted ,
264308 }
265309
310+ if _need_restart :
311+ self ._finish_restart (shell_id )
312+ return _restart_result
313+
266314 # wait for the result up to `timeout`
315+ _need_restart = False
267316 if conn .poll (timeout ):
268317 try :
269318 result = conn .recv ()
@@ -273,23 +322,27 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
273322 except EOFError :
274323 # Connection closed - shell process died, need to restart
275324 logging .warning (f"Shell process for { shell_id } died during execution, restarting" )
276- with self .manager_lock :
277- self .shells .pop (shell_id , None )
278- self .start_shell (shell_id )
279325
280- # Mark the new shell as having a restart pending
326+ # clean up the shell process and connection - best-effort
327+ self ._cleanup_shell_resources (proc , conn )
328+
329+ # remove the old shell entry
281330 with self .manager_lock :
282- if shell_id in self .shells :
283- self .shells [shell_id ]["restart_pending" ] = True
331+ self .shells .pop (shell_id , None )
284332
285- return {
333+ _need_restart = True
334+ _restart_result = {
286335 "status" : "error" ,
287336 "msg" : "connection closed" ,
288337 "shell_was_created" : shell_was_created ,
289338 "shell_was_restarted" : True ,
290339 "shell_was_recently_restarted" : shell_was_recently_restarted ,
291340 }
292341
342+ if _need_restart :
343+ self ._finish_restart (shell_id )
344+ return _restart_result
345+
293346 # no reply yet -> try gentle interrupt (SIGINT)
294347 try :
295348 # Process.send_signal exists on Unix; fallback to os.kill if necessary
@@ -302,6 +355,7 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
302355 pass
303356
304357 # wait short grace period for the shell to handle the interrupt
358+ _need_restart = False
305359 if conn .poll (grace ):
306360 try :
307361 result = conn .recv ()
@@ -311,48 +365,34 @@ def run_cell(self, shell_id, code, timeout=1.0, grace=2.0, traceback_verbosity="
311365 except EOFError :
312366 # Connection closed - shell process died, need to restart
313367 logging .warning (f"Shell process for { shell_id } died during interrupt, restarting" )
314- with self .manager_lock :
315- self .shells .pop (shell_id , None )
316- self .start_shell (shell_id )
368+ # clean up the shell process and connection - best-effort
369+ self ._cleanup_shell_resources (proc , conn )
317370
318- # Mark the new shell as having a restart pending
371+ # remove the old shell entry
319372 with self .manager_lock :
320- if shell_id in self .shells :
321- self .shells [shell_id ]["restart_pending" ] = True
373+ self .shells .pop (shell_id , None )
322374
323- return {
375+ _need_restart = True
376+ _restart_result = {
324377 "status" : "interrupted" ,
325378 "msg" : "connection closed after interrupt" ,
326379 "shell_was_created" : shell_was_created ,
327380 "shell_was_restarted" : True ,
328381 "shell_was_recently_restarted" : shell_was_recently_restarted ,
329382 }
330383
331- # still stuck -> terminate the shell and restart it (drop memory)
332- try :
333- proc .terminate ()
334- except Exception :
335- try :
336- os .kill (proc .pid , signal .SIGKILL )
337- except Exception :
338- pass
339- proc .join (timeout = 2.0 )
384+ if _need_restart :
385+ self ._finish_restart (shell_id )
386+ return _restart_result
340387
341- # close old connection (best-effort)
342- try :
343- conn .close ()
344- except Exception :
345- pass
388+ # still stuck -> terminate the shell and restart it (drop memory)
389+ # clean up the shell process and connection - best-effort
390+ self ._cleanup_shell_resources (proc , conn )
346391
347392 # remove and restart a fresh shell for this id
348393 with self .manager_lock :
349394 self .shells .pop (shell_id , None )
350- self .start_shell (shell_id )
351-
352- # Mark the new shell as having a restart pending
353- with self .manager_lock :
354- if shell_id in self .shells :
355- self .shells [shell_id ]["restart_pending" ] = True
395+ self ._finish_restart (shell_id )
356396
357397 return {
358398 "status" : "timeout_killed" ,
0 commit comments