@@ -109,6 +109,7 @@ def __init__(self, store: LightningStore, host: str, port: int):
109109 self ._uvicorn_server : uvicorn .Server | None = uvicorn .Server (self ._uvicorn_config )
110110
111111 self ._serving_thread : Optional [threading .Thread ] = None
112+ self ._server_start_exception : Optional [BaseException ] = None
112113
113114 # Process-awareness:
114115 # LightningStoreServer holds a plain Python object (self.store) in one process
@@ -167,17 +168,45 @@ async def start(self):
167168 logger .info (f"Starting server at { self .endpoint } " )
168169
169170 uvicorn_server = self ._uvicorn_server
171+ self ._server_start_exception = None
170172
171173 def run_server_forever ():
172- asyncio .run (uvicorn_server .serve ())
173-
174- self ._serving_thread = threading .Thread (target = run_server_forever , daemon = True )
175- self ._serving_thread .start ()
174+ try :
175+ asyncio .run (uvicorn_server .serve ())
176+ except (SystemExit , Exception ) as exc :
177+ logger .debug ("LightningStore server thread exiting due to %s" , exc , exc_info = exc )
178+ self ._server_start_exception = exc
179+
180+ serving_thread = threading .Thread (target = run_server_forever , daemon = True )
181+ self ._serving_thread = serving_thread
182+ serving_thread .start ()
183+
184+ # Wait for uvicorn to report that it has started before pinging /health.
185+ start_deadline = time .time () + 10
186+ while time .time () < start_deadline :
187+ if uvicorn_server .started :
188+ break
189+ if self ._server_start_exception is not None or not serving_thread .is_alive ():
190+ self ._handle_failed_start ()
191+ raise RuntimeError (self ._format_start_failure_reason ())
192+ await asyncio .sleep (0.05 )
193+ else :
194+ self ._handle_failed_start ()
195+ raise RuntimeError ("Server failed to start within the 10 seconds." )
176196
177- # Wait for /health to be available
197+ # Wait for /health to be available once uvicorn reports started.
178198 if not await self ._server_health_check ():
199+ self ._handle_failed_start ()
179200 raise RuntimeError ("Server failed to start within the 10 seconds." )
180201
202+ # If startup failed (e.g. port already in use), uvicorn never flips `started`
203+ # and the worker thread stops immediately. Guard against latching on to a
204+ # different process that happened to satisfy the health check.
205+ if not uvicorn_server .started or not serving_thread .is_alive () or self ._server_start_exception is not None :
206+ self ._handle_failed_start ()
207+ failure_reason = self ._format_start_failure_reason ()
208+ raise RuntimeError (failure_reason )
209+
181210 async def _server_health_check (self ) -> bool :
182211 """Checks if the server is healthy."""
183212 current_time = time .time ()
@@ -190,22 +219,63 @@ async def _server_health_check(self) -> bool:
190219 await asyncio .sleep (0.1 )
191220 return False
192221
def _handle_failed_start(self) -> None:
    """Reset server/thread bookkeeping after an unsuccessful startup.

    Flags the uvicorn server (when one is set) to exit, then waits up to
    0.1 seconds for the serving thread (when one is set) before dropping
    the reference to it. Nothing is raised if the thread is still running
    once the join times out.
    """
    server = self._uvicorn_server
    if server is not None:
        server.should_exit = True

    thread = self._serving_thread
    if thread is None:
        return

    # The thread has usually exited already by the time a failure is
    # detected; the short join is purely defensive.
    thread.join(timeout=0.1)
    self._serving_thread = None
230+
def _format_start_failure_reason(self) -> str:
    """Build a human-readable message explaining why startup failed.

    The message always begins with the endpoint that could not be started.
    The suffix depends on ``self._server_start_exception``:

    * ``None`` or ``SystemExit``: a hint that the port may already be in use.
    * ``OSError``: the OS error text. ``strerror`` is ``None`` when the
      exception was built from a single message (or no arguments), so this
      falls back to ``str(exc)`` and finally to the exception class name
      instead of rendering the literal text "None".
    * anything else: ``Reason: <exception>``.

    Returns:
        The formatted failure description.
    """
    base_message = f"LightningStore server failed to start on {self.endpoint}."
    port_hint = f"{base_message} Another process may already be using this port."

    exc = self._server_start_exception
    if exc is None or isinstance(exc, SystemExit):
        return port_hint
    if isinstance(exc, OSError):
        detail = exc.strerror or str(exc) or type(exc).__name__
        return f"{base_message} {detail}."
    return f"{base_message} Reason: {exc}."
240+
async def run_forever(self):
    """Runs the FastAPI server indefinitely.

    You need to call this method in the same process as the server was created in.

    The uvicorn ``serve()`` coroutine and a health probe run concurrently as
    tasks. When either one fails, the other is cancelled and awaited so that
    no task is left running in the background.

    Raises:
        RuntimeError: If the server fails before uvicorn reports ``started``
            or fails with ``SystemExit``/``OSError`` (message built by
            ``_format_start_failure_reason``), if ``serve()`` raises later,
            or if the health probe reports the server as unhealthy.
        asyncio.CancelledError: Propagated unchanged when this coroutine is
            cancelled; cancellation is never reported as a start failure.
    """
    assert self._uvicorn_server is not None
    uvicorn_server = self._uvicorn_server

    async def _wait_till_healthy():
        health = await self._server_health_check()
        if not health:
            raise RuntimeError("Server did not become healthy within the 10 seconds.")
        logger.info("Store server is online at %s", self.endpoint)

    async def _serve_capture():
        try:
            await uvicorn_server.serve()
        except (KeyboardInterrupt, asyncio.CancelledError):
            # Interrupts and cancellation are control flow, not serve failures.
            raise
        except (SystemExit, Exception) as exc:
            logger.debug("LightningStore server serve() raised %s", exc, exc_info=exc)
            self._server_start_exception = exc
            raise RuntimeError("LightningStore server failed to serve") from exc

    # Explicit tasks (rather than bare coroutines handed to gather) so the
    # survivor can be cancelled once the other one has failed.
    tasks = [
        asyncio.create_task(_wait_till_healthy()),
        asyncio.create_task(_serve_capture()),
    ]
    try:
        # Run both in parallel until one of them raises an exception.
        await asyncio.gather(*tasks)
    except (KeyboardInterrupt, asyncio.CancelledError):
        # A cancelled run_forever() must stay cancelled rather than being
        # rewritten into a "failed to start" RuntimeError.
        raise
    except BaseException as exc:
        startup_failed = not uvicorn_server.started or isinstance(
            self._server_start_exception, (SystemExit, OSError)
        )
        if startup_failed:
            self._handle_failed_start()
            raise RuntimeError(self._format_start_failure_reason()) from exc
        raise
    finally:
        # gather() does not cancel siblings when one awaitable fails; stop
        # whatever is still pending and drain it before returning.
        leftovers = [task for task in tasks if not task.done()]
        for task in leftovers:
            task.cancel()
        if leftovers:
            await asyncio.gather(*leftovers, return_exceptions=True)
209279
210280 async def stop (self ):
211281 """Gracefully stops the running FastAPI server.
0 commit comments