1818from dataclasses import dataclass
1919from typing import TYPE_CHECKING , Any
2020
21- from ._constants import DEFAULT_TIMEOUT , SDK_TIMEOUT_BUFFER_SECONDS , VALID_REASONING_EFFORTS
21+ from ._constants import (
22+ CLIENT_HEALTH_CHECK_TIMEOUT ,
23+ CLIENT_INIT_LOCK_TIMEOUT ,
24+ DEFAULT_TIMEOUT ,
25+ SDK_TIMEOUT_BUFFER_SECONDS ,
26+ VALID_REASONING_EFFORTS ,
27+ )
2228from .exceptions import (
2329 CopilotAuthenticationError ,
2430 CopilotConnectionError ,
@@ -149,28 +155,87 @@ def __init__(
149155
150156 logger .debug (f"[CLIENT] CopilotClientWrapper initialized, timeout={ timeout } s" )
151157
158+ async def _check_client_health (self ) -> bool :
159+ """
160+ Verify the cached client subprocess is still responsive.
161+
162+ Sends a ping with a short timeout. If the subprocess has died,
163+ become unresponsive, or the auth token has expired, this will
164+ fail and trigger re-initialization.
165+
166+ Returns:
167+ True if client is healthy, False if it needs re-initialization
168+ """
169+ if self ._client is None :
170+ return False
171+ try :
172+ await asyncio .wait_for (
173+ self ._client .ping (),
174+ timeout = CLIENT_HEALTH_CHECK_TIMEOUT ,
175+ )
176+ return True
177+ except Exception as e :
178+ logger .warning (f"[CLIENT] Health check failed: { type (e ).__name__ } : { e } " )
179+ return False
180+
181+ async def _reset_client (self ) -> None :
182+ """
183+ Tear down a dead/unhealthy client so it can be re-initialized.
184+
185+ Attempts a graceful stop, but always resets state even if stop fails.
186+ Does NOT acquire the lock -- caller must hold it.
187+ """
188+ if self ._client is not None :
189+ try :
190+ await asyncio .shield (self ._client .stop ())
191+ except Exception as e :
192+ logger .debug (f"[CLIENT] Error stopping unhealthy client: { e } " )
193+ finally :
194+ self ._client = None
195+ self ._started = False
196+
152197 async def ensure_client (self ) -> CopilotClient :
153198 """
154199 Lazily initialize and return the Copilot client.
155200
156201 Uses double-checked locking to ensure thread-safe
157202 initialization while minimizing lock contention.
158203
204+ Includes a health check on cached clients to detect and recover
205+ from dead subprocesses (e.g., after long-running sessions where
206+ the CLI process dies silently).
207+
159208 Returns:
160209 Initialized CopilotClient instance
161210
162211 Raises:
163212 CopilotConnectionError: If client initialization fails
164213 CopilotAuthenticationError: If authentication fails
165214 """
215+ # Fast path: client exists and passes health check
166216 if self ._client is not None and self ._started :
167- logger .debug ("[CLIENT] Returning existing client" )
168- return self ._client
217+ if await self ._check_client_health ():
218+ logger .debug ("[CLIENT] Returning existing client (health check passed)" )
219+ return self ._client
220+ # Health check failed -- need to re-initialize under lock
221+ logger .warning ("[CLIENT] Cached client failed health check, will re-initialize" )
169222
170- async with self ._lock :
171- # Double-check after acquiring lock
223+ try :
224+ await asyncio .wait_for (self ._lock .acquire (), timeout = CLIENT_INIT_LOCK_TIMEOUT )
225+ except TimeoutError :
226+ raise CopilotConnectionError (
227+ f"Timed out waiting for client initialization lock "
228+ f"({ CLIENT_INIT_LOCK_TIMEOUT } s). Another caller may be stuck. "
229+ f"Try restarting your session."
230+ ) from None
231+
232+ try :
233+ # Double-check after acquiring lock (another caller may have re-initialized)
172234 if self ._client is not None and self ._started :
173- return self ._client
235+ if await self ._check_client_health ():
236+ return self ._client
237+ # Still unhealthy -- tear it down
238+ await self ._reset_client ()
174239
175240 # Use local variable to ensure atomic assignment after full initialization
176241 client : CopilotClient | None = None
@@ -192,10 +257,7 @@ async def ensure_client(self) -> CopilotClient:
192257 _cli_bin = Path (_copilot_mod .__file__ ).parent / "bin" / "copilot"
193258 if _cli_bin .exists () and not os .access (_cli_bin , os .X_OK ):
194259 _cli_bin .chmod (
195- _cli_bin .stat ().st_mode
196- | stat .S_IXUSR
197- | stat .S_IXGRP
198- | stat .S_IXOTH
260+ _cli_bin .stat ().st_mode | stat .S_IXUSR | stat .S_IXGRP | stat .S_IXOTH
199261 )
200262 logger .info (f"[CLIENT] Fixed missing execute permission on { _cli_bin } " )
201263
@@ -245,6 +307,8 @@ async def ensure_client(self) -> CopilotClient:
245307 raise CopilotConnectionError (
246308 f"Failed to initialize Copilot client: { type (e ).__name__ } : { e } "
247309 ) from e
310+ finally :
311+ self ._lock .release ()
248312
249313 def _build_client_options (self ) -> dict [str , Any ]:
250314 """Build CopilotClientOptions from configuration.
0 commit comments