Skip to content

Commit 74af89a

Browse files
authored
Merge pull request microsoft#3 from microsoft/fix/client-health-check-and-lock-timeout
fix: add client health check and lock timeout to ensure_client()
2 parents 1c94ae5 + e1e233d commit 74af89a

2 files changed

Lines changed: 89 additions & 10 deletions

File tree

amplifier_module_provider_github_copilot/_constants.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,21 @@
5656
# and provides consistent error handling. The SDK timeout acts as a fallback.
5757
SDK_TIMEOUT_BUFFER_SECONDS = 5.0
5858

59+
# Client health check timeout (seconds).
60+
# Used by ensure_client() to verify a cached client subprocess is still alive
61+
# before returning it. Short timeout since ping should be near-instant for a
62+
# healthy process. If this fails, the client is torn down and re-initialized.
63+
# Evidence: TimeoutError in long-running sessions (~90min) where subprocess dies
64+
# but _started flag remains True, causing all subsequent callers to get a dead client.
65+
CLIENT_HEALTH_CHECK_TIMEOUT = 5.0
66+
67+
# Lock acquisition timeout for ensure_client() (seconds).
68+
# Prevents callers from waiting indefinitely when another caller is stuck inside
69+
# ensure_client() (e.g., blocked on a dead subprocess's start() call).
70+
# Evidence: asyncio.Lock with no timeout caused sub-agent callers to queue forever
71+
# behind a stuck initialization, producing 5516.9s elapsed times.
72+
CLIENT_INIT_LOCK_TIMEOUT = 30.0
73+
5974
# ═══════════════════════════════════════════════════════════════════════════════
6075
# Copilot SDK Built-in Tool Names
6176
# ═══════════════════════════════════════════════════════════════════════════════

amplifier_module_provider_github_copilot/client.py

Lines changed: 74 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,13 @@
1818
from dataclasses import dataclass
1919
from typing import TYPE_CHECKING, Any
2020

21-
from ._constants import DEFAULT_TIMEOUT, SDK_TIMEOUT_BUFFER_SECONDS, VALID_REASONING_EFFORTS
21+
from ._constants import (
22+
CLIENT_HEALTH_CHECK_TIMEOUT,
23+
CLIENT_INIT_LOCK_TIMEOUT,
24+
DEFAULT_TIMEOUT,
25+
SDK_TIMEOUT_BUFFER_SECONDS,
26+
VALID_REASONING_EFFORTS,
27+
)
2228
from .exceptions import (
2329
CopilotAuthenticationError,
2430
CopilotConnectionError,
@@ -149,28 +155,87 @@ def __init__(
149155

150156
logger.debug(f"[CLIENT] CopilotClientWrapper initialized, timeout={timeout}s")
151157

158+
async def _check_client_health(self) -> bool:
159+
"""
160+
Verify the cached client subprocess is still responsive.
161+
162+
Sends a ping with a short timeout. If the subprocess has died,
163+
become unresponsive, or the auth token has expired, this will
164+
fail and trigger re-initialization.
165+
166+
Returns:
167+
True if client is healthy, False if it needs re-initialization
168+
"""
169+
if self._client is None:
170+
return False
171+
try:
172+
await asyncio.wait_for(
173+
self._client.ping(),
174+
timeout=CLIENT_HEALTH_CHECK_TIMEOUT,
175+
)
176+
return True
177+
except Exception as e:
178+
logger.warning(f"[CLIENT] Health check failed: {type(e).__name__}: {e}")
179+
return False
180+
181+
async def _reset_client(self) -> None:
182+
"""
183+
Tear down a dead/unhealthy client so it can be re-initialized.
184+
185+
Attempts a graceful stop, but always resets state even if stop fails.
186+
Does NOT acquire the lock -- caller must hold it.
187+
"""
188+
if self._client is not None:
189+
try:
190+
await asyncio.shield(self._client.stop())
191+
except Exception as e:
192+
logger.debug(f"[CLIENT] Error stopping unhealthy client: {e}")
193+
finally:
194+
self._client = None
195+
self._started = False
196+
152197
async def ensure_client(self) -> CopilotClient:
153198
"""
154199
Lazily initialize and return the Copilot client.
155200
156201
Uses double-checked locking to ensure thread-safe
157202
initialization while minimizing lock contention.
158203
204+
Includes a health check on cached clients to detect and recover
205+
from dead subprocesses (e.g., after long-running sessions where
206+
the CLI process dies silently).
207+
159208
Returns:
160209
Initialized CopilotClient instance
161210
162211
Raises:
163212
CopilotConnectionError: If client initialization fails
164213
CopilotAuthenticationError: If authentication fails
165214
"""
215+
# Fast path: client exists and passes health check
166216
if self._client is not None and self._started:
167-
logger.debug("[CLIENT] Returning existing client")
168-
return self._client
217+
if await self._check_client_health():
218+
logger.debug("[CLIENT] Returning existing client (health check passed)")
219+
return self._client
220+
# Health check failed -- need to re-initialize under lock
221+
logger.warning("[CLIENT] Cached client failed health check, will re-initialize")
169222

170-
async with self._lock:
171-
# Double-check after acquiring lock
223+
try:
224+
await asyncio.wait_for(self._lock.acquire(), timeout=CLIENT_INIT_LOCK_TIMEOUT)
225+
except TimeoutError:
226+
raise CopilotConnectionError(
227+
f"Timed out waiting for client initialization lock "
228+
f"({CLIENT_INIT_LOCK_TIMEOUT}s). Another caller may be stuck. "
229+
f"Try restarting your session."
230+
) from None
231+
232+
try:
233+
# Double-check after acquiring lock (another caller may have re-initialized)
172234
if self._client is not None and self._started:
173-
return self._client
235+
if await self._check_client_health():
236+
return self._client
237+
# Still unhealthy -- tear it down
238+
await self._reset_client()
174239

175240
# Use local variable to ensure atomic assignment after full initialization
176241
client: CopilotClient | None = None
@@ -192,10 +257,7 @@ async def ensure_client(self) -> CopilotClient:
192257
_cli_bin = Path(_copilot_mod.__file__).parent / "bin" / "copilot"
193258
if _cli_bin.exists() and not os.access(_cli_bin, os.X_OK):
194259
_cli_bin.chmod(
195-
_cli_bin.stat().st_mode
196-
| stat.S_IXUSR
197-
| stat.S_IXGRP
198-
| stat.S_IXOTH
260+
_cli_bin.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
199261
)
200262
logger.info(f"[CLIENT] Fixed missing execute permission on {_cli_bin}")
201263

@@ -245,6 +307,8 @@ async def ensure_client(self) -> CopilotClient:
245307
raise CopilotConnectionError(
246308
f"Failed to initialize Copilot client: {type(e).__name__}: {e}"
247309
) from e
310+
finally:
311+
self._lock.release()
248312

249313
def _build_client_options(self) -> dict[str, Any]:
250314
"""Build CopilotClientOptions from configuration.

0 commit comments

Comments
 (0)