Skip to content

Commit 3ed36cf

Browse files
authored
Merge pull request microsoft#8 from robotdad/feat/provider-improvements
Resolved conflicts with PR microsoft#9 (well-known provider): - tests/conftest.py: kept both reset_singleton_state fixture - tests/test_mount.py: kept both SDK binary tests and TestSingleton class
2 parents 396aefb + 34535b5 commit 3ed36cf

5 files changed

Lines changed: 473 additions & 22 deletions

File tree

amplifier_module_provider_github_copilot/__init__.py

Lines changed: 106 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,16 @@
4040

4141
from __future__ import annotations
4242

43+
import asyncio
4344
import logging
4445
import shutil
4546
from collections.abc import Awaitable, Callable
4647
from typing import Any
4748

4849
from amplifier_core import ChatResponse, ToolCall
4950

50-
from .client import AuthStatus, SessionInfo, SessionListResult
51+
from ._constants import DEFAULT_TIMEOUT
52+
from .client import AuthStatus, CopilotClientWrapper, SessionInfo, SessionListResult
5153
from .exceptions import (
5254
CopilotAbortError,
5355
CopilotAuthenticationError,
@@ -121,6 +123,92 @@
121123

122124
logger = logging.getLogger(__name__)
123125

126+
# ═══════════════════════════════════════════════════════════════════════════════
127+
# Process-Level Singleton State
128+
# ═══════════════════════════════════════════════════════════════════════════════
129+
#
130+
# Sub-agents spawned by the task tool run as async coroutines in the SAME
131+
# Python process and asyncio event loop as the parent session (kernel-guaranteed).
132+
# Each sub-agent gets its own fresh ModuleCoordinator — coordinators are not shared.
133+
#
134+
# This singleton ensures all mounts in a process share ONE CopilotClientWrapper
135+
# (one copilot CLI subprocess) regardless of how many sub-agents are spawned.
136+
# Without this, N sub-agents spawn N processes × ~500 MB each.
137+
#
138+
# Reference: docs/plans/2026-02-23-process-singleton-design.md
139+
140+
_shared_client: CopilotClientWrapper | None = None
141+
_shared_client_refcount: int = 0
142+
_shared_client_lock: asyncio.Lock | None = None
143+
144+
145+
def _get_lock() -> asyncio.Lock:
146+
"""Return the singleton lock, creating it lazily on first call.
147+
148+
Lazy initialization avoids creating asyncio.Lock at import time,
149+
which can fail if no event loop exists yet (common in test environments).
150+
"""
151+
global _shared_client_lock
152+
if _shared_client_lock is None:
153+
_shared_client_lock = asyncio.Lock()
154+
return _shared_client_lock
155+
156+
157+
async def _acquire_shared_client(
158+
config: dict[str, Any],
159+
timeout: float,
160+
) -> CopilotClientWrapper:
161+
"""Acquire the shared CopilotClientWrapper, creating it if this is the first mount.
162+
163+
Increments the reference count. Call _release_shared_client() in cleanup
164+
to decrement. The subprocess is shut down when the count reaches zero.
165+
166+
If a second caller passes a different timeout than the first, a DEBUG warning
167+
is logged and the existing client is returned unchanged — the second caller's
168+
timeout is silently ignored. Sub-agents inherit bundle config from the parent,
169+
so all callers typically pass the same values.
170+
"""
171+
global _shared_client, _shared_client_refcount
172+
async with _get_lock():
173+
if _shared_client is None:
174+
logger.info("[MOUNT] Creating shared Copilot subprocess (first mount in process)")
175+
_shared_client = CopilotClientWrapper(config=config, timeout=timeout)
176+
else:
177+
existing_timeout = getattr(_shared_client, "_timeout", timeout)
178+
if existing_timeout != timeout:
179+
logger.debug(
180+
f"[MOUNT] Ignoring timeout={timeout} for shared client "
181+
f"(already created with timeout={existing_timeout})"
182+
)
183+
_shared_client_refcount += 1
184+
logger.debug(f"[MOUNT] Shared client refcount: {_shared_client_refcount}")
185+
return _shared_client
186+
187+
188+
async def _release_shared_client() -> None:
189+
"""Release one reference to the shared client.
190+
191+
When the count reaches zero, closes and destroys the shared subprocess.
192+
Safe to call if already at zero (safety floor prevents negative counts).
193+
194+
NOTE: If the Python process is killed (SIGKILL/crash), the refcount
195+
never reaches zero and close() is never called. This is acceptable —
196+
the OS reclaims the copilot subprocess when the parent process exits.
197+
There is no mitigation needed for this case.
198+
"""
199+
global _shared_client, _shared_client_refcount
200+
async with _get_lock():
201+
_shared_client_refcount -= 1
202+
logger.debug(f"[MOUNT] Shared client refcount after release: {_shared_client_refcount}")
203+
if _shared_client_refcount <= 0:
204+
if _shared_client is not None:
205+
logger.info(
206+
"[MOUNT] Last mount cleaned up — shutting down shared Copilot subprocess"
207+
)
208+
await _shared_client.close()
209+
_shared_client = None
210+
_shared_client_refcount = 0 # safety floor: prevent negative counts
211+
124212

125213
async def mount(
126214
coordinator: Any, # ModuleCoordinator
@@ -178,26 +266,39 @@ async def mount(
178266
# Set CLI path in config for provider to use
179267
config["cli_path"] = cli_path
180268

269+
# Track whether this call acquired a shared client reference,
270+
# so the error path only releases what this call actually acquired.
271+
acquired_client: CopilotClientWrapper | None = None
272+
181273
try:
182-
# Create provider (api_key is None for Copilot - uses GitHub auth)
183-
provider = CopilotSdkProvider(None, config, coordinator)
274+
timeout = float(config.get("timeout", DEFAULT_TIMEOUT))
275+
276+
# Acquire (or reuse) the process-level shared client.
277+
# All mounts in this Python process share one CopilotClientWrapper instance.
278+
acquired_client = await _acquire_shared_client(config, timeout)
279+
280+
# Create provider, injecting the shared client
281+
provider = CopilotSdkProvider(None, config, coordinator, client=acquired_client)
184282

185283
# Register with coordinator
186284
await coordinator.mount("providers", provider, name="github-copilot")
187285

188286
logger.info("[MOUNT] CopilotSdkProvider mounted successfully")
189287

190-
# Return cleanup function
288+
# Return cleanup function — releases the shared reference, not the provider
191289
async def cleanup() -> None:
192290
"""Cleanup function called when unmounting."""
193291
logger.info("[MOUNT] Unmounting CopilotSdkProvider...")
194-
await provider.close()
292+
await _release_shared_client()
195293
logger.info("[MOUNT] CopilotSdkProvider unmounted")
196294

197295
return cleanup
198296

199297
except Exception as e:
200298
logger.error(f"[MOUNT] Failed to mount CopilotSdkProvider: {e}")
299+
# Only release if this call successfully acquired a reference before the failure
300+
if acquired_client is not None:
301+
await _release_shared_client()
201302
return None
202303

203304

amplifier_module_provider_github_copilot/provider.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def __init__(
154154
api_key: str | None = None, # Unused, kept for signature compatibility
155155
config: dict[str, Any] | None = None,
156156
coordinator: Any | None = None, # ModuleCoordinator
157+
client: CopilotClientWrapper | None = None, # Shared singleton if provided
157158
):
158159
"""
159160
Initialize the Copilot SDK provider.
@@ -169,6 +170,10 @@ def __init__(
169170
- debug_truncate_length: Max length for debug output (default: 180)
170171
- cli_path: Path to Copilot CLI executable
171172
coordinator: Amplifier's ModuleCoordinator for hooks and events
173+
client: Optional pre-created CopilotClientWrapper to reuse. If None,
174+
a new wrapper is created (backward-compatible default). Pass the
175+
shared singleton from _acquire_shared_client() to avoid spawning
176+
multiple copilot subprocesses.
172177
173178
Note:
174179
Timeout is automatically selected based on whether extended_thinking
@@ -204,10 +209,15 @@ def __init__(
204209
# Streaming configuration (default: enabled like Anthropic provider)
205210
self._use_streaming = bool(config.get("use_streaming", True))
206211

207-
# Initialize client wrapper
208-
self._client = CopilotClientWrapper(
209-
config=config,
210-
timeout=self._timeout,
212+
# Initialize client wrapper — use injected singleton if provided,
213+
# otherwise create a new one (backward-compatible default).
214+
self._client = (
215+
client
216+
if client is not None
217+
else CopilotClientWrapper(
218+
config=config,
219+
timeout=self._timeout,
220+
)
211221
)
212222

213223
# Retry configuration (lighter defaults than HTTP-only providers
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
# Process-Level Singleton Fix for GitHub Copilot Provider
2+
3+
## Goal
4+
5+
Fix the GitHub Copilot provider so that all sub-agents spawned by a recipe within a single Amplifier session share one `CopilotClientWrapper` instance (and therefore one copilot CLI subprocess), rather than each spawning their own.
6+
7+
Tracked as: [Issue #7](https://github.com/microsoft/amplifier-module-provider-github-copilot/issues/7)
8+
9+
## Background
10+
11+
The copilot CLI binary is Electron-bundled at ~500 MB per process. With the current implementation, N sub-agents spawned by a recipe creates N subprocesses consuming N × ~500 MB RAM — the root cause of the OOM being investigated.
12+
13+
Two architectural facts constrain the solution:
14+
15+
- Sub-agents spawned by the `task` tool run as async coroutines in the **same Python process** and same asyncio event loop as the parent session (kernel-guaranteed)
16+
- Each sub-agent gets its own fresh `ModuleCoordinator` — coordinators are not shared across sessions
17+
18+
There is no kernel facility for cross-session resource sharing; resource pooling is module-level policy, consistent with Amplifier's "mechanism not policy" principle.
19+
20+
## Approach
21+
22+
A process-level singleton held as module-level state in `__init__.py`, with reference counting to ensure the subprocess lives exactly as long as at least one session is mounted. The change is entirely contained to `__init__.py` — no other files change.
23+
24+
Two other approaches were considered and rejected:
25+
26+
- **Process-level cap (semaphore)**: Limits blast radius but doesn't eliminate waste; more complex without being more correct
27+
- **Status quo + polish only**: Valid if the OOM were purely a runaway delegation bug, but the singleton is the right architecture independently — even legitimate heavy recipes waste N × 500 MB today
28+
29+
## Architecture
30+
31+
The change is surgical. `CopilotClientWrapper` itself doesn't change. The only change is in when and how many times it gets created.
32+
33+
**Before:**
34+
```
35+
mount() called → new CopilotClientWrapper() → new copilot subprocess
36+
mount() called → new CopilotClientWrapper() → new copilot subprocess
37+
mount() called → new CopilotClientWrapper() → new copilot subprocess
38+
```
39+
40+
**After:**
41+
```
42+
mount() called → _acquire_shared_client() → new copilot subprocess (ref=1)
43+
mount() called → _acquire_shared_client() → reuse existing (ref=2)
44+
mount() called → _acquire_shared_client() → reuse existing (ref=3)
45+
cleanup called → _release_shared_client() → ref=2
46+
cleanup called → _release_shared_client() → ref=1
47+
cleanup called → _release_shared_client() → ref=0 → subprocess shuts down
48+
```
49+
50+
**Blast radius:** `__init__.py` only. `CopilotClientWrapper`, `provider.py`, `tool_capture.py`, `_constants.py`, and `converters.py` are untouched.
51+
52+
## Components
53+
54+
### Module-Level Singleton State
55+
56+
Three module-level variables at the top of `__init__.py`:
57+
58+
```python
59+
_shared_client: CopilotClientWrapper | None = None
60+
_shared_client_refcount: int = 0
61+
_shared_client_lock: asyncio.Lock | None = None
62+
```
63+
64+
The lock is initialized lazily rather than at module load time. `asyncio.Lock()` must be created on the running event loop — creating it at import time can cause issues if the module is imported before an event loop exists (common in test environments). A `_get_lock()` helper handles this:
65+
66+
```python
67+
def _get_lock() -> asyncio.Lock:
68+
global _shared_client_lock
69+
if _shared_client_lock is None:
70+
_shared_client_lock = asyncio.Lock()
71+
return _shared_client_lock
72+
```
73+
74+
### Acquire and Release Functions
75+
76+
```python
77+
async def _acquire_shared_client(config, timeout) -> CopilotClientWrapper:
78+
global _shared_client, _shared_client_refcount
79+
async with _get_lock():
80+
if _shared_client is None:
81+
_shared_client = CopilotClientWrapper(config, timeout)
82+
_shared_client_refcount += 1
83+
return _shared_client
84+
85+
async def _release_shared_client() -> None:
86+
global _shared_client, _shared_client_refcount
87+
async with _get_lock():
88+
_shared_client_refcount -= 1
89+
if _shared_client_refcount <= 0:
90+
if _shared_client is not None:
91+
await _shared_client.close()
92+
_shared_client = None
93+
_shared_client_refcount = 0 # safety floor against accidental negatives
94+
```
95+
96+
The safety floor at zero matters: if cleanup is ever called more times than mount (e.g., in test teardown), a negative count must not prevent the next acquisition from creating a fresh client.
97+
98+
### Updated `mount()`
99+
100+
`mount()` changes in one place only — acquiring the shared client instead of constructing a new one directly:
101+
102+
```python
103+
async def mount(coordinator, config):
104+
timeout = config.get("timeout", DEFAULT_TIMEOUT)
105+
106+
client = await _acquire_shared_client(config, timeout)
107+
108+
provider = CopilotSdkProvider(client=client, config=config)
109+
coordinator.providers.register("github-copilot", provider)
110+
111+
async def cleanup():
112+
await _release_shared_client()
113+
114+
return cleanup
115+
```
116+
117+
Everything else — provider construction, coordinator registration, returning a cleanup callable — is unchanged.
118+
119+
## Data Flow
120+
121+
`_acquire_shared_client()` only uses `config` and `timeout` on the **first** call when it constructs the wrapper. Subsequent calls reuse the existing client regardless of the values passed. Sub-agents inherit bundle config from the parent, so in practice they all pass identical values.
122+
123+
If a sub-agent passes a different `timeout` than the first mount, `_acquire_shared_client()` emits a `DEBUG` log warning and returns the existing client unchanged. No exception is raised — the caller receives a working client and the discrepancy is visible in logs.
124+
125+
## Error Handling
126+
127+
**Config mismatch**: `DEBUG` warning logged, existing client returned, no exception raised.
128+
129+
**Concurrent mounts at startup**: Two sub-agents mounting simultaneously before the client exists are serialized by the `asyncio.Lock`. The second waiter finds `_shared_client` already populated, increments the refcount, and returns. Double-creation is not possible.
130+
131+
**Unclean shutdown** (SIGKILL / process crash): The refcount never reaches zero, so `close()` never fires and the copilot subprocess becomes an orphan. This is acceptable — the OS reclaims the subprocess when the parent Python process dies. A code comment will document this reasoning explicitly so future maintainers don't treat it as an oversight.
132+
133+
## Observability
134+
135+
Three events are emitted via `coordinator.hooks.emit()`, consistent with existing module patterns (e.g., `provider:tool_sequence_repaired`):
136+
137+
| Event | When | Payload |
138+
|---|---|---|
139+
| `github-copilot:subprocess_created` | First acquisition | `session_id` |
140+
| `github-copilot:subprocess_reused` | Subsequent acquisitions | `session_id`, `refcount` |
141+
| `github-copilot:subprocess_shutdown` | Refcount reaches zero | `session_id` |
142+
143+
These are visible in session logs without any extra configuration.
144+
145+
## Testing Strategy
146+
147+
New tests live in `tests/test_mount.py`. `CopilotClientWrapper.__init__` is mocked to count instantiations. A pytest fixture resets `_shared_client`, `_shared_client_refcount`, and `_shared_client_lock` to `None`/`0`/`None` before each test — required for isolation with module-level state.
148+
149+
| Test | What it verifies |
150+
|---|---|
151+
| **Singleton creation** | `mount()` once → one `CopilotClientWrapper` created, refcount=1 |
152+
| **Reuse** | `mount()` three times → one `CopilotClientWrapper` created, refcount=3 |
153+
| **Cleanup lifecycle** | Three mounts, three cleanups → `close()` called only on the third (refcount=0) |
154+
| **Concurrent mounts** | `asyncio.gather()` fires multiple `mount()` calls simultaneously → still one client (exercises the lock) |
155+
| **Config mismatch warning** | Mount with `timeout=300`, mount again with `timeout=600``DEBUG` warning emitted, no exception |
156+
157+
No integration tests are needed — the copilot subprocess is already mocked in the existing suite. These tests only exercise the reference counting logic, which is pure Python with no external dependencies. Existing tests for `CopilotClientWrapper` and `CopilotSdkProvider` are unaffected since neither class changes.
158+
159+
## Open Questions
160+
161+
None — design is fully validated and ready for implementation.

tests/conftest.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,36 @@ def mock_cache_home(tmp_path, monkeypatch):
3939
return tmp_path
4040

4141

42+
@pytest.fixture(autouse=True)
43+
def reset_singleton_state():
44+
"""Reset module-level singleton state before each test.
45+
46+
Required for test isolation: module-level variables persist across
47+
tests in the same pytest session. Without this reset, singleton tests
48+
bleed state into each other.
49+
50+
Uses hasattr guards because the singleton attributes don't exist yet
51+
until Task 4 adds them to __init__.py. Guards make this fixture safe
52+
to land before the implementation.
53+
"""
54+
import amplifier_module_provider_github_copilot as mod
55+
56+
# Guard: attributes may not exist until implementation is added (Task 4)
57+
if hasattr(mod, "_shared_client"):
58+
mod._shared_client = None # type: ignore[attr-defined]
59+
if hasattr(mod, "_shared_client_refcount"):
60+
mod._shared_client_refcount = 0 # type: ignore[attr-defined]
61+
if hasattr(mod, "_shared_client_lock"):
62+
mod._shared_client_lock = None # type: ignore[attr-defined]
63+
yield
64+
if hasattr(mod, "_shared_client"):
65+
mod._shared_client = None # type: ignore[attr-defined]
66+
if hasattr(mod, "_shared_client_refcount"):
67+
mod._shared_client_refcount = 0 # type: ignore[attr-defined]
68+
if hasattr(mod, "_shared_client_lock"):
69+
mod._shared_client_lock = None # type: ignore[attr-defined]
70+
71+
4272
# Fix for Windows asyncio cleanup issues causing KeyboardInterrupt
4373
# See: https://github.com/pytest-dev/pytest-asyncio/issues/671
4474
if sys.platform == "win32":

0 commit comments

Comments
 (0)