Skip to content

Commit fddef4a

Browse files
committed
fix(sprites): address PR review feedback on session lifecycle
Address Codex and reviewer comments on the Sprites sandbox PR: - Thread `wait_for_running_timeout_s` and `idle_close_seconds` from options through `SpritesSandboxSessionState` so the documented knobs actually drive `_wait_for_sprite_running` and the idle watcher. - Track in-flight non-PTY exec ops so the idle watcher does not yank the control connection mid-execution (e.g., long-running `apt-get` installs from the lazy mount path). - Keep the idle watcher alive across PTY/active-op skips: it loops for another idle window instead of exiting, so a finished PTY does not leave a dangling control connection until the next I/O event. - On `resume()`, attempt reattach via `get_sprite` first regardless of ownership. Mark preserved state only on a true reattach; clear `workspace_root_ready` on ephemeral replacement; raise `WorkspaceStartError` for named-attach when the sprite is gone, and for any transient platform error during reattach. - Wrap `hydrate_workspace`'s tar extract in a stdout sentinel so a partial extract is detectable even while sprite-env's WS layer drops exit codes. - Apply `manifest.environment` (merged with per-session env) to both `_exec_with_cwd` and `pty_exec_start` via an `env --` argv prefix, matching the daytona/runloop/e2b pattern.
1 parent 03a19c6 commit fddef4a

3 files changed

Lines changed: 559 additions & 28 deletions

File tree

src/agents/extensions/sandbox/sprites/sandbox.py

Lines changed: 171 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import logging
2525
import os
2626
import posixpath
27+
import shlex
2728
import tarfile
2829
import time
2930
import uuid
@@ -116,6 +117,14 @@
116117
_SPRITE_READY_POLL_INTERVAL_S = 1.0
117118
_DEFAULT_MANIFEST_ROOT = cast(str, Manifest.model_fields["root"].default)
118119

120+
# Stdout sentinels used by ``hydrate_workspace`` to detect a partial tar
121+
# extract. Until sprite-env's WS layer reliably round-trips exit codes,
122+
# ``ExecResult.exit_code`` cannot be the only signal — a remote failure may
123+
# still surface as exit 0 over the wire. Anchoring the decision on stdout
124+
# (which the local shell controls before the WS hop) closes that gap.
125+
_HYDRATE_OK_SENTINEL = "__SPRITES_HYDRATE_OK__"
126+
_HYDRATE_FAIL_SENTINEL = "__SPRITES_HYDRATE_FAIL__"
127+
119128
logger = logging.getLogger(__name__)
120129

121130

@@ -257,6 +266,7 @@ class SpritesSandboxSessionState(SandboxSessionState):
257266
timeout_ms: int | None = None
258267
workspace_persistence: WorkspacePersistenceMode = "tar"
259268
idle_close_seconds: float = DEFAULT_SPRITES_IDLE_CLOSE_SECONDS
269+
wait_for_running_timeout_s: float = DEFAULT_SPRITES_WAIT_FOR_RUNNING_TIMEOUT_S
260270

261271

262272
class SpritesSandboxSession(BaseSandboxSession):
@@ -275,6 +285,7 @@ class SpritesSandboxSession(BaseSandboxSession):
275285
_last_activity_at: float
276286
_idle_close_seconds: float
277287
_idle_watch_task: asyncio.Task[None] | None
288+
_inflight_op_count: int
278289

279290
def __init__(
280291
self,
@@ -303,6 +314,11 @@ def __init__(
303314
self._last_activity_at = time.monotonic()
304315
self._idle_close_seconds = float(state.idle_close_seconds)
305316
self._idle_watch_task = None
317+
# Tracks non-PTY exec/read/write operations currently using the control
318+
# connection. The idle watcher must skip closure while any are in-flight
319+
# so a long-running command (e.g. ``apt-get install``) is not cut off
320+
# mid-execution when it crosses the idle threshold.
321+
self._inflight_op_count = 0
306322

307323
@classmethod
308324
def from_state(
@@ -374,6 +390,34 @@ async def _ensure_sprite(self) -> Sprite:
374390
self._sprite = sprite
375391
return sprite
376392

393+
async def _try_attach_existing_sprite(self) -> Sprite | None:
394+
"""Look up the recorded sprite name and bind it without provisioning.
395+
396+
Used by ``resume()`` for both ``created_by_us=True`` and named-attach
397+
sessions to differentiate "the original sprite is still there" from
398+
"the original is gone and we need to fall through to a fresh create".
399+
Returns ``None`` only when the platform reports the sprite missing —
400+
any other error propagates so the caller can decide whether to retry
401+
or surface the failure.
402+
"""
403+
404+
if self._sprite is not None:
405+
return self._sprite
406+
407+
client = self._ensure_client_sync()
408+
try:
409+
sprite: Sprite = await asyncio.to_thread(
410+
client.get_sprite, self.state.sprite_name
411+
)
412+
except NotFoundError:
413+
return None
414+
# Re-bind a Sprite handle with the live status snapshot as ``_sprite``;
415+
# ``client.sprite(name)`` builds the same handle but doesn't touch the
416+
# platform, so the get_sprite round-trip above is what proves
417+
# existence.
418+
self._sprite = sprite
419+
return sprite
420+
377421
# Both ephemeral and named-attach paths now defer the wait-for-running poll
378422
# (and the URL/org-info refresh that comes with it) until the first I/O
379423
# operation runs ``_ensure_warm``. The platform auto-wakes paused sprites
@@ -423,39 +467,49 @@ def _maybe_start_idle_watch(self) -> None:
423467
self._idle_watch_task = None
424468

425469
async def _idle_watch_loop(self) -> None:
470+
# Loop forever, only exiting on cancel. Sleeping for the remaining
471+
# window each iteration keeps the watcher cheap, and looping (rather
472+
# than returning after a close-attempt) ensures we stay alive across
473+
# PTY/active-op skips so the next idle window still gets serviced
474+
# without depending on subsequent I/O to respawn us.
426475
try:
427476
while True:
428-
# Sleep until the configured idle window elapses since the
429-
# most-recent activity, re-checking each loop because activity
430-
# may have happened during the sleep and reset the deadline.
431477
elapsed = time.monotonic() - self._last_activity_at
432478
remaining = self._idle_close_seconds - elapsed
433479
if remaining > 0:
434480
await asyncio.sleep(remaining)
435481
continue
436-
await self._close_idle_control_connections()
437-
# Watcher exits; the next I/O calls ``_touch_activity`` which
438-
# will respawn it.
439-
return
482+
closed = await self._close_idle_control_connections()
483+
if closed:
484+
# Connections are closed; nothing to watch until a future
485+
# I/O call touches activity and respawns the watcher.
486+
return
487+
# Skipped (PTY active or non-PTY op in flight). Re-check after
488+
# one idle window — by then the active work may have finished.
489+
await asyncio.sleep(self._idle_close_seconds)
440490
except asyncio.CancelledError:
441491
pass
442492

443-
async def _close_idle_control_connections(self) -> None:
493+
async def _close_idle_control_connections(self) -> bool:
444494
"""Close pooled control connections so the sprite can drop to ``warm``.
445495
446-
Skipped when there are active PTY operations — those need their
447-
connections kept alive.
496+
Skipped when there are active PTY operations or non-PTY exec/read/write
497+
operations in flight — those need their connections kept alive.
498+
499+
Returns ``True`` if connections were closed (or there was nothing to
500+
close), ``False`` if closure was skipped because work was active.
448501
"""
449502

450-
if self._pty_processes:
451-
return
503+
if self._pty_processes or self._inflight_op_count > 0:
504+
return False
452505
sprite = self._sprite
453506
if sprite is None:
454-
return
507+
return True
455508
try:
456509
await sprite.close_control_connection()
457510
except Exception:
458511
pass
512+
return True
459513

460514
def _build_sprite_config(self) -> sprites.SpriteConfig | None:
461515
if (
@@ -492,7 +546,7 @@ async def _maybe_update_url_settings(self, sprite: Sprite) -> None:
492546

493547
async def _wait_for_sprite_running(self) -> None:
494548
client = self._ensure_client_sync()
495-
deadline_s = max(0.0, float(self.state.timeout_ms or 0) / 1000.0) or float(
549+
deadline_s = max(0.0, float(self.state.wait_for_running_timeout_s)) or float(
496550
DEFAULT_SPRITES_WAIT_FOR_RUNNING_TIMEOUT_S
497551
)
498552
loop = asyncio.get_event_loop()
@@ -610,7 +664,10 @@ async def _prepare_backend_workspace(self) -> None:
610664
# try to ``chdir`` into it.
611665
root = PurePosixPath(posixpath.normpath(self.state.manifest.root))
612666
result = await self._exec_with_cwd(
613-
["mkdir", "-p", "--", root.as_posix()], cwd=None, timeout=30.0
667+
["mkdir", "-p", "--", root.as_posix()],
668+
cwd=None,
669+
timeout=30.0,
670+
apply_env=False,
614671
)
615672
if not result.ok():
616673
raise WorkspaceStartError(
@@ -684,6 +741,32 @@ async def shutdown(self) -> None:
684741
pass
685742
self._client = None
686743

744+
async def _resolved_envs(self) -> dict[str, str]:
745+
"""Merge per-session env (from options) with manifest-declared env vars.
746+
747+
Manifest values win on conflict because they are the more explicit
748+
configuration surface. Returned dict has only ``str`` values; any
749+
deferred ``EnvValue`` resolutions are awaited here.
750+
"""
751+
752+
manifest_envs = await self.state.manifest.environment.resolve()
753+
session_envs = self.state.env or {}
754+
return {**session_envs, **manifest_envs}
755+
756+
@staticmethod
757+
def _wrap_with_env(command: list[str], envs: dict[str, str]) -> list[str]:
758+
"""Prepend an ``env --`` invocation so the remote process inherits ``envs``.
759+
760+
Uses argv-form so we don't depend on a shell — the WS exec path runs
761+
``execvp`` directly. ``env --`` ensures the following ``NAME=VALUE``
762+
tokens are parsed as env, not as the program name.
763+
"""
764+
765+
if not envs:
766+
return command
767+
prefix = ["env", "--", *(f"{k}={v}" for k, v in envs.items())]
768+
return [*prefix, *command]
769+
687770
async def _exec_internal(
688771
self,
689772
*command: str | Path,
@@ -698,15 +781,25 @@ async def _exec_with_cwd(
698781
*,
699782
cwd: str | None,
700783
timeout: float | None,
784+
apply_env: bool = True,
701785
) -> ExecResult:
702786
normalized = [str(part) for part in command]
703787
if not normalized:
704788
return ExecResult(stdout=b"", stderr=b"", exit_code=0)
705789

790+
if apply_env:
791+
envs = await self._resolved_envs()
792+
if envs:
793+
normalized = self._wrap_with_env(normalized, envs)
794+
706795
await self._ensure_warm()
707796

708797
control: ControlConnection | None = None
709798
op_conn: OpConn | None = None
799+
# Mark this op as in-flight so the idle watcher won't close the
800+
# control connection mid-execution (e.g., long-running ``apt-get
801+
# install`` from the lazy-mount path crossing ``idle_close_seconds``).
802+
self._inflight_op_count += 1
710803
try:
711804
control = await self._ensure_control()
712805
try:
@@ -747,6 +840,8 @@ async def _exec_with_cwd(
747840
cause=exc,
748841
) from exc
749842
finally:
843+
self._inflight_op_count -= 1
844+
self._touch_activity()
750845
if control is not None:
751846
self._release_control(control)
752847

@@ -774,6 +869,9 @@ async def pty_exec_start(
774869
max_output_tokens: int | None = None,
775870
) -> PtyExecUpdate:
776871
sanitized_command = self._prepare_exec_command(*command, shell=shell, user=user)
872+
envs = await self._resolved_envs()
873+
if envs:
874+
sanitized_command = self._wrap_with_env(sanitized_command, envs)
777875
# ``_ensure_control`` will lazily call ``_ensure_sprite``; no extra await here.
778876
await self._ensure_warm()
779877

@@ -1213,16 +1311,28 @@ async def hydrate_workspace(self, data: io.IOBase) -> None:
12131311
await asyncio.to_thread(
12141312
lambda: (sprite.filesystem("/") / archive_path.as_posix()).write_bytes(bytes(raw))
12151313
)
1216-
extract_cmd = ("tar", "xf", archive_path.as_posix(), "-C", root.as_posix())
1217-
result = await self.exec(*extract_cmd, shell=False)
1218-
if not result.ok():
1314+
# Wrap the extract in a stdout-sentinel so a partial extract is
1315+
# detectable even when the WS exit-code wire drops failures (see
1316+
# ``mounts.py`` for the matching pattern). The sentinel runs on
1317+
# the remote shell, so the success/failure decision is driven by
1318+
# the actual ``tar`` exit status, not the round-tripped one.
1319+
extract_script = (
1320+
f"tar xf {shlex.quote(archive_path.as_posix())} "
1321+
f"-C {shlex.quote(root.as_posix())} "
1322+
f"&& printf %s {_HYDRATE_OK_SENTINEL} "
1323+
f"|| (rc=$?; printf %s {_HYDRATE_FAIL_SENTINEL}; exit $rc)"
1324+
)
1325+
result = await self.exec("sh", "-c", extract_script, shell=False)
1326+
stdout_text = result.stdout.decode("utf-8", errors="replace")
1327+
extract_succeeded = _HYDRATE_OK_SENTINEL in stdout_text
1328+
if not extract_succeeded or not result.ok():
12191329
raise WorkspaceArchiveWriteError(
12201330
path=root,
12211331
context={
12221332
"backend": "sprites",
12231333
"sprite_name": self.state.sprite_name,
12241334
"exit_code": result.exit_code,
1225-
"stdout": result.stdout.decode("utf-8", errors="replace"),
1335+
"stdout": stdout_text,
12261336
"stderr": result.stderr.decode("utf-8", errors="replace"),
12271337
},
12281338
)
@@ -1304,6 +1414,8 @@ async def create(
13041414
env=dict(options.env or {}) or None,
13051415
timeout_ms=options.timeout_ms,
13061416
workspace_persistence=options.workspace_persistence,
1417+
idle_close_seconds=options.idle_close_seconds,
1418+
wait_for_running_timeout_s=options.wait_for_running_timeout_s,
13071419
)
13081420

13091421
inner = SpritesSandboxSession.from_state(state, token=self._token, base_url=self._base_url)
@@ -1326,16 +1438,48 @@ async def resume(self, state: SandboxSessionState) -> SandboxSession:
13261438
raise TypeError("SpritesSandboxClient.resume expects a SpritesSandboxSessionState")
13271439

13281440
inner = SpritesSandboxSession.from_state(state, token=self._token, base_url=self._base_url)
1441+
1442+
# Always try to reattach to the recorded sprite first, regardless of
1443+
# ``created_by_us``. A successful reattach preserves the live
1444+
# workspace and avoids duplicating resources; only a true reattach
1445+
# warrants ``_set_start_state_preserved(True)``.
13291446
try:
1330-
await inner._ensure_sprite()
1447+
attached = await inner._try_attach_existing_sprite()
1448+
except (NetworkError, AuthenticationError, SpriteError) as exc:
1449+
# Treat platform errors here as fatal even for ephemeral sessions:
1450+
# we have no signal that the sprite is gone, just that the call
1451+
# failed, and silently recreating risks orphaning the original.
1452+
raise WorkspaceStartError(
1453+
path=posix_path_as_path(coerce_posix_path(state.manifest.root)),
1454+
context={
1455+
"backend": "sprites",
1456+
"sprite_name": state.sprite_name,
1457+
"reason": "reattach_failed",
1458+
},
1459+
cause=exc,
1460+
) from exc
1461+
1462+
if attached is not None:
13311463
inner._set_start_state_preserved(True)
1332-
except WorkspaceStartError:
1333-
if not state.created_by_us:
1334-
raise
1335-
# Fall through to fresh start; ``_ensure_sprite`` will be retried by the
1336-
# session's own ``start()`` lifecycle and will recreate the sprite.
1337-
inner._sprite = None
1338-
state.workspace_root_ready = False
1464+
return self._wrap_session(inner, instrumentation=self._instrumentation)
1465+
1466+
# The recorded sprite is gone. Named-attach sessions cannot be
1467+
# silently replaced — the caller asked for a specific sprite by name.
1468+
if not state.created_by_us:
1469+
raise WorkspaceStartError(
1470+
path=posix_path_as_path(coerce_posix_path(state.manifest.root)),
1471+
context={
1472+
"backend": "sprites",
1473+
"sprite_name": state.sprite_name,
1474+
"reason": "sprite_not_found",
1475+
},
1476+
)
1477+
1478+
# Ephemeral session whose original sprite was deleted: replace it with
1479+
# a fresh provision. Workspace continuity is lost, so clear the
1480+
# readiness flag and do NOT mark start state as preserved — the
1481+
# session's ``start()`` lifecycle must run a full manifest apply.
1482+
state.workspace_root_ready = False
13391483
return self._wrap_session(inner, instrumentation=self._instrumentation)
13401484

13411485
def deserialize_session_state(self, payload: dict[str, object]) -> SandboxSessionState:

0 commit comments

Comments
 (0)