2424import logging
2525import os
2626import posixpath
27+ import shlex
2728import tarfile
2829import time
2930import uuid
116117_SPRITE_READY_POLL_INTERVAL_S = 1.0
117118_DEFAULT_MANIFEST_ROOT = cast (str , Manifest .model_fields ["root" ].default )
118119
120+ # Stdout sentinels used by ``hydrate_workspace`` to detect a partial tar
121+ # extract. Until sprite-env's WS layer reliably round-trips exit codes,
122+ # ``ExecResult.exit_code`` cannot be the only signal — a remote failure may
123+ # still surface as exit 0 over the wire. Anchoring the decision on stdout
124+ # (which the local shell controls before the WS hop) closes that gap.
125+ _HYDRATE_OK_SENTINEL = "__SPRITES_HYDRATE_OK__"
126+ _HYDRATE_FAIL_SENTINEL = "__SPRITES_HYDRATE_FAIL__"
127+
119128logger = logging .getLogger (__name__ )
120129
121130
@@ -257,6 +266,7 @@ class SpritesSandboxSessionState(SandboxSessionState):
257266 timeout_ms : int | None = None
258267 workspace_persistence : WorkspacePersistenceMode = "tar"
259268 idle_close_seconds : float = DEFAULT_SPRITES_IDLE_CLOSE_SECONDS
269+ wait_for_running_timeout_s : float = DEFAULT_SPRITES_WAIT_FOR_RUNNING_TIMEOUT_S
260270
261271
262272class SpritesSandboxSession (BaseSandboxSession ):
@@ -275,6 +285,7 @@ class SpritesSandboxSession(BaseSandboxSession):
275285 _last_activity_at : float
276286 _idle_close_seconds : float
277287 _idle_watch_task : asyncio .Task [None ] | None
288+ _inflight_op_count : int
278289
279290 def __init__ (
280291 self ,
@@ -303,6 +314,11 @@ def __init__(
303314 self ._last_activity_at = time .monotonic ()
304315 self ._idle_close_seconds = float (state .idle_close_seconds )
305316 self ._idle_watch_task = None
317+ # Tracks non-PTY exec/read/write operations currently using the control
318+ # connection. The idle watcher must skip closure while any are in-flight
319+ # so a long-running command (e.g. ``apt-get install``) is not cut off
320+ # mid-execution when it crosses the idle threshold.
321+ self ._inflight_op_count = 0
306322
307323 @classmethod
308324 def from_state (
@@ -374,6 +390,34 @@ async def _ensure_sprite(self) -> Sprite:
374390 self ._sprite = sprite
375391 return sprite
376392
393+ async def _try_attach_existing_sprite (self ) -> Sprite | None :
394+ """Look up the recorded sprite name and bind it without provisioning.
395+
396+ Used by ``resume()`` for both ``created_by_us=True`` and named-attach
397+ sessions to differentiate "the original sprite is still there" from
398+ "the original is gone and we need to fall through to a fresh create".
399+ Returns ``None`` only when the platform reports the sprite missing —
400+ any other error propagates so the caller can decide whether to retry
401+ or surface the failure.
402+ """
403+
404+ if self ._sprite is not None :
405+ return self ._sprite
406+
407+ client = self ._ensure_client_sync ()
408+ try :
409+ sprite : Sprite = await asyncio .to_thread (
410+ client .get_sprite , self .state .sprite_name
411+ )
412+ except NotFoundError :
413+ return None
414+ # Re-bind a Sprite handle with the live status snapshot as ``_sprite``;
415+ # ``client.sprite(name)`` builds the same handle but doesn't touch the
416+ # platform, so the get_sprite round-trip above is what proves
417+ # existence.
418+ self ._sprite = sprite
419+ return sprite
420+
377421 # Both ephemeral and named-attach paths now defer the wait-for-running poll
378422 # (and the URL/org-info refresh that comes with it) until the first I/O
379423 # operation runs ``_ensure_warm``. The platform auto-wakes paused sprites
@@ -423,39 +467,49 @@ def _maybe_start_idle_watch(self) -> None:
423467 self ._idle_watch_task = None
424468
425469 async def _idle_watch_loop (self ) -> None :
470+ # Loop forever, only exiting on cancel. Sleeping for the remaining
471+ # window each iteration keeps the watcher cheap, and looping (rather
472+ # than returning after a close-attempt) ensures we stay alive across
473+ # PTY/active-op skips so the next idle window still gets serviced
474+ # without depending on subsequent I/O to respawn us.
426475 try :
427476 while True :
428- # Sleep until the configured idle window elapses since the
429- # most-recent activity, re-checking each loop because activity
430- # may have happened during the sleep and reset the deadline.
431477 elapsed = time .monotonic () - self ._last_activity_at
432478 remaining = self ._idle_close_seconds - elapsed
433479 if remaining > 0 :
434480 await asyncio .sleep (remaining )
435481 continue
436- await self ._close_idle_control_connections ()
437- # Watcher exits; the next I/O calls ``_touch_activity`` which
438- # will respawn it.
439- return
482+ closed = await self ._close_idle_control_connections ()
483+ if closed :
484+ # Connections are closed; nothing to watch until a future
485+ # I/O call touches activity and respawns the watcher.
486+ return
487+ # Skipped (PTY active or non-PTY op in flight). Re-check after
488+ # one idle window — by then the active work may have finished.
489+ await asyncio .sleep (self ._idle_close_seconds )
440490 except asyncio .CancelledError :
441491 pass
442492
443- async def _close_idle_control_connections (self ) -> None :
493+ async def _close_idle_control_connections (self ) -> bool :
444494 """Close pooled control connections so the sprite can drop to ``warm``.
445495
446- Skipped when there are active PTY operations — those need their
447- connections kept alive.
496+ Skipped when there are active PTY operations or non-PTY exec/read/write
497+ operations in flight — those need their connections kept alive.
498+
499+ Returns ``True`` if connections were closed (or there was nothing to
500+ close), ``False`` if closure was skipped because work was active.
448501 """
449502
450- if self ._pty_processes :
451- return
503+ if self ._pty_processes or self . _inflight_op_count > 0 :
504+ return False
452505 sprite = self ._sprite
453506 if sprite is None :
454- return
507+ return True
455508 try :
456509 await sprite .close_control_connection ()
457510 except Exception :
458511 pass
512+ return True
459513
460514 def _build_sprite_config (self ) -> sprites .SpriteConfig | None :
461515 if (
@@ -492,7 +546,7 @@ async def _maybe_update_url_settings(self, sprite: Sprite) -> None:
492546
493547 async def _wait_for_sprite_running (self ) -> None :
494548 client = self ._ensure_client_sync ()
495- deadline_s = max (0.0 , float (self .state .timeout_ms or 0 ) / 1000.0 ) or float (
549+ deadline_s = max (0.0 , float (self .state .wait_for_running_timeout_s ) ) or float (
496550 DEFAULT_SPRITES_WAIT_FOR_RUNNING_TIMEOUT_S
497551 )
498552 loop = asyncio .get_event_loop ()
@@ -610,7 +664,10 @@ async def _prepare_backend_workspace(self) -> None:
610664 # try to ``chdir`` into it.
611665 root = PurePosixPath (posixpath .normpath (self .state .manifest .root ))
612666 result = await self ._exec_with_cwd (
613- ["mkdir" , "-p" , "--" , root .as_posix ()], cwd = None , timeout = 30.0
667+ ["mkdir" , "-p" , "--" , root .as_posix ()],
668+ cwd = None ,
669+ timeout = 30.0 ,
670+ apply_env = False ,
614671 )
615672 if not result .ok ():
616673 raise WorkspaceStartError (
@@ -684,6 +741,32 @@ async def shutdown(self) -> None:
684741 pass
685742 self ._client = None
686743
744+ async def _resolved_envs (self ) -> dict [str , str ]:
745+ """Merge per-session env (from options) with manifest-declared env vars.
746+
747+ Manifest values win on conflict because they are the more explicit
748+ configuration surface. Returned dict has only ``str`` values; any
749+ deferred ``EnvValue`` resolutions are awaited here.
750+ """
751+
752+ manifest_envs = await self .state .manifest .environment .resolve ()
753+ session_envs = self .state .env or {}
754+ return {** session_envs , ** manifest_envs }
755+
756+ @staticmethod
757+ def _wrap_with_env (command : list [str ], envs : dict [str , str ]) -> list [str ]:
758+ """Prepend an ``env --`` invocation so the remote process inherits ``envs``.
759+
760+ Uses argv-form so we don't depend on a shell — the WS exec path runs
761+ ``execvp`` directly. ``env --`` ensures the following ``NAME=VALUE``
762+ tokens are parsed as env, not as the program name.
763+ """
764+
765+ if not envs :
766+ return command
767+ prefix = ["env" , "--" , * (f"{ k } ={ v } " for k , v in envs .items ())]
768+ return [* prefix , * command ]
769+
687770 async def _exec_internal (
688771 self ,
689772 * command : str | Path ,
@@ -698,15 +781,25 @@ async def _exec_with_cwd(
698781 * ,
699782 cwd : str | None ,
700783 timeout : float | None ,
784+ apply_env : bool = True ,
701785 ) -> ExecResult :
702786 normalized = [str (part ) for part in command ]
703787 if not normalized :
704788 return ExecResult (stdout = b"" , stderr = b"" , exit_code = 0 )
705789
790+ if apply_env :
791+ envs = await self ._resolved_envs ()
792+ if envs :
793+ normalized = self ._wrap_with_env (normalized , envs )
794+
706795 await self ._ensure_warm ()
707796
708797 control : ControlConnection | None = None
709798 op_conn : OpConn | None = None
799+ # Mark this op as in-flight so the idle watcher won't close the
800+ # control connection mid-execution (e.g., long-running ``apt-get
801+ # install`` from the lazy-mount path crossing ``idle_close_seconds``).
802+ self ._inflight_op_count += 1
710803 try :
711804 control = await self ._ensure_control ()
712805 try :
@@ -747,6 +840,8 @@ async def _exec_with_cwd(
747840 cause = exc ,
748841 ) from exc
749842 finally :
843+ self ._inflight_op_count -= 1
844+ self ._touch_activity ()
750845 if control is not None :
751846 self ._release_control (control )
752847
@@ -774,6 +869,9 @@ async def pty_exec_start(
774869 max_output_tokens : int | None = None ,
775870 ) -> PtyExecUpdate :
776871 sanitized_command = self ._prepare_exec_command (* command , shell = shell , user = user )
872+ envs = await self ._resolved_envs ()
873+ if envs :
874+ sanitized_command = self ._wrap_with_env (sanitized_command , envs )
777875 # ``_ensure_control`` will lazily call ``_ensure_sprite``; no extra await here.
778876 await self ._ensure_warm ()
779877
@@ -1213,16 +1311,28 @@ async def hydrate_workspace(self, data: io.IOBase) -> None:
12131311 await asyncio .to_thread (
12141312 lambda : (sprite .filesystem ("/" ) / archive_path .as_posix ()).write_bytes (bytes (raw ))
12151313 )
1216- extract_cmd = ("tar" , "xf" , archive_path .as_posix (), "-C" , root .as_posix ())
1217- result = await self .exec (* extract_cmd , shell = False )
1218- if not result .ok ():
1314+ # Wrap the extract in a stdout-sentinel so a partial extract is
1315+ # detectable even when the WS exit-code wire drops failures (see
1316+ # ``mounts.py`` for the matching pattern). The sentinel runs on
1317+ # the remote shell, so the success/failure decision is driven by
1318+ # the actual ``tar`` exit status, not the round-tripped one.
1319+ extract_script = (
1320+ f"tar xf { shlex .quote (archive_path .as_posix ())} "
1321+ f"-C { shlex .quote (root .as_posix ())} "
1322+ f"&& printf %s { _HYDRATE_OK_SENTINEL } "
1323+ f"|| (rc=$?; printf %s { _HYDRATE_FAIL_SENTINEL } ; exit $rc)"
1324+ )
1325+ result = await self .exec ("sh" , "-c" , extract_script , shell = False )
1326+ stdout_text = result .stdout .decode ("utf-8" , errors = "replace" )
1327+ extract_succeeded = _HYDRATE_OK_SENTINEL in stdout_text
1328+ if not extract_succeeded or not result .ok ():
12191329 raise WorkspaceArchiveWriteError (
12201330 path = root ,
12211331 context = {
12221332 "backend" : "sprites" ,
12231333 "sprite_name" : self .state .sprite_name ,
12241334 "exit_code" : result .exit_code ,
1225- "stdout" : result . stdout . decode ( "utf-8" , errors = "replace" ) ,
1335+ "stdout" : stdout_text ,
12261336 "stderr" : result .stderr .decode ("utf-8" , errors = "replace" ),
12271337 },
12281338 )
@@ -1304,6 +1414,8 @@ async def create(
13041414 env = dict (options .env or {}) or None ,
13051415 timeout_ms = options .timeout_ms ,
13061416 workspace_persistence = options .workspace_persistence ,
1417+ idle_close_seconds = options .idle_close_seconds ,
1418+ wait_for_running_timeout_s = options .wait_for_running_timeout_s ,
13071419 )
13081420
13091421 inner = SpritesSandboxSession .from_state (state , token = self ._token , base_url = self ._base_url )
@@ -1326,16 +1438,48 @@ async def resume(self, state: SandboxSessionState) -> SandboxSession:
13261438 raise TypeError ("SpritesSandboxClient.resume expects a SpritesSandboxSessionState" )
13271439
13281440 inner = SpritesSandboxSession .from_state (state , token = self ._token , base_url = self ._base_url )
1441+
1442+ # Always try to reattach to the recorded sprite first, regardless of
1443+ # ``created_by_us``. A successful reattach preserves the live
1444+ # workspace and avoids duplicating resources; only a true reattach
1445+ # warrants ``_set_start_state_preserved(True)``.
13291446 try :
1330- await inner ._ensure_sprite ()
1447+ attached = await inner ._try_attach_existing_sprite ()
1448+ except (NetworkError , AuthenticationError , SpriteError ) as exc :
1449+ # Treat platform errors here as fatal even for ephemeral sessions:
1450+ # we have no signal that the sprite is gone, just that the call
1451+ # failed, and silently recreating risks orphaning the original.
1452+ raise WorkspaceStartError (
1453+ path = posix_path_as_path (coerce_posix_path (state .manifest .root )),
1454+ context = {
1455+ "backend" : "sprites" ,
1456+ "sprite_name" : state .sprite_name ,
1457+ "reason" : "reattach_failed" ,
1458+ },
1459+ cause = exc ,
1460+ ) from exc
1461+
1462+ if attached is not None :
13311463 inner ._set_start_state_preserved (True )
1332- except WorkspaceStartError :
1333- if not state .created_by_us :
1334- raise
1335- # Fall through to fresh start; ``_ensure_sprite`` will be retried by the
1336- # session's own ``start()`` lifecycle and will recreate the sprite.
1337- inner ._sprite = None
1338- state .workspace_root_ready = False
1464+ return self ._wrap_session (inner , instrumentation = self ._instrumentation )
1465+
1466+ # The recorded sprite is gone. Named-attach sessions cannot be
1467+ # silently replaced — the caller asked for a specific sprite by name.
1468+ if not state .created_by_us :
1469+ raise WorkspaceStartError (
1470+ path = posix_path_as_path (coerce_posix_path (state .manifest .root )),
1471+ context = {
1472+ "backend" : "sprites" ,
1473+ "sprite_name" : state .sprite_name ,
1474+ "reason" : "sprite_not_found" ,
1475+ },
1476+ )
1477+
1478+ # Ephemeral session whose original sprite was deleted: replace it with
1479+ # a fresh provision. Workspace continuity is lost, so clear the
1480+ # readiness flag and do NOT mark start state as preserved — the
1481+ # session's ``start()`` lifecycle must run a full manifest apply.
1482+ state .workspace_root_ready = False
13391483 return self ._wrap_session (inner , instrumentation = self ._instrumentation )
13401484
13411485 def deserialize_session_state (self , payload : dict [str , object ]) -> SandboxSessionState :
0 commit comments