Skip to content

Commit 9de943a

Browse files
Danelegendclaude
andcommitted
feat(sessions): long-lived session pods with manual lifecycle
Adds two new routes: * POST /v1/sessions creates a long-lived executor pod/container, optionally pre-staging files from /v1/files. * DELETE /v1/sessions/{id} tears the session down. Both Docker and Kubernetes backends are supported. Sessions are labeled ``app=code-interpreter,component=session`` so they can be enumerated for debugging, and named ``code-session-<uuid>`` so the delete path can sanity-check that callers aren't accidentally targeting other resources. Known limitation: this PR has no automatic cleanup. A session that the caller forgets to DELETE will live for the configured idle window (``SESSION_MAX_LIFETIME_SECONDS`` = 24h). The follow-up PR adds per-session TTL enforcement plus a reaper for crash recovery. Tests cover both backends and the route layer (success, 404 mapping, 501 mapping, file resolution, prefix-based safety check on delete). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 59fafdb commit 9de943a

8 files changed

Lines changed: 606 additions & 4 deletions

File tree

code-interpreter/app/api/routes.py

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88

99
from app.app_configs import get_settings
1010
from app.models.schemas import (
11+
CreateSessionRequest,
12+
CreateSessionResponse,
13+
ExecuteFile,
1114
ExecuteRequest,
1215
ExecuteResponse,
1316
FileMetadataResponse,
@@ -19,7 +22,7 @@
1922
WorkspaceFile,
2023
)
2124
from app.services.executor_base import EntryKind, StreamChunk, StreamResult, WorkspaceEntry
22-
from app.services.executor_factory import execute_python, execute_python_streaming
25+
from app.services.executor_factory import execute_python, execute_python_streaming, get_executor
2326
from app.services.file_storage import FileStorageService
2427

2528
router = APIRouter()
@@ -46,8 +49,8 @@ def _validate_timeout(req: ExecuteRequest) -> None:
4649
)
4750

4851

49-
def _stage_request_files(
50-
req: ExecuteRequest,
52+
def _resolve_uploaded_files(
53+
files: list[ExecuteFile],
5154
storage: FileStorageService,
5255
) -> tuple[list[tuple[str, bytes]], dict[str, bytes]]:
5356
"""Resolve uploaded file IDs into content for the executor.
@@ -56,7 +59,7 @@ def _stage_request_files(
5659
"""
5760
staged_files: list[tuple[str, bytes]] = []
5861
input_files_map: dict[str, bytes] = {}
59-
for file in req.files:
62+
for file in files:
6063
try:
6164
content, _ = storage.get_file(file.file_id)
6265
except FileNotFoundError as exc:
@@ -69,6 +72,14 @@ def _stage_request_files(
6972
return staged_files, input_files_map
7073

7174

75+
def _stage_request_files(
76+
req: ExecuteRequest,
77+
storage: FileStorageService,
78+
) -> tuple[list[tuple[str, bytes]], dict[str, bytes]]:
79+
"""Resolve uploaded file IDs into content for the executor."""
80+
return _resolve_uploaded_files(req.files, storage)
81+
82+
7283
def _save_workspace_files(
7384
entries: tuple[WorkspaceEntry, ...],
7485
input_files_map: dict[str, bytes],
@@ -248,3 +259,57 @@ def delete_file(file_id: str) -> Response:
248259
)
249260

250261
return Response(status_code=status.HTTP_204_NO_CONTENT)
262+
263+
264+
@router.post(
265+
"/sessions",
266+
response_model=CreateSessionResponse,
267+
status_code=status.HTTP_201_CREATED,
268+
)
269+
def create_session(req: CreateSessionRequest) -> CreateSessionResponse:
270+
"""Create a long-lived code-executor pod.
271+
272+
The session must be torn down explicitly via DELETE /v1/sessions/{id}.
273+
"""
274+
settings = get_settings()
275+
storage = get_file_storage()
276+
staged_files, _ = _resolve_uploaded_files(req.files, storage)
277+
278+
try:
279+
info = get_executor().create_session(
280+
files=staged_files,
281+
cpu_time_limit_sec=settings.cpu_time_limit_sec,
282+
memory_limit_mb=settings.memory_limit_mb,
283+
)
284+
except NotImplementedError as exc:
285+
raise HTTPException(
286+
status_code=status.HTTP_501_NOT_IMPLEMENTED,
287+
detail=str(exc),
288+
) from exc
289+
except ValueError as exc:
290+
raise HTTPException(
291+
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
292+
detail=str(exc),
293+
) from exc
294+
295+
return CreateSessionResponse(session_id=info.session_id)
296+
297+
298+
@router.delete("/sessions/{session_id}", status_code=status.HTTP_204_NO_CONTENT)
299+
def delete_session(session_id: str) -> Response:
300+
"""Tear down a session pod by ID."""
301+
try:
302+
deleted = get_executor().delete_session(session_id)
303+
except NotImplementedError as exc:
304+
raise HTTPException(
305+
status_code=status.HTTP_501_NOT_IMPLEMENTED,
306+
detail=str(exc),
307+
) from exc
308+
309+
if not deleted:
310+
raise HTTPException(
311+
status_code=status.HTTP_404_NOT_FOUND,
312+
detail=f"Session '{session_id}' not found",
313+
)
314+
315+
return Response(status_code=status.HTTP_204_NO_CONTENT)

code-interpreter/app/models/schemas.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,3 +120,14 @@ class ListFilesResponse(BaseModel):
120120
class HealthResponse(BaseModel):
121121
status: Literal["ok", "error"]
122122
message: StrictStr | None = None
123+
124+
125+
class CreateSessionRequest(BaseModel):
126+
files: list[ExecuteFile] = Field(
127+
default_factory=list,
128+
description="Files to stage in the session workspace at create time.",
129+
)
130+
131+
132+
class CreateSessionResponse(BaseModel):
133+
session_id: StrictStr = Field(..., description="Identifier for the session pod/container.")

code-interpreter/app/services/executor_base.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,18 @@ class HealthCheck:
106106
message: str | None = None
107107

108108

109+
@dataclass(frozen=True, slots=True)
110+
class SessionInfo:
111+
"""Identifying information for a long-lived session."""
112+
113+
session_id: str
114+
115+
116+
SESSION_NAME_PREFIX = "code-session-"
117+
SESSION_APP_LABEL = "code-interpreter"
118+
SESSION_COMPONENT_LABEL = "session"
119+
120+
109121
class ExecutorProtocol(Protocol):
110122
def execute_python(
111123
self,
@@ -168,6 +180,24 @@ def execute_python_streaming(
168180
"""
169181
raise NotImplementedError(f"{type(self).__name__} does not support streaming execution")
170182

183+
def create_session(
184+
self,
185+
*,
186+
files: Sequence[tuple[str, bytes]] | None = None,
187+
cpu_time_limit_sec: int | None = None,
188+
memory_limit_mb: int | None = None,
189+
) -> SessionInfo:
190+
"""Create a long-lived execution environment.
191+
192+
Returns identifying information for the session. The caller is
193+
responsible for invoking ``delete_session`` when finished.
194+
"""
195+
raise NotImplementedError(f"{type(self).__name__} does not support sessions")
196+
197+
def delete_session(self, session_id: str) -> bool:
198+
"""Tear down a session by ID. Returns True if found and deleted."""
199+
raise NotImplementedError(f"{type(self).__name__} does not support sessions")
200+
171201
@staticmethod
172202
def truncate_output(stream: bytes, max_bytes: int) -> str:
173203
if len(stream) <= max_bytes:

code-interpreter/app/services/executor_docker.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,14 @@
2121
PYTHON_EXECUTOR_DOCKER_RUN_ARGS,
2222
)
2323
from app.services.executor_base import (
24+
SESSION_APP_LABEL,
25+
SESSION_COMPONENT_LABEL,
26+
SESSION_NAME_PREFIX,
2427
BaseExecutor,
2528
EntryKind,
2629
ExecutionResult,
2730
HealthCheck,
31+
SessionInfo,
2832
StreamChunk,
2933
StreamEvent,
3034
StreamResult,
@@ -34,6 +38,10 @@
3438

3539
logger = logging.getLogger(__name__)
3640

41+
# Sessions keep their idle container alive for at most this many seconds; a
42+
# follow-up PR replaces this with a per-session TTL plus a reaper.
43+
SESSION_MAX_LIFETIME_SECONDS = 24 * 60 * 60
44+
3745

3846
@dataclass
3947
class _ExecContext:
@@ -394,6 +402,57 @@ def _run_in_container(
394402
finally:
395403
self._kill_container(container_name)
396404

405+
def create_session(
406+
self,
407+
*,
408+
files: Sequence[tuple[str, bytes]] | None = None,
409+
cpu_time_limit_sec: int | None = None,
410+
memory_limit_mb: int | None = None,
411+
) -> SessionInfo:
412+
container_name = f"{SESSION_NAME_PREFIX}{uuid.uuid4().hex}"
413+
414+
cmd = self._build_run_command(
415+
container_name=container_name,
416+
cpu_time_limit_sec=cpu_time_limit_sec,
417+
memory_limit_mb=memory_limit_mb,
418+
sleep_seconds=SESSION_MAX_LIFETIME_SECONDS,
419+
labels={
420+
"app": SESSION_APP_LABEL,
421+
"component": SESSION_COMPONENT_LABEL,
422+
},
423+
)
424+
start_proc = subprocess.run(cmd, capture_output=True, text=True) # nosec B603
425+
if start_proc.returncode != 0:
426+
raise RuntimeError(f"Failed to start session container: {start_proc.stderr}")
427+
428+
try:
429+
if files:
430+
tar_archive = self._create_tar_archive(files=files)
431+
self._upload_tar_to_container(container_name, tar_archive)
432+
except Exception:
433+
self._kill_container(container_name)
434+
raise
435+
436+
logger.info("Created session container %s", container_name)
437+
return SessionInfo(session_id=container_name)
438+
439+
def delete_session(self, session_id: str) -> bool:
440+
if not session_id.startswith(SESSION_NAME_PREFIX):
441+
return False
442+
result = subprocess.run( # nosec B603
443+
[self.docker_binary, "rm", "-f", session_id],
444+
capture_output=True,
445+
text=True,
446+
)
447+
# `docker rm -f <missing>` exits 0 on modern Docker, so check stderr
448+
# for the "not found" message regardless of exit code.
449+
stderr = (result.stderr or "").lower()
450+
if "no such container" in stderr or "not found" in stderr:
451+
return False
452+
if result.returncode == 0:
453+
return True
454+
raise RuntimeError(f"Failed to delete session {session_id}: {result.stderr}")
455+
397456
def execute_python(
398457
self,
399458
*,

code-interpreter/app/services/executor_kubernetes.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,14 @@
2828
KUBERNETES_EXECUTOR_SERVICE_ACCOUNT,
2929
)
3030
from app.services.executor_base import (
31+
SESSION_APP_LABEL,
32+
SESSION_COMPONENT_LABEL,
33+
SESSION_NAME_PREFIX,
3134
BaseExecutor,
3235
EntryKind,
3336
ExecutionResult,
3437
HealthCheck,
38+
SessionInfo,
3539
StreamChunk,
3640
StreamEvent,
3741
StreamResult,
@@ -45,6 +49,10 @@
4549
POD_DELETE_RETRY_DELAY_SECONDS = 0.2
4650
POD_DELETE_CONFIRM_TIMEOUT_SECONDS = 2.0
4751

52+
# Sessions keep their idle pod alive for at most this many seconds; a follow-up
53+
# PR replaces this with a per-session TTL plus a reaper.
54+
SESSION_MAX_LIFETIME_SECONDS = 24 * 60 * 60
55+
4856

4957
def _parse_exit_code(error: str) -> int | None:
5058
"""Parse the exit code from a Kubernetes exec error channel message."""
@@ -596,6 +604,52 @@ def _cleanup_pod(self, pod_name: str) -> None:
596604
POD_DELETE_RETRIES,
597605
)
598606

607+
def create_session(
608+
self,
609+
*,
610+
files: Sequence[tuple[str, bytes]] | None = None,
611+
cpu_time_limit_sec: int | None = None,
612+
memory_limit_mb: int | None = None,
613+
) -> SessionInfo:
614+
pod_name = f"{SESSION_NAME_PREFIX}{uuid.uuid4().hex}"
615+
616+
manifest = self._create_pod_manifest(
617+
pod_name=pod_name,
618+
command=["sleep", str(SESSION_MAX_LIFETIME_SECONDS)],
619+
labels={"app": SESSION_APP_LABEL, "component": SESSION_COMPONENT_LABEL},
620+
memory_limit_mb=memory_limit_mb,
621+
cpu_time_limit_sec=cpu_time_limit_sec,
622+
)
623+
624+
logger.info("Creating session pod %s in namespace %s", pod_name, self.namespace)
625+
self.v1.create_namespaced_pod(namespace=self.namespace, body=manifest)
626+
627+
try:
628+
self._wait_for_pod_ready(pod_name)
629+
if files:
630+
tar_archive = self._create_tar_archive(files=files)
631+
self._upload_tar_to_pod(pod_name, tar_archive)
632+
except Exception:
633+
self._cleanup_pod(pod_name)
634+
raise
635+
636+
return SessionInfo(session_id=pod_name)
637+
638+
def delete_session(self, session_id: str) -> bool:
639+
if not session_id.startswith(SESSION_NAME_PREFIX):
640+
return False
641+
try:
642+
self.v1.delete_namespaced_pod(
643+
name=session_id,
644+
namespace=self.namespace,
645+
body=client.V1DeleteOptions(grace_period_seconds=0),
646+
)
647+
except ApiException as e:
648+
if e.status == 404:
649+
return False
650+
raise
651+
return True
652+
599653
def execute_python(
600654
self,
601655
*,

0 commit comments

Comments
 (0)