Skip to content

Commit ecd13b0

Browse files
bchalios and claude
authored
Filesystem-only snapshots for auto-pause (#3055)
## Summary

Adds a per-sandbox autoPauseMemory create option (default true). When false, a sandbox that auto-pauses on timeout takes a filesystem-only snapshot (no memory) instead of a full memory snapshot, so it cold-boots from disk on resume.

Builds on the filesystem-only snapshot machinery from #3027 — this PR just lets the timeout auto-pause path opt into it, where previously only an explicit pause(memory=false) could.

## How it works

The FilesystemOnly flag was already plumbed end-to-end through the pause path (RemoveOpts → pauseSandbox → snapshot → Pause RPC); the only missing piece was a per-sandbox policy for the evictor to act on.

1. Create — autoPauseMemory=false → Sandbox.AutoPauseFilesystemOnly=true (runtime state, persisted to Redis).
2. Evict — when the evictor auto-pauses on timeout, it threads opts.FilesystemOnly = sbx.AutoPauseFilesystemOnly.
3. Persist — the policy is stamped onto the snapshot's PausedSandboxConfig, and...
4. Restore — re-read on resume (covers explicit /resume, /connect, and auto-resume — they share buildResumeSandboxData), so the policy survives a pause/resume cycle.
5. Re-sync — also carried through SandboxConfig.auto_pause_filesystem_only in the orchestrator proto, mirroring auto_pause, so an API restart that rebuilds from the orchestrator's sandbox list doesn't lose it.

The orchestrator is unchanged — the existing IsFilesystemOnly() dispatch already routes any resume of an fs-only snapshot to RebootSandbox.

## Constraints (rejected at create, 400)

- autoPauseMemory=false requires autoPause=true (it only governs the timeout auto-pause; otherwise a no-op).
- autoPauseMemory=false is incompatible with autoResume — a filesystem-only snapshot can't be auto-resumed by arbitrary traffic (it'd cold-boot, losing processes/connections) and must be resumed explicitly.

## Design notes

- Two distinct fields. AutoPauseFilesystemOnly (the policy for the next auto-pause) is kept separate from FilesystemOnly (the kind of this snapshot), so an explicit pause(memory=true) on such a sandbox produces a memory snapshot yet still restores the fs-only policy on the next cycle.
- Resume inherits the policy. Unlike auto_pause (overridable via the resume body, now deprecated), the auto-pause kind is always inherited from the snapshot — changing it means creating a new sandbox. Deliberate, to avoid extending a deprecated path.
- Backward-compatible: default true, omitempty on both the JSON config and the proto, so existing sandboxes/snapshots decode to memory auto-pause.

## Testing

- Unit — evictor wiring (kill/pause × snapshot kind); PausedSandboxConfig serialization round-trip + legacy-row default; proto re-sync round-trip via GetSandboxes.
- Integration — create → timeout auto-pause → resume cold-boots (rootfs marker survives, fresh boot id, correct default user), and the policy persists across a second pause/resume cycle; both create-time guards return 400.

## Usage

```
# auto-pause this sandbox as a filesystem-only snapshot on timeout
POST /sandboxes
{ "templateID": "...", "autoPause": true, "autoPauseMemory": false }
```

---------

Signed-off-by: Babis Chalios <babis.chalios@e2b.dev>
Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
1 parent f99b788 commit ecd13b0

22 files changed

Lines changed: 997 additions & 540 deletions

File tree

packages/api/internal/api/api.gen.go

Lines changed: 181 additions & 176 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/api/internal/handlers/sandbox_create.go

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,9 @@ func (a *APIStore) PostSandboxes(c *gin.Context) {
142142
)
143143

144144
autoPause := sharedUtils.DerefOrDefault(body.AutoPause, sandbox.AutoPauseDefault)
145+
// autoPauseMemory defaults to true (full memory snapshot). When false, a
146+
// timeout auto-pause takes a filesystem-only snapshot (cold-boots on resume).
147+
autoPauseFilesystemOnly := !sharedUtils.DerefOrDefault(body.AutoPauseMemory, true)
145148
envVars := sharedUtils.DerefOrDefault(body.EnvVars, nil)
146149
mcp := sharedUtils.DerefOrDefault(body.Mcp, nil)
147150
metadata := sharedUtils.DerefOrDefault(body.Metadata, nil)
@@ -164,6 +167,23 @@ func (a *APIStore) PostSandboxes(c *gin.Context) {
164167
autoResume.Timeout = calculateTimeoutSeconds(timeout, minAutoResumeTimeout, teamInfo)
165168
}
166169

170+
// autoPauseMemory only controls the snapshot kind of a timeout auto-pause, so
171+
// it is meaningless without autoPause; reject it rather than silently storing
172+
// a no-op policy.
173+
if autoPauseFilesystemOnly && !autoPause {
174+
a.sendAPIStoreError(c, http.StatusBadRequest, "autoPauseMemory=false only applies when autoPause is true.")
175+
176+
return
177+
}
178+
179+
// A filesystem-only auto-pause produces a snapshot that traffic cannot
180+
// auto-resume (it must be resumed explicitly), so the two are incompatible.
181+
if autoPauseFilesystemOnly && autoResume != nil && autoResume.Policy == types.SandboxAutoResumeAny {
182+
a.sendAPIStoreError(c, http.StatusBadRequest, "autoPauseMemory=false (filesystem-only auto-pause) cannot be combined with autoResume: a filesystem-only snapshot cannot be auto-resumed by traffic and must be resumed explicitly.")
183+
184+
return
185+
}
186+
167187
var envdAccessToken *string = nil
168188
if body.Secure != nil && *body.Secure == true {
169189
accessToken, tokenErr := a.getEnvdAccessToken(build.EnvdVersion, sandboxID)
@@ -268,18 +288,19 @@ func (a *APIStore) PostSandboxes(c *gin.Context) {
268288
// The data can't be influenced by action on the same sandbox as other operations,
269289
// so it's safe to reuse the data
270290
return apiorch.SandboxMetadata{
271-
Metadata: metadata,
272-
EnvVars: envVars,
273-
Build: *build,
274-
AllowInternetAccess: allowInternetAccess,
275-
Network: network,
276-
Alias: alias,
277-
TemplateID: env.TemplateID,
278-
BaseTemplateID: env.TemplateID,
279-
AutoPause: autoPause,
280-
AutoResume: autoResume,
281-
VolumeMounts: sbxVolumeMounts,
282-
EnvdAccessToken: envdAccessToken,
291+
Metadata: metadata,
292+
EnvVars: envVars,
293+
Build: *build,
294+
AllowInternetAccess: allowInternetAccess,
295+
Network: network,
296+
Alias: alias,
297+
TemplateID: env.TemplateID,
298+
BaseTemplateID: env.TemplateID,
299+
AutoPause: autoPause,
300+
AutoPauseFilesystemOnly: autoPauseFilesystemOnly,
301+
AutoResume: autoResume,
302+
VolumeMounts: sbxVolumeMounts,
303+
EnvdAccessToken: envdAccessToken,
283304
}, nil
284305
}
285306

packages/api/internal/handlers/sandbox_resume.go

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -227,25 +227,32 @@ func (a *APIStore) buildResumeSandboxData(sandboxID string, autoPauseOverride *b
227227
var network *types.SandboxNetworkConfig
228228
var autoResume *types.SandboxAutoResumeConfig
229229
var volumes []*types.SandboxVolumeMountConfig
230+
// Unlike auto_pause (which resume can override via the request body), the
231+
// auto-pause snapshot kind is intentionally always inherited from the
232+
// snapshot: there is no resume-time override for it. Changing the kind
233+
// requires creating a new sandbox with the desired autoPauseMemory.
234+
var autoPauseFilesystemOnly bool
230235
if snap.Config != nil {
231236
network = snap.Config.Network
232237
autoResume = snap.Config.AutoResume
233238
volumes = snap.Config.VolumeMounts
239+
autoPauseFilesystemOnly = snap.Config.AutoPauseFilesystemOnly
234240
}
235241

236242
return orchestrator.SandboxMetadata{
237-
Metadata: snap.Metadata,
238-
Build: build,
239-
AllowInternetAccess: snap.AllowInternetAccess,
240-
Network: network,
241-
Alias: alias,
242-
TemplateID: snap.EnvID,
243-
BaseTemplateID: snap.BaseEnvID,
244-
AutoPause: autoPause,
245-
AutoResume: autoResume,
246-
VolumeMounts: convertDatabaseMountsToOrchestratorMounts(volumes),
247-
EnvdAccessToken: envdAccessToken,
248-
NodeID: &nodeID,
243+
Metadata: snap.Metadata,
244+
Build: build,
245+
AllowInternetAccess: snap.AllowInternetAccess,
246+
Network: network,
247+
Alias: alias,
248+
TemplateID: snap.EnvID,
249+
BaseTemplateID: snap.BaseEnvID,
250+
AutoPause: autoPause,
251+
AutoPauseFilesystemOnly: autoPauseFilesystemOnly,
252+
AutoResume: autoResume,
253+
VolumeMounts: convertDatabaseMountsToOrchestratorMounts(volumes),
254+
EnvdAccessToken: envdAccessToken,
255+
NodeID: &nodeID,
249256
}, nil
250257
}
251258
}

packages/api/internal/orchestrator/create_instance.go

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,13 @@ type SandboxMetadata struct {
4747
TemplateID string
4848
BaseTemplateID string
4949
AutoPause bool
50-
AutoResume *types.SandboxAutoResumeConfig
51-
VolumeMounts []*orchestrator.SandboxVolumeMount
52-
EnvdAccessToken *string
53-
NodeID *string
50+
// AutoPauseFilesystemOnly makes a timeout auto-pause take a filesystem-only
51+
// snapshot instead of a full memory one. Only meaningful when AutoPause.
52+
AutoPauseFilesystemOnly bool
53+
AutoResume *types.SandboxAutoResumeConfig
54+
VolumeMounts []*orchestrator.SandboxVolumeMount
55+
EnvdAccessToken *string
56+
NodeID *string
5457
}
5558

5659
// buildEgressConfig constructs the orchestrator egress configuration from
@@ -268,30 +271,31 @@ func (o *Orchestrator) CreateSandbox(
268271

269272
sbxRequest := &orchestrator.SandboxCreateRequest{
270273
Sandbox: &orchestrator.SandboxConfig{
271-
BaseTemplateId: sbxData.BaseTemplateID,
272-
TemplateId: sbxData.TemplateID,
273-
Alias: &sbxData.Alias,
274-
TeamId: team.ID.String(),
275-
BuildId: sbxData.Build.ID.String(),
276-
SandboxId: sandboxID,
277-
ExecutionId: executionID,
278-
KernelVersion: sbxData.Build.KernelVersion,
279-
FirecrackerVersion: sbxData.Build.FirecrackerVersion,
280-
EnvdVersion: *sbxData.Build.EnvdVersion,
281-
Metadata: sbxData.Metadata,
282-
EnvVars: sbxData.EnvVars,
283-
EnvdAccessToken: sbxData.EnvdAccessToken,
284-
MaxSandboxLength: team.Limits.MaxLengthHours,
285-
HugePages: hasHugePages,
286-
RamMb: sbxData.Build.RamMb,
287-
Vcpu: sbxData.Build.Vcpu,
288-
Snapshot: isResume,
289-
AutoPause: sbxData.AutoPause,
290-
AutoResume: orchAutoResume,
291-
AllowInternetAccess: sbxData.AllowInternetAccess,
292-
Network: sbxNetwork,
293-
TotalDiskSizeMb: ut.FromPtr(sbxData.Build.TotalDiskSizeMb),
294-
VolumeMounts: sbxData.VolumeMounts,
274+
BaseTemplateId: sbxData.BaseTemplateID,
275+
TemplateId: sbxData.TemplateID,
276+
Alias: &sbxData.Alias,
277+
TeamId: team.ID.String(),
278+
BuildId: sbxData.Build.ID.String(),
279+
SandboxId: sandboxID,
280+
ExecutionId: executionID,
281+
KernelVersion: sbxData.Build.KernelVersion,
282+
FirecrackerVersion: sbxData.Build.FirecrackerVersion,
283+
EnvdVersion: *sbxData.Build.EnvdVersion,
284+
Metadata: sbxData.Metadata,
285+
EnvVars: sbxData.EnvVars,
286+
EnvdAccessToken: sbxData.EnvdAccessToken,
287+
MaxSandboxLength: team.Limits.MaxLengthHours,
288+
HugePages: hasHugePages,
289+
RamMb: sbxData.Build.RamMb,
290+
Vcpu: sbxData.Build.Vcpu,
291+
Snapshot: isResume,
292+
AutoPause: sbxData.AutoPause,
293+
AutoPauseFilesystemOnly: sbxData.AutoPauseFilesystemOnly,
294+
AutoResume: orchAutoResume,
295+
AllowInternetAccess: sbxData.AllowInternetAccess,
296+
Network: sbxNetwork,
297+
TotalDiskSizeMb: ut.FromPtr(sbxData.Build.TotalDiskSizeMb),
298+
VolumeMounts: sbxData.VolumeMounts,
295299
},
296300
StartTime: timestamppb.New(startTime),
297301
EndTime: timestamppb.New(endTime),
@@ -360,6 +364,7 @@ func (o *Orchestrator) CreateSandbox(
360364
node.ID,
361365
node.ClusterID,
362366
sbxData.AutoPause,
367+
sbxData.AutoPauseFilesystemOnly,
363368
sbxData.AutoResume,
364369
sbxData.EnvdAccessToken,
365370
sbxData.AllowInternetAccess,

packages/api/internal/orchestrator/evictor/evict.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,13 @@ func (e *Evictor) evictSandbox(ctx context.Context, sbx sandbox.Sandbox) {
156156
}
157157

158158
opts := sandbox.RemoveOpts{Action: action, Eviction: true}
159-
if action == sandbox.StateActionKill {
159+
switch action {
160+
case sandbox.StateActionKill:
160161
opts.Reason = sandbox.KillReasonTimeout
162+
case sandbox.StateActionPause:
163+
// Honor the sandbox's auto-pause snapshot kind: filesystem-only drops
164+
// memory (cold-boots on resume); otherwise a full memory snapshot.
165+
opts.FilesystemOnly = sbx.AutoPauseFilesystemOnly
161166
}
162167

163168
if err := e.removeSandbox(context.WithoutCancel(ctx), sbx.TeamID, sbx.SandboxID, opts); err != nil {

packages/api/internal/orchestrator/evictor/evict_test.go

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import (
1515
func TestEvictSandbox_ReasonByAction(t *testing.T) {
1616
t.Parallel()
1717

18-
run := func(autoPause bool) sandbox.RemoveOpts {
18+
run := func(autoPause, autoPauseFilesystemOnly bool) sandbox.RemoveOpts {
1919
var got sandbox.RemoveOpts
2020
called := false
2121
e := &Evictor{
@@ -33,10 +33,11 @@ func TestEvictSandbox_ReasonByAction(t *testing.T) {
3333
}
3434

3535
e.evictSandbox(context.Background(), sandbox.Sandbox{
36-
SandboxID: "sbx",
37-
TeamID: uuid.New(),
38-
AutoPause: autoPause,
39-
EndTime: time.Now(),
36+
SandboxID: "sbx",
37+
TeamID: uuid.New(),
38+
AutoPause: autoPause,
39+
AutoPauseFilesystemOnly: autoPauseFilesystemOnly,
40+
EndTime: time.Now(),
4041
})
4142

4243
require.True(t, called)
@@ -47,19 +48,48 @@ func TestEvictSandbox_ReasonByAction(t *testing.T) {
4748
t.Run("kill carries timeout reason", func(t *testing.T) {
4849
t.Parallel()
4950

50-
got := run(false)
51+
got := run(false, false)
5152

5253
assert.Equal(t, sandbox.StateActionKill, got.Action)
5354
assert.True(t, got.Eviction)
5455
assert.Equal(t, sandbox.KillReasonTimeout, got.Reason)
5556
})
5657

58+
t.Run("kill ignores the auto-pause snapshot kind", func(t *testing.T) {
59+
t.Parallel()
60+
61+
// AutoPauseFilesystemOnly is meaningless without AutoPause; a kill must
62+
// never carry it.
63+
got := run(false, true)
64+
65+
assert.Equal(t, sandbox.StateActionKill, got.Action)
66+
assert.False(t, got.FilesystemOnly)
67+
})
68+
5769
t.Run("auto-pause carries no kill reason", func(t *testing.T) {
5870
t.Parallel()
5971

60-
got := run(true)
72+
got := run(true, false)
6173

6274
assert.Equal(t, sandbox.StateActionPause, got.Action)
6375
assert.Empty(t, got.Reason)
6476
})
77+
78+
t.Run("memory auto-pause is not filesystem-only", func(t *testing.T) {
79+
t.Parallel()
80+
81+
got := run(true, false)
82+
83+
assert.Equal(t, sandbox.StateActionPause, got.Action)
84+
assert.False(t, got.FilesystemOnly)
85+
})
86+
87+
t.Run("filesystem-only auto-pause requests a filesystem-only snapshot", func(t *testing.T) {
88+
t.Parallel()
89+
90+
got := run(true, true)
91+
92+
assert.Equal(t, sandbox.StateActionPause, got.Action)
93+
assert.True(t, got.FilesystemOnly)
94+
})
6595
}

packages/api/internal/orchestrator/nodemanager/sandboxes.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ func (n *Node) GetSandboxes(ctx context.Context) ([]sandbox.Sandbox, error) {
131131
n.ID,
132132
n.ClusterID,
133133
config.GetAutoPause(),
134+
config.GetAutoPauseFilesystemOnly(),
134135
autoResume,
135136
config.EnvdAccessToken, //nolint:protogetter // we need the nil check too
136137
config.AllowInternetAccess, //nolint:protogetter // we need the nil check too
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
package nodemanager
2+
3+
import (
4+
"context"
5+
"testing"
6+
"time"
7+
8+
"github.com/google/uuid"
9+
"github.com/stretchr/testify/assert"
10+
"github.com/stretchr/testify/require"
11+
"google.golang.org/grpc"
12+
"google.golang.org/protobuf/types/known/emptypb"
13+
"google.golang.org/protobuf/types/known/timestamppb"
14+
15+
"github.com/e2b-dev/infra/packages/api/internal/api"
16+
"github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator"
17+
)
18+
19+
// mockSandboxListClient implements orchestrator.SandboxServiceClient and returns
20+
// a canned List response, so GetSandboxes' proto->Sandbox reconstruction can be
21+
// tested without a live orchestrator.
22+
type mockSandboxListClient struct {
23+
orchestrator.SandboxServiceClient
24+
25+
resp *orchestrator.SandboxListResponse
26+
}
27+
28+
func (m *mockSandboxListClient) List(_ context.Context, _ *emptypb.Empty, _ ...grpc.CallOption) (*orchestrator.SandboxListResponse, error) {
29+
return m.resp, nil
30+
}
31+
32+
// TestGetSandboxes_RestoresAutoPauseFilesystemOnly verifies that the auto-pause
33+
// snapshot-kind policy round-trips through the orchestrator's SandboxConfig when
34+
// the API re-syncs its sandbox list (e.g. after a restart). The proto field
35+
// exists for exactly this path, so without it the policy would silently revert
36+
// to a memory auto-pause.
37+
func TestGetSandboxes_RestoresAutoPauseFilesystemOnly(t *testing.T) {
38+
t.Parallel()
39+
40+
now := time.Now()
41+
runningSandbox := func(id string, autoPauseFilesystemOnly bool) *orchestrator.RunningSandbox {
42+
return &orchestrator.RunningSandbox{
43+
StartTime: timestamppb.New(now),
44+
EndTime: timestamppb.New(now.Add(time.Hour)),
45+
Config: &orchestrator.SandboxConfig{
46+
SandboxId: id,
47+
TemplateId: "tmpl",
48+
BaseTemplateId: "tmpl",
49+
TeamId: uuid.NewString(),
50+
BuildId: uuid.NewString(),
51+
ExecutionId: uuid.NewString(),
52+
AutoPause: true,
53+
AutoPauseFilesystemOnly: autoPauseFilesystemOnly,
54+
},
55+
}
56+
}
57+
58+
node := NewTestNode("test-node", api.NodeStatusReady, 0, 4)
59+
node.SetSandboxClient(&mockSandboxListClient{
60+
resp: &orchestrator.SandboxListResponse{
61+
Sandboxes: []*orchestrator.RunningSandbox{
62+
runningSandbox("fs-only", true),
63+
runningSandbox("memory", false),
64+
},
65+
},
66+
})
67+
68+
sandboxes, err := node.GetSandboxes(t.Context())
69+
require.NoError(t, err)
70+
require.Len(t, sandboxes, 2)
71+
72+
got := make(map[string]bool, len(sandboxes))
73+
for _, sbx := range sandboxes {
74+
got[sbx.SandboxID] = sbx.AutoPauseFilesystemOnly
75+
}
76+
77+
assert.True(t, got["fs-only"], "filesystem-only auto-pause policy must survive an orchestrator re-sync")
78+
assert.False(t, got["memory"], "memory auto-pause policy must survive an orchestrator re-sync")
79+
}

packages/api/internal/orchestrator/pause_instance.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,12 @@ func buildUpsertSnapshotParams(sbx sandbox.Sandbox, node *nodemanager.Node, file
154154
AllowInternetAccess: sbx.AllowInternetAccess,
155155
AutoPause: sbx.AutoPause,
156156
Config: &types.PausedSandboxConfig{
157-
Version: types.PausedSandboxConfigVersion,
158-
Network: sbx.Network,
159-
AutoResume: sbx.AutoResume,
160-
VolumeMounts: sbx.VolumeMounts,
161-
FilesystemOnly: filesystemOnly,
157+
Version: types.PausedSandboxConfigVersion,
158+
Network: sbx.Network,
159+
AutoResume: sbx.AutoResume,
160+
VolumeMounts: sbx.VolumeMounts,
161+
FilesystemOnly: filesystemOnly,
162+
AutoPauseFilesystemOnly: sbx.AutoPauseFilesystemOnly,
162163
},
163164
OriginNodeID: node.ID,
164165
Status: types.BuildStatusSnapshotting,

0 commit comments

Comments
 (0)