Skip to content

Commit 8bc7d4b

Browse files
committed
feat(api): cleaner evicts sandboxes older than StaleCutoff
The Redis cleaner now removes sandboxes whose EndTime is older than StaleCutoff, in addition to pruning orphaned index entries. Recently expired sandboxes are left to the evictor to avoid races.
1 parent f2c360d commit 8bc7d4b

2 files changed

Lines changed: 83 additions & 12 deletions

File tree

packages/api/internal/sandbox/storage/redis/cleaner.go

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,17 @@ import (
88

99
"go.uber.org/zap"
1010

11+
"github.com/e2b-dev/infra/packages/api/internal/sandbox"
1112
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
1213
)
1314

1415
const cleanerInterval = time.Minute
1516

1617
// TODO: Remove once fully migrated to Redis
1718
//
18-
// Cleaner prunes stale entries from the two Redis sandbox indexes
19-
// (`globalExpirationSet` and `globalTeamsSet`).
19+
// Cleaner:
20+
// - prunes stale entries from the two Redis sandbox indexes (`globalExpirationSet` and `globalTeamsSet`).
21+
// - removes expired sandboxes
2022
//
2123
// Multi-pod safety: every operation the Cleaner triggers (ZREM/SREM of
2224
// possibly-absent members) is idempotent. Concurrent Cleaners across pods
@@ -53,22 +55,19 @@ func (c *Cleaner) Start(ctx context.Context) {
5355

5456
// RunOnce performs one cleanup pass. Each sub-step is independent; a failure
5557
// in one is logged but does not abort the other.
56-
//
57-
// Per-cycle work is bounded:
58-
// - ExpiredItems caps internally at expiredItemsBatchSize (256) members.
59-
// - TeamsWithSandboxCount is one ZRANGE + one pipelined SCARD batch.
6058
func (c *Cleaner) RunOnce(ctx context.Context) error {
6159
var errs []error
6260

63-
// 1. globalExpirationSet: ExpiredItems internally ZREMs members whose
64-
// sandbox JSON is gone (items.go:131-135). Discard the returned
65-
// sandbox list — actually evicting still-running sandboxes is the
66-
// evictor's job, which in memory mode reads the memory backend.
67-
if _, err := c.storage.ExpiredItems(ctx); err != nil {
61+
// 1. globalExpirationSet: ExpiredItems internally ZREMs members whose sandbox JSON is gone (items.go:131-135).
62+
// 2. Discard the returned sandbox list (only if older than StaleCutoff) since the cleaner is not responsible for removing sandboxes
63+
expired, err := c.storage.ExpiredItems(ctx)
64+
if err != nil {
6865
errs = append(errs, fmt.Errorf("expiration index sweep: %w", err))
66+
} else {
67+
c.evictExpired(ctx, expired)
6968
}
7069

71-
// 2. globalTeamsSet: TeamsWithSandboxCount internally ZREMs teams whose
70+
// 3. globalTeamsSet: TeamsWithSandboxCount internally ZREMs teams whose
7271
// per-team SCARD is 0 AND whose score is older than StaleCutoff
7372
// (operations.go:268-288). Discard the returned counts.
7473
if _, err := c.storage.TeamsWithSandboxCount(ctx); err != nil {
@@ -77,3 +76,23 @@ func (c *Cleaner) RunOnce(ctx context.Context) error {
7776

7877
return errors.Join(errs...)
7978
}
79+
80+
func (c *Cleaner) evictExpired(ctx context.Context, expired []sandbox.Sandbox) {
81+
if len(expired) == 0 {
82+
return
83+
}
84+
85+
for _, sbx := range expired {
86+
if time.Since(sbx.EndTime) < sandbox.StaleCutoff {
87+
continue
88+
}
89+
90+
if rmErr := c.storage.Remove(context.WithoutCancel(ctx), sbx.TeamID, sbx.SandboxID); rmErr != nil {
91+
logger.L().Warn(ctx, "Cleaner failed to purge stale Redis entry after ErrNotFound",
92+
zap.Error(rmErr),
93+
logger.WithSandboxID(sbx.SandboxID),
94+
logger.WithTeamID(sbx.TeamID.String()),
95+
)
96+
}
97+
}
98+
}

packages/api/internal/sandbox/storage/redis/cleaner_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,58 @@ func TestCleaner_PreservesFutureScoredExpirationEntry(t *testing.T) {
219219
require.NoError(t, err, "future-scored entry must not be pruned")
220220
}
221221

222+
// TestCleaner_EvictsStaleExpiredSandbox covers the new evictExpired path:
223+
// a sandbox whose EndTime is older than StaleCutoff must be Remove()'d by
224+
// the cleaner so its JSON key, per-team index entry, and globalExpirationSet
225+
// member all disappear.
226+
func TestCleaner_EvictsStaleExpiredSandbox(t *testing.T) {
227+
t.Parallel()
228+
229+
storage, client := setupTestStorage(t)
230+
ctx := t.Context()
231+
232+
sbx := createTestSandbox("stale-expired-" + uuid.NewString())
233+
sbx.EndTime = time.Now().Add(-sandbox.StaleCutoff - time.Minute)
234+
require.NoError(t, storage.Add(ctx, sbx))
235+
236+
cleaner := NewCleaner(storage)
237+
require.NoError(t, cleaner.RunOnce(ctx))
238+
239+
_, err := storage.Get(ctx, sbx.TeamID, sbx.SandboxID)
240+
require.ErrorIs(t, err, sandbox.ErrNotFound, "stale expired sandbox JSON should be removed")
241+
242+
_, err = client.ZScore(ctx, globalExpirationSet,
243+
expirationMember(sbx.TeamID.String(), sbx.SandboxID)).Result()
244+
require.ErrorIs(t, err, redis.Nil, "stale expired sandbox should be removed from globalExpirationSet")
245+
246+
isMember, err := client.SIsMember(ctx,
247+
GetSandboxStorageTeamIndexKey(sbx.TeamID.String()), sbx.SandboxID).Result()
248+
require.NoError(t, err)
249+
require.False(t, isMember, "stale expired sandbox should be removed from per-team index")
250+
}
251+
252+
// TestCleaner_PreservesRecentlyExpiredSandbox guards the StaleCutoff window
253+
// inside evictExpired: a sandbox that has just expired (EndTime in the past
254+
// but newer than StaleCutoff) is still the evictor's responsibility — the
255+
// cleaner must leave it alone so we don't race the evictor.
256+
func TestCleaner_PreservesRecentlyExpiredSandbox(t *testing.T) {
257+
t.Parallel()
258+
259+
storage, _ := setupTestStorage(t)
260+
ctx := t.Context()
261+
262+
sbx := createTestSandbox("fresh-expired-" + uuid.NewString())
263+
sbx.EndTime = time.Now().Add(-time.Second)
264+
require.NoError(t, storage.Add(ctx, sbx))
265+
266+
cleaner := NewCleaner(storage)
267+
require.NoError(t, cleaner.RunOnce(ctx))
268+
269+
got, err := storage.Get(ctx, sbx.TeamID, sbx.SandboxID)
270+
require.NoError(t, err, "recently expired sandbox must survive — eviction is the evictor's job")
271+
require.Equal(t, sbx.SandboxID, got.SandboxID)
272+
}
273+
222274
// Compile-time guard so future refactors of sandbox.StaleCutoff get noticed
223275
// here: the cleaner's correctness depends on it being > 0.
224276
var _ = func() bool {

0 commit comments

Comments
 (0)