From 8bc7d4b0e9c846dc7b0fe615ac5301fea8734f2e Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Wed, 13 May 2026 13:32:15 +0000 Subject: [PATCH 1/4] feat(api): cleaner evicts sandboxes older than StaleCutoff The Redis cleaner now removes sandboxes whose EndTime is older than StaleCutoff, in addition to pruning orphaned index entries. Recently expired sandboxes are left to the evictor to avoid races. --- .../internal/sandbox/storage/redis/cleaner.go | 43 ++++++++++----- .../sandbox/storage/redis/cleaner_test.go | 52 +++++++++++++++++++ 2 files changed, 83 insertions(+), 12 deletions(-) diff --git a/packages/api/internal/sandbox/storage/redis/cleaner.go b/packages/api/internal/sandbox/storage/redis/cleaner.go index dac17a4628..7adef698a9 100644 --- a/packages/api/internal/sandbox/storage/redis/cleaner.go +++ b/packages/api/internal/sandbox/storage/redis/cleaner.go @@ -8,6 +8,7 @@ import ( "go.uber.org/zap" + "github.com/e2b-dev/infra/packages/api/internal/sandbox" "github.com/e2b-dev/infra/packages/shared/pkg/logger" ) @@ -15,8 +16,9 @@ const cleanerInterval = time.Minute // TODO: Remove once fully migrated to Redis // -// Cleaner prunes stale entries from the two Redis sandbox indexes -// (`globalExpirationSet` and `globalTeamsSet`). +// Cleaner: +// - prunes stale entries from the two Redis sandbox indexes (`globalExpirationSet` and `globalTeamsSet`). +// - removes expired sandboxes // // Multi-pod safety: every operation the Cleaner triggers (ZREM/SREM of // possibly-absent members) is idempotent. Concurrent Cleaners across pods @@ -53,22 +55,19 @@ func (c *Cleaner) Start(ctx context.Context) { // RunOnce performs one cleanup pass. Each sub-step is independent; a failure // in one is logged but does not abort the other. -// -// Per-cycle work is bounded: -// - ExpiredItems caps internally at expiredItemsBatchSize (256) members. -// - TeamsWithSandboxCount is one ZRANGE + one pipelined SCARD batch. func (c *Cleaner) RunOnce(ctx context.Context) error { var errs []error - // 1. globalExpirationSet: ExpiredItems internally ZREMs members whose - // sandbox JSON is gone (items.go:131-135). Discard the returned - // sandbox list — actually evicting still-running sandboxes is the - // evictor's job, which in memory mode reads the memory backend. - if _, err := c.storage.ExpiredItems(ctx); err != nil { + // 1. globalExpirationSet: ExpiredItems internally ZREMs members whose sandbox JSON is gone (items.go:131-135). + // 2. Discard the returned sandbox list (only if older than StaleCutoff) since the cleaner is not responsible for removing sandboxes + expired, err := c.storage.ExpiredItems(ctx) + if err != nil { errs = append(errs, fmt.Errorf("expiration index sweep: %w", err)) + } else { + c.evictExpired(ctx, expired) } - // 2. globalTeamsSet: TeamsWithSandboxCount internally ZREMs teams whose + // 3. globalTeamsSet: TeamsWithSandboxCount internally ZREMs teams whose // per-team SCARD is 0 AND whose score is older than StaleCutoff // (operations.go:268-288). Discard the returned counts. if _, err := c.storage.TeamsWithSandboxCount(ctx); err != nil { @@ -77,3 +76,23 @@ func (c *Cleaner) RunOnce(ctx context.Context) error { return errors.Join(errs...) } + +func (c *Cleaner) evictExpired(ctx context.Context, expired []sandbox.Sandbox) { + if len(expired) == 0 { + return + } + + for _, sbx := range expired { + if time.Since(sbx.EndTime) < sandbox.StaleCutoff { + continue + } + + if rmErr := c.storage.Remove(context.WithoutCancel(ctx), sbx.TeamID, sbx.SandboxID); rmErr != nil { + logger.L().Warn(ctx, "Cleaner failed to purge stale Redis entry after ErrNotFound", + zap.Error(rmErr), + logger.WithSandboxID(sbx.SandboxID), + logger.WithTeamID(sbx.TeamID.String()), + ) + } + } +} diff --git a/packages/api/internal/sandbox/storage/redis/cleaner_test.go b/packages/api/internal/sandbox/storage/redis/cleaner_test.go index ced3faac4a..40a25a7780 100644 --- a/packages/api/internal/sandbox/storage/redis/cleaner_test.go +++ b/packages/api/internal/sandbox/storage/redis/cleaner_test.go @@ -219,6 +219,58 @@ func TestCleaner_PreservesFutureScoredExpirationEntry(t *testing.T) { require.NoError(t, err, "future-scored entry must not be pruned") } +// TestCleaner_EvictsStaleExpiredSandbox covers the new evictExpired path: +// a sandbox whose EndTime is older than StaleCutoff must be Remove()'d by +// the cleaner so its JSON key, per-team index entry, and globalExpirationSet +// member all disappear. +func TestCleaner_EvictsStaleExpiredSandbox(t *testing.T) { + t.Parallel() + + storage, client := setupTestStorage(t) + ctx := t.Context() + + sbx := createTestSandbox("stale-expired-" + uuid.NewString()) + sbx.EndTime = time.Now().Add(-sandbox.StaleCutoff - time.Minute) + require.NoError(t, storage.Add(ctx, sbx)) + + cleaner := NewCleaner(storage) + require.NoError(t, cleaner.RunOnce(ctx)) + + _, err := storage.Get(ctx, sbx.TeamID, sbx.SandboxID) + require.ErrorIs(t, err, sandbox.ErrNotFound, "stale expired sandbox JSON should be removed") + + _, err = client.ZScore(ctx, globalExpirationSet, + expirationMember(sbx.TeamID.String(), sbx.SandboxID)).Result() + require.ErrorIs(t, err, redis.Nil, "stale expired sandbox should be removed from globalExpirationSet") + + isMember, err := client.SIsMember(ctx, + GetSandboxStorageTeamIndexKey(sbx.TeamID.String()), sbx.SandboxID).Result() + require.NoError(t, err) + require.False(t, isMember, "stale expired sandbox should be removed from per-team index") +} + +// TestCleaner_PreservesRecentlyExpiredSandbox guards the StaleCutoff window +// inside evictExpired: a sandbox that has just expired (EndTime in the past +// but newer than StaleCutoff) is still the evictor's responsibility — the +// cleaner must leave it alone so we don't race the evictor. +func TestCleaner_PreservesRecentlyExpiredSandbox(t *testing.T) { + t.Parallel() + + storage, _ := setupTestStorage(t) + ctx := t.Context() + + sbx := createTestSandbox("fresh-expired-" + uuid.NewString()) + sbx.EndTime = time.Now().Add(-time.Second) + require.NoError(t, storage.Add(ctx, sbx)) + + cleaner := NewCleaner(storage) + require.NoError(t, cleaner.RunOnce(ctx)) + + got, err := storage.Get(ctx, sbx.TeamID, sbx.SandboxID) + require.NoError(t, err, "recently expired sandbox must survive — eviction is the evictor's job") + require.Equal(t, sbx.SandboxID, got.SandboxID) +} + // Compile-time guard so future refactors of sandbox.StaleCutoff get noticed // here: the cleaner's correctness depends on it being > 0. var _ = func() bool { From f4e80239d6ea0792e7a81475a3931fd32176de63 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Wed, 13 May 2026 14:25:11 +0000 Subject: [PATCH 2/4] chore(api): clarify cleaner comments and warning message Address PR feedback: the inline comment in RunOnce contradicted what evictExpired actually does, and the warning message referenced a non-existent ErrNotFound precondition. --- packages/api/internal/sandbox/storage/redis/cleaner.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/api/internal/sandbox/storage/redis/cleaner.go b/packages/api/internal/sandbox/storage/redis/cleaner.go index 7adef698a9..03e7bbbb96 100644 --- a/packages/api/internal/sandbox/storage/redis/cleaner.go +++ b/packages/api/internal/sandbox/storage/redis/cleaner.go @@ -59,7 +59,8 @@ func (c *Cleaner) RunOnce(ctx context.Context) error { var errs []error // 1. globalExpirationSet: ExpiredItems internally ZREMs members whose sandbox JSON is gone (items.go:131-135). - // 2. Discard the returned sandbox list (only if older than StaleCutoff) since the cleaner is not responsible for removing sandboxes + // 2. evictExpired removes sandboxes whose EndTime is older than StaleCutoff; + // recently expired ones are left to the evictor to avoid racing it. expired, err := c.storage.ExpiredItems(ctx) if err != nil { errs = append(errs, fmt.Errorf("expiration index sweep: %w", err)) @@ -88,7 +89,7 @@ func (c *Cleaner) evictExpired(ctx context.Context, expired []sandbox.Sandbox) { } if rmErr := c.storage.Remove(context.WithoutCancel(ctx), sbx.TeamID, sbx.SandboxID); rmErr != nil { - logger.L().Warn(ctx, "Cleaner failed to purge stale Redis entry after ErrNotFound", + logger.L().Warn(ctx, "Cleaner failed to remove stale expired sandbox", zap.Error(rmErr), logger.WithSandboxID(sbx.SandboxID), logger.WithTeamID(sbx.TeamID.String()), From e33eea1afaad883832cf977231fb2f8a93b06170 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Wed, 13 May 2026 15:15:16 +0000 Subject: [PATCH 3/4] chore(api): address cleaner review feedback - drop items.go line reference from RunOnce comment - promote stale-remove failure log from Warn to Error --- packages/api/internal/sandbox/storage/redis/cleaner.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/api/internal/sandbox/storage/redis/cleaner.go b/packages/api/internal/sandbox/storage/redis/cleaner.go index 03e7bbbb96..a06d29cde1 100644 --- a/packages/api/internal/sandbox/storage/redis/cleaner.go +++ b/packages/api/internal/sandbox/storage/redis/cleaner.go @@ -58,7 +58,7 @@ func (c *Cleaner) Start(ctx context.Context) { func (c *Cleaner) RunOnce(ctx context.Context) error { var errs []error - // 1. globalExpirationSet: ExpiredItems internally ZREMs members whose sandbox JSON is gone (items.go:131-135). + // 1. globalExpirationSet: ExpiredItems internally ZREMs members whose sandbox JSON is gone. // 2. evictExpired removes sandboxes whose EndTime is older than StaleCutoff; // recently expired ones are left to the evictor to avoid racing it. expired, err := c.storage.ExpiredItems(ctx) @@ -89,7 +89,7 @@ func (c *Cleaner) evictExpired(ctx context.Context, expired []sandbox.Sandbox) { } if rmErr := c.storage.Remove(context.WithoutCancel(ctx), sbx.TeamID, sbx.SandboxID); rmErr != nil { - logger.L().Warn(ctx, "Cleaner failed to remove stale expired sandbox", + logger.L().Error(ctx, "Cleaner failed to remove stale expired sandbox", zap.Error(rmErr), logger.WithSandboxID(sbx.SandboxID), logger.WithTeamID(sbx.TeamID.String()), From cadc7ddea5fbc4f8e87c6d04613f3b621ef3db71 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Wed, 13 May 2026 15:42:12 +0000 Subject: [PATCH 4/4] chore: add log --- packages/api/internal/sandbox/storage/redis/cleaner.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/packages/api/internal/sandbox/storage/redis/cleaner.go b/packages/api/internal/sandbox/storage/redis/cleaner.go index a06d29cde1..aec4e866eb 100644 --- a/packages/api/internal/sandbox/storage/redis/cleaner.go +++ b/packages/api/internal/sandbox/storage/redis/cleaner.go @@ -83,6 +83,8 @@ func (c *Cleaner) evictExpired(ctx context.Context, expired []sandbox.Sandbox) { return } + logger.L().Info(ctx, "Cleaner found expired sandboxes", zap.Int("count", len(expired))) + for _, sbx := range expired { if time.Since(sbx.EndTime) < sandbox.StaleCutoff { continue