Skip to content

Commit 1e1fbe8

Browse files
jakubnoValentaTomas
authored andcommitted
feat(api): cleaner evicts sandboxes older than StaleCutoff (#2640)
The Redis cleaner now removes sandboxes whose EndTime is older than StaleCutoff, in addition to pruning orphaned index entries. Recently expired sandboxes are left to the evictor to avoid races.
1 parent 142c831 commit 1e1fbe8

2 files changed

Lines changed: 86 additions & 12 deletions

File tree

packages/api/internal/sandbox/storage/redis/cleaner.go

Lines changed: 34 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,17 @@ import (
88

99
"go.uber.org/zap"
1010

11+
"github.com/e2b-dev/infra/packages/api/internal/sandbox"
1112
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
1213
)
1314

1415
const cleanerInterval = time.Minute
1516

1617
// TODO: Remove once fully migrated to Redis
1718
//
18-
// Cleaner prunes stale entries from the two Redis sandbox indexes
19-
// (`globalExpirationSet` and `globalTeamsSet`).
19+
// Cleaner:
20+
// - prunes stale entries from the two Redis sandbox indexes (`globalExpirationSet` and `globalTeamsSet`).
21+
// - removes expired sandboxes
2022
//
2123
// Multi-pod safety: every operation the Cleaner triggers (ZREM/SREM of
2224
// possibly-absent members) is idempotent. Concurrent Cleaners across pods
@@ -53,22 +55,20 @@ func (c *Cleaner) Start(ctx context.Context) {
5355

5456
// RunOnce performs one cleanup pass. Each sub-step is independent; a failure
5557
// in one is logged but does not abort the other.
56-
//
57-
// Per-cycle work is bounded:
58-
// - ExpiredItems caps internally at expiredItemsBatchSize (256) members.
59-
// - TeamsWithSandboxCount is one ZRANGE + one pipelined SCARD batch.
6058
func (c *Cleaner) RunOnce(ctx context.Context) error {
6159
var errs []error
6260

63-
// 1. globalExpirationSet: ExpiredItems internally ZREMs members whose
64-
// sandbox JSON is gone (items.go:131-135). Discard the returned
65-
// sandbox list — actually evicting still-running sandboxes is the
66-
// evictor's job, which in memory mode reads the memory backend.
67-
if _, err := c.storage.ExpiredItems(ctx); err != nil {
61+
// 1. globalExpirationSet: ExpiredItems internally ZREMs members whose sandbox JSON is gone.
62+
// 2. evictExpired removes sandboxes whose EndTime is older than StaleCutoff;
63+
// recently expired ones are left to the evictor to avoid racing it.
64+
expired, err := c.storage.ExpiredItems(ctx)
65+
if err != nil {
6866
errs = append(errs, fmt.Errorf("expiration index sweep: %w", err))
67+
} else {
68+
c.evictExpired(ctx, expired)
6969
}
7070

71-
// 2. globalTeamsSet: TeamsWithSandboxCount internally ZREMs teams whose
71+
// 3. globalTeamsSet: TeamsWithSandboxCount internally ZREMs teams whose
7272
// per-team SCARD is 0 AND whose score is older than StaleCutoff
7373
// (operations.go:268-288). Discard the returned counts.
7474
if _, err := c.storage.TeamsWithSandboxCount(ctx); err != nil {
@@ -77,3 +77,25 @@ func (c *Cleaner) RunOnce(ctx context.Context) error {
7777

7878
return errors.Join(errs...)
7979
}
80+
81+
func (c *Cleaner) evictExpired(ctx context.Context, expired []sandbox.Sandbox) {
82+
if len(expired) == 0 {
83+
return
84+
}
85+
86+
logger.L().Info(ctx, "Cleaner found expired sandboxes", zap.Int("count", len(expired)))
87+
88+
for _, sbx := range expired {
89+
if time.Since(sbx.EndTime) < sandbox.StaleCutoff {
90+
continue
91+
}
92+
93+
if rmErr := c.storage.Remove(context.WithoutCancel(ctx), sbx.TeamID, sbx.SandboxID); rmErr != nil {
94+
logger.L().Error(ctx, "Cleaner failed to remove stale expired sandbox",
95+
zap.Error(rmErr),
96+
logger.WithSandboxID(sbx.SandboxID),
97+
logger.WithTeamID(sbx.TeamID.String()),
98+
)
99+
}
100+
}
101+
}

packages/api/internal/sandbox/storage/redis/cleaner_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,58 @@ func TestCleaner_PreservesFutureScoredExpirationEntry(t *testing.T) {
219219
require.NoError(t, err, "future-scored entry must not be pruned")
220220
}
221221

222+
// TestCleaner_EvictsStaleExpiredSandbox covers the new evictExpired path:
223+
// a sandbox whose EndTime is older than StaleCutoff must be Remove()'d by
224+
// the cleaner so its JSON key, per-team index entry, and globalExpirationSet
225+
// member all disappear.
226+
func TestCleaner_EvictsStaleExpiredSandbox(t *testing.T) {
227+
t.Parallel()
228+
229+
storage, client := setupTestStorage(t)
230+
ctx := t.Context()
231+
232+
sbx := createTestSandbox("stale-expired-" + uuid.NewString())
233+
sbx.EndTime = time.Now().Add(-sandbox.StaleCutoff - time.Minute)
234+
require.NoError(t, storage.Add(ctx, sbx))
235+
236+
cleaner := NewCleaner(storage)
237+
require.NoError(t, cleaner.RunOnce(ctx))
238+
239+
_, err := storage.Get(ctx, sbx.TeamID, sbx.SandboxID)
240+
require.ErrorIs(t, err, sandbox.ErrNotFound, "stale expired sandbox JSON should be removed")
241+
242+
_, err = client.ZScore(ctx, globalExpirationSet,
243+
expirationMember(sbx.TeamID.String(), sbx.SandboxID)).Result()
244+
require.ErrorIs(t, err, redis.Nil, "stale expired sandbox should be removed from globalExpirationSet")
245+
246+
isMember, err := client.SIsMember(ctx,
247+
GetSandboxStorageTeamIndexKey(sbx.TeamID.String()), sbx.SandboxID).Result()
248+
require.NoError(t, err)
249+
require.False(t, isMember, "stale expired sandbox should be removed from per-team index")
250+
}
251+
252+
// TestCleaner_PreservesRecentlyExpiredSandbox guards the StaleCutoff window
253+
// inside evictExpired: a sandbox that has just expired (EndTime in the past
254+
// but newer than StaleCutoff) is still the evictor's responsibility — the
255+
// cleaner must leave it alone so we don't race the evictor.
256+
func TestCleaner_PreservesRecentlyExpiredSandbox(t *testing.T) {
257+
t.Parallel()
258+
259+
storage, _ := setupTestStorage(t)
260+
ctx := t.Context()
261+
262+
sbx := createTestSandbox("fresh-expired-" + uuid.NewString())
263+
sbx.EndTime = time.Now().Add(-time.Second)
264+
require.NoError(t, storage.Add(ctx, sbx))
265+
266+
cleaner := NewCleaner(storage)
267+
require.NoError(t, cleaner.RunOnce(ctx))
268+
269+
got, err := storage.Get(ctx, sbx.TeamID, sbx.SandboxID)
270+
require.NoError(t, err, "recently expired sandbox must survive — eviction is the evictor's job")
271+
require.Equal(t, sbx.SandboxID, got.SandboxID)
272+
}
273+
222274
// Compile-time guard so future refactors of sandbox.StaleCutoff get noticed
223275
// here: the cleaner's correctness depends on it being > 0.
224276
var _ = func() bool {

0 commit comments

Comments
 (0)