Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 31 additions & 12 deletions packages/api/internal/sandbox/storage/redis/cleaner.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,17 @@

"go.uber.org/zap"

"github.com/e2b-dev/infra/packages/api/internal/sandbox"
"github.com/e2b-dev/infra/packages/shared/pkg/logger"
)

const cleanerInterval = time.Minute

// TODO: Remove once fully migrated to Redis
//
// Cleaner prunes stale entries from the two Redis sandbox indexes
// (`globalExpirationSet` and `globalTeamsSet`).
// Cleaner:
// - prunes stale entries from the two Redis sandbox indexes (`globalExpirationSet` and `globalTeamsSet`).
// - removes expired sandboxes
//
// Multi-pod safety: every operation the Cleaner triggers (ZREM/SREM of
// possibly-absent members) is idempotent. Concurrent Cleaners across pods
Expand Down Expand Up @@ -53,22 +55,19 @@

// RunOnce performs one cleanup pass. Each sub-step is independent; a failure
// in one is logged but does not abort the other.
//
// Per-cycle work is bounded:
// - ExpiredItems caps internally at expiredItemsBatchSize (256) members.
// - TeamsWithSandboxCount is one ZRANGE + one pipelined SCARD batch.
func (c *Cleaner) RunOnce(ctx context.Context) error {
var errs []error

// 1. globalExpirationSet: ExpiredItems internally ZREMs members whose
// sandbox JSON is gone (items.go:131-135). Discard the returned
// sandbox list — actually evicting still-running sandboxes is the
// evictor's job, which in memory mode reads the memory backend.
if _, err := c.storage.ExpiredItems(ctx); err != nil {
// 1. globalExpirationSet: ExpiredItems internally ZREMs members whose sandbox JSON is gone (items.go:131-135).
Comment thread
jakubno marked this conversation as resolved.
Outdated
// 2. Discard the returned sandbox list (only if older than StaleCutoff) since the cleaner is not responsible for removing sandboxes
expired, err := c.storage.ExpiredItems(ctx)
if err != nil {
errs = append(errs, fmt.Errorf("expiration index sweep: %w", err))
} else {
c.evictExpired(ctx, expired)

Check warning on line 67 in packages/api/internal/sandbox/storage/redis/cleaner.go

View check run for this annotation

Claude / Claude Code Review

Outdated RunOnce comment contradicts new behavior

The comment block at `cleaner.go:60-62` is internally contradictory and contradicts the new behavior introduced by this PR: it says "Discard the returned sandbox list (only if older than StaleCutoff) since the cleaner is not responsible for removing sandboxes", but the code immediately below calls `c.evictExpired(ctx, expired)` which does exactly that — removes sandboxes older than StaleCutoff. Suggest rewording to something like: "evict sandboxes whose EndTime is older than StaleCutoff; recentl
Comment thread
jakubno marked this conversation as resolved.
}

// 2. globalTeamsSet: TeamsWithSandboxCount internally ZREMs teams whose
// 3. globalTeamsSet: TeamsWithSandboxCount internally ZREMs teams whose
// per-team SCARD is 0 AND whose score is older than StaleCutoff
// (operations.go:268-288). Discard the returned counts.
if _, err := c.storage.TeamsWithSandboxCount(ctx); err != nil {
Expand All @@ -77,3 +76,23 @@

return errors.Join(errs...)
}

func (c *Cleaner) evictExpired(ctx context.Context, expired []sandbox.Sandbox) {
if len(expired) == 0 {
return
}

for _, sbx := range expired {
if time.Since(sbx.EndTime) < sandbox.StaleCutoff {
continue
}

if rmErr := c.storage.Remove(context.WithoutCancel(ctx), sbx.TeamID, sbx.SandboxID); rmErr != nil {
Comment thread
jakubno marked this conversation as resolved.
Comment thread
jakubno marked this conversation as resolved.
Comment thread
jakubno marked this conversation as resolved.
Comment thread
jakubno marked this conversation as resolved.
logger.L().Warn(ctx, "Cleaner failed to purge stale Redis entry after ErrNotFound",
zap.Error(rmErr),
logger.WithSandboxID(sbx.SandboxID),
logger.WithTeamID(sbx.TeamID.String()),
)

Check warning on line 95 in packages/api/internal/sandbox/storage/redis/cleaner.go

View check run for this annotation

Claude / Claude Code Review

Misleading log message references ErrNotFound

The warning message "Cleaner failed to purge stale Redis entry after ErrNotFound" at cleaner.go:91 is misleading — `storage.Remove` never returns `ErrNotFound` and there is no preceding `ErrNotFound` check in `evictExpired`, so this fires for any `Remove` failure (lock acquisition, Lua script errors, Redis connectivity). Operators investigating this warning would be sent looking for a condition that doesn't exist. Suggest rewording to e.g. "Cleaner failed to remove stale expired sandbox".
Comment thread
jakubno marked this conversation as resolved.
Outdated
}
}
}
52 changes: 52 additions & 0 deletions packages/api/internal/sandbox/storage/redis/cleaner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,58 @@ func TestCleaner_PreservesFutureScoredExpirationEntry(t *testing.T) {
require.NoError(t, err, "future-scored entry must not be pruned")
}

// TestCleaner_EvictsStaleExpiredSandbox covers the new evictExpired path:
// a sandbox whose EndTime is older than StaleCutoff must be Remove()'d by
// the cleaner so its JSON key, per-team index entry, and globalExpirationSet
// member all disappear.
func TestCleaner_EvictsStaleExpiredSandbox(t *testing.T) {
t.Parallel()

storage, client := setupTestStorage(t)
ctx := t.Context()

sbx := createTestSandbox("stale-expired-" + uuid.NewString())
sbx.EndTime = time.Now().Add(-sandbox.StaleCutoff - time.Minute)
require.NoError(t, storage.Add(ctx, sbx))

cleaner := NewCleaner(storage)
require.NoError(t, cleaner.RunOnce(ctx))

_, err := storage.Get(ctx, sbx.TeamID, sbx.SandboxID)
require.ErrorIs(t, err, sandbox.ErrNotFound, "stale expired sandbox JSON should be removed")

_, err = client.ZScore(ctx, globalExpirationSet,
expirationMember(sbx.TeamID.String(), sbx.SandboxID)).Result()
require.ErrorIs(t, err, redis.Nil, "stale expired sandbox should be removed from globalExpirationSet")

isMember, err := client.SIsMember(ctx,
GetSandboxStorageTeamIndexKey(sbx.TeamID.String()), sbx.SandboxID).Result()
require.NoError(t, err)
require.False(t, isMember, "stale expired sandbox should be removed from per-team index")
}

// TestCleaner_PreservesRecentlyExpiredSandbox guards the StaleCutoff window
// inside evictExpired: a sandbox that has just expired (EndTime in the past
// but newer than StaleCutoff) is still the evictor's responsibility — the
// cleaner must leave it alone so we don't race the evictor.
func TestCleaner_PreservesRecentlyExpiredSandbox(t *testing.T) {
t.Parallel()

storage, _ := setupTestStorage(t)
ctx := t.Context()

sbx := createTestSandbox("fresh-expired-" + uuid.NewString())
sbx.EndTime = time.Now().Add(-time.Second)
require.NoError(t, storage.Add(ctx, sbx))

cleaner := NewCleaner(storage)
require.NoError(t, cleaner.RunOnce(ctx))

got, err := storage.Get(ctx, sbx.TeamID, sbx.SandboxID)
require.NoError(t, err, "recently expired sandbox must survive — eviction is the evictor's job")
require.Equal(t, sbx.SandboxID, got.SandboxID)
}

// Compile-time guard so future refactors of sandbox.StaleCutoff get noticed
// here: the cleaner's correctness depends on it being > 0.
var _ = func() bool {
Expand Down
Loading