Skip to content

Commit 572fc02

Browse files
authored
fix(cachewd): decouple scheduler context from signal context (#312)
The graceful shutdown PR (#307) passes the signal-notified context to the scheduler, so SIGTERM immediately kills all workers even though `server.Shutdown` keeps HTTP handlers alive for up to 150s. In-flight jobs (snapshots, repacks, fetches) get context-cancelled mid-execution. Gives the scheduler its own context via `context.WithoutCancel` (same pattern already used for the HTTP server `BaseContext`) and cancels it after `server.Shutdown` completes. Workers finish their current jobs during the drain window instead of dying instantly. The drain is bounded to 10s so long-running jobs don't block process exit past the pod's `terminationGracePeriodSeconds`. Production evidence from today's rolling deploys: - Every SIGTERM produced 32+ simultaneous `Worker terminated` log lines per pod - Snapshot jobs killed mid-write: `tar failed: signal: killed`, `context canceled` - `cash-server` snapshot killed 50s in, `ios-register` mirror snapshot killed 5m31s in Shutdown order after this change: 1. SIGTERM → readiness flips to 503 2. `server.Shutdown` drains in-flight HTTP requests (up to 150s) 3. Scheduler context cancelled → workers finish current job and exit 4. Wait up to 10s for workers to drain, then exit
1 parent 26f531a commit 572fc02

2 files changed

Lines changed: 79 additions & 1 deletion

File tree

cmd/cachewd/main.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,11 @@ func main() {
111111
})
112112
s3ClientProvider := s3client.NewClientProvider(ctx, globalConfig.S3Config)
113113

114-
schedulerProvider := jobscheduler.NewProvider(ctx, globalConfig.SchedulerConfig)
114+
// The scheduler gets its own context so workers keep running during
115+
// graceful shutdown while in-flight HTTP handlers drain. We cancel it
116+
// explicitly after server.Shutdown completes.
117+
schedulerCtx, cancelScheduler := context.WithCancel(context.WithoutCancel(ctx))
118+
schedulerProvider := jobscheduler.NewProvider(schedulerCtx, globalConfig.SchedulerConfig)
115119

116120
cr, mr, sr := newRegistries(schedulerProvider, gitManagerProvider, tokenManagerProvider, s3ClientProvider)
117121

@@ -163,6 +167,9 @@ func main() {
163167
}
164168

165169
gracefulShutdown(ctx, logger, server, &shuttingDown, globalConfig.ShutdownReadinessDelay, globalConfig.ShutdownTimeout)
170+
171+
cancelScheduler()
172+
drainScheduler(ctx, logger, schedulerProvider)
166173
}
167174

168175
// gracefulShutdown fails readiness, waits readinessDelay for load balancers
@@ -191,6 +198,26 @@ func gracefulShutdown(
191198
}
192199
}
193200

201+
const schedulerDrainTimeout = 10 * time.Second
202+
203+
func drainScheduler(ctx context.Context, logger *slog.Logger, provider jobscheduler.Provider) {
204+
scheduler, err := provider()
205+
if err != nil {
206+
return
207+
}
208+
done := make(chan struct{})
209+
go func() {
210+
scheduler.Wait()
211+
close(done)
212+
}()
213+
select {
214+
case <-done:
215+
logger.InfoContext(ctx, "Scheduler drained cleanly")
216+
case <-time.After(schedulerDrainTimeout):
217+
logger.WarnContext(ctx, "Scheduler drain timed out, exiting with in-flight jobs")
218+
}
219+
}
220+
194221
func newRegistries(
195222
scheduler jobscheduler.Provider,
196223
cloneManagerProvider gitclone.ManagerProvider,

internal/jobscheduler/jobs_test.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,57 @@ func TestJobSchedulerSubmitDroppedAfterShutdown(t *testing.T) {
335335
assert.False(t, executed.Load(), "submissions after shutdown should be dropped")
336336
}
337337

338+
// TestJobSchedulerSurvivesParentCancel verifies the shutdown ordering fix:
339+
// when the scheduler is created with context.WithoutCancel, cancelling the
340+
// parent (simulating SIGTERM) does NOT kill workers. Jobs submitted after the
341+
// parent cancel still execute. Only cancelling the scheduler's own context
342+
// stops the workers.
343+
func TestJobSchedulerSurvivesParentCancel(t *testing.T) {
344+
_, ctx := logging.Configure(context.Background(), logging.Config{Level: slog.LevelError})
345+
parentCtx, cancelParent := context.WithCancel(ctx)
346+
347+
// Simulate the production fix: scheduler gets context.WithoutCancel so
348+
// it is decoupled from the signal context.
349+
schedulerCtx, cancelScheduler := context.WithCancel(context.WithoutCancel(parentCtx))
350+
defer cancelScheduler()
351+
352+
scheduler := newTestScheduler(schedulerCtx, t, jobscheduler.Config{Concurrency: 2})
353+
354+
// Submit a job and confirm it runs.
355+
var firstJob atomic.Bool
356+
scheduler.Submit("q1", "before-sigterm", func(_ context.Context) error {
357+
firstJob.Store(true)
358+
return nil
359+
})
360+
eventually(t, time.Second, firstJob.Load, "job before parent cancel should run")
361+
362+
// Cancel the parent context (simulates SIGTERM arriving).
363+
cancelParent()
364+
time.Sleep(50 * time.Millisecond)
365+
366+
// Workers should still be alive — submit another job and verify it runs.
367+
var afterCancel atomic.Bool
368+
scheduler.Submit("q2", "after-sigterm", func(_ context.Context) error {
369+
afterCancel.Store(true)
370+
return nil
371+
})
372+
eventually(t, time.Second, afterCancel.Load,
373+
"job submitted after parent cancel should still execute")
374+
375+
// Now cancel the scheduler's own context (simulates post-Shutdown teardown).
376+
cancelScheduler()
377+
time.Sleep(50 * time.Millisecond)
378+
379+
var postShutdown atomic.Bool
380+
scheduler.Submit("q3", "post-shutdown", func(_ context.Context) error {
381+
postShutdown.Store(true)
382+
return nil
383+
})
384+
time.Sleep(100 * time.Millisecond)
385+
assert.False(t, postShutdown.Load(),
386+
"job submitted after scheduler cancel should be dropped")
387+
}
388+
338389
func TestJobSchedulerMultipleQueues(t *testing.T) {
339390
_, ctx := logging.Configure(context.Background(), logging.Config{Level: slog.LevelError})
340391
ctx, cancel := context.WithCancel(ctx)

0 commit comments

Comments
 (0)