Basic stuck job detection (#1097)

brandur · web-flow · commit 51f834601a1d · 2025-12-16T16:03:54.000+02:00
Here, try to make some inroads on a feature we've been talking about for
a while: detection of stuck jobs.

Unfortunately in Go it's quite easy to accidentally park a job by using
a `select` on a channel that won't return and forgetting a separate
branch for `<-ctx.Done()` so that it won't respect job timeouts either.

Here, add in some basic detection for that case. Eventually we'd like to
give users some options for what to do in case jobs become stuck, but
here we do only the simplest things for now: log when we detect a stuck
job and count the number of stuck jobs in a producer's stats loop.

In the future we may want to have some additional intelligence like
having producers move stuck jobs to a separate bucket up to a certain
limit before crashing (the next best option because it's not possible to
manually kill goroutines).
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- Basic stuck job detection: a job is considered stuck once it has exceeded its timeout and still hasn't returned after the executor has initiated context cancellation and waited a short margin for the cancellation to take effect. [PR #1097](https://github.com/riverqueue/river/pull/1097).
+
 ## [0.29.0-rc.1] - 2025-12-04
 
 - Added `HookPeriodicJobsStart` that can be used to run custom logic when a periodic job enqueuer starts up on a new leader. [PR #1084](https://github.com/riverqueue/river/pull/1084).
diff --git a/internal/jobexecutor/job_executor.go b/internal/jobexecutor/job_executor.go
@@ -112,12 +112,17 @@ type JobExecutor struct {
 	ErrorHandler             ErrorHandler
 	HookLookupByJob          *hooklookup.JobHookLookup
 	HookLookupGlobal         hooklookup.HookLookupInterface
-	InformProducerDoneFunc   func(jobRow *rivertype.JobRow)
 	JobRow                   *rivertype.JobRow
 	MiddlewareLookupGlobal   middlewarelookup.MiddlewareLookupInterface
-	SchedulerInterval        time.Duration
-	WorkerMiddleware         []rivertype.WorkerMiddleware
-	WorkUnit                 workunit.WorkUnit
+	ProducerCallbacks        struct {
+		JobDone func(jobRow *rivertype.JobRow)
+		Stuck   func()
+		Unstuck func()
+	}
+	SchedulerInterval      time.Duration
+	StuckThresholdOverride time.Duration
+	WorkerMiddleware       []rivertype.WorkerMiddleware
+	WorkUnit               workunit.WorkUnit
 
 	// Meant to be used from within the job executor only.
 	start time.Time
@@ -159,7 +164,7 @@ func (e *JobExecutor) Execute(ctx context.Context) {
 		}
 	}
 
-	e.InformProducerDoneFunc(e.JobRow)
+	e.ProducerCallbacks.JobDone(e.JobRow)
 }
 
 // Executes the job, handling a panic if necessary (and various other error
@@ -171,6 +176,59 @@ func (e *JobExecutor) execute(ctx context.Context) (res *jobExecutorResult) {
 	metadataUpdates := make(map[string]any)
 	ctx = context.WithValue(ctx, ContextKeyMetadataUpdates, metadataUpdates)
 
+	// Watches for jobs that may have become stuck. i.e. They've run longer than
+	// their job timeout (plus a small margin) and don't appear to be responding
+	// to context cancellation (unfortunately, quite an easy error to make in
+	// Go).
+	//
+	// Currently we don't do anything if we notice a job is stuck. Knowing about
+	// stuck jobs is just used for informational purposes in the producer in
+	// generating periodic stats.
+	if e.ClientJobTimeout > 0 {
+		// We add a WithoutCancel here so that this inner goroutine becomes
+		// immune to all context cancellations _except_ the one where it's
+		// cancelled because we leave JobExecutor.execute.
+		//
+		// This shadows the context outside the e.ClientJobTimeout > 0 check.
+		ctx, cancel := context.WithCancel(context.WithoutCancel(ctx))
+		defer cancel()
+
+		go func() {
+			const stuckThresholdDefault = 5 * time.Second
+
+			select {
+			case <-ctx.Done():
+				// context cancelled as we leave JobExecutor.execute
+
+			case <-time.After(e.ClientJobTimeout + cmp.Or(e.StuckThresholdOverride, stuckThresholdDefault)):
+				e.ProducerCallbacks.Stuck()
+
+				e.Logger.WarnContext(ctx, e.Name+": Job appears to be stuck",
+					slog.Int64("job_id", e.JobRow.ID),
+					slog.String("kind", e.JobRow.Kind),
+					slog.Duration("timeout", e.ClientJobTimeout),
+				)
+
+				// context cancelled as we leave JobExecutor.execute
+				<-ctx.Done()
+
+				// In case the executor ever becomes unstuck, inform the
+				// producer. However, if we got all the way here there's a good
+				// chance this will never happen (the worker is really stuck and
+				// will never return).
+				defer e.ProducerCallbacks.Unstuck()
+
+				defer func() {
+					e.Logger.InfoContext(ctx, e.Name+": Job became unstuck",
+						slog.Duration("duration", time.Since(e.start)),
+						slog.Int64("job_id", e.JobRow.ID),
+						slog.String("kind", e.JobRow.Kind),
+					)
+				}()
+			}
+		}()
+	}
+
 	defer func() {
 		if recovery := recover(); recovery != nil {
 			e.Logger.ErrorContext(ctx, e.Name+": panic recovery; possible bug with Worker",
diff --git a/internal/jobexecutor/job_executor_test.go b/internal/jobexecutor/job_executor_test.go
@@ -191,11 +191,19 @@ func TestJobExecutor_Execute(t *testing.T) {
 			ErrorHandler:             bundle.errorHandler,
 			HookLookupByJob:          hooklookup.NewJobHookLookup(),
 			HookLookupGlobal:         hooklookup.NewHookLookup(nil),
-			InformProducerDoneFunc:   func(job *rivertype.JobRow) {},
 			JobRow:                   bundle.jobRow,
 			MiddlewareLookupGlobal:   middlewarelookup.NewMiddlewareLookup(nil),
-			SchedulerInterval:        riverinternaltest.SchedulerShortInterval,
-			WorkUnit:                 workUnitFactory.MakeUnit(bundle.jobRow),
+			ProducerCallbacks: struct {
+				JobDone func(jobRow *rivertype.JobRow)
+				Stuck   func()
+				Unstuck func()
+			}{
+				JobDone: func(jobRow *rivertype.JobRow) {},
+				Stuck:   func() {},
+				Unstuck: func() {},
+			},
+			SchedulerInterval: riverinternaltest.SchedulerShortInterval,
+			WorkUnit:          workUnitFactory.MakeUnit(bundle.jobRow),
 		})
 
 		return executor, bundle
@@ -696,6 +704,94 @@ func TestJobExecutor_Execute(t *testing.T) {
 		})
 	})
 
+	configureStuckDetection := func(executor *JobExecutor) {
+		executor.ClientJobTimeout = 5 * time.Millisecond
+		executor.StuckThresholdOverride = 1 * time.Nanosecond // must be greater than 0 to take effect
+	}
+
+	t.Run("StuckDetectionActivates", func(t *testing.T) {
+		t.Parallel()
+
+		executor, bundle := setup(t)
+
+		configureStuckDetection(executor)
+
+		var (
+			informProducerStuckReceived   = make(chan struct{})
+			informProducerUnstuckReceived = make(chan struct{})
+		)
+		executor.ProducerCallbacks.Stuck = func() {
+			t.Log("Job executor reported stuck")
+			close(informProducerStuckReceived)
+		}
+		executor.ProducerCallbacks.Unstuck = func() {
+			t.Log("Job executor reported unstuck (after being stuck)")
+			close(informProducerUnstuckReceived)
+		}
+
+		executor.WorkUnit = newWorkUnitFactoryWithCustomRetry(func() error {
+			riversharedtest.WaitOrTimeout(t, informProducerStuckReceived)
+
+			select {
+			case <-informProducerUnstuckReceived:
+				require.FailNow(t, "Executor should not have reported unstuck immediately")
+			case <-time.After(10 * time.Millisecond):
+				t.Log("Job executor still stuck after wait (this is expected)")
+			}
+
+			return nil
+		}, nil).MakeUnit(bundle.jobRow)
+
+		executor.Execute(ctx)
+		_ = riversharedtest.WaitOrTimeout(t, bundle.updateCh)
+
+		riversharedtest.WaitOrTimeout(t, informProducerUnstuckReceived)
+	})
+
+	// Checks that even if a work context is cancelled immediately, stuck
+	// detection still works as expected.
+	t.Run("StuckDetectionIgnoresParentContextCancellation", func(t *testing.T) {
+		t.Parallel()
+
+		executor, bundle := setup(t)
+
+		configureStuckDetection(executor)
+
+		var (
+			informProducerStuckReceived   = make(chan struct{})
+			informProducerUnstuckReceived = make(chan struct{})
+		)
+		executor.ProducerCallbacks.Stuck = func() {
+			t.Log("Job executor reported stuck")
+			close(informProducerStuckReceived)
+		}
+		executor.ProducerCallbacks.Unstuck = func() {
+			t.Log("Job executor reported unstuck (after being stuck)")
+			close(informProducerUnstuckReceived)
+		}
+
+		executor.WorkUnit = newWorkUnitFactoryWithCustomRetry(func() error {
+			riversharedtest.WaitOrTimeout(t, informProducerStuckReceived)
+
+			select {
+			case <-informProducerUnstuckReceived:
+				require.FailNow(t, "Executor should not have reported unstuck immediately")
+			case <-time.After(10 * time.Millisecond):
+				t.Log("Job executor still stuck after wait (this is expected)")
+			}
+
+			return nil
+		}, nil).MakeUnit(bundle.jobRow)
+
+		ctx, cancel := context.WithCancel(ctx)
+		cancel() // cancel immediately
+
+		executor.Execute(ctx)
+		_ = riversharedtest.WaitOrTimeout(t, bundle.updateCh)
+
+		riversharedtest.WaitOrTimeout(t, informProducerUnstuckReceived)
+	})
+
 	t.Run("Panic", func(t *testing.T) {
 		t.Parallel()
 
diff --git a/producer.go b/producer.go
@@ -209,6 +209,7 @@ type producer struct {
 	// An atomic count of the number of jobs actively being worked on. This is
 	// written to by the main goroutine, but read by the dispatcher.
 	numJobsActive atomic.Int32
+	numJobsStuck  atomic.Int32
 
 	numJobsRan atomic.Uint64
 	paused     bool
@@ -771,20 +772,26 @@ func (p *producer) heartbeatLogLoop(ctx context.Context, wg *sync.WaitGroup) {
 	ticker := time.NewTicker(5 * time.Second)
 	defer ticker.Stop()
 	type jobCount struct {
-		ran    uint64
 		active int
+		ran    uint64
+		stuck  int
 	}
 	var prevCount jobCount
 	for {
 		select {
 		case <-ctx.Done():
 			return
 		case <-ticker.C:
-			curCount := jobCount{ran: p.numJobsRan.Load(), active: int(p.numJobsActive.Load())}
+			curCount := jobCount{
+				active: int(p.numJobsActive.Load()),
+				ran:    p.numJobsRan.Load(),
+				stuck:  int(p.numJobsStuck.Load()),
+			}
 			if curCount != prevCount {
 				p.Logger.InfoContext(ctx, p.Name+": Producer job counts",
 					slog.Uint64("num_completed_jobs", curCount.ran),
 					slog.Int("num_jobs_running", curCount.active),
+					slog.Int("num_jobs_stuck", curCount.stuck),
 					slog.String("queue", p.config.Queue),
 				)
 			}
@@ -815,10 +822,18 @@ func (p *producer) startNewExecutors(workCtx context.Context, jobs []*rivertype.
 			HookLookupByJob:          p.config.HookLookupByJob,
 			HookLookupGlobal:         p.config.HookLookupGlobal,
 			MiddlewareLookupGlobal:   p.config.MiddlewareLookupGlobal,
-			InformProducerDoneFunc:   p.handleWorkerDone,
 			JobRow:                   job,
-			SchedulerInterval:        p.config.SchedulerInterval,
-			WorkUnit:                 workUnit,
+			ProducerCallbacks: struct {
+				JobDone func(jobRow *rivertype.JobRow)
+				Stuck   func()
+				Unstuck func()
+			}{
+				JobDone: p.handleWorkerDone,
+				Stuck:   func() { p.numJobsStuck.Add(1) },
+				Unstuck: func() { p.numJobsStuck.Add(-1) },
+			},
+			SchedulerInterval: p.config.SchedulerInterval,
+			WorkUnit:          workUnit,
 		})
 		p.addActiveJob(job.ID, executor)
 
diff --git a/rivertest/worker.go b/rivertest/worker.go
@@ -203,13 +203,21 @@ func (w *Worker[T, TTx]) workJob(ctx context.Context, tb testing.TB, tx TTx, job
 				return nil
 			},
 		},
-		InformProducerDoneFunc: func(job *rivertype.JobRow) { close(executionDone) },
 		HookLookupGlobal:       hooklookup.NewHookLookup(w.config.Hooks),
 		HookLookupByJob:        hooklookup.NewJobHookLookup(),
 		JobRow:                 job,
 		MiddlewareLookupGlobal: middlewarelookup.NewMiddlewareLookup(w.config.Middleware),
-		SchedulerInterval:      maintenance.JobSchedulerIntervalDefault,
-		WorkUnit:               workUnit,
+		ProducerCallbacks: struct {
+			JobDone func(jobRow *rivertype.JobRow)
+			Stuck   func()
+			Unstuck func()
+		}{
+			JobDone: func(job *rivertype.JobRow) { close(executionDone) },
+			Stuck:   func() {},
+			Unstuck: func() {},
+		},
+		SchedulerInterval: maintenance.JobSchedulerIntervalDefault,
+		WorkUnit:          workUnit,
 	})
 
 	executor.Execute(jobCtx)