Skip to content

Commit d2596a3

Browse files
authored
Fix root cause of nil return in queryWithRetry and labelsWithRetry (#7375)
1 parent 88c3622 commit d2596a3

3 files changed

Lines changed: 49 additions & 22 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
* [ENHANCEMENT] Query Scheduler: Add `cortex_query_scheduler_tracked_requests` metric to track the current number of requests held by the scheduler. #7355
88
* [BUGFIX] Alertmanager: Fix disappearing user config and state when ring is temporarily unreachable. #7372
99
* [BUGFIX] Fix nil when ingester_query_max_attempts > 1. #7369
10+
* [BUGFIX] Querier: Fix queryWithRetry and labelsWithRetry returning (nil, nil) on cancelled context by propagating ctx.Err(). #7370
1011
* [BUGFIX] Metrics Helper: Fix non-deterministic bucket order in merged histograms by sorting buckets after map iteration, matching Prometheus client library behavior. #7380
1112

1213
## 1.21.0 in progress

pkg/querier/distributor_queryable.go

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -166,16 +166,6 @@ func (q *distributorQuerier) streamingSelect(ctx context.Context, sortSeries, pa
166166
return storage.ErrSeriesSet(err)
167167
}
168168

169-
// Guard against nil results. This can happen when queryWithRetry's
170-
// backoff loop exits without executing (e.g., context cancelled before
171-
// the first attempt), returning (nil, nil). See #7364.
172-
if results == nil {
173-
if err != nil {
174-
return storage.ErrSeriesSet(err)
175-
}
176-
return storage.EmptySeriesSet()
177-
}
178-
179169
serieses := make([]storage.Series, 0, len(results.Chunkseries))
180170
for _, result := range results.Chunkseries {
181171
// Sometimes the ingester can send series that have no data.
@@ -236,6 +226,13 @@ func (q *distributorQuerier) queryWithRetry(ctx context.Context, queryFunc func(
236226
retries.Wait()
237227
}
238228

229+
// If the loop never executed (e.g. context cancelled before the first
230+
// attempt), result and err are both nil. Return the context error so
231+
// callers don't receive a nil result with no error.
232+
if err == nil {
233+
err = ctx.Err()
234+
}
235+
239236
return result, err
240237
}
241238

@@ -299,7 +296,7 @@ func (q *distributorQuerier) LabelNames(ctx context.Context, hints *storage.Labe
299296
}
300297

301298
func (q *distributorQuerier) labelsWithRetry(ctx context.Context, labelsFunc func() ([]string, error)) ([]string, error) {
302-
if q.ingesterQueryMaxAttempts == 1 {
299+
if q.ingesterQueryMaxAttempts <= 1 {
303300
return labelsFunc()
304301
}
305302

@@ -322,6 +319,13 @@ func (q *distributorQuerier) labelsWithRetry(ctx context.Context, labelsFunc fun
322319
retries.Wait()
323320
}
324321

322+
// If the loop never executed (e.g. context cancelled before the first
323+
// attempt), result and err are both nil. Return the context error so
324+
// callers don't receive a nil result with no error.
325+
if err == nil {
326+
err = ctx.Err()
327+
}
328+
325329
return result, err
326330
}
327331

pkg/querier/distributor_queryable_test.go

Lines changed: 33 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -418,9 +418,8 @@ func TestDistributorQuerier_Select_CancelledContext_NoRetry(t *testing.T) {
418418
//
419419
// When ingesterQueryMaxAttempts > 1 and the context is cancelled before the
420420
// retry loop starts (e.g. query timeout or another querier goroutine failing),
421-
// backoff.Ongoing() returns false immediately. The result variable stays nil,
422-
// queryWithRetry returns (nil, nil), and streamingSelect dereferences the nil
423-
// result at line 169 → panic.
421+
// backoff.Ongoing() returns false immediately. queryWithRetry now propagates
422+
// ctx.Err() so callers always see a non-nil error.
424423
func TestDistributorQuerier_Select_CancelledContext(t *testing.T) {
425424
t.Parallel()
426425

@@ -440,14 +439,37 @@ func TestDistributorQuerier_Select_CancelledContext(t *testing.T) {
440439
querier, err := queryable.Querier(mint, maxt)
441440
require.NoError(t, err)
442441

443-
// This should NOT panic. Before the fix, the cancelled context causes
444-
// queryWithRetry to return (nil, nil), and streamingSelect dereferences
445-
// the nil result: panic: runtime error: invalid memory address or nil
446-
// pointer dereference [signal SIGSEGV ... addr=0x8]
447-
require.NotPanics(t, func() {
448-
seriesSet := querier.Select(ctx, true, &storage.SelectHints{Start: mint, End: maxt})
449-
// With a cancelled context, we expect either an error or an empty result.
450-
_ = seriesSet.Err()
442+
seriesSet := querier.Select(ctx, true, &storage.SelectHints{Start: mint, End: maxt})
443+
require.ErrorIs(t, seriesSet.Err(), context.Canceled)
444+
}
445+
446+
// TestDistributorQuerier_Labels_CancelledContext verifies that labelsWithRetry
447+
// propagates ctx.Err() when the context is cancelled before the retry loop
448+
// executes.
449+
func TestDistributorQuerier_Labels_CancelledContext(t *testing.T) {
450+
t.Parallel()
451+
452+
ctx := user.InjectOrgID(context.Background(), "0")
453+
ctx, cancel := context.WithCancel(ctx)
454+
cancel()
455+
456+
d := &MockDistributor{}
457+
458+
ingesterQueryMaxAttempts := 2
459+
queryable := newDistributorQueryable(d, true, true, batch.NewChunkMergeIterator, 0, func(string) bool {
460+
return true
461+
}, ingesterQueryMaxAttempts)
462+
querier, err := queryable.Querier(mint, maxt)
463+
require.NoError(t, err)
464+
465+
t.Run("LabelNames", func(t *testing.T) {
466+
_, _, err := querier.LabelNames(ctx, nil)
467+
require.ErrorIs(t, err, context.Canceled)
468+
})
469+
470+
t.Run("LabelValues", func(t *testing.T) {
471+
_, _, err := querier.LabelValues(ctx, "foo", nil)
472+
require.ErrorIs(t, err, context.Canceled)
451473
})
452474
}
453475

0 commit comments

Comments
 (0)