From a1ed0ca32bd7c907a8591375e8c88c76e8869645 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Wed, 25 Mar 2026 07:47:02 +0100 Subject: [PATCH] Fix root cause of nil return in queryWithRetry and labelsWithRetry When ingesterQueryMaxAttempts > 1 and the context is cancelled before the backoff loop starts, retries.Ongoing() returns false immediately and the loop never executes, causing both retry functions to return (nil, nil). This propagates ctx.Err() from the retry functions themselves so all callers are protected, and removes the caller-side nil guard added in #7369. Also aligns the early-return check in labelsWithRetry to use <= 1 consistently with queryWithRetry. Signed-off-by: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> --- CHANGELOG.md | 1 + pkg/querier/distributor_queryable.go | 26 ++++++++------ pkg/querier/distributor_queryable_test.go | 44 +++++++++++++++++------ 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6182531869..118077e1e5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * [ENHANCEMENT] Tenant Federation: Add a local cache to regex resolver. #7363 * [ENHANCEMENT] Query Scheduler: Add `cortex_query_scheduler_tracked_requests` metric to track the current number of requests held by the scheduler. #7355 * [BUGFIX] Fix nil when ingester_query_max_attempts > 1. #7369 +* [BUGFIX] Querier: Fix queryWithRetry and labelsWithRetry returning (nil, nil) on cancelled context by propagating ctx.Err(). #7370 ## 1.21.0 in progress diff --git a/pkg/querier/distributor_queryable.go b/pkg/querier/distributor_queryable.go index 6ab0b592bf..8d82cd7887 100644 --- a/pkg/querier/distributor_queryable.go +++ b/pkg/querier/distributor_queryable.go @@ -166,16 +166,6 @@ func (q *distributorQuerier) streamingSelect(ctx context.Context, sortSeries, pa return storage.ErrSeriesSet(err) } - // Guard against nil results. This can happen when queryWithRetry's - // backoff loop exits without executing (e.g., context cancelled before - // the first attempt), returning (nil, nil). See #7364. - if results == nil { - if err != nil { - return storage.ErrSeriesSet(err) - } - return storage.EmptySeriesSet() - } - serieses := make([]storage.Series, 0, len(results.Chunkseries)) for _, result := range results.Chunkseries { // Sometimes the ingester can send series that have no data. @@ -236,6 +226,13 @@ func (q *distributorQuerier) queryWithRetry(ctx context.Context, queryFunc func( retries.Wait() } + // If the loop never executed (e.g. context cancelled before the first + // attempt), result and err are both nil. Return the context error so + // callers don't receive a nil result with no error. + if err == nil { + err = ctx.Err() + } + return result, err } @@ -299,7 +296,7 @@ func (q *distributorQuerier) LabelNames(ctx context.Context, hints *storage.Labe } func (q *distributorQuerier) labelsWithRetry(ctx context.Context, labelsFunc func() ([]string, error)) ([]string, error) { - if q.ingesterQueryMaxAttempts == 1 { + if q.ingesterQueryMaxAttempts <= 1 { return labelsFunc() } @@ -322,6 +319,13 @@ func (q *distributorQuerier) labelsWithRetry(ctx context.Context, labelsFunc fun retries.Wait() } + // If the loop never executed (e.g. context cancelled before the first + // attempt), result and err are both nil. Return the context error so + // callers don't receive a nil result with no error. + if err == nil { + err = ctx.Err() + } + return result, err } diff --git a/pkg/querier/distributor_queryable_test.go b/pkg/querier/distributor_queryable_test.go index cbec09d8bc..7ec3f583a1 100644 --- a/pkg/querier/distributor_queryable_test.go +++ b/pkg/querier/distributor_queryable_test.go @@ -418,9 +418,8 @@ func TestDistributorQuerier_Select_CancelledContext_NoRetry(t *testing.T) { // // When ingesterQueryMaxAttempts > 1 and the context is cancelled before the // retry loop starts (e.g. query timeout or another querier goroutine failing), -// backoff.Ongoing() returns false immediately. The result variable stays nil, -// queryWithRetry returns (nil, nil), and streamingSelect dereferences the nil -// result at line 169 → panic. +// backoff.Ongoing() returns false immediately. queryWithRetry now propagates +// ctx.Err() so callers always see a non-nil error. func TestDistributorQuerier_Select_CancelledContext(t *testing.T) { t.Parallel() @@ -440,14 +439,37 @@ func TestDistributorQuerier_Select_CancelledContext(t *testing.T) { querier, err := queryable.Querier(mint, maxt) require.NoError(t, err) - // This should NOT panic. Before the fix, the cancelled context causes - // queryWithRetry to return (nil, nil), and streamingSelect dereferences - // the nil result: panic: runtime error: invalid memory address or nil - // pointer dereference [signal SIGSEGV ... addr=0x8] - require.NotPanics(t, func() { - seriesSet := querier.Select(ctx, true, &storage.SelectHints{Start: mint, End: maxt}) - // With a cancelled context, we expect either an error or an empty result. - _ = seriesSet.Err() + seriesSet := querier.Select(ctx, true, &storage.SelectHints{Start: mint, End: maxt}) + require.ErrorIs(t, seriesSet.Err(), context.Canceled) +} + +// TestDistributorQuerier_Labels_CancelledContext verifies that labelsWithRetry +// propagates ctx.Err() when the context is cancelled before the retry loop +// executes. +func TestDistributorQuerier_Labels_CancelledContext(t *testing.T) { + t.Parallel() + + ctx := user.InjectOrgID(context.Background(), "0") + ctx, cancel := context.WithCancel(ctx) + cancel() + + d := &MockDistributor{} + + ingesterQueryMaxAttempts := 2 + queryable := newDistributorQueryable(d, true, true, batch.NewChunkMergeIterator, 0, func(string) bool { + return true + }, ingesterQueryMaxAttempts) + querier, err := queryable.Querier(mint, maxt) + require.NoError(t, err) + + t.Run("LabelNames", func(t *testing.T) { + _, _, err := querier.LabelNames(ctx, nil) + require.ErrorIs(t, err, context.Canceled) + }) + + t.Run("LabelValues", func(t *testing.T) { + _, _, err := querier.LabelValues(ctx, "foo", nil) + require.ErrorIs(t, err, context.Canceled) }) }