Skip to content

Commit d0d09de

Browse files
authored
Add continue on error when user discovery fail in regex resolver (#7534)
* add continue on error in user discovery in regex resolver Signed-off-by: SungJin1212 <tjdwls1201@gmail.com> * changelog Signed-off-by: SungJin1212 <tjdwls1201@gmail.com> * add unit test Signed-off-by: SungJin1212 <tjdwls1201@gmail.com> --------- Signed-off-by: SungJin1212 <tjdwls1201@gmail.com>
1 parent 510097c commit d0d09de

3 files changed

Lines changed: 116 additions & 0 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
* [BUGFIX] Compactor: Fix stale `cortex_bucket_index_last_successful_update_timestamp_seconds` metric not being cleaned up when tenant ownership changes due to ring rebalancing. This caused false alarms on bucket index update rate when a tenant moved between compactors. #7485
3939
* [BUGFIX] Security: Fix stored XSS vulnerability in Alertmanager and Store Gateway status pages by replacing `text/template` with `html/template`. #7512
4040
* [BUGFIX] Security: Limit decompressed gzip output in `ParseProtoReader` and OTLP ingestion path. The decompressed body is now capped by `-distributor.otlp-max-recv-msg-size`. #7515
41+
* [BUGFIX] Tenant Federation: Fix regex resolver clearing known users list when user scan fails. #7534
4142

4243
## 1.21.0 2026-04-24
4344

pkg/querier/tenantfederation/regex_resolver.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ func (r *RegexResolver) running(ctx context.Context) error {
125125
active, deleting, _, err := r.userScanner.ScanUsers(ctx)
126126
if err != nil {
127127
level.Error(r.logger).Log("msg", "failed to discover users from bucket", "err", err)
128+
continue
128129
}
129130

130131
newUsers := append(active, deleting...)

pkg/querier/tenantfederation/regex_resolver_test.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"errors"
66
"fmt"
77
"strings"
8+
"sync"
89
"testing"
910
"time"
1011

@@ -417,6 +418,119 @@ func Test_RegexResolver_CacheNotPurgedWhenUsersUnchanged(t *testing.T) {
417418
require.Equal(t, 1, cacheLen, "cache should not be purged when the set of users has not changed")
418419
}
419420

421+
// toggleableScanner wraps a real scanner and can be flipped into a failure mode
422+
// to simulate transient bucket-scan errors observed by RegexResolver.running().
423+
type toggleableScanner struct {
424+
inner users.Scanner
425+
426+
mu sync.Mutex
427+
failing bool
428+
callsErr int
429+
}
430+
431+
func (t *toggleableScanner) setFailing(v bool) {
432+
t.mu.Lock()
433+
defer t.mu.Unlock()
434+
t.failing = v
435+
}
436+
437+
func (t *toggleableScanner) failedCalls() int {
438+
t.mu.Lock()
439+
defer t.mu.Unlock()
440+
return t.callsErr
441+
}
442+
443+
func (t *toggleableScanner) ScanUsers(ctx context.Context) ([]string, []string, []string, error) {
444+
t.mu.Lock()
445+
failing := t.failing
446+
if failing {
447+
t.callsErr++
448+
}
449+
t.mu.Unlock()
450+
451+
if failing {
452+
return nil, nil, nil, errors.New("simulated scan failure")
453+
}
454+
return t.inner.ScanUsers(ctx)
455+
}
456+
457+
func Test_RegexResolver_ScanFailurePreservesState(t *testing.T) {
458+
reg := prometheus.NewRegistry()
459+
existingTenants := []string{"user-1", "user-2"}
460+
461+
bucketClient := &bucket.ClientMock{}
462+
bucketClient.MockIter("", existingTenants, nil)
463+
bucketClient.MockIter("__markers__", []string{}, nil)
464+
for _, tenant := range existingTenants {
465+
bucketClient.MockExists(users.GetGlobalDeletionMarkPath(tenant), false, nil)
466+
bucketClient.MockExists(users.GetLocalDeletionMarkPath(tenant), false, nil)
467+
}
468+
469+
bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) {
470+
return bucketClient, nil
471+
}
472+
473+
scannerCfg := users.UsersScannerConfig{Strategy: users.UserScanStrategyList}
474+
cfg := Config{UserSyncInterval: 100 * time.Millisecond, MaxTenant: 0, RegexCacheSize: 10}
475+
476+
regexResolver, err := NewRegexResolver(scannerCfg, cfg, reg, bucketClientFactory, log.NewNopLogger())
477+
require.NoError(t, err)
478+
479+
// Swap the scanner BEFORE starting the service so there's no data race with running().
480+
toggle := &toggleableScanner{inner: regexResolver.userScanner}
481+
regexResolver.userScanner = toggle
482+
483+
require.NoError(t, services.StartAndAwaitRunning(context.Background(), regexResolver))
484+
defer services.StopAndAwaitTerminated(context.Background(), regexResolver) //nolint:errcheck
485+
486+
// Wait for the first successful sync to populate knownUsers + metrics.
487+
test.Poll(t, 10*time.Second, true, func() any {
488+
return testutil.ToFloat64(regexResolver.lastUpdateUserRun) > 0 &&
489+
testutil.ToFloat64(regexResolver.discoveredUsers) == float64(len(existingTenants))
490+
})
491+
492+
// Populate the matched cache.
493+
ctx := user.InjectOrgID(context.Background(), "user-.+")
494+
orgIDs, err := regexResolver.TenantIDs(ctx)
495+
require.NoError(t, err)
496+
require.Equal(t, []string{"user-1", "user-2"}, orgIDs)
497+
require.Equal(t, 1, regexResolver.matchedCache.Len())
498+
499+
// Flip the scanner into failure mode, then wait until at least one failed
500+
// scan has actually been observed by the loop. Snapshotting the metrics
501+
// *after* this point avoids a race where a successful tick could fire
502+
// between the snapshot and the toggle flip.
503+
toggle.setFailing(true)
504+
test.Poll(t, 10*time.Second, true, func() any {
505+
return toggle.failedCalls() >= 1
506+
})
507+
lastRunAtFirstFailure := testutil.ToFloat64(regexResolver.lastUpdateUserRun)
508+
discoveredAtFirstFailure := testutil.ToFloat64(regexResolver.discoveredUsers)
509+
510+
// Wait for additional failed scans to confirm the loop iterated past the
511+
// failure point multiple times.
512+
test.Poll(t, 10*time.Second, true, func() any {
513+
return toggle.failedCalls() >= 3
514+
})
515+
516+
// The fix under test: knownUsers and the matched cache must be preserved,
517+
// and success-only metrics must not be updated.
518+
regexResolver.RLock()
519+
require.Equal(t, existingTenants, regexResolver.knownUsers, "knownUsers must be preserved on scan failure")
520+
require.Equal(t, 1, regexResolver.matchedCache.Len(), "matched cache must not be purged on scan failure")
521+
regexResolver.RUnlock()
522+
523+
require.Equal(t, lastRunAtFirstFailure, testutil.ToFloat64(regexResolver.lastUpdateUserRun),
524+
"lastUpdateUserRun must not advance on scan failure")
525+
require.Equal(t, discoveredAtFirstFailure, testutil.ToFloat64(regexResolver.discoveredUsers),
526+
"discoveredUsers must not change on scan failure")
527+
528+
// Repeating the query must still resolve from cached state.
529+
orgIDs, err = regexResolver.TenantIDs(ctx)
530+
require.NoError(t, err)
531+
require.Equal(t, []string{"user-1", "user-2"}, orgIDs)
532+
}
533+
420534
func BenchmarkRegexResolver_TenantIDs(b *testing.B) {
421535
numUsers := 1000
422536
existingTenants := make([]string, numUsers)

0 commit comments

Comments
 (0)