|
5 | 5 | "errors" |
6 | 6 | "fmt" |
7 | 7 | "strings" |
| 8 | + "sync" |
8 | 9 | "testing" |
9 | 10 | "time" |
10 | 11 |
|
@@ -417,6 +418,119 @@ func Test_RegexResolver_CacheNotPurgedWhenUsersUnchanged(t *testing.T) { |
417 | 418 | require.Equal(t, 1, cacheLen, "cache should not be purged when the set of users has not changed") |
418 | 419 | } |
419 | 420 |
|
| 421 | +// toggleableScanner wraps a real scanner and can be flipped into a failure mode |
| 422 | +// to simulate transient bucket-scan errors observed by RegexResolver.running(). |
| 423 | +type toggleableScanner struct { |
| 424 | + inner users.Scanner |
| 425 | + |
| 426 | + mu sync.Mutex |
| 427 | + failing bool |
| 428 | + callsErr int |
| 429 | +} |
| 430 | + |
| 431 | +func (t *toggleableScanner) setFailing(v bool) { |
| 432 | + t.mu.Lock() |
| 433 | + defer t.mu.Unlock() |
| 434 | + t.failing = v |
| 435 | +} |
| 436 | + |
| 437 | +func (t *toggleableScanner) failedCalls() int { |
| 438 | + t.mu.Lock() |
| 439 | + defer t.mu.Unlock() |
| 440 | + return t.callsErr |
| 441 | +} |
| 442 | + |
| 443 | +func (t *toggleableScanner) ScanUsers(ctx context.Context) ([]string, []string, []string, error) { |
| 444 | + t.mu.Lock() |
| 445 | + failing := t.failing |
| 446 | + if failing { |
| 447 | + t.callsErr++ |
| 448 | + } |
| 449 | + t.mu.Unlock() |
| 450 | + |
| 451 | + if failing { |
| 452 | + return nil, nil, nil, errors.New("simulated scan failure") |
| 453 | + } |
| 454 | + return t.inner.ScanUsers(ctx) |
| 455 | +} |
| 456 | + |
| 457 | +func Test_RegexResolver_ScanFailurePreservesState(t *testing.T) { |
| 458 | + reg := prometheus.NewRegistry() |
| 459 | + existingTenants := []string{"user-1", "user-2"} |
| 460 | + |
| 461 | + bucketClient := &bucket.ClientMock{} |
| 462 | + bucketClient.MockIter("", existingTenants, nil) |
| 463 | + bucketClient.MockIter("__markers__", []string{}, nil) |
| 464 | + for _, tenant := range existingTenants { |
| 465 | + bucketClient.MockExists(users.GetGlobalDeletionMarkPath(tenant), false, nil) |
| 466 | + bucketClient.MockExists(users.GetLocalDeletionMarkPath(tenant), false, nil) |
| 467 | + } |
| 468 | + |
| 469 | + bucketClientFactory := func(ctx context.Context) (objstore.InstrumentedBucket, error) { |
| 470 | + return bucketClient, nil |
| 471 | + } |
| 472 | + |
| 473 | + scannerCfg := users.UsersScannerConfig{Strategy: users.UserScanStrategyList} |
| 474 | + cfg := Config{UserSyncInterval: 100 * time.Millisecond, MaxTenant: 0, RegexCacheSize: 10} |
| 475 | + |
| 476 | + regexResolver, err := NewRegexResolver(scannerCfg, cfg, reg, bucketClientFactory, log.NewNopLogger()) |
| 477 | + require.NoError(t, err) |
| 478 | + |
| 479 | + // Swap the scanner BEFORE starting the service so there's no data race with running(). |
| 480 | + toggle := &toggleableScanner{inner: regexResolver.userScanner} |
| 481 | + regexResolver.userScanner = toggle |
| 482 | + |
| 483 | + require.NoError(t, services.StartAndAwaitRunning(context.Background(), regexResolver)) |
| 484 | + defer services.StopAndAwaitTerminated(context.Background(), regexResolver) //nolint:errcheck |
| 485 | + |
| 486 | + // Wait for the first successful sync to populate knownUsers + metrics. |
| 487 | + test.Poll(t, 10*time.Second, true, func() any { |
| 488 | + return testutil.ToFloat64(regexResolver.lastUpdateUserRun) > 0 && |
| 489 | + testutil.ToFloat64(regexResolver.discoveredUsers) == float64(len(existingTenants)) |
| 490 | + }) |
| 491 | + |
| 492 | + // Populate the matched cache. |
| 493 | + ctx := user.InjectOrgID(context.Background(), "user-.+") |
| 494 | + orgIDs, err := regexResolver.TenantIDs(ctx) |
| 495 | + require.NoError(t, err) |
| 496 | + require.Equal(t, []string{"user-1", "user-2"}, orgIDs) |
| 497 | + require.Equal(t, 1, regexResolver.matchedCache.Len()) |
| 498 | + |
| 499 | + // Flip the scanner into failure mode, then wait until at least one failed |
| 500 | + // scan has actually been observed by the loop. Snapshotting the metrics |
| 501 | + // *after* this point avoids a race where a successful tick could fire |
| 502 | + // between the snapshot and the toggle flip. |
| 503 | + toggle.setFailing(true) |
| 504 | + test.Poll(t, 10*time.Second, true, func() any { |
| 505 | + return toggle.failedCalls() >= 1 |
| 506 | + }) |
| 507 | + lastRunAtFirstFailure := testutil.ToFloat64(regexResolver.lastUpdateUserRun) |
| 508 | + discoveredAtFirstFailure := testutil.ToFloat64(regexResolver.discoveredUsers) |
| 509 | + |
| 510 | + // Wait for additional failed scans to confirm the loop iterated past the |
| 511 | + // failure point multiple times. |
| 512 | + test.Poll(t, 10*time.Second, true, func() any { |
| 513 | + return toggle.failedCalls() >= 3 |
| 514 | + }) |
| 515 | + |
| 516 | + // The fix under test: knownUsers and the matched cache must be preserved, |
| 517 | + // and success-only metrics must not be updated. |
| 518 | + regexResolver.RLock() |
| 519 | + require.Equal(t, existingTenants, regexResolver.knownUsers, "knownUsers must be preserved on scan failure") |
| 520 | + require.Equal(t, 1, regexResolver.matchedCache.Len(), "matched cache must not be purged on scan failure") |
| 521 | + regexResolver.RUnlock() |
| 522 | + |
| 523 | + require.Equal(t, lastRunAtFirstFailure, testutil.ToFloat64(regexResolver.lastUpdateUserRun), |
| 524 | + "lastUpdateUserRun must not advance on scan failure") |
| 525 | + require.Equal(t, discoveredAtFirstFailure, testutil.ToFloat64(regexResolver.discoveredUsers), |
| 526 | + "discoveredUsers must not change on scan failure") |
| 527 | + |
| 528 | + // Repeating the query must still resolve from cached state. |
| 529 | + orgIDs, err = regexResolver.TenantIDs(ctx) |
| 530 | + require.NoError(t, err) |
| 531 | + require.Equal(t, []string{"user-1", "user-2"}, orgIDs) |
| 532 | +} |
| 533 | + |
420 | 534 | func BenchmarkRegexResolver_TenantIDs(b *testing.B) { |
421 | 535 | numUsers := 1000 |
422 | 536 | existingTenants := make([]string, numUsers) |
|
0 commit comments