Skip to content

Commit 88c3622

Browse files
fix: alertmanager user config disappearing when ring is unreachable (#7372)
* Fix multitenant alertmanager user config disappearing when ring is unreachable Signed-off-by: Kishore K G <kishorekg@google.com> * Add change log Signed-off-by: Kishore K G <kishorekg@google.com> * format multitenant Signed-off-by: Kishore K G <kishorekg@google.com> * fix pr number Signed-off-by: Kishore K G <kishorekg@google.com> * use ErrNotFound for error validation in unit test Signed-off-by: kishorekg1999 <kishorekg.github@gmail.com> --------- Signed-off-by: Kishore K G <kishorekg@google.com> Signed-off-by: kishorekg1999 <kishorekg@google.com> Signed-off-by: kishorekg1999 <kishorekg.github@gmail.com>
1 parent c9f30dc commit 88c3622

3 files changed

Lines changed: 140 additions & 7 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
* [ENHANCEMENT] Cache: Add per-tenant TTL configuration for query results cache to control cache expiration on a per-tenant basis with separate TTLs for regular and out-of-order data. #7357
66
* [ENHANCEMENT] Tenant Federation: Add a local cache to regex resolver. #7363
77
* [ENHANCEMENT] Query Scheduler: Add `cortex_query_scheduler_tracked_requests` metric to track the current number of requests held by the scheduler. #7355
8+
* [BUGFIX] Alertmanager: Fix disappearing user config and state when ring is temporarily unreachable. #7372
89
* [BUGFIX] Fix nil when ingester_query_max_attempts > 1. #7369
910
* [BUGFIX] Metrics Helper: Fix non-deterministic bucket order in merged histograms by sorting buckets after map iteration, matching Prometheus client library behavior. #7380
1011

pkg/alertmanager/multitenant.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -714,7 +714,10 @@ func (am *MultitenantAlertmanager) userIndexUpdateLoop(ctx context.Context) {
714714
level.Error(am.logger).Log("msg", "context timeout, exit user index update loop", "err", ctx.Err())
715715
return
716716
case <-ticker.C:
717-
owned := am.isUserOwned(userID)
717+
owned, err := am.isUserOwned(userID)
718+
if err != nil {
719+
continue
720+
}
718721
if !owned {
719722
continue
720723
}
@@ -804,7 +807,11 @@ func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context)
804807

805808
// Filter out users not owned by this shard.
806809
for _, userID := range allUserIDs {
807-
if am.isUserOwned(userID) {
810+
owned, err := am.isUserOwned(userID)
811+
if err != nil {
812+
return nil, nil, errors.Wrapf(err, "failed to check if user %s is owned", userID)
813+
}
814+
if owned {
808815
ownedUserIDs = append(ownedUserIDs, userID)
809816
}
810817
}
@@ -821,24 +828,24 @@ func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context)
821828
return allUserIDs, configs, nil
822829
}
823830

824-
func (am *MultitenantAlertmanager) isUserOwned(userID string) bool {
831+
func (am *MultitenantAlertmanager) isUserOwned(userID string) (bool, error) {
825832
if !am.allowedTenants.IsAllowed(userID) {
826-
return false
833+
return false, nil
827834
}
828835

829836
// If sharding is disabled, any alertmanager instance owns all users.
830837
if !am.cfg.ShardingEnabled {
831-
return true
838+
return true, nil
832839
}
833840

834841
alertmanagers, err := am.ring.Get(users.ShardByUser(userID), getSyncRingOp(am.cfg.ShardingRing.DisableReplicaSetExtension), nil, nil, nil)
835842
if err != nil {
836843
am.ringCheckErrors.Inc()
837844
level.Error(am.logger).Log("msg", "failed to load alertmanager configuration", "user", userID, "err", err)
838-
return false
845+
return false, err
839846
}
840847

841-
return alertmanagers.Includes(am.ringLifecycler.GetInstanceAddr())
848+
return alertmanagers.Includes(am.ringLifecycler.GetInstanceAddr()), nil
842849
}
843850

844851
func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alertspb.AlertConfigDesc) {

pkg/alertmanager/multitenant_test.go

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2501,3 +2501,128 @@ func (m *mockAlertManagerLimits) AlertmanagerMaxSilencesCount(_ string) int {
25012501
func (m *mockAlertManagerLimits) AlertmanagerMaxSilenceSizeBytes(_ string) int {
25022502
return m.maxSilencesSizeBytes
25032503
}
2504+
2505+
func TestMultitenantAlertmanager_isUserOwned(t *testing.T) {
2506+
ctx := context.Background()
2507+
_ = ctx
2508+
amConfig := mockAlertmanagerConfig(t)
2509+
amConfig.ShardingEnabled = true
2510+
2511+
ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil)
2512+
t.Cleanup(func() { assert.NoError(t, closer.Close()) })
2513+
2514+
alertStore, err := prepareInMemoryAlertStore()
2515+
require.NoError(t, err)
2516+
2517+
am, err := createMultitenantAlertmanager(amConfig, nil, nil, alertStore, ringStore, nil, log.NewNopLogger(), nil)
2518+
require.NoError(t, err)
2519+
2520+
// We don't start the AM, so the ring is empty.
2521+
// When sharding is enabled, isUserOwned will call am.ring.Get, which should fail because the ring is empty.
2522+
owned, err := am.isUserOwned("user-1")
2523+
require.Error(t, err)
2524+
require.Equal(t, ring.ErrEmptyRing, err)
2525+
require.False(t, owned)
2526+
2527+
// If sharding is disabled, it should return true, nil
2528+
am.cfg.ShardingEnabled = false
2529+
owned, err = am.isUserOwned("user-1")
2530+
require.NoError(t, err)
2531+
require.True(t, owned)
2532+
}
2533+
2534+
func TestMultitenantAlertmanager_loadAndSyncConfigs_deletesUserFromStore(t *testing.T) {
2535+
ctx := context.Background()
2536+
amConfig := mockAlertmanagerConfig(t)
2537+
amConfig.ShardingEnabled = true
2538+
2539+
ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil)
2540+
t.Cleanup(func() { assert.NoError(t, closer.Close()) })
2541+
2542+
alertStore, err := prepareInMemoryAlertStore()
2543+
require.NoError(t, err)
2544+
2545+
am, err := createMultitenantAlertmanager(amConfig, nil, nil, alertStore, ringStore, nil, log.NewNopLogger(), nil)
2546+
require.NoError(t, err)
2547+
2548+
// We don't start the AM. We just manually insert state, then call loadAndSyncConfigs.
2549+
user1 := "user-1"
2550+
2551+
// 1. Create a FullState (remote state) for user1 in the store.
2552+
fullState := alertspb.FullStateDesc{
2553+
State: &clusterpb.FullState{},
2554+
}
2555+
err = alertStore.SetFullState(ctx, user1, fullState)
2556+
require.NoError(t, err)
2557+
2558+
// Since we did NOT SetAlertConfig for user1, the store.ListAllUsers will return empty.
2559+
allUsers, err := alertStore.ListAllUsers(ctx)
2560+
require.NoError(t, err)
2561+
require.Empty(t, allUsers)
2562+
2563+
// Verify user1's FullState is written to the store.
2564+
_, err = alertStore.GetFullState(ctx, user1)
2565+
require.NoError(t, err)
2566+
2567+
// 2. Call loadAndSyncConfigs. It should fetch all users (which is empty),
2568+
// and since sharding is enabled, it should clean up unused remote state for any users not in the list.
2569+
// user1 has state but no config, so its state should be pruned.
2570+
err = am.loadAndSyncConfigs(ctx, reasonPeriodic)
2571+
require.NoError(t, err)
2572+
2573+
// 3. Verify user1's FullState is deleted from the store.
2574+
_, err = alertStore.GetFullState(ctx, user1)
2575+
require.Error(t, err)
2576+
require.Equal(t, alertspb.ErrNotFound, err)
2577+
}
2578+
2579+
func TestMultitenantAlertmanager_loadAndSyncConfigs_DoesNotDeleteUserFromStoreWhenRingUnreachable(t *testing.T) {
2580+
ctx := context.Background()
2581+
amConfig := mockAlertmanagerConfig(t)
2582+
amConfig.ShardingEnabled = true
2583+
2584+
ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil)
2585+
t.Cleanup(func() { assert.NoError(t, closer.Close()) })
2586+
2587+
alertStore, err := prepareInMemoryAlertStore()
2588+
require.NoError(t, err)
2589+
2590+
am, err := createMultitenantAlertmanager(amConfig, nil, nil, alertStore, ringStore, nil, log.NewNopLogger(), nil)
2591+
require.NoError(t, err)
2592+
2593+
user1 := "user-1"
2594+
2595+
// 1. Create a FullState (remote state) for user1 in the store.
2596+
fullState := alertspb.FullStateDesc{
2597+
State: &clusterpb.FullState{},
2598+
}
2599+
err = alertStore.SetFullState(ctx, user1, fullState)
2600+
require.NoError(t, err)
2601+
2602+
// Since we DO SetAlertConfig for user1, the store.ListAllUsers will return the user.
2603+
err = alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{
2604+
User: user1,
2605+
RawConfig: simpleConfigOne,
2606+
Templates: []*alertspb.TemplateDesc{},
2607+
})
2608+
require.NoError(t, err)
2609+
2610+
allUsers, err := alertStore.ListAllUsers(ctx)
2611+
require.NoError(t, err)
2612+
require.Len(t, allUsers, 1)
2613+
2614+
// Verify user1's FullState is written to the store.
2615+
_, err = alertStore.GetFullState(ctx, user1)
2616+
require.NoError(t, err)
2617+
2618+
// 2. Call loadAndSyncConfigs. It should fetch all users.
2619+
// Since sharding is enabled but ring is empty (unreachable), isUserOwned will fail.
2620+
// loadAndSyncConfigs will return early with an error before pruning any states!
2621+
err = am.loadAndSyncConfigs(ctx, reasonPeriodic)
2622+
require.Error(t, err)
2623+
require.Contains(t, err.Error(), ring.ErrEmptyRing.Error())
2624+
2625+
// 3. Verify user1's FullState is STILL in the store.
2626+
_, err = alertStore.GetFullState(ctx, user1)
2627+
require.NoError(t, err)
2628+
}

0 commit comments

Comments
 (0)