From c6e8a009a2197cb499a3919b80e9e6f737736df5 Mon Sep 17 00:00:00 2001 From: Kishore K G Date: Tue, 24 Mar 2026 10:01:16 +0000 Subject: [PATCH 1/5] Fix multitenant alertmanager user config disappearing when ring is unreachable Signed-off-by: Kishore K G --- pkg/alertmanager/multitenant.go | 21 +++-- pkg/alertmanager/multitenant_test.go | 131 +++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 7 deletions(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index f473a66c1b..9494ce77c7 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -714,7 +714,10 @@ func (am *MultitenantAlertmanager) userIndexUpdateLoop(ctx context.Context) { level.Error(am.logger).Log("msg", "context timeout, exit user index update loop", "err", ctx.Err()) return case <-ticker.C: - owned := am.isUserOwned(userID) + owned, err := am.isUserOwned(userID) + if err != nil { + continue + } if !owned { continue } @@ -804,7 +807,11 @@ func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context) // Filter out users not owned by this shard. for _, userID := range allUserIDs { - if am.isUserOwned(userID) { + owned, err := am.isUserOwned(userID) + if err != nil { + return nil, nil, errors.Wrapf(err, "failed to check if user %s is owned", userID) + } + if owned { ownedUserIDs = append(ownedUserIDs, userID) } } @@ -821,24 +828,24 @@ func (am *MultitenantAlertmanager) loadAlertmanagerConfigs(ctx context.Context) return allUserIDs, configs, nil } -func (am *MultitenantAlertmanager) isUserOwned(userID string) bool { +func (am *MultitenantAlertmanager) isUserOwned(userID string) (bool, error) { if !am.allowedTenants.IsAllowed(userID) { - return false + return false, nil } // If sharding is disabled, any alertmanager instance owns all users. if !am.cfg.ShardingEnabled { - return true + return true, nil } alertmanagers, err := am.ring.Get(users.ShardByUser(userID), getSyncRingOp(am.cfg.ShardingRing.DisableReplicaSetExtension), nil, nil, nil) if err != nil { am.ringCheckErrors.Inc() level.Error(am.logger).Log("msg", "failed to load alertmanager configuration", "user", userID, "err", err) - return false + return false, err } - return alertmanagers.Includes(am.ringLifecycler.GetInstanceAddr()) + return alertmanagers.Includes(am.ringLifecycler.GetInstanceAddr()), nil } func (am *MultitenantAlertmanager) syncConfigs(cfgs map[string]alertspb.AlertConfigDesc) { diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index c1e6a483af..2ecf975439 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -2501,3 +2501,134 @@ func (m *mockAlertManagerLimits) AlertmanagerMaxSilencesCount(_ string) int { func (m *mockAlertManagerLimits) AlertmanagerMaxSilenceSizeBytes(_ string) int { return m.maxSilencesSizeBytes } + + + + +func TestMultitenantAlertmanager_isUserOwned(t *testing.T) { + ctx := context.Background() + _ = ctx + amConfig := mockAlertmanagerConfig(t) + amConfig.ShardingEnabled = true + + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + alertStore, err := prepareInMemoryAlertStore() + require.NoError(t, err) + + am, err := createMultitenantAlertmanager(amConfig, nil, nil, alertStore, ringStore, nil, log.NewNopLogger(), nil) + require.NoError(t, err) + + // We don't start the AM, so the ring is empty. +// When sharding is enabled, isUserOwned will call am.ring.Get, which should fail because the ring is empty. +owned, err := am.isUserOwned("user-1") +require.Error(t, err) +require.Equal(t, ring.ErrEmptyRing, err) +require.False(t, owned) + +// If sharding is disabled, it should return true, nil +am.cfg.ShardingEnabled = false +owned, err = am.isUserOwned("user-1") +require.NoError(t, err) +require.True(t, owned) +} + + + + +func TestMultitenantAlertmanager_loadAndSyncConfigs_deletesUserFromStore(t *testing.T) { + ctx := context.Background() + amConfig := mockAlertmanagerConfig(t) + amConfig.ShardingEnabled = true + + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + alertStore, err := prepareInMemoryAlertStore() + require.NoError(t, err) + + am, err := createMultitenantAlertmanager(amConfig, nil, nil, alertStore, ringStore, nil, log.NewNopLogger(), nil) + require.NoError(t, err) + + // We don't start the AM. We just manually insert state, then call loadAndSyncConfigs. +user1 := "user-1" + +// 1. Create a FullState (remote state) for user1 in the store. +fullState := alertspb.FullStateDesc{ +State: &clusterpb.FullState{}, +} +err = alertStore.SetFullState(ctx, user1, fullState) +require.NoError(t, err) + +// Since we did NOT SetAlertConfig for user1, the store.ListAllUsers will return empty. +allUsers, err := alertStore.ListAllUsers(ctx) +require.NoError(t, err) +require.Empty(t, allUsers) + +// Verify user1's FullState is written to the store. + _, err = alertStore.GetFullState(ctx, user1) + require.NoError(t, err) + + // 2. Call loadAndSyncConfigs. It should fetch all users (which is empty), + // and since sharding is enabled, it should clean up unused remote state for any users not in the list. + // user1 has state but no config, so its state should be pruned. + err = am.loadAndSyncConfigs(ctx, reasonPeriodic) + require.NoError(t, err) + + // 3. Verify user1's FullState is deleted from the store. +_, err = alertStore.GetFullState(ctx, user1) +require.Error(t, err) +require.Contains(t, err.Error(), "alertmanager storage object not found") +} + +func TestMultitenantAlertmanager_loadAndSyncConfigs_DoesNotDeleteUserFromStoreWhenRingUnreachable(t *testing.T) { + ctx := context.Background() + amConfig := mockAlertmanagerConfig(t) + amConfig.ShardingEnabled = true + + ringStore, closer := consul.NewInMemoryClient(ring.GetCodec(), log.NewNopLogger(), nil) + t.Cleanup(func() { assert.NoError(t, closer.Close()) }) + + alertStore, err := prepareInMemoryAlertStore() + require.NoError(t, err) + + am, err := createMultitenantAlertmanager(amConfig, nil, nil, alertStore, ringStore, nil, log.NewNopLogger(), nil) + require.NoError(t, err) + + user1 := "user-1" + + // 1. Create a FullState (remote state) for user1 in the store. + fullState := alertspb.FullStateDesc{ + State: &clusterpb.FullState{}, + } + err = alertStore.SetFullState(ctx, user1, fullState) + require.NoError(t, err) + + // Since we DO SetAlertConfig for user1, the store.ListAllUsers will return the user. + err = alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ +User: user1, +RawConfig: simpleConfigOne, +Templates: []*alertspb.TemplateDesc{}, +}) + require.NoError(t, err) + + allUsers, err := alertStore.ListAllUsers(ctx) + require.NoError(t, err) + require.Len(t, allUsers, 1) + + // Verify user1's FullState is written to the store. +_, err = alertStore.GetFullState(ctx, user1) +require.NoError(t, err) + +// 2. Call loadAndSyncConfigs. It should fetch all users. +// Since sharding is enabled but ring is empty (unreachable), isUserOwned will fail. +// loadAndSyncConfigs will return early with an error before pruning any states! +err = am.loadAndSyncConfigs(ctx, reasonPeriodic) +require.Error(t, err) +require.Contains(t, err.Error(), ring.ErrEmptyRing.Error()) + +// 3. Verify user1's FullState is STILL in the store. + _, err = alertStore.GetFullState(ctx, user1) + require.NoError(t, err) +} From d02696ccda4a785f0426d0688b296835bc3b1559 Mon Sep 17 00:00:00 2001 From: Kishore K G Date: Tue, 24 Mar 2026 10:09:33 +0000 Subject: [PATCH 2/5] Add change log Signed-off-by: Kishore K G --- CHANGELOG.md | 1 + pkg/alertmanager/multitenant_test.go | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2f2ea4b79..b1e23fd513 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * [ENHANCEMENT] Cache: Add per-tenant TTL configuration for query results cache to control cache expiration on a per-tenant basis with separate TTLs for regular and out-of-order data. #7357 * [ENHANCEMENT] Tenant Federation: Add a local cache to regex resolver. #7363 * [ENHANCEMENT] Query Scheduler: Add `cortex_query_scheduler_tracked_requests` metric to track the current number of requests held by the scheduler. #7355 +* [BUGFIX] Alertmanager: Fix disappearing user config and state when ring is temporarily unreachable. #7367 ## 1.21.0 in progress diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 2ecf975439..5fcaddb40c 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -2534,9 +2534,6 @@ require.NoError(t, err) require.True(t, owned) } - - - func TestMultitenantAlertmanager_loadAndSyncConfigs_deletesUserFromStore(t *testing.T) { ctx := context.Background() amConfig := mockAlertmanagerConfig(t) From b5d6b751ca9a150ec32ea2be80360695fbad9ff6 Mon Sep 17 00:00:00 2001 From: Kishore K G Date: Tue, 24 Mar 2026 10:32:43 +0000 Subject: [PATCH 3/5] format multitenant Signed-off-by: Kishore K G --- pkg/alertmanager/multitenant_test.go | 83 ++++++++++++++-------------- 1 file changed, 40 insertions(+), 43 deletions(-) diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 5fcaddb40c..506e9d60d5 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -2502,9 +2502,6 @@ func (m *mockAlertManagerLimits) AlertmanagerMaxSilenceSizeBytes(_ string) int { return m.maxSilencesSizeBytes } - - - func TestMultitenantAlertmanager_isUserOwned(t *testing.T) { ctx := context.Background() _ = ctx @@ -2521,17 +2518,17 @@ func TestMultitenantAlertmanager_isUserOwned(t *testing.T) { require.NoError(t, err) // We don't start the AM, so the ring is empty. -// When sharding is enabled, isUserOwned will call am.ring.Get, which should fail because the ring is empty. -owned, err := am.isUserOwned("user-1") -require.Error(t, err) -require.Equal(t, ring.ErrEmptyRing, err) -require.False(t, owned) - -// If sharding is disabled, it should return true, nil -am.cfg.ShardingEnabled = false -owned, err = am.isUserOwned("user-1") -require.NoError(t, err) -require.True(t, owned) + // When sharding is enabled, isUserOwned will call am.ring.Get, which should fail because the ring is empty. + owned, err := am.isUserOwned("user-1") + require.Error(t, err) + require.Equal(t, ring.ErrEmptyRing, err) + require.False(t, owned) + + // If sharding is disabled, it should return true, nil + am.cfg.ShardingEnabled = false + owned, err = am.isUserOwned("user-1") + require.NoError(t, err) + require.True(t, owned) } func TestMultitenantAlertmanager_loadAndSyncConfigs_deletesUserFromStore(t *testing.T) { @@ -2549,34 +2546,34 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs_deletesUserFromStore(t *test require.NoError(t, err) // We don't start the AM. We just manually insert state, then call loadAndSyncConfigs. -user1 := "user-1" + user1 := "user-1" -// 1. Create a FullState (remote state) for user1 in the store. -fullState := alertspb.FullStateDesc{ -State: &clusterpb.FullState{}, -} -err = alertStore.SetFullState(ctx, user1, fullState) -require.NoError(t, err) + // 1. Create a FullState (remote state) for user1 in the store. + fullState := alertspb.FullStateDesc{ + State: &clusterpb.FullState{}, + } + err = alertStore.SetFullState(ctx, user1, fullState) + require.NoError(t, err) -// Since we did NOT SetAlertConfig for user1, the store.ListAllUsers will return empty. -allUsers, err := alertStore.ListAllUsers(ctx) -require.NoError(t, err) -require.Empty(t, allUsers) + // Since we did NOT SetAlertConfig for user1, the store.ListAllUsers will return empty. + allUsers, err := alertStore.ListAllUsers(ctx) + require.NoError(t, err) + require.Empty(t, allUsers) -// Verify user1's FullState is written to the store. + // Verify user1's FullState is written to the store. _, err = alertStore.GetFullState(ctx, user1) require.NoError(t, err) - // 2. Call loadAndSyncConfigs. It should fetch all users (which is empty), + // 2. Call loadAndSyncConfigs. It should fetch all users (which is empty), // and since sharding is enabled, it should clean up unused remote state for any users not in the list. // user1 has state but no config, so its state should be pruned. err = am.loadAndSyncConfigs(ctx, reasonPeriodic) require.NoError(t, err) // 3. Verify user1's FullState is deleted from the store. -_, err = alertStore.GetFullState(ctx, user1) -require.Error(t, err) -require.Contains(t, err.Error(), "alertmanager storage object not found") + _, err = alertStore.GetFullState(ctx, user1) + require.Error(t, err) + require.Contains(t, err.Error(), "alertmanager storage object not found") } func TestMultitenantAlertmanager_loadAndSyncConfigs_DoesNotDeleteUserFromStoreWhenRingUnreachable(t *testing.T) { @@ -2604,10 +2601,10 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs_DoesNotDeleteUserFromStoreWh // Since we DO SetAlertConfig for user1, the store.ListAllUsers will return the user. err = alertStore.SetAlertConfig(ctx, alertspb.AlertConfigDesc{ -User: user1, -RawConfig: simpleConfigOne, -Templates: []*alertspb.TemplateDesc{}, -}) + User: user1, + RawConfig: simpleConfigOne, + Templates: []*alertspb.TemplateDesc{}, + }) require.NoError(t, err) allUsers, err := alertStore.ListAllUsers(ctx) @@ -2615,17 +2612,17 @@ Templates: []*alertspb.TemplateDesc{}, require.Len(t, allUsers, 1) // Verify user1's FullState is written to the store. -_, err = alertStore.GetFullState(ctx, user1) -require.NoError(t, err) + _, err = alertStore.GetFullState(ctx, user1) + require.NoError(t, err) -// 2. Call loadAndSyncConfigs. It should fetch all users. -// Since sharding is enabled but ring is empty (unreachable), isUserOwned will fail. -// loadAndSyncConfigs will return early with an error before pruning any states! -err = am.loadAndSyncConfigs(ctx, reasonPeriodic) -require.Error(t, err) -require.Contains(t, err.Error(), ring.ErrEmptyRing.Error()) + // 2. Call loadAndSyncConfigs. It should fetch all users. + // Since sharding is enabled but ring is empty (unreachable), isUserOwned will fail. + // loadAndSyncConfigs will return early with an error before pruning any states! + err = am.loadAndSyncConfigs(ctx, reasonPeriodic) + require.Error(t, err) + require.Contains(t, err.Error(), ring.ErrEmptyRing.Error()) -// 3. Verify user1's FullState is STILL in the store. + // 3. Verify user1's FullState is STILL in the store. _, err = alertStore.GetFullState(ctx, user1) require.NoError(t, err) } From c6d6356c030a1de70007f9d541cc361f9829c350 Mon Sep 17 00:00:00 2001 From: Kishore K G Date: Tue, 24 Mar 2026 16:26:52 +0000 Subject: [PATCH 4/5] fix pr number Signed-off-by: Kishore K G --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1e23fd513..befae9d6d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ * [ENHANCEMENT] Cache: Add per-tenant TTL configuration for query results cache to control cache expiration on a per-tenant basis with separate TTLs for regular and out-of-order data. #7357 * [ENHANCEMENT] Tenant Federation: Add a local cache to regex resolver. #7363 * [ENHANCEMENT] Query Scheduler: Add `cortex_query_scheduler_tracked_requests` metric to track the current number of requests held by the scheduler. #7355 -* [BUGFIX] Alertmanager: Fix disappearing user config and state when ring is temporarily unreachable. #7367 +* [BUGFIX] Alertmanager: Fix disappearing user config and state when ring is temporarily unreachable. #7372 ## 1.21.0 in progress From f2ec6b164071103b859194d5ffd7afc82d0f39c2 Mon Sep 17 00:00:00 2001 From: kishorekg1999 Date: Thu, 26 Mar 2026 09:29:18 +0530 Subject: [PATCH 5/5] use ErrNotFound for error validation in unit test Signed-off-by: kishorekg1999 --- pkg/alertmanager/multitenant_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 506e9d60d5..7cde6689d5 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -2573,7 +2573,7 @@ func TestMultitenantAlertmanager_loadAndSyncConfigs_deletesUserFromStore(t *test // 3. Verify user1's FullState is deleted from the store. _, err = alertStore.GetFullState(ctx, user1) require.Error(t, err) - require.Contains(t, err.Error(), "alertmanager storage object not found") + require.Equal(t, alertspb.ErrNotFound, err) } func TestMultitenantAlertmanager_loadAndSyncConfigs_DoesNotDeleteUserFromStoreWhenRingUnreachable(t *testing.T) {