|
1 | 1 | package ruler |
2 | 2 |
|
3 | 3 | import ( |
| 4 | + "bytes" |
4 | 5 | "context" |
| 6 | + "errors" |
| 7 | + "runtime/pprof" |
5 | 8 | "sync" |
6 | 9 | "testing" |
7 | 10 | "time" |
8 | 11 |
|
9 | 12 | "github.com/go-kit/log" |
10 | 13 | "github.com/prometheus/client_golang/prometheus" |
| 14 | + config_util "github.com/prometheus/common/config" |
| 15 | + promConfig "github.com/prometheus/prometheus/config" |
11 | 16 | "github.com/prometheus/prometheus/model/labels" |
12 | 17 | "github.com/prometheus/prometheus/model/rulefmt" |
13 | 18 | "github.com/prometheus/prometheus/notifier" |
@@ -304,6 +309,198 @@ func TestBackupRules(t *testing.T) { |
304 | 309 | require.Equal(t, userRules[user2], m.GetBackupRules(user2)) |
305 | 310 | } |
306 | 311 |
|
| 312 | +// TestSyncRuleGroupsCleansUpNotifierOnManagerFactoryError is a regression test for |
| 313 | +// https://github.com/cortexproject/cortex/issues/7595. When the manager factory |
| 314 | +// returns an error, newManager has already created and started the per-user |
| 315 | +// notifier (via getOrCreateNotifier -> n.run()) and registered it in r.notifiers. |
| 316 | +// Because the user is never added to r.userManagers, the removal loop in |
| 317 | +// SyncRuleGroups never stops that notifier, so it and its discovery/notification |
| 318 | +// goroutines used to leak until the process exited. |
| 319 | +func TestSyncRuleGroupsCleansUpNotifierOnManagerFactoryError(t *testing.T) { |
| 320 | + dir := t.TempDir() |
| 321 | + const user = "testUser" |
| 322 | + |
| 323 | + factoryErr := errors.New("manager factory failed") |
| 324 | + failingFactory := func(_ context.Context, _ string, _ *notifier.Manager, _ log.Logger, _ *client.Pool, _ prometheus.Registerer) (RulesManager, error) { |
| 325 | + return nil, factoryErr |
| 326 | + } |
| 327 | + |
| 328 | + // Use a dedicated registry (not nil): a nil registry registers the notifier |
| 329 | + // service-discovery metrics on the global default registerer, which can |
| 330 | + // os.Exit(1) on duplicate registration when running alongside other tests. |
| 331 | + m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, &ruleLimits{}, failingFactory, nil, prometheus.NewRegistry(), log.NewNopLogger()) |
| 332 | + require.NoError(t, err) |
| 333 | + t.Cleanup(m.Stop) |
| 334 | + |
| 335 | + // Baseline notifier-run goroutines before the (failing) sync. Assert a delta |
| 336 | + // back to this baseline rather than an absolute zero so the check is robust to |
| 337 | + // any unrelated notifier goroutines from other tests sharing this process. |
| 338 | + before := countNotifierRunGoroutines() |
| 339 | + |
| 340 | + userRules := map[string]rulespb.RuleGroupList{ |
| 341 | + user: { |
| 342 | + &rulespb.RuleGroupDesc{ |
| 343 | + Name: "group1", |
| 344 | + Namespace: "ns", |
| 345 | + Interval: 1 * time.Minute, |
| 346 | + User: user, |
| 347 | + }, |
| 348 | + }, |
| 349 | + } |
| 350 | + m.SyncRuleGroups(context.Background(), userRules) |
| 351 | + |
| 352 | + // The factory failed, so the user must not be tracked as a live manager. |
| 353 | + require.Nil(t, getManager(m, user)) |
| 354 | + |
| 355 | + // The notifier must have been stopped and removed from the map. |
| 356 | + m.notifiersMtx.Lock() |
| 357 | + _, notifierExists := m.notifiers[user] |
| 358 | + m.notifiersMtx.Unlock() |
| 359 | + require.False(t, notifierExists, "notifier must be removed after a managerFactory error") |
| 360 | + |
| 361 | + // The per-user metrics registry must have been removed too. |
| 362 | + require.False(t, hasUserManagerRegistry(t, m, user), "per-user metrics registry must be removed after a managerFactory error") |
| 363 | + |
| 364 | + // Its goroutines (started by rulerNotifier.run) must not leak. removeNotifier |
| 365 | + // -> stop() -> wg.Wait() is synchronous, so they are gone by now; poll back to |
| 366 | + // the baseline to absorb any scheduling latency. |
| 367 | + test.Poll(t, 5*time.Second, before, func() interface{} { |
| 368 | + return countNotifierRunGoroutines() |
| 369 | + }) |
| 370 | +} |
| 371 | + |
| 372 | +// TestGetOrCreateNotifierStopsNotifierOnApplyConfigError is a regression test for |
| 373 | +// the secondary leak path in https://github.com/cortexproject/cortex/issues/7595: |
| 374 | +// getOrCreateNotifier starts the notifier with n.run() before calling |
| 375 | +// n.applyConfig. If applyConfig fails, the notifier was never inserted into |
| 376 | +// r.notifiers, so it must be stopped directly to avoid leaking its goroutines. |
| 377 | +func TestGetOrCreateNotifierStopsNotifierOnApplyConfigError(t *testing.T) { |
| 378 | + const user = "testUser" |
| 379 | + m, err := NewDefaultMultiTenantManager(Config{RulePath: t.TempDir()}, &ruleLimits{}, RuleManagerFactory(nil, nil), nil, prometheus.NewRegistry(), log.NewNopLogger()) |
| 380 | + require.NoError(t, err) |
| 381 | + t.Cleanup(m.Stop) |
| 382 | + |
| 383 | + // Force applyConfig to fail by pointing the Alertmanager TLS config at a CA |
| 384 | + // file that does not exist. |
| 385 | + m.notifierCfg = &promConfig.Config{ |
| 386 | + AlertingConfig: promConfig.AlertingConfig{ |
| 387 | + AlertmanagerConfigs: promConfig.AlertmanagerConfigs{ |
| 388 | + { |
| 389 | + HTTPClientConfig: config_util.HTTPClientConfig{ |
| 390 | + TLSConfig: config_util.TLSConfig{CAFile: "/does/not/exist"}, |
| 391 | + }, |
| 392 | + APIVersion: promConfig.AlertmanagerAPIVersionV2, |
| 393 | + }, |
| 394 | + }, |
| 395 | + }, |
| 396 | + } |
| 397 | + |
| 398 | + before := countNotifierRunGoroutines() |
| 399 | + _, err = m.getOrCreateNotifier(user, prometheus.NewRegistry()) |
| 400 | + require.Error(t, err) |
| 401 | + |
| 402 | + m.notifiersMtx.Lock() |
| 403 | + _, ok := m.notifiers[user] |
| 404 | + m.notifiersMtx.Unlock() |
| 405 | + require.False(t, ok, "notifier must not be registered when applyConfig fails") |
| 406 | + |
| 407 | + test.Poll(t, 5*time.Second, before, func() interface{} { |
| 408 | + return countNotifierRunGoroutines() |
| 409 | + }) |
| 410 | +} |
| 411 | + |
| 412 | +// TestSyncRuleGroupsRecoversAfterManagerFactoryError verifies that a user whose |
| 413 | +// first manager creation failed — and whose notifier was therefore cleaned up by |
| 414 | +// the fix for https://github.com/cortexproject/cortex/issues/7595 — is created |
| 415 | +// normally on a later sync once the factory succeeds, i.e. the failure cleanup |
| 416 | +// does not leave the user in an unrecoverable state. |
| 417 | +func TestSyncRuleGroupsRecoversAfterManagerFactoryError(t *testing.T) { |
| 418 | + dir := t.TempDir() |
| 419 | + const user = "testUser" |
| 420 | + |
| 421 | + fail := atomic.NewBool(true) |
| 422 | + base := RuleManagerFactory([][]*promRules.Group{{}, {}}, []time.Duration{time.Millisecond, time.Millisecond}) |
| 423 | + factory := func(ctx context.Context, userID string, n *notifier.Manager, logger log.Logger, p *client.Pool, reg prometheus.Registerer) (RulesManager, error) { |
| 424 | + if fail.Load() { |
| 425 | + return nil, errors.New("manager factory failed") |
| 426 | + } |
| 427 | + return base(ctx, userID, n, logger, p, reg) |
| 428 | + } |
| 429 | + |
| 430 | + m, err := NewDefaultMultiTenantManager(Config{RulePath: dir}, &ruleLimits{}, factory, nil, prometheus.NewRegistry(), log.NewNopLogger()) |
| 431 | + require.NoError(t, err) |
| 432 | + t.Cleanup(m.Stop) |
| 433 | + |
| 434 | + before := countNotifierRunGoroutines() |
| 435 | + |
| 436 | + userRules := map[string]rulespb.RuleGroupList{ |
| 437 | + user: { |
| 438 | + &rulespb.RuleGroupDesc{Name: "group1", Namespace: "ns", Interval: 1 * time.Minute, User: user}, |
| 439 | + }, |
| 440 | + } |
| 441 | + |
| 442 | + // First sync fails: no manager is tracked and the notifier must be cleaned up. |
| 443 | + m.SyncRuleGroups(context.Background(), userRules) |
| 444 | + require.Nil(t, getManager(m, user)) |
| 445 | + m.notifiersMtx.Lock() |
| 446 | + _, notifierExists := m.notifiers[user] |
| 447 | + m.notifiersMtx.Unlock() |
| 448 | + require.False(t, notifierExists) |
| 449 | + test.Poll(t, 5*time.Second, before, func() interface{} { |
| 450 | + return countNotifierRunGoroutines() |
| 451 | + }) |
| 452 | + |
| 453 | + // Once the factory succeeds, the user is created normally and gets a notifier. |
| 454 | + fail.Store(false) |
| 455 | + m.SyncRuleGroups(context.Background(), userRules) |
| 456 | + require.NotNil(t, getManager(m, user)) |
| 457 | + m.notifiersMtx.Lock() |
| 458 | + _, notifierExists = m.notifiers[user] |
| 459 | + m.notifiersMtx.Unlock() |
| 460 | + require.True(t, notifierExists, "notifier should be created when the user recovers") |
| 461 | +} |
| 462 | + |
// countNotifierRunGoroutines reports how many live goroutines have
// rulerNotifier.run somewhere in their stack dump, i.e. the discovery and
// notification loops that method starts. Tests compare the value before and
// after an operation to prove that a notifier's goroutines were stopped
// instead of leaked.
func countNotifierRunGoroutines() int {
	// The needle matches the run.funcN closure frames as well as the
	// "created by ...run" trailer line.
	needle := []byte("github.com/cortexproject/cortex/pkg/ruler.(*rulerNotifier).run")

	// Dump every goroutine stack at debug level 2. The sink is an in-memory
	// buffer, so the write error is deliberately discarded.
	var dump bytes.Buffer
	_ = pprof.Lookup("goroutine").WriteTo(&dump, 2)

	// The dump is split on blank lines and each resulting block is counted at
	// most once, however many of its lines match; the total therefore stays
	// correct even if the closure frame is inlined away.
	total := 0
	for _, block := range bytes.Split(dump.Bytes(), []byte("\n\n")) {
		if !bytes.Contains(block, needle) {
			continue
		}
		total++
	}
	return total
}
| 480 | + |
| 481 | +// hasUserManagerRegistry reports whether a per-user metrics registry for the |
| 482 | +// given user is still registered with the manager's metrics aggregator. The |
| 483 | +// notifier registers prometheus_notifications_* metrics into that per-user |
| 484 | +// registry, so its presence is observable via the aggregated, user-labelled |
| 485 | +// output. |
| 486 | +func hasUserManagerRegistry(t *testing.T, m *DefaultMultiTenantManager, user string) bool { |
| 487 | + t.Helper() |
| 488 | + tmp := prometheus.NewRegistry() |
| 489 | + tmp.MustRegister(m.userManagerMetrics) |
| 490 | + mfs, err := tmp.Gather() |
| 491 | + require.NoError(t, err) |
| 492 | + for _, mf := range mfs { |
| 493 | + for _, metric := range mf.GetMetric() { |
| 494 | + for _, lp := range metric.GetLabel() { |
| 495 | + if lp.GetName() == "user" && lp.GetValue() == user { |
| 496 | + return true |
| 497 | + } |
| 498 | + } |
| 499 | + } |
| 500 | + } |
| 501 | + return false |
| 502 | +} |
| 503 | + |
307 | 504 | func getManager(m *DefaultMultiTenantManager, user string) RulesManager { |
308 | 505 | m.userManagerMtx.RLock() |
309 | 506 | defer m.userManagerMtx.RUnlock() |
|
0 commit comments