test: add e2e tests for UIPlugin incident detection

DavidRajnoha · DavidRajnoha · commit 89ac6786e2c2 · 2026-03-16T10:37:49.000+01:00
Add end-to-end tests that validate the monitoring UIPlugin with
cluster-health-analyzer: a deployment readiness check and a functional
test that triggers a CrashLoopBackOff alert and verifies the
cluster_health_components_map incident metric is produced.

Also introduce AssertPromQLResultWithOptions to allow callers to
override the default poll interval and timeout, and generalize
waitForDBUIPluginDeletion to waitForUIPluginDeletion.
diff --git a/test/e2e/framework/assertions.go b/test/e2e/framework/assertions.go
@@ -422,12 +422,26 @@ func (f *Framework) GetPodMetrics(pod *v1.Pod, opts ...func(*HTTPOptions)) ([]by
 // It returns an error if the request fails. Otherwise the result is passed to
 // the callback function for additional checks.
 func (f *Framework) AssertPromQLResult(t *testing.T, expr string, callback func(model.Value) error) error {
+	return f.AssertPromQLResultWithOptions(t, expr, callback) // no options: 20s poll interval, 3*DefaultTestTimeout timeout
+}
+
+// AssertPromQLResultWithOptions is like AssertPromQLResult but accepts
+// WithTimeout and WithPollInterval options to override the default polling
+// parameters.
+func (f *Framework) AssertPromQLResultWithOptions(t *testing.T, expr string, callback func(model.Value) error, fns ...OptionFn) error {
 	t.Helper()
+	option := AssertOption{
+		PollInterval: 20 * time.Second,
+		WaitTimeout:  3 * DefaultTestTimeout,
+	}
+	for _, fn := range fns {
+		fn(&option)
+	}
 	var (
 		pollErr error
 		v       model.Value
 	)
-	if err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, 3*DefaultTestTimeout, true, func(context.Context) (bool, error) {
+	if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(context.Context) (bool, error) {
 		v, pollErr = f.getPromQLResult(context.Background(), expr)
 		if pollErr != nil {
 			t.Logf("error from getPromQLResult(): %s", pollErr)
diff --git a/test/e2e/uiplugin_incident_detection_test.go b/test/e2e/uiplugin_incident_detection_test.go
@@ -0,0 +1,241 @@
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"strconv"
+	"testing"
+	"time"
+
+	"github.com/prometheus/common/model"
+	monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+	"gotest.tools/v3/assert"
+	appsv1 "k8s.io/api/apps/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/utils/ptr"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	uiv1 "github.com/rhobs/observability-operator/pkg/apis/uiplugin/v1alpha1"
+	"github.com/rhobs/observability-operator/test/e2e/framework"
+)
+
+const (
+	healthAnalyzerDeploymentName = "health-analyzer"      // deployment awaited after the monitoring UIPlugin is created
+	prometheusRuleNamespace      = "openshift-monitoring" // namespace set on the test PrometheusRule
+)
+
+func monitoringUIPluginDeploy(t *testing.T) {
+	plugin := newMonitoringUIPlugin(t)
+	err := f.K8sClient.Create(context.Background(), plugin)
+	assert.NilError(t, err, "failed to create monitoring UIPlugin")
+
+	t.Log("Waiting for health-analyzer deployment to become ready...")
+	haDeployment := appsv1.Deployment{}
+	f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment)
+	f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t)
+}
+
+// incidentDetectionFunctional checks incident detection end to end: it deploys
+// the monitoring UIPlugin with the cluster health analyzer enabled, creates a
+// PrometheusRule and a deployment whose container always exits 1, then waits
+// for the alert to fire and for cluster_health_components_map to expose a
+// series for that alert with src_severity=warning.
+func incidentDetectionFunctional(t *testing.T) {
+	// Register the monitoring/v1 types on the client scheme before creating the
+	// PrometheusRule; a registration error fails the test instead of being dropped.
+	err := monv1.AddToScheme(f.K8sClient.Scheme())
+	assert.NilError(t, err, "failed to add monitoring/v1 types to the client scheme")
+
+	plugin := newMonitoringUIPlugin(t)
+	err = f.K8sClient.Create(context.Background(), plugin)
+	assert.NilError(t, err, "failed to create monitoring UIPlugin")
+
+	t.Log("Waiting for health-analyzer deployment to become ready...")
+	haDeployment := appsv1.Deployment{}
+	f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment)
+	f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t)
+
+	suffix := strconv.FormatInt(time.Now().UnixNano()%100000, 10)
+	ruleName := "e2e-crashloop-" + suffix
+	alertName := "E2ECrashLoop" + suffix
+	deployName := "e2e-crasher-" + suffix
+
+	rule := newCrashLoopRule(t, ruleName, alertName, deployName)
+	err = f.K8sClient.Create(context.Background(), rule)
+	assert.NilError(t, err, "failed to create PrometheusRule")
+
+	dep := newCrashingDeployment(t, deployName)
+	err = f.K8sClient.Create(context.Background(), dep)
+	assert.NilError(t, err, "failed to create crashing deployment")
+
+	t.Log("Waiting for pod to enter CrashLoopBackOff...")
+	assertPodCrashLooping(t, deployName, e2eTestNamespace, 10*time.Second, 3*time.Minute)
+
+	t.Log("Waiting for alert to fire in Prometheus...")
+	alertQuery := fmt.Sprintf(`ALERTS{alertname="%s",alertstate="firing"}`, alertName)
+	err = f.AssertPromQLResultWithOptions(t, alertQuery, func(v model.Value) error {
+		vec, ok := v.(model.Vector)
+		if !ok || len(vec) == 0 {
+			return fmt.Errorf("expected firing alert, got: %v", v)
+		}
+		return nil
+	}, framework.WithPollInterval(30*time.Second), framework.WithTimeout(10*time.Minute))
+	assert.NilError(t, err, "alert %s never fired", alertName)
+
+	t.Log("Waiting for cluster-health-analyzer to expose incident metric...")
+	incidentQuery := fmt.Sprintf(`cluster_health_components_map{src_alertname="%s"}`, alertName)
+	err = f.AssertPromQLResultWithOptions(t, incidentQuery, func(v model.Value) error {
+		vec, ok := v.(model.Vector)
+		if !ok || len(vec) == 0 {
+			return fmt.Errorf("expected incident metric, got: %v", v)
+		}
+		for _, sample := range vec {
+			if string(sample.Metric["src_alertname"]) != alertName {
+				return fmt.Errorf("expected src_alertname=%s, got %s", alertName, sample.Metric["src_alertname"])
+			}
+			if string(sample.Metric["src_severity"]) != "warning" {
+				return fmt.Errorf("expected src_severity=warning, got %s", sample.Metric["src_severity"])
+			}
+		}
+		return nil
+	}, framework.WithPollInterval(30*time.Second), framework.WithTimeout(15*time.Minute))
+	assert.NilError(t, err, "incident metric for %s never appeared", alertName)
+}
+
+// newMonitoringUIPlugin builds (without creating) the "monitoring" UIPlugin with
+// the cluster health analyzer enabled. A leftover plugin of that name is deleted
+// first, and cleanup that deletes the returned plugin is registered on t.
+func newMonitoringUIPlugin(t *testing.T) *uiv1.UIPlugin {
+	plugin := &uiv1.UIPlugin{
+		ObjectMeta: metav1.ObjectMeta{Name: "monitoring"},
+		Spec: uiv1.UIPluginSpec{
+			Type: uiv1.TypeMonitoring,
+			Monitoring: &uiv1.MonitoringConfig{
+				ClusterHealthAnalyzer: &uiv1.ClusterHealthAnalyzerReference{Enabled: true},
+			},
+		},
+	}
+
+	existing := &uiv1.UIPlugin{}
+	err := f.K8sClient.Get(context.Background(), client.ObjectKey{Name: plugin.Name}, existing)
+	if err == nil {
+		t.Log("UIPlugin 'monitoring' already exists, deleting before recreation...")
+		// Fail here if the old plugin cannot be removed; the caller's Create would fail anyway.
+		assert.NilError(t, client.IgnoreNotFound(f.K8sClient.Delete(context.Background(), existing)), "failed to delete existing UIPlugin")
+		assert.NilError(t, waitForUIPluginDeletion(existing), "existing UIPlugin was not deleted")
+	} else if !errors.IsNotFound(err) {
+		t.Fatalf("failed to check for existing UIPlugin: %v", err)
+	}
+
+	f.CleanUp(t, func() {
+		f.K8sClient.Delete(context.Background(), plugin)
+		waitForUIPluginDeletion(plugin)
+	})
+	return plugin
+}
+
+// newCrashLoopRule builds (without creating) a PrometheusRule in
+// prometheusRuleNamespace with a single alert (severity warning, for 1m) on
+// CrashLoopBackOff waiting containers of pods in e2eTestNamespace whose name
+// matches podPrefix. Deletion of the rule is registered as test cleanup.
+func newCrashLoopRule(t *testing.T, ruleName, alertName, podPrefix string) *monv1.PrometheusRule {
+	expr := fmt.Sprintf(
+		`max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace="%s", pod=~"%s.*", job="kube-state-metrics"}[5m]) >= 1`,
+		e2eTestNamespace, podPrefix)
+	crashLoopAlert := monv1.Rule{
+		Alert:       alertName,
+		Expr:        intstr.FromString(expr),
+		For:         ptr.To(monv1.Duration("1m")),
+		Labels:      map[string]string{"severity": "warning"},
+		Annotations: map[string]string{"summary": "Pod is crash looping."},
+	}
+	rule := &monv1.PrometheusRule{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      ruleName,
+			Namespace: prometheusRuleNamespace,
+			Labels: map[string]string{
+				"app.kubernetes.io/name":    "kube-prometheus",
+				"app.kubernetes.io/part-of": "openshift-monitoring",
+				"prometheus":                "k8s",
+				"role":                      "alert-rules",
+			},
+		},
+		Spec: monv1.PrometheusRuleSpec{Groups: []monv1.RuleGroup{{
+			Name:  "crashloop-test-" + ruleName,
+			Rules: []monv1.Rule{crashLoopAlert},
+		}}},
+	}
+	f.CleanUp(t, func() { f.K8sClient.Delete(context.Background(), rule) })
+	return rule
+}
+
+// newCrashingDeployment builds (without creating) a single-replica deployment
+// in e2eTestNamespace whose only container runs `sh -c "exit 1"`. The
+// deployment, its selector and its pod template all use the label app=<name>.
+// Deletion of the deployment is registered as test cleanup.
+func newCrashingDeployment(t *testing.T, name string) *appsv1.Deployment {
+	// The command exits with status 1 right away, so the container fails on every start.
+	crasher := corev1.Container{
+		Name:    "crasher",
+		Image:   "registry.access.redhat.com/ubi9-minimal:latest",
+		Command: []string{"sh", "-c", "exit 1"},
+		Resources: corev1.ResourceRequirements{
+			Requests: corev1.ResourceList{
+				corev1.ResourceCPU:    resource.MustParse("1m"),
+				corev1.ResourceMemory: resource.MustParse("4Mi"),
+			},
+			Limits: corev1.ResourceList{
+				corev1.ResourceCPU:    resource.MustParse("10m"),
+				corev1.ResourceMemory: resource.MustParse("16Mi"),
+			},
+		},
+	}
+
+	dep := &appsv1.Deployment{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      name,
+			Namespace: e2eTestNamespace,
+			Labels:    map[string]string{"app": name},
+		},
+		Spec: appsv1.DeploymentSpec{
+			Replicas: ptr.To(int32(1)),
+			Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": name}},
+			Template: corev1.PodTemplateSpec{
+				ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": name}},
+				Spec:       corev1.PodSpec{Containers: []corev1.Container{crasher}},
+			},
+		},
+	}
+
+	f.CleanUp(t, func() { f.K8sClient.Delete(context.Background(), dep) })
+	return dep
+}
+
+// assertPodCrashLooping polls until a pod labelled app=<deploymentName> in namespace has a
+// container waiting in CrashLoopBackOff; List errors are logged and retried. Fatal on timeout.
+func assertPodCrashLooping(t *testing.T, deploymentName, namespace string, pollInterval, timeout time.Duration) {
+	t.Helper()
+	err := wait.PollUntilContextTimeout(context.Background(), pollInterval, timeout, true, func(ctx context.Context) (bool, error) {
+		var pods corev1.PodList
+		if err := f.K8sClient.List(ctx, &pods, client.InNamespace(namespace), client.MatchingLabels{"app": deploymentName}); err != nil {
+			t.Logf("error listing pods with label app=%s in %s: %v", deploymentName, namespace, err)
+			return false, nil
+		}
+		for i := range pods.Items {
+			for _, cs := range pods.Items[i].Status.ContainerStatuses {
+				if cs.State.Waiting != nil && cs.State.Waiting.Reason == "CrashLoopBackOff" {
+					return true, nil
+				}
+			}
+		}
+		return false, nil
+	})
+	if err != nil {
+		t.Fatalf("pod with label app=%s in %s never entered CrashLoopBackOff: %v", deploymentName, namespace, err)
+	}
+}
diff --git a/test/e2e/uiplugin_test.go b/test/e2e/uiplugin_test.go
@@ -34,6 +34,14 @@ func TestUIPlugin(t *testing.T) {
 			name:     "Create dashboards UIPlugin",
 			scenario: dashboardsUIPlugin,
 		},
+		{
+			name:     "Deploy health-analyzer via Monitoring UIPlugin",
+			scenario: monitoringUIPluginDeploy,
+		},
+		{
+			name:     "Incident detection functional validation",
+			scenario: incidentDetectionFunctional,
+		},
 	}
 
 	for _, tc := range ts {
@@ -63,13 +71,13 @@ func newDashboardsUIPlugin(t *testing.T) *uiv1.UIPlugin {
 	}
 	f.CleanUp(t, func() {
 		f.K8sClient.Delete(context.Background(), db)
-		waitForDBUIPluginDeletion(db)
+		waitForUIPluginDeletion(db)
 	})
 
 	return db
 }
 
-func waitForDBUIPluginDeletion(db *uiv1.UIPlugin) error {
+func waitForUIPluginDeletion(db *uiv1.UIPlugin) error {
 	return wait.PollUntilContextTimeout(context.Background(), 5*time.Second, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
 		err = f.K8sClient.Get(context.Background(),
 			client.ObjectKey{Name: db.Name},