Skip to content

Commit 89ac678

Browse files
committed
test: add e2e tests for UIPlugin incident detection
Add end-to-end tests that validate the monitoring UIPlugin with cluster-health-analyzer: a deployment readiness check and a functional test that triggers a CrashLoopBackOff alert and verifies the cluster_health_components_map incident metric is produced. Also introduce AssertPromQLResultWithOptions to allow callers to override the default poll interval and timeout, and generalize waitForDBUIPluginDeletion to waitForUIPluginDeletion.
1 parent 3a30aca commit 89ac678

3 files changed

Lines changed: 266 additions & 3 deletions

File tree

test/e2e/framework/assertions.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,12 +422,26 @@ func (f *Framework) GetPodMetrics(pod *v1.Pod, opts ...func(*HTTPOptions)) ([]by
422422
// It returns an error if the request fails. Otherwise the result is passed to
423423
// the callback function for additional checks.
424424
func (f *Framework) AssertPromQLResult(t *testing.T, expr string, callback func(model.Value) error) error {
425+
// No options are passed, so AssertPromQLResultWithOptions polls with its
// built-in defaults (20s interval, 3*DefaultTestTimeout overall).
return f.AssertPromQLResultWithOptions(t, expr, callback)
426+
}
427+
428+
// AssertPromQLResultWithOptions is like AssertPromQLResult but accepts
429+
// WithTimeout and WithPollInterval options to override the default polling
430+
// parameters.
431+
func (f *Framework) AssertPromQLResultWithOptions(t *testing.T, expr string, callback func(model.Value) error, fns ...OptionFn) error {
425432
t.Helper()
433+
option := AssertOption{
434+
PollInterval: 20 * time.Second,
435+
WaitTimeout: 3 * DefaultTestTimeout,
436+
}
437+
for _, fn := range fns {
438+
fn(&option)
439+
}
426440
var (
427441
pollErr error
428442
v model.Value
429443
)
430-
if err := wait.PollUntilContextTimeout(context.Background(), 20*time.Second, 3*DefaultTestTimeout, true, func(context.Context) (bool, error) {
444+
if err := wait.PollUntilContextTimeout(context.Background(), option.PollInterval, option.WaitTimeout, true, func(context.Context) (bool, error) {
431445
v, pollErr = f.getPromQLResult(context.Background(), expr)
432446
if pollErr != nil {
433447
t.Logf("error from getPromQLResult(): %s", pollErr)
Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
package e2e
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"strconv"
7+
"testing"
8+
"time"
9+
10+
"github.com/prometheus/common/model"
11+
monv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
12+
"gotest.tools/v3/assert"
13+
appsv1 "k8s.io/api/apps/v1"
14+
corev1 "k8s.io/api/core/v1"
15+
"k8s.io/apimachinery/pkg/api/errors"
16+
"k8s.io/apimachinery/pkg/api/resource"
17+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
18+
"k8s.io/apimachinery/pkg/util/intstr"
19+
"k8s.io/apimachinery/pkg/util/wait"
20+
"k8s.io/utils/ptr"
21+
"sigs.k8s.io/controller-runtime/pkg/client"
22+
23+
uiv1 "github.com/rhobs/observability-operator/pkg/apis/uiplugin/v1alpha1"
24+
"github.com/rhobs/observability-operator/test/e2e/framework"
25+
)
26+
27+
const (
28+
healthAnalyzerDeploymentName = "health-analyzer"
29+
prometheusRuleNamespace = "openshift-monitoring"
30+
)
31+
32+
func monitoringUIPluginDeploy(t *testing.T) {
33+
plugin := newMonitoringUIPlugin(t)
34+
err := f.K8sClient.Create(context.Background(), plugin)
35+
assert.NilError(t, err, "failed to create monitoring UIPlugin")
36+
37+
t.Log("Waiting for health-analyzer deployment to become ready...")
38+
haDeployment := appsv1.Deployment{}
39+
f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment)
40+
f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t)
41+
}
42+
43+
func incidentDetectionFunctional(t *testing.T) {
44+
monv1.AddToScheme(f.K8sClient.Scheme())
45+
46+
plugin := newMonitoringUIPlugin(t)
47+
err := f.K8sClient.Create(context.Background(), plugin)
48+
assert.NilError(t, err, "failed to create monitoring UIPlugin")
49+
50+
t.Log("Waiting for health-analyzer deployment to become ready...")
51+
haDeployment := appsv1.Deployment{}
52+
f.GetResourceWithRetry(t, healthAnalyzerDeploymentName, uiPluginInstallNS, &haDeployment)
53+
f.AssertDeploymentReady(healthAnalyzerDeploymentName, uiPluginInstallNS, framework.WithTimeout(5*time.Minute))(t)
54+
55+
suffix := strconv.FormatInt(time.Now().UnixNano()%100000, 10)
56+
ruleName := "e2e-crashloop-" + suffix
57+
alertName := "E2ECrashLoop" + suffix
58+
deployName := "e2e-crasher-" + suffix
59+
60+
rule := newCrashLoopRule(t, ruleName, alertName, deployName)
61+
err = f.K8sClient.Create(context.Background(), rule)
62+
assert.NilError(t, err, "failed to create PrometheusRule")
63+
64+
dep := newCrashingDeployment(t, deployName)
65+
err = f.K8sClient.Create(context.Background(), dep)
66+
assert.NilError(t, err, "failed to create crashing deployment")
67+
68+
t.Log("Waiting for pod to enter CrashLoopBackOff...")
69+
assertPodCrashLooping(t, deployName, e2eTestNamespace, 10*time.Second, 3*time.Minute)
70+
71+
t.Log("Waiting for alert to fire in Prometheus...")
72+
alertQuery := fmt.Sprintf(`ALERTS{alertname="%s",alertstate="firing"}`, alertName)
73+
err = f.AssertPromQLResultWithOptions(t, alertQuery,
74+
func(v model.Value) error {
75+
vec, ok := v.(model.Vector)
76+
if !ok || len(vec) == 0 {
77+
return fmt.Errorf("expected firing alert, got: %v", v)
78+
}
79+
return nil
80+
},
81+
framework.WithPollInterval(30*time.Second),
82+
framework.WithTimeout(10*time.Minute),
83+
)
84+
assert.NilError(t, err, "alert %s never fired", alertName)
85+
86+
t.Log("Waiting for cluster-health-analyzer to expose incident metric...")
87+
incidentQuery := fmt.Sprintf(`cluster_health_components_map{src_alertname="%s"}`, alertName)
88+
err = f.AssertPromQLResultWithOptions(t, incidentQuery,
89+
func(v model.Value) error {
90+
vec, ok := v.(model.Vector)
91+
if !ok || len(vec) == 0 {
92+
return fmt.Errorf("expected incident metric, got: %v", v)
93+
}
94+
for _, sample := range vec {
95+
if string(sample.Metric["src_alertname"]) != alertName {
96+
return fmt.Errorf("expected src_alertname=%s, got %s", alertName, sample.Metric["src_alertname"])
97+
}
98+
if string(sample.Metric["src_severity"]) != "warning" {
99+
return fmt.Errorf("expected src_severity=warning, got %s", sample.Metric["src_severity"])
100+
}
101+
}
102+
return nil
103+
},
104+
framework.WithPollInterval(30*time.Second),
105+
framework.WithTimeout(15*time.Minute),
106+
)
107+
assert.NilError(t, err, "incident metric for %s never appeared", alertName)
108+
}
109+
110+
func newMonitoringUIPlugin(t *testing.T) *uiv1.UIPlugin {
111+
plugin := &uiv1.UIPlugin{
112+
ObjectMeta: metav1.ObjectMeta{
113+
Name: "monitoring",
114+
},
115+
Spec: uiv1.UIPluginSpec{
116+
Type: uiv1.TypeMonitoring,
117+
Monitoring: &uiv1.MonitoringConfig{
118+
ClusterHealthAnalyzer: &uiv1.ClusterHealthAnalyzerReference{
119+
Enabled: true,
120+
},
121+
},
122+
},
123+
}
124+
125+
existing := &uiv1.UIPlugin{}
126+
err := f.K8sClient.Get(context.Background(), client.ObjectKey{Name: plugin.Name}, existing)
127+
if err == nil {
128+
t.Log("UIPlugin 'monitoring' already exists, deleting before recreation...")
129+
f.K8sClient.Delete(context.Background(), existing)
130+
waitForUIPluginDeletion(existing)
131+
} else if !errors.IsNotFound(err) {
132+
t.Fatalf("failed to check for existing UIPlugin: %v", err)
133+
}
134+
135+
f.CleanUp(t, func() {
136+
f.K8sClient.Delete(context.Background(), plugin)
137+
waitForUIPluginDeletion(plugin)
138+
})
139+
return plugin
140+
}
141+
142+
func newCrashLoopRule(t *testing.T, ruleName, alertName, podPrefix string) *monv1.PrometheusRule {
143+
rule := &monv1.PrometheusRule{
144+
ObjectMeta: metav1.ObjectMeta{
145+
Name: ruleName,
146+
Namespace: prometheusRuleNamespace,
147+
Labels: map[string]string{
148+
"app.kubernetes.io/name": "kube-prometheus",
149+
"app.kubernetes.io/part-of": "openshift-monitoring",
150+
"prometheus": "k8s",
151+
"role": "alert-rules",
152+
},
153+
},
154+
Spec: monv1.PrometheusRuleSpec{
155+
Groups: []monv1.RuleGroup{{
156+
Name: "crashloop-test-" + ruleName,
157+
Rules: []monv1.Rule{{
158+
Alert: alertName,
159+
Expr: intstr.FromString(fmt.Sprintf(
160+
`max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", namespace="%s", pod=~"%s.*", job="kube-state-metrics"}[5m]) >= 1`,
161+
e2eTestNamespace, podPrefix)),
162+
For: ptr.To(monv1.Duration("1m")),
163+
Labels: map[string]string{"severity": "warning"},
164+
Annotations: map[string]string{
165+
"summary": "Pod is crash looping.",
166+
},
167+
}},
168+
}},
169+
},
170+
}
171+
f.CleanUp(t, func() {
172+
f.K8sClient.Delete(context.Background(), rule)
173+
})
174+
return rule
175+
}
176+
177+
func newCrashingDeployment(t *testing.T, name string) *appsv1.Deployment {
178+
dep := &appsv1.Deployment{
179+
ObjectMeta: metav1.ObjectMeta{
180+
Name: name,
181+
Namespace: e2eTestNamespace,
182+
Labels: map[string]string{"app": name},
183+
},
184+
Spec: appsv1.DeploymentSpec{
185+
Replicas: ptr.To(int32(1)),
186+
Selector: &metav1.LabelSelector{
187+
MatchLabels: map[string]string{"app": name},
188+
},
189+
Template: corev1.PodTemplateSpec{
190+
ObjectMeta: metav1.ObjectMeta{
191+
Labels: map[string]string{"app": name},
192+
},
193+
Spec: corev1.PodSpec{
194+
Containers: []corev1.Container{{
195+
Name: "crasher",
196+
Image: "registry.access.redhat.com/ubi9-minimal:latest",
197+
Command: []string{"sh", "-c", "exit 1"},
198+
Resources: corev1.ResourceRequirements{
199+
Requests: corev1.ResourceList{
200+
corev1.ResourceCPU: resource.MustParse("1m"),
201+
corev1.ResourceMemory: resource.MustParse("4Mi"),
202+
},
203+
Limits: corev1.ResourceList{
204+
corev1.ResourceCPU: resource.MustParse("10m"),
205+
corev1.ResourceMemory: resource.MustParse("16Mi"),
206+
},
207+
},
208+
}},
209+
},
210+
},
211+
},
212+
}
213+
f.CleanUp(t, func() {
214+
f.K8sClient.Delete(context.Background(), dep)
215+
})
216+
return dep
217+
}
218+
219+
func assertPodCrashLooping(t *testing.T, deploymentName, namespace string, pollInterval, timeout time.Duration) {
220+
t.Helper()
221+
err := wait.PollUntilContextTimeout(context.Background(), pollInterval, timeout, true, func(ctx context.Context) (bool, error) {
222+
var pods corev1.PodList
223+
if err := f.K8sClient.List(ctx, &pods,
224+
client.InNamespace(namespace),
225+
client.MatchingLabels{"app": deploymentName},
226+
); err != nil {
227+
return false, nil
228+
}
229+
for i := range pods.Items {
230+
for _, cs := range pods.Items[i].Status.ContainerStatuses {
231+
if cs.State.Waiting != nil && cs.State.Waiting.Reason == "CrashLoopBackOff" {
232+
return true, nil
233+
}
234+
}
235+
}
236+
return false, nil
237+
})
238+
if err != nil {
239+
t.Fatalf("pod with label app=%s in %s never entered CrashLoopBackOff: %v", deploymentName, namespace, err)
240+
}
241+
}

test/e2e/uiplugin_test.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@ func TestUIPlugin(t *testing.T) {
3434
name: "Create dashboards UIPlugin",
3535
scenario: dashboardsUIPlugin,
3636
},
37+
{
38+
name: "Deploy health-analyzer via Monitoring UIPlugin",
39+
scenario: monitoringUIPluginDeploy,
40+
},
41+
{
42+
name: "Incident detection functional validation",
43+
scenario: incidentDetectionFunctional,
44+
},
3745
}
3846

3947
for _, tc := range ts {
@@ -63,13 +71,13 @@ func newDashboardsUIPlugin(t *testing.T) *uiv1.UIPlugin {
6371
}
6472
f.CleanUp(t, func() {
6573
f.K8sClient.Delete(context.Background(), db)
66-
waitForDBUIPluginDeletion(db)
74+
waitForUIPluginDeletion(db)
6775
})
6876

6977
return db
7078
}
7179

72-
func waitForDBUIPluginDeletion(db *uiv1.UIPlugin) error {
80+
func waitForUIPluginDeletion(db *uiv1.UIPlugin) error {
7381
return wait.PollUntilContextTimeout(context.Background(), 5*time.Second, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
7482
err = f.K8sClient.Get(context.Background(),
7583
client.ObjectKey{Name: db.Name},

0 commit comments

Comments
 (0)