Skip to content

Commit a73306d

Browse files
ArvindthiruArvind Thirumurugan
andauthored
fix: panic for incomplete metric in eviction controller (#1083)
* test: add tests for eviction complete metric * add UT for failed reconcile, emit incomplete metric * fix test name * remove new line * lint fix * delete the incomplete eviction metric * minor changes * remove table from UT * move complete metric UT -> IT * emit incomplete metric in IT to ensure it gets deleted * add comment * address comments * remove Ordered marker * use DeletePartial * address comment --------- Co-authored-by: Arvind Thirumurugan <arvindth@microsoft.com>
1 parent 59896b5 commit a73306d

3 files changed

Lines changed: 158 additions & 7 deletions

File tree

pkg/controllers/clusterresourceplacementeviction/controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req runtime.Request) (runtim
4646
var internalError bool
4747
defer func() {
4848
if internalError {
49-
metrics.FleetEvictionStatus.WithLabelValues(evictionName, "false").SetToCurrentTime()
49+
metrics.FleetEvictionStatus.WithLabelValues(evictionName, "false", "unknown").SetToCurrentTime()
5050
}
5151
latency := time.Since(startTime).Milliseconds()
5252
klog.V(2).InfoS("ClusterResourcePlacementEviction reconciliation ends", "clusterResourcePlacementEviction", evictionName, "latency", latency)
@@ -365,7 +365,7 @@ func markEvictionNotExecuted(eviction *placementv1beta1.ClusterResourcePlacement
365365
}
366366

367367
func emitEvictionCompleteMetric(eviction *placementv1beta1.ClusterResourcePlacementEviction) {
368-
metrics.FleetEvictionStatus.Delete(prometheus.Labels{"name": eviction.Name, "isCompleted": "false"})
368+
metrics.FleetEvictionStatus.DeletePartialMatch(prometheus.Labels{"name": eviction.GetName(), "isCompleted": "false"})
369369
// check to see if eviction is valid.
370370
if condition.IsConditionStatusTrue(eviction.GetCondition(string(placementv1beta1.PlacementEvictionConditionTypeValid)), eviction.GetGeneration()) {
371371
metrics.FleetEvictionStatus.WithLabelValues(eviction.Name, "true", "true").SetToCurrentTime()

pkg/controllers/clusterresourceplacementeviction/controller_intergration_test.go

Lines changed: 78 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111

1212
. "github.com/onsi/ginkgo/v2"
1313
. "github.com/onsi/gomega"
14+
"github.com/prometheus/client_golang/prometheus"
15+
prometheusclientmodel "github.com/prometheus/client_model/go"
1416
k8serrors "k8s.io/apimachinery/pkg/api/errors"
1517
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1618
"k8s.io/apimachinery/pkg/labels"
@@ -21,6 +23,7 @@ import (
2123

2224
placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
2325
"go.goms.io/fleet/pkg/utils/condition"
26+
"go.goms.io/fleet/pkg/utils/controller/metrics"
2427
testutilseviction "go.goms.io/fleet/test/utils/eviction"
2528
)
2629

@@ -41,15 +44,46 @@ const (
4144
var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
4245
crpName := fmt.Sprintf(crpNameTemplate, GinkgoParallelProcess())
4346
evictionName := fmt.Sprintf(evictionNameTemplate, GinkgoParallelProcess())
47+
var customRegistry *prometheus.Registry
48+
49+
BeforeEach(func() {
50+
// Create a test registry
51+
customRegistry = prometheus.NewRegistry()
52+
Expect(customRegistry.Register(metrics.FleetEvictionStatus)).Should(Succeed())
53+
// Reset metrics before each test
54+
metrics.FleetEvictionStatus.Reset()
55+
// emit incomplete eviction metric to simulate eviction failed once.
56+
metrics.FleetEvictionStatus.WithLabelValues(evictionName, "false", "unknown").SetToCurrentTime()
57+
})
4458

4559
AfterEach(func() {
4660
ensureCRPDBRemoved(crpName)
4761
ensureAllBindingsAreRemoved(crpName)
4862
ensureEvictionRemoved(evictionName)
4963
ensureCRPRemoved(crpName)
64+
Expect(customRegistry.Unregister(metrics.FleetEvictionStatus)).Should(BeTrue())
5065
})
5166

52-
It("Eviction Blocked - ClusterResourcePlacementDisruptionBudget's maxUnavailable blocks eviction", func() {
67+
It("Invalid Eviction Blocked - emit complete metric with isValid=false, isComplete=true", func() {
68+
By("Create ClusterResourcePlacementEviction", func() {
69+
eviction := buildTestEviction(evictionName, "random-crp", "test-cluster")
70+
Expect(k8sClient.Create(ctx, eviction)).Should(Succeed())
71+
})
72+
73+
By("Check eviction status", func() {
74+
evictionStatusUpdatedActual := testutilseviction.StatusUpdatedActual(
75+
ctx, k8sClient, evictionName,
76+
&testutilseviction.IsValidEviction{IsValid: false, Msg: condition.EvictionInvalidMissingCRPMessage},
77+
nil)
78+
Eventually(evictionStatusUpdatedActual, eventuallyDuration, eventuallyInterval).Should(Succeed())
79+
})
80+
81+
By("Ensure eviction complete metric was emitted", func() {
82+
checkEvictionCompleteMetric(customRegistry, "false", "true")
83+
})
84+
})
85+
86+
It("Eviction Blocked - ClusterResourcePlacementDisruptionBudget's maxUnavailable blocks eviction, emit complete isValid=true, isComplete=true", func() {
5387
crbName := fmt.Sprintf(crbNameTemplate, GinkgoParallelProcess())
5488

5589
By("Create ClusterResourcePlacement", func() {
@@ -122,9 +156,13 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
122156
return !k8serrors.IsNotFound(k8sClient.Get(ctx, types.NamespacedName{Name: crbName}, &crb))
123157
}, consistentlyDuration, consistentlyInterval).Should(BeTrue())
124158
})
159+
160+
By("Ensure eviction complete metric was emitted", func() {
161+
checkEvictionCompleteMetric(customRegistry, "true", "true")
162+
})
125163
})
126164

127-
It("Eviction Allowed - ClusterResourcePlacementDisruptionBudget's maxUnavailable allows eviction", func() {
165+
It("Eviction Allowed - ClusterResourcePlacementDisruptionBudget's maxUnavailable allows eviction, emit complete isValid=true, isComplete=true", func() {
128166
crbName := fmt.Sprintf(crbNameTemplate, GinkgoParallelProcess())
129167

130168
By("Create ClusterResourcePlacement", func() {
@@ -218,9 +256,13 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
218256
return k8serrors.IsNotFound(k8sClient.Get(ctx, types.NamespacedName{Name: crbName}, &crb))
219257
}, consistentlyDuration, consistentlyInterval).Should(BeTrue())
220258
})
259+
260+
By("Ensure eviction complete metric was emitted", func() {
261+
checkEvictionCompleteMetric(customRegistry, "true", "true")
262+
})
221263
})
222264

223-
It("Eviction Blocked - ClusterResourcePlacementDisruptionBudget's minAvailable blocks eviction", func() {
265+
It("Eviction Blocked - ClusterResourcePlacementDisruptionBudget's minAvailable blocks eviction, emit complete isValid=true, isComplete=true", func() {
224266
crbName := fmt.Sprintf(crbNameTemplate, GinkgoParallelProcess())
225267

226268
By("Create ClusterResourcePlacement", func() {
@@ -293,9 +335,13 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
293335
return !k8serrors.IsNotFound(k8sClient.Get(ctx, types.NamespacedName{Name: crbName}, &crb))
294336
}, consistentlyDuration, consistentlyInterval).Should(BeTrue())
295337
})
338+
339+
By("Ensure eviction complete metric was emitted", func() {
340+
checkEvictionCompleteMetric(customRegistry, "true", "true")
341+
})
296342
})
297343

298-
It("Eviction Allowed - ClusterResourcePlacementDisruptionBudget's minUnavailable allows eviction", func() {
344+
It("Eviction Allowed - ClusterResourcePlacementDisruptionBudget's minUnavailable allows eviction, emit complete isValid=true, isComplete=true", func() {
299345
crbName := fmt.Sprintf(crbNameTemplate, GinkgoParallelProcess())
300346
anotherCRBName := fmt.Sprintf(anotherCRBNameTemplate, GinkgoParallelProcess())
301347

@@ -419,6 +465,10 @@ var _ = Describe("Test ClusterResourcePlacementEviction Controller", func() {
419465
return !k8serrors.IsNotFound(k8sClient.Get(ctx, types.NamespacedName{Name: anotherCRBName}, &anotherCRB))
420466
}, consistentlyDuration, consistentlyInterval).Should(BeTrue())
421467
})
468+
469+
By("Ensure eviction complete metric was emitted", func() {
470+
checkEvictionCompleteMetric(customRegistry, "true", "true")
471+
})
422472
})
423473
})
424474

@@ -464,7 +514,7 @@ func ensureEvictionRemoved(name string) {
464514
Name: name,
465515
},
466516
}
467-
Expect(k8sClient.Delete(ctx, &eviction)).Should(Succeed())
517+
Expect(client.IgnoreNotFound(k8sClient.Delete(ctx, &eviction))).Should(Succeed())
468518
// Ensure eviction doesn't exist.
469519
Eventually(func() error {
470520
if err := k8sClient.Get(ctx, types.NamespacedName{Name: name}, &eviction); !k8serrors.IsNotFound(err) {
@@ -536,3 +586,26 @@ func ensureAllBindingsAreRemoved(crpName string) {
536586
ensureCRBRemoved(bindingList.Items[i].Name)
537587
}
538588
}
589+
590+
func checkEvictionCompleteMetric(registry *prometheus.Registry, isValid, isComplete string) {
591+
metricFamilies, err := registry.Gather()
592+
Expect(err).Should(Succeed())
593+
var evictionCompleteMetrics []*prometheusclientmodel.Metric
594+
for _, mf := range metricFamilies {
595+
if mf.GetName() == "fleet_workload_eviction_complete" {
596+
evictionCompleteMetrics = mf.GetMetric()
597+
}
598+
}
599+
// we only expect one metric, incomplete eviction metric should be removed.
600+
Expect(len(evictionCompleteMetrics)).Should(Equal(1))
601+
metricLabels := evictionCompleteMetrics[0].GetLabel()
602+
Expect(len(metricLabels)).Should(Equal(3))
603+
for _, label := range metricLabels {
604+
if label.GetName() == "isValid" {
605+
Expect(label.GetValue()).Should(Equal(isValid))
606+
}
607+
if label.GetName() == "isComplete" {
608+
Expect(label.GetValue()).Should(Equal(isComplete))
609+
}
610+
}
611+
}

pkg/controllers/clusterresourceplacementeviction/controller_test.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,22 +6,29 @@ Licensed under the MIT license.
66
package clusterresourceplacementeviction
77

88
import (
9+
"context"
910
"errors"
11+
"fmt"
1012
"strings"
1113
"testing"
1214
"time"
1315

1416
"github.com/google/go-cmp/cmp"
1517
"github.com/google/go-cmp/cmp/cmpopts"
18+
"github.com/prometheus/client_golang/prometheus"
19+
prometheusclientmodel "github.com/prometheus/client_model/go"
1620
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1721
"k8s.io/apimachinery/pkg/runtime"
22+
"k8s.io/apimachinery/pkg/types"
1823
"k8s.io/apimachinery/pkg/util/intstr"
1924
"k8s.io/utils/ptr"
25+
controllerruntime "sigs.k8s.io/controller-runtime"
2026
"sigs.k8s.io/controller-runtime/pkg/client"
2127
"sigs.k8s.io/controller-runtime/pkg/client/fake"
2228

2329
placementv1beta1 "go.goms.io/fleet/apis/placement/v1beta1"
2430
"go.goms.io/fleet/pkg/utils/condition"
31+
"go.goms.io/fleet/pkg/utils/controller/metrics"
2532
"go.goms.io/fleet/pkg/utils/defaulter"
2633
)
2734

@@ -1469,6 +1476,69 @@ func TestIsEvictionAllowed(t *testing.T) {
14691476
}
14701477
}
14711478

1479+
func TestReconcileForIncompleteEvictionMetric(t *testing.T) {
1480+
request := controllerruntime.Request{NamespacedName: types.NamespacedName{Name: "test-eviction"}}
1481+
isValid := "unknown"
1482+
isComplete := "false"
1483+
1484+
// Create a test registry
1485+
customRegistry := prometheus.NewRegistry()
1486+
if err := customRegistry.Register(metrics.FleetEvictionStatus); err != nil {
1487+
t.Errorf("Failed to register metric: %v", err)
1488+
}
1489+
1490+
// Reset metrics before each test
1491+
metrics.FleetEvictionStatus.Reset()
1492+
1493+
scheme := serviceScheme(t)
1494+
fakeClient := fake.NewClientBuilder().
1495+
WithScheme(scheme).
1496+
Build()
1497+
r := Reconciler{
1498+
Client: mockClient{fakeClient},
1499+
}
1500+
_, err := r.Reconcile(ctx, request)
1501+
if err == nil {
1502+
t.Errorf("reconcile should have failed")
1503+
}
1504+
1505+
metricFamilies, err := customRegistry.Gather()
1506+
if err != nil {
1507+
t.Errorf("error gathering metrics: %v", err)
1508+
}
1509+
1510+
var evictionCompleteMetrics []*prometheusclientmodel.Metric
1511+
for _, mf := range metricFamilies {
1512+
if mf.GetName() == "fleet_workload_eviction_complete" {
1513+
evictionCompleteMetrics = mf.GetMetric()
1514+
}
1515+
}
1516+
1517+
// we only expect one metric.
1518+
if len(evictionCompleteMetrics) != 1 {
1519+
t.Errorf("expected one eviction complete metric, got %d", len(evictionCompleteMetrics))
1520+
}
1521+
1522+
// Check if the metric matches the expected label values
1523+
labels := evictionCompleteMetrics[0].GetLabel()
1524+
if len(labels) != 3 {
1525+
t.Errorf("expected three labels, got %d", len(labels))
1526+
}
1527+
1528+
for _, label := range labels {
1529+
if label.GetName() == "isValid" {
1530+
if label.GetValue() != isValid {
1531+
t.Errorf("isValid label value doesn't match got: %v, want %v", label.GetValue(), isValid)
1532+
}
1533+
}
1534+
if label.GetName() == "isComplete" {
1535+
if label.GetValue() != isComplete {
1536+
t.Errorf("isComplete label value doesn't match got: %v, want %v", label.GetValue(), isComplete)
1537+
}
1538+
}
1539+
}
1540+
}
1541+
14721542
func buildTestPickAllCRP(crpName string) placementv1beta1.ClusterResourcePlacement {
14731543
return placementv1beta1.ClusterResourcePlacement{
14741544
ObjectMeta: metav1.ObjectMeta{
@@ -1492,3 +1562,11 @@ func serviceScheme(t *testing.T) *runtime.Scheme {
14921562
}
14931563
return scheme
14941564
}
1565+
1566+
type mockClient struct {
1567+
client.Client
1568+
}
1569+
1570+
func (m mockClient) Get(_ context.Context, _ client.ObjectKey, _ client.Object, _ ...client.GetOption) error {
1571+
return fmt.Errorf("internal error")
1572+
}

0 commit comments

Comments
 (0)