Skip to content

Commit 923381d

Browse files
committed
Introduce a new metric cluster_version_risk_conditions
Follow-up to [1]. Samples for `cluster_version_risk_conditions` are collected only when the operator's `shouldReconcileAcceptRisks()` returns true. This means that, e.g., on a cluster with TechPreview disabled, the metric is still defined but has no samples. [1]: #1284 (comment)
1 parent 0797b28 commit 923381d

2 files changed

Lines changed: 139 additions & 0 deletions

File tree

pkg/cvo/metrics.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ type operatorMetrics struct {
5858
capability *prometheus.GaugeVec
5959
clusterOperatorUp *prometheus.GaugeVec
6060
clusterOperatorConditions *prometheus.GaugeVec
61+
clusterVersionRiskConditions *prometheus.GaugeVec
6162
clusterOperatorConditionTransitions *prometheus.GaugeVec
6263
clusterInstaller *prometheus.GaugeVec
6364
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
@@ -108,6 +109,10 @@ penultimate completed version for 'completed'.
108109
Name: "cluster_operator_conditions",
109110
Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.",
110111
}, []string{"name", "condition", "reason"}),
112+
clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
113+
Name: "cluster_version_risk_conditions",
114+
Help: "Report the risk conditions for cluster versions. 0 is False and 1 is True.",
115+
}, []string{"name", "condition", "risk"}),
111116
clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
112117
Name: "cluster_operator_condition_transitions",
113118
Help: "Reports the number of times that a condition on a cluster operator changes status",
@@ -436,6 +441,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
436441
ch <- m.capability.WithLabelValues("").Desc()
437442
ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
438443
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
444+
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
439445
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
440446
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
441447
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
@@ -457,6 +463,24 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric,
457463
}
458464
}
459465

466+
func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) {
467+
for _, risk := range risks {
468+
for _, condition := range risk.Conditions {
469+
if condition.Type != internal.ConditionalUpdateRiskConditionTypeApplies {
470+
continue
471+
}
472+
473+
g := m.clusterVersionRiskConditions.WithLabelValues("version", condition.Type, risk.Name)
474+
if condition.Status == metav1.ConditionTrue {
475+
g.Set(1)
476+
} else {
477+
g.Set(0)
478+
}
479+
ch <- g
480+
}
481+
}
482+
}
483+
460484
// Collect collects metrics from the operator into the channel ch
461485
func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
462486
current := m.optr.currentVersion()
@@ -602,6 +626,9 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
602626
}
603627

604628
m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates)
629+
if m.optr.shouldReconcileAcceptRisks() {
630+
m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks)
631+
}
605632
}
606633

607634
g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)

pkg/cvo/metrics_test.go

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"errors"
66
"fmt"
7+
"github.com/openshift/cluster-version-operator/pkg/featuregates"
78
"io"
89
"net/http"
910
"net/http/httptest"
@@ -667,6 +668,7 @@ func Test_operatorMetrics_Collect(t *testing.T) {
667668
}
668669
for _, tt := range tests {
669670
t.Run(tt.name, func(t *testing.T) {
671+
tt.optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version")
670672
tt.optr.eventRecorder = record.NewFakeRecorder(100)
671673
if tt.optr.cvLister == nil {
672674
tt.optr.cvLister = &cvLister{}
@@ -973,6 +975,116 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) {
973975
}
974976
}
975977

978+
func Test_collectConditionalUpdateRisks(t *testing.T) {
979+
type valueWithLabels struct {
980+
value float64
981+
labels map[string]string
982+
}
983+
testCases := []struct {
984+
name string
985+
risks []configv1.ConditionalUpdateRisk
986+
expected []valueWithLabels
987+
}{
988+
{
989+
name: "no conditional updates",
990+
expected: []valueWithLabels{},
991+
},
992+
{
993+
name: "unknown type",
994+
risks: []configv1.ConditionalUpdateRisk{
995+
{
996+
Name: "RiskX",
997+
Conditions: []metav1.Condition{{
998+
Type: internal.ConditionalUpdateConditionTypeRecommended,
999+
Status: metav1.ConditionFalse,
1000+
Reason: "ReasonA",
1001+
Message: "Risk does not apply",
1002+
}},
1003+
},
1004+
},
1005+
},
1006+
{
1007+
name: "apply false",
1008+
risks: []configv1.ConditionalUpdateRisk{
1009+
{
1010+
Name: "RiskX",
1011+
Conditions: []metav1.Condition{{
1012+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1013+
Status: metav1.ConditionFalse,
1014+
Reason: "ReasonA",
1015+
Message: "Risk does not apply",
1016+
}},
1017+
},
1018+
},
1019+
expected: []valueWithLabels{{
1020+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1021+
}},
1022+
},
1023+
{
1024+
name: "apply true",
1025+
risks: []configv1.ConditionalUpdateRisk{
1026+
{
1027+
Name: "RiskX",
1028+
Conditions: []metav1.Condition{{
1029+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1030+
Status: metav1.ConditionTrue,
1031+
Reason: "ReasonA",
1032+
Message: "Risk does not apply",
1033+
}},
1034+
},
1035+
},
1036+
expected: []valueWithLabels{{
1037+
value: 1,
1038+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1039+
}},
1040+
},
1041+
{
1042+
name: "apply unknown",
1043+
risks: []configv1.ConditionalUpdateRisk{
1044+
{
1045+
Name: "RiskX",
1046+
Conditions: []metav1.Condition{{
1047+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1048+
Status: metav1.ConditionUnknown,
1049+
Reason: "ReasonA",
1050+
Message: "Risk does not apply",
1051+
}},
1052+
},
1053+
},
1054+
expected: []valueWithLabels{{
1055+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1056+
}},
1057+
},
1058+
}
1059+
1060+
for _, tc := range testCases {
1061+
tc := tc
1062+
t.Run(tc.name, func(t *testing.T) {
1063+
optr := &Operator{}
1064+
m := newOperatorMetrics(optr)
1065+
ch := make(chan prometheus.Metric)
1066+
1067+
go func() {
1068+
m.collectConditionalUpdateRisks(ch, tc.risks)
1069+
close(ch)
1070+
}()
1071+
1072+
var collected []prometheus.Metric
1073+
for item := range ch {
1074+
collected = append(collected, item)
1075+
}
1076+
1077+
if lenC, lenE := len(collected), len(tc.expected); lenC != lenE {
1078+
1079+
t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected))
1080+
}
1081+
for i := range tc.expected {
1082+
expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels)
1083+
}
1084+
})
1085+
}
1086+
}
1087+
9761088
func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) {
9771089
t.Helper()
9781090
var d dto.Metric

0 commit comments

Comments
 (0)