Skip to content

Commit 502964f

Browse files
committed
Introduce a new metric cluster_version_risk_conditions
Follow-up to [1]. Samples for `cluster_version_risk_conditions` are collected only when the operator's `shouldReconcileAcceptRisks()` returns true. This means that, e.g., on a cluster with TechPreview disabled, the metric is still defined but has no samples. [1] openshift#1284 (comment)
1 parent 0797b28 commit 502964f

2 files changed

Lines changed: 137 additions & 0 deletions

File tree

pkg/cvo/metrics.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ type operatorMetrics struct {
5858
capability *prometheus.GaugeVec
5959
clusterOperatorUp *prometheus.GaugeVec
6060
clusterOperatorConditions *prometheus.GaugeVec
61+
clusterVersionRiskConditions *prometheus.GaugeVec
6162
clusterOperatorConditionTransitions *prometheus.GaugeVec
6263
clusterInstaller *prometheus.GaugeVec
6364
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
@@ -108,6 +109,10 @@ penultimate completed version for 'completed'.
108109
Name: "cluster_operator_conditions",
109110
Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.",
110111
}, []string{"name", "condition", "reason"}),
112+
clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
113+
Name: "cluster_version_risk_conditions",
114+
Help: "Report the risk conditions for cluster versions. 0 is False and 1 is True.",
115+
}, []string{"name", "condition", "risk"}),
111116
clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
112117
Name: "cluster_operator_condition_transitions",
113118
Help: "Reports the number of times that a condition on a cluster operator changes status",
@@ -436,6 +441,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
436441
ch <- m.capability.WithLabelValues("").Desc()
437442
ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
438443
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
444+
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
439445
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
440446
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
441447
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
@@ -457,6 +463,24 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric,
457463
}
458464
}
459465

466+
func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) {
467+
for _, risk := range risks {
468+
for _, condition := range risk.Conditions {
469+
if condition.Type != internal.ConditionalUpdateRiskConditionTypeApplies {
470+
continue
471+
}
472+
473+
g := m.clusterVersionRiskConditions.WithLabelValues("version", condition.Type, risk.Name)
474+
if condition.Status == metav1.ConditionTrue {
475+
g.Set(1)
476+
} else {
477+
g.Set(0)
478+
}
479+
ch <- g
480+
}
481+
}
482+
}
483+
460484
// Collect collects metrics from the operator into the channel ch
461485
func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
462486
current := m.optr.currentVersion()
@@ -602,6 +626,9 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
602626
}
603627

604628
m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates)
629+
if m.optr.shouldReconcileAcceptRisks() {
630+
m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks)
631+
}
605632
}
606633

607634
g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)

pkg/cvo/metrics_test.go

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,116 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) {
973973
}
974974
}
975975

976+
func Test_collectConditionalUpdateRisks(t *testing.T) {
977+
type valueWithLabels struct {
978+
value float64
979+
labels map[string]string
980+
}
981+
testCases := []struct {
982+
name string
983+
risks []configv1.ConditionalUpdateRisk
984+
expected []valueWithLabels
985+
}{
986+
{
987+
name: "no conditional updates",
988+
expected: []valueWithLabels{},
989+
},
990+
{
991+
name: "unknown type",
992+
risks: []configv1.ConditionalUpdateRisk{
993+
{
994+
Name: "RiskX",
995+
Conditions: []metav1.Condition{{
996+
Type: internal.ConditionalUpdateConditionTypeRecommended,
997+
Status: metav1.ConditionFalse,
998+
Reason: "ReasonA",
999+
Message: "Risk does not apply",
1000+
}},
1001+
},
1002+
},
1003+
},
1004+
{
1005+
name: "apply false",
1006+
risks: []configv1.ConditionalUpdateRisk{
1007+
{
1008+
Name: "RiskX",
1009+
Conditions: []metav1.Condition{{
1010+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1011+
Status: metav1.ConditionFalse,
1012+
Reason: "ReasonA",
1013+
Message: "Risk does not apply",
1014+
}},
1015+
},
1016+
},
1017+
expected: []valueWithLabels{{
1018+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1019+
}},
1020+
},
1021+
{
1022+
name: "apply true",
1023+
risks: []configv1.ConditionalUpdateRisk{
1024+
{
1025+
Name: "RiskX",
1026+
Conditions: []metav1.Condition{{
1027+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1028+
Status: metav1.ConditionTrue,
1029+
Reason: "ReasonA",
1030+
Message: "Risk does not apply",
1031+
}},
1032+
},
1033+
},
1034+
expected: []valueWithLabels{{
1035+
value: 1,
1036+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1037+
}},
1038+
},
1039+
{
1040+
name: "apply unknown",
1041+
risks: []configv1.ConditionalUpdateRisk{
1042+
{
1043+
Name: "RiskX",
1044+
Conditions: []metav1.Condition{{
1045+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1046+
Status: metav1.ConditionUnknown,
1047+
Reason: "ReasonA",
1048+
Message: "Risk does not apply",
1049+
}},
1050+
},
1051+
},
1052+
expected: []valueWithLabels{{
1053+
labels: map[string]string{"name": "version", "condition": "Applies", "risk": "RiskX"},
1054+
}},
1055+
},
1056+
}
1057+
1058+
for _, tc := range testCases {
1059+
tc := tc
1060+
t.Run(tc.name, func(t *testing.T) {
1061+
optr := &Operator{}
1062+
m := newOperatorMetrics(optr)
1063+
ch := make(chan prometheus.Metric)
1064+
1065+
go func() {
1066+
m.collectConditionalUpdateRisks(ch, tc.risks)
1067+
close(ch)
1068+
}()
1069+
1070+
var collected []prometheus.Metric
1071+
for item := range ch {
1072+
collected = append(collected, item)
1073+
}
1074+
1075+
if lenC, lenE := len(collected), len(tc.expected); lenC != lenE {
1076+
1077+
t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected))
1078+
}
1079+
for i := range tc.expected {
1080+
expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels)
1081+
}
1082+
})
1083+
}
1084+
}
1085+
9761086
func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) {
9771087
t.Helper()
9781088
var d dto.Metric

0 commit comments

Comments
 (0)