Skip to content

Commit 7092376

Browse files
Merge pull request #1318 from hongkailiu/cluster_version_risk_condition
OTA-1546: Add a metric and an alert for conditional risk conditions
2 parents af5e95e + 3176f79 commit 7092376

3 files changed

Lines changed: 169 additions & 0 deletions

File tree

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PrometheusRule
3+
metadata:
4+
labels:
5+
k8s-app: cluster-version-operator
6+
name: cluster-version-operator-accept-risks
7+
namespace: openshift-cluster-version
8+
annotations:
9+
kubernetes.io/description: Alerting rules for the feature gate ClusterUpdateAcceptRisks.
10+
exclude.release.openshift.io/internal-openshift-hosted: "true"
11+
include.release.openshift.io/self-managed-high-availability: "true"
12+
release.openshift.io/feature-gate: "ClusterUpdateAcceptRisks"
13+
spec:
14+
groups:
15+
- name: cluster-version-tech-preview
16+
rules:
17+
- alert: OpenShiftUpdateRiskMightApply
18+
annotations:
19+
summary: The cluster might have been exposed to the conditional update risk for 10 minutes.
20+
description: The conditional update risk {{ "{{ $labels.risk }}" }} might apply to the cluster because of {{ "{{ $labels.reason }}" }}, and the cluster update to a version exposed to the risk is not recommended. For more information refer to 'oc adm upgrade'.
21+
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-version-operator/OpenShiftUpdateRiskMightApply.md
22+
expr: |
23+
max by (namespace, risk, reason) (last_over_time(cluster_version_risk_conditions{job="cluster-version-operator", condition="Applies"}[5m]) != 0)
24+
for: 10m
25+
labels:
26+
severity: warning

pkg/cvo/metrics.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515

1616
corev1 "k8s.io/api/core/v1"
1717
apierrors "k8s.io/apimachinery/pkg/api/errors"
18+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1819
"k8s.io/apimachinery/pkg/labels"
1920
"k8s.io/apimachinery/pkg/util/sets"
2021
"k8s.io/apiserver/pkg/server/dynamiccertificates"
@@ -54,6 +55,7 @@ type operatorMetrics struct {
5455
capability *prometheus.GaugeVec
5556
clusterOperatorUp *prometheus.GaugeVec
5657
clusterOperatorConditions *prometheus.GaugeVec
58+
clusterVersionRiskConditions *prometheus.GaugeVec
5759
clusterOperatorConditionTransitions *prometheus.GaugeVec
5860
clusterInstaller *prometheus.GaugeVec
5961
clusterVersionOperatorUpdateRetrievalTimestampSeconds *prometheus.GaugeVec
@@ -104,6 +106,10 @@ penultimate completed version for 'completed'.
104106
Name: "cluster_operator_conditions",
105107
Help: "Report the conditions for active cluster operators. 0 is False and 1 is True.",
106108
}, []string{"name", "condition", "reason"}),
109+
clusterVersionRiskConditions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
110+
Name: "cluster_version_risk_conditions",
111+
Help: "Report the risk conditions for the cluster version. -1 is Unknown, 0 is False and 1 is True.",
112+
}, []string{"condition", "risk", "reason"}),
107113
clusterOperatorConditionTransitions: prometheus.NewGaugeVec(prometheus.GaugeOpts{
108114
Name: "cluster_operator_condition_transitions",
109115
Help: "Reports the number of times that a condition on a cluster operator changes status",
@@ -489,6 +495,7 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
489495
ch <- m.capability.WithLabelValues("").Desc()
490496
ch <- m.clusterOperatorUp.WithLabelValues("", "", "").Desc()
491497
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
498+
ch <- m.clusterVersionRiskConditions.WithLabelValues("", "", "").Desc()
492499
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
493500
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
494501
ch <- m.clusterVersionOperatorUpdateRetrievalTimestampSeconds.WithLabelValues("").Desc()
@@ -510,6 +517,26 @@ func (m *operatorMetrics) collectConditionalUpdates(ch chan<- prometheus.Metric,
510517
}
511518
}
512519

520+
func (m *operatorMetrics) collectConditionalUpdateRisks(ch chan<- prometheus.Metric, risks []configv1.ConditionalUpdateRisk) {
521+
for _, risk := range risks {
522+
for _, condition := range risk.Conditions {
523+
if condition.Type != internal.ConditionalUpdateRiskConditionTypeApplies {
524+
continue
525+
}
526+
527+
g := m.clusterVersionRiskConditions.WithLabelValues(condition.Type, risk.Name, condition.Reason)
528+
switch condition.Status {
529+
case metav1.ConditionTrue:
530+
g.Set(1)
531+
case metav1.ConditionUnknown:
532+
g.Set(-1)
533+
}
534+
// We do not need to do g.Set(0) as it is done when g is initialized
535+
ch <- g
536+
}
537+
}
538+
}
539+
513540
// Collect collects metrics from the operator into the channel ch
514541
func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
515542
current := m.optr.currentVersion()
@@ -655,6 +682,9 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
655682
}
656683

657684
m.collectConditionalUpdates(ch, cv.Status.ConditionalUpdates)
685+
if m.optr.shouldReconcileAcceptRisks() {
686+
m.collectConditionalUpdateRisks(ch, cv.Status.ConditionalUpdateRisks)
687+
}
658688
}
659689

660690
g := m.version.WithLabelValues("current", current.Version, current.Image, completed.Version)

pkg/cvo/metrics_test.go

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
configv1 "github.com/openshift/api/config/v1"
2626
"github.com/openshift/library-go/pkg/crypto"
2727

28+
"github.com/openshift/cluster-version-operator/pkg/featuregates"
2829
"github.com/openshift/cluster-version-operator/pkg/internal"
2930
)
3031

@@ -669,6 +670,7 @@ func Test_operatorMetrics_Collect(t *testing.T) {
669670
}
670671
for _, tt := range tests {
671672
t.Run(tt.name, func(t *testing.T) {
673+
tt.optr.enabledCVOFeatureGates = featuregates.DefaultCvoGates("version")
672674
tt.optr.eventRecorder = record.NewFakeRecorder(100)
673675
if tt.optr.cvLister == nil {
674676
tt.optr.cvLister = &cvLister{}
@@ -975,6 +977,117 @@ func TestCollectUnknownConditionalUpdates(t *testing.T) {
975977
}
976978
}
977979

980+
func Test_collectConditionalUpdateRisks(t *testing.T) {
981+
type valueWithLabels struct {
982+
value float64
983+
labels map[string]string
984+
}
985+
testCases := []struct {
986+
name string
987+
risks []configv1.ConditionalUpdateRisk
988+
expected []valueWithLabels
989+
}{
990+
{
991+
name: "no conditional updates",
992+
expected: []valueWithLabels{},
993+
},
994+
{
995+
name: "unknown type",
996+
risks: []configv1.ConditionalUpdateRisk{
997+
{
998+
Name: "RiskX",
999+
Conditions: []metav1.Condition{{
1000+
Type: internal.ConditionalUpdateConditionTypeRecommended,
1001+
Status: metav1.ConditionFalse,
1002+
Reason: "ReasonA",
1003+
Message: "Risk does not apply",
1004+
}},
1005+
},
1006+
},
1007+
},
1008+
{
1009+
name: "apply false",
1010+
risks: []configv1.ConditionalUpdateRisk{
1011+
{
1012+
Name: "RiskX",
1013+
Conditions: []metav1.Condition{{
1014+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1015+
Status: metav1.ConditionFalse,
1016+
Reason: "ReasonA",
1017+
Message: "Risk does not apply",
1018+
}},
1019+
},
1020+
},
1021+
expected: []valueWithLabels{{
1022+
labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
1023+
}},
1024+
},
1025+
{
1026+
name: "apply true",
1027+
risks: []configv1.ConditionalUpdateRisk{
1028+
{
1029+
Name: "RiskX",
1030+
Conditions: []metav1.Condition{{
1031+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1032+
Status: metav1.ConditionTrue,
1033+
Reason: "ReasonA",
1034+
Message: "Risk does not apply",
1035+
}},
1036+
},
1037+
},
1038+
expected: []valueWithLabels{{
1039+
value: 1,
1040+
labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
1041+
}},
1042+
},
1043+
{
1044+
name: "apply unknown",
1045+
risks: []configv1.ConditionalUpdateRisk{
1046+
{
1047+
Name: "RiskX",
1048+
Conditions: []metav1.Condition{{
1049+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
1050+
Status: metav1.ConditionUnknown,
1051+
Reason: "ReasonA",
1052+
Message: "Risk does not apply",
1053+
}},
1054+
},
1055+
},
1056+
expected: []valueWithLabels{{
1057+
value: -1,
1058+
labels: map[string]string{"condition": "Applies", "risk": "RiskX", "reason": "ReasonA"},
1059+
}},
1060+
},
1061+
}
1062+
1063+
for _, tc := range testCases {
1064+
tc := tc
1065+
t.Run(tc.name, func(t *testing.T) {
1066+
optr := &Operator{}
1067+
m := newOperatorMetrics(optr)
1068+
ch := make(chan prometheus.Metric)
1069+
1070+
go func() {
1071+
m.collectConditionalUpdateRisks(ch, tc.risks)
1072+
close(ch)
1073+
}()
1074+
1075+
var collected []prometheus.Metric
1076+
for item := range ch {
1077+
collected = append(collected, item)
1078+
}
1079+
1080+
if lenC, lenE := len(collected), len(tc.expected); lenC != lenE {
1081+
1082+
t.Fatalf("Expected %d metrics, got %d metrics\nGot metrics: %s", lenE, lenC, spew.Sdump(collected))
1083+
}
1084+
for i := range tc.expected {
1085+
expectMetric(t, collected[i], tc.expected[i].value, tc.expected[i].labels)
1086+
}
1087+
})
1088+
}
1089+
}
1090+
9781091
func expectMetric(t *testing.T, metric prometheus.Metric, value float64, labels map[string]string) {
9791092
t.Helper()
9801093
var d dto.Metric

0 commit comments

Comments
 (0)