Commit 8d8ebb4
OTA-1813: Populate risks from alerts
CVO fetches alerts from OpenShift monitoring via its REST API, then filters them by state (such as firing), severity (such as critical), name, or label. The matching alerts are converted to conditional update risks and displayed in cv.status. Moreover, they are associated with every update target, conditional or not, and each target's Recommended condition is evaluated accordingly. CVO caches the returned alerts for a configurable expiration, which avoids bombarding OpenShift monitoring. In addition, this commit fixes `newRecommendedReason`, which returns either a combined reason or a new reason that overrides the existing one; the issue stems from the handling of `recommendedReasonAllExposedRisksAccepted` in [1]. The unit test covers all the possible combinations. [1]. openshift#1284
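
The caching detail is worth spelling out: a refresh is only attempted when the cache is older than the expiration, and a failed refresh logs a warning and serves the stale cache instead of returning an error. Below is a minimal, self-contained sketch of that pattern; the names and types here are hypothetical, the real implementation is ocAlertGetter in the first file of this diff.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// cachedFetcher is a hypothetical, simplified stand-in for the commit's
// ocAlertGetter: it memoizes the last successful fetch for `expiration`.
type cachedFetcher struct {
	fetch       func(ctx context.Context) ([]string, error)
	expiration  time.Duration
	cached      []string
	lastRefresh time.Time
}

// Get refreshes the cache only when it has expired; on refresh failure it
// logs and serves the stale cache rather than propagating the error.
func (c *cachedFetcher) Get(ctx context.Context) []string {
	if time.Now().After(c.lastRefresh.Add(c.expiration)) {
		if fresh, err := c.fetch(ctx); err != nil {
			fmt.Printf("refresh failed, serving stale cache: %v\n", err)
		} else {
			c.cached = fresh
			c.lastRefresh = time.Now()
		}
	}
	return c.cached
}

func main() {
	calls := 0
	f := &cachedFetcher{
		expiration: time.Minute,
		fetch: func(ctx context.Context) ([]string, error) {
			calls++
			if calls > 1 {
				return nil, errors.New("monitoring unavailable")
			}
			return []string{"PodDisruptionBudgetAtLimit"}, nil
		},
	}
	ctx := context.Background()
	fmt.Println(f.Get(ctx)) // first call fetches
	fmt.Println(f.Get(ctx)) // within the expiration: served from cache
}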

11 files changed: 1061 additions & 62 deletions

New file (package promql)

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
+package promql
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/prometheus/client_golang/api"
+	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
+	"github.com/prometheus/common/config"
+
+	"k8s.io/klog/v2"
+
+	"github.com/openshift/cluster-version-operator/pkg/clusterconditions"
+)
+
+type Getter interface {
+	Get(ctx context.Context) (prometheusv1.AlertsResult, error)
+}
+
+func NewAlertGetter(promQLTarget clusterconditions.PromQLTarget) Getter {
+	p := NewPromQL(promQLTarget)
+	condition := p.Condition
+	v, ok := condition.(*PromQL)
+	if !ok {
+		panic("invalid condition type")
+	}
+	return &ocAlertGetter{promQL: v, expiration: 1 * time.Minute}
+}
+
+type ocAlertGetter struct {
+	promQL *PromQL
+
+	mutex       sync.Mutex
+	cached      prometheusv1.AlertsResult
+	expiration  time.Duration
+	lastRefresh time.Time
+}
+
+// Get returns the cached alerts, refreshing the cache first when it is older
+// than the expiration. A failed refresh is logged and the stale cache is
+// returned instead of an error.
+func (o *ocAlertGetter) Get(ctx context.Context) (prometheusv1.AlertsResult, error) {
+	if time.Now().After(o.lastRefresh.Add(o.expiration)) {
+		if err := o.refresh(ctx); err != nil {
+			klog.Errorf("Failed to refresh alerts, using stale cache instead: %v", err)
+		}
+	}
+	return o.cached, nil
+}
+
+func (o *ocAlertGetter) refresh(ctx context.Context) error {
+	o.mutex.Lock()
+	defer o.mutex.Unlock()
+
+	klog.Info("refresh alerts ...")
+	p := o.promQL
+	host, err := p.Host(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to determine the Thanos host: %w", err)
+	}
+	p.url.Host = host
+	clientConfig := api.Config{Address: p.url.String()}
+
+	if roundTripper, err := config.NewRoundTripperFromConfig(p.HTTPClientConfig, "cluster-conditions"); err == nil {
+		clientConfig.RoundTripper = roundTripper
+	} else {
+		return fmt.Errorf("creating PromQL round-tripper: %w", err)
+	}
+
+	promqlClient, err := api.NewClient(clientConfig)
+	if err != nil {
+		return fmt.Errorf("creating PromQL client: %w", err)
+	}
+
+	client := &statusCodeNotImplementedForPostClient{
+		client: promqlClient,
+	}
+
+	v1api := prometheusv1.NewAPI(client)
+
+	queryContext := ctx
+	if p.QueryTimeout > 0 {
+		var cancel context.CancelFunc
+		queryContext, cancel = context.WithTimeout(ctx, p.QueryTimeout)
+		defer cancel()
+	}
+
+	r, err := v1api.Alerts(queryContext)
+	if err != nil {
+		return fmt.Errorf("failed to get alerts: %w", err)
+	}
+	o.cached = r
+	o.lastRefresh = time.Now()
+	klog.Infof("refreshed: %d alerts", len(o.cached.Alerts))
+	return nil
+}
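
A hypothetical caller in the same package, sketching the intended use of Getter; it assumes target is an already-configured clusterconditions.PromQLTarget.

// demoAlerts is a hypothetical caller, not part of the commit.
func demoAlerts(ctx context.Context, target clusterconditions.PromQLTarget) {
	getter := NewAlertGetter(target)
	result, err := getter.Get(ctx)
	if err != nil {
		klog.Errorf("alerts unavailable: %v", err)
		return
	}
	for _, alert := range result.Alerts {
		klog.Infof("alert %s is %s", alert.Labels["alertname"], alert.State)
	}
	// A second Get within the one-minute expiration is served from the cache
	// and does not contact monitoring again.
	_, _ = getter.Get(ctx)
}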

pkg/cvo/availableupdates.go

Lines changed: 191 additions & 5 deletions
@@ -15,6 +15,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/google/uuid"
+	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"

 	"k8s.io/apimachinery/pkg/api/equality"
 	"k8s.io/apimachinery/pkg/api/meta"
@@ -162,6 +163,12 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
 	optrAvailableUpdates.AcceptRisks = acceptRisks
 	optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks
+	optrAvailableUpdates.AlertGetter = optr.AlertGetter
+	if optrAvailableUpdates.ShouldReconcileAcceptRisks() {
+		if err := optrAvailableUpdates.evaluateAlertRisks(ctx); err != nil {
+			klog.Errorf("Failed to evaluate alert conditions: %v", err)
+		}
+	}
 	optrAvailableUpdates.evaluateConditionalUpdates(ctx)

 	queueKey := optr.queueKey()
@@ -215,6 +222,11 @@ type availableUpdates struct {
 	// RiskConditions stores the condition for every risk (name, url, message, matchingRules).
 	RiskConditions map[string][]metav1.Condition
+
+	AlertGetter AlertGetter
+
+	// AlertRisks stores the condition for every alert that might have an impact on a cluster update
+	AlertRisks []configv1.ConditionalUpdateRisk
 }

 func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool {
@@ -320,6 +332,8 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
 		ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks,
 		AcceptRisks:                optr.availableUpdates.AcceptRisks,
 		RiskConditions:             optr.availableUpdates.RiskConditions,
+		AlertRisks:                 optr.availableUpdates.AlertRisks,
+		AlertGetter:                optr.availableUpdates.AlertGetter,
 		LastAttempt:                optr.availableUpdates.LastAttempt,
 		LastSyncOrConfigChange:     optr.availableUpdates.LastSyncOrConfigChange,
 		Current:                    *optr.availableUpdates.Current.DeepCopy(),
@@ -512,6 +526,170 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran
 	}
 }

+type AlertGetter interface {
+	Get(ctx context.Context) (prometheusv1.AlertsResult, error)
+}
+
+func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) error {
+	// Guard the nil receiver before touching its fields.
+	if u == nil {
+		return nil
+	}
+	if u.AlertGetter == nil {
+		u.AlertRisks = nil
+		return nil
+	}
+	alertsResult, err := u.AlertGetter.Get(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to get alerts: %w", err)
+	}
+
+	u.AlertRisks = alertsToRisks(alertsResult.Alerts)
+	return nil
+}
+
+func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk {
+	klog.V(2).Infof("Found %d alerts", len(alerts))
+	risks := map[string]configv1.ConditionalUpdateRisk{}
+	for _, alert := range alerts {
+		var alertName string
+		if alertName = string(alert.Labels["alertname"]); alertName == "" {
+			continue
+		}
+		if alert.State == "pending" {
+			continue
+		}
+
+		var summary string
+		if summary = string(alert.Annotations["summary"]); summary == "" {
+			summary = alertName
+		}
+		if !strings.HasSuffix(summary, ".") {
+			summary += "."
+		}
+
+		var description string
+		alertMessage := string(alert.Annotations["message"])
+		alertDescription := string(alert.Annotations["description"])
+		switch {
+		case alertMessage != "" && alertDescription != "":
+			description += " The alert description is: " + alertDescription + " | " + alertMessage
+		case alertDescription != "":
+			description += " The alert description is: " + alertDescription
+		case alertMessage != "":
+			description += " The alert description is: " + alertMessage
+		default:
+			description += " The alert has no description."
+		}
+
+		var runbook string
+		if runbook = string(alert.Annotations["runbook"]); runbook == "" {
+			runbook = "<alert does not have a runbook_url annotation>"
+		}
+
+		details := fmt.Sprintf("%s%s %s", summary, description, runbook)
+
+		// TODO (hongkliu):
+		alertURL := "https://console.com/monitoring/alertrules/id"
+		alertPromQL := "todo-expression"
+
+		severity := string(alert.Labels["severity"])
+		if severity == "critical" {
+			if alertName == internal.AlertNamePodDisruptionBudgetLimit {
+				details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
+			}
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             fmt.Sprintf("Alert:%s", alert.State),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+
+		if alertName == internal.AlertNamePodDisruptionBudgetAtLimit {
+			details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             internal.AlertConditionReason(string(alert.State)),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+
+		if internal.HavePullWaiting.Has(alertName) ||
+			internal.HaveNodes.Has(alertName) ||
+			alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing ||
+			alertName == internal.AlertNameVMCannotBeEvicted {
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             internal.AlertConditionReason(string(alert.State)),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+
+		updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"])
+		if updatePrecheck == "true" {
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             fmt.Sprintf("Alert:%s", alert.State),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+	}
+
+	klog.V(2).Infof("Got %d risks", len(risks))
+	if len(risks) == 0 {
+		return nil
+	}
+
+	var ret []configv1.ConditionalUpdateRisk
+	var keys []string
+	for k := range risks {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	for _, k := range keys {
+		ret = append(ret, risks[k])
+	}
+	return ret
+}
+
+func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url, promQL string, condition metav1.Condition) configv1.ConditionalUpdateRisk {
+	risk, ok := risks[riskName]
+	if !ok {
+		return configv1.ConditionalUpdateRisk{
+			Name:    riskName,
+			Message: message,
+			URL:     url,
+			MatchingRules: []configv1.ClusterCondition{
+				{
+					Type: "PromQL",
+					PromQL: &configv1.PromQLClusterCondition{
+						PromQL: promQL,
+					},
+				},
+			},
+			Conditions: []metav1.Condition{condition},
+		}
+	}
+
+	// The risk already exists: append the new condition's message and keep
+	// the earliest transition time.
+	if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil {
+		c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message)
+		if c.LastTransitionTime.After(condition.LastTransitionTime.Time) {
+			c.LastTransitionTime = condition.LastTransitionTime
+		}
+		meta.SetStatusCondition(&risk.Conditions, *c)
+	}
+
+	return risk
+}
+
 func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
 	if u == nil {
 		return
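
To make the alert-to-risk mapping concrete, here is a hedged, test-style sketch for the same package. EtcdMembersDown and SomePendingAlert are hypothetical alert names, the usual test imports (testing, time, github.com/prometheus/common/model, and the prometheusv1 alias above) are assumed, and only structural fields are asserted because the exact reason and message strings come from the internal helpers.

func TestAlertsToRisksSketch(t *testing.T) {
	alerts := []prometheusv1.Alert{
		{
			// A firing critical alert becomes a risk regardless of its name.
			State:    prometheusv1.AlertStateFiring,
			ActiveAt: time.Now(),
			Labels: model.LabelSet{
				"alertname": "EtcdMembersDown",
				"severity":  "critical",
			},
			Annotations: model.LabelSet{
				"summary": "etcd members are down",
			},
		},
		{
			// Pending alerts are skipped entirely, as are alerts without an
			// alertname label.
			State:  prometheusv1.AlertStatePending,
			Labels: model.LabelSet{"alertname": "SomePendingAlert", "severity": "critical"},
		},
	}
	risks := alertsToRisks(alerts)
	if len(risks) != 1 || risks[0].Name != "EtcdMembersDown" {
		t.Fatalf("expected a single risk for the firing critical alert, got %v", risks)
	}
}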
@@ -521,7 +699,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
 	risks := risksInOrder(riskVersions)
 	u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry)

-	if err := sanityCheck(u.ConditionalUpdates); err != nil {
+	if err := sanityCheck(u.ConditionalUpdates, u.AlertRisks); err != nil {
 		klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err)
 	}
 	for i, conditionalUpdate := range u.ConditionalUpdates {
@@ -539,7 +717,7 @@
 	}
 }

-func sanityCheck(updates []configv1.ConditionalUpdate) error {
+func sanityCheck(updates []configv1.ConditionalUpdate, alertRisks []configv1.ConditionalUpdateRisk) error {
 	risks := map[string]configv1.ConditionalUpdateRisk{}
 	var errs []error
 	for _, update := range updates {
@@ -555,6 +733,11 @@ func sanityCheck(updates []configv1.ConditionalUpdate) error {
 			}
 		}
 	}
+	for _, alertRisk := range alertRisks {
+		if _, ok := risks[alertRisk.Name]; ok {
+			errs = append(errs, fmt.Errorf("alert risk and conditional update risk share the same name: %s", alertRisk.Name))
+		}
+	}
 	return utilerrors.NewAggregate(errs)
 }

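The added loop only guards against name collisions between alert risks and the conditional-update risks gathered above it; a hedged sketch of the failure mode it reports (SharedRiskName is a hypothetical risk name):

// Hypothetical collision: the same risk name arrives from both sources.
updates := []configv1.ConditionalUpdate{
	{Risks: []configv1.ConditionalUpdateRisk{{Name: "SharedRiskName"}}},
}
alertRisks := []configv1.ConditionalUpdateRisk{{Name: "SharedRiskName"}}
if err := sanityCheck(updates, alertRisks); err != nil {
	// Aggregated error: "alert risk and conditional update risk share the
	// same name: SharedRiskName"
	klog.Error(err)
}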
@@ -599,6 +782,7 @@ const (
 	recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
 	recommendedReasonEvaluationFailed        = "EvaluationFailed"
 	recommendedReasonMultiple                = "MultipleReasons"
+	//recommendedReasonAlertFiring = "AlertFiring"

 	// recommendedReasonExposed is used instead of the original name if it does
 	// not match the pattern for a valid k8s condition reason.
@@ -616,10 +800,13 @@ var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$`)
 func newRecommendedReason(now, want string) string {
 	switch {
 	case now == recommendedReasonRisksNotExposed ||
-		now == recommendedReasonAllExposedRisksAccepted ||
+		now == recommendedReasonAllExposedRisksAccepted && want != recommendedReasonRisksNotExposed ||
+		now == recommendedReasonEvaluationFailed && want == recommendedReasonExposed ||
 		now == want:
 		return want
-	case want == recommendedReasonRisksNotExposed:
+	case want == recommendedReasonRisksNotExposed ||
+		(now == recommendedReasonEvaluationFailed) && want == recommendedReasonAllExposedRisksAccepted ||
+		now == recommendedReasonExposed && (want == recommendedReasonAllExposedRisksAccepted || want == recommendedReasonEvaluationFailed):
 		return now
 	default:
 		return recommendedReasonMultiple
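
The combination rules read most easily as a table. Here is a hedged, test-style sketch of several of the combinations the commit's unit test covers; it assumes the same package, so the reason constants resolve.

func TestNewRecommendedReasonSketch(t *testing.T) {
	for _, tc := range []struct{ now, want, expected string }{
		// RisksNotExposed always yields to the incoming reason.
		{recommendedReasonRisksNotExposed, recommendedReasonExposed, recommendedReasonExposed},
		// AllExposedRisksAccepted is overridden by anything except RisksNotExposed...
		{recommendedReasonAllExposedRisksAccepted, recommendedReasonEvaluationFailed, recommendedReasonEvaluationFailed},
		// ...and wins over RisksNotExposed.
		{recommendedReasonAllExposedRisksAccepted, recommendedReasonRisksNotExposed, recommendedReasonAllExposedRisksAccepted},
		// EvaluationFailed combined with Exposed surfaces the exposure.
		{recommendedReasonEvaluationFailed, recommendedReasonExposed, recommendedReasonExposed},
		// Exposed absorbs AllExposedRisksAccepted and EvaluationFailed.
		{recommendedReasonExposed, recommendedReasonAllExposedRisksAccepted, recommendedReasonExposed},
		// Anything else is reported as multiple reasons.
		{recommendedReasonEvaluationFailed, recommendedReasonMultiple, recommendedReasonMultiple},
	} {
		if got := newRecommendedReason(tc.now, tc.want); got != tc.expected {
			t.Errorf("newRecommendedReason(%s, %s) = %s, expected %s", tc.now, tc.want, got, tc.expected)
		}
	}
}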
@@ -676,7 +863,6 @@ func evaluateConditionalUpdate(
 	if len(errorMessages) > 0 {
 		recommended.Message = strings.Join(errorMessages, "\n\n")
 	}
-
 	return recommended
 }
