@@ -15,6 +15,7 @@ import (
1515 "github.com/google/go-cmp/cmp"
1616 "github.com/google/go-cmp/cmp/cmpopts"
1717 "github.com/google/uuid"
18+ prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
1819
1920 "k8s.io/apimachinery/pkg/api/equality"
2021 "k8s.io/apimachinery/pkg/api/meta"
@@ -162,6 +163,12 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
162163
163164 optrAvailableUpdates .AcceptRisks = acceptRisks
164165 optrAvailableUpdates .ShouldReconcileAcceptRisks = optr .shouldReconcileAcceptRisks
166+ optrAvailableUpdates .AlertGetter = optr .AlertGetter
167+ if optrAvailableUpdates .ShouldReconcileAcceptRisks () {
168+ if err := optrAvailableUpdates .evaluateAlertRisks (ctx ); err != nil {
169+ klog .Errorf ("Failed to evaluate alert conditions: %v" , err )
170+ }
171+ }
165172 optrAvailableUpdates .evaluateConditionalUpdates (ctx )
166173
167174 queueKey := optr .queueKey ()
@@ -215,6 +222,11 @@ type availableUpdates struct {
215222
216223 // RiskConditions stores the condition for every risk (name, url, message, matchingRules).
217224 RiskConditions map [string ][]metav1.Condition
225+
226+ AlertGetter AlertGetter
227+
228+ // AlertRisks stores the condition for every alert that might have impact on cluster update
229+ AlertRisks []configv1.ConditionalUpdateRisk
218230}
219231
220232func (u * availableUpdates ) RecentlyAttempted (interval time.Duration ) bool {
@@ -320,6 +332,8 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
320332 ShouldReconcileAcceptRisks : optr .shouldReconcileAcceptRisks ,
321333 AcceptRisks : optr .availableUpdates .AcceptRisks ,
322334 RiskConditions : optr .availableUpdates .RiskConditions ,
335+ AlertRisks : optr .availableUpdates .AlertRisks ,
336+ AlertGetter : optr .availableUpdates .AlertGetter ,
323337 LastAttempt : optr .availableUpdates .LastAttempt ,
324338 LastSyncOrConfigChange : optr .availableUpdates .LastSyncOrConfigChange ,
325339 Current : * optr .availableUpdates .Current .DeepCopy (),
@@ -512,6 +526,170 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran
512526 }
513527}
514528
529+ type AlertGetter interface {
530+ Get (ctx context.Context ) (prometheusv1.AlertsResult , error )
531+ }
532+
533+ func (u * availableUpdates ) evaluateAlertRisks (ctx context.Context ) error {
534+ if u == nil || u .AlertGetter == nil {
535+ u .AlertRisks = nil
536+ return nil
537+ }
538+ alertsResult , err := u .AlertGetter .Get (ctx )
539+ if err != nil {
540+ return fmt .Errorf ("failed to get alerts: %w" , err )
541+ }
542+
543+ u .AlertRisks = alertsToRisks (alertsResult .Alerts )
544+ return nil
545+ }
546+
547+ func alertsToRisks (alerts []prometheusv1.Alert ) []configv1.ConditionalUpdateRisk {
548+ klog .V (2 ).Infof ("Found %d alerts" , len (alerts ))
549+ risks := map [string ]configv1.ConditionalUpdateRisk {}
550+ for _ , alert := range alerts {
551+ var alertName string
552+ if alertName = string (alert .Labels ["alertname" ]); alertName == "" {
553+ continue
554+ }
555+ if alert .State == "pending" {
556+ continue
557+ }
558+
559+ var summary string
560+ if summary = string (alert .Annotations ["summary" ]); summary == "" {
561+ summary = alertName
562+ }
563+ if ! strings .HasSuffix (summary , "." ) {
564+ summary += "."
565+ }
566+
567+ var description string
568+ alertMessage := string (alert .Annotations ["message" ])
569+ alertDescription := string (alert .Annotations ["description" ])
570+ switch {
571+ case alertMessage != "" && alertDescription != "" :
572+ description += " The alert description is: " + alertDescription + " | " + alertMessage
573+ case alertDescription != "" :
574+ description += " The alert description is: " + alertDescription
575+ case alertMessage != "" :
576+ description += " The alert description is: " + alertMessage
577+ default :
578+ description += " The alert has no description."
579+ }
580+
581+ var runbook string
582+ if runbook = string (alert .Annotations ["runbook" ]); runbook == "" {
583+ runbook = "<alert does not have a runbook_url annotation>"
584+ }
585+
586+ details := fmt .Sprintf ("%s%s %s" , summary , description , runbook )
587+
588+ // TODO (hongkliu):
589+ alertURL := "https://console.com/monitoring/alertrules/id"
590+ alertPromQL := "todo-expression"
591+
592+ severity := string (alert .Labels ["severity" ])
593+ if severity == "critical" {
594+ if alertName == internal .AlertNamePodDisruptionBudgetLimit {
595+ details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
596+ }
597+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
598+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
599+ Status : metav1 .ConditionTrue ,
600+ Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
601+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "suggesting significant cluster issues worth investigating" , details ),
602+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
603+ })
604+ continue
605+ }
606+
607+ if alertName == internal .AlertNamePodDisruptionBudgetAtLimit {
608+ details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
609+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
610+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
611+ Status : metav1 .ConditionTrue ,
612+ Reason : internal .AlertConditionReason (string (alert .State )),
613+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "which might slow node drains" , details ),
614+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
615+ })
616+ continue
617+ }
618+
619+ if internal .HavePullWaiting .Has (alertName ) ||
620+ internal .HaveNodes .Has (alertName ) ||
621+ alertName == internal .AlertNameVirtHandlerDaemonSetRolloutFailing ||
622+ alertName == internal .AlertNameVMCannotBeEvicted {
623+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
624+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
625+ Status : metav1 .ConditionTrue ,
626+ Reason : internal .AlertConditionReason (string (alert .State )),
627+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "which may slow workload redistribution during rolling node updates" , details ),
628+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
629+ })
630+ continue
631+ }
632+
633+ updatePrecheck := string (alert .Labels ["openShiftUpdatePrecheck" ])
634+ if updatePrecheck == "true" {
635+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
636+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
637+ Status : metav1 .ConditionTrue ,
638+ Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
639+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "suggesting issues worth investigating before updating the cluster" , details ),
640+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
641+ })
642+ continue
643+ }
644+ }
645+
646+ klog .V (2 ).Infof ("Got %d risks" , len (risks ))
647+ if len (risks ) == 0 {
648+ return nil
649+ }
650+
651+ var ret []configv1.ConditionalUpdateRisk
652+ var keys []string
653+ for k := range risks {
654+ keys = append (keys , k )
655+ }
656+ sort .Strings (keys )
657+ for _ , k := range keys {
658+ ret = append (ret , risks [k ])
659+ }
660+ return ret
661+ }
662+
663+ func getRisk (risks map [string ]configv1.ConditionalUpdateRisk , riskName , message , url , promQL string , condition metav1.Condition ) configv1.ConditionalUpdateRisk {
664+ risk , ok := risks [riskName ]
665+ if ! ok {
666+ return configv1.ConditionalUpdateRisk {
667+ Name : riskName ,
668+ Message : message ,
669+ URL : url ,
670+ MatchingRules : []configv1.ClusterCondition {
671+ {
672+ Type : "PromQL" ,
673+ PromQL : & configv1.PromQLClusterCondition {
674+ PromQL : promQL ,
675+ },
676+ },
677+ },
678+ Conditions : []metav1.Condition {condition },
679+ }
680+ }
681+
682+ if c := meta .FindStatusCondition (risk .Conditions , condition .Type ); c != nil {
683+ c .Message = fmt .Sprintf ("%s; %s" , c .Message , condition .Message )
684+ if c .LastTransitionTime .After (condition .LastTransitionTime .Time ) {
685+ c .LastTransitionTime = condition .LastTransitionTime
686+ }
687+ meta .SetStatusCondition (& risk .Conditions , * c )
688+ }
689+
690+ return risk
691+ }
692+
515693func (u * availableUpdates ) evaluateConditionalUpdates (ctx context.Context ) {
516694 if u == nil {
517695 return
@@ -521,7 +699,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
521699 risks := risksInOrder (riskVersions )
522700 u .RiskConditions = loadRiskConditions (ctx , risks , riskVersions , u .ConditionRegistry )
523701
524- if err := sanityCheck (u .ConditionalUpdates ); err != nil {
702+ if err := sanityCheck (u .ConditionalUpdates , u . AlertRisks ); err != nil {
525703 klog .Errorf ("Sanity check failed which might impact risk evaluation: %v" , err )
526704 }
527705 for i , conditionalUpdate := range u .ConditionalUpdates {
@@ -539,7 +717,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
539717 }
540718}
541719
542- func sanityCheck (updates []configv1.ConditionalUpdate ) error {
720+ func sanityCheck (updates []configv1.ConditionalUpdate , alertRisks []configv1. ConditionalUpdateRisk ) error {
543721 risks := map [string ]configv1.ConditionalUpdateRisk {}
544722 var errs []error
545723 for _ , update := range updates {
@@ -555,6 +733,11 @@ func sanityCheck(updates []configv1.ConditionalUpdate) error {
555733 }
556734 }
557735 }
736+ for _ , alertRisk := range alertRisks {
737+ if _ , ok := risks [alertRisk .Name ]; ok {
738+ errs = append (errs , fmt .Errorf ("found alert risk and conditional update risk share the name: %s" , alertRisk .Name ))
739+ }
740+ }
558741 return utilerrors .NewAggregate (errs )
559742}
560743
@@ -599,6 +782,7 @@ const (
599782 recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
600783 recommendedReasonEvaluationFailed = "EvaluationFailed"
601784 recommendedReasonMultiple = "MultipleReasons"
785+ //recommendedReasonAlertFiring = "AlertFiring"
602786
603787 // recommendedReasonExposed is used instead of the original name if it does
604788 // not match the pattern for a valid k8s condition reason.
@@ -616,10 +800,13 @@ var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
616800func newRecommendedReason (now , want string ) string {
617801 switch {
618802 case now == recommendedReasonRisksNotExposed ||
619- now == recommendedReasonAllExposedRisksAccepted ||
803+ now == recommendedReasonAllExposedRisksAccepted && want != recommendedReasonRisksNotExposed ||
804+ now == recommendedReasonEvaluationFailed && want == recommendedReasonExposed ||
620805 now == want :
621806 return want
622- case want == recommendedReasonRisksNotExposed :
807+ case want == recommendedReasonRisksNotExposed ||
808+ (now == recommendedReasonEvaluationFailed ) && want == recommendedReasonAllExposedRisksAccepted ||
809+ now == recommendedReasonExposed && (want == recommendedReasonAllExposedRisksAccepted || want == recommendedReasonEvaluationFailed ):
623810 return now
624811 default :
625812 return recommendedReasonMultiple
@@ -676,7 +863,6 @@ func evaluateConditionalUpdate(
676863 if len (errorMessages ) > 0 {
677864 recommended .Message = strings .Join (errorMessages , "\n \n " )
678865 }
679-
680866 return recommended
681867}
682868
0 commit comments