Skip to content

Commit ae34300

Browse files
authored
Allow pausing machine config pools on failure (#192)
1 parent 923d4a2 commit ae34300

7 files changed

Lines changed: 344 additions & 3 deletions

api/v1beta1/upgradejob_types.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ const (
5757
UpgradeJobReasonDelaySet = "DelaySet"
5858
// UpgradeJobReasonDelayReached is used to indicate that the pause delay for the upgrade of the machine config pool has expired.
5959
UpgradeJobReasonDelayReached = "DelayReached"
60+
// UpgradeJobReasonPausedOnFailure is used when the upgrade job paused machine config pools due to job failure.
61+
UpgradeJobReasonPausedOnFailure = "PausedOnFailure"
6062
)
6163

6264
// UpgradeJobSpec defines the desired state of UpgradeJob
@@ -109,6 +111,25 @@ type UpgradeJobConfig struct {
109111
// MachineConfigPools defines the machine config pool specific configuration for the upgrade job
110112
// +optional
111113
MachineConfigPools []UpgradeJobMachineConfigPoolSpec `json:"machineConfigPools,omitempty"`
114+
115+
// PauseMachineConfigPoolsOnFailure allows pausing machine config pools on upgrade job failure.
116+
// Allows further investigation and manual remediation by the cluster administrator.
117+
// +optional
118+
PauseMachineConfigPoolsOnFailure PauseMachineConfigPoolsOnFailureSpec `json:"pauseMachineConfigPoolsOnFailure"`
119+
}
120+
121+
// PauseMachineConfigPoolsOnFailureSpec defines the configuration for pausing machine config pools on upgrade job failure.
122+
type PauseMachineConfigPoolsOnFailureSpec struct {
123+
// Enabled defines whether to pause machine config pools on upgrade job failure.
124+
// If true, the controller will pause machine config pools matching the selector if an upgrade job fails.
125+
// Machine config pools should then be unpaused manually by the cluster administrator after the underlying issue has been resolved.
126+
// Machine config pools that are already done updating or that are already paused are not affected.
127+
// +optional
128+
Enabled bool `json:"enabled"`
129+
// Selector defines the labels to match the machine config pools to pause on failure.
130+
// If empty, all machine config pools are matched.
131+
// +optional
132+
Selector metav1.LabelSelector `json:"selector"`
112133
}
113134

114135
// UpgradeJobMachineConfigPoolSpec allows configuring the upgrade of a machine config pool

api/v1beta1/zz_generated.deepcopy.go

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

config/crd/bases/managedupgrade.appuio.io_upgradeconfigs.yaml

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,68 @@ spec:
145145
x-kubernetes-map-type: atomic
146146
type: object
147147
type: array
148+
pauseMachineConfigPoolsOnFailure:
149+
description: |-
150+
PauseMachineConfigPoolsOnFailure allows pausing machine config pools on upgrade job failure.
151+
Allows further investigation and manual remediation by the cluster administrator.
152+
properties:
153+
enabled:
154+
description: |-
155+
Enabled defines whether to pause machine config pools on upgrade job failure.
156+
If true, the controller will pause machine config pools matching the selector if an upgrade job fails.
157+
Machine config pools should then be unpaused manually by the cluster administrator after the underlying issue has been resolved.
158+
Machine config pools that are already done updating or that are already paused are not affected.
159+
type: boolean
160+
selector:
161+
description: |-
162+
Selector defines the labels to match the machine config pools to pause on failure.
163+
If empty, all machine config pools are matched.
164+
properties:
165+
matchExpressions:
166+
description: matchExpressions is a list of label
167+
selector requirements. The requirements are
168+
ANDed.
169+
items:
170+
description: |-
171+
A label selector requirement is a selector that contains values, a key, and an operator that
172+
relates the key and values.
173+
properties:
174+
key:
175+
description: key is the label key that the
176+
selector applies to.
177+
type: string
178+
operator:
179+
description: |-
180+
operator represents a key's relationship to a set of values.
181+
Valid operators are In, NotIn, Exists and DoesNotExist.
182+
type: string
183+
values:
184+
description: |-
185+
values is an array of string values. If the operator is In or NotIn,
186+
the values array must be non-empty. If the operator is Exists or DoesNotExist,
187+
the values array must be empty. This array is replaced during a strategic
188+
merge patch.
189+
items:
190+
type: string
191+
type: array
192+
x-kubernetes-list-type: atomic
193+
required:
194+
- key
195+
- operator
196+
type: object
197+
type: array
198+
x-kubernetes-list-type: atomic
199+
matchLabels:
200+
additionalProperties:
201+
type: string
202+
description: |-
203+
matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
204+
map is equivalent to an element of matchExpressions, whose key field is "key", the
205+
operator is "In", and the values array contains only "value". The requirements are ANDed.
206+
type: object
207+
type: object
208+
x-kubernetes-map-type: atomic
209+
type: object
148210
postUpgradeHealthChecks:
149211
description: PostUpgradeHealthChecks defines the health
150212
checks to be performed after the upgrade

config/crd/bases/managedupgrade.appuio.io_upgradejobs.yaml

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,67 @@ spec:
117117
x-kubernetes-map-type: atomic
118118
type: object
119119
type: array
120+
pauseMachineConfigPoolsOnFailure:
121+
description: |-
122+
PauseMachineConfigPoolsOnFailure allows pausing machine config pools on upgrade job failure.
123+
Allows further investigation and manual remediation by the cluster administrator.
124+
properties:
125+
enabled:
126+
description: |-
127+
Enabled defines whether to pause machine config pools on upgrade job failure.
128+
If true, the controller will pause machine config pools matching the selector if an upgrade job fails.
129+
Machine config pools should then be unpaused manually by the cluster administrator after the underlying issue has been resolved.
130+
Machine config pools that are already done updating or that are already paused are not affected.
131+
type: boolean
132+
selector:
133+
description: |-
134+
Selector defines the labels to match the machine config pools to pause on failure.
135+
If empty, all machine config pools are matched.
136+
properties:
137+
matchExpressions:
138+
description: matchExpressions is a list of label selector
139+
requirements. The requirements are ANDed.
140+
items:
141+
description: |-
142+
A label selector requirement is a selector that contains values, a key, and an operator that
143+
relates the key and values.
144+
properties:
145+
key:
146+
description: key is the label key that the selector
147+
applies to.
148+
type: string
149+
operator:
150+
description: |-
151+
operator represents a key's relationship to a set of values.
152+
Valid operators are In, NotIn, Exists and DoesNotExist.
153+
type: string
154+
values:
155+
description: |-
156+
values is an array of string values. If the operator is In or NotIn,
157+
the values array must be non-empty. If the operator is Exists or DoesNotExist,
158+
the values array must be empty. This array is replaced during a strategic
159+
merge patch.
160+
items:
161+
type: string
162+
type: array
163+
x-kubernetes-list-type: atomic
164+
required:
165+
- key
166+
- operator
167+
type: object
168+
type: array
169+
x-kubernetes-list-type: atomic
170+
matchLabels:
171+
additionalProperties:
172+
type: string
173+
description: |-
174+
matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels
175+
map is equivalent to an element of matchExpressions, whose key field is "key", the
176+
operator is "In", and the values array contains only "value". The requirements are ANDed.
177+
type: object
178+
type: object
179+
x-kubernetes-map-type: atomic
180+
type: object
120181
postUpgradeHealthChecks:
121182
description: PostUpgradeHealthChecks defines the health checks
122183
to be performed after the upgrade

controllers/upgrade_information_collector.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ import (
2222
)
2323

2424
//+kubebuilder:rbac:groups=config.openshift.io,resources=clusterversions,verbs=get;list;watch;update;patch
25-
//+kubebuilder:rbac:groups=machineconfiguration.openshift.io,resources=machineconfigpools,verbs=get;list;watch;update;patch
25+
//+kubebuilder:rbac:groups=machineconfiguration.openshift.io,resources=machineconfigpools,verbs=get;list;watch
2626

2727
var clusterUpgradingDesc = prometheus.NewDesc(
2828
MetricsNamespace+"_cluster_upgrading",

controllers/upgradejob_controller.go

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ const (
6161
)
6262

6363
//+kubebuilder:rbac:groups=config.openshift.io,resources=clusterversions,verbs=get;list;watch;update;patch
64-
//+kubebuilder:rbac:groups=machineconfiguration.openshift.io,resources=machineconfigpools,verbs=get;list;watch
64+
//+kubebuilder:rbac:groups=machineconfiguration.openshift.io,resources=machineconfigpools,verbs=get;list;watch;update;patch
6565

6666
//+kubebuilder:rbac:groups=managedupgrade.appuio.io,resources=upgradejobs,verbs=get;list;watch;create;update;patch;delete
6767
//+kubebuilder:rbac:groups=managedupgrade.appuio.io,resources=upgradejobs/status,verbs=get;update;patch
@@ -113,7 +113,12 @@ func (r *UpgradeJobReconciler) Reconcile(ctx context.Context, req ctrl.Request)
113113
// Don't execute hooks created after the job was finished.
114114
_, efaerr := r.executeHooks(ctx, &uj, managedupgradev1beta1.EventFailure, noTrackingKey, eventInfoWithReason(fc.Reason), fc.LastTransitionTime.Time)
115115
_, efierr := r.executeHooks(ctx, &uj, managedupgradev1beta1.EventFinish, noTrackingKey, eventInfoWithReason(fc.Reason), fc.LastTransitionTime.Time)
116-
return ctrl.Result{}, multierr.Combine(efaerr, efierr, r.cleanupLock(ctx, uj))
116+
return ctrl.Result{}, multierr.Combine(
117+
efaerr,
118+
efierr,
119+
r.pauseMachinePoolsOnFailure(ctx, uj),
120+
r.cleanupLock(ctx, uj),
121+
)
117122
}
118123

119124
cont, err := r.executeHooks(ctx, &uj, managedupgradev1beta1.EventCreate, noTrackingKey, eventInfoWithReason(""), time.Time{})
@@ -1106,6 +1111,53 @@ func (r *UpgradeJobReconciler) cleanupMachineConfigPools(ctx context.Context, uj
11061111
return multierr.Combine(errs...)
11071112
}
11081113

1114+
func (r *UpgradeJobReconciler) pauseMachinePoolsOnFailure(ctx context.Context, uj managedupgradev1beta1.UpgradeJob) error {
1115+
l := log.FromContext(ctx).WithName("UpgradeJobReconciler.pauseMachinePoolsOnFailure")
1116+
1117+
if !uj.Spec.PauseMachineConfigPoolsOnFailure.Enabled {
1118+
return nil
1119+
}
1120+
if cond := apimeta.FindStatusCondition(uj.Status.Conditions, managedupgradev1beta1.UpgradeJobConditionMachineConfigPoolsPaused); cond != nil &&
1121+
cond.Reason == managedupgradev1beta1.UpgradeJobReasonPausedOnFailure && cond.Status == metav1.ConditionTrue {
1122+
l.Info("machine config pools already paused on failure")
1123+
return nil
1124+
}
1125+
1126+
selector, err := metav1.LabelSelectorAsSelector(&uj.Spec.PauseMachineConfigPoolsOnFailure.Selector)
1127+
if err != nil {
1128+
return fmt.Errorf("failed to parse machine config pool selector: %w", err)
1129+
}
1130+
var mcpl machineconfigurationv1.MachineConfigPoolList
1131+
if err := r.List(ctx, &mcpl, client.MatchingLabelsSelector{Selector: selector}); err != nil {
1132+
return fmt.Errorf("failed to list machine config pools: %w", err)
1133+
}
1134+
var errs []error
1135+
for _, mcp := range mcpl.Items {
1136+
if mcp.Spec.Paused {
1137+
continue
1138+
}
1139+
if mcp.Status.MachineCount == mcp.Status.UpdatedMachineCount {
1140+
continue
1141+
}
1142+
l.Info("pausing machine config pool due to upgrade job failure", "pool", mcp.Name)
1143+
mcp.Spec.Paused = true
1144+
if err := r.Update(ctx, &mcp); err != nil {
1145+
errs = append(errs, fmt.Errorf("failed to pause machine config pool %q: %w", mcp.Name, err))
1146+
}
1147+
}
1148+
if changed := r.setStatusCondition(&uj.Status.Conditions, metav1.Condition{
1149+
Type: managedupgradev1beta1.UpgradeJobConditionMachineConfigPoolsPaused,
1150+
Status: metav1.ConditionTrue,
1151+
Reason: managedupgradev1beta1.UpgradeJobReasonPausedOnFailure,
1152+
}); changed {
1153+
if err := r.Status().Update(ctx, &uj); err != nil {
1154+
errs = append(errs, fmt.Errorf("failed to update upgrade job status: %w", err))
1155+
}
1156+
}
1157+
1158+
return multierr.Combine(errs...)
1159+
}
1160+
11091161
func eventInfoWithReason(reason string) map[string]any {
11101162
return map[string]any{
11111163
"reason": reason,

0 commit comments

Comments
 (0)