Skip to content

Commit 1117ff7

Browse files
committed
Apply offboarding taint once VMs are evicted
Gate on Evicting=False to avoid racing with live-migration. Watch Hypervisor status changes (evictingConditionChangedPredicate) so the taint is applied promptly when the condition transitions.
1 parent 441452b commit 1117ff7

3 files changed

Lines changed: 144 additions & 1 deletion

File tree

internal/controller/constants.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,4 +21,9 @@ package controller
2121
const (
2222
labelHypervisor = "nova.openstack.cloud.sap/virt-driver"
2323
testAggregateName = "tenant_filter_tests"
24+
25+
// taintKeyOffboarding is used as a NoExecute taint. nova-compute and
26+
// neutron agent pods do not tolerate it (the kvm-node-agent and the
27+
// signalling pod do), so applying it forces those agents off the node.
28+
taintKeyOffboarding = "kvm.cloud.sap/offboarding"
2429
)

internal/controller/gardener_node_lifecycle_controller.go

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,13 @@ import (
3333
v1 "k8s.io/client-go/applyconfigurations/meta/v1"
3434
policyv1ac "k8s.io/client-go/applyconfigurations/policy/v1"
3535
ctrl "sigs.k8s.io/controller-runtime"
36+
"sigs.k8s.io/controller-runtime/pkg/builder"
3637
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
3738
"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
39+
"sigs.k8s.io/controller-runtime/pkg/event"
40+
"sigs.k8s.io/controller-runtime/pkg/handler"
3841
logger "sigs.k8s.io/controller-runtime/pkg/log"
42+
"sigs.k8s.io/controller-runtime/pkg/predicate"
3943

4044
kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
4145
)
@@ -85,6 +89,19 @@ func (r *GardenerNodeLifecycleController) Reconcile(ctx context.Context, req ctr
8589
return ctrl.Result{}, nil
8690
}
8791

92+
// Apply the offboarding taint once VMs are gone; gate on Evicting=False
93+
// to avoid racing with live-migration.
94+
if hv.Spec.Maintenance == kvmv1.MaintenanceTermination &&
95+
meta.IsStatusConditionFalse(hv.Status.Conditions, kvmv1.ConditionTypeEvicting) {
96+
patched, err := r.ensureOffboardingTaint(ctx, node)
97+
if err != nil {
98+
return ctrl.Result{}, fmt.Errorf("failed to ensure offboarding taint: %w", err)
99+
}
100+
if patched {
101+
return ctrl.Result{}, nil
102+
}
103+
}
104+
88105
// We do not care about the particular value, as long as it isn't an error
89106
var minAvailable int32 = 1
90107

@@ -115,6 +132,30 @@ func (r *GardenerNodeLifecycleController) Reconcile(ctx context.Context, req ctr
115132
return ctrl.Result{}, nil
116133
}
117134

135+
// ensureOffboardingTaint adds the offboarding NoExecute taint if not already
136+
// present. Returns true when a patch was issued (caller should return early).
137+
func (r *GardenerNodeLifecycleController) ensureOffboardingTaint(ctx context.Context, node *corev1.Node) (bool, error) {
138+
for _, t := range node.Spec.Taints {
139+
if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute {
140+
return false, nil
141+
}
142+
}
143+
144+
log := logger.FromContext(ctx)
145+
log.Info("Adding offboarding taint to node",
146+
"node", node.Name,
147+
"taint", taintKeyOffboarding,
148+
"effect", corev1.TaintEffectNoExecute)
149+
150+
// StrategicMergeFrom merges taints by key, preserving concurrent additions.
151+
patch := k8sclient.StrategicMergeFrom(node.DeepCopy())
152+
node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{
153+
Key: taintKeyOffboarding,
154+
Effect: corev1.TaintEffectNoExecute,
155+
})
156+
return true, r.Patch(ctx, node, patch, k8sclient.FieldOwner(MaintenanceControllerName))
157+
}
158+
118159
func (r *GardenerNodeLifecycleController) ensureBlockingPodDisruptionBudget(ctx context.Context, node *corev1.Node, minAvailable int32) error {
119160
name := nameForNode(node)
120161
nodeLabels := labelsForNode(node)
@@ -226,10 +267,48 @@ func (r *GardenerNodeLifecycleController) SetupWithManager(mgr ctrl.Manager, nam
226267
_ = logger.FromContext(ctx)
227268
r.namespace = namespace
228269

270+
hypervisorToNode := handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &corev1.Node{})
271+
272+
// Maintenance=termination bumps generation; Evicting status changes do not.
273+
hypervisorRelevantChange := predicate.Or(
274+
predicate.GenerationChangedPredicate{},
275+
evictingConditionChangedPredicate{},
276+
)
277+
229278
return ctrl.NewControllerManagedBy(mgr).
230279
Named(MaintenanceControllerName).
231280
For(&corev1.Node{}).
232-
Owns(&appsv1.Deployment{}). // trigger the r.Reconcile whenever an Own-ed deployment is created/updated/deleted
281+
Watches(&kvmv1.Hypervisor{}, hypervisorToNode,
282+
builder.WithPredicates(hypervisorRelevantChange),
283+
).
284+
Owns(&appsv1.Deployment{}).
233285
Owns(&policyv1.PodDisruptionBudget{}).
234286
Complete(r)
235287
}
288+
289+
// evictingConditionChangedPredicate complements GenerationChangedPredicate,
290+
// which ignores status-only updates.
291+
type evictingConditionChangedPredicate struct {
292+
predicate.Funcs
293+
}
294+
295+
func (evictingConditionChangedPredicate) Update(e event.UpdateEvent) bool {
296+
if e.ObjectOld == nil || e.ObjectNew == nil {
297+
return false
298+
}
299+
oldHv, ok1 := e.ObjectOld.(*kvmv1.Hypervisor)
300+
newHv, ok2 := e.ObjectNew.(*kvmv1.Hypervisor)
301+
if !ok1 || !ok2 {
302+
return false
303+
}
304+
oldCond := meta.FindStatusCondition(oldHv.Status.Conditions, kvmv1.ConditionTypeEvicting)
305+
newCond := meta.FindStatusCondition(newHv.Status.Conditions, kvmv1.ConditionTypeEvicting)
306+
switch {
307+
case oldCond == nil && newCond == nil:
308+
return false
309+
case oldCond == nil || newCond == nil:
310+
return true
311+
default:
312+
return oldCond.Status != newCond.Status
313+
}
314+
}

internal/controller/gardener_node_lifecycle_controller_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,4 +207,63 @@ var _ = Describe("Gardener Maintenance Controller", func() {
207207
Expect(pdb.Spec.MinAvailable).To(HaveField("IntVal", BeNumerically("==", int32(0))))
208208
})
209209
})
210+
211+
Context("Offboarding taint", func() {
212+
findOffboardingTaint := func(node *corev1.Node) *corev1.Taint {
213+
for i := range node.Spec.Taints {
214+
t := &node.Spec.Taints[i]
215+
if t.Key == taintKeyOffboarding && t.Effect == corev1.TaintEffectNoExecute {
216+
return t
217+
}
218+
}
219+
return nil
220+
}
221+
222+
When("the hypervisor is in maintenance termination and the VMs have been evicted", func() {
223+
BeforeEach(func(ctx SpecContext) {
224+
hypervisor := &kvmv1.Hypervisor{}
225+
Expect(k8sClient.Get(ctx, name, hypervisor)).To(Succeed())
226+
hypervisor.Spec.Maintenance = kvmv1.MaintenanceTermination
227+
Expect(k8sClient.Update(ctx, hypervisor)).To(Succeed())
228+
229+
meta.SetStatusCondition(&hypervisor.Status.Conditions, metav1.Condition{
230+
Type: kvmv1.ConditionTypeEvicting,
231+
Status: metav1.ConditionFalse,
232+
Reason: "Succeeded",
233+
Message: "All VMs evicted",
234+
})
235+
Expect(k8sClient.Status().Update(ctx, hypervisor)).To(Succeed())
236+
})
237+
238+
It("should add the offboarding NoExecute taint to the node", func(ctx SpecContext) {
239+
_, err := controller.Reconcile(ctx, reconcileReq)
240+
Expect(err).NotTo(HaveOccurred())
241+
242+
node := &corev1.Node{}
243+
Expect(k8sClient.Get(ctx, name, node)).To(Succeed())
244+
taint := findOffboardingTaint(node)
245+
Expect(taint).NotTo(BeNil())
246+
Expect(taint.Key).To(Equal(taintKeyOffboarding))
247+
Expect(taint.Effect).To(Equal(corev1.TaintEffectNoExecute))
248+
})
249+
250+
It("should be idempotent: a second reconcile must not add a duplicate taint", func(ctx SpecContext) {
251+
_, err := controller.Reconcile(ctx, reconcileReq)
252+
Expect(err).NotTo(HaveOccurred())
253+
_, err = controller.Reconcile(ctx, reconcileReq)
254+
Expect(err).NotTo(HaveOccurred())
255+
256+
node := &corev1.Node{}
257+
Expect(k8sClient.Get(ctx, name, node)).To(Succeed())
258+
259+
count := 0
260+
for _, t := range node.Spec.Taints {
261+
if t.Key == taintKeyOffboarding {
262+
count++
263+
}
264+
}
265+
Expect(count).To(Equal(1))
266+
})
267+
})
268+
})
210269
})

0 commit comments

Comments
 (0)