Skip to content

Commit 1b503cf

Browse files
committed
ctrl: nro: handle paused MCPs
The situtaion is mainly mitigated in the kubeletconfig controller, however we have a bug in the NRO controller such that it puts the NRO CR in progressing state because paused MCP is not in updated condition. This commit ignores checking whether a paused MCP is up-to-date, and introduces a new status condition to report paused MCPs if exist. Signed-off-by: Shereen Haj <shajmakh@redhat.com>
1 parent b2fa2e0 commit 1b503cf

6 files changed

Lines changed: 174 additions & 16 deletions

File tree

internal/controller/numaresourcesoperator_controller.go

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222
"reflect"
23+
"strings"
2324
"time"
2425

2526
appsv1 "k8s.io/api/apps/v1"
@@ -31,6 +32,7 @@ import (
3132
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3233
"k8s.io/apimachinery/pkg/labels"
3334
"k8s.io/apimachinery/pkg/runtime"
35+
"k8s.io/apimachinery/pkg/util/sets"
3436
"k8s.io/client-go/tools/record"
3537
"k8s.io/klog/v2"
3638

@@ -141,7 +143,7 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
141143

142144
initialStatus := *instance.Status.DeepCopy()
143145
if len(initialStatus.Conditions) == 0 {
144-
instance.Status.Conditions = status.DefaultBaseConditions(time.Now())
146+
instance.Status.Conditions = status.NewNUMAResourcesOperatorConditions()
145147
}
146148

147149
if req.Name != objectnames.DefaultNUMAResourcesOperatorCrName {
@@ -241,7 +243,7 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Conte
241243
func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, existing *rtestate.ExistingManifests, trees []nodegroupv1.Tree) intreconcile.Step {
242244
// we need to sync machine configs first and wait for the MachineConfigPool updates
243245
// before checking additional components for updates
244-
mcpUpdatedFunc, err := r.syncMachineConfigs(ctx, instance, existing, trees)
246+
mcpUpdatedFunc, pausedMCPs, err := r.syncMachineConfigs(ctx, instance, existing, trees)
245247
if err != nil {
246248
r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedMCSync", "Failed to set up machine configuration for worker nodes: %v", err)
247249
err = fmt.Errorf("failed to sync machine configs: %w", err)
@@ -251,7 +253,7 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx con
251253

252254
// MCO needs to update the SELinux context removal and other stuff, and need to trigger a reboot.
253255
// It can take a while.
254-
mcpStatuses, mcpNamePending := syncMachineConfigPoolsStatuses(instance.Name, trees, r.ForwardMCPConds, mcpUpdatedFunc)
256+
mcpStatuses, mcpNamePending := syncMachineConfigPoolsStatuses(instance.Name, trees, r.ForwardMCPConds, mcpUpdatedFunc, pausedMCPs)
255257
instance.Status.MachineConfigPools = mcpStatuses
256258

257259
if mcpNamePending != "" {
@@ -260,6 +262,8 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx con
260262
}
261263
instance.Status.MachineConfigPools = syncMachineConfigPoolNodeGroupConfigStatuses(instance.Status.MachineConfigPools, trees)
262264

265+
updateMachineConfigPoolPausedCondition(instance.Status.Conditions, instance.Generation, pausedMCPs)
266+
263267
return intreconcile.StepSuccess()
264268
}
265269

@@ -399,7 +403,7 @@ func (r *NUMAResourcesOperatorReconciler) syncNodeResourceTopologyAPI(ctx contex
399403
return (updatedCount == len(objStates)), err
400404
}
401405

402-
func (r *NUMAResourcesOperatorReconciler) syncMachineConfigs(ctx context.Context, instance *nropv1.NUMAResourcesOperator, existing *rtestate.ExistingManifests, trees []nodegroupv1.Tree) (rtestate.MCPWaitForUpdatedFunc, error) {
406+
func (r *NUMAResourcesOperatorReconciler) syncMachineConfigs(ctx context.Context, instance *nropv1.NUMAResourcesOperator, existing *rtestate.ExistingManifests, trees []nodegroupv1.Tree) (rtestate.MCPWaitForUpdatedFunc, sets.Set[string], error) {
403407
klog.V(4).InfoS("Machine Config Sync start", "trees", len(trees))
404408
defer klog.V(4).Info("Machine Config Sync stop")
405409

@@ -409,7 +413,7 @@ func (r *NUMAResourcesOperatorReconciler) syncMachineConfigs(ctx context.Context
409413
// In case of operator upgrade from 4.1X → 4.18, it's necessary to remove the old MachineConfig,
410414
// unless an emergency annotation is provided which forces the operator to use custom policy
411415

412-
objStates, waitFunc := existing.MachineConfigsState(r.RTEManifests)
416+
objStates, waitFunc, pausedMCPs := existing.MachineConfigsState(r.RTEManifests)
413417
for _, objState := range objStates {
414418
klog.InfoS("objState", "desired", objState.Desired, "existing", objState.Existing, "createOrUpdate", objState.IsCreateOrUpdate())
415419
if objState.IsCreateOrUpdate() {
@@ -429,10 +433,10 @@ func (r *NUMAResourcesOperatorReconciler) syncMachineConfigs(ctx context.Context
429433
break
430434
}
431435
}
432-
return waitFunc, err
436+
return waitFunc, pausedMCPs, err
433437
}
434438

435-
func syncMachineConfigPoolsStatuses(instanceName string, trees []nodegroupv1.Tree, forwardMCPConds bool, updatedFunc rtestate.MCPWaitForUpdatedFunc) ([]nropv1.MachineConfigPool, string) {
439+
func syncMachineConfigPoolsStatuses(instanceName string, trees []nodegroupv1.Tree, forwardMCPConds bool, updatedFunc rtestate.MCPWaitForUpdatedFunc, pausedMCPs sets.Set[string]) ([]nropv1.MachineConfigPool, string) {
436440
klog.V(4).InfoS("Machine Config Status Sync start", "trees", len(trees))
437441
defer klog.V(4).Info("Machine Config Status Sync stop")
438442

@@ -441,6 +445,11 @@ func syncMachineConfigPoolsStatuses(instanceName string, trees []nodegroupv1.Tre
441445
for _, mcp := range tree.MachineConfigPools {
442446
mcpStatuses = append(mcpStatuses, extractMCPStatus(mcp, forwardMCPConds))
443447

448+
if pausedMCPs.Has(mcp.Name) {
449+
klog.V(5).InfoS("Paused MachineConfigPool detected", "name", mcp.Name)
450+
continue
451+
}
452+
444453
isUpdated := updatedFunc(instanceName, mcp)
445454
klog.V(5).InfoS("Machine Config Pool state", "name", mcp.Name, "instance", instanceName, "updated", isUpdated)
446455

@@ -813,3 +822,21 @@ func getTreesByNodeGroup(ctx context.Context, cli client.Client, nodeGroups []nr
813822
return nil, fmt.Errorf("unsupported platform")
814823
}
815824
}
825+
826+
func updateMachineConfigPoolPausedCondition(conditions []metav1.Condition, generation int64, pausedMCPs sets.Set[string]) {
827+
pausedStatus := metav1.ConditionFalse
828+
message := ""
829+
if pausedMCPs.Len() > 0 {
830+
pausedStatus = metav1.ConditionTrue
831+
message = "detected paused MCPs: " + strings.Join(pausedMCPs.UnsortedList(), ", ")
832+
}
833+
condition := metav1.Condition{
834+
Type: status.ConditionMachineConfigPoolPaused,
835+
Status: pausedStatus,
836+
Reason: status.ConditionMachineConfigPoolPaused,
837+
Message: message,
838+
ObservedGeneration: generation,
839+
LastTransitionTime: metav1.Now(),
840+
}
841+
status.UpdateConditionsInPlace(conditions, condition, time.Time{})
842+
}

internal/controller/numaresourcesoperator_controller_test.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2195,6 +2195,105 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
21952195
Expect(apierrors.IsNotFound(err)).To(BeTrue(), "MachineConfig %s expected to be deleted; err=%v", mc2Key.Name, err)
21962196
})
21972197
})
2198+
2199+
Context("with status condition updates", func() {
2200+
var nro *nropv1.NUMAResourcesOperator
2201+
var mcp *machineconfigv1.MachineConfigPool
2202+
var reconciler *NUMAResourcesOperatorReconciler
2203+
var label map[string]string
2204+
var key client.ObjectKey
2205+
var err error
2206+
2207+
ctx := context.TODO()
2208+
2209+
BeforeEach(func() {
2210+
label = map[string]string{
2211+
"test1": "test1",
2212+
}
2213+
2214+
ng := nropv1.NodeGroup{
2215+
MachineConfigPoolSelector: &metav1.LabelSelector{
2216+
MatchLabels: label,
2217+
},
2218+
}
2219+
nro = testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng)
2220+
key = client.ObjectKeyFromObject(nro)
2221+
2222+
mcp = testobjs.NewMachineConfigPool("test1", label, &metav1.LabelSelector{MatchLabels: label}, &metav1.LabelSelector{MatchLabels: label})
2223+
2224+
reconciler, err = NewFakeNUMAResourcesOperatorReconciler(platform.OpenShift, defaultOCPVersion, nro, mcp)
2225+
Expect(err).ToNot(HaveOccurred())
2226+
2227+
result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
2228+
Expect(err).ToNot(HaveOccurred())
2229+
Expect(result).To(Equal(reconcile.Result{}))
2230+
})
2231+
It("should report progressing condition for un-paused MCPs but in updating state", func() {
2232+
Expect(reconciler.Client.Get(ctx, key, nro)).To(Succeed())
2233+
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed())
2234+
mcp.Status.Conditions = []machineconfigv1.MachineConfigPoolCondition{
2235+
{
2236+
Type: machineconfigv1.MachineConfigPoolUpdated,
2237+
Status: corev1.ConditionFalse,
2238+
},
2239+
}
2240+
2241+
Expect(reconciler.Client.Update(ctx, mcp)).To(Succeed())
2242+
result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
2243+
Expect(err).ToNot(HaveOccurred())
2244+
Expect(result).To(Equal(reconcile.Result{RequeueAfter: time.Minute}))
2245+
2246+
Expect(reconciler.Client.Get(ctx, key, nro)).To(Succeed())
2247+
2248+
progressingCondition := getConditionByType(nro.Status.Conditions, status.ConditionProgressing)
2249+
Expect(progressingCondition.Status).To(Equal(metav1.ConditionTrue))
2250+
Expect(progressingCondition.Reason).To(Equal("MachineConfigPoolIsUpdating"))
2251+
Expect(progressingCondition.Message).To(ContainSubstring("test1 is updating"))
2252+
})
2253+
It("should not report progressing condition for paused MCPs", func() {
2254+
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed())
2255+
mcp.Status.Conditions = []machineconfigv1.MachineConfigPoolCondition{
2256+
{
2257+
Type: machineconfigv1.MachineConfigPoolUpdated,
2258+
Status: corev1.ConditionFalse,
2259+
},
2260+
}
2261+
mcp.Spec.Paused = true
2262+
2263+
Expect(reconciler.Client.Update(ctx, mcp)).To(Succeed())
2264+
result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
2265+
Expect(err).ToNot(HaveOccurred())
2266+
Expect(result).To(Equal(reconcile.Result{}))
2267+
2268+
Expect(reconciler.Client.Get(ctx, key, nro)).To(Succeed())
2269+
2270+
progressingCondition := getConditionByType(nro.Status.Conditions, status.ConditionProgressing)
2271+
Expect(progressingCondition.Status).To(Equal(metav1.ConditionFalse), "Progressing condition should be false because the MCP is paused, got conditions: %v", nro.Status.Conditions)
2272+
})
2273+
2274+
It("should report MachineConfigPoolPaused condition for paused MCPs", func() {
2275+
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed())
2276+
mcp.Status.Conditions = []machineconfigv1.MachineConfigPoolCondition{
2277+
{
2278+
Type: machineconfigv1.MachineConfigPoolUpdated,
2279+
Status: corev1.ConditionFalse,
2280+
},
2281+
}
2282+
mcp.Spec.Paused = true
2283+
2284+
Expect(reconciler.Client.Update(ctx, mcp)).To(Succeed())
2285+
result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
2286+
Expect(err).ToNot(HaveOccurred())
2287+
Expect(result).To(Equal(reconcile.Result{}))
2288+
2289+
Expect(reconciler.Client.Get(ctx, key, nro)).To(Succeed())
2290+
2291+
pausedCondition := getConditionByType(nro.Status.Conditions, status.ConditionMachineConfigPoolPaused)
2292+
Expect(pausedCondition.Status).To(Equal(metav1.ConditionTrue), "Paused condition should be true because the MCP is paused, got conditions: %v", nro.Status.Conditions)
2293+
Expect(pausedCondition.Reason).To(Equal(status.ConditionMachineConfigPoolPaused))
2294+
Expect(pausedCondition.Message).To(ContainSubstring("detected paused MCPs: test1"))
2295+
})
2296+
})
21982297
})
21992298
})
22002299

internal/controller/numaresourcesscheduler_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct
117117

118118
initialStatus := *instance.Status.DeepCopy()
119119
if len(initialStatus.Conditions) == 0 {
120-
instance.Status.Conditions = status.NewNUMAResourcesSchedulerBaseConditions()
120+
instance.Status.Conditions = status.NewNUMAResourcesSchedulerConditions()
121121
}
122122

123123
if req.Name != objectnames.DefaultNUMAResourcesSchedulerCrName {

pkg/objectstate/rte/machineconfigpool.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"fmt"
2222

2323
corev1 "k8s.io/api/core/v1"
24+
"k8s.io/apimachinery/pkg/util/sets"
2425
"k8s.io/klog/v2"
2526

2627
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -129,14 +130,20 @@ func MatchMachineConfigPoolCondition(conditions []machineconfigv1.MachineConfigP
129130

130131
type MCPWaitForUpdatedFunc func(string, *machineconfigv1.MachineConfigPool) bool
131132

132-
func (em *ExistingManifests) MachineConfigsState(mf Manifests) ([]objectstate.ObjectState, MCPWaitForUpdatedFunc) {
133+
func (em *ExistingManifests) MachineConfigsState(mf Manifests) ([]objectstate.ObjectState, MCPWaitForUpdatedFunc, sets.Set[string]) {
134+
pausedMCPs := sets.New[string]()
133135
var ret []objectstate.ObjectState
134136
if mf.Core.MachineConfig == nil {
135-
return ret, nullMachineConfigPoolUpdated
137+
return ret, nullMachineConfigPoolUpdated, pausedMCPs
136138
}
137139
enabledMCCount := 0
138140
for _, tree := range em.trees {
139141
for _, mcp := range tree.MachineConfigPools {
142+
if mcp.Spec.Paused {
143+
pausedMCPs.Insert(mcp.Name)
144+
continue
145+
}
146+
140147
mcName := objectnames.GetMachineConfigName(em.instance.Name, mcp.Name)
141148
if mcp.Spec.MachineConfigSelector == nil {
142149
klog.Warningf("the machine config pool %q does not have machine config selector", mcp.Name)
@@ -183,10 +190,16 @@ func (em *ExistingManifests) MachineConfigsState(mf Manifests) ([]objectstate.Ob
183190
}
184191

185192
klog.V(4).InfoS("machineConfigsState", "enabledMachineConfigs", enabledMCCount)
193+
// TODO: the API design allows to configure custom annotation per nodegroup,
194+
// meaning that the function that checks if an MCP is updated or not is different
195+
// in each case. This should be refactored to adapt the flexible configuration or
196+
// get consensus on the API design to correct this. The problem with the current
197+
// approach is that nodegroups with default selinux will need to have the custom
198+
// machine config, which is never met.
186199
if enabledMCCount > 0 {
187-
return ret, IsMachineConfigPoolUpdated
200+
return ret, IsMachineConfigPoolUpdated, pausedMCPs
188201
}
189-
return ret, IsMachineConfigPoolUpdatedAfterDeletion
202+
return ret, IsMachineConfigPoolUpdatedAfterDeletion, pausedMCPs
190203
}
191204

192205
func nullMachineConfigPoolUpdated(instanceName string, mcp *machineconfigv1.MachineConfigPool) bool {

pkg/status/status.go

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,15 @@ const (
3636
)
3737

3838
const (
39+
// scheduler conditions
3940
ConditionDedicatedInformerActive = "DedicatedInformerActive"
4041
)
4142

43+
const (
44+
// operator conditions
45+
ConditionMachineConfigPoolPaused = "MachineConfigPoolPaused"
46+
)
47+
4248
// TODO: are we duping these?
4349
const (
4450
ReasonAsExpected = "AsExpected"
@@ -181,9 +187,9 @@ func DefaultBaseConditions(timestamp time.Time) []metav1.Condition {
181187
}
182188
}
183189

184-
// NewNUMAResourcesSchedulerBaseConditions creates specific conditions on
190+
// NewNUMAResourcesSchedulerConditions creates specific scheduler conditions on
185191
// top of NewBaseConditions.
186-
func NewNUMAResourcesSchedulerBaseConditions() []metav1.Condition {
192+
func NewNUMAResourcesSchedulerConditions() []metav1.Condition {
187193
now := time.Now()
188194
conds := append(DefaultBaseConditions(now), metav1.Condition{
189195
Type: ConditionDedicatedInformerActive,
@@ -194,6 +200,19 @@ func NewNUMAResourcesSchedulerBaseConditions() []metav1.Condition {
194200
return conds
195201
}
196202

203+
// NewNUMAResourcesOperatorConditions creates specific operator conditions on
204+
// top of NewBaseConditions.
205+
func NewNUMAResourcesOperatorConditions() []metav1.Condition {
206+
now := time.Now()
207+
conds := append(DefaultBaseConditions(now), metav1.Condition{
208+
Type: ConditionMachineConfigPoolPaused,
209+
Status: metav1.ConditionUnknown,
210+
LastTransitionTime: metav1.Time{Time: now},
211+
Reason: ConditionMachineConfigPoolPaused,
212+
})
213+
return conds
214+
}
215+
197216
// CloneConditions creates a deep copy of the given `conditions`.
198217
func CloneConditions(conditions []metav1.Condition) []metav1.Condition {
199218
var c = make([]metav1.Condition, len(conditions))

pkg/status/status_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ func TestUpdateConditionsInPlace(t *testing.T) {
331331
}{
332332
{
333333
name: "first reconcile iteration - with operator condition",
334-
conditions: NewNUMAResourcesSchedulerBaseConditions(),
334+
conditions: NewNUMAResourcesSchedulerConditions(),
335335
condition: metav1.Condition{
336336
Type: ConditionAvailable,
337337
Status: metav1.ConditionTrue,
@@ -372,7 +372,7 @@ func TestUpdateConditionsInPlace(t *testing.T) {
372372
},
373373
{
374374
name: "first reconcile iteration - with informer condition",
375-
conditions: NewNUMAResourcesSchedulerBaseConditions(),
375+
conditions: NewNUMAResourcesSchedulerConditions(),
376376
condition: metav1.Condition{
377377
Type: ConditionDedicatedInformerActive,
378378
Status: metav1.ConditionTrue,

0 commit comments

Comments
 (0)