Skip to content

Commit e25bcee

Browse files
committed
ctrl: nro: handle paused MCPs
The situtaion is mainly mitigated in the kubeletconfig controller, however we have a bug in the NRO controller such that it puts the NRO CR in progressing state because paused MCP is not in updated condition. This commit ignores checking whether a paused MCP is up-to-date, and introduces a new status condition to report paused MCPs if exist. Signed-off-by: Shereen Haj <shajmakh@redhat.com>
1 parent 3358aa5 commit e25bcee

6 files changed

Lines changed: 174 additions & 16 deletions

File tree

internal/controller/numaresourcesoperator_controller.go

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121
"fmt"
2222
"reflect"
23+
"strings"
2324
"time"
2425

2526
appsv1 "k8s.io/api/apps/v1"
@@ -31,6 +32,7 @@ import (
3132
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3233
"k8s.io/apimachinery/pkg/labels"
3334
"k8s.io/apimachinery/pkg/runtime"
35+
"k8s.io/apimachinery/pkg/util/sets"
3436
"k8s.io/client-go/tools/record"
3537
"k8s.io/klog/v2"
3638

@@ -141,7 +143,7 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
141143

142144
initialStatus := *instance.Status.DeepCopy()
143145
if len(initialStatus.Conditions) == 0 {
144-
instance.Status.Conditions = status.DefaultBaseConditions(time.Now())
146+
instance.Status.Conditions = status.NewNUMAResourcesOperatorConditions()
145147
}
146148

147149
if req.Name != objectnames.DefaultNUMAResourcesOperatorCrName {
@@ -236,7 +238,7 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceAPI(ctx context.Conte
236238
func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx context.Context, instance *nropv1.NUMAResourcesOperator, existing *rtestate.ExistingManifests, trees []nodegroupv1.Tree) intreconcile.Step {
237239
// we need to sync machine configs first and wait for the MachineConfigPool updates
238240
// before checking additional components for updates
239-
mcpUpdatedFunc, err := r.syncMachineConfigs(ctx, instance, existing, trees)
241+
mcpUpdatedFunc, pausedMCPs, err := r.syncMachineConfigs(ctx, instance, existing, trees)
240242
if err != nil {
241243
r.Recorder.Eventf(instance, corev1.EventTypeWarning, "FailedMCSync", "Failed to set up machine configuration for worker nodes: %v", err)
242244
err = fmt.Errorf("failed to sync machine configs: %w", err)
@@ -246,7 +248,7 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx con
246248

247249
// MCO needs to update the SELinux context removal and other stuff, and need to trigger a reboot.
248250
// It can take a while.
249-
mcpStatuses, mcpNamePending := syncMachineConfigPoolsStatuses(instance.Name, trees, r.ForwardMCPConds, mcpUpdatedFunc)
251+
mcpStatuses, mcpNamePending := syncMachineConfigPoolsStatuses(instance.Name, trees, r.ForwardMCPConds, mcpUpdatedFunc, pausedMCPs)
250252
instance.Status.MachineConfigPools = mcpStatuses
251253

252254
if mcpNamePending != "" {
@@ -255,6 +257,8 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResourceMachineConfig(ctx con
255257
}
256258
instance.Status.MachineConfigPools = syncMachineConfigPoolNodeGroupConfigStatuses(instance.Status.MachineConfigPools, trees)
257259

260+
updateMachineConfigPoolPausedCondition(instance.Status.Conditions, instance.Generation, pausedMCPs)
261+
258262
return intreconcile.StepSuccess()
259263
}
260264

@@ -390,7 +394,7 @@ func (r *NUMAResourcesOperatorReconciler) syncNodeResourceTopologyAPI(ctx contex
390394
return (updatedCount == len(objStates)), err
391395
}
392396

393-
func (r *NUMAResourcesOperatorReconciler) syncMachineConfigs(ctx context.Context, instance *nropv1.NUMAResourcesOperator, existing *rtestate.ExistingManifests, trees []nodegroupv1.Tree) (rtestate.MCPWaitForUpdatedFunc, error) {
397+
func (r *NUMAResourcesOperatorReconciler) syncMachineConfigs(ctx context.Context, instance *nropv1.NUMAResourcesOperator, existing *rtestate.ExistingManifests, trees []nodegroupv1.Tree) (rtestate.MCPWaitForUpdatedFunc, sets.Set[string], error) {
394398
klog.V(4).InfoS("Machine Config Sync start", "trees", len(trees))
395399
defer klog.V(4).Info("Machine Config Sync stop")
396400

@@ -400,7 +404,7 @@ func (r *NUMAResourcesOperatorReconciler) syncMachineConfigs(ctx context.Context
400404
// In case of operator upgrade from 4.1X → 4.18, it's necessary to remove the old MachineConfig,
401405
// unless an emergency annotation is provided which forces the operator to use custom policy
402406

403-
objStates, waitFunc := existing.MachineConfigsState(r.RTEManifests)
407+
objStates, waitFunc, pausedMCPs := existing.MachineConfigsState(r.RTEManifests)
404408
for _, objState := range objStates {
405409
klog.InfoS("objState", "desired", objState.Desired, "existing", objState.Existing, "createOrUpdate", objState.IsCreateOrUpdate())
406410
if objState.IsCreateOrUpdate() {
@@ -420,10 +424,10 @@ func (r *NUMAResourcesOperatorReconciler) syncMachineConfigs(ctx context.Context
420424
break
421425
}
422426
}
423-
return waitFunc, err
427+
return waitFunc, pausedMCPs, err
424428
}
425429

426-
func syncMachineConfigPoolsStatuses(instanceName string, trees []nodegroupv1.Tree, forwardMCPConds bool, updatedFunc rtestate.MCPWaitForUpdatedFunc) ([]nropv1.MachineConfigPool, string) {
430+
func syncMachineConfigPoolsStatuses(instanceName string, trees []nodegroupv1.Tree, forwardMCPConds bool, updatedFunc rtestate.MCPWaitForUpdatedFunc, pausedMCPs sets.Set[string]) ([]nropv1.MachineConfigPool, string) {
427431
klog.V(4).InfoS("Machine Config Status Sync start", "trees", len(trees))
428432
defer klog.V(4).Info("Machine Config Status Sync stop")
429433

@@ -432,6 +436,11 @@ func syncMachineConfigPoolsStatuses(instanceName string, trees []nodegroupv1.Tre
432436
for _, mcp := range tree.MachineConfigPools {
433437
mcpStatuses = append(mcpStatuses, extractMCPStatus(mcp, forwardMCPConds))
434438

439+
if pausedMCPs.Has(mcp.Name) {
440+
klog.V(5).InfoS("Paused MachineConfigPool detected", "name", mcp.Name)
441+
continue
442+
}
443+
435444
isUpdated := updatedFunc(instanceName, mcp)
436445
klog.V(5).InfoS("Machine Config Pool state", "name", mcp.Name, "instance", instanceName, "updated", isUpdated)
437446

@@ -802,3 +811,21 @@ func getTreesByNodeGroup(ctx context.Context, cli client.Client, nodeGroups []nr
802811
return nil, fmt.Errorf("unsupported platform")
803812
}
804813
}
814+
815+
func updateMachineConfigPoolPausedCondition(conditions []metav1.Condition, generation int64, pausedMCPs sets.Set[string]) {
816+
pausedStatus := metav1.ConditionFalse
817+
message := ""
818+
if pausedMCPs.Len() > 0 {
819+
pausedStatus = metav1.ConditionTrue
820+
message = "detected paused MCPs: " + strings.Join(pausedMCPs.UnsortedList(), ", ")
821+
}
822+
condition := metav1.Condition{
823+
Type: status.ConditionMachineConfigPoolPaused,
824+
Status: pausedStatus,
825+
Reason: status.ConditionMachineConfigPoolPaused,
826+
Message: message,
827+
ObservedGeneration: generation,
828+
LastTransitionTime: metav1.Now(),
829+
}
830+
status.UpdateConditionsInPlace(conditions, condition, time.Time{})
831+
}

internal/controller/numaresourcesoperator_controller_test.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2149,6 +2149,105 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
21492149
Expect(apierrors.IsNotFound(err)).To(BeTrue(), "MachineConfig %s expected to be deleted; err=%v", mc2Key.Name, err)
21502150
})
21512151
})
2152+
2153+
Context("with status condition updates", func() {
2154+
var nro *nropv1.NUMAResourcesOperator
2155+
var mcp *machineconfigv1.MachineConfigPool
2156+
var reconciler *NUMAResourcesOperatorReconciler
2157+
var label map[string]string
2158+
var key client.ObjectKey
2159+
var err error
2160+
2161+
ctx := context.TODO()
2162+
2163+
BeforeEach(func() {
2164+
label = map[string]string{
2165+
"test1": "test1",
2166+
}
2167+
2168+
ng := nropv1.NodeGroup{
2169+
MachineConfigPoolSelector: &metav1.LabelSelector{
2170+
MatchLabels: label,
2171+
},
2172+
}
2173+
nro = testobjs.NewNUMAResourcesOperator(objectnames.DefaultNUMAResourcesOperatorCrName, ng)
2174+
key = client.ObjectKeyFromObject(nro)
2175+
2176+
mcp = testobjs.NewMachineConfigPool("test1", label, &metav1.LabelSelector{MatchLabels: label}, &metav1.LabelSelector{MatchLabels: label})
2177+
2178+
reconciler, err = NewFakeNUMAResourcesOperatorReconciler(platform.OpenShift, defaultOCPVersion, nro, mcp)
2179+
Expect(err).ToNot(HaveOccurred())
2180+
2181+
result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
2182+
Expect(err).ToNot(HaveOccurred())
2183+
Expect(result).To(Equal(reconcile.Result{}))
2184+
})
2185+
It("should report progressing condition for un-paused MCPs but in updating state", func() {
2186+
Expect(reconciler.Client.Get(ctx, key, nro)).To(Succeed())
2187+
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed())
2188+
mcp.Status.Conditions = []machineconfigv1.MachineConfigPoolCondition{
2189+
{
2190+
Type: machineconfigv1.MachineConfigPoolUpdated,
2191+
Status: corev1.ConditionFalse,
2192+
},
2193+
}
2194+
2195+
Expect(reconciler.Client.Update(ctx, mcp)).To(Succeed())
2196+
result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
2197+
Expect(err).ToNot(HaveOccurred())
2198+
Expect(result).To(Equal(reconcile.Result{RequeueAfter: time.Minute}))
2199+
2200+
Expect(reconciler.Client.Get(ctx, key, nro)).To(Succeed())
2201+
2202+
progressingCondition := getConditionByType(nro.Status.Conditions, status.ConditionProgressing)
2203+
Expect(progressingCondition.Status).To(Equal(metav1.ConditionTrue))
2204+
Expect(progressingCondition.Reason).To(Equal("MachineConfigPoolIsUpdating"))
2205+
Expect(progressingCondition.Message).To(ContainSubstring("test1 is updating"))
2206+
})
2207+
It("should not report progressing condition for paused MCPs", func() {
2208+
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed())
2209+
mcp.Status.Conditions = []machineconfigv1.MachineConfigPoolCondition{
2210+
{
2211+
Type: machineconfigv1.MachineConfigPoolUpdated,
2212+
Status: corev1.ConditionFalse,
2213+
},
2214+
}
2215+
mcp.Spec.Paused = true
2216+
2217+
Expect(reconciler.Client.Update(ctx, mcp)).To(Succeed())
2218+
result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
2219+
Expect(err).ToNot(HaveOccurred())
2220+
Expect(result).To(Equal(reconcile.Result{}))
2221+
2222+
Expect(reconciler.Client.Get(ctx, key, nro)).To(Succeed())
2223+
2224+
progressingCondition := getConditionByType(nro.Status.Conditions, status.ConditionProgressing)
2225+
Expect(progressingCondition.Status).To(Equal(metav1.ConditionFalse), "Progressing condition should be false because the MCP is paused, got conditions: %v", nro.Status.Conditions)
2226+
})
2227+
2228+
It("should report MachineConfigPoolPaused condition for paused MCPs", func() {
2229+
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(mcp), mcp)).To(Succeed())
2230+
mcp.Status.Conditions = []machineconfigv1.MachineConfigPoolCondition{
2231+
{
2232+
Type: machineconfigv1.MachineConfigPoolUpdated,
2233+
Status: corev1.ConditionFalse,
2234+
},
2235+
}
2236+
mcp.Spec.Paused = true
2237+
2238+
Expect(reconciler.Client.Update(ctx, mcp)).To(Succeed())
2239+
result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})
2240+
Expect(err).ToNot(HaveOccurred())
2241+
Expect(result).To(Equal(reconcile.Result{}))
2242+
2243+
Expect(reconciler.Client.Get(ctx, key, nro)).To(Succeed())
2244+
2245+
pausedCondition := getConditionByType(nro.Status.Conditions, status.ConditionMachineConfigPoolPaused)
2246+
Expect(pausedCondition.Status).To(Equal(metav1.ConditionTrue), "Paused condition should be true because the MCP is paused, got conditions: %v", nro.Status.Conditions)
2247+
Expect(pausedCondition.Reason).To(Equal(status.ConditionMachineConfigPoolPaused))
2248+
Expect(pausedCondition.Message).To(ContainSubstring("detected paused MCPs: test1"))
2249+
})
2250+
})
21522251
})
21532252

21542253
Describe("platform agnostic", func() {

internal/controller/numaresourcesscheduler_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ func (r *NUMAResourcesSchedulerReconciler) Reconcile(ctx context.Context, req ct
117117

118118
initialStatus := *instance.Status.DeepCopy()
119119
if len(initialStatus.Conditions) == 0 {
120-
instance.Status.Conditions = status.NewNUMAResourcesSchedulerBaseConditions()
120+
instance.Status.Conditions = status.NewNUMAResourcesSchedulerConditions()
121121
}
122122

123123
if req.Name != objectnames.DefaultNUMAResourcesSchedulerCrName {

pkg/objectstate/rte/machineconfigpool.go

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"fmt"
2222

2323
corev1 "k8s.io/api/core/v1"
24+
"k8s.io/apimachinery/pkg/util/sets"
2425
"k8s.io/klog/v2"
2526

2627
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -129,14 +130,20 @@ func MatchMachineConfigPoolCondition(conditions []machineconfigv1.MachineConfigP
129130

130131
type MCPWaitForUpdatedFunc func(string, *machineconfigv1.MachineConfigPool) bool
131132

132-
func (em *ExistingManifests) MachineConfigsState(mf Manifests) ([]objectstate.ObjectState, MCPWaitForUpdatedFunc) {
133+
func (em *ExistingManifests) MachineConfigsState(mf Manifests) ([]objectstate.ObjectState, MCPWaitForUpdatedFunc, sets.Set[string]) {
134+
pausedMCPs := sets.New[string]()
133135
var ret []objectstate.ObjectState
134136
if mf.Core.MachineConfig == nil {
135-
return ret, nullMachineConfigPoolUpdated
137+
return ret, nullMachineConfigPoolUpdated, pausedMCPs
136138
}
137139
enabledMCCount := 0
138140
for _, tree := range em.trees {
139141
for _, mcp := range tree.MachineConfigPools {
142+
if mcp.Spec.Paused {
143+
pausedMCPs.Insert(mcp.Name)
144+
continue
145+
}
146+
140147
mcName := objectnames.GetMachineConfigName(em.instance.Name, mcp.Name)
141148
if mcp.Spec.MachineConfigSelector == nil {
142149
klog.Warningf("the machine config pool %q does not have machine config selector", mcp.Name)
@@ -183,10 +190,16 @@ func (em *ExistingManifests) MachineConfigsState(mf Manifests) ([]objectstate.Ob
183190
}
184191

185192
klog.V(4).InfoS("machineConfigsState", "enabledMachineConfigs", enabledMCCount)
193+
// TODO: the API design allows to configure custom annotation per nodegroup,
194+
// meaning that the function that checks if an MCP is updated or not is different
195+
// in each case. This should be refactored to adapt the flexible configuration or
196+
// get consensus on the API design to correct this. The problem with the current
197+
// approach is that nodegroups with default selinux will need to have the custom
198+
// machine config, which is never met.
186199
if enabledMCCount > 0 {
187-
return ret, IsMachineConfigPoolUpdated
200+
return ret, IsMachineConfigPoolUpdated, pausedMCPs
188201
}
189-
return ret, IsMachineConfigPoolUpdatedAfterDeletion
202+
return ret, IsMachineConfigPoolUpdatedAfterDeletion, pausedMCPs
190203
}
191204

192205
func nullMachineConfigPoolUpdated(instanceName string, mcp *machineconfigv1.MachineConfigPool) bool {

pkg/status/status.go

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,15 @@ const (
3737
)
3838

3939
const (
40+
// scheduler conditions
4041
ConditionDedicatedInformerActive = "DedicatedInformerActive"
4142
)
4243

44+
const (
45+
// operator conditions
46+
ConditionMachineConfigPoolPaused = "MachineConfigPoolPaused"
47+
)
48+
4349
// TODO: are we duping these?
4450
const (
4551
ReasonAsExpected = "AsExpected"
@@ -183,9 +189,9 @@ func DefaultBaseConditions(timestamp time.Time) []metav1.Condition {
183189
}
184190
}
185191

186-
// NewNUMAResourcesSchedulerBaseConditions creates specific conditions on
192+
// NewNUMAResourcesSchedulerConditions creates specific scheduler conditions on
187193
// top of NewBaseConditions.
188-
func NewNUMAResourcesSchedulerBaseConditions() []metav1.Condition {
194+
func NewNUMAResourcesSchedulerConditions() []metav1.Condition {
189195
now := time.Now()
190196
conds := append(DefaultBaseConditions(now), metav1.Condition{
191197
Type: ConditionDedicatedInformerActive,
@@ -196,6 +202,19 @@ func NewNUMAResourcesSchedulerBaseConditions() []metav1.Condition {
196202
return conds
197203
}
198204

205+
// NewNUMAResourcesOperatorConditions creates specific operator conditions on
206+
// top of NewBaseConditions.
207+
func NewNUMAResourcesOperatorConditions() []metav1.Condition {
208+
now := time.Now()
209+
conds := append(DefaultBaseConditions(now), metav1.Condition{
210+
Type: ConditionMachineConfigPoolPaused,
211+
Status: metav1.ConditionUnknown,
212+
LastTransitionTime: metav1.Time{Time: now},
213+
Reason: ConditionMachineConfigPoolPaused,
214+
})
215+
return conds
216+
}
217+
199218
// CloneConditions creates a deep copy of the given `conditions`.
200219
func CloneConditions(conditions []metav1.Condition) []metav1.Condition {
201220
var c = make([]metav1.Condition, len(conditions))

pkg/status/status_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -292,7 +292,7 @@ func TestComputeConditions(t *testing.T) {
292292
}{
293293
{
294294
name: "first reconcile iteration - with operator condition",
295-
conditions: NewNUMAResourcesSchedulerBaseConditions(),
295+
conditions: NewNUMAResourcesSchedulerConditions(),
296296
condition: metav1.Condition{
297297
Type: ConditionAvailable,
298298
Status: metav1.ConditionTrue,
@@ -334,7 +334,7 @@ func TestComputeConditions(t *testing.T) {
334334
},
335335
{
336336
name: "first reconcile iteration - with informer condition",
337-
conditions: NewNUMAResourcesSchedulerBaseConditions(),
337+
conditions: NewNUMAResourcesSchedulerConditions(),
338338
condition: metav1.Condition{
339339
Type: ConditionDedicatedInformerActive,
340340
Status: metav1.ConditionTrue,

0 commit comments

Comments
 (0)