Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions api/v1/helper/nodegroup/nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,16 @@ type Tree struct {
MachineConfigPools []*mcov1.MachineConfigPool
}

func (ttr Tree) Name() string {
if ttr.NodeGroup == nil {
return "<nil>"
}
if ttr.NodeGroup.PoolName == nil {
return "<unknown>"
}
return *ttr.NodeGroup.PoolName
}

// Clone creates a deepcopy of a Tree
func (ttr Tree) Clone() Tree {
ret := Tree{
Expand Down
39 changes: 39 additions & 0 deletions api/v1/helper/nodegroup/nodegroup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,45 @@ import (
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
)

func TestTreeName(t *testing.T) {
type testCase struct {
name string
tree Tree
expected string
}
testCases := []testCase{
{
name: "zero value tree",
tree: Tree{},
expected: "<nil>",
},
{
name: "nil PoolName",
tree: Tree{
NodeGroup: &nropv1.NodeGroup{},
},
expected: "<unknown>",
},
{
name: "with PoolName",
tree: Tree{
NodeGroup: &nropv1.NodeGroup{
PoolName: ptr.To("worker-cnf"),
},
},
expected: "worker-cnf",
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
got := tc.tree.Name()
if got != tc.expected {
t.Errorf("expected %q, got %q", tc.expected, got)
}
})
}
}

func TestFindTreesOpenshift(t *testing.T) {
mcpList := mcov1.MachineConfigPoolList{
Items: []mcov1.MachineConfigPool{
Expand Down
537 changes: 302 additions & 235 deletions internal/controller/numaresourcesoperator_controller.go

Large diffs are not rendered by default.

59 changes: 44 additions & 15 deletions internal/controller/numaresourcesoperator_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1751,13 +1751,10 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
Expect(err).ToNot(HaveOccurred())
})
It("should wait", func() {
// check reconcile first loop result
// wait one minute to update MCP, thus RTE daemonsets and complete status update is not going to be achieved at this point
Expect(result).To(Equal(reconcile.Result{RequeueAfter: time.Minute}))

Expect(reconciler.Client.Get(context.TODO(), key, nro)).ToNot(HaveOccurred())
Expect(nro.Status.MachineConfigPools).To(HaveLen(1))
Expect(nro.Status.MachineConfigPools[0].Name).To(Equal("test1"))
Expect(nro.Status.MachineConfigPools).To(HaveLen(2))
})
})

Expand Down Expand Up @@ -2263,27 +2260,59 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
By("Reconcile: pool1 converges (default policy, no MC to wait for), pool2 skipped (paused)")
Expect(reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})).ToNot(CauseRequeue())

By("Verify DaemonSets for both pools exist")
By("Verify NRO status reflects paused pool")
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(nro), nro)).To(Succeed())

pausedCond := getConditionByType(nro.Status.Conditions, status.ConditionMachineConfigPoolPaused)
Expect(pausedCond).ToNot(BeNil())
Expect(pausedCond.Status).To(Equal(metav1.ConditionTrue))
})

It("should create DaemonSet for ready tree while another tree is still pending", func(ctx context.Context) {
// pool1 has custom policy -> MC created, needs MCO rollout
// pool2 has no custom annotation -> default policy (no MC to wait for)
nro.Spec.NodeGroups[0].Annotations[annotations.SELinuxPolicyConfigAnnotation] = annotations.SELinuxPolicyCustom

var err error
reconciler, err = NewFakeNUMAResourcesOperatorReconciler(platform.OpenShift, defaultOCPVersion, nro, mcp1, mcp2)
Expect(err).ToNot(HaveOccurred())

key := client.ObjectKeyFromObject(nro)

By("First reconcile: pool1 MC pending, pool2 ready immediately")
Expect(reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})).To(CauseRequeue())

By("Verify pool2 DaemonSet exists even though pool1 is still pending")
ds := &appsv1.DaemonSet{}
mcp1DSKey := client.ObjectKey{
Name: objectnames.GetComponentName(nro.Name, mcp1.Name),
mcp2DSKey := client.ObjectKey{
Name: objectnames.GetComponentName(nro.Name, mcp2.Name),
Namespace: testNamespace,
}
Expect(reconciler.Client.Get(ctx, mcp1DSKey, ds)).To(Succeed())
Expect(reconciler.Client.Get(ctx, mcp2DSKey, ds)).To(Succeed())

mcp2DSKey := client.ObjectKey{
Name: objectnames.GetComponentName(nro.Name, mcp2.Name),
By("Verify global status is Progressing, not Available")
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(nro), nro)).To(Succeed())
progressingCond := getConditionByType(nro.Status.Conditions, status.ConditionProgressing)
Expect(progressingCond).ToNot(BeNil())
Expect(progressingCond.Status).To(Equal(metav1.ConditionTrue))
Comment on lines +2285 to +2297
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Assert the pending tree is still blocked in the first reconcile

This test currently proves only the positive path (ready tree DS exists, Progressing=true). It does not assert that the pending tree DS is still absent and Available is false at that point, so a regression to eager DS creation could still pass.

Suggested test hardening
 				By("Verify pool2 DaemonSet exists even though pool1 is still pending")
 				ds := &appsv1.DaemonSet{}
 				mcp2DSKey := client.ObjectKey{
 					Name:      objectnames.GetComponentName(nro.Name, mcp2.Name),
 					Namespace: testNamespace,
 				}
 				Expect(reconciler.Client.Get(ctx, mcp2DSKey, ds)).To(Succeed())
+
+				By("Verify pool1 DaemonSet is not created yet")
+				mcp1DSKeyPending := client.ObjectKey{
+					Name:      objectnames.GetComponentName(nro.Name, mcp1.Name),
+					Namespace: testNamespace,
+				}
+				err = reconciler.Client.Get(ctx, mcp1DSKeyPending, &appsv1.DaemonSet{})
+				Expect(apierrors.IsNotFound(err)).To(BeTrue(), "pool1 daemonset should still be pending")
 
 				By("Verify global status is Progressing, not Available")
 				Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(nro), nro)).To(Succeed())
 				progressingCond := getConditionByType(nro.Status.Conditions, status.ConditionProgressing)
 				Expect(progressingCond).ToNot(BeNil())
 				Expect(progressingCond.Status).To(Equal(metav1.ConditionTrue))
+				availableCond := getConditionByType(nro.Status.Conditions, status.ConditionAvailable)
+				Expect(availableCond).ToNot(BeNil())
+				Expect(availableCond.Status).To(Equal(metav1.ConditionFalse))
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
By("Verify pool2 DaemonSet exists even though pool1 is still pending")
ds := &appsv1.DaemonSet{}
mcp1DSKey := client.ObjectKey{
Name: objectnames.GetComponentName(nro.Name, mcp1.Name),
mcp2DSKey := client.ObjectKey{
Name: objectnames.GetComponentName(nro.Name, mcp2.Name),
Namespace: testNamespace,
}
Expect(reconciler.Client.Get(ctx, mcp1DSKey, ds)).To(Succeed())
Expect(reconciler.Client.Get(ctx, mcp2DSKey, ds)).To(Succeed())
mcp2DSKey := client.ObjectKey{
Name: objectnames.GetComponentName(nro.Name, mcp2.Name),
By("Verify global status is Progressing, not Available")
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(nro), nro)).To(Succeed())
progressingCond := getConditionByType(nro.Status.Conditions, status.ConditionProgressing)
Expect(progressingCond).ToNot(BeNil())
Expect(progressingCond.Status).To(Equal(metav1.ConditionTrue))
By("Verify pool2 DaemonSet exists even though pool1 is still pending")
ds := &appsv1.DaemonSet{}
mcp2DSKey := client.ObjectKey{
Name: objectnames.GetComponentName(nro.Name, mcp2.Name),
Namespace: testNamespace,
}
Expect(reconciler.Client.Get(ctx, mcp2DSKey, ds)).To(Succeed())
By("Verify pool1 DaemonSet is not created yet")
mcp1DSKeyPending := client.ObjectKey{
Name: objectnames.GetComponentName(nro.Name, mcp1.Name),
Namespace: testNamespace,
}
err = reconciler.Client.Get(ctx, mcp1DSKeyPending, &appsv1.DaemonSet{})
Expect(apierrors.IsNotFound(err)).To(BeTrue(), "pool1 daemonset should still be pending")
By("Verify global status is Progressing, not Available")
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(nro), nro)).To(Succeed())
progressingCond := getConditionByType(nro.Status.Conditions, status.ConditionProgressing)
Expect(progressingCond).ToNot(BeNil())
Expect(progressingCond.Status).To(Equal(metav1.ConditionTrue))
availableCond := getConditionByType(nro.Status.Conditions, status.ConditionAvailable)
Expect(availableCond).ToNot(BeNil())
Expect(availableCond.Status).To(Equal(metav1.ConditionFalse))
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@internal/controller/numaresourcesoperator_controller_test.go` around lines
2285 - 2297, Add assertions that the pending tree's DaemonSet is still absent
and the global Available condition is false: attempt to Get the DaemonSet for
mcp1 (using objectnames.GetComponentName(nro.Name, mcp1.Name) and testNamespace)
and expect a NotFound error, then fetch nro and verify
getConditionByType(nro.Status.Conditions, status.ConditionAvailable) is not nil
and its Status equals metav1.ConditionFalse. This complements the existing
checks that pool2 DS exists and Progressing is true.


By("Make pool1 ready and reconcile again")
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(mcp1), mcp1)).To(Succeed())
ensureMCPIsReady(mcp1, nro.Name)
Expect(reconciler.Client.Update(ctx, mcp1)).To(Succeed())

Expect(reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})).ToNot(CauseRequeue())

By("Now both DaemonSets exist and status is Available")
mcp1DSKey := client.ObjectKey{
Name: objectnames.GetComponentName(nro.Name, mcp1.Name),
Namespace: testNamespace,
}
Expect(reconciler.Client.Get(ctx, mcp1DSKey, ds)).To(Succeed())
Expect(reconciler.Client.Get(ctx, mcp2DSKey, ds)).To(Succeed())

By("Verify NRO status is Available with paused condition")
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(nro), nro)).To(Succeed())
Expect(nro).To(BeInCondition(status.ConditionAvailable))

pausedCond := getConditionByType(nro.Status.Conditions, status.ConditionMachineConfigPoolPaused)
Expect(pausedCond).ToNot(BeNil())
Expect(pausedCond.Status).To(Equal(metav1.ConditionTrue))
})

It("should report paused condition while Progressing", func(ctx context.Context) {
Expand Down
19 changes: 19 additions & 0 deletions internal/reconcile/step.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,16 @@ func (rs Step) EarlyStop() bool {
return rs.ConditionInfo.Type != status.ConditionAvailable
}

// Ongoing returns true if the reconciliation is still ongoing/progressing
func (rs Step) Ongoing() bool {
return rs.Result.RequeueAfter > 0
}
Comment on lines +46 to +49
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Ongoing() misclassifies valid progressing steps

At Line 48, RequeueAfter > 0 treats StepOngoing(0) as not ongoing, even though it is explicitly a progressing step. This breaks per-tree aggregation semantics (progress counters/selection) for paused MCP flows.

Suggested fix
 func (rs Step) Ongoing() bool {
-	return rs.Result.RequeueAfter > 0
+	return rs.ConditionInfo.Type == status.ConditionProgressing && rs.Error == nil
 }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// Ongoing returns true if the reconciliation is still ongoing/progressing
func (rs Step) Ongoing() bool {
return rs.Result.RequeueAfter > 0
}
// Ongoing returns true if the reconciliation is still ongoing/progressing
func (rs Step) Ongoing() bool {
return rs.ConditionInfo.Type == status.ConditionProgressing && rs.Error == nil
}
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@internal/reconcile/step.go` around lines 46 - 49, The Ongoing() method
currently returns true only when Result.RequeueAfter > 0 which incorrectly
treats StepOngoing(0) as not ongoing; update Step.Ongoing() to treat zero as an
ongoing/ progressing value (e.g., return rs.Result.RequeueAfter >= 0) or
otherwise compare against the defined sentinel for "no requeue" so that
StepOngoing(0) is classified as ongoing; change the condition in the Ongoing()
implementation that references Result.RequeueAfter so zero is considered
progressing (and ensure this aligns with the StepOngoing(0) semantics).


// Failed returns true if the reconciliation was not succesfull
func (rs Step) Failed() bool {
return rs.Error != nil
}

// WithReason set the existing reason with the given value,
// if not set already and returns a new updated Step
func (rs Step) WithReason(reason string) Step {
Expand All @@ -63,6 +73,15 @@ func (rs Step) WithMessage(message string) Step {
}
}

// UpdateMessage overrides the message, prepending to any existing one, and returns a new updated Step
func (rs Step) UpdateMessage(message string) Step {
return Step{
Result: rs.Result,
ConditionInfo: rs.ConditionInfo.UpdateMessage(message),
Error: rs.Error,
}
}

// StepSuccess returns a Step which tells the caller
// the reconciliation attempt was completed successfully
func StepSuccess() Step {
Expand Down
32 changes: 32 additions & 0 deletions internal/reconcile/step_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,35 @@ func TestStepFailed(t *testing.T) {
assert.False(t, st.Done())
assert.True(t, st.EarlyStop())
}

func TestStepOngoingIsOngoing(t *testing.T) {
st := StepOngoing(5 * time.Second)
assert.True(t, st.Ongoing())
assert.False(t, st.Failed())
}

func TestStepFailedIsFailed(t *testing.T) {
st := StepFailed(errors.New("fake error"))
assert.True(t, st.Failed())
assert.False(t, st.Ongoing())
}

func TestStepSuccessIsNeitherOngoingNorFailed(t *testing.T) {
st := StepSuccess()
assert.False(t, st.Ongoing())
assert.False(t, st.Failed())
}

func TestStepUpdateMessageEmpty(t *testing.T) {
st := StepOngoing(5 * time.Second)
st2 := st.UpdateMessage("summary")
assert.Empty(t, st.ConditionInfo.Message)
assert.Equal(t, st2.ConditionInfo.Message, "summary")
}

func TestStepUpdateMessageExisting(t *testing.T) {
st := StepFailed(errors.New("fake error"))
st2 := st.UpdateMessage("summary")
assert.Equal(t, st.ConditionInfo.Message, "fake error")
assert.Equal(t, st2.ConditionInfo.Message, "summary; fake error")
}
89 changes: 43 additions & 46 deletions pkg/objectstate/rte/machineconfigpool.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,66 +136,63 @@ type MachineConfigObjectState struct {
WaitForUpdated MCPWaitForUpdatedFunc
}

func (em *ExistingManifests) MachineConfigsState(mf Manifests) ([]MachineConfigObjectState, sets.Set[string]) {
func (em *ExistingManifests) MachineConfigsState(mf Manifests, tree nodegroupv1.Tree) ([]MachineConfigObjectState, sets.Set[string]) {
var ret []MachineConfigObjectState
pausedMCPNames := sets.New[string]()
if mf.Core.MachineConfig == nil {
return ret, pausedMCPNames
}
for _, tree := range em.trees {
isCustomPolicy := annotations.IsCustomPolicyEnabled(tree.NodeGroup.Annotations)
for _, mcp := range tree.MachineConfigPools {
// do not update state when MachineConfigPool is paused
if mcp.Spec.Paused {
pausedMCPNames.Insert(mcp.Name)
continue
}

mcName := objectnames.GetMachineConfigName(em.instance.Name, mcp.Name)
if mcp.Spec.MachineConfigSelector == nil {
klog.Warningf("the machine config pool %q does not have machine config selector", mcp.Name)
continue
}

existingMachineConfig, ok := em.machineConfigs[mcName]
if !ok {
klog.Warningf("failed to find machine config %q in namespace %q", mcName, em.namespace)
continue
}

if !isCustomPolicy {
// caution here: we want a *nil interface value*, not an *interface which points to nil*.
// the latter would lead to apparently correct code leading to runtime panics. See:
// https://trstringer.com/go-nil-interface-and-interface-with-nil-concrete-value/
// (and many other docs like this)
ret = append(ret, MachineConfigObjectState{
ObjectState: objectstate.ObjectState{
Existing: existingMachineConfig.machineConfig,
Error: existingMachineConfig.machineConfigError,
Desired: nil,
},
PoolName: mcp.Name,
WaitForUpdated: IsMachineConfigPoolUpdatedAfterDeletion,
})
continue
}

desiredMachineConfig := mf.Core.MachineConfig.DeepCopy()
desiredMachineConfig.Name = mcName
desiredMachineConfig.Labels = GetMachineConfigLabel(mcp)
isCustomPolicy := annotations.IsCustomPolicyEnabled(tree.NodeGroup.Annotations)
for _, mcp := range tree.MachineConfigPools {
if mcp.Spec.Paused {
pausedMCPNames.Insert(mcp.Name)
continue
}

mcName := objectnames.GetMachineConfigName(em.instance.Name, mcp.Name)
if mcp.Spec.MachineConfigSelector == nil {
klog.Warningf("the machine config pool %q does not have machine config selector", mcp.Name)
continue
}

existingMachineConfig, ok := em.machineConfigs[mcName]
if !ok {
klog.Warningf("failed to find machine config %q in namespace %q", mcName, em.namespace)
continue
}

if !isCustomPolicy {
// caution here: we want a *nil interface value*, not an *interface which points to nil*.
// the latter would lead to apparently correct code leading to runtime panics. See:
// https://trstringer.com/go-nil-interface-and-interface-with-nil-concrete-value/
// (and many other docs like this)
ret = append(ret, MachineConfigObjectState{
ObjectState: objectstate.ObjectState{
Existing: existingMachineConfig.machineConfig,
Error: existingMachineConfig.machineConfigError,
Desired: desiredMachineConfig,
Compare: compare.Object,
Merge: merge.ObjectForUpdate,
Desired: nil,
},
PoolName: mcp.Name,
WaitForUpdated: IsMachineConfigPoolUpdated,
WaitForUpdated: IsMachineConfigPoolUpdatedAfterDeletion,
})
continue
}

desiredMachineConfig := mf.Core.MachineConfig.DeepCopy()
desiredMachineConfig.Name = mcName
desiredMachineConfig.Labels = GetMachineConfigLabel(mcp)

ret = append(ret, MachineConfigObjectState{
ObjectState: objectstate.ObjectState{
Existing: existingMachineConfig.machineConfig,
Error: existingMachineConfig.machineConfigError,
Desired: desiredMachineConfig,
Compare: compare.Object,
Merge: merge.ObjectForUpdate,
},
PoolName: mcp.Name,
WaitForUpdated: IsMachineConfigPoolUpdated,
})
}
return ret, pausedMCPNames
}
Expand Down
Loading
Loading