Skip to content

Commit da33cd8

Browse files
committed
controller: nrop: switch to vertical per-tree processing
Change reconcileResource from horizontal orchestration (all trees complete machine config step, then all trees complete daemonset step) to vertical processing (each tree progresses through all steps independently). This means a single slow nodegroup no longer blocks progress for the others. Move tree normalization from the pre-processing loop in Reconcile into the per-tree loop in reconcileResource, where it naturally belongs alongside the rest of per-tree processing. Add a test verifying that a tree whose MCP is ready creates its DaemonSet even while another tree's MCP is still pending. AI-attribution: AIA PAI CeNc Hin R claude-4.6-opus-1M v1.0 Signed-off-by: Francesco Romani <fromani@redhat.com>
1 parent 92657bf commit da33cd8

2 files changed

Lines changed: 73 additions & 30 deletions

File tree

internal/controller/numaresourcesoperator_controller.go

Lines changed: 26 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
"k8s.io/apimachinery/pkg/util/sets"
3636
"k8s.io/client-go/tools/record"
3737
"k8s.io/klog/v2"
38+
"k8s.io/utils/ptr"
3839

3940
ctrl "sigs.k8s.io/controller-runtime"
4041
"sigs.k8s.io/controller-runtime/pkg/builder"
@@ -179,11 +180,6 @@ func (r *NUMAResourcesOperatorReconciler) Reconcile(ctx context.Context, req ctr
179180
return r.degradeStatus(ctx, initialInstance, instance, validation.NodeGroupsError, err)
180181
}
181182

182-
for idx := range trees {
183-
conf := trees[idx].NodeGroup.NormalizeConfig()
184-
trees[idx].NodeGroup.Config = &conf
185-
}
186-
187183
step := r.reconcileResource(ctx, instance, trees)
188184
instance.Status.Conditions, _ = status.ComputeConditions(instance.Status.Conditions, step.ConditionInfo.ToMetav1Condition(instance.Generation), time.Now())
189185

@@ -259,39 +255,39 @@ func (r *NUMAResourcesOperatorReconciler) reconcileResource(ctx context.Context,
259255
}
260256
}
261257

262-
// horizontal processing: all trees complete each step before moving to the next
258+
var results []perTreeResult
259+
for _, tree := range trees {
260+
tree.NodeGroup.Config = ptr.To(tree.NodeGroup.NormalizeConfig())
263261

264-
// step 1: machine configs for all trees
265-
var mcResults []perTreeResult
266-
if r.Platform == platform.OpenShift {
267-
for _, tree := range trees {
268-
treeExisting := existing.PerTree(ctx, r.Client, tree)
269-
mcResults = append(mcResults, r.reconcilePerTreeMachineConfig(ctx, instance, treeExisting, tree))
262+
treeExisting := existing.PerTree(ctx, r.Client, tree)
263+
264+
mcResult := perTreeResult{}
265+
if r.Platform == platform.OpenShift {
266+
mcResult = r.reconcilePerTreeMachineConfig(ctx, instance, treeExisting, tree)
267+
if mcResult.step.EarlyStop() {
268+
results = append(results, mcResult)
269+
continue
270+
}
270271
}
271-
}
272-
mcOverall := reducePerTreeResults(mcResults)
273-
instance.Status.MachineConfigPools = mcOverall.mcpStatuses
274-
instance.Status.Conditions = updateMachineConfigPoolPausedCondition(instance.Status.Conditions, instance.Generation, mcOverall.pausedMCPNames)
275-
if mcOverall.step.EarlyStop() {
276-
return mcOverall.step
277-
}
278272

279-
// step 2: daemonsets for all trees
280-
var dsResults []perTreeResult
281-
for _, tree := range trees {
282-
treeExisting := existing.PerTree(ctx, r.Client, tree)
283-
dsResults = append(dsResults, r.reconcilePerTreeDaemonSet(ctx, instance, treeExisting, tree))
273+
dsResult := r.reconcilePerTreeDaemonSet(ctx, instance, treeExisting, tree)
274+
dsResult.mcpStatuses = mcResult.mcpStatuses
275+
dsResult.pausedMCPNames = mcResult.pausedMCPNames
276+
results = append(results, dsResult)
284277
}
285-
dsOverall := reducePerTreeResults(dsResults)
286-
dssReady := collectDaemonSets(dsOverall.dsInfo)
278+
279+
overall := reducePerTreeResults(results)
280+
dssReady := collectDaemonSets(overall.dsInfo)
281+
instance.Status.MachineConfigPools = overall.mcpStatuses
282+
instance.Status.Conditions = updateMachineConfigPoolPausedCondition(instance.Status.Conditions, instance.Generation, overall.pausedMCPNames)
287283
instance.Status.DaemonSets = dssReady
288284
instance.Status.RelatedObjects = relatedobjects.ResourceTopologyExporter(r.Namespace, dssReady)
289-
if !dsOverall.step.Done() {
290-
return dsOverall.step
285+
286+
if overall.step.Done() {
287+
instance.Status.NodeGroups = syncNodeGroupsStatus(instance, overall.dsInfo)
291288
}
292289

293-
instance.Status.NodeGroups = syncNodeGroupsStatus(instance, dsOverall.dsInfo)
294-
return intreconcile.StepSuccess()
290+
return overall.step
295291
}
296292

297293
func syncNodeGroupsStatus(instance *nropv1.NUMAResourcesOperator, dsPerPool []poolDaemonSet) []nropv1.NodeGroupStatus {

internal/controller/numaresourcesoperator_controller_test.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2268,6 +2268,53 @@ var _ = Describe("Test NUMAResourcesOperator Reconcile", func() {
22682268
Expect(pausedCond.Status).To(Equal(metav1.ConditionTrue))
22692269
})
22702270

2271+
It("should create DaemonSet for ready tree while another tree is still pending", func(ctx context.Context) {
2272+
// pool1 has custom policy -> MC created, needs MCO rollout
2273+
// pool2 has no custom annotation -> default policy (no MC to wait for)
2274+
nro.Spec.NodeGroups[0].Annotations[annotations.SELinuxPolicyConfigAnnotation] = annotations.SELinuxPolicyCustom
2275+
2276+
var err error
2277+
reconciler, err = NewFakeNUMAResourcesOperatorReconciler(platform.OpenShift, defaultOCPVersion, nro, mcp1, mcp2)
2278+
Expect(err).ToNot(HaveOccurred())
2279+
2280+
key := client.ObjectKeyFromObject(nro)
2281+
2282+
By("First reconcile: pool1 MC pending, pool2 ready immediately")
2283+
Expect(reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})).To(CauseRequeue())
2284+
2285+
By("Verify pool2 DaemonSet exists even though pool1 is still pending")
2286+
ds := &appsv1.DaemonSet{}
2287+
mcp2DSKey := client.ObjectKey{
2288+
Name: objectnames.GetComponentName(nro.Name, mcp2.Name),
2289+
Namespace: testNamespace,
2290+
}
2291+
Expect(reconciler.Client.Get(ctx, mcp2DSKey, ds)).To(Succeed())
2292+
2293+
By("Verify global status is Progressing, not Available")
2294+
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(nro), nro)).To(Succeed())
2295+
progressingCond := getConditionByType(nro.Status.Conditions, status.ConditionProgressing)
2296+
Expect(progressingCond).ToNot(BeNil())
2297+
Expect(progressingCond.Status).To(Equal(metav1.ConditionTrue))
2298+
2299+
By("Make pool1 ready and reconcile again")
2300+
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(mcp1), mcp1)).To(Succeed())
2301+
ensureMCPIsReady(mcp1, nro.Name)
2302+
Expect(reconciler.Client.Update(ctx, mcp1)).To(Succeed())
2303+
2304+
Expect(reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key})).ToNot(CauseRequeue())
2305+
2306+
By("Now both DaemonSets exist and status is Available")
2307+
mcp1DSKey := client.ObjectKey{
2308+
Name: objectnames.GetComponentName(nro.Name, mcp1.Name),
2309+
Namespace: testNamespace,
2310+
}
2311+
Expect(reconciler.Client.Get(ctx, mcp1DSKey, ds)).To(Succeed())
2312+
Expect(reconciler.Client.Get(ctx, mcp2DSKey, ds)).To(Succeed())
2313+
2314+
Expect(reconciler.Client.Get(ctx, client.ObjectKeyFromObject(nro), nro)).To(Succeed())
2315+
Expect(nro).To(BeInCondition(status.ConditionAvailable))
2316+
})
2317+
22712318
It("should report paused condition while Progressing", func(ctx context.Context) {
22722319
// pool1 has custom policy -> MC created, needs MCO rollout
22732320
// pool2 is paused -> skipped

0 commit comments

Comments
 (0)