Skip to content

Commit a4e79f6

Browse files
Add GPUClusterConfig controller with singleton status handling
Signed-off-by: Karthik Vetrivel <kvetrivel@nvidia.com>
1 parent 9f08bec commit a4e79f6

9 files changed

Lines changed: 560 additions & 0 deletions

File tree

api/nvidia/v1alpha1/gpuclusterconfig_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,11 @@ const (
2727
GPUClusterConfigCRDName = "GPUClusterConfig"
2828
)
2929

30+
const (
31+
// Ignored marks a duplicate GPUClusterConfig that the singleton controller does not reconcile.
32+
Ignored State = "ignored"
33+
)
34+
3035
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
3136

3237
// GPUClusterConfigSpec defines the desired state of GPUClusterConfig, the DRA-based

bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,37 @@ spec:
320320
path: state
321321
x-descriptors:
322322
- 'urn:alm:descriptor:text'
323+
- name: gpuclusterconfigs.nvidia.com
324+
kind: GPUClusterConfig
325+
version: v1alpha1
326+
group: nvidia.com
327+
displayName: GPUClusterConfig
328+
description: GPUClusterConfig allows you to configure the DRA-based GPU software stack
329+
resources:
330+
- kind: ServiceAccount
331+
name: ''
332+
version: v1
333+
- kind: DaemonSet
334+
name: ''
335+
version: apps/v1
336+
- kind: Deployment
337+
name: ''
338+
version: apps/v1
339+
- kind: ConfigMap
340+
name: ''
341+
version: v1
342+
- kind: Pod
343+
name: ''
344+
version: v1
345+
- kind: status
346+
name: ''
347+
version: v1
348+
statusDescriptors:
349+
- description: The current state of the GPUClusterConfig.
350+
displayName: State
351+
path: state
352+
x-descriptors:
353+
- 'urn:alm:descriptor:text'
323354
- name: clusterpolicies.nvidia.com
324355
kind: ClusterPolicy
325356
version: v1
@@ -635,6 +666,9 @@ spec:
635666
- clusterpolicies
636667
- clusterpolicies/finalizers
637668
- clusterpolicies/status
669+
- gpuclusterconfigs
670+
- gpuclusterconfigs/finalizers
671+
- gpuclusterconfigs/status
638672
- nvidiadrivers
639673
- nvidiadrivers/finalizers
640674
- nvidiadrivers/status

cmd/gpu-operator/main.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,16 @@ func main() {
210210
setupLog.Error(err, "unable to create controller", "controller", "NVIDIADriver")
211211
os.Exit(1)
212212
}
213+
214+
if err = (&controllers.GPUClusterConfigReconciler{
215+
Namespace: operatorNamespace,
216+
Client: mgr.GetClient(),
217+
Scheme: mgr.GetScheme(),
218+
ClusterInfo: clusterInfo,
219+
}).SetupWithManager(ctx, mgr); err != nil {
220+
setupLog.Error(err, "unable to create controller", "controller", "GPUClusterConfig")
221+
os.Exit(1)
222+
}
213223
// +kubebuilder:scaffold:builder
214224
if err := mgr.AddHealthzCheck("health", healthz.Ping); err != nil {
215225
setupLog.Error(err, "unable to set up health check")

config/rbac/role.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ rules:
132132
- nvidia.com
133133
resources:
134134
- '*'
135+
- gpuclusterconfigs
135136
- nvidiadrivers
136137
verbs:
137138
- create
@@ -144,12 +145,14 @@ rules:
144145
- apiGroups:
145146
- nvidia.com
146147
resources:
148+
- gpuclusterconfigs/finalizers
147149
- nvidiadrivers/finalizers
148150
verbs:
149151
- update
150152
- apiGroups:
151153
- nvidia.com
152154
resources:
155+
- gpuclusterconfigs/status
153156
- nvidiadrivers/status
154157
verbs:
155158
- get
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
/*
2+
Copyright 2025.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controllers
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"time"
23+
24+
apierrors "k8s.io/apimachinery/pkg/api/errors"
25+
"k8s.io/apimachinery/pkg/runtime"
26+
"k8s.io/apimachinery/pkg/types"
27+
"k8s.io/client-go/util/workqueue"
28+
ctrl "sigs.k8s.io/controller-runtime"
29+
"sigs.k8s.io/controller-runtime/pkg/client"
30+
"sigs.k8s.io/controller-runtime/pkg/controller"
31+
"sigs.k8s.io/controller-runtime/pkg/handler"
32+
"sigs.k8s.io/controller-runtime/pkg/log"
33+
"sigs.k8s.io/controller-runtime/pkg/predicate"
34+
"sigs.k8s.io/controller-runtime/pkg/reconcile"
35+
"sigs.k8s.io/controller-runtime/pkg/source"
36+
37+
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
38+
"github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
39+
"github.com/NVIDIA/gpu-operator/internal/conditions"
40+
"github.com/NVIDIA/gpu-operator/internal/consts"
41+
"github.com/NVIDIA/gpu-operator/internal/state"
42+
)
43+
44+
// GPUClusterConfigReconciler reconciles a GPUClusterConfig object
45+
type GPUClusterConfigReconciler struct {
46+
client.Client
47+
Scheme *runtime.Scheme
48+
ClusterInfo clusterinfo.Interface
49+
Namespace string
50+
51+
stateManager state.Manager
52+
conditionUpdater conditions.Updater
53+
54+
// singleton is the GPUClusterConfig that owns reconciliation; the first instance to
55+
// reconcile claims it (first-wins), mirroring ClusterPolicy.
56+
singleton *nvidiav1alpha1.GPUClusterConfig
57+
}
58+
59+
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusterconfigs,verbs=get;list;watch;create;update;patch;delete
60+
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusterconfigs/status,verbs=get;update;patch
61+
//+kubebuilder:rbac:groups=nvidia.com,resources=gpuclusterconfigs/finalizers,verbs=update
62+
63+
func (r *GPUClusterConfigReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
64+
logger := log.FromContext(ctx)
65+
logger.V(consts.LogLevelInfo).Info("Reconciling GPUClusterConfig")
66+
67+
instance := &nvidiav1alpha1.GPUClusterConfig{}
68+
if err := r.Get(ctx, req.NamespacedName, instance); err != nil {
69+
if apierrors.IsNotFound(err) {
70+
// Deleted; owned objects are garbage-collected, so there is nothing to clean up.
71+
return reconcile.Result{}, nil
72+
}
73+
wrappedErr := fmt.Errorf("error getting GPUClusterConfig object: %w", err)
74+
logger.Error(err, "error getting GPUClusterConfig object")
75+
instance.Status.State = nvidiav1alpha1.NotReady
76+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, wrappedErr.Error()); condErr != nil {
77+
logger.Error(condErr, "failed to set condition")
78+
}
79+
return reconcile.Result{}, wrappedErr
80+
}
81+
82+
// Singleton, first-wins (mirroring ClusterPolicy): the first instance to reconcile
83+
// claims ownership; any other instance is marked Ignored and skipped. The owner is
84+
// held in memory, so the choice resets on operator restart.
85+
if r.singleton != nil && r.singleton.Name != instance.Name {
86+
logger.V(consts.LogLevelWarning).Info("Multiple GPUClusterConfig instances found, ignoring this one",
87+
"name", instance.Name, "owner", r.singleton.Name)
88+
if err := r.updateCrStatus(ctx, instance, nvidiav1alpha1.Ignored); err != nil {
89+
return reconcile.Result{}, err
90+
}
91+
return reconcile.Result{}, nil
92+
}
93+
r.singleton = instance
94+
95+
infoCatalog := state.NewInfoCatalog()
96+
infoCatalog.Add(state.InfoTypeClusterInfo, r.ClusterInfo)
97+
98+
managerStatus := r.stateManager.SyncState(ctx, instance, infoCatalog)
99+
100+
if err := r.updateCrStatus(ctx, instance, nvidiav1alpha1.State(managerStatus.Status)); err != nil {
101+
return ctrl.Result{}, err
102+
}
103+
104+
if managerStatus.Status != state.SyncStateReady {
105+
logger.Info("GPUClusterConfig instance is not ready")
106+
var errorInfo error
107+
for _, result := range managerStatus.StatesStatus {
108+
if result.Status != state.SyncStateReady && result.ErrInfo != nil {
109+
errorInfo = result.ErrInfo
110+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, fmt.Sprintf("Error syncing state %s: %v", result.StateName, errorInfo.Error())); condErr != nil {
111+
logger.Error(condErr, "failed to set condition")
112+
}
113+
break
114+
}
115+
}
116+
// if no errors are reported from any state, then we are waiting on operand pods
117+
if errorInfo == nil {
118+
if condErr := r.conditionUpdater.SetConditionsError(ctx, instance, conditions.OperandNotReady, "Waiting for operand pods to be ready"); condErr != nil {
119+
logger.Error(condErr, "failed to set condition")
120+
}
121+
}
122+
return reconcile.Result{RequeueAfter: time.Second * 5}, nil
123+
}
124+
125+
if condErr := r.conditionUpdater.SetConditionsReady(ctx, instance, conditions.Reconciled, "All resources have been successfully reconciled"); condErr != nil {
126+
logger.Error(condErr, "failed to set condition")
127+
return ctrl.Result{}, condErr
128+
}
129+
return reconcile.Result{}, nil
130+
}
131+
132+
// updateCrStatus writes desired to the CR's status, skipping the write when it is already current.
133+
func (r *GPUClusterConfigReconciler) updateCrStatus(ctx context.Context, cr *nvidiav1alpha1.GPUClusterConfig, desired nvidiav1alpha1.State) error {
134+
reqLogger := log.FromContext(ctx)
135+
136+
// Refetch to avoid a resourceVersion conflict.
137+
instance := &nvidiav1alpha1.GPUClusterConfig{}
138+
if err := r.Get(ctx, types.NamespacedName{Name: cr.Name}, instance); err != nil {
139+
reqLogger.Error(err, "Failed to get GPUClusterConfig instance for status update")
140+
return err
141+
}
142+
143+
if instance.Status.State == desired && instance.Status.Namespace == r.Namespace {
144+
return nil
145+
}
146+
instance.Status.State = desired
147+
instance.Status.Namespace = r.Namespace
148+
149+
reqLogger.V(consts.LogLevelInfo).Info("Updating CR Status", "Status", instance.Status)
150+
if err := r.Status().Update(ctx, instance); err != nil {
151+
reqLogger.Error(err, "Failed to update CR status")
152+
return err
153+
}
154+
cr.Status.State = instance.Status.State
155+
cr.Status.Namespace = instance.Status.Namespace
156+
return nil
157+
}
158+
159+
// enqueueAllGPUClusterConfigs enqueues every instance so each is reconciled when any
160+
// instance or owned resource changes.
161+
func (r *GPUClusterConfigReconciler) enqueueAllGPUClusterConfigs(ctx context.Context) []reconcile.Request {
162+
logger := log.FromContext(ctx)
163+
list := &nvidiav1alpha1.GPUClusterConfigList{}
164+
165+
if err := r.List(ctx, list); err != nil {
166+
logger.Error(err, "Unable to list GPUClusterConfig resources")
167+
return []reconcile.Request{}
168+
}
169+
170+
reconcileRequests := make([]reconcile.Request, 0, len(list.Items))
171+
for _, config := range list.Items {
172+
reconcileRequests = append(reconcileRequests,
173+
reconcile.Request{
174+
NamespacedName: types.NamespacedName{
175+
Name: config.GetName(),
176+
},
177+
})
178+
}
179+
180+
return reconcileRequests
181+
}
182+
183+
func (r *GPUClusterConfigReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error {
184+
// No operands are rendered yet (empty state set).
185+
stateManager, err := state.NewManager(
186+
nvidiav1alpha1.GPUClusterConfigCRDName,
187+
r.Namespace,
188+
mgr.GetClient(),
189+
mgr.GetScheme())
190+
if err != nil {
191+
return fmt.Errorf("error creating state manager: %v", err)
192+
}
193+
r.stateManager = stateManager
194+
195+
r.conditionUpdater = conditions.NewGPUClusterConfigUpdater(mgr.GetClient())
196+
197+
c, err := controller.New("gpu-cluster-config-controller", mgr, controller.Options{
198+
Reconciler: r,
199+
MaxConcurrentReconciles: 1,
200+
RateLimiter: workqueue.NewTypedItemExponentialFailureRateLimiter[reconcile.Request](minDelayCR, maxDelayCR),
201+
})
202+
if err != nil {
203+
return err
204+
}
205+
206+
gpuClusterConfigMapFn := func(ctx context.Context, _ *nvidiav1alpha1.GPUClusterConfig) []reconcile.Request {
207+
return r.enqueueAllGPUClusterConfigs(ctx)
208+
}
209+
210+
err = c.Watch(source.Kind(
211+
mgr.GetCache(),
212+
&nvidiav1alpha1.GPUClusterConfig{},
213+
handler.TypedEnqueueRequestsFromMapFunc(gpuClusterConfigMapFn),
214+
predicate.TypedGenerationChangedPredicate[*nvidiav1alpha1.GPUClusterConfig]{},
215+
),
216+
)
217+
if err != nil {
218+
return err
219+
}
220+
221+
// Watch the secondary resources each state manager owns.
222+
watchSources := stateManager.GetWatchSources(mgr)
223+
for _, watchSource := range watchSources {
224+
err = c.Watch(
225+
watchSource,
226+
)
227+
if err != nil {
228+
return fmt.Errorf("error setting up Watch for source type %v: %w", watchSource, err)
229+
}
230+
}
231+
232+
return nil
233+
}

0 commit comments

Comments
 (0)