Skip to content

Commit fe736f1

Browse files
authored
Merge pull request #452 from klueska/additional-namespaces
Add a MultiNamespaceDaemonsetManager for ComputeDomains
2 parents 12f1905 + 90396de commit fe736f1

18 files changed

Lines changed: 315 additions & 108 deletions

File tree

cmd/compute-domain-controller/cleanup.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,9 @@ func (m *CleanupManager[T]) Start(ctx context.Context) error {
7474
}
7575

7676
func (m *CleanupManager[T]) Stop() error {
77-
m.cancelContext()
77+
if m.cancelContext != nil {
78+
m.cancelContext()
79+
}
7880
m.waitGroup.Wait()
7981
return nil
8082
}
@@ -146,6 +148,7 @@ func (m *CleanupManager[T]) periodicCleanup(ctx context.Context) {
146148
ticker := time.NewTicker(cleanupInterval)
147149
defer ticker.Stop()
148150

151+
m.cleanup(ctx)
149152
for {
150153
select {
151154
case <-ctx.Done():

cmd/compute-domain-controller/computedomain.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ type ComputeDomainManager struct {
6262
factory nvinformers.SharedInformerFactory
6363
informer cache.SharedIndexInformer
6464

65-
daemonSetManager *DaemonSetManager
65+
daemonSetManager *MultiNamespaceDaemonSetManager
6666
resourceClaimTemplateManager *WorkloadResourceClaimTemplateManager
6767
nodeManager *NodeManager
6868
}
@@ -77,7 +77,8 @@ func NewComputeDomainManager(config *ManagerConfig) *ComputeDomainManager {
7777
factory: factory,
7878
informer: informer,
7979
}
80-
m.daemonSetManager = NewDaemonSetManager(config, m.Get)
80+
81+
m.daemonSetManager = NewMultiNamespaceDaemonSetManager(config, m.Get)
8182
m.resourceClaimTemplateManager = NewWorkloadResourceClaimTemplateManager(config, m.Get)
8283
m.nodeManager = NewNodeManager(config, m.Get)
8384

@@ -151,7 +152,9 @@ func (m *ComputeDomainManager) Stop() error {
151152
if err := m.nodeManager.Stop(); err != nil {
152153
return fmt.Errorf("error stopping Node manager: %w", err)
153154
}
154-
m.cancelContext()
155+
if m.cancelContext != nil {
156+
m.cancelContext()
157+
}
155158
m.waitGroup.Wait()
156159
return nil
157160
}
@@ -274,7 +277,7 @@ func (m *ComputeDomainManager) onAddOrUpdate(ctx context.Context, obj any) error
274277
// Do not wait for the next periodic label cleanup to happen.
275278
m.nodeManager.RemoveStaleComputeDomainLabelsAsync(ctx)
276279

277-
if _, err := m.daemonSetManager.Create(ctx, m.config.driverNamespace, cd); err != nil {
280+
if _, err := m.daemonSetManager.Create(ctx, cd); err != nil {
278281
return fmt.Errorf("error creating DaemonSet: %w", err)
279282
}
280283

cmd/compute-domain-controller/controller.go

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@ type ManagerConfig struct {
4343

4444
// workQueue manages the asynchronous processing of tasks
4545
workQueue *workqueue.WorkQueue
46+
47+
// additionalNamespaces is a list of additional namespaces
48+
// where the driver can manage resources
49+
additionalNamespaces []string
4650
}
4751

4852
// Controller manages the lifecycle of the DRA driver and its components.
@@ -63,11 +67,12 @@ func (c *Controller) Run(ctx context.Context) error {
6367
workQueue := workqueue.New(workqueue.DefaultControllerRateLimiter())
6468

6569
managerConfig := &ManagerConfig{
66-
driverName: c.config.driverName,
67-
driverNamespace: c.config.flags.namespace,
68-
imageName: c.config.flags.imageName,
69-
clientsets: c.config.clientsets,
70-
workQueue: workQueue,
70+
driverName: c.config.driverName,
71+
driverNamespace: c.config.flags.namespace,
72+
additionalNamespaces: c.config.flags.additionalNamespaces.Value(),
73+
imageName: c.config.flags.imageName,
74+
clientsets: c.config.clientsets,
75+
workQueue: workQueue,
7176
}
7277

7378
cdManager := NewComputeDomainManager(managerConfig)

cmd/compute-domain-controller/daemonset.go

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ func NewDaemonSetManager(config *ManagerConfig, getComputeDomain GetComputeDomai
8585
informers.WithTweakListOptions(func(opts *metav1.ListOptions) {
8686
opts.LabelSelector = metav1.FormatLabelSelector(labelSelector)
8787
}),
88+
informers.WithNamespace(config.driverNamespace),
8889
)
8990

9091
informer := factory.Apps().V1().DaemonSets().Informer()
@@ -162,12 +163,14 @@ func (m *DaemonSetManager) Stop() error {
162163
if err := m.resourceClaimTemplateManager.Stop(); err != nil {
163164
return fmt.Errorf("error stopping ResourceClaimTemplate manager: %w", err)
164165
}
165-
m.cancelContext()
166+
if m.cancelContext != nil {
167+
m.cancelContext()
168+
}
166169
m.waitGroup.Wait()
167170
return nil
168171
}
169172

170-
func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nvapi.ComputeDomain) (*appsv1.DaemonSet, error) {
173+
func (m *DaemonSetManager) Create(ctx context.Context, cd *nvapi.ComputeDomain) (*appsv1.DaemonSet, error) {
171174
ds, err := getByComputeDomainUID[*appsv1.DaemonSet](ctx, m.mutationCache, string(cd.UID))
172175
if err != nil {
173176
return nil, fmt.Errorf("error retrieving DaemonSet: %w", err)
@@ -179,7 +182,7 @@ func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nva
179182
return ds[0], nil
180183
}
181184

182-
rct, err := m.resourceClaimTemplateManager.Create(ctx, namespace, cd)
185+
rct, err := m.resourceClaimTemplateManager.Create(ctx, cd)
183186
if err != nil {
184187
return nil, fmt.Errorf("error creating ResourceClaimTemplate: %w", err)
185188
}
@@ -229,6 +232,20 @@ func (m *DaemonSetManager) Create(ctx context.Context, namespace string, cd *nva
229232
return d, nil
230233
}
231234

235+
func (m *DaemonSetManager) Get(ctx context.Context, cdUID string) (*appsv1.DaemonSet, error) {
236+
ds, err := getByComputeDomainUID[*appsv1.DaemonSet](ctx, m.mutationCache, cdUID)
237+
if err != nil {
238+
return nil, fmt.Errorf("error retrieving DaemonSet: %w", err)
239+
}
240+
if len(ds) > 1 {
241+
return nil, fmt.Errorf("more than one DaemonSet found with same ComputeDomain UID")
242+
}
243+
if len(ds) == 0 {
244+
return nil, nil
245+
}
246+
return ds[0], nil
247+
}
248+
232249
func (m *DaemonSetManager) Delete(ctx context.Context, cdUID string) error {
233250
ds, err := getByComputeDomainUID[*appsv1.DaemonSet](ctx, m.mutationCache, cdUID)
234251
if err != nil {
@@ -335,11 +352,6 @@ func (m *DaemonSetManager) onAddOrUpdate(ctx context.Context, obj any) error {
335352
return fmt.Errorf("failed to cast to DaemonSet")
336353
}
337354

338-
// Only process events from the driver namespace
339-
if d.Namespace != m.config.driverNamespace {
340-
return nil
341-
}
342-
343355
klog.Infof("Processing added or updated DaemonSet: %s/%s", d.Namespace, d.Name)
344356

345357
cd, err := m.getComputeDomain(d.Labels[computeDomainLabelKey])

cmd/compute-domain-controller/main.go

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ type Flags struct {
5858
httpEndpoint string
5959
metricsPath string
6060
profilePath string
61+
62+
additionalNamespaces cli.StringSlice
6163
}
6264

6365
type Config struct {
@@ -123,6 +125,12 @@ func newApp() *cli.App {
123125
Destination: &flags.profilePath,
124126
EnvVars: []string{"PPROF_PATH"},
125127
},
128+
&cli.StringSliceFlag{
129+
Name: "additional-namespaces",
130+
Usage: "Additional namespaces where the driver can manage resources.",
131+
Destination: &flags.additionalNamespaces,
132+
EnvVars: []string{"ADDITIONAL_NAMESPACES"},
133+
},
126134
}
127135

128136
cliFlags = append(cliFlags, flags.kubeClientConfig.Flags()...)
@@ -173,14 +181,18 @@ func newApp() *cli.App {
173181
errChan <- controller.Run(ctx)
174182
}()
175183

176-
<-sigs
177-
cancel()
178-
179-
if err := <-errChan; err != nil {
180-
return fmt.Errorf("run controller: %w", err)
184+
for {
185+
select {
186+
case <-sigs:
187+
cancel()
188+
case err := <-errChan:
189+
cancel()
190+
if err != nil {
191+
return fmt.Errorf("run controller: %w", err)
192+
}
193+
return nil
194+
}
181195
}
182-
183-
return nil
184196
},
185197
Version: info.GetVersionString(),
186198
}
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/*
2+
* Copyright (c) 2025 NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"context"
21+
"fmt"
22+
23+
appsv1 "k8s.io/api/apps/v1"
24+
25+
nvapi "github.com/NVIDIA/k8s-dra-driver-gpu/api/nvidia.com/resource/v1beta1"
26+
)
27+
28+
// MultiNamespaceDaemonSetManager manages DaemonSets across multiple namespaces.
29+
type MultiNamespaceDaemonSetManager struct {
30+
config *ManagerConfig
31+
managers map[string]*DaemonSetManager
32+
}
33+
34+
// NewMultiNamespaceDaemonSetManager creates a new multi-namespace DaemonSet manager
35+
// It creates individual DaemonSet managers for the driver namespace and all additional namespaces.
36+
func NewMultiNamespaceDaemonSetManager(config *ManagerConfig, getComputeDomain GetComputeDomainFunc) *MultiNamespaceDaemonSetManager {
37+
m := &MultiNamespaceDaemonSetManager{
38+
config: config,
39+
managers: make(map[string]*DaemonSetManager),
40+
}
41+
42+
// Use a map to deduplicate namespaces (driver namespace + additional namespaces)
43+
uniqueNamespaces := make(map[string]struct{})
44+
uniqueNamespaces[config.driverNamespace] = struct{}{}
45+
for _, ns := range config.additionalNamespaces {
46+
uniqueNamespaces[ns] = struct{}{}
47+
}
48+
49+
// Create managers for unique namespaces
50+
for ns := range uniqueNamespaces {
51+
configNew := *config
52+
configNew.driverNamespace = ns
53+
configNew.additionalNamespaces = nil
54+
m.managers[ns] = NewDaemonSetManager(&configNew, getComputeDomain)
55+
}
56+
57+
return m
58+
}
59+
60+
// Start starts all DaemonSet managers for all namespaces.
61+
func (m *MultiNamespaceDaemonSetManager) Start(ctx context.Context) error {
62+
for ns, manager := range m.managers {
63+
if err := manager.Start(ctx); err != nil {
64+
return fmt.Errorf("failed to start DaemonSet manager for namespace %s: %w", ns, err)
65+
}
66+
}
67+
return nil
68+
}
69+
70+
// Stop stops all DaemonSet managers for all namespaces.
71+
func (m *MultiNamespaceDaemonSetManager) Stop() error {
72+
for ns, manager := range m.managers {
73+
if err := manager.Stop(); err != nil {
74+
return fmt.Errorf("failed to stop DaemonSet manager for namespace %s: %w", ns, err)
75+
}
76+
}
77+
return nil
78+
}
79+
80+
// Create creates a DaemonSet in the provided namespace.
81+
func (m *MultiNamespaceDaemonSetManager) Create(ctx context.Context, cd *nvapi.ComputeDomain) (*appsv1.DaemonSet, error) {
82+
for ns, manager := range m.managers {
83+
ds, err := manager.Get(ctx, string(cd.UID))
84+
if err != nil {
85+
return nil, fmt.Errorf("failed to get DaemonSet in namespace %s: %w", ns, err)
86+
}
87+
if ds != nil {
88+
return ds, nil
89+
}
90+
}
91+
manager, exists := m.managers[m.config.driverNamespace]
92+
if !exists {
93+
return nil, fmt.Errorf("no DaemonSet manager found for namespace %s", m.config.driverNamespace)
94+
}
95+
return manager.Create(ctx, cd)
96+
}
97+
98+
// Delete deletes DaemonSets across all namespaces for the given ComputeDomain UID.
99+
func (m *MultiNamespaceDaemonSetManager) Delete(ctx context.Context, cdUID string) error {
100+
for ns, manager := range m.managers {
101+
if err := manager.Delete(ctx, cdUID); err != nil {
102+
return fmt.Errorf("failed to delete DaemonSet in namespace %s: %w", ns, err)
103+
}
104+
}
105+
return nil
106+
}
107+
108+
// RemoveFinalizer removes finalizers from DaemonSets across all namespaces.
109+
func (m *MultiNamespaceDaemonSetManager) RemoveFinalizer(ctx context.Context, cdUID string) error {
110+
for ns, manager := range m.managers {
111+
if err := manager.RemoveFinalizer(ctx, cdUID); err != nil {
112+
return fmt.Errorf("failed to remove finalizer from DaemonSet in namespace %s: %w", ns, err)
113+
}
114+
}
115+
return nil
116+
}
117+
118+
// AssertRemoved asserts that DaemonSets are removed across all namespaces.
119+
func (m *MultiNamespaceDaemonSetManager) AssertRemoved(ctx context.Context, cdUID string) error {
120+
for ns, manager := range m.managers {
121+
if err := manager.AssertRemoved(ctx, cdUID); err != nil {
122+
return fmt.Errorf("failed to assert DaemonSet removal in namespace %s: %w", ns, err)
123+
}
124+
}
125+
return nil
126+
}

cmd/compute-domain-controller/node.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,9 @@ func (m *NodeManager) Stop() error {
101101
if err := m.labelCleanupManager.Stop(); err != nil {
102102
return fmt.Errorf("NodeManager: error stopping labelCleanupManager: %w", err)
103103
}
104-
m.cancelContext()
104+
if m.cancelContext != nil {
105+
m.cancelContext()
106+
}
105107
m.waitGroup.Wait()
106108
return nil
107109
}

cmd/compute-domain-controller/resourceclaimtemplate.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -150,7 +150,9 @@ func (m *BaseResourceClaimTemplateManager) Start(ctx context.Context) (rerr erro
150150
}
151151

152152
func (m *BaseResourceClaimTemplateManager) Stop() error {
153-
m.cancelContext()
153+
if m.cancelContext != nil {
154+
m.cancelContext()
155+
}
154156
m.waitGroup.Wait()
155157
return nil
156158
}
@@ -300,7 +302,7 @@ func NewDaemonSetResourceClaimTemplateManager(config *ManagerConfig, getComputeD
300302
return m
301303
}
302304

303-
func (m *DaemonSetResourceClaimTemplateManager) Create(ctx context.Context, namespace string, cd *nvapi.ComputeDomain) (*resourceapi.ResourceClaimTemplate, error) {
305+
func (m *DaemonSetResourceClaimTemplateManager) Create(ctx context.Context, cd *nvapi.ComputeDomain) (*resourceapi.ResourceClaimTemplate, error) {
304306
rcts, err := getByComputeDomainUID[*resourceapi.ResourceClaimTemplate](ctx, m.mutationCache, string(cd.UID))
305307
if err != nil {
306308
return nil, fmt.Errorf("error retrieving ResourceClaimTemplate: %w", err)
@@ -316,7 +318,7 @@ func (m *DaemonSetResourceClaimTemplateManager) Create(ctx context.Context, name
316318
daemonConfig.DomainID = string(cd.UID)
317319

318320
templateData := ResourceClaimTemplateTemplateData{
319-
Namespace: namespace,
321+
Namespace: m.config.driverNamespace,
320322
GenerateName: fmt.Sprintf("%s-daemon-claim-template-", cd.Name),
321323
Finalizer: computeDomainFinalizer,
322324
ComputeDomainLabelKey: computeDomainLabelKey,

cmd/compute-domain-daemon/computedomain.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,9 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) {
114114

115115
// Stop stops the compute domain manager.
116116
func (m *ComputeDomainManager) Stop() error {
117-
m.cancelContext()
117+
if m.cancelContext != nil {
118+
m.cancelContext()
119+
}
118120
m.waitGroup.Wait()
119121
return nil
120122
}

cmd/compute-domain-kubelet-plugin/computedomain.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,9 @@ func (m *ComputeDomainManager) Start(ctx context.Context) (rerr error) {
121121
}
122122

123123
func (m *ComputeDomainManager) Stop() error {
124-
m.cancelContext()
124+
if m.cancelContext != nil {
125+
m.cancelContext()
126+
}
125127
m.waitGroup.Wait()
126128
return nil
127129
}

0 commit comments

Comments
 (0)