Skip to content

Commit 4f51daa

Browse files
committed
Add DRA DaemonSet reconciler
Implement a DRAReconciler that manages DRA driver DaemonSets from Module spec.dra, mirroring the DevicePluginReconciler pattern. The reconciler creates DaemonSets with mandatory kubelet-plugins and kubelet-plugins-registry host-path volumes, DRA-specific container names, and a DaemonSetRole label to distinguish DRA DaemonSets. Status patching reports targeted nodes and available pods in status.dra, and clearing status.dra when spec.dra is removed. Also fixes DevicePluginReconciler to exclude DRA-labeled DaemonSets from its device-plugin list, preventing cross-reconciler interference. Additionally addresses review feedback: - Rename handleModuleDeletion to deleteDRADaemonSets and deleteDevicePluginDaemonSets for clarity - Add DevicePluginRoleLabelValue label to device plugin DaemonSets; the exclusion-based query is kept for backward compatibility with a TODO(3.0) to switch to direct role-label querying - Add cleanup logic to DevicePluginReconciler to delete existing DaemonSets and clear status when spec.devicePlugin is removed
1 parent 888c952 commit 4f51daa

7 files changed

Lines changed: 1424 additions & 32 deletions

internal/constants/constants.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ const (
2525
PrivateSignDataKey = "key"
2626

2727
ModuleLoaderRoleLabelValue = "module-loader"
28+
DevicePluginRoleLabelValue = "device-plugin"
29+
DRARoleLabelValue = "dra"
2830

2931
OperatorNamespaceEnvVar = "OPERATOR_NAMESPACE"
3032
)

internal/controllers/device_plugin_reconciler.go

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,21 @@ func (r *DevicePluginReconciler) Reconcile(ctx context.Context, mod *kmmv1beta1.
100100
if err = r.reconHelperAPI.removeDevicePluginTargetLabels(ctx, mod); err != nil {
101101
return ctrl.Result{}, fmt.Errorf("could not remove device-plugin-target labels on deletion: %v", err)
102102
}
103-
err = r.reconHelperAPI.handleModuleDeletion(ctx, existingDevicePluginDS)
103+
err = r.reconHelperAPI.deleteDevicePluginDaemonSets(ctx, existingDevicePluginDS)
104104
return ctrl.Result{}, err
105105
}
106106

107107
r.reconHelperAPI.setKMMOMetrics(ctx)
108108

109+
if mod.Spec.DevicePlugin == nil {
110+
if len(existingDevicePluginDS) > 0 {
111+
if err = r.reconHelperAPI.deleteDevicePluginDaemonSets(ctx, existingDevicePluginDS); err != nil {
112+
return ctrl.Result{}, err
113+
}
114+
}
115+
return ctrl.Result{}, r.reconHelperAPI.clearDevicePluginStatus(ctx, mod)
116+
}
117+
109118
if err = r.reconHelperAPI.handleDevicePluginTargetLabels(ctx, mod); err != nil {
110119
return res, fmt.Errorf("could not reconcile device-plugin-target labels: %v", err)
111120
}
@@ -131,16 +140,17 @@ func (r *DevicePluginReconciler) Reconcile(ctx context.Context, mod *kmmv1beta1.
131140
return res, nil
132141
}
133142

134-
//go:generate mockgen -source=device_plugin_reconciler.go -package=controllers -destination=mock_device_plugin_reconciler.go devicePluginReconcilerHelperAPI
143+
//go:generate mockgen -source=device_plugin_reconciler.go -package=controllers -destination=mock_device_plugin_reconciler.go devicePluginReconcilerHelperAPI,daemonSetCreator
135144

136145
type devicePluginReconcilerHelperAPI interface {
137146
setKMMOMetrics(ctx context.Context)
138147
handleDevicePlugin(ctx context.Context, mod *kmmv1beta1.Module, existingDevicePluginDS []appsv1.DaemonSet) error
139148
handleDevicePluginTargetLabels(ctx context.Context, mod *kmmv1beta1.Module) error
140149
removeDevicePluginTargetLabels(ctx context.Context, mod *kmmv1beta1.Module) error
141150
garbageCollect(ctx context.Context, mod *kmmv1beta1.Module, existingDS []appsv1.DaemonSet) error
142-
handleModuleDeletion(ctx context.Context, existingDevicePluginDS []appsv1.DaemonSet) error
151+
deleteDevicePluginDaemonSets(ctx context.Context, existingDevicePluginDS []appsv1.DaemonSet) error
143152
moduleUpdateDevicePluginStatus(ctx context.Context, mod *kmmv1beta1.Module, existingDevicePluginDS []appsv1.DaemonSet) error
153+
clearDevicePluginStatus(ctx context.Context, mod *kmmv1beta1.Module) error
144154
getModuleDevicePluginDaemonSets(ctx context.Context, name, namespace string) ([]appsv1.DaemonSet, error)
145155
}
146156

@@ -176,9 +186,11 @@ func (dprh *devicePluginReconcilerHelper) getModuleDevicePluginDaemonSets(ctx co
176186
}
177187

178188
devicePluginsList := make([]appsv1.DaemonSet, 0, len(dsList.Items))
179-
// remove the older version module loader daemonsets
189+
// TODO(3.0): switch to querying by DevicePluginRoleLabelValue directly once all
190+
// clusters have been upgraded and existing DaemonSets have the role label.
180191
for _, ds := range dsList.Items {
181-
if ds.GetLabels()[constants.DaemonSetRole] != constants.ModuleLoaderRoleLabelValue {
192+
role := ds.GetLabels()[constants.DaemonSetRole]
193+
if role != constants.ModuleLoaderRoleLabelValue && role != constants.DRARoleLabelValue {
182194
devicePluginsList = append(devicePluginsList, ds)
183195
}
184196
}
@@ -286,7 +298,7 @@ func (dprh *devicePluginReconcilerHelper) garbageCollect(ctx context.Context,
286298
return nil
287299
}
288300

289-
func (dprh *devicePluginReconcilerHelper) handleModuleDeletion(ctx context.Context, existingDevicePluginDS []appsv1.DaemonSet) error {
301+
func (dprh *devicePluginReconcilerHelper) deleteDevicePluginDaemonSets(ctx context.Context, existingDevicePluginDS []appsv1.DaemonSet) error {
290302
// delete all the Device Plugin Daemonset, in order to allow worker pods to delete kernel modules
291303
for _, ds := range existingDevicePluginDS {
292304
err := dprh.client.Delete(ctx, &ds)
@@ -369,7 +381,18 @@ func (dprh *devicePluginReconcilerHelper) moduleUpdateDevicePluginStatus(ctx con
369381
return dprh.client.Status().Patch(ctx, mod, client.MergeFrom(unmodifiedMod))
370382
}
371383

372-
//go:generate mockgen -source=device_plugin_reconciler.go -package=controllers -destination=mock_device_plugin_reconciler.go daemonSetCreator
384+
func (dprh *devicePluginReconcilerHelper) clearDevicePluginStatus(ctx context.Context, mod *kmmv1beta1.Module) error {
385+
emptyStatus := kmmv1beta1.DaemonSetStatus{}
386+
if mod.Status.DevicePlugin == emptyStatus {
387+
return nil
388+
}
389+
390+
unmodifiedMod := mod.DeepCopy()
391+
392+
mod.Status.DevicePlugin = kmmv1beta1.DaemonSetStatus{}
393+
394+
return dprh.client.Status().Patch(ctx, mod, client.MergeFrom(unmodifiedMod))
395+
}
373396

374397
type daemonSetCreator interface {
375398
setDevicePluginAsDesired(ctx context.Context, ds *appsv1.DaemonSet, mod *kmmv1beta1.Module) error
@@ -467,7 +490,10 @@ func generatePodContainerSpec(containerSpec *kmmv1beta1.DevicePluginContainerSpe
467490
}
468491

469492
func generateDevicePluginLabelsAndSelector(mod *kmmv1beta1.Module) (map[string]string, map[string]string) {
470-
labels := map[string]string{constants.ModuleNameLabel: mod.Name}
493+
labels := map[string]string{
494+
constants.ModuleNameLabel: mod.Name,
495+
constants.DaemonSetRole: constants.DevicePluginRoleLabelValue,
496+
}
471497
nodeSelector := map[string]string{
472498
utils.GetKernelModuleReadyNodeLabel(mod.Namespace, mod.Name): "",
473499
utils.GetDevicePluginTargetNodeLabel(mod.Namespace, mod.Name): "",

internal/controllers/device_plugin_reconciler_test.go

Lines changed: 116 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ var _ = Describe("DevicePluginReconciler_Reconcile", func() {
4141

4242
mod = &kmmv1beta1.Module{
4343
ObjectMeta: metav1.ObjectMeta{Namespace: namespace, Name: moduleName},
44+
Spec: kmmv1beta1.ModuleSpec{
45+
DevicePlugin: &kmmv1beta1.DevicePluginSpec{},
46+
},
4447
}
4548

4649
dpr = &DevicePluginReconciler{
@@ -115,7 +118,7 @@ var _ = Describe("DevicePluginReconciler_Reconcile", func() {
115118
gomock.InOrder(
116119
mockReconHelper.EXPECT().getModuleDevicePluginDaemonSets(ctx, mod.Name, mod.Namespace).Return(devicePluginDS, nil),
117120
mockReconHelper.EXPECT().removeDevicePluginTargetLabels(ctx, mod).Return(nil),
118-
mockReconHelper.EXPECT().handleModuleDeletion(ctx, devicePluginDS).Return(nil),
121+
mockReconHelper.EXPECT().deleteDevicePluginDaemonSets(ctx, devicePluginDS).Return(nil),
119122
)
120123

121124
res, err := dpr.Reconcile(ctx, mod)
@@ -132,18 +135,64 @@ var _ = Describe("DevicePluginReconciler_Reconcile", func() {
132135
Expect(res).To(Equal(reconcile.Result{}))
133136
Expect(err).To(HaveOccurred())
134137

135-
By("error flow - handleModuleDeletion fails")
138+
By("error flow - deleteDevicePluginDaemonSets fails")
136139
gomock.InOrder(
137140
mockReconHelper.EXPECT().getModuleDevicePluginDaemonSets(ctx, mod.Name, mod.Namespace).Return(devicePluginDS, nil),
138141
mockReconHelper.EXPECT().removeDevicePluginTargetLabels(ctx, mod).Return(nil),
139-
mockReconHelper.EXPECT().handleModuleDeletion(ctx, devicePluginDS).Return(fmt.Errorf("some error")),
142+
mockReconHelper.EXPECT().deleteDevicePluginDaemonSets(ctx, devicePluginDS).Return(fmt.Errorf("some error")),
140143
)
141144

142145
res, err = dpr.Reconcile(ctx, mod)
143146
Expect(res).To(Equal(reconcile.Result{}))
144147
Expect(err).To(HaveOccurred())
145148
})
146149

150+
It("no-op when spec.devicePlugin is nil and no existing DaemonSets", func() {
151+
mod.Spec.DevicePlugin = nil
152+
gomock.InOrder(
153+
mockReconHelper.EXPECT().getModuleDevicePluginDaemonSets(ctx, mod.Name, mod.Namespace).Return(nil, nil),
154+
mockReconHelper.EXPECT().setKMMOMetrics(ctx),
155+
mockReconHelper.EXPECT().clearDevicePluginStatus(ctx, mod).Return(nil),
156+
)
157+
158+
res, err := dpr.Reconcile(ctx, mod)
159+
160+
Expect(res).To(Equal(reconcile.Result{}))
161+
Expect(err).NotTo(HaveOccurred())
162+
})
163+
164+
It("cleanup when spec.devicePlugin is nil but existing DaemonSets present", func() {
165+
mod.Spec.DevicePlugin = nil
166+
devicePluginDS := []appsv1.DaemonSet{{}}
167+
168+
gomock.InOrder(
169+
mockReconHelper.EXPECT().getModuleDevicePluginDaemonSets(ctx, mod.Name, mod.Namespace).Return(devicePluginDS, nil),
170+
mockReconHelper.EXPECT().setKMMOMetrics(ctx),
171+
mockReconHelper.EXPECT().deleteDevicePluginDaemonSets(ctx, devicePluginDS).Return(nil),
172+
mockReconHelper.EXPECT().clearDevicePluginStatus(ctx, mod).Return(nil),
173+
)
174+
175+
res, err := dpr.Reconcile(ctx, mod)
176+
177+
Expect(res).To(Equal(reconcile.Result{}))
178+
Expect(err).NotTo(HaveOccurred())
179+
})
180+
181+
It("cleanup when spec.devicePlugin is nil and clearDevicePluginStatus fails", func() {
182+
mod.Spec.DevicePlugin = nil
183+
184+
gomock.InOrder(
185+
mockReconHelper.EXPECT().getModuleDevicePluginDaemonSets(ctx, mod.Name, mod.Namespace).Return(nil, nil),
186+
mockReconHelper.EXPECT().setKMMOMetrics(ctx),
187+
mockReconHelper.EXPECT().clearDevicePluginStatus(ctx, mod).Return(fmt.Errorf("some error")),
188+
)
189+
190+
res, err := dpr.Reconcile(ctx, mod)
191+
192+
Expect(res).To(Equal(reconcile.Result{}))
193+
Expect(err).To(HaveOccurred())
194+
})
195+
147196
})
148197

149198
var _ = Describe("DevicePluginReconciler_handleDevicePlugin", func() {
@@ -337,7 +386,7 @@ var _ = Describe("DevicePluginReconciler_garbageCollect", func() {
337386
})
338387
})
339388

340-
var _ = Describe("DevicePluginReconciler_handleModuleDeletion", func() {
389+
var _ = Describe("DevicePluginReconciler_deleteDevicePluginDaemonSets", func() {
341390
var (
342391
ctrl *gomock.Controller
343392
clnt *client.MockClient
@@ -358,14 +407,14 @@ var _ = Describe("DevicePluginReconciler_handleModuleDeletion", func() {
358407
It("good flow", func() {
359408
clnt.EXPECT().Delete(ctx, &existingDevicePluginDS[0]).Return(nil)
360409

361-
err := dprh.handleModuleDeletion(ctx, existingDevicePluginDS)
410+
err := dprh.deleteDevicePluginDaemonSets(ctx, existingDevicePluginDS)
362411
Expect(err).NotTo(HaveOccurred())
363412
})
364413

365414
It("error flow", func() {
366415
clnt.EXPECT().Delete(ctx, &existingDevicePluginDS[0]).Return(fmt.Errorf("some error"))
367416

368-
err := dprh.handleModuleDeletion(ctx, existingDevicePluginDS)
417+
err := dprh.deleteDevicePluginDaemonSets(ctx, existingDevicePluginDS)
369418
Expect(err).To(HaveOccurred())
370419
})
371420
})
@@ -556,6 +605,53 @@ var _ = Describe("DevicePluginReconciler_moduleUpdateDevicePluginStatus", func()
556605
)
557606
})
558607

608+
var _ = Describe("DevicePluginReconciler_clearDevicePluginStatus", func() {
609+
var (
610+
ctrl *gomock.Controller
611+
clnt *client.MockClient
612+
statusWriter *client.MockStatusWriter
613+
dprh devicePluginReconcilerHelper
614+
)
615+
616+
BeforeEach(func() {
617+
ctrl = gomock.NewController(GinkgoT())
618+
clnt = client.NewMockClient(ctrl)
619+
statusWriter = client.NewMockStatusWriter(ctrl)
620+
dprh = devicePluginReconcilerHelper{
621+
client: clnt,
622+
}
623+
})
624+
625+
ctx := context.Background()
626+
627+
It("should be a no-op when status.devicePlugin is already empty", func() {
628+
mod := kmmv1beta1.Module{}
629+
err := dprh.clearDevicePluginStatus(ctx, &mod)
630+
Expect(err).NotTo(HaveOccurred())
631+
})
632+
633+
It("should clear status.devicePlugin when it has values", func() {
634+
mod := kmmv1beta1.Module{
635+
Status: kmmv1beta1.ModuleStatus{
636+
DevicePlugin: kmmv1beta1.DaemonSetStatus{
637+
NodesMatchingSelectorNumber: 3,
638+
DesiredNumber: 3,
639+
AvailableNumber: 2,
640+
},
641+
},
642+
}
643+
644+
expectedMod := mod.DeepCopy()
645+
expectedMod.Status.DevicePlugin = kmmv1beta1.DaemonSetStatus{}
646+
647+
clnt.EXPECT().Status().Return(statusWriter)
648+
statusWriter.EXPECT().Patch(ctx, expectedMod, gomock.Any())
649+
650+
err := dprh.clearDevicePluginStatus(ctx, &mod)
651+
Expect(err).NotTo(HaveOccurred())
652+
})
653+
})
654+
559655
var _ = Describe("DevicePluginReconciler_setDevicePluginAsDesired", func() {
560656
const (
561657
devicePluginImage = "device-plugin-image"
@@ -747,7 +843,10 @@ var _ = Describe("DevicePluginReconciler_setDevicePluginAsDesired", func() {
747843
err := dsc.setDevicePluginAsDesired(context.Background(), &ds, &mod)
748844
Expect(err).NotTo(HaveOccurred())
749845

750-
podLabels := map[string]string{constants.ModuleNameLabel: moduleName}
846+
podLabels := map[string]string{
847+
constants.ModuleNameLabel: moduleName,
848+
constants.DaemonSetRole: constants.DevicePluginRoleLabelValue,
849+
}
751850

752851
expectedInitContainer := []v1.Container{
753852
{
@@ -927,7 +1026,7 @@ var _ = Describe("DevicePluginReconciler_getModuleDevicePluginDaemonSets", func(
9271026
Expect(dsList).To(BeNil())
9281027
})
9291028

930-
It("good flow, return only device plugin DSs, either v2 or v1", func() {
1029+
It("good flow, return only device plugin DSs, excluding module-loader and DRA roles", func() {
9311030
ds1 := appsv1.DaemonSet{
9321031
ObjectMeta: metav1.ObjectMeta{
9331032
Labels: map[string]string{
@@ -951,9 +1050,17 @@ var _ = Describe("DevicePluginReconciler_getModuleDevicePluginDaemonSets", func(
9511050
},
9521051
},
9531052
}
1053+
ds4 := appsv1.DaemonSet{
1054+
ObjectMeta: metav1.ObjectMeta{
1055+
Labels: map[string]string{
1056+
constants.ModuleNameLabel: "some name",
1057+
constants.DaemonSetRole: constants.DRARoleLabelValue,
1058+
},
1059+
},
1060+
}
9541061
clnt.EXPECT().List(ctx, gomock.Any(), gomock.Any()).DoAndReturn(
9551062
func(_ interface{}, list *appsv1.DaemonSetList, _ ...interface{}) error {
956-
list.Items = []appsv1.DaemonSet{ds1, ds2, ds3}
1063+
list.Items = []appsv1.DaemonSet{ds1, ds2, ds3, ds4}
9571064
return nil
9581065
},
9591066
)

0 commit comments

Comments
 (0)