Skip to content

Commit 8302e3f

Browse files
TomerNewmankubernetes-prow[bot]
authored andcommitted
Add DRA support to ordered-upgrade state machine
Extend the NodeLabelModuleVersionReconciler to handle DRA modules through the same ordered-upgrade label action table used by device plugin modules. The reconciler now detects whether a module uses DRA (via spec.dra) and routes label resolution to the version-dra label instead of version-device-plugin, reusing the existing state machine without modification.
1 parent 29c905a commit 8302e3f

14 files changed

Lines changed: 481 additions & 257 deletions

internal/constants/constants.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,9 @@ const (
1111
DaemonSetRole = "kmm.node.kubernetes.io/role"
1212
NamespaceLabelKey = "kmm.node.k8s.io/contains-modules"
1313

14-
WorkerPodVersionLabelPrefix = "beta.kmm.node.kubernetes.io/version-worker-pod"
15-
DevicePluginVersionLabelPrefix = "beta.kmm.node.kubernetes.io/version-device-plugin"
16-
ModuleVersionLabelPrefix = "kmm.node.kubernetes.io/version-module"
14+
WorkerPodVersionLabelPrefix = "beta.kmm.node.kubernetes.io/version-worker-pod"
15+
SchedulePluginVersionLabelPrefix = "beta.kmm.node.kubernetes.io/version-schedule-plugin"
16+
ModuleVersionLabelPrefix = "kmm.node.kubernetes.io/version-module"
1717

1818
GCDelayFinalizer = "kmm.node.kubernetes.io/gc-delay"
1919
ModuleFinalizer = "kmm.node.kubernetes.io/module-finalizer"

internal/controllers/device_plugin_reconciler.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -500,7 +500,7 @@ func generateDevicePluginLabelsAndSelector(mod *kmmv1beta1.Module) (map[string]s
500500
}
501501

502502
if mod.Spec.ModuleLoader != nil && mod.Spec.ModuleLoader.Container.Version != "" {
503-
versionLabel := utils.GetDevicePluginVersionLabelName(mod.Namespace, mod.Name)
503+
versionLabel := utils.GetSchedulePluginVersionLabelName(mod.Namespace, mod.Name)
504504
labels[versionLabel] = mod.Spec.ModuleLoader.Container.Version
505505
nodeSelector[versionLabel] = mod.Spec.ModuleLoader.Container.Version
506506
} else if mod.Spec.ModuleLoader == nil {
@@ -536,7 +536,7 @@ func getExistingDSFromVersion(existingDS []appsv1.DaemonSet,
536536
version = moduleLoader.Container.Version
537537
}
538538

539-
versionLabel := utils.GetDevicePluginVersionLabelName(moduleNamespace, moduleName)
539+
versionLabel := utils.GetSchedulePluginVersionLabelName(moduleNamespace, moduleName)
540540
for _, ds := range existingDS {
541541
dsModuleVersion := ds.GetLabels()[versionLabel]
542542
if dsModuleVersion == version {
@@ -549,7 +549,7 @@ func getExistingDSFromVersion(existingDS []appsv1.DaemonSet,
549549
func isOlderVersionUnusedDevicePluginDaemonset(ds *appsv1.DaemonSet, moduleVersion string) bool {
550550
moduleName := ds.Labels[constants.ModuleNameLabel]
551551
moduleNamespace := ds.Namespace
552-
versionLabel := utils.GetDevicePluginVersionLabelName(moduleNamespace, moduleName)
552+
versionLabel := utils.GetSchedulePluginVersionLabelName(moduleNamespace, moduleName)
553553
return ds.Labels[versionLabel] != moduleVersion && ds.Status.DesiredNumberScheduled == 0
554554
}
555555

internal/controllers/device_plugin_reconciler_test.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -316,16 +316,16 @@ var _ = Describe("DevicePluginReconciler_garbageCollect", func() {
316316
},
317317
},
318318
}
319-
devicePluginVersionLabel := utils.GetDevicePluginVersionLabelName(mod.Namespace, mod.Name)
319+
schedulePluginVersionLabel := utils.GetSchedulePluginVersionLabelName(mod.Namespace, mod.Name)
320320

321321
DescribeTable("device-plugin GC", func(devicePluginFormerLabel bool, devicePluginFormerDesired int) {
322322
devicePluginDS := appsv1.DaemonSet{
323323
ObjectMeta: metav1.ObjectMeta{
324324
Name: "devicePlugin",
325325
Namespace: "namespace",
326326
Labels: map[string]string{
327-
devicePluginVersionLabel: currentModuleVersion,
328-
constants.ModuleNameLabel: mod.Name,
327+
schedulePluginVersionLabel: currentModuleVersion,
328+
constants.ModuleNameLabel: mod.Name,
329329
},
330330
},
331331
}
@@ -335,7 +335,7 @@ var _ = Describe("DevicePluginReconciler_garbageCollect", func() {
335335
if devicePluginFormerLabel {
336336
devicePluginFormerVersionDS = devicePluginDS.DeepCopy()
337337
devicePluginFormerVersionDS.SetName("devicePluginFormer")
338-
devicePluginFormerVersionDS.Labels[devicePluginVersionLabel] = "former label"
338+
devicePluginFormerVersionDS.Labels[schedulePluginVersionLabel] = "former label"
339339
devicePluginFormerVersionDS.Status.DesiredNumberScheduled = int32(devicePluginFormerDesired)
340340
existingDS = append(existingDS, *devicePluginFormerVersionDS)
341341
}
@@ -357,7 +357,7 @@ var _ = Describe("DevicePluginReconciler_garbageCollect", func() {
357357
ObjectMeta: metav1.ObjectMeta{
358358
Name: "devicePlugin",
359359
Namespace: "namespace",
360-
Labels: map[string]string{constants.ModuleNameLabel: mod.Name, devicePluginVersionLabel: "formerVersion"},
360+
Labels: map[string]string{constants.ModuleNameLabel: mod.Name, schedulePluginVersionLabel: "formerVersion"},
361361
},
362362
}
363363
clnt.EXPECT().Delete(context.Background(), &deleteDS).Return(fmt.Errorf("some error"))
@@ -375,7 +375,7 @@ var _ = Describe("DevicePluginReconciler_garbageCollect", func() {
375375
ObjectMeta: metav1.ObjectMeta{
376376
Name: "devicePlugin",
377377
Namespace: "namespace",
378-
Labels: map[string]string{constants.ModuleNameLabel: mod.Name, devicePluginVersionLabel: "formerVersion"},
378+
Labels: map[string]string{constants.ModuleNameLabel: mod.Name, schedulePluginVersionLabel: "formerVersion"},
379379
},
380380
}
381381

@@ -738,7 +738,7 @@ var _ = Describe("DevicePluginReconciler_setDevicePluginAsDesired", func() {
738738
err := dsc.setDevicePluginAsDesired(context.Background(), &ds, &mod)
739739

740740
Expect(err).NotTo(HaveOccurred())
741-
versionLabel := utils.GetDevicePluginVersionLabelName(namespace, moduleName)
741+
versionLabel := utils.GetSchedulePluginVersionLabelName(namespace, moduleName)
742742
Expect(ds.GetLabels()).Should(HaveKeyWithValue(versionLabel, "some version"))
743743
})
744744

@@ -968,7 +968,7 @@ var _ = Describe("DevicePluginReconciler_getExistingDSFromVersion", func() {
968968
)
969969

970970
devicePluginLabels := map[string]string{
971-
utils.GetDevicePluginVersionLabelName(moduleNamespace, moduleName): moduleVersion,
971+
utils.GetSchedulePluginVersionLabelName(moduleNamespace, moduleName): moduleVersion,
972972
}
973973

974974
ds := appsv1.DaemonSet{

internal/controllers/dra_reconciler.go

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,11 @@ func (r *DRAReconciler) Reconcile(ctx context.Context, mod *kmmv1beta1.Module) (
113113
return res, fmt.Errorf("could not handle DRA: %v", err)
114114
}
115115

116+
err = r.reconHelperAPI.garbageCollectDRADaemonSets(ctx, mod, existingDRADS)
117+
if err != nil {
118+
return res, fmt.Errorf("failed to run DRA garbage collection: %v", err)
119+
}
120+
116121
err = r.reconHelperAPI.handleDeviceClasses(ctx, mod, existingDCs)
117122
if err != nil {
118123
return res, fmt.Errorf("could not handle DeviceClasses: %v", err)
@@ -133,6 +138,7 @@ func (r *DRAReconciler) Reconcile(ctx context.Context, mod *kmmv1beta1.Module) (
133138
type draReconcilerHelperAPI interface {
134139
getModuleDRADaemonSets(ctx context.Context, name, namespace string) ([]appsv1.DaemonSet, error)
135140
handleDRA(ctx context.Context, mod *kmmv1beta1.Module, existingDRADS []appsv1.DaemonSet) error
141+
garbageCollectDRADaemonSets(ctx context.Context, mod *kmmv1beta1.Module, existingDS []appsv1.DaemonSet) error
136142
deleteDRAResources(ctx context.Context, moduleName, moduleNamespace string) error
137143
moduleUpdateDRAStatus(ctx context.Context, mod *kmmv1beta1.Module, existingDRADS []appsv1.DaemonSet) error
138144
clearDRAStatus(ctx context.Context, mod *kmmv1beta1.Module) error
@@ -181,11 +187,9 @@ func (drh *draReconcilerHelper) handleDRA(ctx context.Context, mod *kmmv1beta1.M
181187

182188
logger := log.FromContext(ctx)
183189

184-
var ds *appsv1.DaemonSet
185-
if len(existingDRADS) > 0 {
186-
ds = &existingDRADS[0]
187-
} else {
188-
logger.Info("creating new DRA DaemonSet")
190+
ds, version := getExistingDRADSFromVersion(existingDRADS, mod.Namespace, mod.Name, mod.Spec.ModuleLoader)
191+
if ds == nil {
192+
logger.Info("creating new DRA DaemonSet", "version", version)
189193
ds = &appsv1.DaemonSet{
190194
ObjectMeta: metav1.ObjectMeta{Namespace: mod.Namespace, GenerateName: mod.Name + "-dra-"},
191195
}
@@ -202,6 +206,51 @@ func (drh *draReconcilerHelper) handleDRA(ctx context.Context, mod *kmmv1beta1.M
202206
return err
203207
}
204208

209+
func (drh *draReconcilerHelper) garbageCollectDRADaemonSets(ctx context.Context, mod *kmmv1beta1.Module, existingDS []appsv1.DaemonSet) error {
210+
if mod.Spec.ModuleLoader == nil {
211+
return nil
212+
}
213+
214+
logger := log.FromContext(ctx)
215+
deleted := make([]string, 0)
216+
for _, ds := range existingDS {
217+
if isOlderVersionUnusedDRADaemonSet(&ds, mod.Namespace, mod.Spec.ModuleLoader.Container.Version) {
218+
deleted = append(deleted, ds.Name)
219+
if err := drh.client.Delete(ctx, &ds); err != nil {
220+
return fmt.Errorf("could not delete DRA DaemonSet %s: %v", ds.Name, err)
221+
}
222+
}
223+
}
224+
225+
logger.Info("garbage-collected DRA DaemonSets", "names", deleted)
226+
return nil
227+
}
228+
229+
func getExistingDRADSFromVersion(existingDS []appsv1.DaemonSet,
230+
moduleNamespace string,
231+
moduleName string,
232+
moduleLoader *kmmv1beta1.ModuleLoaderSpec) (*appsv1.DaemonSet, string) {
233+
version := ""
234+
if moduleLoader != nil {
235+
version = moduleLoader.Container.Version
236+
}
237+
238+
versionLabel := utils.GetSchedulePluginVersionLabelName(moduleNamespace, moduleName)
239+
for _, ds := range existingDS {
240+
dsModuleVersion := ds.GetLabels()[versionLabel]
241+
if dsModuleVersion == version {
242+
return &ds, version
243+
}
244+
}
245+
return nil, version
246+
}
247+
248+
func isOlderVersionUnusedDRADaemonSet(ds *appsv1.DaemonSet, moduleNamespace, moduleVersion string) bool {
249+
moduleName := ds.Labels[constants.ModuleNameLabel]
250+
versionLabel := utils.GetSchedulePluginVersionLabelName(moduleNamespace, moduleName)
251+
return ds.Labels[versionLabel] != moduleVersion && ds.Status.DesiredNumberScheduled == 0
252+
}
253+
205254
// deleteDRAResources deletes all DRA-owned DaemonSets and DeviceClasses using label-based bulk deletion.
206255
func (drh *draReconcilerHelper) deleteDRAResources(ctx context.Context, moduleName, moduleNamespace string) error {
207256
var errs []error
@@ -403,6 +452,12 @@ func (dsci *draDaemonSetCreatorImpl) setDRAAsDesired(
403452
utils.GetKernelModuleReadyNodeLabel(mod.Namespace, mod.Name): "",
404453
}
405454

455+
if mod.Spec.ModuleLoader != nil && mod.Spec.ModuleLoader.Container.Version != "" {
456+
versionLabel := utils.GetSchedulePluginVersionLabelName(mod.Namespace, mod.Name)
457+
standardLabels[versionLabel] = mod.Spec.ModuleLoader.Container.Version
458+
nodeSelector[versionLabel] = mod.Spec.ModuleLoader.Container.Version
459+
}
460+
406461
ds.SetLabels(
407462
overrideLabels(ds.GetLabels(), standardLabels),
408463
)

0 commit comments

Comments
 (0)