Skip to content

Commit 3be2ba5

Browse files
committed
move build device status to gpu profile
Signed-off-by: Swati Gupta <swatig@nvidia.com>

add model in test assertion

Signed-off-by: Swati Gupta <swatig@nvidia.com>
1 parent affa378 commit 3be2ba5

9 files changed

Lines changed: 260 additions & 76 deletions

File tree

cmd/dra-example-kubeletplugin/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ type Config struct {
6767

6868
var validProfiles = map[string]func(flags Flags) profiles.Profile{
6969
gpu.ProfileName: func(flags Flags) profiles.Profile {
70-
return gpu.NewProfile(flags.nodeName, flags.numDevices, flags.gpuPartitions)
70+
return gpu.NewProfile(flags.nodeName, flags.numDevices, flags.gpuPartitions, flags.gpuDeviceStatus)
7171
},
7272
}
7373

cmd/dra-example-kubeletplugin/state.go

Lines changed: 40 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ package main
1818

1919
import (
2020
"context"
21-
encode "encoding/json"
2221
"fmt"
2322
"path/filepath"
2423
"slices"
@@ -161,7 +160,7 @@ func (s *DeviceState) Prepare(ctx context.Context, claim *resourceapi.ResourceCl
161160
if err != nil {
162161
return nil, fmt.Errorf("unable to sync from checkpoint: %v", err)
163162
}
164-
restoredDevices, err := s.restoreClaimFromCheckpoint(ctx, checkpoint, claim)
163+
restoredDevices, err := s.restoreClaimFromCheckpoint(checkpoint, claim)
165164
if err != nil {
166165
return nil, fmt.Errorf("unable to restore from checkpoint: %v", err)
167166
}
@@ -217,7 +216,42 @@ func (s *DeviceState) Unprepare(claimUID types.UID) error {
217216
// prepareDevices performs one-time setup for the devices allocated to a
218217
// ResourceClaim before being consumed by a Pod.
219218
func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.ResourceClaim) (PreparedDevices, error) {
220-
return s.computeDeviceConfig(ctx, claim)
219+
preparedDevices, err := s.computeDeviceConfig(claim)
220+
if err != nil {
221+
return nil, err
222+
}
223+
224+
// Publish per-device status (e.g. uuid, model, driverVersion) into
225+
// ResourceClaim.status.devices[].data when the profile implements
226+
// [profiles.DeviceStatusBuilder]. This is a side-effect on the API server
227+
// and therefore lives in prepareDevices (rather than computeDeviceConfig,
228+
// which must be deterministic and side-effect free).
229+
builder, ok := s.configHandler.(profiles.DeviceStatusBuilder)
230+
if !ok {
231+
return preparedDevices, nil
232+
}
233+
234+
var deviceStatuses []resourceapi.AllocatedDeviceStatus
235+
for _, result := range claim.Status.Allocation.Devices.Results {
236+
if result.Driver != s.driverName {
237+
continue
238+
}
239+
if status := builder.BuildDeviceStatus(s.allocatable, &result); status != nil {
240+
deviceStatuses = append(deviceStatuses, *status)
241+
}
242+
}
243+
if len(deviceStatuses) > 0 {
244+
klog.FromContext(ctx).Info("Publishing device status to ResourceClaim",
245+
"namespace", claim.Namespace, "name", claim.Name, "devices", len(deviceStatuses))
246+
if err := s.updateDeviceStatus(ctx, claim.Namespace, claim.Name, deviceStatuses...); err != nil {
247+
// A failure to publish status is non-fatal: the device is still
248+
// prepared and the claim status will simply be missing the data.
249+
klog.FromContext(ctx).Error(err, "Failed to update device status on ResourceClaim",
250+
"namespace", claim.Namespace, "name", claim.Name)
251+
}
252+
}
253+
254+
return preparedDevices, nil
221255
}
222256

223257
// unprepareDevices undoes any side-effects produced by
@@ -232,7 +266,7 @@ func (s *DeviceState) unprepareDevices(claimUID types.UID, checkpoint *checkpoin
232266
// should be deterministic and produce no side-effects. Non-deterministic data or
233267
// side-effects should be produced by [DeviceState.prepareDevices] directly and
234268
// recorded in the checkpoint by [DeviceState.addClaimToCheckpoint].
235-
func (s *DeviceState) computeDeviceConfig(ctx context.Context, claim *resourceapi.ResourceClaim) (PreparedDevices, error) {
269+
func (s *DeviceState) computeDeviceConfig(claim *resourceapi.ResourceClaim) (PreparedDevices, error) {
236270
if claim.Status.Allocation == nil {
237271
return nil, fmt.Errorf("claim not yet allocated")
238272
}
@@ -254,9 +288,6 @@ func (s *DeviceState) computeDeviceConfig(ctx context.Context, claim *resourceap
254288
// the list with len(Requests) == 0 for the lookup below.
255289
configs = slices.Insert(configs, 0, &OpaqueDeviceConfig{})
256290

257-
// build device status
258-
var devicesStatus []resourceapi.AllocatedDeviceStatus
259-
260291
// Look through the configs and figure out which one will be applied to
261292
// each device allocation result based on their order of precedence.
262293
configResultsMap := make(map[runtime.Object][]*resourceapi.DeviceRequestAllocationResult)
@@ -269,11 +300,6 @@ func (s *DeviceState) computeDeviceConfig(ctx context.Context, claim *resourceap
269300
return nil, fmt.Errorf("requested device is not allocatable: %v", result.Device)
270301
}
271302

272-
if s.gpuDeviceStatus {
273-
deviceStatus := s.buildDeviceStatus(result)
274-
devicesStatus = append(devicesStatus, deviceStatus)
275-
}
276-
277303
for _, c := range slices.Backward(configs) {
278304
if len(c.Requests) == 0 || slices.Contains(c.Requests, result.Request) {
279305
configResultsMap[c.Config] = append(configResultsMap[c.Config], &result)
@@ -287,13 +313,6 @@ func (s *DeviceState) computeDeviceConfig(ctx context.Context, claim *resourceap
287313
// of device allocation results.
288314
perDeviceCDIContainerEdits := make(profiles.PerDeviceCDIContainerEdits)
289315
for config, results := range configResultsMap {
290-
if s.gpuDeviceStatus {
291-
klog.Infof("Adding device attribute to claim %s/%s", claim.Namespace, claim.Name)
292-
if err := s.updateDeviceStatus(ctx, claim.Namespace, claim.Name, devicesStatus...); err != nil {
293-
klog.Warningf("Failed to update device attributes for claim %s/%s: %v", claim.Namespace, claim.Name, err)
294-
}
295-
}
296-
297316
// Apply the config to the list of results associated with it.
298317
containerEdits, err := s.configHandler.ApplyConfig(config, results)
299318
if err != nil {
@@ -344,12 +363,12 @@ func (*DeviceState) removeClaimFromCheckpoint(checkpoint *checkpointapi.Checkpoi
344363

345364
// restoreClaimFromCheckpoint returns the device definitions for devices already prepared
346365
// for the given claim. If the claim has not yet been prepared, it returns nil.
347-
func (s *DeviceState) restoreClaimFromCheckpoint(ctx context.Context, checkpoint *checkpointapi.Checkpoint, claim *resourceapi.ResourceClaim) (PreparedDevices, error) {
366+
func (s *DeviceState) restoreClaimFromCheckpoint(checkpoint *checkpointapi.Checkpoint, claim *resourceapi.ResourceClaim) (PreparedDevices, error) {
348367
if slices.ContainsFunc(checkpoint.PreparedClaims, func(c checkpointapi.PreparedClaim) bool { return c.UID == claim.UID }) {
349368
// If [DeviceState.addClaimToCheckpoint] associated any other data with
350369
// the claim in the checkpoint, then that should be added to the
351370
// returned [PreparedDevices] here.
352-
return s.computeDeviceConfig(ctx, claim)
371+
return s.computeDeviceConfig(claim)
353372
}
354373
return nil, nil
355374
}
@@ -453,43 +472,6 @@ func GetOpaqueDeviceConfigs(
453472
return resultConfigs, nil
454473
}
455474

456-
func (s *DeviceState) buildDeviceStatus(res resourceapi.DeviceRequestAllocationResult) resourceapi.AllocatedDeviceStatus {
457-
dn := res.Device
458-
deviceInfo := make(map[string]resourceapi.DeviceAttribute)
459-
460-
if d, ok := s.allocatable[dn]; ok {
461-
if uuid, ok := d.Attributes["uuid"]; ok {
462-
deviceInfo["uuid"] = uuid
463-
}
464-
if model, ok := d.Attributes["model"]; ok {
465-
deviceInfo["model"] = model
466-
}
467-
if driverVersion, ok := d.Attributes["driverVersion"]; ok {
468-
deviceInfo["driverVersion"] = driverVersion
469-
}
470-
}
471-
472-
jsonBytes, err := encode.Marshal(deviceInfo)
473-
if err != nil {
474-
klog.Errorf("Failed to marshal device data: %v", err)
475-
jsonBytes = []byte("{}")
476-
}
477-
478-
return resourceapi.AllocatedDeviceStatus{
479-
Device: dn,
480-
Driver: res.Driver,
481-
Pool: res.Pool,
482-
// Data records per-allocation metadata used for monitoring and debugging:
483-
// - Pod→GPU mapping: makes it easier to see which GPU a given pod is using,
484-
// which is not readily available elsewhere.
485-
// - Device attributes (e.g. UUID, model, driverVersion): remain available
486-
// even if the device is later removed from a ResourceSlice (for example,
487-
// because it becomes unhealthy), so past allocations can still be
488-
// correlated with later health or scheduling issues.
489-
Data: &runtime.RawExtension{Raw: jsonBytes},
490-
}
491-
}
492-
493475
func (s *DeviceState) updateDeviceStatus(ctx context.Context, ns, name string, devices ...resourceapi.AllocatedDeviceStatus) error {
494476
// Converting wrapper to use latest API types,
495477
// converts to/from server-supported version.

deployments/helm/dra-example-driver/templates/clusterrole.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,16 @@ rules:
1111
- apiGroups: ["resource.k8s.io"]
1212
resources: ["resourceclaims/status"]
1313
verbs: ["update"]
14+
# Kubernetes 1.36+ enforces granular authorization for ResourceClaim status
15+
# writes via the DRAResourceClaimGranularStatusAuthorization feature gate. As a
16+
# node-local driver, we need the "associated-node" verbs on
17+
# "resourceclaims/driver" for our own driver name. This rule is inert on older
18+
# clusters that have not yet enabled the feature gate.
19+
# See https://github.com/kubernetes/kubernetes/issues/138149
20+
- apiGroups: ["resource.k8s.io"]
21+
resources: ["resourceclaims/driver"]
22+
verbs: ["associated-node:update", "associated-node:patch"]
23+
resourceNames: ["{{ include "dra-example-driver.driverName" . }}"]
1424
- apiGroups: [""]
1525
resources: ["nodes"]
1626
verbs: ["get"]

internal/profiles/gpu/gpu.go

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package gpu
1818

1919
import (
20+
"encoding/json"
2021
"fmt"
2122
"maps"
2223
"math/rand"
@@ -27,6 +28,7 @@ import (
2728
"k8s.io/apimachinery/pkg/api/resource"
2829
"k8s.io/apimachinery/pkg/runtime"
2930
"k8s.io/dynamic-resource-allocation/resourceslice"
31+
"k8s.io/klog/v2"
3032
"k8s.io/utils/ptr"
3133
cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"
3234
cdispec "tags.cncf.io/container-device-interface/specs-go"
@@ -38,16 +40,18 @@ import (
3840
const ProfileName = "gpu"
3941

4042
type Profile struct {
41-
nodeName string
42-
numGPUs int
43-
partitionsPerGPU int
43+
nodeName string
44+
numGPUs int
45+
partitionsPerGPU int
46+
enableDeviceStatus bool
4447
}
4548

46-
func NewProfile(nodeName string, numGPUs int, partitionsPerGPU int) Profile {
49+
func NewProfile(nodeName string, numGPUs int, partitionsPerGPU int, enableDeviceStatus bool) Profile {
4750
return Profile{
48-
nodeName: nodeName,
49-
numGPUs: numGPUs,
50-
partitionsPerGPU: partitionsPerGPU,
51+
nodeName: nodeName,
52+
numGPUs: numGPUs,
53+
partitionsPerGPU: partitionsPerGPU,
54+
enableDeviceStatus: enableDeviceStatus,
5155
}
5256
}
5357

@@ -290,3 +294,42 @@ func applyGpuConfig(config *configapi.GpuConfig, results []*resourceapi.DeviceRe
290294

291295
return perDeviceEdits, nil
292296
}
297+
298+
// BuildDeviceStatus implements [profiles.DeviceStatusBuilder]. It returns a
299+
// [resourceapi.AllocatedDeviceStatus] populated with a subset of the device's
300+
// attributes (uuid, model, driverVersion) to publish into
301+
// ResourceClaim.status.devices[].data.
302+
func (p Profile) BuildDeviceStatus(allocatable map[string]resourceapi.Device, result *resourceapi.DeviceRequestAllocationResult) *resourceapi.AllocatedDeviceStatus {
303+
if !p.enableDeviceStatus {
304+
return nil
305+
}
306+
307+
deviceInfo := make(map[string]resourceapi.DeviceAttribute)
308+
if d, ok := allocatable[result.Device]; ok {
309+
for _, name := range []resourceapi.QualifiedName{"uuid", "model", "driverVersion"} {
310+
if v, ok := d.Attributes[name]; ok {
311+
deviceInfo[string(name)] = v
312+
}
313+
}
314+
}
315+
316+
jsonBytes, err := json.Marshal(deviceInfo)
317+
if err != nil {
318+
klog.Errorf("Failed to marshal device data for %s: %v", result.Device, err)
319+
jsonBytes = []byte("{}")
320+
}
321+
322+
// Data records per-allocation metadata used for monitoring and debugging:
323+
// - Pod->GPU mapping: makes it easier to see which GPU a given pod is
324+
// using, which is not readily available elsewhere.
325+
// - Device attributes (e.g. UUID, model, driverVersion): remain available
326+
// even if the device is later removed from a ResourceSlice (for
327+
// example, because it becomes unhealthy), so past allocations can
328+
// still be correlated with later health or scheduling issues.
329+
return &resourceapi.AllocatedDeviceStatus{
330+
Device: result.Device,
331+
Driver: result.Driver,
332+
Pool: result.Pool,
333+
Data: &runtime.RawExtension{Raw: jsonBytes},
334+
}
335+
}

0 commit comments

Comments
 (0)