Skip to content

Commit ff498ea

Browse files
committed
add device data in resourceclaim
Signed-off-by: Swati Gupta <swatig@nvidia.com> address review comment: fix pointer ref Signed-off-by: Swati Gupta <swatig@nvidia.com>
1 parent 1c55f17 commit ff498ea

3 files changed

Lines changed: 76 additions & 5 deletions

File tree

cmd/dra-example-kubeletplugin/driver.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ func (d *driver) PrepareResourceClaims(ctx context.Context, claims []*resourceap
114114
return result, nil
115115
}
116116

117-
func (d *driver) prepareResourceClaim(_ context.Context, claim *resourceapi.ResourceClaim) kubeletplugin.PrepareResult {
118-
preparedPBs, err := d.state.Prepare(claim)
117+
func (d *driver) prepareResourceClaim(ctx context.Context, claim *resourceapi.ResourceClaim) kubeletplugin.PrepareResult {
118+
preparedPBs, err := d.state.Prepare(ctx, claim)
119119
if err != nil {
120120
return kubeletplugin.PrepareResult{
121121
Err: fmt.Errorf("error preparing devices for claim %v: %w", claim.UID, err),

cmd/dra-example-kubeletplugin/state.go

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,17 @@
1717
package main
1818

1919
import (
20+
"context"
21+
"encoding/json"
2022
"fmt"
2123
"slices"
2224
"sync"
2325

2426
resourceapi "k8s.io/api/resource/v1"
27+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2528
"k8s.io/apimachinery/pkg/runtime"
29+
resourceapply "k8s.io/client-go/applyconfigurations/resource/v1"
30+
"k8s.io/klog/v2"
2631
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
2732
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
2833

@@ -61,6 +66,7 @@ type DeviceState struct {
6166
cdi *CDIHandler
6267
allocatable AllocatableDevices
6368
checkpointManager checkpointmanager.CheckpointManager
69+
config *Config
6470
}
6571

6672
func NewDeviceState(config *Config) (*DeviceState, error) {
@@ -88,6 +94,7 @@ func NewDeviceState(config *Config) (*DeviceState, error) {
8894
cdi: cdi,
8995
allocatable: allocatable,
9096
checkpointManager: checkpointManager,
97+
config: config,
9198
}
9299

93100
checkpoints, err := state.checkpointManager.ListCheckpoints()
@@ -109,7 +116,7 @@ func NewDeviceState(config *Config) (*DeviceState, error) {
109116
return state, nil
110117
}
111118

112-
func (s *DeviceState) Prepare(claim *resourceapi.ResourceClaim) ([]*drapbv1.Device, error) {
119+
func (s *DeviceState) Prepare(ctx context.Context, claim *resourceapi.ResourceClaim) ([]*drapbv1.Device, error) {
113120
s.Lock()
114121
defer s.Unlock()
115122

@@ -125,7 +132,7 @@ func (s *DeviceState) Prepare(claim *resourceapi.ResourceClaim) ([]*drapbv1.Devi
125132
return preparedClaims[claimUID].GetDevices(), nil
126133
}
127134

128-
preparedDevices, err := s.prepareDevices(claim)
135+
preparedDevices, err := s.prepareDevices(ctx, claim)
129136
if err != nil {
130137
return nil, fmt.Errorf("prepare failed: %v", err)
131138
}
@@ -173,7 +180,7 @@ func (s *DeviceState) Unprepare(claimUID string) error {
173180
return nil
174181
}
175182

176-
func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (PreparedDevices, error) {
183+
func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.ResourceClaim) (PreparedDevices, error) {
177184
if claim.Status.Allocation == nil {
178185
return nil, fmt.Errorf("claim not yet allocated")
179186
}
@@ -196,13 +203,20 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (Prepared
196203
Config: configapi.DefaultGpuConfig(),
197204
})
198205

206+
// build device status
207+
var devicesStatus []*resourceapply.AllocatedDeviceStatusApplyConfiguration
208+
199209
// Look through the configs and figure out which one will be applied to
200210
// each device allocation result based on their order of precedence.
201211
configResultsMap := make(map[runtime.Object][]*resourceapi.DeviceRequestAllocationResult)
202212
for _, result := range claim.Status.Allocation.Devices.Results {
203213
if _, exists := s.allocatable[result.Device]; !exists {
204214
return nil, fmt.Errorf("requested GPU is not allocatable: %v", result.Device)
205215
}
216+
217+
deviceStatus := s.buildDeviceStatus(result)
218+
devicesStatus = append(devicesStatus, deviceStatus)
219+
206220
for _, c := range slices.Backward(configs) {
207221
if len(c.Requests) == 0 || slices.Contains(c.Requests, result.Request) {
208222
configResultsMap[c.Config] = append(configResultsMap[c.Config], &result)
@@ -211,6 +225,11 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (Prepared
211225
}
212226
}
213227

228+
klog.Infof("Adding device attribute to claim %s/%s", claim.Namespace, claim.Name)
229+
if err := s.applyDeviceStatus(ctx, claim.Namespace, claim.Name, devicesStatus...); err != nil {
230+
klog.Warningf("Failed to update device attributes for claim %s/%s: %v", claim.Namespace, claim.Name, err)
231+
}
232+
214233
// Normalize, validate, and apply all configs associated with devices that
215234
// need to be prepared. Track container edits generated from applying the
216235
// config to the set of device allocation results.
@@ -380,3 +399,52 @@ func GetOpaqueDeviceConfigs(
380399

381400
return resultConfigs, nil
382401
}
402+
403+
func (s *DeviceState) buildDeviceStatus(res resourceapi.DeviceRequestAllocationResult) *resourceapply.AllocatedDeviceStatusApplyConfiguration {
404+
dn := res.Device
405+
deviceInfo := make(map[string]resourceapi.DeviceAttribute)
406+
407+
if d, ok := s.allocatable[dn]; ok {
408+
if uuid, ok := d.Attributes["uuid"]; ok {
409+
deviceInfo["uuid"] = uuid
410+
}
411+
if model, ok := d.Attributes["model"]; ok {
412+
deviceInfo["model"] = model
413+
}
414+
if driverVersion, ok := d.Attributes["driverVersion"]; ok {
415+
deviceInfo["driverVersion"] = driverVersion
416+
}
417+
}
418+
419+
jsonBytes, err := json.Marshal(deviceInfo)
420+
if err != nil {
421+
klog.Errorf("Failed to marshal device data: %v", err)
422+
jsonBytes = []byte("{}")
423+
}
424+
data := runtime.RawExtension{
425+
Raw: jsonBytes,
426+
}
427+
428+
return resourceapply.AllocatedDeviceStatus().
429+
WithDevice(dn).
430+
WithDriver(res.Driver).
431+
WithPool(res.Pool).
432+
// WithData records per-allocation metadata used for monitoring and debugging:
433+
// - Pod→GPU mapping: makes it easier to see which GPU a given pod is using,
434+
// which is not readily available elsewhere.
435+
// - Device attributes (e.g. UUID, model, driverVersion): remain available
436+
// even if the device is later removed from a ResourceSlice (for example,
437+
// because it becomes unhealthy), so past allocations can still be
438+
// correlated with later health or scheduling issues.
439+
WithData(data)
440+
}
441+
442+
func (s *DeviceState) applyDeviceStatus(ctx context.Context, ns, name string, devices ...*resourceapply.AllocatedDeviceStatusApplyConfiguration) error {
443+
claim := resourceapply.ResourceClaim(name, ns).
444+
WithStatus(resourceapply.ResourceClaimStatus().WithDevices(devices...))
445+
446+
opts := metav1.ApplyOptions{FieldManager: consts.DriverName, Force: true}
447+
448+
_, err := s.config.coreclient.ResourceV1().ResourceClaims(ns).ApplyStatus(ctx, claim, opts)
449+
return err
450+
}

deployments/helm/dra-example-driver/templates/clusterrole.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ rules:
88
- apiGroups: ["resource.k8s.io"]
99
resources: ["resourceclaims"]
1010
verbs: ["get"]
11+
- apiGroups: ["resource.k8s.io"]
12+
resources: ["resourceclaims/status"]
13+
verbs: ["patch", "update"]
1114
- apiGroups: [""]
1215
resources: ["nodes"]
1316
verbs: ["get"]

0 commit comments

Comments
 (0)