Skip to content

Commit 082a0f3

Browse files
committed
add device data in resourceclaim
Signed-off-by: Swati Gupta <swatig@nvidia.com>
1 parent 1c55f17 commit 082a0f3

3 files changed

Lines changed: 81 additions & 5 deletions

File tree

cmd/dra-example-kubeletplugin/driver.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,8 @@ func (d *driver) PrepareResourceClaims(ctx context.Context, claims []*resourceap
114114
return result, nil
115115
}
116116

117-
func (d *driver) prepareResourceClaim(_ context.Context, claim *resourceapi.ResourceClaim) kubeletplugin.PrepareResult {
118-
preparedPBs, err := d.state.Prepare(claim)
117+
func (d *driver) prepareResourceClaim(ctx context.Context, claim *resourceapi.ResourceClaim) kubeletplugin.PrepareResult {
118+
preparedPBs, err := d.state.Prepare(ctx, claim)
119119
if err != nil {
120120
return kubeletplugin.PrepareResult{
121121
Err: fmt.Errorf("error preparing devices for claim %v: %w", claim.UID, err),

cmd/dra-example-kubeletplugin/state.go

Lines changed: 76 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,18 @@
1717
package main
1818

1919
import (
20+
"context"
21+
"encoding/json"
2022
"fmt"
2123
"slices"
2224
"sync"
2325

2426
resourceapi "k8s.io/api/resource/v1"
27+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2528
"k8s.io/apimachinery/pkg/runtime"
29+
metav1apply "k8s.io/client-go/applyconfigurations/meta/v1"
30+
resourceapply "k8s.io/client-go/applyconfigurations/resource/v1"
31+
"k8s.io/klog/v2"
2632
drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
2733
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
2834

@@ -61,6 +67,7 @@ type DeviceState struct {
6167
cdi *CDIHandler
6268
allocatable AllocatableDevices
6369
checkpointManager checkpointmanager.CheckpointManager
70+
config *Config
6471
}
6572

6673
func NewDeviceState(config *Config) (*DeviceState, error) {
@@ -88,6 +95,7 @@ func NewDeviceState(config *Config) (*DeviceState, error) {
8895
cdi: cdi,
8996
allocatable: allocatable,
9097
checkpointManager: checkpointManager,
98+
config: config,
9199
}
92100

93101
checkpoints, err := state.checkpointManager.ListCheckpoints()
@@ -109,7 +117,7 @@ func NewDeviceState(config *Config) (*DeviceState, error) {
109117
return state, nil
110118
}
111119

112-
func (s *DeviceState) Prepare(claim *resourceapi.ResourceClaim) ([]*drapbv1.Device, error) {
120+
func (s *DeviceState) Prepare(ctx context.Context, claim *resourceapi.ResourceClaim) ([]*drapbv1.Device, error) {
113121
s.Lock()
114122
defer s.Unlock()
115123

@@ -125,7 +133,7 @@ func (s *DeviceState) Prepare(claim *resourceapi.ResourceClaim) ([]*drapbv1.Devi
125133
return preparedClaims[claimUID].GetDevices(), nil
126134
}
127135

128-
preparedDevices, err := s.prepareDevices(claim)
136+
preparedDevices, err := s.prepareDevices(ctx, claim)
129137
if err != nil {
130138
return nil, fmt.Errorf("prepare failed: %v", err)
131139
}
@@ -173,7 +181,7 @@ func (s *DeviceState) Unprepare(claimUID string) error {
173181
return nil
174182
}
175183

176-
func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (PreparedDevices, error) {
184+
func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.ResourceClaim) (PreparedDevices, error) {
177185
if claim.Status.Allocation == nil {
178186
return nil, fmt.Errorf("claim not yet allocated")
179187
}
@@ -196,13 +204,20 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (Prepared
196204
Config: configapi.DefaultGpuConfig(),
197205
})
198206

207+
// build device status
208+
var devicesStatus []*resourceapply.AllocatedDeviceStatusApplyConfiguration
209+
199210
// Look through the configs and figure out which one will be applied to
200211
// each device allocation result based on their order of precedence.
201212
configResultsMap := make(map[runtime.Object][]*resourceapi.DeviceRequestAllocationResult)
202213
for _, result := range claim.Status.Allocation.Devices.Results {
203214
if _, exists := s.allocatable[result.Device]; !exists {
204215
return nil, fmt.Errorf("requested GPU is not allocatable: %v", result.Device)
205216
}
217+
218+
deviceStatus := s.buildDeviceStatus(&result)
219+
devicesStatus = append(devicesStatus, deviceStatus)
220+
206221
for _, c := range slices.Backward(configs) {
207222
if len(c.Requests) == 0 || slices.Contains(c.Requests, result.Request) {
208223
configResultsMap[c.Config] = append(configResultsMap[c.Config], &result)
@@ -211,6 +226,11 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (Prepared
211226
}
212227
}
213228

229+
klog.Infof("Adding device attribute to claim %s/%s", claim.Namespace, claim.Name)
230+
if err := s.applyDeviceStatus(ctx, claim.Namespace, claim.Name, devicesStatus...); err != nil {
231+
klog.Warningf("Failed to update device attributes for claim %s/%s: %v", claim.Namespace, claim.Name, err)
232+
}
233+
214234
// Normalize, validate, and apply all configs associated with devices that
215235
// need to be prepared. Track container edits generated from applying the
216236
// config to the set of device allocation results.
@@ -380,3 +400,56 @@ func GetOpaqueDeviceConfigs(
380400

381401
return resultConfigs, nil
382402
}
403+
404+
func (s *DeviceState) buildDeviceStatus(res *resourceapi.DeviceRequestAllocationResult) *resourceapply.AllocatedDeviceStatusApplyConfiguration {
405+
dn := res.Device
406+
deviceInfo := make(map[string]interface{})
407+
408+
if d, ok := s.allocatable[dn]; ok {
409+
if d.Attributes != nil {
410+
attributes := d.Attributes
411+
412+
if uuid, ok := attributes["uuid"]; ok {
413+
deviceInfo["uuid"] = uuid
414+
}
415+
if model, ok := attributes["model"]; ok {
416+
deviceInfo["model"] = model
417+
}
418+
if driverVersion, ok := attributes["driverVersion"]; ok {
419+
deviceInfo["driverVersion"] = driverVersion
420+
}
421+
}
422+
}
423+
424+
jsonBytes, err := json.Marshal(deviceInfo)
425+
if err != nil {
426+
klog.Errorf("Failed to marshal device data: %v", err)
427+
jsonBytes = []byte("{}")
428+
}
429+
data := runtime.RawExtension{
430+
Raw: jsonBytes,
431+
}
432+
cond := metav1apply.Condition().
433+
WithType("Ready").
434+
WithStatus(metav1.ConditionTrue).
435+
WithReason("GPUDeviceReady").
436+
WithMessage("GPUDeviceAllocated").
437+
WithLastTransitionTime(metav1.Now())
438+
439+
return resourceapply.AllocatedDeviceStatus().
440+
WithDevice(dn).
441+
WithDriver(res.Driver).
442+
WithPool(res.Pool).
443+
WithConditions(cond).
444+
WithData(data)
445+
}
446+
447+
func (s *DeviceState) applyDeviceStatus(ctx context.Context, ns, name string, devices ...*resourceapply.AllocatedDeviceStatusApplyConfiguration) error {
448+
claim := resourceapply.ResourceClaim(name, ns).
449+
WithStatus(resourceapply.ResourceClaimStatus().WithDevices(devices...))
450+
451+
opts := metav1.ApplyOptions{FieldManager: consts.DriverName, Force: true}
452+
453+
_, err := s.config.coreclient.ResourceV1().ResourceClaims(ns).ApplyStatus(ctx, claim, opts)
454+
return err
455+
}

deployments/helm/dra-example-driver/templates/clusterrole.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ rules:
88
- apiGroups: ["resource.k8s.io"]
99
resources: ["resourceclaims"]
1010
verbs: ["get"]
11+
- apiGroups: ["resource.k8s.io"]
12+
resources: ["resourceclaims/status"]
13+
verbs: ["patch", "update"]
1114
- apiGroups: [""]
1215
resources: ["nodes"]
1316
verbs: ["get"]

0 commit comments

Comments
 (0)