@@ -18,7 +18,6 @@ package main
1818
1919import (
2020 "context"
21- encode "encoding/json"
2221 "fmt"
2322 "path/filepath"
2423 "slices"
@@ -161,7 +160,7 @@ func (s *DeviceState) Prepare(ctx context.Context, claim *resourceapi.ResourceCl
161160 if err != nil {
162161 return nil , fmt .Errorf ("unable to sync from checkpoint: %v" , err )
163162 }
164- restoredDevices , err := s .restoreClaimFromCheckpoint (ctx , checkpoint , claim )
163+ restoredDevices , err := s .restoreClaimFromCheckpoint (checkpoint , claim )
165164 if err != nil {
166165 return nil , fmt .Errorf ("unable to restore from checkpoint: %v" , err )
167166 }
@@ -217,7 +216,42 @@ func (s *DeviceState) Unprepare(claimUID types.UID) error {
217216// prepareDevices performs one-time setup for the devices allocated to a
218217// ResourceClaim before being consumed by a Pod.
219218func (s * DeviceState ) prepareDevices (ctx context.Context , claim * resourceapi.ResourceClaim ) (PreparedDevices , error ) {
220- return s .computeDeviceConfig (ctx , claim )
219+ preparedDevices , err := s .computeDeviceConfig (claim )
220+ if err != nil {
221+ return nil , err
222+ }
223+
224+ // Publish per-device status (e.g. uuid, model, driverVersion) into
225+ // ResourceClaim.status.devices[].data when the profile implements
226+ // [profiles.DeviceStatusBuilder]. This is a side-effect on the API server
227+ // and therefore lives in prepareDevices (rather than computeDeviceConfig,
228+ // which must be deterministic and side-effect free).
229+ builder , ok := s .configHandler .(profiles.DeviceStatusBuilder )
230+ if ! ok {
231+ return preparedDevices , nil
232+ }
233+
234+ var deviceStatuses []resourceapi.AllocatedDeviceStatus
235+ for _ , result := range claim .Status .Allocation .Devices .Results {
236+ if result .Driver != s .driverName {
237+ continue
238+ }
239+ if status := builder .BuildDeviceStatus (s .allocatable , & result ); status != nil {
240+ deviceStatuses = append (deviceStatuses , * status )
241+ }
242+ }
243+ if len (deviceStatuses ) > 0 {
244+ klog .FromContext (ctx ).Info ("Publishing device status to ResourceClaim" ,
245+ "namespace" , claim .Namespace , "name" , claim .Name , "devices" , len (deviceStatuses ))
246+ if err := s .updateDeviceStatus (ctx , claim .Namespace , claim .Name , deviceStatuses ... ); err != nil {
247+ // A failure to publish status is non-fatal: the device is still
248+ // prepared and the claim status will simply be missing the data.
249+ klog .FromContext (ctx ).Error (err , "Failed to update device status on ResourceClaim" ,
250+ "namespace" , claim .Namespace , "name" , claim .Name )
251+ }
252+ }
253+
254+ return preparedDevices , nil
221255}
222256
223257// unprepareDevices undoes any side-effects produced by
@@ -232,7 +266,7 @@ func (s *DeviceState) unprepareDevices(claimUID types.UID, checkpoint *checkpoin
232266// should be deterministic and produce no side-effects. Non-deterministic data or
233267// side-effects should be produced by [DeviceState.prepareDevices] directly and
234268// recorded in the checkpoint by [DeviceState.addClaimToCheckpoint].
235- func (s * DeviceState ) computeDeviceConfig (ctx context. Context , claim * resourceapi.ResourceClaim ) (PreparedDevices , error ) {
269+ func (s * DeviceState ) computeDeviceConfig (claim * resourceapi.ResourceClaim ) (PreparedDevices , error ) {
236270 if claim .Status .Allocation == nil {
237271 return nil , fmt .Errorf ("claim not yet allocated" )
238272 }
@@ -254,9 +288,6 @@ func (s *DeviceState) computeDeviceConfig(ctx context.Context, claim *resourceap
254288 // the list with len(Requests) == 0 for the lookup below.
255289 configs = slices .Insert (configs , 0 , & OpaqueDeviceConfig {})
256290
257- // build device status
258- var devicesStatus []resourceapi.AllocatedDeviceStatus
259-
260291 // Look through the configs and figure out which one will be applied to
261292 // each device allocation result based on their order of precedence.
262293 configResultsMap := make (map [runtime.Object ][]* resourceapi.DeviceRequestAllocationResult )
@@ -269,11 +300,6 @@ func (s *DeviceState) computeDeviceConfig(ctx context.Context, claim *resourceap
269300 return nil , fmt .Errorf ("requested device is not allocatable: %v" , result .Device )
270301 }
271302
272- if s .gpuDeviceStatus {
273- deviceStatus := s .buildDeviceStatus (result )
274- devicesStatus = append (devicesStatus , deviceStatus )
275- }
276-
277303 for _ , c := range slices .Backward (configs ) {
278304 if len (c .Requests ) == 0 || slices .Contains (c .Requests , result .Request ) {
279305 configResultsMap [c .Config ] = append (configResultsMap [c .Config ], & result )
@@ -287,13 +313,6 @@ func (s *DeviceState) computeDeviceConfig(ctx context.Context, claim *resourceap
287313 // of device allocation results.
288314 perDeviceCDIContainerEdits := make (profiles.PerDeviceCDIContainerEdits )
289315 for config , results := range configResultsMap {
290- if s .gpuDeviceStatus {
291- klog .Infof ("Adding device attribute to claim %s/%s" , claim .Namespace , claim .Name )
292- if err := s .updateDeviceStatus (ctx , claim .Namespace , claim .Name , devicesStatus ... ); err != nil {
293- klog .Warningf ("Failed to update device attributes for claim %s/%s: %v" , claim .Namespace , claim .Name , err )
294- }
295- }
296-
297316 // Apply the config to the list of results associated with it.
298317 containerEdits , err := s .configHandler .ApplyConfig (config , results )
299318 if err != nil {
@@ -344,12 +363,12 @@ func (*DeviceState) removeClaimFromCheckpoint(checkpoint *checkpointapi.Checkpoi
344363
345364// restoreClaimFromCheckpoint returns the device definitions for devices already prepared
346365// for the given claim. If the claim has not yet been prepared, it returns nil.
347- func (s * DeviceState ) restoreClaimFromCheckpoint (ctx context. Context , checkpoint * checkpointapi.Checkpoint , claim * resourceapi.ResourceClaim ) (PreparedDevices , error ) {
366+ func (s * DeviceState ) restoreClaimFromCheckpoint (checkpoint * checkpointapi.Checkpoint , claim * resourceapi.ResourceClaim ) (PreparedDevices , error ) {
348367 if slices .ContainsFunc (checkpoint .PreparedClaims , func (c checkpointapi.PreparedClaim ) bool { return c .UID == claim .UID }) {
349368 // If [DeviceState.addClaimToCheckpoint] associated any other data with
350369 // the claim in the checkpoint, then that should be added to the
351370 // returned [PreparedDevices] here.
352- return s .computeDeviceConfig (ctx , claim )
371+ return s .computeDeviceConfig (claim )
353372 }
354373 return nil , nil
355374}
@@ -453,43 +472,6 @@ func GetOpaqueDeviceConfigs(
453472 return resultConfigs , nil
454473}
455474
456- func (s * DeviceState ) buildDeviceStatus (res resourceapi.DeviceRequestAllocationResult ) resourceapi.AllocatedDeviceStatus {
457- dn := res .Device
458- deviceInfo := make (map [string ]resourceapi.DeviceAttribute )
459-
460- if d , ok := s .allocatable [dn ]; ok {
461- if uuid , ok := d .Attributes ["uuid" ]; ok {
462- deviceInfo ["uuid" ] = uuid
463- }
464- if model , ok := d .Attributes ["model" ]; ok {
465- deviceInfo ["model" ] = model
466- }
467- if driverVersion , ok := d .Attributes ["driverVersion" ]; ok {
468- deviceInfo ["driverVersion" ] = driverVersion
469- }
470- }
471-
472- jsonBytes , err := encode .Marshal (deviceInfo )
473- if err != nil {
474- klog .Errorf ("Failed to marshal device data: %v" , err )
475- jsonBytes = []byte ("{}" )
476- }
477-
478- return resourceapi.AllocatedDeviceStatus {
479- Device : dn ,
480- Driver : res .Driver ,
481- Pool : res .Pool ,
482- // Data records per-allocation metadata used for monitoring and debugging:
483- // - Pod→GPU mapping: makes it easier to see which GPU a given pod is using,
484- // which is not readily available elsewhere.
485- // - Device attributes (e.g. UUID, model, driverVersion): remain available
486- // even if the device is later removed from a ResourceSlice (for example,
487- // because it becomes unhealthy), so past allocations can still be
488- // correlated with later health or scheduling issues.
489- Data : & runtime.RawExtension {Raw : jsonBytes },
490- }
491- }
492-
493475func (s * DeviceState ) updateDeviceStatus (ctx context.Context , ns , name string , devices ... resourceapi.AllocatedDeviceStatus ) error {
494476 // Converting wrapper to use latest API types,
495477 // converts to/from server-supported version.
0 commit comments