1717package main
1818
1919import (
20+ "context"
21+ "encoding/json"
2022 "fmt"
2123 "slices"
2224 "sync"
2325
2426 resourceapi "k8s.io/api/resource/v1"
27+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2528 "k8s.io/apimachinery/pkg/runtime"
29+ resourceapply "k8s.io/client-go/applyconfigurations/resource/v1"
30+ "k8s.io/klog/v2"
2631 drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1"
2732 "k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
2833
@@ -61,6 +66,7 @@ type DeviceState struct {
6166 cdi * CDIHandler
6267 allocatable AllocatableDevices
6368 checkpointManager checkpointmanager.CheckpointManager
69+ config * Config
6470}
6571
6672func NewDeviceState (config * Config ) (* DeviceState , error ) {
@@ -88,6 +94,7 @@ func NewDeviceState(config *Config) (*DeviceState, error) {
8894 cdi : cdi ,
8995 allocatable : allocatable ,
9096 checkpointManager : checkpointManager ,
97+ config : config ,
9198 }
9299
93100 checkpoints , err := state .checkpointManager .ListCheckpoints ()
@@ -109,7 +116,7 @@ func NewDeviceState(config *Config) (*DeviceState, error) {
109116 return state , nil
110117}
111118
112- func (s * DeviceState ) Prepare (claim * resourceapi.ResourceClaim ) ([]* drapbv1.Device , error ) {
119+ func (s * DeviceState ) Prepare (ctx context. Context , claim * resourceapi.ResourceClaim ) ([]* drapbv1.Device , error ) {
113120 s .Lock ()
114121 defer s .Unlock ()
115122
@@ -125,7 +132,7 @@ func (s *DeviceState) Prepare(claim *resourceapi.ResourceClaim) ([]*drapbv1.Devi
125132 return preparedClaims [claimUID ].GetDevices (), nil
126133 }
127134
128- preparedDevices , err := s .prepareDevices (claim )
135+ preparedDevices , err := s .prepareDevices (ctx , claim )
129136 if err != nil {
130137 return nil , fmt .Errorf ("prepare failed: %v" , err )
131138 }
@@ -173,7 +180,7 @@ func (s *DeviceState) Unprepare(claimUID string) error {
173180 return nil
174181}
175182
176- func (s * DeviceState ) prepareDevices (claim * resourceapi.ResourceClaim ) (PreparedDevices , error ) {
183+ func (s * DeviceState ) prepareDevices (ctx context. Context , claim * resourceapi.ResourceClaim ) (PreparedDevices , error ) {
177184 if claim .Status .Allocation == nil {
178185 return nil , fmt .Errorf ("claim not yet allocated" )
179186 }
@@ -196,13 +203,20 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (Prepared
196203 Config : configapi .DefaultGpuConfig (),
197204 })
198205
206+ // build device status
207+ var devicesStatus []* resourceapply.AllocatedDeviceStatusApplyConfiguration
208+
199209 // Look through the configs and figure out which one will be applied to
200210 // each device allocation result based on their order of precedence.
201211 configResultsMap := make (map [runtime.Object ][]* resourceapi.DeviceRequestAllocationResult )
202212 for _ , result := range claim .Status .Allocation .Devices .Results {
203213 if _ , exists := s .allocatable [result .Device ]; ! exists {
204214 return nil , fmt .Errorf ("requested GPU is not allocatable: %v" , result .Device )
205215 }
216+
217+ deviceStatus := s .buildDeviceStatus (result )
218+ devicesStatus = append (devicesStatus , deviceStatus )
219+
206220 for _ , c := range slices .Backward (configs ) {
207221 if len (c .Requests ) == 0 || slices .Contains (c .Requests , result .Request ) {
208222 configResultsMap [c .Config ] = append (configResultsMap [c .Config ], & result )
@@ -211,6 +225,11 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (Prepared
211225 }
212226 }
213227
228+ klog .Infof ("Adding device attribute to claim %s/%s" , claim .Namespace , claim .Name )
229+ if err := s .applyDeviceStatus (ctx , claim .Namespace , claim .Name , devicesStatus ... ); err != nil {
230+ klog .Warningf ("Failed to update device attributes for claim %s/%s: %v" , claim .Namespace , claim .Name , err )
231+ }
232+
214233 // Normalize, validate, and apply all configs associated with devices that
215234 // need to be prepared. Track container edits generated from applying the
216235 // config to the set of device allocation results.
@@ -380,3 +399,52 @@ func GetOpaqueDeviceConfigs(
380399
381400 return resultConfigs , nil
382401}
402+
403+ func (s * DeviceState ) buildDeviceStatus (res resourceapi.DeviceRequestAllocationResult ) * resourceapply.AllocatedDeviceStatusApplyConfiguration {
404+ dn := res .Device
405+ deviceInfo := make (map [string ]resourceapi.DeviceAttribute )
406+
407+ if d , ok := s .allocatable [dn ]; ok {
408+ if uuid , ok := d .Attributes ["uuid" ]; ok {
409+ deviceInfo ["uuid" ] = uuid
410+ }
411+ if model , ok := d .Attributes ["model" ]; ok {
412+ deviceInfo ["model" ] = model
413+ }
414+ if driverVersion , ok := d .Attributes ["driverVersion" ]; ok {
415+ deviceInfo ["driverVersion" ] = driverVersion
416+ }
417+ }
418+
419+ jsonBytes , err := json .Marshal (deviceInfo )
420+ if err != nil {
421+ klog .Errorf ("Failed to marshal device data: %v" , err )
422+ jsonBytes = []byte ("{}" )
423+ }
424+ data := runtime.RawExtension {
425+ Raw : jsonBytes ,
426+ }
427+
428+ return resourceapply .AllocatedDeviceStatus ().
429+ WithDevice (dn ).
430+ WithDriver (res .Driver ).
431+ WithPool (res .Pool ).
432+ // WithData records per-allocation metadata used for monitoring and debugging:
433+ // - Pod→GPU mapping: makes it easier to see which GPU a given pod is using,
434+ // which is not readily available elsewhere.
435+ // - Device attributes (e.g. UUID, model, driverVersion): remain available
436+ // even if the device is later removed from a ResourceSlice (for example,
437+ // because it becomes unhealthy), so past allocations can still be
438+ // correlated with later health or scheduling issues.
439+ WithData (data )
440+ }
441+
442+ func (s * DeviceState ) applyDeviceStatus (ctx context.Context , ns , name string , devices ... * resourceapply.AllocatedDeviceStatusApplyConfiguration ) error {
443+ claim := resourceapply .ResourceClaim (name , ns ).
444+ WithStatus (resourceapply .ResourceClaimStatus ().WithDevices (devices ... ))
445+
446+ opts := metav1.ApplyOptions {FieldManager : consts .DriverName , Force : true }
447+
448+ _ , err := s .config .coreclient .ResourceV1 ().ResourceClaims (ns ).ApplyStatus (ctx , claim , opts )
449+ return err
450+ }
0 commit comments