-
Notifications
You must be signed in to change notification settings - Fork 110
add device data in resourceclaim #128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
4d87b8e
ef575ab
439dbd4
93d5b13
a4b4bc8
9f5e69d
f0b910d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,14 +17,21 @@ | |
| package main | ||
|
|
||
| import ( | ||
| "context" | ||
| encode "encoding/json" | ||
| "fmt" | ||
| "slices" | ||
| "sync" | ||
|
|
||
| resourceapi "k8s.io/api/resource/v1" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/apimachinery/pkg/runtime" | ||
| "k8s.io/apimachinery/pkg/runtime/serializer/json" | ||
| "k8s.io/client-go/util/retry" | ||
| draclient "k8s.io/dynamic-resource-allocation/client" | ||
| "k8s.io/dynamic-resource-allocation/resourceslice" | ||
|
|
||
| "k8s.io/klog/v2" | ||
| drapbv1 "k8s.io/kubelet/pkg/apis/dra/v1beta1" | ||
| "k8s.io/kubernetes/pkg/kubelet/checkpointmanager" | ||
|
|
||
|
|
@@ -48,6 +55,8 @@ type DeviceState struct { | |
| checkpointManager checkpointmanager.CheckpointManager | ||
| configDecoder runtime.Decoder | ||
| configHandler profiles.ConfigHandler | ||
| config *Config | ||
| gpuDeviceStatus bool | ||
| } | ||
|
|
||
| func NewDeviceState(config *Config) (*DeviceState, error) { | ||
|
|
@@ -103,6 +112,8 @@ func NewDeviceState(config *Config) (*DeviceState, error) { | |
| checkpointManager: checkpointManager, | ||
| configDecoder: decoder, | ||
| configHandler: configHandler, | ||
| config: config, | ||
| gpuDeviceStatus: config.flags.gpuDeviceStatus, | ||
| } | ||
|
|
||
| checkpoints, err := state.checkpointManager.ListCheckpoints() | ||
|
|
@@ -124,7 +135,7 @@ func NewDeviceState(config *Config) (*DeviceState, error) { | |
| return state, nil | ||
| } | ||
|
|
||
| func (s *DeviceState) Prepare(claim *resourceapi.ResourceClaim) ([]*drapbv1.Device, error) { | ||
| func (s *DeviceState) Prepare(ctx context.Context, claim *resourceapi.ResourceClaim) ([]*drapbv1.Device, error) { | ||
| s.Lock() | ||
| defer s.Unlock() | ||
|
|
||
|
|
@@ -139,7 +150,8 @@ func (s *DeviceState) Prepare(claim *resourceapi.ResourceClaim) ([]*drapbv1.Devi | |
| if preparedClaims[claimUID] != nil { | ||
| return preparedClaims[claimUID].GetDevices(), nil | ||
| } | ||
| preparedDevices, err := s.prepareDevices(claim) | ||
|
|
||
| preparedDevices, err := s.prepareDevices(ctx, claim) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("prepare failed: %v", err) | ||
| } | ||
|
|
@@ -190,7 +202,7 @@ func (s *DeviceState) Unprepare(claimUID string) error { | |
| return nil | ||
| } | ||
|
|
||
| func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (profiles.PreparedDevices, error) { | ||
| func (s *DeviceState) prepareDevices(ctx context.Context, claim *resourceapi.ResourceClaim) (profiles.PreparedDevices, error) { | ||
| if claim.Status.Allocation == nil { | ||
| return nil, fmt.Errorf("claim not yet allocated") | ||
| } | ||
|
|
@@ -212,6 +224,9 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (profiles | |
| // the list with len(Requests) == 0 for the lookup below. | ||
| configs = slices.Insert(configs, 0, &OpaqueDeviceConfig{}) | ||
|
|
||
| // build device status | ||
| var devicesStatus []resourceapi.AllocatedDeviceStatus | ||
|
|
||
| // Look through the configs and figure out which one will be applied to | ||
| // each device allocation result based on their order of precedence. | ||
| configResultsMap := make(map[runtime.Object][]*resourceapi.DeviceRequestAllocationResult) | ||
|
|
@@ -223,6 +238,12 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (profiles | |
| if _, exists := s.allocatable[result.Device]; !exists { | ||
| return nil, fmt.Errorf("requested device is not allocatable: %v", result.Device) | ||
| } | ||
|
|
||
| if s.gpuDeviceStatus { | ||
| deviceStatus := s.buildDeviceStatus(result) | ||
| devicesStatus = append(devicesStatus, deviceStatus) | ||
| } | ||
|
|
||
| for _, c := range slices.Backward(configs) { | ||
| if len(c.Requests) == 0 || slices.Contains(c.Requests, result.Request) { | ||
| configResultsMap[c.Config] = append(configResultsMap[c.Config], &result) | ||
|
|
@@ -236,6 +257,13 @@ func (s *DeviceState) prepareDevices(claim *resourceapi.ResourceClaim) (profiles | |
| // of device allocation results. | ||
| perDeviceCDIContainerEdits := make(profiles.PerDeviceCDIContainerEdits) | ||
| for config, results := range configResultsMap { | ||
| if s.gpuDeviceStatus { | ||
| klog.Infof("Adding device attribute to claim %s/%s", claim.Namespace, claim.Name) | ||
| if err := s.updateDeviceStatus(ctx, claim.Namespace, claim.Name, devicesStatus...); err != nil { | ||
| klog.Warningf("Failed to update device attributes for claim %s/%s: %v", claim.Namespace, claim.Name, err) | ||
| } | ||
| } | ||
|
|
||
| // Apply the config to the list of results associated with it. | ||
| containerEdits, err := s.configHandler.ApplyConfig(config, results) | ||
| if err != nil { | ||
|
|
@@ -351,3 +379,61 @@ func GetOpaqueDeviceConfigs( | |
|
|
||
| return resultConfigs, nil | ||
| } | ||
|
|
||
| func (s *DeviceState) buildDeviceStatus(res resourceapi.DeviceRequestAllocationResult) resourceapi.AllocatedDeviceStatus { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's move this logic into the shared helper instead — ideally we shouldn't have to implement this for every kind of device.
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Okay, looking into it. I need some time to understand the refactor; will be back at it. |
||
| dn := res.Device | ||
| deviceInfo := make(map[string]resourceapi.DeviceAttribute) | ||
|
|
||
| if d, ok := s.allocatable[dn]; ok { | ||
| if uuid, ok := d.Attributes["uuid"]; ok { | ||
| deviceInfo["uuid"] = uuid | ||
| } | ||
| if model, ok := d.Attributes["model"]; ok { | ||
| deviceInfo["model"] = model | ||
| } | ||
| if driverVersion, ok := d.Attributes["driverVersion"]; ok { | ||
| deviceInfo["driverVersion"] = driverVersion | ||
| } | ||
| } | ||
|
|
||
| jsonBytes, err := encode.Marshal(deviceInfo) | ||
| if err != nil { | ||
| klog.Errorf("Failed to marshal device data: %v", err) | ||
| jsonBytes = []byte("{}") | ||
| } | ||
|
|
||
| return resourceapi.AllocatedDeviceStatus{ | ||
| Device: dn, | ||
| Driver: res.Driver, | ||
| Pool: res.Pool, | ||
| // Data records per-allocation metadata used for monitoring and debugging: | ||
| // - Pod→GPU mapping: makes it easier to see which GPU a given pod is using, | ||
| // which is not readily available elsewhere. | ||
| // - Device attributes (e.g. UUID, model, driverVersion): remain available | ||
| // even if the device is later removed from a ResourceSlice (for example, | ||
| // because it becomes unhealthy), so past allocations can still be | ||
| // correlated with later health or scheduling issues. | ||
| Data: &runtime.RawExtension{Raw: jsonBytes}, | ||
| } | ||
| } | ||
|
|
||
| func (s *DeviceState) updateDeviceStatus(ctx context.Context, ns, name string, devices ...resourceapi.AllocatedDeviceStatus) error { | ||
| // Converting wrapper to use latest API types, | ||
| // converts to/from server-supported version. | ||
| c := draclient.New(s.config.coreclient) | ||
| rc := c.ResourceClaims(ns) | ||
|
|
||
| return retry.RetryOnConflict(retry.DefaultRetry, func() error { | ||
| claim, err := rc.Get(ctx, name, metav1.GetOptions{}) | ||
| if err != nil { | ||
| return err | ||
| } | ||
|
|
||
| // copy the object and update only status.devices | ||
| claim = claim.DeepCopy() | ||
| claim.Status.Devices = devices | ||
|
|
||
| _, err = rc.UpdateStatus(ctx, claim, metav1.UpdateOptions{}) | ||
| return err | ||
| }) | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Let's keep `NewDeviceState` as the place where we pull everything out of the `Config` that we need, vs. storing the whole `Config` on the `DeviceState`. `Config` contains user-facing config like command-line flags, so let's not plumb that too far down.