Patch: account for init containers when decoding per-container device
annotations (DecodePodDevices, GetContainerPriorities) and when mapping the
decoded devices onto pod.Spec.Containers (fetchContainerInfo).

Review notes:
- Leading tabs and gofmt column alignment are reconstructed by convention; if
  context whitespace differs from the target tree, apply with
  `git apply --ignore-whitespace`.
- The post-image hashes on the "index" lines predate the doc comments added
  to util.go and util_test.go and are informational only.

diff --git a/server/internal/data/pod.go b/server/internal/data/pod.go
index 492e26a..e17857e 100644
--- a/server/internal/data/pod.go
+++ b/server/internal/data/pod.go
@@ -132,7 +132,13 @@ func (r *podRepo) fetchContainerInfo(pod *corev1.Pod) []*biz.Container {
 		}
 	}
 
+	initContainerOffset := len(pod.Spec.InitContainers)
 	for i, ctr := range pod.Spec.Containers {
+		deviceIdx := initContainerOffset + i
+		var containerDevices biz.ContainerDevices
+		if deviceIdx < len(bizContainerDevices) {
+			containerDevices = bizContainerDevices[deviceIdx]
+		}
 		c := &biz.Container{
 			Name:             ctr.Name,
 			UUID:             ctrIdMaps[ctr.Name],
@@ -145,10 +151,10 @@ func (r *podRepo) fetchContainerInfo(pod *corev1.Pod) []*biz.Container {
 			NodeUID:          r.GetNodeUUID(pod),
 			Namespace:        pod.Namespace,
 			CreateTime:       r.GetCreateTime(pod),
-			ContainerDevices: bizContainerDevices[i],
+			ContainerDevices: containerDevices,
 		}
-		if len(bizContainerDevices[i]) > 0 {
-			c.Priority = bizContainerDevices[i][0].Priority
+		if len(containerDevices) > 0 {
+			c.Priority = containerDevices[0].Priority
 		}
 		containers = append(containers, c)
 	}
diff --git a/server/internal/provider/util/util.go b/server/internal/provider/util/util.go
index f4f2846..f32e3bd 100644
--- a/server/internal/provider/util/util.go
+++ b/server/internal/provider/util/util.go
@@ -281,23 +281,39 @@ func DecodeMetaxContainerDevices(str string) (ContainerDevices, error) {
 	return contdev, nil
 }
 
-func GetContainerPriorities(pod *corev1.Pod) []string {
-	var priorities []string
-
+// getContainerPriority returns the container's NVIDIAPriority resource quantity
+// as a string, preferring Limits over Requests, or "" when neither sets it.
+func getContainerPriority(ctr corev1.Container) string {
 	nvidiaPriority := corev1.ResourceName(NVIDIAPriority)
-	for _, ctr := range pod.Spec.Containers {
-		priority := ""
-		if limitPriority, ok := ctr.Resources.Limits[nvidiaPriority]; ok {
-			priority = limitPriority.String()
-		} else if requestPriority, ok := ctr.Resources.Requests[nvidiaPriority]; ok {
-			priority = requestPriority.String()
-		}
-		priorities = append(priorities, priority)
+	if limitPriority, ok := ctr.Resources.Limits[nvidiaPriority]; ok {
+		return limitPriority.String()
+	}
+	if requestPriority, ok := ctr.Resources.Requests[nvidiaPriority]; ok {
+		return requestPriority.String()
 	}
+	return ""
+}
 
+// GetContainerPriorities returns one priority string per container in the pod
+// spec: init containers first, then regular containers, each in declaration
+// order. Its length therefore equals podContainerCount(pod).
+func GetContainerPriorities(pod *corev1.Pod) []string {
+	priorities := make([]string, 0, podContainerCount(pod))
+	for _, ctr := range pod.Spec.InitContainers {
+		priorities = append(priorities, getContainerPriority(ctr))
+	}
+	for _, ctr := range pod.Spec.Containers {
+		priorities = append(priorities, getContainerPriority(ctr))
+	}
 	return priorities
 }
 
+// podContainerCount returns the number of init containers plus regular
+// containers declared in the pod spec.
+func podContainerCount(pod *corev1.Pod) int {
+	return len(pod.Spec.InitContainers) + len(pod.Spec.Containers)
+}
+
 func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
 	checklist := SupportDevices
 
@@ -339,7 +355,7 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
 			pd[devType] = append(pd[devType], cd)
 		case NvidiaGPUDevice:
 			for i, s := range strings.Split(str, OnePodMultiContainerSplitSymbol) {
-				if i >= len(pod.Spec.Containers) {
+				if i >= podContainerCount(pod) {
 					break
 				}
 				if s == "" {
@@ -350,14 +366,11 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
 				if err != nil {
 					return PodDevices{}, nil
 				}
-				if len(cd) == 0 {
-					continue
-				}
 				pd[devType] = append(pd[devType], cd)
 			}
 		case HygonGPUDevice:
 			for i, s := range strings.Split(str, OnePodMultiContainerSplitSymbol) {
-				if i >= len(pod.Spec.Containers) {
+				if i >= podContainerCount(pod) {
 					break
 				}
 				if s == "" {
@@ -368,14 +381,11 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
 				if err != nil {
 					return PodDevices{}, nil
 				}
-				if len(cd) == 0 {
-					continue
-				}
 				pd[devType] = append(pd[devType], cd)
 			}
 		case MetaxGPUDevice, MetaxSGPUDevice:
 			for i, s := range strings.Split(str, OnePodMultiContainerSplitSymbol) {
-				if i >= len(pod.Spec.Containers) {
+				if i >= podContainerCount(pod) {
 					break
 				}
 				if s == "" {
@@ -386,9 +396,6 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
 				if err != nil {
 					return PodDevices{}, nil
 				}
-				if len(cd) == 0 {
-					continue
-				}
 				pd[devType] = append(pd[devType], cd)
 			}
 		}
@@ -416,4 +423,4 @@ func MapNewDeviceInfoToDeviceInfo(newDeviceInfo *NewDeviceInfo) *DeviceInfo {
 		Mode:   newDeviceInfo.Mode,
 		Health: newDeviceInfo.Health,
 	}
-}
\ No newline at end of file
+}
diff --git a/server/internal/provider/util/util_test.go b/server/internal/provider/util/util_test.go
index fa5e826..4d97e3d 100644
--- a/server/internal/provider/util/util_test.go
+++ b/server/internal/provider/util/util_test.go
@@ -16,8 +16,12 @@ limitations under the License.
 package util
 
 import (
-	"github.com/go-kratos/kratos/v2/log"
 	"testing"
+
+	"github.com/go-kratos/kratos/v2/log"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
 var inRequestDevices map[string]string
@@ -345,3 +349,57 @@ func Test_DecodePodDevices(t *testing.T) {
 		})
 	}
 }
+
+// TestDecodePodDevicesWithInitContainers checks that an init container keeps an
+// empty device slot so the main container's devices are decoded at index 1.
+func TestDecodePodDevicesWithInitContainers(t *testing.T) {
+	SupportDevices["NVIDIA"] = "hami.io/vgpu-devices-allocated"
+	logger := log.NewHelper(log.DefaultLogger)
+
+	pod := &corev1.Pod{
+		Spec: corev1.PodSpec{
+			InitContainers: []corev1.Container{
+				{Name: "init"},
+			},
+			Containers: []corev1.Container{
+				{
+					Name: "main",
+					Resources: corev1.ResourceRequirements{
+						Limits: corev1.ResourceList{
+							corev1.ResourceName(NVIDIAPriority): resource.MustParse("1"),
+						},
+					},
+				},
+			},
+		},
+		ObjectMeta: metav1.ObjectMeta{
+			Annotations: map[string]string{
+				SupportDevices["NVIDIA"]: ";GPU-962d9630-a4ef-dc16-a50d-b2effb90239d,NVIDIA,6144,30:;",
+			},
+		},
+	}
+
+	got, err := DecodePodDevices(pod, logger)
+	if err != nil {
+		t.Fatalf("DecodePodDevices() error = %v", err)
+	}
+	nvidiaDevices, ok := got["NVIDIA"]
+	if !ok {
+		t.Fatal("expected NVIDIA devices")
+	}
+	if len(nvidiaDevices) != 2 {
+		t.Fatalf("expected 2 container slots, got %d", len(nvidiaDevices))
+	}
+	if len(nvidiaDevices[0]) != 0 {
+		t.Fatalf("expected empty init container devices, got %+v", nvidiaDevices[0])
+	}
+	if len(nvidiaDevices[1]) != 1 {
+		t.Fatalf("expected 1 device on main container, got %+v", nvidiaDevices[1])
+	}
+	if nvidiaDevices[1][0].UUID != "GPU-962d9630-a4ef-dc16-a50d-b2effb90239d" {
+		t.Fatalf("unexpected device UUID: %s", nvidiaDevices[1][0].UUID)
+	}
+	if nvidiaDevices[1][0].Priority != "1" {
+		t.Fatalf("unexpected priority: %s", nvidiaDevices[1][0].Priority)
+	}
+}