Skip to content

Commit 7c3dc77

Browse files
FouoF and cursoragent committed
fix: decode pod device annotations with init containers
HAMi encodes vgpu-devices-allocated entries for init containers and regular containers in order; align decoding and container mapping with that index layout so GPU usage shows correctly in the WebUI.

Co-authored-by: Cursor <cursoragent@cursor.com>
Signed-off-by: Jifei Wang <jifei.wang@dynamia.ai>
1 parent 8f42445 commit 7c3dc77

3 files changed

Lines changed: 90 additions & 28 deletions

File tree

server/internal/data/pod.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,13 @@ func (r *podRepo) fetchContainerInfo(pod *corev1.Pod) []*biz.Container {
132132
}
133133
}
134134

135+
initContainerOffset := len(pod.Spec.InitContainers)
135136
for i, ctr := range pod.Spec.Containers {
137+
deviceIdx := initContainerOffset + i
138+
var containerDevices biz.ContainerDevices
139+
if deviceIdx < len(bizContainerDevices) {
140+
containerDevices = bizContainerDevices[deviceIdx]
141+
}
136142
c := &biz.Container{
137143
Name: ctr.Name,
138144
UUID: ctrIdMaps[ctr.Name],
@@ -145,10 +151,10 @@ func (r *podRepo) fetchContainerInfo(pod *corev1.Pod) []*biz.Container {
145151
NodeUID: r.GetNodeUUID(pod),
146152
Namespace: pod.Namespace,
147153
CreateTime: r.GetCreateTime(pod),
148-
ContainerDevices: bizContainerDevices[i],
154+
ContainerDevices: containerDevices,
149155
}
150-
if len(bizContainerDevices[i]) > 0 {
151-
c.Priority = bizContainerDevices[i][0].Priority
156+
if len(containerDevices) > 0 {
157+
c.Priority = containerDevices[0].Priority
152158
}
153159
containers = append(containers, c)
154160
}

server/internal/provider/util/util.go

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -281,23 +281,32 @@ func DecodeMetaxContainerDevices(str string) (ContainerDevices, error) {
281281
return contdev, nil
282282
}
283283

284-
func GetContainerPriorities(pod *corev1.Pod) []string {
285-
var priorities []string
286-
284+
func getContainerPriority(ctr corev1.Container) string {
287285
nvidiaPriority := corev1.ResourceName(NVIDIAPriority)
288-
for _, ctr := range pod.Spec.Containers {
289-
priority := ""
290-
if limitPriority, ok := ctr.Resources.Limits[nvidiaPriority]; ok {
291-
priority = limitPriority.String()
292-
} else if requestPriority, ok := ctr.Resources.Requests[nvidiaPriority]; ok {
293-
priority = requestPriority.String()
294-
}
295-
priorities = append(priorities, priority)
286+
if limitPriority, ok := ctr.Resources.Limits[nvidiaPriority]; ok {
287+
return limitPriority.String()
288+
}
289+
if requestPriority, ok := ctr.Resources.Requests[nvidiaPriority]; ok {
290+
return requestPriority.String()
296291
}
292+
return ""
293+
}
297294

295+
func GetContainerPriorities(pod *corev1.Pod) []string {
296+
priorities := make([]string, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
297+
for _, ctr := range pod.Spec.InitContainers {
298+
priorities = append(priorities, getContainerPriority(ctr))
299+
}
300+
for _, ctr := range pod.Spec.Containers {
301+
priorities = append(priorities, getContainerPriority(ctr))
302+
}
298303
return priorities
299304
}
300305

306+
func podContainerCount(pod *corev1.Pod) int {
307+
return len(pod.Spec.InitContainers) + len(pod.Spec.Containers)
308+
}
309+
301310
func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
302311
checklist := SupportDevices
303312

@@ -339,7 +348,7 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
339348
pd[devType] = append(pd[devType], cd)
340349
case NvidiaGPUDevice:
341350
for i, s := range strings.Split(str, OnePodMultiContainerSplitSymbol) {
342-
if i >= len(pod.Spec.Containers) {
351+
if i >= podContainerCount(pod) {
343352
break
344353
}
345354
if s == "" {
@@ -350,14 +359,11 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
350359
if err != nil {
351360
return PodDevices{}, nil
352361
}
353-
if len(cd) == 0 {
354-
continue
355-
}
356362
pd[devType] = append(pd[devType], cd)
357363
}
358364
case HygonGPUDevice:
359365
for i, s := range strings.Split(str, OnePodMultiContainerSplitSymbol) {
360-
if i >= len(pod.Spec.Containers) {
366+
if i >= podContainerCount(pod) {
361367
break
362368
}
363369
if s == "" {
@@ -368,14 +374,11 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
368374
if err != nil {
369375
return PodDevices{}, nil
370376
}
371-
if len(cd) == 0 {
372-
continue
373-
}
374377
pd[devType] = append(pd[devType], cd)
375378
}
376379
case MetaxGPUDevice, MetaxSGPUDevice:
377380
for i, s := range strings.Split(str, OnePodMultiContainerSplitSymbol) {
378-
if i >= len(pod.Spec.Containers) {
381+
if i >= podContainerCount(pod) {
379382
break
380383
}
381384
if s == "" {
@@ -386,9 +389,6 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
386389
if err != nil {
387390
return PodDevices{}, nil
388391
}
389-
if len(cd) == 0 {
390-
continue
391-
}
392392
pd[devType] = append(pd[devType], cd)
393393
}
394394
}
@@ -416,4 +416,4 @@ func MapNewDeviceInfoToDeviceInfo(newDeviceInfo *NewDeviceInfo) *DeviceInfo {
416416
Mode: newDeviceInfo.Mode,
417417
Health: newDeviceInfo.Health,
418418
}
419-
}
419+
}

server/internal/provider/util/util_test.go

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,12 @@ limitations under the License.
1616
package util
1717

1818
import (
19-
"github.com/go-kratos/kratos/v2/log"
2019
"testing"
20+
21+
"github.com/go-kratos/kratos/v2/log"
22+
corev1 "k8s.io/api/core/v1"
23+
"k8s.io/apimachinery/pkg/api/resource"
24+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2125
)
2226

2327
var inRequestDevices map[string]string
@@ -345,3 +349,55 @@ func Test_DecodePodDevices(t *testing.T) {
345349
})
346350
}
347351
}
352+
353+
func TestDecodePodDevicesWithInitContainers(t *testing.T) {
354+
SupportDevices["NVIDIA"] = "hami.io/vgpu-devices-allocated"
355+
logger := log.NewHelper(log.DefaultLogger)
356+
357+
pod := &corev1.Pod{
358+
Spec: corev1.PodSpec{
359+
InitContainers: []corev1.Container{
360+
{Name: "init"},
361+
},
362+
Containers: []corev1.Container{
363+
{
364+
Name: "main",
365+
Resources: corev1.ResourceRequirements{
366+
Limits: corev1.ResourceList{
367+
corev1.ResourceName(NVIDIAPriority): resource.MustParse("1"),
368+
},
369+
},
370+
},
371+
},
372+
},
373+
ObjectMeta: metav1.ObjectMeta{
374+
Annotations: map[string]string{
375+
SupportDevices["NVIDIA"]: ";GPU-962d9630-a4ef-dc16-a50d-b2effb90239d,NVIDIA,6144,30:;",
376+
},
377+
},
378+
}
379+
380+
got, err := DecodePodDevices(pod, logger)
381+
if err != nil {
382+
t.Fatalf("DecodePodDevices() error = %v", err)
383+
}
384+
nvidiaDevices, ok := got["NVIDIA"]
385+
if !ok {
386+
t.Fatal("expected NVIDIA devices")
387+
}
388+
if len(nvidiaDevices) != 2 {
389+
t.Fatalf("expected 2 container slots, got %d", len(nvidiaDevices))
390+
}
391+
if len(nvidiaDevices[0]) != 0 {
392+
t.Fatalf("expected empty init container devices, got %+v", nvidiaDevices[0])
393+
}
394+
if len(nvidiaDevices[1]) != 1 {
395+
t.Fatalf("expected 1 device on main container, got %+v", nvidiaDevices[1])
396+
}
397+
if nvidiaDevices[1][0].UUID != "GPU-962d9630-a4ef-dc16-a50d-b2effb90239d" {
398+
t.Fatalf("unexpected device UUID: %s", nvidiaDevices[1][0].UUID)
399+
}
400+
if nvidiaDevices[1][0].Priority != "1" {
401+
t.Fatalf("unexpected priority: %s", nvidiaDevices[1][0].Priority)
402+
}
403+
}

0 commit comments

Comments
 (0)