Skip to content

Commit 9593759

Browse files
committed
fix: resolve index out of range panic in fetchContainerInfo for Ascend devices
- Pre-allocate bizContainerDevices based on actual container count to prevent out-of-bounds access when annotation device count differs from pod container count - Merge devices from all device types per container instead of overwriting, which previously caused device data loss for multi-device-type pods - Add container index boundary check in DecodePodDevices Ascend branch to align with NVIDIA/Hygon/Metax device handling Fixes #94 Signed-off-by: handong <cygnushan@yunify.com>
1 parent f60dbed commit 9593759

3 files changed

Lines changed: 189 additions & 5 deletions

File tree

server/internal/data/pod.go

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,23 @@ func (r *podRepo) fetchContainerInfo(pod *corev1.Pod) []*biz.Container {
112112
if err != nil {
113113
return containers
114114
}
115-
bizContainerDevices := []biz.ContainerDevices{}
115+
// Merge devices from all device types per container index.
116+
// pdevices: map[deviceType]PodSingleDevice([]ContainerDevices)
117+
numContainers := len(pod.Spec.Containers)
118+
bizContainerDevices := make([]biz.ContainerDevices, numContainers)
116119
for _, pds := range pdevices {
117-
copier.Copy(&bizContainerDevices, pds)
120+
var bizPds biz.PodSingleDevice
121+
if err := copier.Copy(&bizPds, pds); err != nil {
122+
r.log.Warnf("failed to copy pod device info: %v", err)
123+
continue
124+
}
125+
for i, cd := range bizPds {
126+
if i < numContainers {
127+
bizContainerDevices[i] = append(bizContainerDevices[i], cd...)
128+
}
129+
}
118130
}
119-
if len(bizContainerDevices) < 1 {
131+
if len(pdevices) == 0 {
120132
return containers
121133
}
122134

server/internal/provider/util/util.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,14 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
317317
pd[devType] = make(PodSingleDevice, 0)
318318
switch devType {
319319
case AscendGPUDevice, Ascend310PGPUDevice:
320-
for _, s := range strings.Split(str, OnePodMultiContainerSplitSymbol) {
320+
for i, s := range strings.Split(str, OnePodMultiContainerSplitSymbol) {
321+
if i >= len(pod.Spec.Containers) {
322+
break
323+
}
324+
if s == "" {
325+
pd[devType] = append(pd[devType], ContainerDevices{})
326+
continue
327+
}
321328
cd, err := DecodeNpuContainerDevices(s)
322329
if err != nil {
323330
return PodDevices{}, nil

server/internal/provider/util/util_test.go

Lines changed: 166 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,13 @@ limitations under the License.
1616
package util
1717

1818
import (
19-
"github.com/go-kratos/kratos/v2/log"
19+
"os"
20+
"reflect"
2021
"testing"
22+
23+
"github.com/go-kratos/kratos/v2/log"
24+
corev1 "k8s.io/api/core/v1"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2126
)
2227

2328
var inRequestDevices map[string]string
@@ -345,3 +350,163 @@ func Test_DecodePodDevices(t *testing.T) {
345350
})
346351
}
347352
}
353+
354+
func Test_DecodePodDevices_Ascend(t *testing.T) {
355+
// Ensure Ascend device keys are registered in SupportDevices.
356+
SupportDevices["Ascend"] = "hami.io/Ascend910B-devices-allocated"
357+
SupportDevices["Ascend310P"] = "hami.io/Ascend310P-devices-allocated"
358+
t.Cleanup(func() {
359+
delete(SupportDevices, "Ascend")
360+
delete(SupportDevices, "Ascend310P")
361+
})
362+
363+
logger := log.NewHelper(log.NewStdLogger(os.Stdout))
364+
365+
tests := []struct {
366+
name string
367+
pod *corev1.Pod
368+
want PodDevices
369+
wantErr bool
370+
}{
371+
{
372+
name: "Ascend310P single container single device",
373+
pod: &corev1.Pod{
374+
ObjectMeta: metav1.ObjectMeta{
375+
Annotations: map[string]string{
376+
"hami.io/Ascend310P-devices-allocated": "E0766E64-20C0AB59-CC9AB1A4-3778030A-83003019,Ascend310P,6144,100:",
377+
},
378+
},
379+
Spec: corev1.PodSpec{
380+
Containers: []corev1.Container{{Name: "main"}},
381+
},
382+
},
383+
want: PodDevices{
384+
"Ascend310P": {
385+
{
386+
{Idx: 0, UUID: "E0766E64-20C0AB59-CC9AB1A4-3778030A-83003019", Type: "Ascend310P", Usedmem: 6144, Usedcores: 25},
387+
},
388+
},
389+
},
390+
wantErr: false,
391+
},
392+
{
393+
name: "Ascend310P two containers",
394+
pod: &corev1.Pod{
395+
ObjectMeta: metav1.ObjectMeta{
396+
Annotations: map[string]string{
397+
"hami.io/Ascend310P-devices-allocated": "E0766E64-20C0AB59-CC9AB1A4-3778030A-83003019,Ascend310P,6144,100:;D7E96E64-214123F1-E8E618E4-AED8030A-E3003039,Ascend310P,6144,100:",
398+
},
399+
},
400+
Spec: corev1.PodSpec{
401+
Containers: []corev1.Container{
402+
{Name: "ctr1"},
403+
{Name: "ctr2"},
404+
},
405+
},
406+
},
407+
want: PodDevices{
408+
"Ascend310P": {
409+
{
410+
{Idx: 0, UUID: "E0766E64-20C0AB59-CC9AB1A4-3778030A-83003019", Type: "Ascend310P", Usedmem: 6144, Usedcores: 25},
411+
},
412+
{
413+
{Idx: 0, UUID: "D7E96E64-214123F1-E8E618E4-AED8030A-E3003039", Type: "Ascend310P", Usedmem: 6144, Usedcores: 25},
414+
},
415+
},
416+
},
417+
wantErr: false,
418+
},
419+
{
420+
name: "Ascend310P annotation segments exceed container count - should truncate",
421+
pod: &corev1.Pod{
422+
ObjectMeta: metav1.ObjectMeta{
423+
Annotations: map[string]string{
424+
// Two device segments but only one container in spec.
425+
"hami.io/Ascend310P-devices-allocated": "E0766E64-20C0AB59-CC9AB1A4-3778030A-83003019,Ascend310P,6144,100:;D7E96E64-214123F1-E8E618E4-AED8030A-E3003039,Ascend310P,6144,100:",
426+
},
427+
},
428+
Spec: corev1.PodSpec{
429+
Containers: []corev1.Container{{Name: "main"}},
430+
},
431+
},
432+
want: PodDevices{
433+
"Ascend310P": {
434+
{
435+
{Idx: 0, UUID: "E0766E64-20C0AB59-CC9AB1A4-3778030A-83003019", Type: "Ascend310P", Usedmem: 6144, Usedcores: 25},
436+
},
437+
},
438+
},
439+
wantErr: false,
440+
},
441+
{
442+
name: "Ascend310P empty segment in middle should produce empty ContainerDevices placeholder",
443+
pod: &corev1.Pod{
444+
ObjectMeta: metav1.ObjectMeta{
445+
Annotations: map[string]string{
446+
"hami.io/Ascend310P-devices-allocated": "E0766E64-20C0AB59-CC9AB1A4-3778030A-83003019,Ascend310P,6144,100:;;D7E96E64-214123F1-E8E618E4-AED8030A-E3003039,Ascend310P,6144,100:",
447+
},
448+
},
449+
Spec: corev1.PodSpec{
450+
Containers: []corev1.Container{
451+
{Name: "ctr1"},
452+
{Name: "ctr2"},
453+
{Name: "ctr3"},
454+
},
455+
},
456+
},
457+
want: PodDevices{
458+
"Ascend310P": {
459+
{
460+
{Idx: 0, UUID: "E0766E64-20C0AB59-CC9AB1A4-3778030A-83003019", Type: "Ascend310P", Usedmem: 6144, Usedcores: 25},
461+
},
462+
{}, // empty segment placeholder
463+
{
464+
{Idx: 0, UUID: "D7E96E64-214123F1-E8E618E4-AED8030A-E3003039", Type: "Ascend310P", Usedmem: 6144, Usedcores: 25},
465+
},
466+
},
467+
},
468+
wantErr: false,
469+
},
470+
{
471+
name: "Ascend310P malformed device should produce empty device set",
472+
pod: &corev1.Pod{
473+
ObjectMeta: metav1.ObjectMeta{
474+
Annotations: map[string]string{
475+
"hami.io/Ascend310P-devices-allocated": "bad-format-string",
476+
},
477+
},
478+
Spec: corev1.PodSpec{
479+
Containers: []corev1.Container{{Name: "main"}},
480+
},
481+
},
482+
want: PodDevices{"Ascend310P": {}},
483+
wantErr: false,
484+
},
485+
{
486+
name: "empty annotations should return empty PodDevices",
487+
pod: &corev1.Pod{
488+
ObjectMeta: metav1.ObjectMeta{
489+
Annotations: map[string]string{},
490+
},
491+
Spec: corev1.PodSpec{
492+
Containers: []corev1.Container{{Name: "main"}},
493+
},
494+
},
495+
want: PodDevices{},
496+
wantErr: false,
497+
},
498+
}
499+
500+
for _, tt := range tests {
501+
t.Run(tt.name, func(t *testing.T) {
502+
got, err := DecodePodDevices(tt.pod, logger)
503+
if (err != nil) != tt.wantErr {
504+
t.Errorf("DecodePodDevices() error = %v, wantErr %v", err, tt.wantErr)
505+
return
506+
}
507+
if !reflect.DeepEqual(got, tt.want) {
508+
t.Errorf("DecodePodDevices() = %+v, want %+v", got, tt.want)
509+
}
510+
})
511+
}
512+
}

0 commit comments

Comments
 (0)