Skip to content

Commit ac0d588

Browse files
committed
Set the CUDA_MPS_ACTIVE_THREAD_PERCENTAGE environment variable in container allocation responses when MPS is enabled.
Signed-off-by: 630445639 <630445639@qq.com>
1 parent eb657d1 commit ac0d588

2 files changed

Lines changed: 15 additions & 8 deletions

File tree

pkg/gpu/nvidia/allocate.go

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,21 @@ func init() {
2121
kubeInit()
2222
}
2323

24-
func buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
24+
func (m *NvidiaDevicePlugin) buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
2525
responses := pluginapi.AllocateResponse{}
2626
for _, req := range reqs.ContainerRequests {
2727
response := pluginapi.ContainerAllocateResponse{
2828
Envs: map[string]string{
29-
envNVGPU: fmt.Sprintf("no-gpu-has-%dMiB-to-run", podReqGPU),
29+
envNVGPU: fmt.Sprintf("no-gpu-has-%dGiB-to-run", podReqGPU),
3030
EnvResourceIndex: fmt.Sprintf("-1"),
3131
EnvResourceByPod: fmt.Sprintf("%d", podReqGPU),
3232
EnvResourceByContainer: fmt.Sprintf("%d", uint(len(req.DevicesIDs))),
3333
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
3434
},
3535
}
36+
if m.mps {
37+
response.Envs[EnvPercentage] = fmt.Sprintf("%d", 100*uint(len(req.DevicesIDs))/getGPUMemory()) //
38+
}
3639
responses.ContainerResponses = append(responses.ContainerResponses, &response)
3740
}
3841
return &responses
@@ -62,7 +65,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
6265
pods, err := getCandidatePods()
6366
if err != nil {
6467
log.Infof("invalid allocation requst: Failed to find candidate pods due to %v", err)
65-
return buildErrResponse(reqs, podReqGPU), nil
68+
return m.buildErrResponse(reqs, podReqGPU), nil
6669
}
6770

6871
if log.V(4) {
@@ -106,7 +109,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
106109
}
107110

108111
if id < 0 {
109-
return buildErrResponse(reqs, podReqGPU), nil
112+
return m.buildErrResponse(reqs, podReqGPU), nil
110113
}
111114

112115
// 1. Create container requests
@@ -121,6 +124,9 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
121124
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
122125
},
123126
}
127+
if m.mps {
128+
response.Envs[EnvPercentage] = fmt.Sprintf("%d", 100*reqGPU/getGPUMemory())
129+
}
124130
responses.ContainerResponses = append(responses.ContainerResponses, &response)
125131
}
126132

@@ -134,25 +140,25 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
134140
pod, err := clientset.CoreV1().Pods(assumePod.Namespace).Get(assumePod.Name, metav1.GetOptions{})
135141
if err != nil {
136142
log.Warningf("Failed due to %v", err)
137-
return buildErrResponse(reqs, podReqGPU), nil
143+
return m.buildErrResponse(reqs, podReqGPU), nil
138144
}
139145
newPod = updatePodAnnotations(pod)
140146
_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
141147
if err != nil {
142148
log.Warningf("Failed due to %v", err)
143-
return buildErrResponse(reqs, podReqGPU), nil
149+
return m.buildErrResponse(reqs, podReqGPU), nil
144150
}
145151
} else {
146152
log.Warningf("Failed due to %v", err)
147-
return buildErrResponse(reqs, podReqGPU), nil
153+
return m.buildErrResponse(reqs, podReqGPU), nil
148154
}
149155
}
150156

151157
} else {
152158
log.Warningf("invalid allocation requst: request GPU memory %d can't be satisfied.",
153159
podReqGPU)
154160
// return &responses, fmt.Errorf("invalid allocation requst: request GPU memory %d can't be satisfied", reqGPU)
155-
return buildErrResponse(reqs, podReqGPU), nil
161+
return m.buildErrResponse(reqs, podReqGPU), nil
156162
}
157163

158164
log.Infof("new allocated GPUs info %v", &responses)

pkg/gpu/nvidia/const.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ const (
2828
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
2929
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
3030
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
31+
EnvPercentage = "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"
3132
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
3233

3334
GiBPrefix = MemoryUnit("GiB")

0 commit comments

Comments
 (0)