Skip to content

Commit 21585bf

Browse files
committed
Add MPS environment variables and a pipe-directory volume mount to the allocation response when MPS is enabled.
Signed-off-by: 630445639 <630445639@qq.com>
1 parent eb657d1 commit 21585bf

6 files changed

Lines changed: 44 additions & 23 deletions

File tree

cmd/nvidia/main.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,13 @@ var (
1111
mps = flag.Bool("mps", false, "Enable or Disable MPS")
1212
healthCheck = flag.Bool("health-check", false, "Enable or disable Health check")
1313
memoryUnit = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memroy, support 'GiB' and 'MiB'")
14+
mpspipe = flag.String("mps-pipe", "/tmp/nvidia-mps", " pipes and UNIX domain sockets")
1415
)
1516

1617
func main() {
1718
flag.Parse()
1819
log.V(1).Infoln("Start gpushare device plugin")
19-
ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, translatememoryUnits(*memoryUnit))
20+
ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, *mpspipe, translatememoryUnits(*memoryUnit))
2021
err := ngm.Run()
2122
if err != nil {
2223
log.Fatalf("Failed due to %v", err)

device-plugin-ds.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ spec:
2525
- gpushare-device-plugin-v2
2626
- -logtostderr
2727
- --v=5
28+
# - --mps-pipe=/root/nvidia-mps  # Directory through which the MPS client and MPS server communicate (default: /tmp/nvidia-mps); change it if needed.
29+
# - --mps=true  # Uncomment to enable MPS.
2830
- --memory-unit=GiB
2931
resources:
3032
limits:

pkg/gpu/nvidia/allocate.go

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,22 @@ func init() {
2121
kubeInit()
2222
}
2323

24-
func buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
24+
func (m *NvidiaDevicePlugin) buildErrResponse(reqs *pluginapi.AllocateRequest, podReqGPU uint) *pluginapi.AllocateResponse {
2525
responses := pluginapi.AllocateResponse{}
2626
for _, req := range reqs.ContainerRequests {
2727
response := pluginapi.ContainerAllocateResponse{
2828
Envs: map[string]string{
29-
envNVGPU: fmt.Sprintf("no-gpu-has-%dMiB-to-run", podReqGPU),
29+
envNVGPU: fmt.Sprintf("no-gpu-has-%dGiB-to-run", podReqGPU),
3030
EnvResourceIndex: fmt.Sprintf("-1"),
3131
EnvResourceByPod: fmt.Sprintf("%d", podReqGPU),
3232
EnvResourceByContainer: fmt.Sprintf("%d", uint(len(req.DevicesIDs))),
3333
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
3434
},
3535
}
36+
if m.mps {
37+
response.Envs[EnvMPSActiveThreadPercentage] = fmt.Sprintf("%d", 100*uint(len(req.DevicesIDs))/getGPUMemory())
38+
response.Envs[EnvMPSPipeDirectory] = fmt.Sprintf(m.mpspipe)
39+
}
3640
responses.ContainerResponses = append(responses.ContainerResponses, &response)
3741
}
3842
return &responses
@@ -62,7 +66,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
6266
pods, err := getCandidatePods()
6367
if err != nil {
6468
log.Infof("invalid allocation requst: Failed to find candidate pods due to %v", err)
65-
return buildErrResponse(reqs, podReqGPU), nil
69+
return m.buildErrResponse(reqs, podReqGPU), nil
6670
}
6771

6872
if log.V(4) {
@@ -106,7 +110,7 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
106110
}
107111

108112
if id < 0 {
109-
return buildErrResponse(reqs, podReqGPU), nil
113+
return m.buildErrResponse(reqs, podReqGPU), nil
110114
}
111115

112116
// 1. Create container requests
@@ -121,6 +125,15 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
121125
EnvResourceByDev: fmt.Sprintf("%d", getGPUMemory()),
122126
},
123127
}
128+
if m.mps {
129+
response.Envs[EnvMPSActiveThreadPercentage] = fmt.Sprintf("%d", 100*reqGPU/getGPUMemory())
130+
response.Envs[EnvMPSPipeDirectory] = fmt.Sprintf(m.mpspipe)
131+
mount := pluginapi.Mount{
132+
ContainerPath: m.mpspipe,
133+
HostPath: m.mpspipe,
134+
}
135+
response.Mounts = append(response.Mounts, &mount)
136+
}
124137
responses.ContainerResponses = append(responses.ContainerResponses, &response)
125138
}
126139

@@ -134,25 +147,25 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context,
134147
pod, err := clientset.CoreV1().Pods(assumePod.Namespace).Get(assumePod.Name, metav1.GetOptions{})
135148
if err != nil {
136149
log.Warningf("Failed due to %v", err)
137-
return buildErrResponse(reqs, podReqGPU), nil
150+
return m.buildErrResponse(reqs, podReqGPU), nil
138151
}
139152
newPod = updatePodAnnotations(pod)
140153
_, err = clientset.CoreV1().Pods(newPod.Namespace).Update(newPod)
141154
if err != nil {
142155
log.Warningf("Failed due to %v", err)
143-
return buildErrResponse(reqs, podReqGPU), nil
156+
return m.buildErrResponse(reqs, podReqGPU), nil
144157
}
145158
} else {
146159
log.Warningf("Failed due to %v", err)
147-
return buildErrResponse(reqs, podReqGPU), nil
160+
return m.buildErrResponse(reqs, podReqGPU), nil
148161
}
149162
}
150163

151164
} else {
152165
log.Warningf("invalid allocation requst: request GPU memory %d can't be satisfied.",
153166
podReqGPU)
154167
// return &responses, fmt.Errorf("invalid allocation requst: request GPU memory %d can't be satisfied", reqGPU)
155-
return buildErrResponse(reqs, podReqGPU), nil
168+
return m.buildErrResponse(reqs, podReqGPU), nil
156169
}
157170

158171
log.Infof("new allocated GPUs info %v", &responses)

pkg/gpu/nvidia/const.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,16 @@ const (
2121
containerLogPathLabelKey = "io.kubernetes.container.logpath"
2222
sandboxIDLabelKey = "io.kubernetes.sandbox.id"
2323

24-
envNVGPU = "NVIDIA_VISIBLE_DEVICES"
25-
EnvResourceIndex = "ALIYUN_COM_GPU_MEM_IDX"
26-
EnvResourceByPod = "ALIYUN_COM_GPU_MEM_POD"
27-
EnvResourceByContainer = "ALIYUN_COM_GPU_MEM_CONTAINER"
28-
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
29-
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
30-
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
31-
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
24+
envNVGPU = "NVIDIA_VISIBLE_DEVICES"
25+
EnvResourceIndex = "ALIYUN_COM_GPU_MEM_IDX"
26+
EnvResourceByPod = "ALIYUN_COM_GPU_MEM_POD"
27+
EnvResourceByContainer = "ALIYUN_COM_GPU_MEM_CONTAINER"
28+
EnvResourceByDev = "ALIYUN_COM_GPU_MEM_DEV"
29+
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
30+
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
31+
EnvMPSPipeDirectory = "CUDA_MPS_PIPE_DIRECTORY"
32+
EnvMPSActiveThreadPercentage = "CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"
33+
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"
3234

3335
GiBPrefix = MemoryUnit("GiB")
3436
MiBPrefix = MemoryUnit("MiB")

pkg/gpu/nvidia/gpumanager.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,15 @@ import (
1414
type sharedGPUManager struct {
1515
enableMPS bool
1616
healthCheck bool
17+
mpspipe string
1718
}
1819

19-
func NewSharedGPUManager(enableMPS, healthCheck bool, bp MemoryUnit) *sharedGPUManager {
20+
func NewSharedGPUManager(enableMPS, healthCheck bool, mpspipe string, bp MemoryUnit) *sharedGPUManager {
2021
metric = bp
2122
return &sharedGPUManager{
2223
enableMPS: enableMPS,
2324
healthCheck: healthCheck,
25+
mpspipe: mpspipe,
2426
}
2527
}
2628

@@ -61,7 +63,7 @@ L:
6163
devicePlugin.Stop()
6264
}
6365

64-
devicePlugin = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck)
66+
devicePlugin = NewNvidiaDevicePlugin(ngm.enableMPS, ngm.healthCheck, ngm.mpspipe)
6567
if err := devicePlugin.Serve(); err != nil {
6668
log.Warningf("Failed to start device plugin due to %v", err)
6769
} else {

pkg/gpu/nvidia/server.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ type NvidiaDevicePlugin struct {
2222
devIndxMap map[uint]string
2323
socket string
2424
mps bool
25+
mpspipe string
2526
healthCheck bool
2627

2728
stop chan struct{}
@@ -32,7 +33,7 @@ type NvidiaDevicePlugin struct {
3233
}
3334

3435
// NewNvidiaDevicePlugin returns an initialized NvidiaDevicePlugin
35-
func NewNvidiaDevicePlugin(mps, healthCheck bool) *NvidiaDevicePlugin {
36+
func NewNvidiaDevicePlugin(mps, healthCheck bool, mpspipe string) *NvidiaDevicePlugin {
3637
devs, devNameMap := getDevices()
3738
devList := []string{}
3839

@@ -54,10 +55,10 @@ func NewNvidiaDevicePlugin(mps, healthCheck bool) *NvidiaDevicePlugin {
5455
devNameMap: devNameMap,
5556
socket: serverSock,
5657
mps: mps,
58+
mpspipe: mpspipe,
5759
healthCheck: healthCheck,
58-
59-
stop: make(chan struct{}),
60-
health: make(chan *pluginapi.Device),
60+
stop: make(chan struct{}),
61+
health: make(chan *pluginapi.Device),
6162
}
6263
}
6364

0 commit comments

Comments
 (0)