Skip to content

Commit 232d4ae

Browse files
committed
feat: add cdi yaml
Signed-off-by: james <open4pd@4paradigm.com>
1 parent 0342ed3 commit 232d4ae

3 files changed

Lines changed: 338 additions & 2 deletions

File tree

README.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,14 @@ You can set the sharing mode and customize your installation by adjusting the [c
135135

136136
Once you have enabled this option on *all* the GPU nodes you wish to use,
137137
you can then enable GPU support in your cluster by deploying one of the following DaemonSets:
138+
#### Normal Mode
138139

139140
```
140-
$ kubectl create -f volcano-vgpu-device-plugin.yml
141+
$ kubectl apply -f volcano-vgpu-device-plugin.yml
142+
```
143+
#### CDI Mode
144+
```
145+
$ kubectl apply -f volcano-vgpu-device-plugin-cdi.yml
141146
```
142147
143148
### Verify environment is ready

volcano-vgpu-device-plugin-cdi.yml

Lines changed: 331 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,331 @@
1+
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
---
15+
# ConfigMap holding the device-config.yaml document: the extended-resource
# names, default/scaling values and the known MIG geometries per GPU model.
apiVersion: v1
kind: ConfigMap
metadata:
  name: volcano-vgpu-device-config
  namespace: kube-system
  labels:
    app.kubernetes.io/component: volcano-vgpu-device-plugin
data:
  # The value is an embedded YAML document stored as a single string
  # ("|-" keeps the newlines and strips the trailing one).
  # Every model name appears once per "models" list.
  device-config.yaml: |-
    nvidia:
      resourceCountName: volcano.sh/vgpu-number
      resourceMemoryName: volcano.sh/vgpu-memory
      resourceMemoryPercentageName: volcano.sh/vgpu-memory-percentage
      resourceCoreName: volcano.sh/vgpu-cores
      overwriteEnv: false
      defaultMemory: 0
      defaultCores: 0
      defaultGPUNum: 1
      deviceSplitCount: 10
      deviceMemoryScaling: 1
      deviceCoreScaling: 1
      gpuMemoryFactor: 1
      knownMigGeometries:
        - models: [ "A30" ]
          allowedGeometries:
            - group: group1
              geometries:
                - name: 1g.6gb
                  memory: 6144
                  count: 4
            - group: group2
              geometries:
                - name: 2g.12gb
                  memory: 12288
                  count: 2
            - group: group3
              geometries:
                - name: 4g.24gb
                  memory: 24576
                  count: 1
        - models: [ "A100-SXM4-40GB", "A100-40GB-PCIe", "A100-PCIE-40GB" ]
          allowedGeometries:
            - group: "group1"
              geometries:
                - name: 1g.5gb
                  memory: 5120
                  count: 7
            - group: "group2"
              geometries:
                - name: 2g.10gb
                  memory: 10240
                  count: 3
                - name: 1g.5gb
                  memory: 5120
                  count: 1
            - group: "group3"
              geometries:
                - name: 3g.20gb
                  memory: 20480
                  count: 2
            - group: "group4"
              geometries:
                - name: 7g.40gb
                  memory: 40960
                  count: 1
        - models: [ "A100-SXM4-80GB", "A100-80GB-PCIe", "A100-PCIE-80GB"]
          allowedGeometries:
            - group: "group1"
              geometries:
                - name: 1g.10gb
                  memory: 10240
                  count: 7
            - group: "group2"
              geometries:
                - name: 2g.20gb
                  memory: 20480
                  count: 3
                - name: 1g.10gb
                  memory: 10240
                  count: 1
            - group: "group3"
              geometries:
                - name: 3g.40gb
                  memory: 40960
                  count: 2
            - group: "group4"
              geometries:
                - name: 7g.79gb
                  memory: 80896
                  count: 1
105+
---
106+
# ConfigMap holding config.json, a JSON document with a "nodeconfig" list of
# per-node entries (one entry is defined here).
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: kube-system
  name: volcano-vgpu-node-config
  labels:
    app.kubernetes.io/component: volcano-vgpu-node-plugin
data:
  # Stored as a literal block scalar; the JSON text is passed through as-is.
  config.json: |
    {
      "nodeconfig": [
        {
          "name": "ucloud-wlcb-gpu-073",
          "operatingmode": "hami-core",
          "devicememoryscaling": 1,
          "devicesplitcount": 10,
          "migstrategy":"none",
          "filterdevices": {
            "uuid": [],
            "index": []
          }
        }
      ]
    }
130+
---
131+
# ServiceAccount named volcano-device-plugin in kube-system.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: kube-system
  name: volcano-device-plugin
136+
---
137+
# Cluster-wide permissions on core-group ("") resources:
# nodes, nodes/status, pods and configmaps.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: volcano-device-plugin
rules:
  # Nodes: read, watch and modify.
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get", "list", "watch", "update", "patch"]
  # Node status subresource: patch only.
  - apiGroups: [""]
    resources: ["nodes/status"]
    verbs: ["patch"]
  # Pods: read, watch and modify.
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "update", "patch", "watch"]
  # ConfigMaps: read, watch, create and update.
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["get", "list", "watch", "create", "update"]
154+
---
155+
# Grants the volcano-device-plugin ClusterRole to the volcano-device-plugin
# ServiceAccount in kube-system.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: volcano-device-plugin
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: volcano-device-plugin
subjects:
  - kind: ServiceAccount
    namespace: kube-system
    name: volcano-device-plugin
167+
---
168+
# DaemonSet running two containers per pod: the volcano-device-plugin
# container and the "monitor" container (volcano-vgpu-monitor). Both share
# the host's /tmp through the "hosttmp" volume.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: volcano-device-plugin
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: volcano-device-plugin
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      # This annotation is deprecated. Kept here for backward compatibility
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      annotations:
        scheduler.alpha.kubernetes.io/critical-pod: ""
      labels:
        name: volcano-device-plugin
    spec:
      # Requires a RuntimeClass named "nvidia" to exist in the cluster.
      runtimeClassName: nvidia
      tolerations:
        # This toleration is deprecated. Kept here for backward compatibility
        # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
        - key: CriticalAddonsOnly
          operator: Exists
        - key: volcano.sh/gpu-memory
          operator: Exists
          effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      # "serviceAccountName" is the current field; "serviceAccount" is its
      # deprecated alias.
      serviceAccountName: volcano-device-plugin
      containers:
        # NOTE(review): the non-CDI manifest pulls
        # docker.io/projecthami/volcano-vgpu-device-plugin; confirm that this
        # registry is the intended public location for the CDI manifest.
        - name: volcano-device-plugin
          image: harbor-contest.4pd.io/denglong/volcano-vgpu-device-plugin:v1.12.0
          imagePullPolicy: Always
          args:
            - --device-split-count=10
          lifecycle:
            postStart:
              exec:
                # Copies the files under /k8s-vgpu/lib/nvidia/ into
                # /usr/local/vgpu, which is the "lib" hostPath mount below.
                command: ["/bin/sh", "-c", "cp -f /k8s-vgpu/lib/nvidia/* /usr/local/vgpu/"]
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
            - name: HOOK_PATH
              value: "/usr/local/vgpu"
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_MIG_MONITOR_DEVICES
              value: "all"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "utility"
            # NOTE(review): this is the CDI manifest, yet the strategy is left
            # at "envvar" and the "cdi-annotations" value is commented out;
            # confirm which value is intended.
            - name: DEVICE_LIST_STRATEGY
              value: "envvar"
              # value: "cdi-annotations"
            - name: NVIDIA_DRIVER_ROOT
              value: "/"
            - name: NVIDIA_CDI_HOOK_PATH
              value: "/usr/bin/nvidia-ctk"
            - name: GDRCOPY_ENABLED
              value: "false"
            - name: GDS_ENABLED
              value: "false"
            - name: MOFED_ENABLED
              value: "false"
          securityContext:
            allowPrivilegeEscalation: true
            privileged: true
            capabilities:
              drop: ["ALL"]
              add: ["SYS_ADMIN"]
          volumeMounts:
            - name: deviceconfig
              mountPath: /config
            - name: device-plugin
              mountPath: /var/lib/kubelet/device-plugins
            - name: lib
              mountPath: /usr/local/vgpu
            - name: hosttmp
              mountPath: /tmp
            - name: driver-root
              mountPath: /driver-root
              readOnly: true
        # Same image repository as the plugin container; the tag is kept
        # identical so both containers run the same release.
        - name: monitor
          image: harbor-contest.4pd.io/denglong/volcano-vgpu-device-plugin:v1.12.0
          command:
            - /bin/bash
            - -c
            - volcano-vgpu-monitor
          env:
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_MIG_MONITOR_DEVICES
              value: "all"
            - name: HOOK_PATH
              value: "/tmp/vgpu"
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          securityContext:
            privileged: true
            allowPrivilegeEscalation: true
            capabilities:
              drop: ["ALL"]
              add: ["SYS_ADMIN"]
          volumeMounts:
            - name: dockers
              mountPath: /run/docker
            - name: containerds
              mountPath: /run/containerd
            - name: sysinfo
              mountPath: /sysinfo
            - name: hostvar
              mountPath: /hostvar
            - name: hosttmp
              mountPath: /tmp
      volumes:
        # Mounted at /config in the plugin container.
        - name: deviceconfig
          configMap:
            name: volcano-vgpu-node-config
        - name: device-plugin
          hostPath:
            path: /var/lib/kubelet/device-plugins
            type: Directory
        - name: lib
          hostPath:
            path: /usr/local/vgpu
            type: DirectoryOrCreate
        - name: hosttmp
          hostPath:
            path: /tmp
            type: DirectoryOrCreate
        - name: dockers
          hostPath:
            path: /run/docker
            type: DirectoryOrCreate
        - name: containerds
          hostPath:
            path: /run/containerd
            type: DirectoryOrCreate
        # NOTE(review): no container in this manifest mounts "usrbin";
        # confirm whether a mount is missing or the volume can be dropped.
        - name: usrbin
          hostPath:
            path: /usr/bin
            type: Directory
        - name: sysinfo
          hostPath:
            path: /sys
            type: Directory
        - name: hostvar
          hostPath:
            path: /var
            type: Directory
        # Host root filesystem; mounted read-only at /driver-root.
        - name: driver-root
          hostPath:
            path: /
            type: Directory

volcano-vgpu-device-plugin.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ spec:
200200
priorityClassName: "system-node-critical"
201201
serviceAccount: volcano-device-plugin
202202
containers:
203-
- image: docker.io/projecthami/volcano-vgpu-device-plugin:v1.11.0
203+
- image: docker.io/projecthami/volcano-vgpu-device-plugin:v1.12.0
204204
args: ["--device-split-count=10"]
205205
lifecycle:
206206
postStart:

0 commit comments

Comments
 (0)