Skip to content

Commit 436b483

Browse files
butler54 and claude committed
feat: add bare-metal GPU clusterGroup with NVIDIA confidential container support
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent fbce1aa commit 436b483

8 files changed

Lines changed: 327 additions & 0 deletions

File tree

charts/all/nvidia-gpu/Chart.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
# Helm chart metadata for the nvidia-gpu chart.
apiVersion: v2
name: nvidia-gpu
version: 0.0.1
description: >-
  NVIDIA GPU operator configuration for confidential containers
  (ClusterPolicy, IOMMU, VFIO).
keywords:
  - pattern
  - nvidia
  - gpu
  - confidential-computing
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
# ClusterPolicy for the NVIDIA GPU operator. The driver, device plugin and
# kata manager components are switched off; the sandbox device plugin, VFIO
# manager and GPU feature discovery are switched on, with kata sandbox
# workloads defaulting to vm-passthrough.
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
spec:
  ccManager:
    defaultMode: {{ .Values.ccManager.defaultMode | quote }}
    enabled: {{ .Values.ccManager.enabled }}
  cdi:
    enabled: true
    default: false
  driver:
    enabled: false
  devicePlugin:
    enabled: false
  gfd:
    enabled: true
  kataManager:
    enabled: false
  sandboxDevicePlugin:
    enabled: true
    env:
      # Env values must render as strings; quoting keeps an override such as
      # "1", "true" or an empty alias from being retyped by the YAML parser.
      - name: P_GPU_ALIAS
        value: {{ .Values.sandboxDevicePlugin.pgpuAlias | quote }}
      - name: NVSWITCH_ALIAS
        value: {{ .Values.sandboxDevicePlugin.nvswitchAlias | quote }}
  sandboxWorkloads:
    defaultWorkload: vm-passthrough
    enabled: true
    mode: kata
  vfioManager:
    enabled: true
    env:
      - name: BIND_NVSWITCHES
        value: {{ .Values.vfioManager.bindNvSwitches | quote }}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
{{- /* Emit one MachineConfig per node role; each adds the Intel and AMD IOMMU kernel arguments. */ -}}
{{- range $role := list "master" "worker" }}
---
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
  labels:
    machineconfiguration.openshift.io/role: {{ $role }}
  name: 100-iommu-kernel-args-{{ $role }}
spec:
  config:
    ignition:
      version: 3.2.0
  kernelArguments:
    - intel_iommu=on
    - amd_iommu=on
{{- end }}

charts/all/nvidia-gpu/values.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
# Default values for the nvidia-gpu chart.

# Rendered into spec.ccManager of the ClusterPolicy.
ccManager:
  enabled: true
  defaultMode: 'on'

# Values for the sandbox device plugin's P_GPU_ALIAS / NVSWITCH_ALIAS env vars.
sandboxDevicePlugin:
  pgpuAlias: pgpu
  nvswitchAlias: nvswitch

# Value for the VFIO manager's BIND_NVSWITCHES env var (kept as a string).
vfioManager:
  bindNvSwitches: 'true'
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
# Helm chart metadata for the gpu-workload chart.
apiVersion: v2
name: gpu-workload
version: 0.0.1
description: >-
  Sample GPU workload for confidential containers
  (NVIDIA GPU verification in a CoCo pod).
keywords:
  - pattern
  - nvidia
  - gpu
  - confidential-computing
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
# Verification pod that runs the NVIDIA cuda-sample vectoradd image under the
# configured runtime class, requesting a single passthrough GPU.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-cc-verify
  labels:
    app: gpu-cc-verify
  annotations:
    # Kata hypervisor default memory for the sandbox, taken from .Values.gpuMemory.
    io.katacontainers.config.hypervisor.default_memory: {{ .Values.gpuMemory | quote }}
spec:
  # Quoted so an empty or unusual override still renders as a string.
  runtimeClassName: {{ .Values.runtimeClassName | quote }}
  restartPolicy: OnFailure
  containers:
    - name: cuda-vectoradd
      image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0
      resources:
        limits:
          # NOTE(review): presumably the resource name follows the sandbox
          # device plugin's P_GPU_ALIAS ("pgpu" by default in the nvidia-gpu
          # chart) — confirm and keep the two in sync.
          nvidia.com/pgpu: 1
      # Unprivileged container: no escalation, all capabilities dropped,
      # runtime default seccomp profile.
      securityContext:
        privileged: false
        allowPrivilegeEscalation: false
        capabilities:
          drop:
            - ALL
        seccompProfile:
          type: RuntimeDefault
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
# Default values for the gpu-workload chart.

# Runtime class set on the gpu-cc-verify pod.
runtimeClassName: 'kata-cc-nvidia-gpu'

# Value of the Kata default_memory hypervisor annotation on the pod.
gpuMemory: '32768'

values-baremetal-gpu.yaml

Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
# Bare metal deployment for confidential containers with NVIDIA GPU support (Tech Preview).
# Supports Intel TDX and AMD SEV-SNP via auto-detection (NFD).
# Adds NVIDIA GPU Operator for GPU passthrough to confidential VMs.
# Set main.clusterGroupName: baremetal-gpu in values-global.yaml to use.

clusterGroup:
  name: baremetal-gpu
  isHubCluster: true

  # Namespaces declared for this cluster group.
  namespaces:
    - open-cluster-management
    - vault
    - golang-external-secrets
    - openshift-sandboxed-containers-operator
    - trustee-operator-system
    - cert-manager-operator
    - cert-manager
    - hello-openshift
    - kbs-access
    - openshift-cnv
    - openshift-storage
    - openshift-nfd
    - baremetal
    - intel-dcap
    - nvidia-gpu-operator
    - gpu-workload

  # Operator subscriptions. The sandbox and trustee operators are pinned to a
  # specific CSV with Manual install-plan approval.
  subscriptions:
    acm:
      name: advanced-cluster-management
      namespace: open-cluster-management
    sandbox:
      name: sandboxed-containers-operator
      namespace: openshift-sandboxed-containers-operator
      source: redhat-operators
      channel: stable
      installPlanApproval: Manual
      csv: sandboxed-containers-operator.v1.12.0
    trustee:
      name: trustee-operator
      namespace: trustee-operator-system
      source: redhat-operators
      channel: stable
      installPlanApproval: Manual
      csv: trustee-operator.v1.1.0
    cert-manager:
      name: openshift-cert-manager-operator
      namespace: cert-manager-operator
      channel: stable-v1
    lvm-operator:
      name: lvms-operator
      namespace: openshift-storage
      source: redhat-operators
      channel: stable-4.20
      installPlanApproval: Automatic
    cnv:
      name: kubevirt-hyperconverged
      namespace: openshift-cnv
      source: redhat-operators
      channel: stable
      installPlanApproval: Automatic
    nfd:
      name: nfd
      namespace: openshift-nfd
      channel: stable
    gpu-operator:
      name: gpu-operator-certified
      namespace: nvidia-gpu-operator
      source: certified-operators
      channel: v24.9
      installPlanApproval: Automatic

  # Projects referenced by the applications below.
  projects:
    - hub
    - vault
    - trustee
    - golang-external-secrets
    - sandbox
    - workloads
    - default

  # Extra value files, selected by cluster platform and storage provider.
  sharedValueFiles:
    - '/overrides/values-{{ $.Values.global.clusterPlatform }}.yaml'
    - '/overrides/values-storage-{{ $.Values.global.storageProvider }}.yaml'

  applications:
    acm:
      name: acm
      namespace: open-cluster-management
      project: hub
      chart: acm
      chartVersion: '0.1.*'

    vault:
      name: vault
      namespace: vault
      project: vault
      chart: hashicorp-vault
      chartVersion: '0.1.*'

    secrets-operator:
      name: golang-external-secrets
      namespace: golang-external-secrets
      project: golang-external-secrets
      chart: golang-external-secrets
      chartVersion: '0.1.*'

    trustee:
      name: trustee
      namespace: trustee-operator-system
      project: trustee
      chart: trustee
      chartVersion: '0.2.*'
      overrides:
        - name: global.coco.secured
          value: 'true'
        - name: global.coco.bypassAttestation
          value: 'true'
        - name: kbs.https.enabled
          value: 'false'
        - name: kbs.secretResources[0].name
          value: kbsres1
        - name: kbs.secretResources[0].key
          value: secret/data/hub/kbsres1
        - name: kbs.secretResources[1].name
          value: passphrase
        - name: kbs.secretResources[1].key
          value: secret/data/hub/passphrase

    storage:
      name: storage
      namespace: openshift-storage
      project: hub
      path: charts/hub/storage

    baremetal:
      name: baremetal
      namespace: baremetal
      project: hub
      path: charts/all/baremetal

    sandbox:
      name: sandbox
      namespace: openshift-sandboxed-containers-operator
      project: sandbox
      chart: sandboxed-containers
      chartVersion: '0.2.*'
      overrides:
        - name: global.secretStore.backend
          value: vault
        - name: secretStore.name
          value: vault-backend
        - name: secretStore.kind
          value: ClusterSecretStore
        - name: enablePeerPods
          value: 'false'

    intel-dcap:
      name: intel-dcap
      namespace: intel-dcap
      project: hub
      path: charts/all/intel-dcap
      overrides:
        - name: secretStore.name
          value: vault-backend
        - name: secretStore.kind
          value: ClusterSecretStore

    nvidia-gpu:
      name: nvidia-gpu
      namespace: nvidia-gpu-operator
      project: hub
      path: charts/all/nvidia-gpu

    # NOTE(review): this is the only application without a `project` key —
    # confirm which project it is meant to land in.
    sandbox-policies:
      name: sandbox-policies
      namespace: openshift-sandboxed-containers-operator
      chart: sandboxed-policies
      chartVersion: '0.1.*'

    kbs-access:
      name: kbs-access
      namespace: kbs-access
      project: workloads
      path: charts/coco-supported/kbs-access
      overrides:
        - name: runtimeClassName
          value: 'kata-cc'

    hello-openshift:
      name: hello-openshift
      namespace: hello-openshift
      project: workloads
      path: charts/coco-supported/hello-openshift
      overrides:
        - name: runtimeClassName
          value: 'kata-cc'

    gpu-workload:
      name: gpu-workload
      namespace: gpu-workload
      project: workloads
      path: charts/coco-supported/gpu-workload
      overrides:
        - name: runtimeClassName
          value: 'kata-cc-nvidia-gpu'

  # Imperative jobs: Ansible playbooks, each with a 3600 timeout and -vvv
  # verbosity.
  imperative:
    # NOTE(review): floating `latest` tag — consider pinning a version or digest.
    image: ghcr.io/butler54/imperative-container:latest
    serviceAccountCreate: true
    adminServiceAccountCreate: true
    serviceAccountName: imperative-admin-sa
    jobs:
      - name: install-deps
        playbook: ansible/install-deps.yaml
        verbosity: '-vvv'
        timeout: 3600
      - name: init-data-gzipper
        playbook: ansible/init-data-gzipper.yaml
        verbosity: '-vvv'
        timeout: 3600

0 commit comments

Comments
 (0)