Skip to content

Commit bbcb622

Browse files
committed
feat: segregated gpu bm from bm for ease of testing
Signed-off-by: Chris Butler <chris.butler@redhat.com>
1 parent b54884e commit bbcb622

9 files changed

Lines changed: 307 additions & 41 deletions

File tree

ansible/reconcile-kataconfig-gpu.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
hosts: localhost
44
connection: local
55
become: false
6-
gather_facts: false
6+
gather_facts: true
77
tasks:
88
- name: Check for nodes with NVIDIA GPU labels
99
kubernetes.core.k8s_info:

charts/hub/storage/templates/hostpathprovisioner.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{{- if eq .Values.global.storageProvider "hpp" }}
1+
{{- if eq .Values.global.baremetalStorageProvider "hpp" }}
22
apiVersion: hostpathprovisioner.kubevirt.io/v1beta1
33
kind: HostPathProvisioner
44
metadata:

charts/hub/storage/templates/hpp-storageclass.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{{- if eq .Values.global.storageProvider "hpp" }}
1+
{{- if eq .Values.global.baremetalStorageProvider "hpp" }}
22
apiVersion: storage.k8s.io/v1
33
kind: StorageClass
44
metadata:

charts/hub/storage/templates/lvmcluster.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{{- if eq .Values.global.storageProvider "lvm" }}
1+
{{- if eq .Values.global.baremetalStorageProvider "lvm" }}
22
apiVersion: lvm.topolvm.io/v1alpha1
33
kind: LVMCluster
44
metadata:

charts/hub/storage/values.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
global:
2-
storageProvider: hpp
2+
baremetalStorageProvider: hpp
33

44
lvmCluster:
55
name: "lvmcluster"

scripts/gen-secrets.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,13 @@ if [ ! -f "${PCCS_USER_TOKEN_FILE}" ]; then
4646
echo "Creating PCCS user token"
4747
echo "usertoken" > "${PCCS_USER_TOKEN_FILE}"
4848
fi
49-
echo -n "usertoken" | sha512sum | tr -d '[:space:]-' > "${COCO_SECRETS_DIR}/pccs_user_token_hash"
49+
tr -d '\n' < "${PCCS_USER_TOKEN_FILE}" | sha512sum | tr -d '[:space:]-' > "${COCO_SECRETS_DIR}/pccs_user_token_hash"
5050

5151
if [ ! -f "${PCCS_ADMIN_TOKEN_FILE}" ]; then
5252
echo "Creating PCCS admin token"
5353
echo "admintoken" > "${PCCS_ADMIN_TOKEN_FILE}"
5454
fi
55-
echo -n "admintoken" | sha512sum | tr -d '[:space:]-' > "${COCO_SECRETS_DIR}/pccs_admin_token_hash"
55+
tr -d '\n' < "${PCCS_ADMIN_TOKEN_FILE}" | sha512sum | tr -d '[:space:]-' > "${COCO_SECRETS_DIR}/pccs_admin_token_hash"
5656

5757
## Copy a sample values file if this stuff doesn't exist
5858

values-baremetal-gpu.yaml

Lines changed: 297 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,297 @@
1+
# Bare metal deployment for confidential containers WITH NVIDIA GPU support.
2+
# Supports Intel TDX and AMD SEV-SNP via auto-detection (NFD).
3+
# Includes NVIDIA H100 confidential GPU components (GPU Operator, IOMMU, CC Manager).
4+
# Set main.clusterGroupName: baremetal-gpu in values-global.yaml to use this file (the framework loads values-<clusterGroupName>.yaml).
5+
6+
clusterGroup:
7+
name: baremetal
8+
isHubCluster: true
9+
namespaces:
10+
- open-cluster-management
11+
- vault
12+
- golang-external-secrets
13+
- openshift-sandboxed-containers-operator
14+
- trustee-operator-system
15+
- cert-manager-operator
16+
- cert-manager
17+
- hello-openshift
18+
- kbs-access
19+
- openshift-cnv
20+
- openshift-storage
21+
- openshift-nfd
22+
- baremetal
23+
- intel-dcap
24+
- nvidia-gpu-operator
25+
- gpu-workload
26+
- kyverno
27+
28+
subscriptions:
29+
acm:
30+
name: advanced-cluster-management
31+
namespace: open-cluster-management
32+
sandbox:
33+
name: sandboxed-containers-operator
34+
namespace: openshift-sandboxed-containers-operator
35+
source: redhat-operators
36+
channel: stable
37+
installPlanApproval: Manual
38+
csv: sandboxed-containers-operator.v1.12.0
39+
trustee:
40+
name: trustee-operator
41+
namespace: trustee-operator-system
42+
source: redhat-operators
43+
channel: stable
44+
installPlanApproval: Manual
45+
csv: trustee-operator.v1.1.0
46+
cert-manager:
47+
name: openshift-cert-manager-operator
48+
namespace: cert-manager-operator
49+
channel: stable-v1
50+
lvm-operator:
51+
name: lvms-operator
52+
namespace: openshift-storage
53+
source: redhat-operators
54+
channel: stable-4.20
55+
installPlanApproval: Automatic
56+
cnv:
57+
name: kubevirt-hyperconverged
58+
namespace: openshift-cnv
59+
source: redhat-operators
60+
channel: stable
61+
installPlanApproval: Automatic
62+
nfd:
63+
name: nfd
64+
namespace: openshift-nfd
65+
channel: stable
66+
gpu-operator:
67+
name: gpu-operator-certified
68+
namespace: nvidia-gpu-operator
69+
source: certified-operators
70+
channel: v26.3
71+
installPlanApproval: Manual
72+
csv: gpu-operator-certified.v26.3.0
73+
intel-device-plugins:
74+
name: intel-device-plugins-operator
75+
namespace: openshift-operators
76+
source: certified-operators
77+
channel: stable
78+
projects:
79+
- hub
80+
- vault
81+
- trustee
82+
- golang-external-secrets
83+
- sandbox
84+
- workloads
85+
- default
86+
87+
# Explicitly mention the cluster-state based overrides we plan to use for this pattern.
88+
# We can use self-referential variables because the chart calls the tpl function with these variables defined
89+
sharedValueFiles:
90+
- '/overrides/values-{{ $.Values.global.clusterPlatform }}.yaml'
91+
- '/overrides/values-storage-{{ $.Values.global.baremetalStorageProvider }}.yaml'
92+
93+
applications:
94+
acm:
95+
name: acm
96+
namespace: open-cluster-management
97+
project: hub
98+
chart: acm
99+
chartVersion: 0.1.*
100+
101+
vault:
102+
name: vault
103+
namespace: vault
104+
project: vault
105+
chart: hashicorp-vault
106+
chartVersion: 0.1.*
107+
108+
secrets-operator:
109+
name: golang-external-secrets
110+
namespace: golang-external-secrets
111+
project: golang-external-secrets
112+
chart: golang-external-secrets
113+
chartVersion: 0.1.*
114+
115+
trustee:
116+
name: trustee
117+
namespace: trustee-operator-system
118+
project: trustee
119+
repoURL: https://github.com/butler54/trustee-chart.git
120+
targetRevision: feature/baremetal-attestation
121+
path: .
122+
overrides:
123+
- name: global.coco.secured
124+
value: "true"
125+
- name: kbs.admin.format
126+
value: "v1.1"
127+
- name: kbs.https.enabled
128+
value: "false"
129+
- name: kbs.secretResources[0].name
130+
value: kbsres1
131+
- name: kbs.secretResources[0].key
132+
value: secret/data/hub/kbsres1
133+
- name: kbs.tdx.enabled
134+
value: "true"
135+
- name: kbs.tdx.collateralService
136+
value: "https://pccs-service.intel-dcap.svc.cluster.local:8042/sgx/certification/v4/"
137+
- name: kbs.secretResources[1].name
138+
value: passphrase
139+
- name: kbs.secretResources[1].key
140+
value: secret/data/hub/passphrase
141+
- name: kbs.gpu.enabled
142+
value: "true"
143+
144+
storage:
145+
name: storage
146+
namespace: openshift-storage
147+
project: hub
148+
path: charts/hub/storage
149+
150+
baremetal:
151+
name: baremetal
152+
namespace: baremetal
153+
project: hub
154+
path: charts/all/baremetal
155+
156+
sandbox:
157+
name: sandbox
158+
namespace: openshift-sandboxed-containers-operator
159+
project: sandbox
160+
chart: sandboxed-containers
161+
chartVersion: 0.2.*
162+
overrides:
163+
- name: global.secretStore.backend
164+
value: vault
165+
- name: secretStore.name
166+
value: vault-backend
167+
- name: secretStore.kind
168+
value: ClusterSecretStore
169+
- name: enablePeerPods
170+
value: "false"
171+
172+
173+
intel-dcap:
174+
name: intel-dcap
175+
namespace: intel-dcap
176+
project: hub
177+
path: charts/all/intel-dcap
178+
overrides:
179+
- name: secretStore.name
180+
value: vault-backend
181+
- name: secretStore.kind
182+
value: ClusterSecretStore
183+
184+
nvidia-gpu:
185+
name: nvidia-gpu
186+
namespace: nvidia-gpu-operator
187+
project: hub
188+
path: charts/all/nvidia-gpu
189+
190+
gpu-workload:
191+
name: gpu-workload
192+
namespace: gpu-workload
193+
project: workloads
194+
path: charts/coco-supported/gpu-workload
195+
syncPolicy:
196+
automated:
197+
prune: true
198+
199+
sandbox-policies:
200+
name: sandbox-policies
201+
namespace: openshift-sandboxed-containers-operator
202+
chart: sandboxed-policies
203+
chartVersion: 0.1.*
204+
205+
kbs-access:
206+
name: kbs-access
207+
namespace: kbs-access
208+
project: workloads
209+
path: charts/coco-supported/kbs-access
210+
syncPolicy:
211+
automated:
212+
prune: true
213+
overrides:
214+
- name: defaultMemory
215+
value: "8192"
216+
217+
hello-openshift:
218+
name: hello-openshift
219+
namespace: hello-openshift
220+
project: workloads
221+
path: charts/coco-supported/hello-openshift
222+
syncPolicy:
223+
automated:
224+
prune: true
225+
226+
kyverno:
227+
name: kyverno
228+
namespace: kyverno
229+
project: hub
230+
repoURL: https://kyverno.github.io/kyverno/
231+
chart: kyverno
232+
chartVersion: 3.7.*
233+
syncPolicy:
234+
automated: {}
235+
retry:
236+
limit: 20
237+
syncOptions:
238+
- ServerSideApply=true
239+
overrides:
240+
- name: admissionController.container.securityContext
241+
value: "null"
242+
- name: admissionController.initContainer.securityContext
243+
value: "null"
244+
- name: backgroundController.securityContext
245+
value: "null"
246+
- name: cleanupController.securityContext
247+
value: "null"
248+
- name: reportsController.securityContext
249+
value: "null"
250+
- name: crds.migration.securityContext
251+
value: "null"
252+
- name: webhooksCleanup.securityContext
253+
value: "null"
254+
- name: test.securityContext
255+
value: "null"
256+
- name: crds.groups.wgpolicyk8s.policyreports
257+
value: "false"
258+
- name: crds.groups.wgpolicyk8s.clusterpolicyreports
259+
value: "false"
260+
- name: reportsController.enabled
261+
value: "false"
262+
263+
coco-kyverno-policies:
264+
name: coco-kyverno-policies
265+
namespace: openshift-sandboxed-containers-operator
266+
project: sandbox
267+
path: charts/all/coco-kyverno-policies
268+
269+
imperative:
270+
# NOTE: We *must* use lists and not hashes, because hashes lose their ordering once parsed by Helm.
271+
# The default schedule is every 10 minutes: imperative.schedule
272+
# Total timeout of all jobs is 1h: imperative.activeDeadlineSeconds
273+
# imagePullPolicy is set to always: imperative.imagePullPolicy
274+
# For additional overrides that apply to the jobs, please refer to
275+
# https://validatedpatterns.io/imperative-actions/#additional-job-customizations
276+
image: ghcr.io/butler54/imperative-container:latest
277+
serviceAccountCreate: true
278+
adminServiceAccountCreate: true
279+
serviceAccountName: imperative-admin-sa
280+
jobs:
281+
- name: install-deps
282+
playbook: ansible/install-deps.yaml
283+
verbosity: -vvv
284+
timeout: 3600
285+
- name: init-data-gzipper
286+
playbook: ansible/init-data-gzipper.yaml
287+
verbosity: -vvv
288+
timeout: 3600
289+
- name: reconcile-kataconfig-gpu
290+
playbook: ansible/reconcile-kataconfig-gpu.yaml
291+
verbosity: -vvv
292+
timeout: 600
293+
# Required for tech preview only.
294+
# - name: detect-runtime-class
295+
# playbook: ansible/detect-runtime-class.yaml
296+
# verbosity: -vvv
297+
# timeout: 600

0 commit comments

Comments
 (0)