Skip to content

Commit 03ff51d

Browse files
butler54 and claude committed
fix: align ClusterPolicy with Red Hat OCP 4.21 CC GPU reference
Rewrite ClusterPolicy to match the official Red Hat OCP 4.21.9+ documentation for NVIDIA confidential GPU support.

Key changes:
- Remove hardcoded cc-manager v0.1.0 (was from old v25.3.x line, caused IntelRootPort crash) — let GPU Operator manage its versions
- Remove hardcoded CC_CAPABLE_DEVICE_IDS and sandbox device plugin image/version — operator fills in correct defaults
- Disable host-side components not needed for CC passthrough (driver, dcgm, toolkit, migManager) — driver runs inside kata VM via initrd
- Add kataSandboxDevicePlugin env vars (P_GPU_ALIAS, NVSWITCH_ALIAS)
- Add vfioManager BIND_NVSWITCHES env var
- Add both amd_iommu=on and intel_iommu=on to IOMMU MachineConfig

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 40f41ae commit 03ff51d

3 files changed

Lines changed: 70 additions & 104 deletions

File tree

charts/all/nvidia-gpu/templates/cluster-policy.yaml

Lines changed: 69 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -6,138 +6,108 @@ metadata:
66
annotations:
77
argocd.argoproj.io/sync-wave: "110"
88
spec:
9-
operator:
10-
defaultRuntime: crio
11-
runtimeClass: nvidia
12-
use_ocp_driver_toolkit: true
13-
9+
ccManager:
10+
defaultMode: {{ .Values.ccManager.defaultMode | quote }}
11+
enabled: {{ .Values.ccManager.enabled }}
12+
cdi:
13+
default: false
14+
enabled: true
15+
nriPluginEnabled: false
1416
daemonsets:
1517
rollingUpdate:
16-
maxUnavailable: "1"
18+
maxUnavailable: '1'
1719
updateStrategy: RollingUpdate
18-
20+
dcgm:
21+
enabled: false
22+
dcgmExporter:
23+
config:
24+
name: ''
25+
enabled: false
26+
serviceMonitor:
27+
enabled: true
28+
devicePlugin:
29+
config:
30+
default: ''
31+
name: ''
32+
enabled: false
33+
mps:
34+
root: /run/nvidia/mps
1935
driver:
20-
enabled: true
21-
useNvidiaDriverCRD: false
36+
certConfig:
37+
name: ''
38+
enabled: false
39+
kernelModuleConfig:
40+
name: ''
2241
kernelModuleType: auto
2342
licensingConfig:
43+
configMapName: ''
2444
nlsEnabled: true
25-
secretName: ""
26-
certConfig:
27-
name: ""
28-
kernelModuleConfig:
29-
name: ""
3045
repoConfig:
31-
configMapName: ""
32-
virtualTopology:
33-
config: ""
46+
configMapName: ''
3447
upgradePolicy:
3548
autoUpgrade: true
36-
maxParallelUpgrades: 1
37-
maxUnavailable: 25%
3849
drain:
3950
deleteEmptyDir: false
4051
enable: false
4152
force: false
4253
timeoutSeconds: 300
54+
maxParallelUpgrades: 1
55+
maxUnavailable: 25%
4356
podDeletion:
4457
deleteEmptyDir: false
4558
force: false
4659
timeoutSeconds: 300
4760
waitForCompletion:
4861
timeoutSeconds: 0
49-
50-
devicePlugin:
62+
useNvidiaDriverCRD: false
63+
useOpenKernelModules: false
64+
virtualTopology:
65+
config: ''
66+
gdrcopy:
67+
enabled: false
68+
gds:
5169
enabled: false
52-
config:
53-
name: ""
54-
default: ""
55-
mps:
56-
root: /run/nvidia/mps
57-
58-
dcgm:
59-
enabled: true
60-
61-
dcgmExporter:
62-
enabled: true
63-
config:
64-
name: ""
65-
serviceMonitor:
66-
enabled: true
67-
6870
gfd:
6971
enabled: true
70-
72+
kataManager:
73+
enabled: false
74+
mig:
75+
strategy: single
76+
migManager:
77+
enabled: false
7178
nodeStatusExporter:
7279
enabled: true
73-
74-
toolkit:
75-
enabled: true
76-
installDir: /usr/local/nvidia
77-
78-
validator:
79-
plugin:
80-
env: []
81-
80+
operator:
81+
defaultRuntime: crio
82+
initContainer: {}
83+
runtimeClass: nvidia
84+
use_ocp_driver_toolkit: true
85+
kataSandboxDevicePlugin:
86+
enabled: {{ .Values.kataSandboxDevicePlugin.enabled }}
87+
env:
88+
- name: P_GPU_ALIAS
89+
value: pgpu
90+
- name: NVSWITCH_ALIAS
91+
value: nvswitch
8292
sandboxWorkloads:
93+
defaultWorkload: vm-passthrough
8394
enabled: true
8495
mode: kata
85-
defaultWorkload: vm-passthrough
86-
87-
kataManager:
96+
toolkit:
8897
enabled: false
89-
config:
90-
artifactsDir: /opt/nvidia-gpu-operator/artifacts/runtimeclasses
91-
92-
ccManager:
93-
enabled: {{ .Values.ccManager.enabled }}
94-
defaultMode: {{ .Values.ccManager.defaultMode | quote }}
95-
repository: nvcr.io/nvidia/cloud-native
96-
image: k8s-cc-manager
97-
version: v0.1.0
98-
env:
99-
- name: CC_CAPABLE_DEVICE_IDS
100-
value: {{ .Values.ccManager.deviceIDs | quote }}
101-
102-
kataSandboxDevicePlugin:
103-
enabled: {{ .Values.kataSandboxDevicePlugin.enabled }}
104-
repository: {{ .Values.kataSandboxDevicePlugin.repository }}
105-
image: {{ .Values.kataSandboxDevicePlugin.image }}
106-
version: {{ .Values.kataSandboxDevicePlugin.version | quote }}
107-
108-
sandboxDevicePlugin:
109-
enabled: true
110-
98+
installDir: /usr/local/nvidia
99+
validator:
100+
plugin:
101+
env:
102+
- name: WITH_WORKLOAD
103+
value: 'false'
111104
vfioManager:
112105
enabled: true
113-
114-
vgpuManager:
115-
enabled: false
116-
106+
env:
107+
- name: BIND_NVSWITCHES
108+
value: 'true'
117109
vgpuDeviceManager:
118-
enabled: true
119-
config:
120-
default: default
121-
122-
gdrcopy:
123110
enabled: false
124-
125-
gds:
111+
vgpuManager:
126112
enabled: false
127-
128-
mig:
129-
strategy: single
130-
131-
migManager:
132-
enabled: true
133-
config:
134-
default: all-disabled
135-
136-
cdi:
137-
default: false
138-
enabled: true
139-
nriPluginEnabled: false
140-
141-
nfd:
142-
nodefeaturerules: true
143113
{{- end }}

charts/all/nvidia-gpu/templates/iommu-mco.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ metadata:
99
name: 100-iommu-{{ . }}
1010
spec:
1111
kernelArguments:
12+
- amd_iommu=on
1213
- intel_iommu=on
13-
- iommu=pt
1414
{{- end }}
1515
{{- end }}

charts/all/nvidia-gpu/values.yaml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,9 @@ enabled: true
33
ccManager:
44
enabled: true
55
defaultMode: "on"
6-
deviceIDs: "0x2331,0x2322"
76

87
kataSandboxDevicePlugin:
98
enabled: true
10-
repository: nvcr.io/nvidia/cloud-native
11-
image: nvidia-sandbox-device-plugin
12-
version: "v0.0.2"
139

1410
iommu:
1511
enabled: true

0 commit comments

Comments
 (0)