-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsetup.sh
More file actions
273 lines (239 loc) · 7.47 KB
/
setup.sh
File metadata and controls
273 lines (239 loc) · 7.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#!/usr/bin/env bash
# shfmt -i 2 -ci -w
# Shell options: errtrace so ERR traps are inherited by functions, and pipefail
# so a pipeline reports the failure of any of its stages. errexit is not
# enabled, so a failing command does not abort the script by itself.
set -o errtrace -o pipefail
# Leave the script when interrupted (Ctrl-C) or asked to terminate.
trap 'exit' INT TERM
################################################################################
# AKS Cluster Setup for NVIDIA Dynamo
# Creates an AKS cluster with a system node pool and an H100 GPU node pool.
#
# NOTE: Uses AKS managed GPU (preview) — --enable-managed-gpu=true.
# AKS installs the full GPU stack: driver, device plugin, DCGM metrics exporter,
# and GPU health monitoring. No GPU Operator needed.
# Requires: aks-preview extension >= 19.0.0b29 and ManagedGPUExperiencePreview feature flag.
# See: https://learn.microsoft.com/azure/aks/aks-managed-gpu-nodes
################################################################################
# Default configuration: every setting keeps a non-empty value supplied by the
# environment and otherwise falls back to the default given here.
: "${CLUSTER_NAME:=dicasati-dynamo}"
: "${RESOURCE_GROUP:=dicasati-dynamo}"
: "${LOCATION:=southafricanorth}"
: "${KUBERNETES_VERSION:=1.34.0}"
: "${SYSTEM_NODE_SIZE:=Standard_D4ds_v5}"
: "${GPU_NODE_SIZE:=Standard_ND96isr_H100_v5}"
: "${KUBECONFIG:=${PWD}/cluster.config}"
################################################################################
# Help text printed by usage(). The ${...} references are expanded when this
# assignment runs, so the text shows the values in effect at that point
# (defaults or environment overrides).
__usage="
-x action to be executed.
Possible verbs are:
install Create the AKS cluster and GPU node pool.
delete Delete the resource group and all resources.
show Show cluster information.
check-deps Check required dependencies.
Environment variables (with defaults):
CLUSTER_NAME=${CLUSTER_NAME}
RESOURCE_GROUP=${RESOURCE_GROUP}
LOCATION=${LOCATION}
KUBERNETES_VERSION=${KUBERNETES_VERSION}
SYSTEM_NODE_SIZE=${SYSTEM_NODE_SIZE}
GPU_NODE_SIZE=${GPU_NODE_SIZE}
KUBECONFIG=${KUBECONFIG}
"
#######################################
# Print the invocation synopsis followed by the help text, then terminate.
# Globals:   __usage (read)
# Outputs:   synopsis and help text on stdout
# Returns:   never returns; exits the script with status 1
#######################################
usage() {
  # Drop the first whitespace character of the help text (its leading newline).
  local help_text="${__usage/[[:space:]]/}"

  printf 'usage: %s [options]\n' "${0##*/}"
  printf '%s\n' "${help_text}"
  exit 1
}
#######################################
# Print a banner listing the effective configuration.
# Globals:   CLUSTER_NAME, RESOURCE_GROUP, LOCATION, KUBERNETES_VERSION,
#            SYSTEM_NODE_SIZE, GPU_NODE_SIZE, KUBECONFIG (all read)
# Outputs:   banner on stdout, framed by blank lines
#######################################
print_header() {
  # One argument per output line; empty arguments produce the blank lines.
  printf '%s\n' \
    "" \
    "AKS Cluster Setup — NVIDIA Dynamo" \
    "==========================================" \
    "" \
    "Cluster Name: $CLUSTER_NAME" \
    "Resource Group: $RESOURCE_GROUP" \
    "Location: $LOCATION" \
    "Kubernetes Version: $KUBERNETES_VERSION" \
    "System Node Size: $SYSTEM_NODE_SIZE" \
    "GPU Node Size: $GPU_NODE_SIZE" \
    "KUBECONFIG: $KUBECONFIG" \
    ""
}
#######################################
# Print a message prefixed with the current time in square brackets.
# Arguments: the message words; they are joined into a single string
# Outputs:   one line on stdout
#######################################
log() {
  local stamp
  stamp=$(date +'%r')
  printf '[%s] %s\n' "${stamp}" "$*"
}
#######################################
# Verify that the command-line tools named below are available on PATH.
# Every tool is reported individually before the overall verdict.
# Outputs:   one status line per tool via log
# Returns:   0 when every tool is found; exits the script with status 1
#            when at least one is missing
#######################################
check_dependencies() {
  local -a needed=(az kubectl helm)
  local dep # kept local so the loop does not clobber a caller's variable
  local missing=false

  log "Checking dependencies..."
  for dep in "${needed[@]}"; do
    if hash "${dep}" 2>/dev/null; then
      log " ${dep}: OK"
    else
      log " ${dep}: NOT FOUND"
      missing=true
    fi
  done

  if [[ "${missing}" == "true" ]]; then
    log "Dependencies missing. Please install them before proceeding."
    exit 1
  fi
  log "All dependencies satisfied."
}
#######################################
# Prepare the subscription for the AKS managed GPU preview: install or update
# the aks-preview CLI extension, register the preview feature flag, wait for
# the registration to complete and refresh the resource provider.
#
# The wait polls the feature's properties.state until it reads "Registered".
# It stays best-effort: when the state is not reached within the allotted
# attempts a warning is logged and setup continues.
# Globals:   FEATURE_WAIT_INTERVAL (read, seconds between polls, default 15)
#            FEATURE_WAIT_ATTEMPTS (read, maximum number of polls, default 80)
# Outputs:   progress messages via log, plus whatever az prints
#######################################
setup_preview() {
  local -r namespace="Microsoft.ContainerService"
  local -r feature="ManagedGPUExperiencePreview"
  local -r interval="${FEATURE_WAIT_INTERVAL:-15}"
  local -r max_attempts="${FEATURE_WAIT_ATTEMPTS:-80}"
  local state=""
  local attempt

  log "Installing/updating aks-preview CLI extension..."
  if az extension show --name aks-preview &>/dev/null; then
    az extension update --name aks-preview
  else
    az extension add --name aks-preview --allow-preview true
  fi
  log " aks-preview: OK"

  log "Registering ${feature} feature flag..."
  az feature register \
    --namespace "${namespace}" \
    --name "${feature}"

  log "Waiting for feature registration (this may take a few minutes)..."
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # A failed query is treated as "state not known yet" and simply retried.
    state=$(az feature show \
      --namespace "${namespace}" \
      --name "${feature}" \
      --query properties.state \
      --output tsv 2>/dev/null) || state=""
    if [[ "${state}" == "Registered" ]]; then
      break
    fi
    # No point sleeping after the final attempt.
    if ((attempt < max_attempts)); then
      sleep "${interval}"
    fi
  done
  if [[ "${state}" != "Registered" ]]; then
    log "WARNING: ${feature} state is '${state:-unknown}' after ${max_attempts} checks; continuing."
  fi

  log "Refreshing resource provider..."
  az provider register --namespace "${namespace}"
  log "Preview setup complete."
}
#######################################
# Create the resource group that holds the cluster.
# Globals:   RESOURCE_GROUP, LOCATION (read)
# Outputs:   progress messages via log, plus whatever az prints
# Returns:   0 on success; exits the script with status 1 when az reports a
#            failure, so later steps do not run against a missing group
#######################################
create_resource_group() {
  log "Creating resource group $RESOURCE_GROUP in $LOCATION..."
  if ! az group create \
    --name "$RESOURCE_GROUP" \
    --location "$LOCATION"; then
    log "ERROR: Failed to create resource group $RESOURCE_GROUP."
    exit 1
  fi
  log "Resource group created."
}
#######################################
# Create the AKS cluster with its system node pool and confirm it exists.
# Globals:   CLUSTER_NAME, RESOURCE_GROUP, LOCATION, KUBERNETES_VERSION,
#            SYSTEM_NODE_SIZE (all read)
# Outputs:   progress messages via log, plus whatever az prints
# Returns:   0 on success; exits the script with status 1 when the cluster
#            cannot be found afterwards
#######################################
create_cluster() {
  log "Creating AKS cluster $CLUSTER_NAME (system node pool)..."

  local -a create_args=(
    --name "$CLUSTER_NAME"
    --resource-group "$RESOURCE_GROUP"
    --location "$LOCATION"
    --kubernetes-version "$KUBERNETES_VERSION"
    --node-count 1
    --node-vm-size "$SYSTEM_NODE_SIZE"
    --node-osdisk-size 150
    --node-osdisk-type Ephemeral
    --max-pods 250
    --network-plugin azure
    --network-plugin-mode overlay
    --pod-cidr 10.244.0.0/16
    --service-cidr 10.0.0.0/16
    --dns-service-ip 10.0.0.10
    --load-balancer-sku standard
    --generate-ssh-keys
    --enable-ai-toolchain-operator
    --enable-oidc-issuer
    --enable-workload-identity
  )
  az aks create "${create_args[@]}"

  # The exit status of 'az aks create' is not trusted (it exits 0 even on
  # Python-level errors), so success is judged by looking the cluster up.
  az aks show --name "$CLUSTER_NAME" --resource-group "$RESOURCE_GROUP" --output none 2>/dev/null || {
    log "ERROR: Cluster creation failed. Run 'brew upgrade azure-cli' and retry."
    exit 1
  }
  log "Cluster created."
}
#######################################
# Add the H100 GPU node pool (h100pool) to the cluster and confirm it exists.
# Globals:   CLUSTER_NAME, RESOURCE_GROUP, GPU_NODE_SIZE,
#            KUBERNETES_VERSION (all read)
# Outputs:   progress messages via log, plus whatever az prints
# Returns:   0 on success; exits the script with status 1 when the pool
#            cannot be found afterwards
#######################################
add_gpu_nodepool() {
  local -r pool_name="h100pool"

  log "Adding H100 GPU node pool (${pool_name}) with AKS managed GPU stack..."
  # --enable-managed-gpu=true installs: driver + device plugin + DCGM metrics
  # exporter + GPU health monitoring. No GPU Operator required.
  # NOTE: cluster autoscaler is not supported with managed GPU preview.
  # Scale manually with: az aks nodepool scale
  az aks nodepool add \
    --cluster-name "$CLUSTER_NAME" \
    --resource-group "$RESOURCE_GROUP" \
    --name "${pool_name}" \
    --node-vm-size "$GPU_NODE_SIZE" \
    --node-count 1 \
    --node-osdisk-size 1024 \
    --node-osdisk-type Ephemeral \
    --max-pods 30 \
    --mode User \
    --kubernetes-version "$KUBERNETES_VERSION" \
    --node-taints sku=gpu:NoSchedule \
    --enable-managed-gpu=true

  # Same approach as the cluster creation step: judge success by looking the
  # pool up instead of reporting it unconditionally. A pool left over from an
  # earlier run also passes this check, so re-running the install still works.
  if ! az aks nodepool show \
    --cluster-name "$CLUSTER_NAME" \
    --resource-group "$RESOURCE_GROUP" \
    --name "${pool_name}" \
    --output none 2>/dev/null; then
    log "ERROR: GPU node pool ${pool_name} was not created."
    exit 1
  fi
  log "GPU node pool added."
}
#######################################
# Fetch the cluster's access credentials into the kubeconfig file.
# Globals:   CLUSTER_NAME, RESOURCE_GROUP, KUBECONFIG (all read)
# Outputs:   progress messages via log, plus whatever az prints
# Returns:   0 on success; exits the script with status 1 when az reports a
#            failure, so the file is never announced as written when it was not
#######################################
get_credentials() {
  log "Retrieving cluster credentials..."
  if ! az aks get-credentials \
    --name "$CLUSTER_NAME" \
    --resource-group "$RESOURCE_GROUP" \
    --file "$KUBECONFIG"; then
    log "ERROR: Failed to retrieve credentials for cluster $CLUSTER_NAME."
    exit 1
  fi
  log "KUBECONFIG written to: $KUBECONFIG"
}
#######################################
# Run every installation step in order, then print the follow-up hints.
# Outputs:   the steps' own output followed by closing messages via log
#######################################
do_install() {
  local step
  for step in \
    check_dependencies \
    setup_preview \
    create_resource_group \
    create_cluster \
    add_gpu_nodepool \
    get_credentials; do
    "${step}"
  done

  log ""
  log "Cluster is ready."
  log "Run '$0 -x show' to view cluster details."
  log "Next: run './install-dynamo.sh -x install' to deploy NVIDIA Dynamo."
}
#######################################
# Start deleting the resource group, if it exists. The deletion is requested
# with --no-wait, so the function does not wait for it to finish.
# Globals:   RESOURCE_GROUP (read)
# Outputs:   progress messages via log
#######################################
do_delete() {
  log "Deleting resource group $RESOURCE_GROUP and all resources..."

  # Nothing to do when the group cannot be looked up.
  if ! az group show --name "$RESOURCE_GROUP" &>/dev/null; then
    log "Resource group $RESOURCE_GROUP not found — nothing to delete."
    return
  fi

  az group delete --name "$RESOURCE_GROUP" --yes --no-wait
  log "Deletion initiated (running in background)."
}
#######################################
# Print the cluster summary and its node pools as tables.
# Globals:   CLUSTER_NAME, RESOURCE_GROUP (read)
# Outputs:   az tables on stdout, progress messages via log
# Returns:   status of the node pool listing; exits the script with status 1
#            when the cluster cannot be looked up
#######################################
do_show() {
  log "Cluster information for $CLUSTER_NAME..."

  # Probe quietly first so a missing cluster yields one clear message.
  if ! az aks show --name "$CLUSTER_NAME" --resource-group "$RESOURCE_GROUP" &>/dev/null; then
    log "Cluster $CLUSTER_NAME not found in $RESOURCE_GROUP."
    exit 1
  fi

  az aks show \
    --name "$CLUSTER_NAME" \
    --resource-group "$RESOURCE_GROUP" \
    --output table
  printf '\n'
  log "Node pools:"
  az aks nodepool list \
    --cluster-name "$CLUSTER_NAME" \
    --resource-group "$RESOURCE_GROUP" \
    --output table
}
#######################################
# Dispatch an action verb to the function that implements it; any other
# value shows the usage text.
# Arguments: $1 - action verb (install, delete, show or check-deps)
# Returns:   0 whenever the selected handler returns
#######################################
exec_case() {
  local action="$1"

  case "${action}" in
    install | delete | show) "do_${action}" ;;
    check-deps) check_dependencies ;;
    *) usage ;;
  esac
  # The handler's own status is deliberately not propagated.
  return 0
}
################################################################################
# Entry point
#######################################
# Parse the command line and run the requested action.
# Arguments: the script's arguments; "-x <verb>" selects the action (when
#            given more than once, the last one wins)
# Outputs:   header and usage text when no option is given
# Returns:   status of exec_case; the usage paths terminate the script
#            through usage
#######################################
main() {
  # Parsing state is local (OPTIND included) so nothing leaks into the global
  # scope and a repeated call starts from a clean slate.
  local OPTIND=1
  local opt
  local exec_flag=false
  local exec_opt=""

  while getopts "x:" opt; do
    case "${opt}" in
      x)
        exec_flag=true
        exec_opt="${OPTARG}"
        ;;
      *) usage ;;
    esac
  done
  shift $((OPTIND - 1))

  # OPTIND still at 1 means getopts consumed nothing: no options were given.
  if ((OPTIND == 1)); then
    print_header
    usage
  fi

  # Arguments were consumed (e.g. a lone "--") but no action was selected.
  if [[ "${exec_flag}" != "true" ]]; then
    usage
  fi

  exec_case "${exec_opt}"
}
# Run main with the script's arguments. The next line is reached only when no
# function terminated the script itself, and it reports success regardless of
# main's return status.
main "$@"
exit 0