-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdeploy.sh
More file actions
executable file
·467 lines (443 loc) · 26.1 KB
/
Copy pathdeploy.sh
File metadata and controls
executable file
·467 lines (443 loc) · 26.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
#!/usr/bin/env bash
# One-command bring-up for the LLM gateway benchmark rig.
#
# ./deploy.sh
#
# Everything is configured by environment variables (all optional — the script
# detects, confirms, and creates what is missing). The only thing you may want to
# set is which boxes to deploy. Common overrides:
#
# REGION=us-east-1 AWS region
# KEY_NAME=my-key EC2 key pair name (created if it does not exist)
# VPC_ID=vpc-xxxx target VPC (default VPC is used, or one is created)
# SUBNET_ID=subnet-xxxx public subnet in that VPC (auto-picked if omitted)
# ADMIN_CIDR=1.2.3.4/32 who may reach SSH (:22) — defaults to 0.0.0.0/0 (open) if omitted
# DEPLOY_NEXUS=true ... per-box toggles (see PLAN output)
# GATEWAY_TYPE / AUX_TYPE instance types (default c6i.4xlarge)
# ASSUME_YES=1 skip the confirmation prompt
# PROVISION=1 also run the Ansible install after the boxes are up
#
set -euo pipefail

# Absolute path of the directory holding this script; the repo-relative paths used
# below (deploy.env, cloudformation/, scripts/, ansible/) are all built from it.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
ROOT="$(cd "$script_dir" && pwd)"

# ---------- operator config (deploy.env) ----------
# deploy.env is optional (gitignored; copy it from deploy.env.example). It is sourced
# with auto-export switched on (set -a) so its values reach BOTH this script's CFN
# parameters AND Ansible's env lookups (e.g. BENCH_SSH_PUBLIC_KEYS) when run with
# PROVISION=1. Values already present in the calling shell still win: the config
# section below only fills in what is unset via ${VAR:-default}-style reads.
# See deploy.env.example for the full list + docs.
if [ -f "$ROOT/deploy.env" ]; then
  set -a
  # shellcheck source=/dev/null
  source "$ROOT/deploy.env"
  set +a
fi
# ---------- config (env-overridable) ----------
# Each knob keeps a value that is already set and non-empty; otherwise it takes the
# default shown (":=" assigns when the variable is unset or empty).
: "${REGION:=us-east-1}"
: "${STACK:=gw-bench}"
: "${NET_STACK:=${STACK}-net}"
: "${KEY_NAME:=llm-bench-key}"
# Must be captured BEFORE KEY_FILE receives its default: non-empty iff the operator
# pinned KEY_FILE (deploy.env/env).
KEY_FILE_EXPLICIT="${KEY_FILE:+1}"
: "${KEY_FILE:=$HOME/.ssh/${KEY_NAME}.pem}"
: "${VPC_ID:=}"
: "${SUBNET_ID:=}"
: "${SECURITY_GROUP_ID:=}"     # blank = the stack creates one with the exact rules the rig needs
: "${ADMIN_CIDR:=}"
: "${ACCESS_CIDR:=0.0.0.0/0}"  # gateway/UI/mock/metrics ports (public so others can reproduce)

# Instance types and volumes.
: "${GATEWAY_TYPE:=c6i.4xlarge}"
: "${AUX_TYPE:=c6i.4xlarge}"
: "${CONTROL_TYPE:=c6i.xlarge}"     # control box: 4 vCPU / 8 GiB (orchestration only)
: "${CONTROL_VOLUME_GIB:=200}"      # control box root gp3 (plain gp3, no provisioned IOPS)
: "${VOLUME_GIB:=120}"
: "${VOLUME_IOPS:=12000}"           # gp3 IOPS (>= 4x throughput MB/s)
: "${VOLUME_THROUGHPUT:=1000}"      # gp3 MB/s; real cap is instance EBS bw (~593 sustained on c6i.4xlarge)

# Per-box toggles ("true" deploys the box).
: "${DEPLOY_MOCK:=true}"
: "${DEPLOY_NEXUS:=true}"
: "${DEPLOY_BIFROST:=true}"
: "${DEPLOY_LOADTEST:=true}"
: "${DEPLOY_LITELLM:=false}"
: "${DEPLOY_KONG:=false}"
: "${DEPLOY_PORTKEY:=false}"
: "${DEPLOY_TENSORZERO:=false}"
: "${DEPLOY_CONTROL:=true}"         # in-VPC control box (set DEPLOY_CONTROL=false in deploy.env to skip it)

# Run-mode switches.
: "${ASSUME_YES:=0}"
: "${PROVISION:=0}"
# c <sgr-code> <text>: print <text> wrapped in an ANSI SGR sequence (no trailing newline).
c() {
  printf "\033[%sm%s\033[0m" "$1" "$2"
}

# ok / inf: one status line on stdout, prefixed with a coloured glyph.
ok() {
  printf '%s %s\n' "$(c 32 "✓")" "$*"
}

inf() {
  printf '%s %s\n' "$(c 36 "•")" "$*"
}

# warn: like ok/inf, but the line goes to stderr.
warn() {
  printf '%s %s\n' "$(c 33 "!")" "$*" >&2
}

# die: print the message on stderr, then exit the (sub)shell with status 1.
die() {
  printf '%s %s\n' "$(c 31 "✗")" "$*" >&2
  exit 1
}
# ask <question>: yes/no confirmation read from the terminal (/dev/tty).
# Returns 0 when ASSUME_YES=1 (no prompt) or when the operator answers y/Y;
# returns non-zero for any other answer, including when /dev/tty cannot be
# opened or read.
ask() {
  [ "$ASSUME_YES" = "1" ] && return 0
  # Start from an empty reply: if /dev/tty can't be opened, `read` never runs, and a
  # still-unset local would abort the whole script under `set -u` ("unbound
  # variable") instead of being treated as "no".
  local reply=""
  read -r -p "$(c 33 "?") $1 [y/N] " reply </dev/tty || true
  [[ "$reply" =~ ^[Yy]$ ]]
}
# pick_one <title> <default> : reads "value|label" lines on stdin, prints a numbered
# menu (to stderr) and echoes the chosen value (to stdout). Enter = the default. With
# ASSUME_YES=1 (non-interactive) it takes the default (or first option), no prompt.
#   - Blank values are skipped; repeated values are listed once; an empty label
#     falls back to the value itself.
#   - With no options at all on stdin, <default> is echoed unchanged.
#   - A <default> that is NOT one of the offered values is ignored (the first option
#     becomes the fallback), so the function never returns something the menu did
#     not offer.
#   - An out-of-range or non-numeric answer ends in die.
pick_one() {
  local title="$1" def="${2:-}"
  local -a vals=() labels=()
  local v l
  while IFS='|' read -r v l; do
    [ -n "$v" ] || continue
    case " ${vals[*]:-} " in *" $v "*) continue ;; esac # dedup by value (':-' so an empty array is safe under set -u / bash 3.2)
    vals+=("$v")
    labels+=("${l:-$v}")
  done
  [ "${#vals[@]}" -gt 0 ] || { echo "$def"; return 0; }

  # Drop a default the menu doesn't contain: echoing it would hand the caller a value
  # that was never offered (e.g. an AWS profile named "default" that does not exist).
  if [ -n "$def" ]; then
    local known=0
    for v in "${vals[@]}"; do
      if [ "$v" = "$def" ]; then known=1; break; fi
    done
    [ "$known" -eq 1 ] || def=""
  fi

  if [ "$ASSUME_YES" = "1" ]; then echo "${def:-${vals[0]}}"; return 0; fi

  {
    echo "$(c 1 "$title")"
    local i mark
    for i in "${!vals[@]}"; do
      mark=""
      { [ -n "$def" ] && [ "${vals[$i]}" = "$def" ]; } && mark=" $(c 32 '(current)')"
      printf " %2d) %s%s\n" "$((i+1))" "${labels[$i]}" "$mark"
    done
  } >&2

  # Empty by default: when /dev/tty can't be opened `read` never runs, and an unset
  # local would trip `set -u`; an empty answer simply means "take the default".
  local ans=""
  read -r -p "$(c 33 "?") choose 1-${#vals[@]}${def:+ (Enter=current)}: " ans </dev/tty || true
  [ -n "$ans" ] || { echo "${def:-${vals[0]}}"; return 0; }
  { [[ "$ans" =~ ^[0-9]+$ ]] && [ "$ans" -ge 1 ] && [ "$ans" -le "${#vals[@]}" ]; } \
    || die "invalid selection '$ans' (want 1-${#vals[@]})"
  echo "${vals[$((ans-1))]}"
}
# prompt_free_name <prompt> <default> <taken_fn> : ask the operator for a name and echo
# the first answer that is NOT already taken. <taken_fn name> must return 0 when the
# name is taken. An empty answer keeps the currently suggested name; after a clash the
# suggestion becomes "<name>2" and the prompt repeats. Prompts + warnings go to stderr;
# only the chosen name is echoed to stdout. With ASSUME_YES=1 the default is echoed
# as-is (no prompt, <taken_fn> is not called).
prompt_free_name() {
  local label="$1" candidate="$2" is_taken="$3" reply
  if [ "$ASSUME_YES" = "1" ]; then
    echo "$candidate"
    return 0
  fi
  while true; do
    read -r -p "$(c 33 "?") $label [$candidate]: " reply </dev/tty || true
    candidate="${reply:-$candidate}"
    if ! "$is_taken" "$candidate"; then
      echo "$candidate"
      return 0
    fi
    warn "'$candidate' is already taken — pick another."
    candidate="${candidate}2"
  done
}
# stack_exists <stack-name>: exit status of `describe-stacks` for that name in $REGION
# (0 when the call succeeds); all output is discarded.
stack_exists() {
  aws_ cloudformation describe-stacks --stack-name "$1" &>/dev/null
}
# aws_ <args...>: run the AWS CLI pinned to $REGION, passing every argument through
# unchanged.
aws_() {
  local -a cli=(aws --region "$REGION")
  "${cli[@]}" "$@"
}
# ---------- 0. tooling + profile + region + credentials ----------
if ! command -v aws >/dev/null; then
  die "aws CLI not found. Install it and run 'aws configure'."
fi

# Profile: when AWS_PROFILE was not pre-set (deploy.env/shell), list the configured
# profiles and let the operator choose which ACCOUNT to deploy into. The choice is
# exported so every aws call + Ansible hit the same account.
if [ -z "${AWS_PROFILE:-}" ]; then
  profs="$(aws configure list-profiles 2>/dev/null || true)"
  if [ -n "$profs" ]; then
    # pick_one wants "value|label" lines; an empty label falls back to the value.
    profile_menu="$(printf '%s\n' "$profs" | sed 's/$/|/')"
    AWS_PROFILE="$(printf '%s\n' "$profile_menu" | pick_one "AWS profile (which account to deploy into):" "default")"
    export AWS_PROFILE
  fi
fi

# Region: confirm/choose (default = current $REGION). The current value is appended
# as the last entry so a pre-set REGION outside the short list is still selectable.
region_menu=(
  'us-east-1|us-east-1 (N. Virginia)'
  'us-east-2|us-east-2 (Ohio)'
  'us-west-2|us-west-2 (Oregon)'
  'eu-west-1|eu-west-1 (Ireland)'
  'eu-central-1|eu-central-1 (Frankfurt)'
  'ap-southeast-1|ap-southeast-1 (Singapore)'
  "${REGION}|${REGION} (current)"
)
REGION="$(printf '%s\n' "${region_menu[@]}" | pick_one "AWS region:" "$REGION")"

# Credentials: verify they work AND show the account, so the target account is
# unmistakable before anything is created — then confirm it's the intended one.
if ! ACCT=$(aws_ sts get-caller-identity --query Account --output text 2>/dev/null); then
  die "AWS credentials not working for profile '${AWS_PROFILE:-default}' — run 'aws configure${AWS_PROFILE:+ --profile $AWS_PROFILE}'."
fi
ok "AWS account $ACCT (profile ${AWS_PROFILE:-default}, region $REGION)"
if ! ask "deploy into account $ACCT?"; then
  die "aborted — set AWS_PROFILE to the right account (see deploy.env)."
fi
# ---------- 0b. vCPU quota preflight ----------
# The matrix is several c6i.4xlarge boxes (16 vCPU each). Compare the vCPUs this run
# needs against the account's On-Demand Standard vCPU quota (L-1216C47A) and warn early
# — a quota miss otherwise fails mid-deploy with VcpuLimitExceeded. (Quota is total
# running vCPUs; other running instances reduce the real headroom.)
# _instance_vcpus <instance-type>: print the type's default vCPU count. Returns 1
# (printing nothing) when the lookup fails or the answer is not a plain integer.
_instance_vcpus() {
  local n
  n="$(aws_ ec2 describe-instance-types --instance-types "$1" \
    --query 'InstanceTypes[0].VCpuInfo.DefaultVCpus' --output text 2>/dev/null)" || return 1
  case "$n" in "" | *[!0-9]*) return 1 ;; esac
  echo "$n"
}

# _ondemand_vcpu_quota <service-quotas subcommand>: print the whole-number part of the
# quota value. Returns 1 when the call fails or the value is not numeric (e.g. "None").
# L-1216C47A = "Running On-Demand Standard (A,C,D,H,I,M,R,T,Z) instances", in vCPUs.
_ondemand_vcpu_quota() {
  local q
  q="$(aws_ service-quotas "$1" --service-code ec2 --quota-code L-1216C47A \
    --query 'Quota.Value' --output text 2>/dev/null)" || return 1
  q="${q%.*}" # the value is reported with a fractional part, e.g. "1152.0"
  case "$q" in "" | *[!0-9]*) return 1 ;; esac
  echo "$q"
}

# check_vcpu_quota: compare the vCPUs the enabled boxes need with the account's
# On-Demand Standard vCPU quota.
#   Globals read: DEPLOY_* toggles, GATEWAY_TYPE, AUX_TYPE, CONTROL_TYPE.
#   Returns 0 when the quota suffices, when the operator chooses to continue anyway,
#   or when the figures can't be read (the check is then skipped with a warning —
#   an unreadable value is never treated as 0). Dies when the operator declines.
check_vcpu_quota() {
  local gw=0 aux=0 ctl=0 v
  for v in "$DEPLOY_NEXUS" "$DEPLOY_BIFROST" "$DEPLOY_LITELLM" "$DEPLOY_KONG" "$DEPLOY_PORTKEY" "$DEPLOY_TENSORZERO"; do
    if [ "$v" = "true" ]; then gw=$((gw + 1)); fi
  done
  for v in "$DEPLOY_MOCK" "$DEPLOY_LOADTEST"; do
    if [ "$v" = "true" ]; then aux=$((aux + 1)); fi
  done
  if [ "$DEPLOY_CONTROL" = "true" ]; then ctl=1; fi

  # A failed lookup only matters for a class of box that is actually being deployed.
  local gv av cv readable=1
  if ! gv="$(_instance_vcpus "$GATEWAY_TYPE")"; then
    gv=0
    [ "$gw" -eq 0 ] || readable=0
  fi
  if ! av="$(_instance_vcpus "$AUX_TYPE")"; then
    av=0
    [ "$aux" -eq 0 ] || readable=0
  fi
  if ! cv="$(_instance_vcpus "$CONTROL_TYPE")"; then
    cv=0
    [ "$ctl" -eq 0 ] || readable=0
  fi
  if [ "$readable" -eq 0 ]; then
    warn "couldn't read instance-type vCPUs; skipping quota check"
    return 0
  fi
  local need=$((gw * gv + aux * av + ctl * cv))

  # get-service-quota returns the APPLIED value; fall back to the AWS default if the
  # account never customised it.
  local quota
  if ! quota="$(_ondemand_vcpu_quota get-service-quota)" &&
    ! quota="$(_ondemand_vcpu_quota get-aws-default-service-quota)"; then
    warn "couldn't read the On-Demand vCPU quota; skipping quota check"
    return 0
  fi

  inf "vCPU need: $need ($gw gateway×$gv + $aux aux×$av + $ctl control×$cv) | account On-Demand Standard quota: $quota vCPU"
  if [ "$need" -gt "$quota" ]; then
    warn "this matrix needs $need vCPU but the account quota is $quota."
    ask "continue anyway (deploy may fail with VcpuLimitExceeded)?" \
      || die "reduce the matrix (DEPLOY_* in deploy.env) or request an EC2 vCPU quota increase."
  else
    ok "vCPU quota OK ($need ≤ $quota)"
  fi
}
# Run the quota preflight now, before the key-pair / VPC / stack steps below.
check_vcpu_quota
# ---------- 1. SSH key pair ----------
# Two halves must line up: an EC2 key pair (in AWS, so instances get the public key)
# AND the matching local private .pem (Ansible/SSH log in with it). If KEY_NAME is
# pre-set and exists, use it; otherwise LIST the account's key pairs (annotated with
# whether a local ~/.ssh/<name>.pem exists) and let the operator pick one or create a
# new pair (AWS generates the .pem, saved to ~/.ssh). Then resolve the local key:
# the conventional path, else a pick from ~/.ssh/*.pem, else a typed path.
# key_exists <key-pair-name>: exit status of `describe-key-pairs` for that name in
# $REGION (0 when the call succeeds); all output is discarded.
key_exists() {
  aws_ ec2 describe-key-pairs --key-names "$1" &>/dev/null
}
if key_exists "$KEY_NAME"; then
  ok "key pair '$KEY_NAME' exists in account $ACCT"
else
  inf "key pair '$KEY_NAME' not found in account $ACCT"
  kp_menu() { # "name|label" per AWS key pair, annotated with local .pem presence
    aws_ ec2 describe-key-pairs --query 'KeyPairs[].KeyName' --output text 2>/dev/null \
      | tr '\t' '\n' | sed '/^$/d' | while read -r kp; do
        if [ -f "$HOME/.ssh/$kp.pem" ]; then
          printf '%s|%s (local ~/.ssh/%s.pem ✓)\n' "$kp" "$kp" "$kp"
        else
          printf '%s|%s (no local .pem — you will be asked for the path)\n' "$kp" "$kp"
        fi
      done
  }
  sel="$( { kp_menu; printf '__create__|+ create a new key pair\n'; } | pick_one "EC2 key pair — pick an existing one or create new:" "__create__")"
  if [ "$sel" = "__create__" ]; then
    KEY_NAME="$(prompt_free_name "new key pair name" "$KEY_NAME" key_exists)"
    KEY_FILE="$HOME/.ssh/${KEY_NAME}.pem"
    if [ -e "$KEY_FILE" ]; then
      die "local key file $KEY_FILE already exists — move it aside or pick another name."
    fi
    mkdir -p "$(dirname "$KEY_FILE")"
    # Write the private key under umask 077 (in a subshell, so the script's own umask
    # is untouched): the file must never be group/world-readable, not even for the
    # instant before chmod. If the call fails, remove the empty/partial file so the
    # next run isn't blocked by the "already exists" guard above.
    if ! (
      umask 077
      aws_ ec2 create-key-pair --key-name "$KEY_NAME" --query KeyMaterial --output text > "$KEY_FILE"
    ); then
      rm -f -- "$KEY_FILE"
      die "could not create key pair '$KEY_NAME' in $REGION."
    fi
    chmod 600 "$KEY_FILE"
    ok "created key pair '$KEY_NAME' → $KEY_FILE"
  else
    KEY_NAME="$sel"
    ok "using existing key pair '$KEY_NAME'"
    # KEY_FILE was derived from the INITIAL KEY_NAME; re-point it at the selected pair
    # (unless the operator pinned KEY_FILE) so a stale ~/.ssh/<old-name>.pem isn't reused
    # — that mismatch silently launches the stack with one key but SSHes with another.
    [ -n "$KEY_FILE_EXPLICIT" ] || KEY_FILE="$HOME/.ssh/${KEY_NAME}.pem"
  fi
fi
# Resolve the local private key for KEY_NAME (the AWS name and the local filename can
# differ). Prefer a pre-set/existing KEY_FILE, then ~/.ssh/<name>.pem, then let the
# operator pick from ~/.ssh/*.pem or type a path.
if [ ! -f "$KEY_FILE" ] && [ -f "$HOME/.ssh/${KEY_NAME}.pem" ]; then
  KEY_FILE="$HOME/.ssh/${KEY_NAME}.pem"
fi
if [ ! -f "$KEY_FILE" ]; then
  if [ "$ASSUME_YES" = "1" ]; then
    die "no local private key for '$KEY_NAME' — set KEY_FILE in deploy.env."
  fi
  # Collect candidates with a glob rather than by parsing `ls` output; the -f test
  # also discards the literal pattern left behind when nothing matches.
  local_pems=()
  for pem in "$HOME"/.ssh/*.pem; do
    if [ -f "$pem" ]; then local_pems+=("$pem"); fi
  done
  if [ "${#local_pems[@]}" -gt 0 ]; then
    inf "no ~/.ssh/${KEY_NAME}.pem — pick the local private key for '$KEY_NAME':"
    # "path|" lines: pick_one falls back to the value when the label is empty.
    KEY_FILE="$(printf '%s|\n' "${local_pems[@]}" | pick_one "local private key (.pem):" "")"
  fi
  while [ ! -f "$KEY_FILE" ]; do
    # Reset each round: if /dev/tty can't be opened `read` never runs, and an unset
    # kf would abort under `set -u` instead of reaching the die below.
    kf=""
    read -r -p "$(c 33 "?") path to the .pem private key for '$KEY_NAME' (Enter to abort): " kf </dev/tty || true
    [ -n "$kf" ] || die "no private key for '$KEY_NAME' — set KEY_FILE in deploy.env, or create a new key pair."
    KEY_FILE="${kf/#\~/$HOME}" # expand a leading ~ typed by the operator
  done
fi
ok "private key: $KEY_FILE"
# Cross-check the local .pem actually matches the AWS key pair, so a wrong key can't
# silently launch the stack and then fail Ansible/SSH (Permission denied) later. AWS-
# created RSA key pairs use a SHA1-of-PKCS8-DER fingerprint (40 hex / colon-grouped);
# that's what we can verify here. If the AWS fingerprint isn't that form (ed25519 or an
# imported key), we skip rather than risk a false abort.
#
# Both lookups are best-effort. Under `set -e` + `pipefail` a failing stage inside a
# plain `var="$(a | b)"` assignment would end the script with no message, so the AWS
# lookup is guarded with `|| true` and the openssl pipeline runs as an `if` condition.
key_fp_aws="$(aws_ ec2 describe-key-pairs --key-names "$KEY_NAME" \
  --query 'KeyPairs[0].KeyFingerprint' --output text 2>/dev/null | tr -d ':' || true)"
if [ "${#key_fp_aws}" = 40 ]; then
  # With pipefail (set at the top of this script) the condition is false as soon as
  # any stage fails — e.g. openssl can't parse the file — so the SHA1 of an EMPTY
  # stream is never mistaken for the key's fingerprint.
  if key_fp_local="$(openssl pkcs8 -in "$KEY_FILE" -nocrypt -topk8 -outform DER 2>/dev/null \
    | openssl sha1 2>/dev/null | sed 's/.*[ =]//')" && [ -n "$key_fp_local" ]; then
    if [ "$key_fp_local" != "$key_fp_aws" ]; then
      warn "local key $KEY_FILE does NOT match AWS key pair '$KEY_NAME' (fingerprint mismatch)"
      warn " → Ansible/SSH would fail with 'Permission denied'. Point KEY_FILE at the .pem for '$KEY_NAME'."
      ask "deploy anyway?" || die "aborted — set KEY_FILE to the .pem matching key pair '$KEY_NAME'."
    fi
  else
    warn "couldn't compute a fingerprint for $KEY_FILE with openssl — skipping the key cross-check."
  fi
fi
# ---------- 2. VPC + subnet ----------
# verify_subnet_in_vpc <subnet> <vpc>: succeed only when describe-subnets reports
# <vpc> as the subnet's VpcId. Returns 1 when the lookup itself fails.
verify_subnet_in_vpc() {
  local owner_vpc
  if ! owner_vpc=$(aws_ ec2 describe-subnets --subnet-ids "$1" \
    --query "Subnets[0].VpcId" --output text 2>/dev/null); then
    return 1
  fi
  [ "$owner_vpc" = "$2" ]
}
# VPC: validate a pre-set one; else LIST the account's VPCs and let the operator pick
# (default = the account's default VPC). No VPCs at all → offer to create a dedicated one.
if [ -n "$VPC_ID" ]; then
  aws_ ec2 describe-vpcs --vpc-ids "$VPC_ID" >/dev/null 2>&1 || die "VPC '$VPC_ID' not found in $REGION."
else
  vpcs="$(aws_ ec2 describe-vpcs \
    --query 'Vpcs[].[VpcId,CidrBlock,IsDefault,Tags[?Key==`Name`]|[0].Value]' \
    --output text 2>/dev/null || true)"
  def_vpc="$(printf '%s\n' "$vpcs" | awk -F'\t' '$3=="True"{print $1; exit}')"
  # Menu = create-new (a dedicated VPC+subnet via the net-stack, CFN-managed) + every
  # existing VPC. Default to the account's default VPC if there is one, else create-new.
  # The existing-VPC rows sit behind an `if`, not `[ -n … ] && …`: as the last command
  # of the piped group a false test would make the pipeline fail under pipefail and
  # `set -e` would end the script silently — precisely when the account has no VPC.
  sel="$( {
    printf '%s|+ create a dedicated VPC + public subnet (net-stack, CFN-managed)\n' "__create__"
    if [ -n "$vpcs" ]; then
      printf '%s\n' "$vpcs" | awk -F'\t' \
        '{n=($4!="None"&&$4!="")?" "$4:""; d=($3=="True")?" [default]":""; print $1"|"$1" "$2 n d}'
    fi
  } | pick_one "VPC:" "${def_vpc:-__create__}")"
  if [ "$sel" = "__create__" ]; then
    NET_STACK="$(prompt_free_name "name for the new network stack" "$NET_STACK" stack_exists)"
    inf "creating a dedicated VPC + public subnet via stack '$NET_STACK' (CFN-managed)…"
    aws cloudformation deploy --region "$REGION" --stack-name "$NET_STACK" \
      --template-file "$ROOT/cloudformation/network-stack.yaml" \
      --parameter-overrides "ProjectName=$NET_STACK"
    # The new VPC and subnet ids are read back from the network stack's outputs.
    VPC_ID=$(aws_ cloudformation describe-stacks --stack-name "$NET_STACK" \
      --query "Stacks[0].Outputs[?OutputKey=='VpcId'].OutputValue" --output text)
    SUBNET_ID=$(aws_ cloudformation describe-stacks --stack-name "$NET_STACK" \
      --query "Stacks[0].Outputs[?OutputKey=='SubnetId'].OutputValue" --output text)
    ok "created VPC $VPC_ID / subnet $SUBNET_ID (net-stack)"
  else
    VPC_ID="$sel"
  fi
fi
[ -n "$VPC_ID" ] || die "no VPC selected."
# Subnet: validate a pre-set one; else LIST the VPC's subnets and let the operator pick
# (default = a public one). The rig needs a subnet that routes 0.0.0.0/0 to an IGW.
if [ -n "$SUBNET_ID" ]; then
  verify_subnet_in_vpc "$SUBNET_ID" "$VPC_ID" || die "subnet '$SUBNET_ID' is not in VPC '$VPC_ID'."
else
  subs="$(aws_ ec2 describe-subnets --filters "Name=vpc-id,Values=$VPC_ID" \
    --query 'Subnets[].[SubnetId,AvailabilityZone,CidrBlock,MapPublicIpOnLaunch]' \
    --output text 2>/dev/null || true)"
  # Default = the first subnet with MapPublicIpOnLaunch=True (empty when there is none).
  def_sub="$(printf '%s\n' "$subs" | awk -F'\t' '$4=="True"{print $1; exit}')"
  sel="$( {
    if [ -n "$subs" ]; then
      printf '%s\n' "$subs" | awk -F'\t' '{p=($4=="True")?"public":"private"; print $1"|"$1" "$2" "$3" ("p")"}'
    fi
    printf '__create__|+ create a new public subnet\n'
  } | pick_one "Subnet (needs a route to an IGW):" "$def_sub")"
  if [ "$sel" = "__create__" ]; then
    # A public subnet needs the VPC's Internet Gateway + a route table that sends
    # 0.0.0.0/0 to it. NOTE: a subnet created here is NOT part of the CloudFormation
    # stack, so down.sh will NOT remove it — you clean it up yourself.
    igw="$(aws_ ec2 describe-internet-gateways --filters "Name=attachment.vpc-id,Values=$VPC_ID" \
      --query 'InternetGateways[0].InternetGatewayId' --output text 2>/dev/null || echo None)"
    { [ -n "$igw" ] && [ "$igw" != "None" ]; } || die "VPC $VPC_ID has no Internet Gateway — pick another VPC, or let the stack build a dedicated net (set VPC_ID= blank with no default VPC)."
    az="$(aws_ ec2 describe-availability-zones --query 'AvailabilityZones[0].ZoneName' --output text)"
    vcidr="$(aws_ ec2 describe-vpcs --vpc-ids "$VPC_ID" --query 'Vpcs[0].CidrBlock' --output text)"
    warn "the new subnet + its route table will NOT be tracked by the stack (manual cleanup on teardown)."
    # Loop the CIDR until create-subnet succeeds (overlap / out-of-range → re-prompt).
    SUBNET_ID=""
    while [ -z "$SUBNET_ID" ]; do
      # Reset each round: if /dev/tty can't be opened `read` never runs, and an unset
      # ncidr would abort under `set -u` instead of reaching the die below.
      ncidr=""
      read -r -p "$(c 33 "?") CIDR for the new subnet (free inside $vcidr): " ncidr </dev/tty || true
      [ -n "$ncidr" ] || die "no CIDR given for the new subnet."
      SUBNET_ID="$(aws_ ec2 create-subnet --vpc-id "$VPC_ID" --cidr-block "$ncidr" --availability-zone "$az" \
        --query 'Subnet.SubnetId' --output text 2>/dev/null || true)"
      [ -n "$SUBNET_ID" ] || warn " couldn't create subnet $ncidr (overlap / not inside $vcidr / invalid) — try another."
    done
    # Make it public: auto-assign public IPs, then a dedicated route table whose
    # default route points at the VPC's Internet Gateway.
    aws_ ec2 modify-subnet-attribute --subnet-id "$SUBNET_ID" --map-public-ip-on-launch >/dev/null
    rtb="$(aws_ ec2 create-route-table --vpc-id "$VPC_ID" --query 'RouteTable.RouteTableId' --output text)"
    aws_ ec2 create-route --route-table-id "$rtb" --destination-cidr-block 0.0.0.0/0 --gateway-id "$igw" >/dev/null
    aws_ ec2 associate-route-table --route-table-id "$rtb" --subnet-id "$SUBNET_ID" >/dev/null
    # Tagging is best-effort; a failure here must not abort the deploy.
    aws_ ec2 create-tags --resources "$SUBNET_ID" "$rtb" --tags "Key=Name,Value=${STACK}-net-manual" >/dev/null 2>&1 || true
    ok "created public subnet $SUBNET_ID ($ncidr, $az) + route 0.0.0.0/0 → $igw [not stack-managed]"
  else
    SUBNET_ID="$sel"
  fi
fi
{ [ -n "$SUBNET_ID" ] && [ "$SUBNET_ID" != "None" ]; } || die "no subnet selected."
ok "VPC $VPC_ID / subnet $SUBNET_ID"
# The VPC's CIDR is shown in the plan and handed to gen-inventory.sh later on.
VPC_CIDR=$(aws_ ec2 describe-vpcs --vpc-ids "$VPC_ID" \
  --query "Vpcs[0].CidrBlock" --output text)
# ---------- 2c. security group ----------
# The rig needs these TCP ports open (the exact set the stack's SG opens):
SG_REQUIRED_TCP="22 80 443 3050 8080 4000 8000 8001 8787 3000 3062 9090"
sg_rules_human="SSH:22, http:80, https:443, nexus:3050, bifrost:8080, litellm:4000, kong:8000-8001, portkey:8787, tensorzero:3000, mock:3062, metrics:9090, + intra-SG (all)"

# sg_missing_ports <sg-id>: echo (space-separated, on one line) the required TCP ports
# that the security group's ingress rules do NOT cover for 0.0.0.0/0 or any CIDR.
#   - A port is covered by an all-protocol ("-1") rule, or by a tcp rule whose
#     FromPort..ToPort range contains it.
#   - Only rules that name at least one CIDR source (IPv4, IPv6 or prefix list) count.
#     A rule whose only source is another security group — such as the self-referencing
#     all-traffic rule of a default SG — is not treated as coverage.
#   - If the group can't be described, every required port is reported missing.
#   Globals read: SG_REQUIRED_TCP. Requires python3.
sg_missing_ports() {
  local perms
  # Fall back to an empty rule list only when the call fails, so partial output can
  # never be glued onto the fallback and yield invalid JSON.
  perms="$(aws_ ec2 describe-security-groups --group-ids "$1" \
    --query 'SecurityGroups[0].IpPermissions' --output json 2>/dev/null)" || perms='[]'
  printf '%s' "$perms" | SG_REQUIRED_TCP="$SG_REQUIRED_TCP" python3 -c '
import json,os,sys
perms=json.load(sys.stdin) or []
def has_cidr_source(r):
    return bool(r.get("IpRanges") or r.get("Ipv6Ranges") or r.get("PrefixListIds"))
def covered(p):
    for r in perms:
        if not has_cidr_source(r): continue
        proto=r.get("IpProtocol")
        if proto=="-1": return True
        if proto!="tcp": continue
        fp,tp=r.get("FromPort"),r.get("ToPort")
        if fp is not None and tp is not None and fp<=p<=tp: return True
    return False
miss=[str(p) for p in (int(x) for x in os.environ["SG_REQUIRED_TCP"].split()) if not covered(p)]
print(" ".join(miss))'
}
# Security group: when none was pre-set, offer "create a new one" (the default) plus
# every SG already in the VPC.
if [ -z "$SECURITY_GROUP_ID" ]; then
  inf "Security group — the stack can CREATE one with exactly the ports the rig needs:"
  inf " $sg_rules_human"
  sgs="$(aws_ ec2 describe-security-groups --filters "Name=vpc-id,Values=$VPC_ID" \
    --query 'SecurityGroups[].[GroupId,GroupName]' --output text 2>/dev/null || true)"
  # The existing-SG rows sit behind an `if`, not `[ -n … ] && …`: as the last command
  # of the piped group a false test would fail the pipeline under pipefail and
  # `set -e` would end the script silently whenever the listing comes back empty.
  sel="$( {
    printf '%s|+ create a new SG (recommended — exactly the rules above)\n' "__create__"
    if [ -n "$sgs" ]; then
      printf '%s\n' "$sgs" | awk -F'\t' '{print $1"|"$1" ("$2")"}'
    fi
  } | pick_one "Security group:" "__create__")"
  if [ "$sel" = "__create__" ]; then
    SECURITY_GROUP_ID=""
  else
    SECURITY_GROUP_ID="$sel"
  fi
fi
# An existing SG (pre-set or just picked) must exist, and is checked against the
# required ports; a blank id means the stack creates its own SG.
if [ -n "$SECURITY_GROUP_ID" ]; then
  aws_ ec2 describe-security-groups --group-ids "$SECURITY_GROUP_ID" >/dev/null 2>&1 \
    || die "security group '$SECURITY_GROUP_ID' not found in $REGION."
  miss="$(sg_missing_ports "$SECURITY_GROUP_ID")"
  if [ -n "$miss" ]; then
    warn "SG $SECURITY_GROUP_ID is MISSING ingress for TCP port(s):$miss"
    warn " the rig needs — $sg_rules_human"
    warn " those gateways/ports will be UNREACHABLE until you add the rules."
    ask "use $SECURITY_GROUP_ID anyway?" || die "pick another SG, or create a new one (recommended)."
  else
    ok "SG $SECURITY_GROUP_ID already covers every required port"
  fi
else
  ok "security group: the stack will create '${STACK}-sg' ($sg_rules_human)"
fi
# ---------- 3. admin CIDR (who can reach SSH :22) ----------
# This rig deliberately opens SSH to the WORLD by default: the boxes are throwaway, operators
# connect from changing IPs, and locking :22 to a single detected IP kept silently breaking SSH
# after the IP changed / on each redeploy. Pass ADMIN_CIDR=1.2.3.4/32 to lock it to a CIDR.
ADMIN_CIDR="${ADMIN_CIDR:-0.0.0.0/0}"
# A real if/else instead of `test && a || b`, which would also run b when a fails.
if [ "$ADMIN_CIDR" = "0.0.0.0/0" ]; then
  inf "SSH (:22) open to the world (ADMIN_CIDR=0.0.0.0/0) — pass ADMIN_CIDR=<your-ip>/32 to lock it"
else
  inf "SSH (:22) allowed from ADMIN_CIDR=$ADMIN_CIDR"
fi
# ---------- 4. plan + confirm ----------
# Names of the enabled boxes (DEPLOY_<NAME>=true), space-separated.
boxes=""
for kv in MOCK:$DEPLOY_MOCK NEXUS:$DEPLOY_NEXUS BIFROST:$DEPLOY_BIFROST LOADTEST:$DEPLOY_LOADTEST \
  LITELLM:$DEPLOY_LITELLM KONG:$DEPLOY_KONG PORTKEY:$DEPLOY_PORTKEY TENSORZERO:$DEPLOY_TENSORZERO \
  CONTROL:$DEPLOY_CONTROL; do
  if [ "${kv#*:}" = "true" ]; then
    boxes="$boxes ${kv%%:*}"
  fi
done

sg_display="${SECURITY_GROUP_ID:-stack creates ${STACK}-sg}"
if [ "$DEPLOY_CONTROL" = true ]; then
  control_display="$CONTROL_TYPE (${CONTROL_VOLUME_GIB}GiB gp3)"
else
  control_display="(not deployed)"
fi

# plan_row <label> <value>: one aligned line of the plan table.
plan_row() {
  printf " %-14s %s\n" "$1" "$2"
}

echo
echo "$(c 1 "── deploy plan ──")"
plan_row region "$REGION"
plan_row stack "$STACK"
plan_row account "$ACCT"
plan_row "key pair" "$KEY_NAME ($KEY_FILE)"
plan_row VPC "$VPC_ID ($VPC_CIDR)"
plan_row subnet "$SUBNET_ID"
plan_row "security group" "$sg_display"
plan_row "SSH from" "$ADMIN_CIDR"
plan_row "ports from" "$ACCESS_CIDR"
plan_row "gateway type" "$GATEWAY_TYPE"
plan_row "aux type" "$AUX_TYPE"
plan_row "control type" "$control_display"
plan_row "root volume" "${VOLUME_GIB}GiB gp3 / ${VOLUME_IOPS} IOPS / ${VOLUME_THROUGHPUT} MB/s"
plan_row boxes "${boxes# }"
echo

# Last stop before anything is deployed.
if ! ask "deploy this?"; then
  die "aborted."
fi
# ---------- 5. deploy ----------
inf "deploying stack '$STACK' (a few minutes)…"

# Template parameters, one per element. An empty SECURITY_GROUP_ID is passed through
# as a blank value.
cfn_params=(
  "ProjectName=$STACK"
  "KeyName=$KEY_NAME"
  "VpcId=$VPC_ID"
  "SubnetId=$SUBNET_ID"
  "SecurityGroupId=$SECURITY_GROUP_ID"
  "AdminCidr=$ADMIN_CIDR"
  "AccessCidr=$ACCESS_CIDR"
  "GatewayInstanceType=$GATEWAY_TYPE"
  "AuxInstanceType=$AUX_TYPE"
  "ControlInstanceType=$CONTROL_TYPE"
  "ControlVolumeSizeGiB=$CONTROL_VOLUME_GIB"
  "VolumeSizeGiB=$VOLUME_GIB"
  "VolumeIops=$VOLUME_IOPS"
  "DeployMock=$DEPLOY_MOCK"
  "DeployNexus=$DEPLOY_NEXUS"
  "DeployBifrost=$DEPLOY_BIFROST"
  "DeployLoadtest=$DEPLOY_LOADTEST"
  "DeployLitellm=$DEPLOY_LITELLM"
  "DeployKong=$DEPLOY_KONG"
  "DeployPortkey=$DEPLOY_PORTKEY"
  "DeployTensorzero=$DEPLOY_TENSORZERO"
  "DeployControl=$DEPLOY_CONTROL"
)

aws cloudformation deploy --region "$REGION" --stack-name "$STACK" \
  --template-file "$ROOT/cloudformation/perf-matrix-stack.yaml" \
  --capabilities CAPABILITY_IAM \
  --parameter-overrides "${cfn_params[@]}"
ok "stack '$STACK' is up"
# gp3 root-volume throughput can't be set inline on an EC2::Instance (the Ebs schema
# has no Throughput property), so apply it online now. Non-fatal: deploy still succeeds
# if it can't (e.g. IAM lacks ec2:ModifyVolume) — it prints the manual command.
# A value of 0 (or an unset/empty one) skips this step.
case "${VOLUME_THROUGHPUT:-0}" in
  0) ;;
  *)
    if ! REGION="$REGION" "$ROOT/scripts/set-ebs-throughput.sh" "$STACK" "$VOLUME_THROUGHPUT"; then
      warn "couldn't set EBS throughput now — re-run: scripts/set-ebs-throughput.sh $STACK $VOLUME_THROUGHPUT"
    fi
    ;;
esac
# ---------- 6. inventory + outputs ----------
echo
echo "$(c 1 "── boxes ──")"
# Print the stack outputs as an aligned, sorted "key value" table.
aws_ cloudformation describe-stacks --stack-name "$STACK" \
  --query "Stacks[0].Outputs[].[OutputKey,OutputValue]" --output text \
  | sort \
  | while read -r out_key out_value; do
    printf " %-20s %s\n" "$out_key" "$out_value"
  done

# Inventory generation is handed the private key path, so it is skipped (with a
# warning) when the key file is not on disk.
if [ ! -f "$KEY_FILE" ]; then
  warn "private key $KEY_FILE missing — skipped inventory generation."
else
  VPC_CIDR="$VPC_CIDR" "$ROOT/scripts/gen-inventory.sh" "$STACK" "$KEY_FILE" "$REGION" >/dev/null
  ok "wrote ansible/inventory.ini + ansible/host_targets.env"
fi
# ---------- 7. optional provisioning ----------
# Ansible runs only on request (PROVISION=1) and only when the private key is on disk.
if [ "$PROVISION" = "1" ]; then
  if [ -f "$KEY_FILE" ]; then
    inf "provisioning with Ansible…"
    # Subshell: the cd into ansible/ must not leak into the rest of the script.
    (
      cd "$ROOT/ansible" && ansible-playbook -i inventory.ini site.yml
    )
    ok "provisioned"
  fi
fi

# Closing hints for the operator.
printf '\n'
printf '%s\n' \
  "$(c 1 "next:")" \
  " install: cd ansible && ansible-playbook -i inventory.ini site.yml (or re-run with PROVISION=1)" \
  " teardown: ./down.sh"