-
Notifications
You must be signed in to change notification settings - Fork 598
Expand file tree
/
Copy pathbootstrap_ec2
More file actions
executable file
·437 lines (391 loc) · 15.9 KB
/
bootstrap_ec2
File metadata and controls
executable file
·437 lines (391 loc) · 15.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
#!/usr/bin/env bash
# Unified bootstrap script for launching CI builds on EC2 instances.
# Supports both SSH and SSM transport modes, controlled by CI_USE_SSH.
# CI_USE_SSH=1 → SSH into instance (requires key pair)
# CI_USE_SSH=0 → SSM RunShellScript (no SSH)
# Both modes use instance profiles for AWS credentials.
# Load the shared ci3 environment and the ref-name helpers.
# Both paths are quoted so that a checkout located under a directory whose
# name contains whitespace or glob characters is still sourced correctly.
# NOTE(review): $ci3 is not assigned in this file; presumably ci3/source defines it.
NO_CD=1 source "$(git rev-parse --show-toplevel)/ci3/source"
source "$ci3/source_refname"

# The single required argument: the command line to run inside the devbox.
cmd=${1:?usage: $(basename "$0") '<command-to-run-inside-devbox>'}
# Target architecture; only "arm64" is treated specially further down.
arch=${ARCH:-amd64}
# Consulted when building the interactive (SSH + tty) run command.
NO_TERMINATE=${NO_TERMINATE:-0}
# Transport selector: 1 = SSH, 0 = SSM (default).
use_ssh=${CI_USE_SSH:-0}

# We're always "in CI" if we're running on a remote instance.
export CI=1

# Enable build instance key features (disk logging, benchmark uploads) by default.
# Set to 0 in ci3-external.yml for external contributors who don't have SSH access.
export CI_USE_BUILD_INSTANCE_KEY=${CI_USE_BUILD_INSTANCE_KEY:-1}
# Mint the log ID before anything else happens so the dashboard link can be
# shown immediately: epoch milliseconds followed by a random number in 100..999.
log_id_suffix=$((100 + RANDOM % 900))
export CI_LOG_ID="$(date +%s%3N)${log_id_suffix}"

# Seed the log entry with a placeholder line.
# NOTE(review): 300 is handed to redis_setexz as-is; presumably an expiry in seconds.
echo "CI booting..." | redis_setexz "$CI_LOG_ID" 300

log_url="http://ci.aztec-labs.com/$CI_LOG_ID"
echo -e "CI Log: ${yellow}$log_url${reset}"
# The dashboard section link is only printed when a dashboard was named.
[ -z "${CI_DASHBOARD:-}" ] || echo "CI dashboard: http://ci.aztec-labs.com/section/$CI_DASHBOARD"

# Post pending GitHub commit status so the CI log link is visible on the PR.
# Best effort: errors are silenced and never abort the run. The description is
# the command, truncated to 140 characters.
post_github_status pending "ci/${JOB_ID}" "$log_url" "${cmd:0:140}" 2>/dev/null || true
# Choose the core-count spec and the default shutdown budget for the target
# architecture. A value already present in AWS_SHUTDOWN_TIME always beats the
# built-in defaults; on arm64, AWS_SHUTDOWN_TIME_ARM beats both.
# NOTE(review): AWS_SHUTDOWN_TIME is presumably in minutes (it is multiplied
# by 60 when the SSM poll timeout is derived) - confirm.
case "$arch" in
  arm64)
    cores=64
    export AWS_SHUTDOWN_TIME=${AWS_SHUTDOWN_TIME_ARM:-${AWS_SHUTDOWN_TIME:-60}}
    ;;
  *)
    cores=128,64
    # Full CI runs get a longer default budget than regular runs.
    default_shutdown_time=60
    if [ "${CI_FULL:-0}" -eq 1 ]; then
      default_shutdown_time=75
    fi
    export AWS_SHUTDOWN_TIME=${AWS_SHUTDOWN_TIME:-$default_shutdown_time}
    ;;
esac

# Allow override.
cores=${CPUS:-$cores}
# Trap handler that terminates our running instance when the script exits.
# Does nothing (and succeeds) while state_dir is unset or not a directory;
# otherwise returns whatever aws_terminate_instance returns, with all of its
# output discarded.
cleanup() {
  [ -d "${state_dir:-}" ] || return 0
  aws_terminate_instance "$state_dir" >/dev/null 2>&1
}

# It's important that cleanup writes to /dev/null.
# We run this from parallel, which closes stdout/stderr during its cleanup.
# This causes cleanup to die silently, probably from a SIGPIPE.
trap 'cleanup >/dev/null' INT TERM EXIT
# Verify that the commit exists on the remote.
# A negotiate-only fetch transfers nothing; its output is expected to mention
# our HEAD commit when origin already has it.
current_commit=$(git rev-parse HEAD)
# A failing fetch is not fatal by itself: it just yields output that does not
# contain the commit, which is reported as "not pushed" below.
negotiated=$(git fetch origin --negotiate-only --negotiation-tip="$current_commit") || true
case "$negotiated" in
  *"$current_commit"*) ;;
  *)
    echo "Commit $current_commit is not pushed, exiting."
    exit 1
    ;;
esac
# Our instance_name acts as a uniqueness key for the instance.
# Instances are terminated if they exist with the same name.
#
# Merge-queue refs (gh-readonly-queue/...pr-<number>...) collapse to their
# "pr-<number>" part; any other ref is cut to its first 50 bytes and every
# character outside [a-zA-Z0-9-] is replaced by "_". The architecture is
# appended in both cases.
merge_queue_ref_re='^gh-readonly-queue/.*(pr-[0-9]+)'
if [[ "$REF_NAME" =~ $merge_queue_ref_re ]]; then
  name_stem=${BASH_REMATCH[1]}
else
  name_stem=$(echo -n "$REF_NAME" | head -c 50 | tr -c 'a-zA-Z0-9-' '_')
fi
instance_name="${name_stem}_${arch}"
# Scratch directory handed to aws_request_instance; cleanup uses its existence
# to decide whether there is an instance to terminate.
state_dir=$(mktemp -d /tmp/aws_request_instance.XXXXXX)

# Regular refs use the standard key pair name.
key_name="build-instance"
if semver check "$REF_NAME"; then
  # Override the public key that aws will load into ~/.ssh/authorized_keys on the launched instance.
  # This requires the restricted key only available in release environments.
  key_name="super-build-instance"
fi

# Optional caller-supplied suffix appended to the instance name.
if [ -n "${INSTANCE_POSTFIX:-}" ]; then
  instance_name+="_$INSTANCE_POSTFIX"
fi
# Announce the request phase together with the transport in use
# (SSM unless use_ssh is 1).
transport_label="SSM"
if [ "$use_ssh" -eq 1 ]; then
  transport_label="SSH"
fi
echo_header "request build instance ($transport_label)"
# Terminate any existing instances with the same name.
# The lookup only considers running instances carrying our Name tag and
# prints their IDs as whitespace-separated text (nothing when none match).
existing_instance=$(
  aws ec2 describe-instances \
    --region us-east-2 \
    --filters "Name=tag:Name,Values=$instance_name" "Name=instance-state-name,Values=running" \
    --query "Reservations[].Instances[?State.Name!='terminated'].InstanceId[]" \
    --output text
)
# Deliberately unquoted: word splitting walks the IDs one at a time, and an
# empty result simply means zero iterations.
for stale_id in $existing_instance; do
  echo "Terminating existing instance: $stale_id"
  aws ec2 --region us-east-2 terminate-instances --instance-ids "$stale_id" >/dev/null 2>&1
done
# Request new instance.
# SSH mode needs a key pair; SSM mode uses instance profile (no key pair),
# which is expressed by passing an empty KEY_NAME.
requested_key=""
if [ "$use_ssh" -eq 1 ]; then
  requested_key=$key_name
fi
KEY_NAME=$requested_key aws_request_instance "$instance_name" "$cores" "$arch" "$state_dir"
# Pick up the details of the launched instance from the state directory.
ip=$(<"$state_dir/ip")

# The sir file is optional; sir stays empty when it is absent.
sir=""
[[ ! -f "$state_dir/sir" ]] || sir=$(<"$state_dir/sir")

# Instance ID with CR/LF removed and surrounding whitespace trimmed by xargs.
iid=$(tr -d '\n\r' <"$state_dir/iid" | xargs)

# Instance type and spot flag fall back to "unknown" when their file cannot be read.
EC2_INSTANCE_TYPE=$(cat "$state_dir/instance_type" 2>/dev/null || echo "unknown")
EC2_SPOT=$(cat "$state_dir/spot" 2>/dev/null || echo "unknown")
export EC2_INSTANCE_TYPE EC2_SPOT
# --- Credential encoding ---
# Encode GCP and network env files as base64 for transfer to the instance.
# Both paths default to fixed /tmp locations. When a file does not exist the
# matching *_B64 variable is left untouched.
export GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS:-/tmp/gcp-key.json}
export NETWORK_ENV_FILE=${NETWORK_ENV_FILE:-/tmp/network.env}

if [[ -f "${GOOGLE_APPLICATION_CREDENTIALS}" ]]; then
  # -w 0 keeps the encoding on a single line.
  export GCP_SA_KEY_B64=$(base64 -w 0 <"${GOOGLE_APPLICATION_CREDENTIALS}")
fi
if [[ -f "${NETWORK_ENV_FILE}" ]]; then
  export NETWORK_ENV_FILE_B64=$(base64 -w 0 <"${NETWORK_ENV_FILE}")
fi
# SSH-specific: encode SSH keys for transfer to the instance.
# AWS credentials are handled by instance profiles on all paths.
# Only pass SSH keys if disk logging is enabled (internal CI only).
if [ "$use_ssh" -eq 1 ] && [ "${CI_USE_BUILD_INSTANCE_KEY:-1}" -eq 1 ]; then
  build_key_file=~/.ssh/build_instance_key
  ssh_config_file="$ci3/aws/build_instance_ssh_config"
  # Each file is optional; a missing one leaves its variable untouched.
  if [[ -f "$build_key_file" ]]; then
    export BUILD_INSTANCE_KEY_B64=$(base64 -w 0 <"$build_key_file")
  fi
  if [[ -f "$ssh_config_file" ]]; then
    export BUILD_INSTANCE_SSH_CONFIG_B64=$(base64 -w 0 <"$ssh_config_file")
  fi
fi
# --- Determine run command ---
# Default (CI mode, always for SSM): pipe logs through timestamps and cache.
# The \$ci_log_id references stay literal here; they are resolved by the shell
# that eventually executes the command.
run_cmd="PARENT_LOG_ID=\$ci_log_id run 2>&1 | ci3/add_timestamps | DUP=1 ci3/cache_log 'CI run' \$ci_log_id"

# If stdout is connected to a terminal (SSH interactive), ask ssh for a tty and
# finish in a zsh shell: only after a failed run when NO_TERMINATE is 0,
# unconditionally otherwise.
if [ "$use_ssh" -eq 1 ] && [ -t 1 ]; then
  ssh_args="-t"
  run_cmd="run; exec zsh"
  if [ "$NO_TERMINATE" -eq 0 ]; then
    run_cmd="run || exec zsh"
  fi
fi
# --- Container script ---
# This runs inside the devbox Docker container.
#
# The heredoc delimiter is unquoted on purpose: $current_commit, $cmd and
# $run_cmd are substituted right here on the launching machine, while every
# backslash-escaped expansion is left for the container's shell to evaluate.
# Expansions of credentials and the ref name are quoted inside the script so
# that values containing whitespace or glob characters reach git and
# docker login unmodified.
container_script=$(
  cat <<EOF
set -euo pipefail
# When restarting the container, just hang around.
# Note we use the "ci-started" file to determine if we're running on a CI machine in some cases (e.g. npm cache).
while [ -f ci-started ]; do sleep 999; done
touch ci-started
sudo chown aztec-dev:aztec-dev aztec-packages
# Set up preferred commit attribution (used during releases).
git config --global user.email "tech@aztecprotocol.com"
git config --global user.name "AztecBot"
cd aztec-packages
git config --global advice.detachedHead false
git init .
if [ -n "\${GITHUB_TOKEN:-}" ]; then
  git remote add origin "https://x-access-token:\$GITHUB_TOKEN@github.com/\$GITHUB_REPOSITORY.git"
else
  git remote add origin "https://github.com/\$GITHUB_REPOSITORY.git"
fi
git fetch --depth 1 origin $current_commit
git checkout FETCH_HEAD
# Quoted so the ref name is always handed to git as exactly one argument.
git checkout -b "\$REF_NAME"
source ci3/source
ci_log_id=\$CI_LOG_ID
log_ci_run \$ci_log_id
# Quoted so the credentials are not word-split or glob-expanded on the way to docker login.
if [ -n "\${DOCKERHUB_PASSWORD:-}" ]; then
  echo "\$DOCKERHUB_PASSWORD" | docker login -u "\$DOCKERHUB_USERNAME" --password-stdin
fi
# Heartbeat.
while true; do redis_cli SETEX hb-\$ci_log_id 60 1 &>/dev/null || true; sleep 30; done &
function run {
  echo "env: REF_NAME=\$REF_NAME COMMIT_HASH=\$COMMIT_HASH CURRENT_VERSION=\$CURRENT_VERSION TARGET_BRANCH=\$TARGET_BRANCH"
  if semver check "\$REF_NAME"; then
    echo "Performing a release because \$REF_NAME is a semver."
  fi
  # Background monitors. cache_log and add_timestamps both close inherited pipeline
  # fds internally, so these long-running processes won't block the outer pipeline.
  mpstat 2 2>&1 | cache_log cpufile &
  vmstat -w -S M 2 2>&1 | add_timestamps | cache_log memfile &
  bash -c "while true; do pstree -agl; echo; echo; sleep 10; done" 2>&1 | add_timestamps | cache_log processes &
  set +e
  set -x
  $cmd
  local code=\${PIPESTATUS[0]}
  set +x
  sudo dmesg 2>&1 | cache_log 'dmesg'
  return \$code
}
export -f run
set +e
$run_cmd
code=\${PIPESTATUS[0]}
# 155 means the run stopped for a spot termination and is not recorded as pass or fail.
case \$code in
  155) ;;
  0) log_ci_run \$ci_log_id PASSED ;;
  *)
    log_ci_run \$ci_log_id FAILED
    merge_train_failure_slack_notify \$ci_log_id
    release_canary_slack_notify \$ci_log_id
    ;;
esac
exit \$code
EOF
)
# CI_SSM_MODE: 1 when using SSM, 0 when using SSH.
ci_ssm_mode=$((1 - use_ssh))

# Shell-escape a single value (printf %q) so that, once pasted into the host
# script, the remote bash reads it back as exactly one literal word.
# Arguments: $1 - value to escape (may be empty; the result is then '').
# Outputs:   the escaped value on stdout, without a trailing newline.
_bootstrap_quote() {
  printf '%q' "$1"
}

# --- Host script ---
# This runs on the EC2 host (either via SSH or SSM).
#
# The heredoc is unquoted: plain $... expansions are resolved here, on the
# launching machine, and baked into the text, whereas \$... is left for the
# host to evaluate. Every baked-in value that comes from the environment goes
# through _bootstrap_quote, because the text is parsed again by a shell on the
# host: an unescaped space would split a docker argument and an unescaped
# metacharacter (for example in a ref name) would be executed there.
# Backslash-newline pairs inside the heredoc are removed while it is read, so
# the docker run command reaches the host as one long line.
host_script=$(
  cat <<EOF
set -euo pipefail
sudo sysctl fs.inotify.max_user_watches=1048576 &>/dev/null
sudo sysctl fs.inotify.max_user_instances=1048576 &>/dev/null
# Pin host processes to top CPU cores to keep benchmark cores clean.
# CPU layout: physical cores 0..N/2-1, hyperthreads N/2..N-1.
# OS gets top 8 physical cores + their hyperthread siblings.
# Skip dockerd so containers it spawns can use all CPUs for build/test.
total_cpus=\$(nproc)
total_physical=\$((total_cpus / 2))
os_start=\$((total_physical - 8))
os_ht_start=\$((total_cpus - 8))
os_cpu_list="\$os_start-\$((total_physical - 1)),\$os_ht_start-\$((total_cpus - 1))"
for pid in \$(ps -eo pid= 2>/dev/null); do
  comm=\$(cat /proc/\$pid/comm 2>/dev/null) || continue
  [ "\$comm" = "dockerd" ] && continue
  sudo taskset -apc "\$os_cpu_list" \$pid &>/dev/null || true
done
echo "Host processes pinned to OS CPUs: \$os_cpu_list"
echo "HOST: fetching EC2 metadata token..."
aws_token=\$(curl -sX PUT http://169.254.169.254/latest/api/token -H 'X-aws-ec2-metadata-token-ttl-seconds: 21600')
echo "HOST: metadata token acquired."
GOOGLE_APPLICATION_CREDENTIALS=\${GOOGLE_APPLICATION_CREDENTIALS:-/tmp/gcp-key.json}
NETWORK_ENV_FILE=\${NETWORK_ENV_FILE:-/tmp/network.env}
# Decode GCP service account key if provided (base64-encoded by the caller).
echo "HOST: decoding credentials..."
if [ -n "${GCP_SA_KEY_B64:-}" ]; then
  echo "${GCP_SA_KEY_B64:-}" | base64 -d > "\${GOOGLE_APPLICATION_CREDENTIALS}"
fi
# Decode network env file if provided.
if [ -n "${NETWORK_ENV_FILE_B64:-}" ]; then
  echo "${NETWORK_ENV_FILE_B64:-}" | base64 -d > "\${NETWORK_ENV_FILE}"
fi
# Decode SSH keys if provided (SSH mode only, empty in SSM mode).
if [ -n "${BUILD_INSTANCE_KEY_B64:-}" ]; then
  mkdir -p \$HOME/.ssh
  echo "${BUILD_INSTANCE_KEY_B64:-}" | base64 -d > \$HOME/.ssh/build_instance_key
  chmod 600 \$HOME/.ssh/build_instance_key
fi
if [ -n "${BUILD_INSTANCE_SSH_CONFIG_B64:-}" ]; then
  mkdir -p \$HOME/.ssh
  echo "${BUILD_INSTANCE_SSH_CONFIG_B64:-}" | base64 -d > \$HOME/.ssh/build_instance_ssh_config
fi
start_build() {
  echo "HOST: preparing devbox (uid/gid, docker run)..."
  local_uid=\$(id -u)
  local_gid=\$(id -g)
  docker run --privileged --rm \${docker_args:-} \
    --name aztec_build \
    --hostname $(_bootstrap_quote "$instance_name") \
    -v bootstrap_ci_local_docker:/var/lib/docker \
    -v bootstrap_ci_repo:/home/aztec-dev/aztec-packages \
    -v \$HOME/.ssh:/home/aztec-dev/.ssh:ro \
    -v \$HOME/.bb-crs:/home/aztec-dev/.bb-crs \
    -v /dev/kmsg:/dev/kmsg \
    -v /tmp:/tmp \
    -e GOOGLE_APPLICATION_CREDENTIALS="\${GOOGLE_APPLICATION_CREDENTIALS}" \
    -e NETWORK_ENV_FILE="\${NETWORK_ENV_FILE}" \
    -e CI=1 \
    -e CI_SSM_MODE=$ci_ssm_mode \
    -e CI_LOG_ID=$(_bootstrap_quote "${CI_LOG_ID:-}") \
    -e CI_LOGS_S3_LOCATION=$(_bootstrap_quote "${CI_LOGS_S3_LOCATION:-s3://aztec-ci-artifacts/logs}") \
    -e RUN_ID=$(_bootstrap_quote "${RUN_ID:-}") \
    -e JOB_ID=$(_bootstrap_quote "${JOB_ID:-}") \
    -e REF_NAME=$(_bootstrap_quote "${REF_NAME:-}") \
    -e TARGET_BRANCH=$(_bootstrap_quote "${TARGET_BRANCH:-}") \
    -e CI_DASHBOARD=$(_bootstrap_quote "${CI_DASHBOARD:-}") \
    -e PARENT_LOG_ID=$(_bootstrap_quote "${PARENT_LOG_ID:-}") \
    -e NO_CACHE=$(_bootstrap_quote "${NO_CACHE:-}") \
    -e NO_FAIL_FAST=$(_bootstrap_quote "${NO_FAIL_FAST:-}") \
    -e CI_USE_BUILD_INSTANCE_KEY=$(_bootstrap_quote "${CI_USE_BUILD_INSTANCE_KEY:-1}") \
    -e CI_REDIS='ci-redis-tiered.lzka0i.ng.0001.use2.cache.amazonaws.com' \
    -e SSH_CONNECTION=' ' \
    -e LOCAL_USER_ID=\$local_uid \
    -e LOCAL_GROUP_ID=\$local_gid \
    -e GCP_PROJECT_ID=$(_bootstrap_quote "${GCP_PROJECT_ID:-}") \
    -e EXTERNAL_ETHEREUM_HOSTS=$(_bootstrap_quote "${EXTERNAL_ETHEREUM_HOSTS:-}") \
    -e EXTERNAL_ETHEREUM_CONSENSUS_HOST=$(_bootstrap_quote "${EXTERNAL_ETHEREUM_CONSENSUS_HOST:-}") \
    -e EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY=$(_bootstrap_quote "${EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY:-}") \
    -e EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY_HEADER=$(_bootstrap_quote "${EXTERNAL_ETHEREUM_CONSENSUS_HOST_API_KEY_HEADER:-}") \
    -e L1_DEPLOYMENT_PRIVATE_KEY=$(_bootstrap_quote "${L1_DEPLOYMENT_PRIVATE_KEY:-}") \
    -e DOCKERHUB_PASSWORD=$(_bootstrap_quote "${DOCKERHUB_PASSWORD:-}") \
    -e DOCKERHUB_USERNAME=$(_bootstrap_quote "${DOCKERHUB_USERNAME:-}") \
    -e R2_ACCESS_KEY_ID=$(_bootstrap_quote "${R2_ACCESS_KEY_ID:-}") \
    -e R2_SECRET_ACCESS_KEY=$(_bootstrap_quote "${R2_SECRET_ACCESS_KEY:-}") \
    -e BUILD_SYSTEM_DEBUG=$(_bootstrap_quote "${BUILD_SYSTEM_DEBUG:-}") \
    -e GITHUB_TOKEN=$(_bootstrap_quote "${GITHUB_TOKEN:-}") \
    -e GITHUB_REPOSITORY=$(_bootstrap_quote "${GITHUB_REPOSITORY:-aztecprotocol/aztec-packages}") \
    -e PR_NUMBER=$(_bootstrap_quote "${PR_NUMBER:-}") \
    -e PR_HEAD_REF=$(_bootstrap_quote "${PR_HEAD_REF:-}") \
    -e PR_BASE_REF=$(_bootstrap_quote "${PR_BASE_REF:-}") \
    -e CHONK_INPUTS_STATE_DIR=$(_bootstrap_quote "${CHONK_INPUTS_STATE_DIR:-}") \
    -e NETLIFY_SITE_ID=$(_bootstrap_quote "${NETLIFY_SITE_ID:-}") \
    -e NETLIFY_AUTH_TOKEN=$(_bootstrap_quote "${NETLIFY_AUTH_TOKEN:-}") \
    -e NPM_TOKEN=$(_bootstrap_quote "${NPM_TOKEN:-}") \
    -e CARGO_REGISTRY_TOKEN=$(_bootstrap_quote "${CARGO_REGISTRY_TOKEN:-}") \
    -e SLACK_BOT_TOKEN=$(_bootstrap_quote "${SLACK_BOT_TOKEN:-}") \
    -e AZTEC_FOUNDATION_CI_SLACK_BOT_TOKEN=$(_bootstrap_quote "${AZTEC_FOUNDATION_CI_SLACK_BOT_TOKEN:-}") \
    -e SOCKET_SECURITY_API_TOKEN=$(_bootstrap_quote "${SOCKET_SECURITY_API_TOKEN:-}") \
    -e AWS_TOKEN="\$aws_token" \
    -e NAMESPACE=$(_bootstrap_quote "${NAMESPACE:-}") \
    -e NETWORK=$(_bootstrap_quote "${NETWORK:-}") \
    -e GITHUB_ACTOR=$(_bootstrap_quote "${GITHUB_ACTOR:-}") \
    -e EC2_INSTANCE_TYPE=$(_bootstrap_quote "${EC2_INSTANCE_TYPE:-unknown}") \
    -e EC2_SPOT=$(_bootstrap_quote "${EC2_SPOT:-unknown}") \
    -e AZTEC_TOOLCHAIN_DEFAULT_MAJOR_VERSION=$(_bootstrap_quote "${AZTEC_TOOLCHAIN_DEFAULT_MAJOR_VERSION:-}") \
    -e DRY_RUN=$(_bootstrap_quote "${DRY_RUN:-}") \
    --pids-limit=65536 \
    --shm-size=2g \
    --ulimit nofile=1048576:1048576 \
    aztecprotocol/devbox:3.0 bash -c $(_bootstrap_quote "$container_script")
}
# If stdout is a tty, run in foreground, otherwise run in background and handle spot termination notices.
if [ -t 1 ]; then
  docker_args+=' -ti'
  start_build
else
  echo "HOST: starting devbox container..."
  start_build &
  build_pid=\$!
  echo "HOST: devbox container launched (pid=\$build_pid). Monitoring for spot termination..."
  # While the docker container is running, check for spot termination notices.
  while kill -0 \$build_pid &>/dev/null; do
    # The check for the file allows for testing spot termination logic.
    if [ -f /tmp/spot_term ] || curl -fs -H 'X-aws-ec2-metadata-token: '\$aws_token http://169.254.169.254/latest/meta-data/spot/termination-time &>/dev/null; then
      echo 'Spot will be terminated! Exiting early.'
      docker kill aztec_build &>/dev/null || true
      exit 155
    fi
    sleep 5
  done
  # Returns exit code from docker run.
  wait \$build_pid
fi
EOF
)
# --- Transport: send host_script to the instance ---
# Either branch leaves the remote exit status in exit_code. errexit is switched
# off around the remote call so a failing build does not abort this script
# before the status has been recorded.
if [ "$use_ssh" -eq 1 ]; then
  echo_header "connect via SSH"

  # Runs the host script on the instance over SSH using the repo's ssh config.
  # ssh_args is intentionally left unquoted: it is either unset or "-t" and
  # must disappear entirely when empty. The config path and destination are
  # quoted so a checkout path containing spaces cannot split the arguments.
  function run_ssh {
    ssh ${ssh_args:-} -F "$ci3/aws/build_instance_ssh_config" "ubuntu@$ip" "$host_script"
  }

  set +e
  # If in terminal run in foreground.
  # If not, run in background so we can handle the signals in a timely fashion, and wait for it to finish.
  if [ -t 1 ]; then
    run_ssh
    exit_code=$?
  else
    echo "Stdout is not a tty, running in background..."
    run_ssh &
    wait "$!"
    exit_code=$?
  fi
  set -e
  echo "SSH exited with code: $exit_code"
else
  echo_header "invoke SSM command"
  # Send host script via SSM and poll until completion.
  # Run as ubuntu to match SSH behavior (SSM defaults to root).
  # Unless overridden, the poll timeout is AWS_SHUTDOWN_TIME * 60 + 600.
  set +e
  SSM_RUN_AS=ubuntu SSM_POLL_TIMEOUT=${SSM_POLL_TIMEOUT:-$((AWS_SHUTDOWN_TIME * 60 + 600))} \
    ssm_send_command "$iid" "$host_script"
  exit_code=$?
  set -e
fi
# If we were spot evicted, try again using on-demand.
# 155 is the status the host script exits with when it sees a spot termination notice.
if [ "$exit_code" -eq 155 ]; then
  echo "Spot was evicted. Retrying with on-demand instance."
  # Replace this process with a fresh run of the same script and arguments.
  NO_SPOT=1 exec "$0" "$@"
else
  # Print CI log URL at the end for easy access (avoid scrolling up).
  echo -e "CI Log: ${yellow}$log_url${reset}"

  # Post final GitHub commit status (best effort, like the pending status):
  # "success" only for exit code 0, "failure" for everything else.
  final_status=failure
  if [ "$exit_code" -eq 0 ]; then
    final_status=success
  fi
  post_github_status "$final_status" "ci/${JOB_ID}" "$log_url" "${cmd:0:140}" 2>/dev/null || true

  exit "$exit_code"
fi