Skip to content

Commit fbfbca2

Browse files
author
AztecBot
committed
Merge branch 'v5-next' into merge-train/spartan-v5
2 parents b5830ee + 0ed7f79 commit fbfbca2

9 files changed

Lines changed: 787 additions & 3 deletions

File tree

ci3/bootstrap_ec2

Lines changed: 31 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -259,6 +259,36 @@ sudo systemctl mask --now apt-daily.timer apt-daily-upgrade.timer apt-daily.serv
259259
sudo sysctl fs.inotify.max_user_watches=1048576 &>/dev/null
260260
sudo sysctl fs.inotify.max_user_instances=1048576 &>/dev/null
261261
262+
# DNS caching. CI's massively parallel jobs resolve the same handful of hosts (S3,
263+
# Docker Hub, npm, cargo, github) thousands of times. By default every lookup — from
264+
# the devbox container and nested docker-in-docker alike — goes straight to the VPC
265+
# resolver, which AWS caps at ~1024 pps per ENI for link-local services; over that,
266+
# packets are silently dropped and surface as "could not resolve host".
267+
# Route container lookups through the host's (caching) systemd-resolved instead, by
268+
# exposing its stub on the instance's primary private IP — the one address reachable
269+
# from both the devbox container and nested dind — and pointing containers at it via
270+
# --dns on docker run below. priv_ip is cleared (and --dns omitted, leaving container
271+
# DNS unchanged) if any step fails, so containers are never pointed at a dead cache.
272+
priv_ip=\$(ip -4 route get 169.254.169.253 2>/dev/null | awk '{for(i=1;i<=NF;i++) if(\$i=="src"){print \$(i+1); exit}}' || true)
273+
if [ -n "\$priv_ip" ] && systemctl is-active --quiet systemd-resolved \
274+
&& echo "DNSStubListenerExtra=\$priv_ip" | sudo tee -a /etc/systemd/resolved.conf >/dev/null \
275+
&& sudo systemctl restart systemd-resolved; then
276+
# Only route containers to the cache once it's actually listening on priv_ip.
277+
for _ in 1 2 3 4 5; do
278+
sudo ss -lnu "sport = :53" 2>/dev/null | grep -qF "\$priv_ip:53" && { dns_ready=1; break; }
279+
sleep 0.5
280+
done
281+
if [ "\${dns_ready:-0}" = 1 ]; then
282+
echo "HOST: DNS cache active on \$priv_ip (systemd-resolved)."
283+
else
284+
echo "HOST: DNS cache failed to bind \$priv_ip; using default resolver."
285+
priv_ip=
286+
fi
287+
else
288+
echo "HOST: DNS cache not configured; using default resolver."
289+
priv_ip=
290+
fi
291+
262292
# Pin host processes to top CPU cores to keep benchmark cores clean.
263293
# CPU layout: physical cores 0..N/2-1, hyperthreads N/2..N-1.
264294
# OS gets top 8 physical cores + their hyperthread siblings.
@@ -309,7 +339,7 @@ start_build() {
309339
local_uid=\$(id -u)
310340
local_gid=\$(id -g)
311341
312-
docker run --privileged --rm \${docker_args:-} \
342+
docker run --privileged --rm \${docker_args:-} \${priv_ip:+--dns \$priv_ip} \
313343
--name aztec_build \
314344
--hostname $docker_hostname \
315345
-v bootstrap_ci_local_docker:/var/lib/docker \

spartan/metrics/grafana/alerts/contactpoints.yaml

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,30 @@ contactPoints:
3636
{{ template "aztec.slack.by_namespace" . }}
3737
disableResolveMessage: false
3838

39+
- orgId: 1
40+
name: "Slack #alerts-staging-public by namespace"
41+
receivers:
42+
- uid: alertsstagingpublicbynamespace
43+
type: slack
44+
settings:
45+
url: $SLACK_WEBHOOK_STAGING_PUBLIC_URL
46+
mentionUsers: $SLACK_ALERT_MENTION_USER_IDS
47+
text: |-
48+
{{ template "aztec.slack.by_namespace" . }}
49+
disableResolveMessage: false
50+
51+
- orgId: 1
52+
name: "Slack #alerts-staging-public by network"
53+
receivers:
54+
- uid: alertsstagingpublicbynetwork
55+
type: slack
56+
settings:
57+
url: $SLACK_WEBHOOK_STAGING_PUBLIC_URL
58+
mentionUsers: $SLACK_ALERT_MENTION_USER_IDS
59+
text: |-
60+
{{ template "aztec.slack.by_network" . }}
61+
disableResolveMessage: false
62+
3963
- orgId: 1
4064
name: "Slack #alerts-testnet by namespace"
4165
receivers:

0 commit comments

Comments
 (0)