Skip to content

Commit 3e75de1

Browse files
committed
Docker antithesis - split into node and driver containers
1 parent cd2e422 commit 3e75de1

6 files changed

Lines changed: 234 additions & 20 deletions

File tree

.github/regression.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,8 @@ elif is_truthy "${CI_BYRON_CLUSTER:-}"; then
7575
export TESTNET_VARIANT="${CLUSTER_ERA:-conway}_slow"
7676
fi
7777

78-
export CARDANO_NODE_SOCKET_PATH_CI="$WORKDIR/state-cluster0/bft1.socket"
78+
CARDANO_NODE_SOCKET_PATH_CI="${CARDANO_NODE_SOCKET_PATH_CI:-$WORKDIR/state-cluster0/bft1.socket}"
79+
export CARDANO_NODE_SOCKET_PATH_CI
7980

8081
# assume we run tests on testnet when `BOOTSTRAP_DIR` is set
8182
if [ -n "${BOOTSTRAP_DIR:-}" ]; then

docker/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,9 +67,10 @@ RUN nix develop --accept-flake-config .#testenv --command \
6767
# paths are cached and the regression.sh shebang resolves offline.
6868
RUN nix develop --accept-flake-config .#base --command true
6969

70-
# Create the Antithesis test driver directory and install the entry-point.
70+
# Create the Antithesis test driver directory and install the entry-points.
7171
# singleton_driver_* files are run once per test run by Antithesis.
7272
RUN mkdir -p /opt/antithesis/test/v1/quickstart && \
7373
cp /work/docker/antithesis_run.sh \
7474
/opt/antithesis/test/v1/quickstart/singleton_driver_regression.sh && \
75-
chmod +x /opt/antithesis/test/v1/quickstart/singleton_driver_regression.sh
75+
chmod +x /opt/antithesis/test/v1/quickstart/singleton_driver_regression.sh && \
76+
chmod +x /work/docker/node_run.sh

docker/README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@ dependencies are baked into the image at build time:
2828
- `Dockerfile.config` — builds the Antithesis config image (`FROM scratch`)
2929
containing only `docker-compose.yaml`.
3030

31-
- `docker-compose.yaml` — single `driver` service.
31+
- `docker-compose.yaml` — two services: `node` (cardano-node cluster) and
32+
`driver` (pytest). Both share a `cluster-state` Docker volume so the
33+
driver accesses the node sockets without going over the network. An HTTP
34+
health check on port 8090 provides cross-container traffic that satisfies
35+
the Antithesis "Containers joined the Antithesis network" property.
3236

3337
## Workflow
3438

docker/antithesis_run.sh

Lines changed: 67 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,27 @@
11
#!/usr/bin/env bash
2-
# Antithesis entrypoint for cardano-node-tests.
2+
# Antithesis driver container entrypoint.
33
#
44
# Runs the full test suite without any network access by:
55
# 1. Forcing nix into offline mode (all store paths were pre-built into
66
# the image by docker/Dockerfile).
77
# 2. Pointing regression.sh at the pre-built cardano binaries and Python
88
# venv so it skips all download / build steps.
9-
# 3. Emitting the Antithesis `setup_complete` lifecycle signal before
10-
# starting pytest.
9+
# 3. When NODE_HOST is set (multi-container mode): waiting for the node
10+
# container's health check on port 8090 before running tests, and
11+
# setting DEV_CLUSTER_RUNNING=1 so pytest uses the pre-running cluster
12+
# instead of starting its own.
13+
# 4. Emitting the Antithesis setup_complete lifecycle signal.
14+
# 5. Handing off to regression.sh.
15+
#
16+
# Multi-container environment variables (set in docker-compose):
17+
# NODE_HOST Hostname of the node container (default: unset).
18+
# NODE_PORT Health check port on the node container (default: 8090).
19+
# CLUSTER_STATE_DIR Mount point of the shared cluster-state volume
20+
# (default: /cluster-state).
1121
#
1222
# This file is installed at:
1323
# /opt/antithesis/test/v1/quickstart/singleton_driver_regression.sh
14-
# and is also usable directly as the docker-compose `command`.
24+
# and is also usable directly as the docker-compose command.
1525

1626
set -Eeuo pipefail
1727

@@ -29,20 +39,66 @@ echo "offline = true" >> /etc/nix/nix.conf
2939
export CARDANO_PREBUILT_DIR=/opt/cardano
3040
export _VENV_DIR=/opt/tests-venv
3141

32-
# ---------------------------------------------------------------------------
33-
# 3. Emit the Antithesis setup_complete signal.
34-
# Written as JSONL to $ANTITHESIS_OUTPUT_DIR/sdk.jsonl.
35-
# Antithesis begins fault injection / test orchestration after receiving
36-
# this message.
37-
# ---------------------------------------------------------------------------
3842
_output_dir="${ANTITHESIS_OUTPUT_DIR:-/tmp/antithesis}"
3943
mkdir -p "$_output_dir"
44+
45+
# ---------------------------------------------------------------------------
46+
# 3. Multi-container mode: wait for the node container and configure the
47+
# driver to use the pre-running cluster.
48+
#
49+
# When NODE_HOST is set the driver polls the node's HTTP health endpoint
50+
# (port 8090) until it responds "ready". This HTTP traffic is what makes
51+
# both containers visible on the Antithesis network bridge.
52+
#
53+
# DEV_CLUSTER_RUNNING=1 tells pytest to skip cluster startup/shutdown and
54+
# use the cluster already started by the node container.
55+
# CARDANO_NODE_SOCKET_PATH_CI is pre-set to the shared volume socket path
56+
# so regression.sh does not override it with its default workdir path.
57+
# ---------------------------------------------------------------------------
58+
#######################################
# Poll the node container's HTTP health endpoint until it answers "ready".
#
# The host and port are handed to Python as arguments (never spliced into the
# Python source), and a non-2xx answer is reported as "<code> <body>" so the
# progress log shows e.g. '503 starting' instead of 'no response'.
#
# Arguments:
#   $1 - host name of the node container
#   $2 - health check port
#   $3 - maximum number of attempts (default: 120)
#   $4 - seconds to sleep after a failed attempt (default: 5)
# Outputs:
#   One progress line per failed attempt on stdout.
# Returns:
#   0 as soon as a successful response body is exactly "ready",
#   1 when all attempts are exhausted.
#######################################
_wait_for_node() {
  local host=$1
  local port=$2
  local max_attempts=${3:-120}
  local interval=${4:-5}
  local attempt resp

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # Best-effort probe: connection errors are expected while the node
    # container is still starting, so stderr is dropped and failures yield
    # an empty response.
    resp="$(python3 -c '
import sys, urllib.error, urllib.request
url = "http://%s:%s/" % (sys.argv[1], sys.argv[2])
try:
    r = urllib.request.urlopen(url, timeout=5)
    sys.stdout.write(r.read().decode())
except urllib.error.HTTPError as e:
    # Prefix with the status code so an error body can never equal "ready".
    sys.stdout.write("%d %s" % (e.code, e.read().decode()))
except Exception:
    pass
' "$host" "$port" 2>/dev/null || true)"

    if [ "$resp" = "ready" ]; then
      return 0
    fi
    echo "  attempt ${attempt}/${max_attempts}: node reports '${resp:-no response}', retrying in ${interval}s..."
    sleep "$interval"
  done

  return 1
}

if [ -n "${NODE_HOST:-}" ]; then
  _node_port="${NODE_PORT:-8090}"
  echo "Waiting for ${NODE_HOST}:${_node_port} to report ready..."

  # 120 attempts, each up to 5s of request timeout plus a 5s pause.
  if ! _wait_for_node "$NODE_HOST" "$_node_port" 120 5; then
    echo "ERROR: node container did not become ready after 120 attempts" >&2
    exit 1
  fi
  echo "Node is ready."

  CLUSTER_STATE_DIR="${CLUSTER_STATE_DIR:-/cluster-state}"
  export DEV_CLUSTER_RUNNING=1
  export CLUSTERS_COUNT="${CLUSTERS_COUNT:-1}"
  # Pre-set so regression.sh does not overwrite with its default workdir path.
  export CARDANO_NODE_SOCKET_PATH_CI="${CLUSTER_STATE_DIR}/state-cluster0/bft1.socket"
fi
92+
93+
# ---------------------------------------------------------------------------
94+
# 4. Emit the Antithesis setup_complete signal.
95+
# ---------------------------------------------------------------------------
4096
printf '{"antithesis_setup": {"status": "complete", "details": {"info": ["cardano-node-tests driver ready, node_rev=%s"]}}}\n' \
4197
"${BAKED_NODE_REV:-unknown}" >> "$_output_dir/sdk.jsonl"
4298
unset _output_dir
4399

44100
# ---------------------------------------------------------------------------
45-
# 4. Hand off to regression.sh. The shebang in that script will invoke
101+
# 5. Hand off to regression.sh. The shebang in that script will invoke
46102
# `nix develop .#base` which now resolves entirely from the local nix
47103
# store (offline = true).
48104
#

docker/docker-compose.yaml

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,18 @@
11
# Docker Compose for Antithesis test submission.
22
#
3-
# The driver image must be pre-built with all cardano binaries and the Python
4-
# venv baked in (see docker/Dockerfile). No internet access is available at
3+
# Two services share a cluster-state volume:
4+
#
5+
# node — starts the cardano-node cluster (system under test).
6+
# Serves a health check on port 8090 so the driver can detect
7+
# when the cluster is ready. The traffic between driver and node
8+
# over the antithesis-net bridge satisfies the Antithesis
9+
# "Containers joined the Antithesis network" property.
10+
#
11+
# driver — waits for the node health check, then runs the pytest test
12+
# suite against the pre-running cluster via DEV_CLUSTER_RUNNING=1.
13+
#
14+
# Both images must be pre-built with all cardano binaries and the Python venv
15+
# baked in (see docker/Dockerfile). No internet access is available at
516
# runtime inside the Antithesis environment.
617
#
718
# Push images to the Antithesis registry before submitting:
@@ -17,24 +28,59 @@ networks:
1728
antithesis-net:
1829
driver: bridge
1930

31+
volumes:
32+
cluster-state:
33+
2034
services:
35+
node:
36+
image: ghcr.io/saratomaz/cardano-node-tests-antithesis:latest
37+
build:
38+
context: ..
39+
dockerfile: docker/Dockerfile
40+
command: ["/work/docker/node_run.sh"]
41+
networks:
42+
- antithesis-net
43+
volumes:
44+
- cluster-state:/cluster-state
45+
environment:
46+
- CLUSTER_STATE_DIR=/cluster-state
47+
- TESTNET_VARIANT=${TESTNET_VARIANT:-conway_fast}
48+
healthcheck:
49+
test:
50+
- "CMD"
51+
- "python3"
52+
- "-c"
53+
- "import urllib.request; exit(0 if urllib.request.urlopen('http://localhost:8090/', timeout=5).read() == b'ready' else 1)"
54+
interval: 15s
55+
timeout: 6s
56+
retries: 60
57+
start_period: 60s
58+
2159
driver:
2260
image: ghcr.io/saratomaz/cardano-node-tests-antithesis:latest
2361
build:
2462
context: ..
2563
dockerfile: docker/Dockerfile
26-
# antithesis_run.sh sets nix offline, exports pre-built paths, emits
27-
# setup_complete, then hands off to regression.sh.
64+
# antithesis_run.sh sets nix offline, waits for the node health check,
65+
# exports DEV_CLUSTER_RUNNING=1, emits setup_complete, then hands off
66+
# to regression.sh.
2867
command: ["/work/docker/antithesis_run.sh"]
2968
networks:
3069
- antithesis-net
70+
depends_on:
71+
- node
72+
volumes:
73+
- cluster-state:/cluster-state
3174
environment:
75+
- CLUSTER_STATE_DIR=/cluster-state
76+
- NODE_HOST=node
77+
- NODE_PORT=8090
3278
# NODE_REV is baked into the image at build time; do not override here.
3379
- CARDANO_CLI_REV=${CARDANO_CLI_REV:-}
3480
- DBSYNC_REV=${DBSYNC_REV:-}
3581
- RUN_TARGET=${RUN_TARGET:-tests}
3682
- MARKEXPR=${MARKEXPR:-}
37-
- CLUSTERS_COUNT=${CLUSTERS_COUNT:-}
83+
- CLUSTERS_COUNT=${CLUSTERS_COUNT:-1}
3884
- CLUSTER_ERA=${CLUSTER_ERA:-}
3985
- PROTOCOL_VERSION=${PROTOCOL_VERSION:-}
4086
- UTXO_BACKEND=${UTXO_BACKEND:-}

docker/node_run.sh

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#!/usr/bin/env bash
# Antithesis node container entrypoint.
#
# 1. Starts the cardano-node cluster on the shared 'cluster-state' volume so
#    the driver container can reach the node sockets without going over the
#    network (Unix socket on a shared Docker volume).
# 2. Serves a lightweight HTTP health check on port 8090 over the Antithesis
#    network bridge. Returns "ready" once the cluster socket exists.
#    This cross-container HTTP traffic satisfies the Antithesis
#    "Containers joined the Antithesis network" property.
#
# Environment variables:
#   CLUSTER_STATE_DIR      Mount point of the shared cluster-state volume
#                          (default: /cluster-state).
#   TESTNET_VARIANT        Cluster variant passed to prepare_cluster_scripts
#                          (default: conway_fast).
#   ANTITHESIS_OUTPUT_DIR  Directory that receives sdk.jsonl
#                          (default: /tmp/antithesis).

set -Eeuo pipefail

# ---------------------------------------------------------------------------
# 1. Force nix offline — all store paths are pre-built into the image.
#    Only append when the line is missing, so a restarted container does not
#    keep growing nix.conf.
# ---------------------------------------------------------------------------
if ! grep -qxF 'offline = true' /etc/nix/nix.conf 2>/dev/null; then
  echo "offline = true" >> /etc/nix/nix.conf
fi

# ---------------------------------------------------------------------------
# 2. Point at pre-built binaries and Python venv.
# ---------------------------------------------------------------------------
export CARDANO_PREBUILT_DIR=/opt/cardano
export _VENV_DIR=/opt/tests-venv
_PATH_PREPEND="/opt/cardano/cardano-node/bin:/opt/cardano/cardano-submit-api/bin:/opt/cardano/cardano-cli/bin:/opt/cardano/bech32/bin"

# ---------------------------------------------------------------------------
# 3. Cluster state lives on the shared volume so the driver can read sockets.
# ---------------------------------------------------------------------------
CLUSTER_STATE_DIR="${CLUSTER_STATE_DIR:-/cluster-state}"
_INSTANCE_NUM=0
_STATE_CLUSTER="${CLUSTER_STATE_DIR}/state-cluster${_INSTANCE_NUM}"
_SCRIPTS_DEST="${CLUSTER_STATE_DIR}/startup_scripts"

# Local clusters (conway_fast, etc.) use bft1.socket.
export CARDANO_NODE_SOCKET_PATH="${_STATE_CLUSTER}/bft1.socket"

_output_dir="${ANTITHESIS_OUTPUT_DIR:-/tmp/antithesis}"
mkdir -p "$_output_dir" "${CLUSTER_STATE_DIR}"

# ---------------------------------------------------------------------------
# 4. Health check server on port 8090 (Antithesis network bridge traffic).
#    Returns HTTP 200 "ready" once the cluster socket file exists,
#    503 "starting" while the cluster is still coming up.
#
#    Uses the standard-library HTTP server so each request is fully read
#    before the reply is sent, and so an error on one connection is handled
#    per request instead of terminating the server.
#    NOTE(review): readiness is "socket file exists"; a socket left on the
#    volume by an earlier run would also count — confirm that is acceptable.
# ---------------------------------------------------------------------------
python3 - <<'PY' &
import os
from http.server import BaseHTTPRequestHandler, HTTPServer

_sock_path = os.environ.get('CARDANO_NODE_SOCKET_PATH', '')


class _Health(BaseHTTPRequestHandler):
    # Drop clients that connect but never send a request.
    timeout = 5

    def do_GET(self):
        ready = os.path.exists(_sock_path)
        body = b'ready' if ready else b'starting'
        self.send_response(200 if ready else 503)
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, *_args):
        # Keep the container log free of one line per poll.
        pass


HTTPServer(('0.0.0.0', 8090), _Health).serve_forever()
PY
_health_pid=$!
trap 'kill "$_health_pid" 2>/dev/null || true' EXIT

# ---------------------------------------------------------------------------
# 5. Prepare cluster startup scripts and run the cluster.
#    We enter the nix testenv shell to ensure jq, postgres, and other cluster
#    tools are available. The inner script uses the pre-built venv so no
#    network access is needed.
#
#    The inner script is a double-quoted string: ${...} is expanded HERE by
#    the outer shell, while \${...} is left for the inner shell. PATH must be
#    extended by the inner shell (after the venv is activated), so its
#    expansion is escaped and placed in inner double quotes; inside inner
#    single quotes it would stay a literal '${PATH}' and hide every tool.
# ---------------------------------------------------------------------------
_testnet_variant="${TESTNET_VARIANT:-conway_fast}"

# Capture the exit status instead of letting errexit abort the script.
set +e
nix develop --accept-flake-config .#testenv --command bash -c "
  set -euo pipefail
  . '${_VENV_DIR}/bin/activate'
  export PATH='${_PATH_PREPEND}':\"\${PATH}\"
  export CARDANO_NODE_SOCKET_PATH='${CARDANO_NODE_SOCKET_PATH}'

  # Instantiate cluster scripts for instance ${_INSTANCE_NUM} into the
  # shared volume.  --clean removes any previous attempt.
  python -m cardano_node_tests.prepare_cluster_scripts \
    --dest-dir '${_SCRIPTS_DEST}' \
    --testnet-variant '${_testnet_variant}' \
    --instance-num ${_INSTANCE_NUM} \
    --clean

  # start-cluster must run from the parent of the state-cluster directory.
  cd '${CLUSTER_STATE_DIR}'
  '${_SCRIPTS_DEST}/start-cluster'

  printf '{\"antithesis_setup\": {\"status\": \"complete\", \"details\": {\"info\": [\"cardano-node cluster ready, socket=%s\"]}}}\n' \
    '${CARDANO_NODE_SOCKET_PATH}' >> '${_output_dir}/sdk.jsonl'

  # Keep the cluster alive until the container is stopped.
  tail -f /dev/null
"
_rc=$?
set -e

# NOTE(review): the inner status is only logged; the container itself always
# exits 0. Kept as written — confirm that masking a failed startup is intended.
echo "node_run.sh exiting with code ${_rc}"
exit 0

0 commit comments

Comments
 (0)