#!/usr/bin/env bash
# Run the Composed-1 M5 multi-table DynamoDB Jepsen workload against a
# single-process two-group elastickv cluster.
#
# Why this script exists separately from run-jepsen-local.sh: the M5
# workload requires a multi-Raft-group cluster topology that the
# existing 3-node single-group layout cannot provide. Per the design
# doc (docs/design/2026_06_02_proposed_composed1_m5_jepsen_route_shuffle.md
# §3.3), today's `validateShardRanges` / `buildShardGroups` only
# support a "single process hosts all groups" model — separate
# processes per group fail validation or race on Raft listeners.
# So this script launches ONE process hosting BOTH single-member
# groups, with two Raft listeners (50051, 50054) and one shared
# DynamoDB endpoint (63801).
#
# Trade-off accepted: partition / kill nemeses can't isolate one
# group from the other since they share a process. Only the
# (future) route-shuffle nemesis exercises the cross-group path
# meaningfully under this topology. True distributed multi-group is
# M6+ work — see the parent design doc.
#
# Usage:
#   ./scripts/run-jepsen-m5-local.sh               # build + start + test
#   ./scripts/run-jepsen-m5-local.sh --no-rebuild  # skip go build
#   ./scripts/run-jepsen-m5-local.sh --no-cluster  # reuse running cluster
set -euo pipefail

# Repository root = parent of the directory holding this script. `--`
# keeps a path that begins with '-' from being parsed as an option.
REPO_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"

# Build outputs and run state. The paths are fixed (not mktemp) on
# purpose: --no-rebuild and --no-cluster rely on finding the artefacts
# of a previous invocation at the same location.
BINARY=/tmp/elastickv4-m5-binary
ROUTE_KEY_BIN=/tmp/elastickv4-m5-route-key
LIST_ROUTES_BIN=/tmp/elastickv4-m5-list-routes
DATA_DIR=/tmp/elastickv-m5-test-run
PID_FILE=/tmp/elastickv-m5-test-run.pid

# ---- topology: one process, two single-member Raft groups ----
RAFT_ADDR_G1="127.0.0.1:50051"
RAFT_ADDR_G2="127.0.0.1:50054"
DYNAMO_ADDR="127.0.0.1:63801"
PROC_ADDR="$RAFT_ADDR_G1" # the process's primary gRPC address
RAFT_GROUPS="1=${RAFT_ADDR_G1},2=${RAFT_ADDR_G2}"
RAFT_DYNAMO_MAP="${RAFT_ADDR_G1}=${DYNAMO_ADDR},${RAFT_ADDR_G2}=${DYNAMO_ADDR}"

# ---- command-line flags ----
# --no-rebuild : skip `go build`; reuse binaries from an earlier run.
# --no-cluster : do not start (or stop) a cluster; reuse a running one.
# Anything else is reported on stderr and otherwise ignored, so a
# mistyped flag is visible instead of silently selecting the default
# (build + fresh cluster) path.
NO_REBUILD=false
NO_CLUSTER=false
for arg in "$@"; do
  case "$arg" in
    --no-rebuild) NO_REBUILD=true ;;
    --no-cluster) NO_CLUSTER=true ;;
    *) echo "[warn] ignoring unrecognised argument: $arg" >&2 ;;
  esac
done

# ---- build (server + route-key + list-routes helpers) ----
# Skipped when --no-rebuild was given. Note that this leaves the
# working directory at $REPO_ROOT for the rest of the script.
if [[ "$NO_REBUILD" != true ]]; then
  # Pre-flight: cmd/elastickv-list-routes lands in PR #925. If this
  # branch is run before #925 merges, `go build` would emit an
  # opaque package-not-found error. Surface the cross-PR dependency
  # in a machine-readable way (claude[bot] suggestion on PR #924).
  if [ ! -d "$REPO_ROOT/cmd/elastickv-list-routes" ]; then
    echo "[error] cmd/elastickv-list-routes/ not found in this branch." >&2
    echo "        PR #924 depends on PR #925 (setup-hook + list-routes CLI)." >&2
    echo "        Merge #925 first, or check out the integrated branch." >&2
    exit 1
  fi
  echo "[build] compiling elastickv server..."
  cd "$REPO_ROOT"
  go build -o "$BINARY" .
  echo "[build] compiling elastickv-route-key helper..."
  go build -o "$ROUTE_KEY_BIN" ./cmd/elastickv-route-key
  echo "[build] compiling elastickv-list-routes helper..."
  # Used by the Jepsen workload's setup-hook verification
  # (verify-multi-group-routing!). Confirms the cluster booted with
  # >=2 distinct Raft groups before any workload op runs.
  go build -o "$LIST_ROUTES_BIN" ./cmd/elastickv-list-routes
  echo "[build] done -> $BINARY, $ROUTE_KEY_BIN, $LIST_ROUTES_BIN"
fi

# ---- compute --shardRanges boundary keys ----
# Multi-table workload uses tables jepsen_append_t{1..4}. Tables 1-2
# go to group 1, tables 3-4 to group 2. Boundary keys are the
# byte-for-byte route-key encoding of the table names — computed via
# the elastickv-route-key Go helper rather than inlined in shell so
# the base64 encoding stays in sync with adapter/dynamodb.go's
# encodeDynamoSegment (codex P1 #1 on PR #905 ffb9c73f).
#
# Guard: every helper binary must exist before continuing. Runs
# unconditionally — catches both --no-rebuild (helpers expected from
# a previous run) AND a fresh-build environment where a helper
# somehow produced a non-executable. Failing fast with a clear
# remediation message is strictly better than letting `set -e`
# swallow a misleading 'No such file or directory' deeper in the
# script (gemini medium + claude[bot] minor on PR #924).
for bin in "$ROUTE_KEY_BIN" "$LIST_ROUTES_BIN" "$BINARY"; do
  if [ ! -x "$bin" ]; then
    echo "[error] required binary missing or not executable: $bin" >&2
    echo "        Re-run without --no-rebuild to compile the helpers." >&2
    exit 1
  fi
done
T1_KEY="$("$ROUTE_KEY_BIN" jepsen_append_t1)"
T3_KEY="$("$ROUTE_KEY_BIN" jepsen_append_t3)"
# A helper that exits 0 but prints nothing would otherwise yield a
# malformed range string such as ":=1,:=2"; stop here instead.
if [ -z "$T1_KEY" ] || [ -z "$T3_KEY" ]; then
  echo "[error] $ROUTE_KEY_BIN printed an empty route key (t1='$T1_KEY', t3='$T3_KEY')." >&2
  exit 1
fi
# Group 1: [T1_KEY, T3_KEY) — tables 1, 2
# Group 2: [T3_KEY, +inf)   — tables 3, 4
# Keys outside [T1_KEY, +inf) fall through to the default group; this
# workload only writes table-route keys so that range is unused.
SHARD_RANGES="${T1_KEY}:${T3_KEY}=1,${T3_KEY}:=2"
echo "[shard-ranges] $SHARD_RANGES"

# ---- stop any previously managed cluster ----
# Terminates every process recorded in $PID_FILE (one PID per line),
# waits for each to exit, then deletes the file. A missing PID file is
# a silent no-op, so calling this more than once is safe.
# Globals: PID_FILE (read; the file is removed).
# Outputs: one progress line on stdout; a warning on stderr if a
#          process has to be force-killed.
# Returns: 0. Failure to signal a PID is ignored because the process
#          may already be gone.
stop_cluster() {
  local pid attempt
  [ -f "$PID_FILE" ] || return 0
  echo "[cluster] stopping previous cluster..."
  # `|| [ -n "$pid" ]` keeps a final entry that lacks a trailing newline.
  while IFS= read -r pid || [ -n "$pid" ]; do
    # Skip blank or non-numeric lines rather than handing them to kill.
    case "$pid" in
      '' | *[!0-9]*) continue ;;
    esac
    kill "$pid" 2>/dev/null || continue
    # kill only delivers the signal. Poll (bounded, ~5s) until the
    # process is really gone so its listener ports are free before a
    # new cluster is launched.
    attempt=0
    while kill -0 "$pid" 2>/dev/null; do
      if [ "$attempt" -ge 50 ]; then
        echo "[cluster] pid $pid ignored SIGTERM; sending SIGKILL" >&2
        kill -9 "$pid" 2>/dev/null || true
        break
      fi
      attempt=$((attempt + 1))
      sleep 0.1
    done
  done < "$PID_FILE"
  rm -f "$PID_FILE"
}

# ---- start cluster: ONE process hosting both groups ----
if [[ "$NO_CLUSTER" != true ]]; then
  # Install the cleanup hook BEFORE starting the cluster so an
  # exception during launch (e.g. bind-port collision) still
  # tears down the half-started state. Without this the failure
  # path leaks background processes that hold the Raft / Dynamo
  # ports for the next run (gemini medium on PR #924).
  #
  # Cleanup lives on EXIT only. INT/TERM (user Ctrl-C, CI
  # cancellation) are turned into an `exit`, which in turn fires
  # the EXIT trap. A handler that merely ran stop_cluster would
  # return and let the script carry on after the signal.
  trap stop_cluster EXIT
  trap 'exit 130' INT
  trap 'exit 143' TERM
  stop_cluster
  rm -rf "${DATA_DIR:?}"
  mkdir -p "$DATA_DIR"
  : > "$PID_FILE"

  echo "[cluster] starting single-process two-group cluster..."
  # Notes on flag selection:
  #   --raftBootstrap : boolean; each group is single-member so no
  #                     peer discovery is needed. --raftBootstrapMembers
  #                     is rejected by resolveBootstrapServers on any
  #                     multi-group process (main.go:735-741) and so
  #                     MUST NOT appear here (codex P2 + claude[bot]
  #                     P2 on PR #905 3ca2a7f7).
  #   --raftGroups    : declares both groups with distinct Raft
  #                     listeners.
  #   --shardRanges   : places t1-t2 in group 1 and t3-t4 in group 2.
  #                     Both flags are required for the multi-group
  #                     contract: --shardRanges alone collapses
  #                     everything to the default group 1
  #                     (coderabbit Major on PR #905 f92a029e).
  #   --raftDynamoMap : both Raft addresses point at the same Dynamo
  #                     endpoint since there's only one process.
  nohup "$BINARY" \
    --address "$PROC_ADDR" \
    --dynamoAddress "$DYNAMO_ADDR" \
    --redisAddress "" \
    --s3Address "" \
    --sqsAddress "" \
    --metricsAddress "" \
    --pprofAddress "" \
    --raftId "n1" \
    --raftDataDir "${DATA_DIR}/n1" \
    --raftBootstrap \
    --raftGroups "$RAFT_GROUPS" \
    --shardRanges "$SHARD_RANGES" \
    --raftDynamoMap "$RAFT_DYNAMO_MAP" \
    > "${DATA_DIR}/n1.log" 2>&1 &
  cluster_pid=$!
  echo "$cluster_pid" >> "$PID_FILE"

  # Probe the address the server was actually told to bind, derived
  # from DYNAMO_ADDR, so the two cannot drift apart.
  dynamo_host="${DYNAMO_ADDR%:*}"
  dynamo_port="${DYNAMO_ADDR##*:}"
  echo "[cluster] waiting for Dynamo endpoint ($DYNAMO_ADDR)..."
  cluster_up=false
  for ((i = 1; i <= 90; i++)); do
    # Use bash's built-in /dev/tcp probe rather than `nc` so the
    # script runs on minimal CI images that may not ship netcat
    # (gemini medium on PR #924).
    if (echo > "/dev/tcp/${dynamo_host}/${dynamo_port}") >/dev/null 2>&1; then
      echo "[cluster] up after ${i}s"
      cluster_up=true
      break
    fi
    # Stop waiting as soon as the server process itself has died.
    if ! kill -0 "$cluster_pid" 2>/dev/null; then
      echo "[cluster] server process $cluster_pid exited during startup" >&2
      break
    fi
    sleep 1
  done
  if [[ "$cluster_up" != true ]]; then
    echo "[cluster] FAILED to start - dumping log:" >&2
    tail -n 100 "${DATA_DIR}/n1.log" >&2 || true
    exit 1
  fi
fi

# ---- run M5 Jepsen multi-table workload ----
cd "$REPO_ROOT/jepsen"

# Resolve lein: prefer LEIN env override, then PATH (works on CI), then
# the macOS Homebrew default. Failing to find lein is fatal.
LEIN_BIN="${LEIN:-$(command -v lein || echo /opt/homebrew/bin/lein)}"
if [ ! -x "$LEIN_BIN" ]; then
  echo "[jepsen] lein not found (tried \$LEIN, PATH, /opt/homebrew/bin/lein)" >&2
  exit 127
fi

echo "[jepsen] running DynamoDB multi-table list-append workload via $LEIN_BIN..."
mkdir -p tmp-home .lein
# Start from a known value: an EXIT_CODE inherited from the caller's
# environment must not decide this script's exit status.
EXIT_CODE=0
# --list-routes-bin / --grpc-host-port wire the setup-hook verification
# (verify-multi-group-routing!) at the workload's first setup! call.
# Without them the hook falls back to PATH lookup which fails when
# run from this script's tmp build.
# --dynamo-port is taken from DYNAMO_ADDR so the workload always talks
# to the endpoint configured for the cluster.
# `|| EXIT_CODE=$?` records a failing run without tripping `set -e`.
HOME="${PWD}/tmp-home" LEIN_HOME="${PWD}/.lein" \
  LEIN_JVM_OPTS="-Duser.home=${PWD}/tmp-home" \
  "$LEIN_BIN" run -m elastickv.dynamodb-multi-table-workload \
  --local \
  --time-limit 30 \
  --rate 5 \
  --dynamo-port "${DYNAMO_ADDR##*:}" \
  --list-routes-bin "$LIST_ROUTES_BIN" \
  --grpc-host-port "$PROC_ADDR" \
  || EXIT_CODE=$?

# ---- teardown ----
# Cluster shutdown is handled by the EXIT trap installed before the
# cluster launch (only when this script started the cluster). No
# explicit teardown call is needed here; stop_cluster is idempotent,
# so a second call would be harmless but noisy, and the EXIT trap path
# is the canonical one — see gemini medium on PR #924.

exit "$EXIT_CODE"