# Skip to content

# backup: snapshot_reader (Phase 0a foundation for snapshot-decode binary) #3520

# backup: snapshot_reader (Phase 0a foundation for snapshot-decode binary)

# backup: snapshot_reader (Phase 0a foundation for snapshot-decode binary) #3520

# Workflow file for this run

# Jepsen Test: builds the elastickv binary, starts a three-node cluster on
# loopback, and runs short Jepsen workloads (Redis, DynamoDB, DynamoDB
# per-type, S3, SQS HT-FIFO) against it on every push and on manual dispatch.
on:
  push:
  workflow_dispatch:

# Runs for the same workflow and ref share one concurrency group.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-jepsen-test

name: Jepsen Test

permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    env:
      # Job-wide Go build cache location; created by the build step below.
      GOCACHE: /tmp/go-build
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: recursive

      - uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: '21'

      - uses: actions/setup-go@v6
        with:
          go-version-file: 'go.mod'

      - name: Install netcat and graphviz
        run: sudo apt-get update && sudo apt-get install -y netcat-openbsd graphviz

      - name: Install Leiningen
        run: |
          # --fail makes an HTTP error abort the step instead of saving the
          # error page as the lein script; --retry covers transient failures.
          curl -fsSL --retry 3 https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein -o ~/lein
          chmod +x ~/lein
          ~/lein version

      # See jepsen-test-scheduled.yml for the rationale: Maven Central
      # 429s during peak hours have been knocking the scheduled stress
      # run out, and the per-push run uses the same dependency set.
      - name: Cache Maven and Leiningen artifacts
        uses: actions/cache@v5
        with:
          path: |
            ~/.m2/repository
            ~/.lein
          key: ${{ runner.os }}-maven-${{ hashFiles('jepsen/project.clj') }}
          restore-keys: |
            ${{ runner.os }}-maven-

      - name: Pre-fetch Go modules and build binary
        run: |
          mkdir -p "$GOCACHE" /tmp/go-tmp
          export GOCACHE GOTMPDIR=/tmp/go-tmp
          go mod download
          go build -o /tmp/elastickv-bin .

      - name: Warm Leiningen Maven cache
        working-directory: jepsen
        run: |
          # Up to 3 attempts of `lein deps`, sleeping 30s and then 60s
          # between attempts; the step fails after the third failure.
          set -uo pipefail
          n=0
          max=3
          until ~/lein deps; do
            n=$((n + 1))
            if [ "$n" -ge "$max" ]; then
              echo "lein deps failed after $n attempts" >&2
              exit 1
            fi
            sleep_secs=$((n * 30))
            echo "lein deps failed (attempt $n/$max), retrying in ${sleep_secs}s..." >&2
            sleep "$sleep_secs"
          done

      - name: Run Jepsen unit tests
        working-directory: jepsen
        run: ~/lein test

      - name: Launch etcd-backed cluster
        run: |
          set -euo pipefail
          mkdir -p /tmp/elastickv-ci
          BOOTSTRAP_MEMBERS="n1=127.0.0.1:50051,n2=127.0.0.1:50052,n3=127.0.0.1:50053"
          RAFT_REDIS_MAP="127.0.0.1:50051=127.0.0.1:63791,127.0.0.1:50052=127.0.0.1:63792,127.0.0.1:50053=127.0.0.1:63793"
          RAFT_S3_MAP="127.0.0.1:50051=127.0.0.1:63901,127.0.0.1:50052=127.0.0.1:63902,127.0.0.1:50053=127.0.0.1:63903"
          RAFT_DYNAMO_MAP="127.0.0.1:50051=127.0.0.1:63801,127.0.0.1:50052=127.0.0.1:63802,127.0.0.1:50053=127.0.0.1:63803"
          RAFT_SQS_MAP="127.0.0.1:50051=127.0.0.1:63501,127.0.0.1:50052=127.0.0.1:63502,127.0.0.1:50053=127.0.0.1:63503"
          # Truncate the PID file; each node's PID is appended below so the
          # "Stop demo cluster" step can terminate the processes.
          : > /tmp/elastickv-demo.pid
          for node in 1 2 3; do
            nohup /tmp/elastickv-bin \
              --address "127.0.0.1:5005${node}" \
              --redisAddress "127.0.0.1:6379${node}" \
              --dynamoAddress "127.0.0.1:6380${node}" \
              --s3Address "127.0.0.1:6390${node}" \
              --sqsAddress "127.0.0.1:6350${node}" \
              --metricsAddress "" \
              --pprofAddress "" \
              --raftId "n${node}" \
              --raftDataDir "/tmp/elastickv-ci/n${node}" \
              --raftBootstrapMembers "$BOOTSTRAP_MEMBERS" \
              --raftRedisMap "$RAFT_REDIS_MAP" \
              --raftS3Map "$RAFT_S3_MAP" \
              --raftDynamoMap "$RAFT_DYNAMO_MAP" \
              --raftSqsMap "$RAFT_SQS_MAP" \
              > "/tmp/elastickv-demo-n${node}.log" 2>&1 &
            echo $! >> /tmp/elastickv-demo.pid
          done
          echo "Waiting for redis (63791-63793), dynamo (63801-63803), s3 (63901-63903), and sqs (63501-63503) listeners..."
          # Poll up to 90 times, one second apart, until every listener
          # accepts a TCP connection.
          for i in {1..90}; do
            if nc -z 127.0.0.1 63791 && nc -z 127.0.0.1 63792 && nc -z 127.0.0.1 63793 \
              && nc -z 127.0.0.1 63801 && nc -z 127.0.0.1 63802 && nc -z 127.0.0.1 63803 \
              && nc -z 127.0.0.1 63901 && nc -z 127.0.0.1 63902 && nc -z 127.0.0.1 63903 \
              && nc -z 127.0.0.1 63501 && nc -z 127.0.0.1 63502 && nc -z 127.0.0.1 63503; then
              echo "Cluster is up"
              exit 0
            fi
            sleep 1
          done
          echo "Demo cluster failed to start; dumping log:"
          tail -n 200 /tmp/elastickv-demo-n1.log || true
          tail -n 200 /tmp/elastickv-demo-n2.log || true
          tail -n 200 /tmp/elastickv-demo-n3.log || true
          exit 1

      - name: Run Redis Jepsen workload against elastickv
        working-directory: jepsen
        timeout-minutes: 3
        run: |
          timeout 120 ~/lein run -m elastickv.redis-workload --time-limit 5 --rate 5 --concurrency 5 --ports 63791,63792,63793 --host 127.0.0.1

      - name: Run DynamoDB Jepsen workload against elastickv
        working-directory: jepsen
        timeout-minutes: 3
        run: |
          timeout 120 ~/lein run -m elastickv.dynamodb-workload --local --time-limit 5 --rate 5 --concurrency 5 --dynamo-ports 63801,63802,63803 --host 127.0.0.1

      - name: Run DynamoDB per-type Jepsen workloads against elastickv
        working-directory: jepsen
        timeout-minutes: 10
        run: |
          # Run every type even if one fails, so the log shows which
          # specific attribute types pass and which fail. The step
          # still fails at the end if any single type failed.
          declare -A RESULT
          FAILED=()
          for t in string number binary bool null string-set number-set binary-set list map; do
            echo "::group::value-type=${t}"
            # Disable errexit around the run so its exit code can be recorded.
            set +e
            timeout 120 ~/lein run -m elastickv.dynamodb-types-workload --local \
              --time-limit 5 --rate 5 --concurrency 4 \
              --value-type "${t}" \
              --dynamo-ports 63801,63802,63803 --host 127.0.0.1
            rc=$?
            set -e
            if [ "$rc" -eq 0 ]; then
              RESULT[$t]="pass"
            else
              RESULT[$t]="fail(${rc})"
              FAILED+=("$t")
            fi
            echo "::endgroup::"
          done
          echo
          echo "=== per-type jepsen summary ==="
          for t in string number binary bool null string-set number-set binary-set list map; do
            printf ' %-12s %s\n' "$t" "${RESULT[$t]}"
          done
          if [ ${#FAILED[@]} -ne 0 ]; then
            echo "FAILED types: ${FAILED[*]}"
            exit 1
          fi

      # NOTE(review): failure() is true when any earlier step in the job has
      # failed, so this also fires for the Redis and DynamoDB workload steps.
      # The S3 and SQS steps come after it, so a failure there does not reach
      # this upload. Confirm that ordering is intended.
      - name: Upload Jepsen store on per-type failure
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: jepsen-store-types
          path: jepsen/store
          retention-days: 7

      - name: Run S3 Jepsen workload against elastickv
        working-directory: jepsen
        timeout-minutes: 3
        run: |
          timeout 120 ~/lein run -m elastickv.s3-workload --local --time-limit 5 --rate 10 --concurrency 10 --s3-ports 63901,63902,63903 --host 127.0.0.1

      - name: Run SQS HT-FIFO Jepsen workload against elastickv
        working-directory: jepsen
        # The HT-FIFO workload runs sends and receives across a 4-partition
        # FIFO queue with content-based deduplication. The custom checker
        # validates within-group ordering, no loss, and no duplicates.
        # See jepsen/src/elastickv/sqs_htfifo_workload.clj.
        #
        # --drain-time 15: in --local mode the nemesis is a no-op, so no
        # message can become invisible due to partition/kill — the 40s
        # default drain (which protects against fault-induced
        # visibility-timeout races) is overkill here. 15s leaves ample
        # headroom under the 120s shell timeout against JVM startup and
        # the 5s main phase.
        timeout-minutes: 3
        run: |
          timeout 120 ~/lein run -m elastickv.sqs-htfifo-workload --local \
            --time-limit 5 --rate 5 --concurrency 5 \
            --partition-count 4 --group-count 6 \
            --drain-time 15 \
            --sqs-ports 63501,63502,63503 --host 127.0.0.1

      - name: Stop demo cluster
        if: always()
        run: |
          # The nodes were started by an earlier step, so their PIDs are not
          # children of this shell and `wait` cannot block on them. Send
          # SIGTERM to every node, poll each PID with `kill -0` for up to 10
          # seconds, then SIGKILL whatever is still running.
          if [ -f /tmp/elastickv-demo.pid ]; then
            while read -r pid; do
              [ -n "$pid" ] || continue
              kill "$pid" 2>/dev/null || true
            done < /tmp/elastickv-demo.pid
            while read -r pid; do
              [ -n "$pid" ] || continue
              for _ in {1..10}; do
                kill -0 "$pid" 2>/dev/null || break
                sleep 1
              done
              kill -9 "$pid" 2>/dev/null || true
            done < /tmp/elastickv-demo.pid
          fi