# Jepsen Scheduled Stress Test #145

on:
  # Runs at minute 0 of every sixth hour.
  schedule:
    - cron: "0 */6 * * *"
  # Manual runs may override the workload knobs below. Scheduled runs do not
  # receive these defaults; the job steps apply their own fallback values
  # when an input is empty.
  workflow_dispatch:
    inputs:
      time-limit:
        description: 'Workload runtime seconds'
        default: '300'
        required: false
      rate:
        description: 'Ops/sec per worker'
        default: '5'
        required: false
      concurrency:
        description: 'Number of worker threads (must be multiple of 4 for S3)'
        default: '4'
        required: false
      key-count:
        description: 'Number of distinct keys per workload'
        default: '8'
        required: false
      max-writes-per-key:
        description: 'Maximum writes per key before exhaustion'
        default: '150'
        required: false
      max-txn-length:
        description: 'Maximum micro-ops per transaction'
        default: '4'
        required: false
# Runs that share this group (same workflow and same ref) are not executed
# at the same time.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}-jepsen-scheduled"
# Display name of the workflow.
name: 'Jepsen Scheduled Stress Test'
# The job only needs to read repository contents.
permissions:
  contents: read
jobs:
  # Single job: start a local elastickv demo cluster, then run each Jepsen
  # workload against it in sequence.
  test:
    runs-on: ubuntu-latest
    env:
      GOCACHE: /tmp/go-build
      # Workload knobs. workflow_dispatch inputs are routed through the
      # environment instead of being expanded inside `run:` scripts, so an
      # unexpected value reaches the shell as a single quoted argument and
      # never as script text. Scheduled runs carry no inputs and use the
      # fallback on the right-hand side of each `||`.
      WL_TIME_LIMIT: ${{ inputs.time-limit || '150' }}
      WL_RATE: ${{ inputs.rate || '10' }}
      WL_CONCURRENCY: ${{ inputs.concurrency || '8' }}
      WL_KEY_COUNT: ${{ inputs.key-count || '16' }}
      WL_MAX_WRITES_PER_KEY: ${{ inputs.max-writes-per-key || '250' }}
      WL_MAX_TXN_LENGTH: ${{ inputs.max-txn-length || '4' }}
    steps:
      - uses: actions/checkout@v6
        with:
          submodules: recursive
      - uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: '21'
      - uses: actions/setup-go@v6
        with:
          go-version-file: 'go.mod'
      - name: Install netcat and graphviz
        run: sudo apt-get update && sudo apt-get install -y netcat-openbsd graphviz
      - name: Install Leiningen
        run: |
          # --fail: stop on an HTTP error instead of saving the error page
          # as the launcher script.
          curl --fail -L https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein > ~/lein
          chmod +x ~/lein
          ~/lein version
      - name: Pre-fetch Go modules
        run: |
          mkdir -p "$GOCACHE" /tmp/go-tmp
          GOPATH=$(go env GOPATH)
          export GOCACHE GOTMPDIR=/tmp/go-tmp
          go mod download
      - name: Run Jepsen unit tests
        working-directory: jepsen
        run: ~/lein test
      - name: Launch demo cluster
        run: |
          set -euo pipefail
          mkdir -p "$GOCACHE" /tmp/go-tmp
          export GOTMPDIR=/tmp/go-tmp
          # Start the cluster in the background and record its pid so the
          # final step can stop it.
          nohup go run cmd/server/demo.go > /tmp/elastickv-demo.log 2>&1 &
          echo $! > /tmp/elastickv-demo.pid
          echo "Waiting for redis (63791-63793), dynamo (63801-63803), and s3 (63901-63903) listeners..."
          # Poll once per second, for up to 90 attempts, until all nine
          # listener ports accept connections.
          for i in {1..90}; do
            if nc -z 127.0.0.1 63791 && nc -z 127.0.0.1 63792 && nc -z 127.0.0.1 63793 \
              && nc -z 127.0.0.1 63801 && nc -z 127.0.0.1 63802 && nc -z 127.0.0.1 63803 \
              && nc -z 127.0.0.1 63901 && nc -z 127.0.0.1 63902 && nc -z 127.0.0.1 63903; then
              echo "Cluster is up"
              exit 0
            fi
            sleep 1
          done
          echo "Demo cluster failed to start; dumping log:"
          tail -n 200 /tmp/elastickv-demo.log || true
          exit 1
      - name: Run Redis Jepsen workload against elastickv
        working-directory: jepsen
        timeout-minutes: 10
        run: |
          timeout 480 ~/lein run -m elastickv.redis-workload \
            --time-limit "${WL_TIME_LIMIT}" \
            --rate "${WL_RATE}" \
            --concurrency "${WL_CONCURRENCY}" \
            --key-count "${WL_KEY_COUNT}" \
            --max-writes-per-key "${WL_MAX_WRITES_PER_KEY}" \
            --max-txn-length "${WL_MAX_TXN_LENGTH}" \
            --ports 63791,63792,63793 \
            --host 127.0.0.1
      - name: Run DynamoDB Jepsen workload against elastickv
        working-directory: jepsen
        timeout-minutes: 10
        run: |
          timeout 480 ~/lein run -m elastickv.dynamodb-workload --local \
            --time-limit "${WL_TIME_LIMIT}" \
            --rate "${WL_RATE}" \
            --concurrency "${WL_CONCURRENCY}" \
            --key-count "${WL_KEY_COUNT}" \
            --max-writes-per-key "${WL_MAX_WRITES_PER_KEY}" \
            --max-txn-length "${WL_MAX_TXN_LENGTH}" \
            --dynamo-ports 63801,63802,63803 \
            --host 127.0.0.1
      - name: Run DynamoDB per-type Jepsen workloads against elastickv
        working-directory: jepsen
        # Worst case is 10 types x PER_TYPE_TIMEOUT (TYPE_TL + 180 = 240 s),
        # i.e. 40 minutes. The step budget sits above that so that several
        # timed-out types cannot kill the step before the summary prints.
        timeout-minutes: 45
        env:
          # The per-type sweep is a coverage check across all 10 attribute
          # types, not the deep stress run, so it uses its own shorter
          # runtime and smaller history density than the parent
          # dynamodb-workload step, independent of the workflow_dispatch
          # time-limit input. Keeping per-key ops modest also keeps
          # Knossos's linearizability analysis inside its time budget
          # (dense histories cause :valid? :unknown verdicts).
          TYPE_TL: "60"
          TYPE_CONCURRENCY: "4"
          TYPE_KEY_COUNT: "8"
          TYPE_MAX_WRITES: "80"
          # The sweep has its own rate fallback (5) for scheduled runs.
          TYPE_RATE: ${{ inputs.rate || '5' }}
        run: |
          # Run every type independently: one failure does not stop
          # the sweep so the final summary shows which specific types
          # passed/failed. The step still fails if any type failed.
          # The per-invocation timeout is derived from TYPE_TL + buffer
          # so bumping TYPE_TL never races against it.
          PER_TYPE_TIMEOUT=$((TYPE_TL + 180))
          TYPES=(string number binary bool null string-set number-set binary-set list map)
          declare -A RESULT
          FAILED=()
          for t in "${TYPES[@]}"; do
            echo "::group::value-type=${t}"
            # Capture the exit code instead of letting errexit abort the loop.
            set +e
            timeout "${PER_TYPE_TIMEOUT}" ~/lein run -m elastickv.dynamodb-types-workload --local \
              --time-limit "${TYPE_TL}" \
              --rate "${TYPE_RATE}" \
              --concurrency "${TYPE_CONCURRENCY}" \
              --key-count "${TYPE_KEY_COUNT}" \
              --max-writes-per-key "${TYPE_MAX_WRITES}" \
              --value-type "${t}" \
              --dynamo-ports 63801,63802,63803 \
              --host 127.0.0.1
            rc=$?
            set -e
            if [ "$rc" -eq 0 ]; then
              RESULT[$t]="pass"
            else
              RESULT[$t]="fail(${rc})"
              FAILED+=("$t")
            fi
            echo "::endgroup::"
          done
          echo
          echo "=== per-type jepsen summary ==="
          for t in "${TYPES[@]}"; do
            printf '  %-12s %s\n' "$t" "${RESULT[$t]}"
          done
          if [ ${#FAILED[@]} -ne 0 ]; then
            echo "FAILED types: ${FAILED[*]}"
            exit 1
          fi
      - name: Run S3 Jepsen workload against elastickv
        working-directory: jepsen
        timeout-minutes: 10
        run: |
          timeout 480 ~/lein run -m elastickv.s3-workload --local \
            --time-limit "${WL_TIME_LIMIT}" \
            --rate "${WL_RATE}" \
            --concurrency "${WL_CONCURRENCY}" \
            --key-count "${WL_KEY_COUNT}" \
            --max-writes-per-key "${WL_MAX_WRITES_PER_KEY}" \
            --threads-per-key 4 \
            --s3-ports 63901,63902,63903 \
            --host 127.0.0.1
      # Placed after the last workload so that a failure in any workload
      # step, including S3, uploads the store. An earlier failure skips the
      # remaining workloads and still reaches this step via `if: failure()`.
      - name: Upload Jepsen store on failure
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: jepsen-store-types
          path: jepsen/store
          retention-days: 7
      - name: Dump demo cluster log on failure
        if: failure()
        run: tail -n 500 /tmp/elastickv-demo.log || true
      - name: Stop demo cluster
        if: always()
        run: |
          if [ -f /tmp/elastickv-demo.pid ]; then
            pid=$(cat /tmp/elastickv-demo.pid)
            kill "$pid" 2>/dev/null || true
            # NOTE(review): the pid was started by an earlier step's shell,
            # so `wait` here is best-effort and likely returns immediately.
            wait "$pid" 2>/dev/null || true
          fi