-
Notifications
You must be signed in to change notification settings - Fork 2
261 lines (257 loc) · 10.3 KB
/
jepsen-test-scheduled.yml
File metadata and controls
261 lines (257 loc) · 10.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# Triggers.
#   * schedule: unattended run at minute 0 of every sixth hour.
#   * workflow_dispatch: manual run with tunable workload parameters.
# Scheduled runs carry no inputs, so the workload steps fall back to the
# inline `||` defaults written next to each flag; those fallbacks are not
# the same values as the dispatch defaults declared here.
on:
  schedule:
    - cron: '0 */6 * * *'
  workflow_dispatch:
    inputs:
      # Passed to the workloads as --time-limit.
      time-limit:
        description: "Workload runtime seconds"
        required: false
        default: "300"
      # Passed to the workloads as --rate.
      rate:
        description: "Ops/sec per worker"
        required: false
        default: "5"
      # Passed to the workloads as --concurrency.
      concurrency:
        description: "Number of worker threads (must be multiple of 4 for S3)"
        required: false
        default: "4"
      # Passed to the workloads as --key-count.
      key-count:
        description: "Number of distinct keys per workload"
        required: false
        default: "8"
      # Passed to the workloads as --max-writes-per-key.
      max-writes-per-key:
        description: "Maximum writes per key before exhaustion"
        required: false
        default: "150"
      # Passed to the Redis and DynamoDB workloads as --max-txn-length.
      max-txn-length:
        description: "Maximum micro-ops per transaction"
        required: false
        default: "4"
# Runs are grouped per workflow and per ref. `cancel-in-progress` is not
# set on this group.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-jepsen-scheduled
name: Jepsen Scheduled Stress Test
# The token only needs to read repository contents.
permissions:
  contents: read
# Single job: builds the toolchain, launches the demo cluster, then runs
# the Jepsen workloads against it in sequence.
jobs:
test:
runs-on: ubuntu-latest
env:
# Go build cache directory; the Go steps below `mkdir -p` it before use.
GOCACHE: /tmp/go-build
steps:
# Source checkout, including submodules at any depth.
- uses: actions/checkout@v6
  with:
    submodules: recursive
# JDK for Leiningen and the Jepsen workloads.
- uses: actions/setup-java@v5
  with:
    java-version: "21"
    distribution: temurin
# Go toolchain; the version is read from the repository's go.mod.
- uses: actions/setup-go@v6
  with:
    go-version-file: go.mod
# `nc` is used by the cluster-launch step to probe listener ports.
- name: Install netcat and graphviz
  run: |
    sudo apt-get update
    sudo apt-get install -y netcat-openbsd graphviz
# Download the Leiningen launcher script to ~/lein and smoke-test it.
# `--fail` makes curl exit non-zero on an HTTP error instead of saving
# the error page as the launcher (which would only surface later as a
# confusing failure in `~/lein version`); `--retry` rides out transient
# network errors.
- name: Install Leiningen
  run: |
    curl --fail --silent --show-error --location \
      --retry 3 --retry-delay 5 \
      --output ~/lein \
      https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein
    chmod +x ~/lein
    ~/lein version
# Persist downloaded Maven/Leiningen artifacts between runs.
# Maven Central (repo1.maven.org) intermittently answers 429 during
# dependency download, and the lein project pulls ~100 jars on first
# launch, so uncached scheduled runs hit that frequently. The primary
# key hashes project.clj so it changes only when dependencies change;
# the restore key allows falling back to an older cache for this OS.
- name: Cache Maven and Leiningen artifacts
  uses: actions/cache@v5
  with:
    key: ${{ runner.os }}-maven-${{ hashFiles('jepsen/project.clj') }}
    restore-keys: |
      ${{ runner.os }}-maven-
    path: |
      ~/.m2/repository
      ~/.lein
# Download Go module dependencies up front so the later `go run` in the
# cluster-launch step does not have to fetch them. GOCACHE comes from
# the job-level env; GOTMPDIR points Go's temp files at /tmp/go-tmp.
# (The former `GOPATH=$(go env GOPATH)` line re-assigned GOPATH to the
# value Go already resolves on its own, so it was dropped.)
- name: Pre-fetch Go modules
  run: |
    mkdir -p "$GOCACHE" /tmp/go-tmp
    export GOCACHE GOTMPDIR=/tmp/go-tmp
    go mod download
# Populate ~/.m2 before the unit tests and workloads start.
# Maven Central transiently returns 429 during peak hours and lein's
# default behaviour is to fail the build on the first checksum miss, so
# `lein deps` is retried: at most three attempts, sleeping 30s after the
# first failure and 60s after the second (linear backoff).
- name: Warm Leiningen Maven cache
  working-directory: jepsen
  run: |
    set -uo pipefail
    max_attempts=3
    attempt=1
    while ! ~/lein deps; do
      if [ "$attempt" -ge "$max_attempts" ]; then
        echo "lein deps failed after $attempt attempts" >&2
        exit 1
      fi
      backoff=$((attempt * 30))
      echo "lein deps failed (attempt $attempt/$max_attempts), retrying in ${backoff}s..." >&2
      sleep "$backoff"
      attempt=$((attempt + 1))
    done
# Runs `lein test` in the jepsen/ project before the demo cluster is
# launched.
- name: Run Jepsen unit tests
  working-directory: jepsen
  run: |
    ~/lein test
# Start the demo server in the background, record its PID for the
# cleanup step, and block until all nine listeners accept connections.
# Gives up after 90 one-second polls and prints the tail of the log.
- name: Launch demo cluster
  run: |
    set -euo pipefail
    mkdir -p "$GOCACHE" /tmp/go-tmp
    export GOTMPDIR=/tmp/go-tmp
    nohup go run cmd/server/demo.go > /tmp/elastickv-demo.log 2>&1 &
    echo $! > /tmp/elastickv-demo.pid
    echo "Waiting for redis (63791-63793), dynamo (63801-63803), and s3 (63901-63903) listeners..."

    # Probe order matches the message above: redis, dynamo, s3.
    ports=(63791 63792 63793 63801 63802 63803 63901 63902 63903)

    # Succeeds only when every port accepts a TCP connection; stops at
    # the first closed port.
    all_listening() {
      local port
      for port in "${ports[@]}"; do
        nc -z 127.0.0.1 "$port" || return 1
      done
    }

    polls=0
    while [ "$polls" -lt 90 ]; do
      if all_listening; then
        echo "Cluster is up"
        exit 0
      fi
      sleep 1
      polls=$((polls + 1))
    done

    echo "Demo cluster failed to start; dumping log:"
    tail -n 200 /tmp/elastickv-demo.log || true
    exit 1
# Redis workload against the three Redis listeners of the demo cluster.
# The `||` fallbacks apply when no dispatch input is present (scheduled
# runs). Inputs are handed to the script through environment variables
# and expanded quoted, rather than being interpolated into the script
# text, so an input value can never be parsed as shell syntax.
# The inner `timeout 480` fires before the 10-minute step timeout.
- name: Run Redis Jepsen workload against elastickv
  working-directory: jepsen
  timeout-minutes: 10
  env:
    WORKLOAD_TIME_LIMIT: "${{ inputs.time-limit || '150' }}"
    WORKLOAD_RATE: "${{ inputs.rate || '10' }}"
    WORKLOAD_CONCURRENCY: "${{ inputs.concurrency || '8' }}"
    WORKLOAD_KEY_COUNT: "${{ inputs.key-count || '16' }}"
    WORKLOAD_MAX_WRITES_PER_KEY: "${{ inputs.max-writes-per-key || '250' }}"
    WORKLOAD_MAX_TXN_LENGTH: "${{ inputs.max-txn-length || '4' }}"
  run: |
    timeout 480 ~/lein run -m elastickv.redis-workload \
      --time-limit "${WORKLOAD_TIME_LIMIT}" \
      --rate "${WORKLOAD_RATE}" \
      --concurrency "${WORKLOAD_CONCURRENCY}" \
      --key-count "${WORKLOAD_KEY_COUNT}" \
      --max-writes-per-key "${WORKLOAD_MAX_WRITES_PER_KEY}" \
      --max-txn-length "${WORKLOAD_MAX_TXN_LENGTH}" \
      --ports 63791,63792,63793 \
      --host 127.0.0.1
# DynamoDB workload against the three DynamoDB listeners of the demo
# cluster. The `||` fallbacks apply when no dispatch input is present
# (scheduled runs). Inputs are handed to the script through environment
# variables and expanded quoted, rather than being interpolated into
# the script text, so an input value can never be parsed as shell
# syntax. The inner `timeout 480` fires before the 10-minute step
# timeout.
- name: Run DynamoDB Jepsen workload against elastickv
  working-directory: jepsen
  timeout-minutes: 10
  env:
    WORKLOAD_TIME_LIMIT: "${{ inputs.time-limit || '150' }}"
    WORKLOAD_RATE: "${{ inputs.rate || '10' }}"
    WORKLOAD_CONCURRENCY: "${{ inputs.concurrency || '8' }}"
    WORKLOAD_KEY_COUNT: "${{ inputs.key-count || '16' }}"
    WORKLOAD_MAX_WRITES_PER_KEY: "${{ inputs.max-writes-per-key || '250' }}"
    WORKLOAD_MAX_TXN_LENGTH: "${{ inputs.max-txn-length || '4' }}"
  run: |
    timeout 480 ~/lein run -m elastickv.dynamodb-workload --local \
      --time-limit "${WORKLOAD_TIME_LIMIT}" \
      --rate "${WORKLOAD_RATE}" \
      --concurrency "${WORKLOAD_CONCURRENCY}" \
      --key-count "${WORKLOAD_KEY_COUNT}" \
      --max-writes-per-key "${WORKLOAD_MAX_WRITES_PER_KEY}" \
      --max-txn-length "${WORKLOAD_MAX_TXN_LENGTH}" \
      --dynamo-ports 63801,63802,63803 \
      --host 127.0.0.1
# Coverage sweep across all 10 DynamoDB attribute types; this is not the
# deep stress run. It uses its own shorter time-limit and a smaller
# history density than the parent dynamodb-workload step so that
#   * the 10-type loop fits comfortably inside the step timeout
#     regardless of the workflow_dispatch time-limit input, and
#   * Knossos's linearizability analysis stays inside its time budget
#     (dense histories cause :valid? :unknown verdicts).
# The per-invocation `timeout` is derived from TYPE_TL + buffer, so
# bumping TYPE_TL never races against the outer timeout.
- name: Run DynamoDB per-type Jepsen workloads against elastickv
  working-directory: jepsen
  timeout-minutes: 30
  env:
    TYPE_TL: "60"
    TYPE_CONCURRENCY: "4"
    TYPE_KEY_COUNT: "8"
    TYPE_MAX_WRITES: "80"
    # Routed through the environment (not interpolated into the script)
    # so the dispatch input can never be parsed as shell syntax.
    TYPE_RATE: "${{ inputs.rate || '5' }}"
  run: |
    # Run every type independently: one failure does not stop the
    # sweep, so the final summary shows which specific types
    # passed/failed. The step still fails if any type failed.
    PER_TYPE_TIMEOUT=$((TYPE_TL + 180))
    # Single source of truth for both the run loop and the summary.
    TYPES=(string number binary bool null string-set number-set binary-set list map)
    declare -A RESULT
    FAILED=()
    for t in "${TYPES[@]}"; do
      echo "::group::value-type=${t}"
      # Capture the exit status instead of letting errexit abort the sweep.
      set +e
      timeout "${PER_TYPE_TIMEOUT}" ~/lein run -m elastickv.dynamodb-types-workload --local \
        --time-limit "${TYPE_TL}" \
        --rate "${TYPE_RATE}" \
        --concurrency "${TYPE_CONCURRENCY}" \
        --key-count "${TYPE_KEY_COUNT}" \
        --max-writes-per-key "${TYPE_MAX_WRITES}" \
        --value-type "${t}" \
        --dynamo-ports 63801,63802,63803 \
        --host 127.0.0.1
      rc=$?
      set -e
      if [ "$rc" -eq 0 ]; then
        RESULT[$t]="pass"
      else
        RESULT[$t]="fail(${rc})"
        FAILED+=("$t")
      fi
      echo "::endgroup::"
    done
    echo
    echo "=== per-type jepsen summary ==="
    for t in "${TYPES[@]}"; do
      printf ' %-12s %s\n' "$t" "${RESULT[$t]}"
    done
    if [ ${#FAILED[@]} -ne 0 ]; then
      echo "FAILED types: ${FAILED[*]}"
      exit 1
    fi
# S3 workload against the three S3 listeners of the demo cluster.
# --threads-per-key is fixed at 4, which is why the concurrency input
# must be a multiple of 4. The `||` fallbacks apply when no dispatch
# input is present (scheduled runs). Inputs are handed to the script
# through environment variables and expanded quoted, rather than being
# interpolated into the script text, so an input value can never be
# parsed as shell syntax.
- name: Run S3 Jepsen workload against elastickv
  working-directory: jepsen
  timeout-minutes: 10
  env:
    WORKLOAD_TIME_LIMIT: "${{ inputs.time-limit || '150' }}"
    WORKLOAD_RATE: "${{ inputs.rate || '10' }}"
    WORKLOAD_CONCURRENCY: "${{ inputs.concurrency || '8' }}"
    WORKLOAD_KEY_COUNT: "${{ inputs.key-count || '16' }}"
    WORKLOAD_MAX_WRITES_PER_KEY: "${{ inputs.max-writes-per-key || '250' }}"
  run: |
    timeout 480 ~/lein run -m elastickv.s3-workload --local \
      --time-limit "${WORKLOAD_TIME_LIMIT}" \
      --rate "${WORKLOAD_RATE}" \
      --concurrency "${WORKLOAD_CONCURRENCY}" \
      --key-count "${WORKLOAD_KEY_COUNT}" \
      --max-writes-per-key "${WORKLOAD_MAX_WRITES_PER_KEY}" \
      --threads-per-key 4 \
      --s3-ports 63901,63902,63903 \
      --host 127.0.0.1
# Upload the Jepsen store whenever any earlier step failed. This step
# must come after the last workload: when it sat before the S3 step, an
# S3 failure happened after `failure()` had already been evaluated, so
# the store was never uploaded for S3-only failures. The artifact name
# is kept unchanged for anything that consumes it by name.
# NOTE(review): this step pins upload-artifact@v7 while the demo-log
# upload step pins @v4 - confirm which major version is intended.
- name: Upload Jepsen store on failure
  if: failure()
  uses: actions/upload-artifact@v7
  with:
    name: jepsen-store-types
    path: jepsen/store
    retention-days: 7
# Print the head and tail of the server log inline when anything failed.
# An earlier `tail -n 500` truncated a 3-minute workload's log down to
# startup-only lines, which made it impossible to correlate a Jepsen
# anomaly with server-side state (start_ts, commit_ts, raft term, write
# conflicts, lock-resolver events). Head + tail keeps the most recent
# activity visible at a glance in the GH UI; the complete log is
# uploaded as an artifact for offline analysis.
- name: Dump demo cluster log on failure
  if: failure()
  run: |
    log=/tmp/elastickv-demo.log
    # Every command is best-effort: the log may not exist.
    echo "=== first 200 lines (startup) ==="
    head -n 200 "$log" || true
    echo "=== last 1000 lines (most recent activity) ==="
    tail -n 1000 "$log" || true
    echo "=== full log line count ==="
    wc -l "$log" || true
# Attach the complete server log to the run when anything failed.
# A missing log file only produces a warning, not a step failure.
- name: Upload demo cluster log on failure
  if: failure()
  uses: actions/upload-artifact@v4
  with:
    name: elastickv-demo-log
    path: /tmp/elastickv-demo.log
    if-no-files-found: warn
    retention-days: 14
# Best-effort shutdown of the demo server; runs even when earlier steps
# failed. The server was started by a different step's shell, so the
# shell builtin `wait` cannot be used here: it only waits for children
# of the current shell and returns immediately otherwise. Instead, send
# SIGTERM, poll with `kill -0` for up to 10 seconds, then fall back to
# SIGKILL.
# NOTE(review): the recorded PID is that of the `go run` process;
# confirm the signal also reaches the compiled server it spawns.
- name: Stop demo cluster
  if: always()
  run: |
    pid_file=/tmp/elastickv-demo.pid
    # Nothing to stop if the launch step never wrote a PID.
    [ -f "$pid_file" ] || exit 0
    pid=$(cat "$pid_file")
    kill "$pid" 2>/dev/null || true
    for _ in 1 2 3 4 5 6 7 8 9 10; do
      # `kill -0` sends no signal; it only tests that the process exists.
      kill -0 "$pid" 2>/dev/null || break
      sleep 1
    done
    if kill -0 "$pid" 2>/dev/null; then
      kill -9 "$pid" 2>/dev/null || true
    fi