elastickv/.github/workflows/jepsen-test-scheduled.yml at main · bootjp/elastickv · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# Triggers. A scheduled run carries no inputs, so each workload step falls
# back to the literal in its `inputs.X || 'literal'` expression. The
# `default:` values below therefore only shape manual dispatches.
on:
  schedule:
    # Minute 0 of every sixth hour.
    - cron: "0 */6 * * *"
  workflow_dispatch:
    inputs:
      time-limit:
        description: 'Workload runtime seconds'
        required: false
        default: '300'
      rate:
        description: 'Ops/sec per worker'
        required: false
        default: '5'
      # The S3 workload step passes `--threads-per-key 4`, hence the
      # multiple-of-4 requirement stated in the description.
      concurrency:
        description: 'Number of worker threads (must be multiple of 4 for S3)'
        required: false
        default: '4'
      key-count:
        description: 'Number of distinct keys per workload'
        required: false
        default: '8'
      max-writes-per-key:
        description: 'Maximum writes per key before exhaustion'
        required: false
        default: '150'
      max-txn-length:
        description: 'Maximum micro-ops per transaction'
        required: false
        default: '4'

name: Jepsen Scheduled Stress Test

# The workflow token is limited to reading repository contents.
permissions:
  contents: read

# Runs of this workflow on the same ref share one concurrency group.
# `cancel-in-progress` is not set, so a run that is already executing is
# left to finish.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-jepsen-scheduled
jobs:
  test:
    runs-on: ubuntu-latest
    env:
      # Job-wide Go build cache directory; later steps create it with
      # `mkdir -p "$GOCACHE"` before invoking the Go toolchain.
      GOCACHE: /tmp/go-build
    steps:
      # Check out the repository together with all nested submodules.
      - uses: actions/checkout@v6
        with:
          submodules: recursive
      # Temurin JDK 21 for the Leiningen-driven steps below.
      - uses: actions/setup-java@v5
        with:
          distribution: temurin
          java-version: "21"
      # The Go toolchain version is taken from the repository's go.mod.
      - uses: actions/setup-go@v6
        with:
          go-version-file: go.mod
      - name: Install netcat and graphviz
        # `nc` is what the cluster-launch step uses to probe listener ports.
        # The default `bash -e` shell aborts the step if the index refresh
        # fails, so the two commands still run strictly in sequence.
        run: |
          sudo apt-get update
          sudo apt-get install -y netcat-openbsd graphviz
      - name: Install Leiningen
        # Downloads the `lein` launcher script into $HOME and runs it once
        # (`lein version`) so a bad download surfaces in this step.
        #   --fail   : exit non-zero on an HTTP error instead of saving the
        #              error page as the `lein` script.
        #   --retry  : ride out transient network / rate-limit failures.
        #   -sS      : no progress meter, but still print errors.
        run: |
          curl -fsSL --retry 3 -o "$HOME/lein" \
            https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein
          chmod +x ~/lein
          ~/lein version
      # Persist downloaded Maven/Leiningen artifacts between runs. Maven
      # Central (repo1.maven.org) intermittently answers 429 while
      # dependencies are being fetched, and an uncached scheduled run hits
      # that often because the lein project pulls ~100 jars on first launch.
      # The exact key hashes jepsen/project.clj, so it only changes when the
      # declared dependencies change; `restore-keys` lets a run with a new
      # hash start from the most recent older cache.
      - name: Cache Maven and Leiningen artifacts
        uses: actions/cache@v5
        with:
          key: ${{ runner.os }}-maven-${{ hashFiles('jepsen/project.clj') }}
          restore-keys: |
            ${{ runner.os }}-maven-
          path: |
            ~/.m2/repository
            ~/.lein
      - name: Pre-fetch Go modules
        # Downloads the module graph up front so later `go run` invocations
        # do not pay for it. GOCACHE comes from the job-level `env` and is
        # already exported into this shell, so only GOTMPDIR is set here.
        # (The former `GOPATH=$(go env GOPATH)` assignment was never read.)
        run: |
          mkdir -p "$GOCACHE" /tmp/go-tmp
          export GOTMPDIR=/tmp/go-tmp
          go mod download
      # Populate ~/.m2 before the unit tests and workloads start. Maven
      # Central transiently returns 429 during peak hours and lein fails
      # the build on the first checksum miss, so `lein deps` is retried:
      # at most three attempts, sleeping 30s and then 60s between them
      # (linear backoff), which is enough for a typical rate-limit window.
      - name: Warm Leiningen Maven cache
        working-directory: jepsen
        run: |
          set -uo pipefail
          max_attempts=3
          for attempt in $(seq 1 "$max_attempts"); do
            # Success on any attempt ends the step immediately.
            if ~/lein deps; then
              exit 0
            fi
            if [ "$attempt" -ge "$max_attempts" ]; then
              echo "lein deps failed after $attempt attempts" >&2
              exit 1
            fi
            backoff=$((attempt * 30))
            echo "lein deps failed (attempt $attempt/$max_attempts), retrying in ${backoff}s..." >&2
            sleep "$backoff"
          done
      # Runs the Leiningen test task of the jepsen/ project; this happens
      # before the demo cluster is launched.
      - name: Run Jepsen unit tests
        run: ~/lein test
        working-directory: jepsen
      - name: Launch demo cluster
        # Starts cmd/server/demo.go in the background, records its PID for
        # the final "Stop demo cluster" step, then waits up to ~90s for all
        # nine listeners. On failure the last 200 log lines are printed.
        run: |
          set -euo pipefail
          mkdir -p "$GOCACHE" /tmp/go-tmp
          export GOTMPDIR=/tmp/go-tmp
          nohup go run cmd/server/demo.go > /tmp/elastickv-demo.log 2>&1 &
          demo_pid=$!
          echo "$demo_pid" > /tmp/elastickv-demo.pid

          # redis, dynamo and s3 listener ports, three nodes each.
          ports=(63791 63792 63793 63801 63802 63803 63901 63902 63903)

          echo "Waiting for redis (63791-63793), dynamo (63801-63803), and s3 (63901-63903) listeners..."
          for _ in {1..90}; do
            ready=true
            for port in "${ports[@]}"; do
              if ! nc -z 127.0.0.1 "$port"; then
                ready=false
                break
              fi
            done
            if [ "$ready" = true ]; then
              echo "Cluster is up"
              exit 0
            fi
            # Fail fast: if the launcher has already exited (compile error,
            # crash on startup) there is no point polling for the full 90s.
            if ! kill -0 "$demo_pid" 2>/dev/null; then
              echo "Demo cluster process exited before all listeners came up"
              break
            fi
            sleep 1
          done

          echo "Demo cluster failed to start; dumping log:"
          tail -n 200 /tmp/elastickv-demo.log || true
          exit 1
      - name: Run Redis Jepsen workload against elastickv
        working-directory: jepsen
        # Step budget (10 min) sits above the 480s `timeout` on the command.
        timeout-minutes: 10
        env:
          # Workflow expressions are expanded here rather than inside the
          # script, and the script reads them as quoted variables, so a
          # dispatch input can never be interpreted as shell syntax or be
          # word-split into extra arguments. The literal after `||` is used
          # when the run has no inputs (schedule trigger).
          TIME_LIMIT: ${{ inputs.time-limit || '150' }}
          RATE: ${{ inputs.rate || '10' }}
          CONCURRENCY: ${{ inputs.concurrency || '8' }}
          KEY_COUNT: ${{ inputs.key-count || '16' }}
          MAX_WRITES_PER_KEY: ${{ inputs.max-writes-per-key || '250' }}
          MAX_TXN_LENGTH: ${{ inputs.max-txn-length || '4' }}
        run: |
          timeout 480 ~/lein run -m elastickv.redis-workload \
            --time-limit "$TIME_LIMIT" \
            --rate "$RATE" \
            --concurrency "$CONCURRENCY" \
            --key-count "$KEY_COUNT" \
            --max-writes-per-key "$MAX_WRITES_PER_KEY" \
            --max-txn-length "$MAX_TXN_LENGTH" \
            --ports 63791,63792,63793 \
            --host 127.0.0.1
      - name: Run DynamoDB Jepsen workload against elastickv
        working-directory: jepsen
        # Step budget (10 min) sits above the 480s `timeout` on the command.
        timeout-minutes: 10
        env:
          # Workflow expressions are expanded here rather than inside the
          # script, and the script reads them as quoted variables, so a
          # dispatch input can never be interpreted as shell syntax or be
          # word-split into extra arguments. The literal after `||` is used
          # when the run has no inputs (schedule trigger).
          TIME_LIMIT: ${{ inputs.time-limit || '150' }}
          RATE: ${{ inputs.rate || '10' }}
          CONCURRENCY: ${{ inputs.concurrency || '8' }}
          KEY_COUNT: ${{ inputs.key-count || '16' }}
          MAX_WRITES_PER_KEY: ${{ inputs.max-writes-per-key || '250' }}
          MAX_TXN_LENGTH: ${{ inputs.max-txn-length || '4' }}
        run: |
          timeout 480 ~/lein run -m elastickv.dynamodb-workload --local \
            --time-limit "$TIME_LIMIT" \
            --rate "$RATE" \
            --concurrency "$CONCURRENCY" \
            --key-count "$KEY_COUNT" \
            --max-writes-per-key "$MAX_WRITES_PER_KEY" \
            --max-txn-length "$MAX_TXN_LENGTH" \
            --dynamo-ports 63801,63802,63803 \
            --host 127.0.0.1
      - name: Run DynamoDB per-type Jepsen workloads against elastickv
        working-directory: jepsen
        # The per-type sweep is a coverage check across all 10 attribute
        # types, not the deep stress run, so it uses its own short
        # time-limit independent of the workflow_dispatch time-limit input.
        # Each invocation is capped at TYPE_TL + 180 seconds. With
        # TYPE_TL=60 the worst case is 10 x 240s = 40 minutes, so the step
        # budget is set above that; otherwise the step could be killed
        # before the per-type summary below is printed.
        timeout-minutes: 45
        env:
          # Shorter runtime and smaller history density than the parent
          # dynamodb-workload step. Keeping per-key ops modest also keeps
          # Knossos's linearizability analysis inside its time budget
          # (dense histories cause :valid? :unknown verdicts).
          TYPE_TL: "60"
          TYPE_CONCURRENCY: "4"
          TYPE_KEY_COUNT: "8"
          TYPE_MAX_WRITES: "80"
          # Expanded here, not inside the script, so the dispatch input is
          # only ever seen by the shell as a quoted variable. '5' is used
          # when the run has no inputs (schedule trigger).
          TYPE_RATE: ${{ inputs.rate || '5' }}
        run: |
          # Run every type independently: one failure does not stop
          # the sweep so the final summary shows which specific types
          # passed/failed.  The step still fails if any type failed.
          PER_TYPE_TIMEOUT=$((TYPE_TL + 180))
          TYPES=(string number binary bool null string-set number-set binary-set list map)
          declare -A RESULT
          FAILED=()
          for t in "${TYPES[@]}"; do
            echo "::group::value-type=${t}"
            # Suspend errexit only around the workload so its exit status
            # can be recorded instead of aborting the loop.
            set +e
            timeout "${PER_TYPE_TIMEOUT}" ~/lein run -m elastickv.dynamodb-types-workload --local \
              --time-limit "${TYPE_TL}" \
              --rate "${TYPE_RATE}" \
              --concurrency "${TYPE_CONCURRENCY}" \
              --key-count "${TYPE_KEY_COUNT}" \
              --max-writes-per-key "${TYPE_MAX_WRITES}" \
              --value-type "${t}" \
              --dynamo-ports 63801,63802,63803 \
              --host 127.0.0.1
            rc=$?
            set -e
            if [ "$rc" -eq 0 ]; then
              RESULT[$t]="pass"
            else
              RESULT[$t]="fail(${rc})"
              FAILED+=("$t")
            fi
            echo "::endgroup::"
          done
          echo
          echo "=== per-type jepsen summary ==="
          for t in "${TYPES[@]}"; do
            printf '  %-12s %s\n' "$t" "${RESULT[$t]}"
          done
          if [ ${#FAILED[@]} -ne 0 ]; then
            echo "FAILED types: ${FAILED[*]}"
            exit 1
          fi
      - name: Run S3 Jepsen workload against elastickv
        working-directory: jepsen
        # Step budget (10 min) sits above the 480s `timeout` on the command.
        timeout-minutes: 10
        env:
          # Workflow expressions are expanded here rather than inside the
          # script, and the script reads them as quoted variables, so a
          # dispatch input can never be interpreted as shell syntax or be
          # word-split into extra arguments. The literal after `||` is used
          # when the run has no inputs (schedule trigger).
          TIME_LIMIT: ${{ inputs.time-limit || '150' }}
          RATE: ${{ inputs.rate || '10' }}
          CONCURRENCY: ${{ inputs.concurrency || '8' }}
          KEY_COUNT: ${{ inputs.key-count || '16' }}
          MAX_WRITES_PER_KEY: ${{ inputs.max-writes-per-key || '250' }}
        run: |
          timeout 480 ~/lein run -m elastickv.s3-workload --local \
            --time-limit "$TIME_LIMIT" \
            --rate "$RATE" \
            --concurrency "$CONCURRENCY" \
            --key-count "$KEY_COUNT" \
            --max-writes-per-key "$MAX_WRITES_PER_KEY" \
            --threads-per-key 4 \
            --s3-ports 63901,63902,63903 \
            --host 127.0.0.1
      # Placed after the last workload step: `failure()` is true once any
      # earlier step has failed, so in this position the store is uploaded
      # for an S3 failure as well as for Redis / DynamoDB / per-type ones.
      # (When it preceded the S3 step, an S3-only failure uploaded nothing.)
      # The artifact name is kept unchanged.
      - name: Upload Jepsen store on failure
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: jepsen-store-types
          path: jepsen/store
          retention-days: 7
      - name: Dump demo cluster log on failure
        if: failure()
        # An earlier `tail -n 500` cut a 3-minute workload's log down to
        # startup-only lines, which made it impossible to line up a Jepsen
        # anomaly with server-side state (start_ts, commit_ts, raft term,
        # write conflicts, lock-resolver events). The head and tail are
        # printed inline so the GH UI shows both startup and the most
        # recent activity; the complete log is uploaded by the next step.
        # Every command is best-effort (`|| true`) in case the log is absent.
        run: |
          log=/tmp/elastickv-demo.log
          echo "=== first 200 lines (startup) ==="
          head -n 200 "$log" || true
          echo "=== last 1000 lines (most recent activity) ==="
          tail -n 1000 "$log" || true
          echo "=== full log line count ==="
          wc -l "$log" || true
      # Uploads the complete server log for offline analysis whenever an
      # earlier step failed. Uses the same upload-artifact major version as
      # the Jepsen store upload step so both artifacts go through one
      # implementation of the action. A missing log only produces a warning.
      - name: Upload demo cluster log on failure
        if: failure()
        uses: actions/upload-artifact@v7
        with:
          name: elastickv-demo-log
          path: /tmp/elastickv-demo.log
          retention-days: 14
          if-no-files-found: warn
      - name: Stop demo cluster
        if: always()
        # Best-effort shutdown of the process whose PID the launch step
        # recorded; nothing here is allowed to fail the job.
        run: |
          if [ -f /tmp/elastickv-demo.pid ]; then
            pid=$(cat /tmp/elastickv-demo.pid)
            kill "$pid" 2>/dev/null || true
            # The process was started by a different step's shell, so it
            # is not a child of this one and `wait` cannot block on it.
            # Poll for exit instead, for at most ~10 seconds.
            for _ in {1..10}; do
              kill -0 "$pid" 2>/dev/null || break
              sleep 1
            done
            # Still alive after the grace period: force it down.
            kill -9 "$pid" 2>/dev/null || true
            # NOTE(review): the PID belongs to `go run`; whether the
            # compiled server it spawned also exits on this signal is not
            # verifiable from this file - confirm if clean shutdown matters.
          fi