# Source: .github/workflows/benchmarks.yml (apache/texera, main branch)
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Texera benchmarks — bench-agnostic umbrella workflow.
#
# This file is the single CI entry for ALL Texera performance benchmarks
# (currently Arrow Flight E2E; JMH and others land here as well). The
# workflow knows nothing about specific benches — bin/run-benchmarks.sh
# is the opaque entry point that owns which benches run and where their
# outputs land under bench-results/. Adding a new bench is:
#   1. Append the run command to bin/run-benchmarks.sh.
#   2. Add a `Publish <chart-name>` step block below pointing at the
#      bench's JSON output file with the right `tool:` setting.
# This workflow file otherwise stays unchanged.
#
# Triggering — mirrors amber-integration's label gate (NOT file paths):
#   - PR: runs only when one of the labels mapped to the amber-integration
#     stack in required-checks.yml's LABEL_STACKS is present on the PR.
#     Labels are applied by the .github/labeler.yml workflow on opened /
#     synchronize, so we wait for that workflow to complete before
#     deciding (same pattern required-checks.yml uses). A PR run benches
#     the PR head, then re-runs the IDENTICAL trimmed grid against the
#     base-branch (main) commit it targets, in the SAME runner (see the
#     "Benchmark main baseline in the same runner" step). The delta between
#     those two cancels cross-runner hardware variance (the dominant source
#     of CI bench noise), so the PR comment's main-vs-branch comparison is
#     apples-to-apples rather than PR-here vs a stored baseline captured on
#     some other runner. PRs never publish to gh-pages.
#   - push to main: always runs (same trimmed grid as PR for quick post-
#     merge signal) but does NOT publish to gh-pages; it only emits the
#     job summary plus uploaded artifact. Publishing on every merge spammed
#     the repo's Pulse / all-branches commit count with bot commits, so
#     only the scheduled (daily) run persists the baseline now.
#   - schedule (daily): runs the full 27-config sweep and is the sole
#     writer that publishes to gh-pages (the authoritative long-term
#     baseline).
#   - workflow_dispatch: manual full-grid run (no publish; bring-your-own
#     trigger for ad-hoc exploration).
#
# Two modes via BENCH_MODE env (read by the bench Scala main):
#   pr   — 3 configs × 20 batches, ~5 min   (PR + push-to-main)
#   full — 27 configs × 200 batches, ~40 min   (schedule + dispatch)
#
# Non-blocking: this workflow is NOT included in required-checks.yml's
# `required-checks` aggregator, so its result doesn't gate merges even
# when it fails. Adding it to branch protection later is a deliberate
# .asf.yaml change.
#
# Permissions:
#   contents: write — needed by benchmark-action's auto-push to gh-pages.
#   PR runs (which GitHub auto-downgrades to read-only on forks) gate
#   auto-push off via the event check, so the missing write is never
#   exercised.

name: Benchmarks

on:
  # Quick post-merge signal for every commit that lands on main.
  push:
    branches:
      - main
  # PR runs are label-gated by the precheck job, so label changes have to
  # re-trigger the workflow just like code changes do.
  pull_request:
    types:
      - opened
      - reopened
      - synchronize
      - labeled
      - unlabeled
  schedule:
    # Full-grid baseline refresh once a day at 12:00 UTC (05:00 PDT).
    # PR and post-merge runs stick to a trimmed 3-config grid (~5 min);
    # this scheduled run is the one that covers the full 27-config sweep
    # the gh-pages dashboard tracks long-term. A daily cadence (instead of
    # weekly) keeps the baseline fresh and collects enough data points to
    # average out CI noise; the extra bot commits on gh-pages are an
    # accepted cost. 12:00 UTC falls in the early-morning PDT lull, when
    # GitHub runners are less contended (hence less noisy) than during
    # late-night dev hours. For denser sampling, add more cron entries.
    - cron: '0 12 * * *'
  # Manual trigger for ad-hoc runs.
  workflow_dispatch:

# Workflow-wide GITHUB_TOKEN scope. `contents: write` is what allows the
# publish steps of the bench job to push benchmark data to the gh-pages
# branch; those steps only enable auto-push on `schedule` events.
permissions:
  contents: write

concurrency:
  # Key the group on the triggering event as well as the ref. Scheduled
  # runs, push-to-main runs and manual runs on main all report
  # refs/heads/main; if they shared one group, GitHub would keep at most
  # ONE pending run for it, so a daily baseline run waiting behind an
  # in-flight job could be cancelled and replaced by a later push run.
  # PR runs are unaffected: every pull_request event for a given PR still
  # maps to the same group and supersedes its predecessor.
  group: benchmarks-${{ github.event_name }}-${{ github.ref }}
  # On main: never cancel an in-flight baseline run; on PRs: supersede.
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  precheck:
    # Decide whether to run based on PR labels (push / schedule /
    # workflow_dispatch always run). Lifted from required-checks.yml's
    # precheck so the trigger surface matches amber-integration exactly.
    name: Precheck
    runs-on: ubuntu-latest
    # This job only reads check runs and PR labels through the REST API,
    # so its token is scoped to exactly those reads instead of inheriting
    # the workflow-level `contents: write`.
    permissions:
      checks: read
      pull-requests: read
    outputs:
      # "true" / "false" string consumed by the bench job's `if:`.
      run_bench: ${{ steps.decide.outputs.run_bench }}
    steps:
      - name: Wait for Pull Request Labeler
        # Poll the "labeler" check run on the PR head commit until it has
        # completed, so the label decision in the next step sees the
        # labels the labeler applied. Gives up after 30 polls, 10 s apart.
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v8
        with:
          script: |
            const { owner, repo } = context.repo;
            const headSha = context.payload.pull_request.head.sha;
            const MAX_POLLS = 30;
            const POLL_INTERVAL_MS = 10000;
            const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

            let attempt = 1;
            while (attempt <= MAX_POLLS) {
              const response = await github.rest.checks.listForRef({
                owner,
                repo,
                ref: headSha,
                check_name: "labeler",
              });
              // Only the first check run returned for this name is inspected.
              const [latest] = response.data.check_runs;
              if (latest?.status === "completed") {
                core.info(`labeler ${latest.conclusion}`);
                return;
              }
              core.info(`labeler not ready (attempt ${attempt}/${MAX_POLLS})`);
              await sleep(POLL_INTERVAL_MS);
              attempt += 1;
            }
            // Timed out: fall through so the decision step uses whatever
            // labels are on the PR right now.
            core.warning("labeler did not complete within 5 minutes; proceeding with current labels.");

      - name: Decide whether to run bench
        id: decide
        uses: actions/github-script@v8
        with:
          script: |
            // Only pull_request events are label-gated. Every other
            // trigger (push to main, schedule, workflow_dispatch) runs the
            // bench unconditionally.
            if (context.eventName !== "pull_request") {
              core.info(`event=${context.eventName} — running unconditionally`);
              core.setOutput("run_bench", "true");
              return;
            }

            // Mirrors LABEL_STACKS in required-checks.yml: every label
            // whose stack list contains "amber-integration" triggers this
            // bench. Keep in sync if LABEL_STACKS there changes.
            const TRIGGER_LABELS = [
              "pyamber",
              "engine",
              "amber-integration",
              "common",
              "ddl-change",
              "ci",
            ];

            // Fetch the PR again instead of trusting the event payload:
            // the labeler may have added labels after the event fired.
            const { owner, repo } = context.repo;
            const response = await github.rest.pulls.get({
              owner,
              repo,
              pull_number: context.payload.pull_request.number,
            });
            const labelNames = response.data.labels.map((label) => label.name);
            core.info(`PR labels: ${labelNames.join(", ") || "(none)"}`);

            const hits = labelNames.filter((name) => TRIGGER_LABELS.includes(name));
            if (hits.length > 0) {
              core.info(`Triggering on labels: ${hits.join(", ")}`);
              core.setOutput("run_bench", "true");
            } else {
              core.info("No trigger label present; skipping bench.");
              core.setOutput("run_bench", "false");
            }

  bench:
    name: Bench
    needs: precheck
    # Gate on the precheck verdict (a "true" / "false" string output).
    if: needs.precheck.outputs.run_bench == 'true'
    runs-on: ubuntu-latest
    env:
      # Identical JVM flags under both variable names.
      JAVA_OPTS: >-
        -Xms2048M
        -Xmx2048M
        -Xss6M
        -XX:ReservedCodeCacheSize=256M
        -Dfile.encoding=UTF-8
      JVM_OPTS: >-
        -Xms2048M
        -Xmx2048M
        -Xss6M
        -XX:ReservedCodeCacheSize=256M
        -Dfile.encoding=UTF-8
      # Grid selection, decided here and read by the bench Scala main (see
      # its GridSpec switch):
      #   full — 27-config sweep (~40 min), for schedule + manual runs.
      #   pr   — trimmed 3-config sweep (~5 min), for PR + post-merge runs.
      BENCH_MODE: ${{ (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && 'full' || 'pr' }}
    services:
      # The bench never talks to the database itself. Postgres is here for
      # the build: sbt's transitive compile chain reaches `common/auth`,
      # which imports JOOQ-generated classes from
      # `org.apache.texera.dao.jooq.generated.*`, and JOOQ codegen needs a
      # live Postgres to introspect at sbt compile time. Without it the
      # auth module's `User` / `UserRoleEnum` symbols fail to resolve and
      # the whole bench compile aborts. Same service block as
      # amber-integration in build.yml.
      postgres:
        image: postgres
        env:
          POSTGRES_PASSWORD: postgres
        ports:
          - "5432:5432"
        options: '--health-cmd="pg_isready -U postgres" --health-interval=10s --health-timeout=5s --health-retries=5'
    steps:
      # Full clone (fetch-depth 0) rather than the default shallow one.
      # NOTE(review): presumably required so the same-runner baseline step
      # can check out the PR's base commit by SHA — confirm before
      # reducing the depth.
      - name: Checkout
        uses: actions/checkout@v5
        with:
          fetch-depth: "0"
      # Temurin JDK 17 for the sbt build and the bench JVM.
      - name: Setup JDK
        uses: actions/setup-java@v5
        with:
          java-version: "17"
          distribution: temurin
      # Python 3.12 for the Python dependencies installed in the next step.
      - name: Setup Python
        uses: actions/setup-python@v6
        with:
          python-version: '3.12'
      - name: Install Python dependencies
        # Same installer as amber-integration in build.yml, so that the
        # bench subprocess resolves its imports identically (pytorch CPU
        # index + betterproto plugin via dev-requirements). Each
        # requirements file is optional and installed only when present.
        run: |
          python -m pip install uv
          for req in requirements operator-requirements dev-requirements; do
            req_file="amber/${req}.txt"
            if [ -f "$req_file" ]; then
              uv pip install --system --index-strategy unsafe-best-match -r "$req_file"
            fi
          done
      - name: Install protoc
        # Download the protoc release pinned in bin/protoc-version.txt and
        # unpack it over /usr/local, then make the binary executable and
        # the bundled google/ include tree world-readable.
        run: |
          version="$(cat bin/protoc-version.txt)"
          archive="protoc-${version}-linux-x86_64.zip"
          release_url="https://github.com/protocolbuffers/protobuf/releases/download/v${version}/${archive}"
          curl --fail --silent --show-error --location --output /tmp/protoc.zip "$release_url"
          sudo unzip -o /tmp/protoc.zip -d /usr/local
          sudo chmod +x /usr/local/bin/protoc
          sudo chmod -R a+rX /usr/local/include/google
      - name: Create Database for JOOQ codegen
        # A cut-down version of amber-integration's "Create Databases"
        # step: JOOQ introspects texera_db only, so the iceberg / lakefs /
        # lakekeeper schemas (which the bench never touches) are skipped.
        env:
          PGPASSWORD: postgres
        run: psql --host=localhost --username=postgres --file=sql/texera_ddl.sql
      - name: Generate Python proto bindings
        run: bash bin/python-proto-gen.sh
      # Third-party action, pinned to a full commit SHA; the trailing
      # comment records the tag that SHA corresponds to.
      - name: Setup sbt launcher
        uses: sbt/setup-sbt@508b753e53cb6095967669e0911487d2b9bc9f41 # v1.1.22
      # Build-dependency cache, likewise pinned by commit SHA.
      # NOTE(review): extraSbtFiles presumably adds these globs to the
      # files the cache key is derived from — confirm against the
      # action's documentation.
      - uses: coursier/cache-action@90c37294538be80a558fd665531fcdc2b467b475 # v8.1.0
        with:
          extraSbtFiles: '["*.sbt", "project/**.{scala,sbt}", "project/build.properties" ]'

      - name: Run benchmarks
        # Single opaque entry point — this workflow doesn't know which
        # benches exist. Adding a JMH suite later = appending one line
        # inside bin/run-benchmarks.sh and adding a publish step below.
        run: bash bin/run-benchmarks.sh

      - name: Benchmark main baseline in the same runner
        # PR only: re-run the IDENTICAL trimmed grid against the base-branch
        # (main) commit this PR targets, in THIS runner, right after the PR
        # run above. Comparing two runs from the same machine cancels the
        # cross-runner hardware variance that otherwise dominates CI bench
        # deltas, so benchmarks-pr-comment.yml can show a trustworthy
        # main-vs-branch comparison instead of PR-here vs a stored baseline
        # captured on some other runner.
        #
        # The output convention is preserved: the PR's own outputs stay in
        # bench-results/ untouched; we only ADD main's CSV as
        # arrow-flight-e2e-main.csv (plus the base SHA in a sidecar file).
        # The PR-mode grid is deterministic (see GridSpec in
        # ArrowFlightActorBench.scala), so main's rows key 1:1 against the
        # PR's rows for the comparison.
        #
        # Fail-soft by construction: errexit is switched OFF explicitly
        # (`set +e` — a `run:` step without `shell:` executes under
        # `bash -e {0}`, so it would otherwise be on), every failure path
        # ends in a warning plus `exit 0`, and a trap restores the PR's
        # results plus the original checkout no matter where the main
        # re-run dies (broken main, compile error, etc). On failure we emit
        # no main CSV, and the comment workflow falls back to the stored
        # gh-pages baseline. We also skip entirely if the PR run produced no
        # CSV (e.g. the bench itself failed upstream).
        if: ${{ github.event_name == 'pull_request' && !cancelled() }}
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
        run: |
          # The default shell enables errexit; turn it off so the explicit
          # handling below decides the outcome of this step.
          set +e
          set -uo pipefail
          if [ ! -f bench-results/arrow-flight-e2e.csv ]; then
            echo "::warning::no PR bench CSV; skipping same-runner main baseline."
            exit 0
          fi
          ORIG_REF=$(git rev-parse HEAD) || {
            echo "::warning::could not resolve the current checkout; skipping main baseline."
            exit 0
          }
          # Park the PR's outputs; main's re-run writes a fresh bench-results/.
          # Nothing has been modified yet if this fails, so just bail out.
          if ! mv bench-results bench-results-pr; then
            echo "::warning::could not park the PR bench results; skipping main baseline."
            exit 0
          fi
          restore() {
            # Discard main's bench-results/ only when the parked PR copy is
            # there to take its place; never delete results that cannot be
            # restored.
            if [ -d bench-results-pr ]; then
              rm -rf bench-results
              mv bench-results-pr bench-results 2>/dev/null || true
            fi
            git checkout --force "$ORIG_REF" 2>/dev/null || true
          }
          trap restore EXIT
          if ! git checkout --force "$BASE_SHA"; then
            echo "::warning::could not check out base SHA $BASE_SHA; skipping main baseline."
            exit 0
          fi
          # Re-sync Python deps to main's requirements: the deps installed
          # earlier are the PR's, and the bench subprocess imports must match
          # the main code we're about to compile and run. Without this the
          # "main" baseline would run main's Scala/Python sources against the
          # PR's pinned Python packages, which is not a clean main measurement.
          # (sbt recompiles main's Scala automatically when run-benchmarks.sh
          # invokes it below; only the pip deps need an explicit re-sync.)
          for req in requirements operator-requirements dev-requirements; do
            if [ -f "amber/${req}.txt" ]; then
              uv pip install --system --index-strategy unsafe-best-match -r "amber/${req}.txt" || {
                echo "::warning::main ${req} install failed; skipping main baseline."
                exit 0
              }
            fi
          done
          # Regenerate proto bindings against main's protos, then re-bench.
          bash bin/python-proto-gen.sh || { echo "::warning::main proto-gen failed; skipping main baseline."; exit 0; }
          if bash bin/run-benchmarks.sh && [ -f bench-results/arrow-flight-e2e.csv ]; then
            # Store the CSV and its SHA sidecar as a pair; a half-written
            # pair is removed so downstream never sees one without the other.
            if cp bench-results/arrow-flight-e2e.csv bench-results-pr/arrow-flight-e2e-main.csv &&
              printf '%s' "$BASE_SHA" > bench-results-pr/arrow-flight-e2e-main.commit.txt; then
              echo "captured same-runner main baseline at $BASE_SHA"
            else
              rm -f bench-results-pr/arrow-flight-e2e-main.csv bench-results-pr/arrow-flight-e2e-main.commit.txt
              echo "::warning::could not store the main baseline CSV; PR comment falls back to the gh-pages baseline."
            fi
          else
            echo "::warning::main baseline re-run failed; PR comment falls back to the gh-pages baseline."
          fi
          # trap restores the PR outputs (now incl. main CSV) plus original ref.

      - name: Stash PR number for downstream comment workflow
        # PR fork workflows can't comment (GitHub forces read-only token);
        # benchmarks-pr-comment.yml runs separately via workflow_run with
        # proper write access, and needs the PR number to find the target.
        # github.event.workflow_run.pull_requests is empty for fork PRs,
        # so we ferry the number via artifact.
        if: ${{ github.event_name == 'pull_request' && !cancelled() }}
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          # This step also runs after a failed bench (`!cancelled()`), in
          # which case bench-results/ may never have been created. Create
          # it so the PR number still reaches the uploaded artifact.
          mkdir -p bench-results
          echo "$PR_NUMBER" > bench-results/pr-number.txt

      - name: Render bench summary
        # Append the bench CSV, wrapped in a fenced `csv` block, to the
        # job summary so it shows on the workflow run page without further
        # clicks. Needs no extra permissions: the only thing written is
        # $GITHUB_STEP_SUMMARY.
        if: ${{ !cancelled() }}
        run: |
          csv=bench-results/arrow-flight-e2e.csv
          {
            printf '## Bench results (`%s` mode)\n\n' "$BENCH_MODE"
            if [ ! -f "$csv" ]; then
              echo "_(no bench-results/arrow-flight-e2e.csv produced)_"
            else
              printf '%s\n' '```csv'
              cat "$csv"
              printf '%s\n' '```'
            fi
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Upload bench artifacts
        # Upload everything under bench-results/ as a per-run artifact,
        # kept for 14 days. Runs after failed steps too; only a cancelled
        # run skips it.
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          path: bench-results/
          name: bench-results-${{ github.run_id }}
          retention-days: "14"

      # Publish to the gh-pages dashboard. auto-push + save-data-file are
      # gated on `schedule` ONLY: the daily full-grid run is the single
      # authoritative baseline writer. PR *and* push-to-main runs only emit
      # the job summary and the uploaded artifact, never touching the
      # tracked baseline. This is deliberate: each gh-pages write is a bot
      # commit (one per chart, so two per run), and persisting on every
      # merge to main flooded the repo's Pulse / all-branches commit count
      # with `github-action-benchmark` commits. The post-merge run still
      # gives quick signal via the rendered summary + artifact; only the
      # daily sweep persists. Adding a new benchmark = adding one publish
      # block below matching the JSON filename convention in
      # bin/run-benchmarks.sh.
      #
      # `skip-fetch-gh-pages: true` intentionally keeps baseline comparison
      # OFF for now. When flipped to false, the action will fetch the stored
      # gh-pages baseline, compare each run against main, post an alert
      # comment when a result regresses past `alert-threshold`, and (with
      # comment-on-alert / fail-on-alert) can block merge. We're deferring
      # that until the baseline has accumulated enough daily data points to
      # be trustworthy; turning it on is a deliberate follow-up to evaluate
      # later. auto-push on the daily schedule still appends to the branch.
      #
      # `continue-on-error: true` keeps any other gh-pages-side surprise
      # (permission glitches, transient git failures) from failing the
      # bench job overall — the bench data itself is already in the
      # uploaded artifact above.
      - name: Publish throughput
        # Skip when the bench produced no throughput JSON (e.g. it failed
        # upstream): there is nothing to publish, and running the action
        # against a missing input only adds a tolerated-but-noisy failure
        # on top of the real one. hashFiles() returns '' when no file
        # matches the path.
        if: ${{ !cancelled() && hashFiles('bench-results/arrow-flight-e2e-throughput.json') != '' }}
        continue-on-error: true
        uses: benchmark-action/github-action-benchmark@52576c92bccf6ac60c8223ec7eb2565637cae9ba # v1.22.1
        with:
          name: Arrow Flight E2E Throughput
          tool: customBiggerIsBetter
          output-file-path: bench-results/arrow-flight-e2e-throughput.json
          github-token: ${{ secrets.GITHUB_TOKEN }}
          # Only the scheduled run persists and pushes the data point.
          auto-push: ${{ github.event_name == 'schedule' }}
          save-data-file: ${{ github.event_name == 'schedule' }}
          skip-fetch-gh-pages: true
          gh-pages-branch: gh-pages
          benchmark-data-dir-path: dev/bench
          alert-threshold: "150%"
          # comment-on-alert needs pull-requests:write; skip and let
          # results show up via summary-always instead.
          comment-on-alert: false
          summary-always: true
      - name: Publish latency
        # Skip when the bench produced no latency JSON (e.g. it failed
        # upstream): there is nothing to publish, and running the action
        # against a missing input only adds a tolerated-but-noisy failure
        # on top of the real one. hashFiles() returns '' when no file
        # matches the path.
        if: ${{ !cancelled() && hashFiles('bench-results/arrow-flight-e2e-latency.json') != '' }}
        continue-on-error: true
        uses: benchmark-action/github-action-benchmark@52576c92bccf6ac60c8223ec7eb2565637cae9ba # v1.22.1
        with:
          name: Arrow Flight E2E Latency
          tool: customSmallerIsBetter
          output-file-path: bench-results/arrow-flight-e2e-latency.json
          github-token: ${{ secrets.GITHUB_TOKEN }}
          # Only the scheduled run persists and pushes the data point.
          auto-push: ${{ github.event_name == 'schedule' }}
          save-data-file: ${{ github.event_name == 'schedule' }}
          skip-fetch-gh-pages: true
          gh-pages-branch: gh-pages
          benchmark-data-dir-path: dev/bench
          alert-threshold: "150%"
          # comment-on-alert needs pull-requests:write, which this
          # workflow does not request; results surface through
          # summary-always instead.
          comment-on-alert: false
          summary-always: true