Several fixes and improvements for CI #324
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CI workflow for GPULlama3.java.
#
# Jobs, in dependency order:
#   1. code-quality                - checkout plus a (currently disabled)
#                                    Spotless formatting check.
#   2. build-and-run               - per-backend matrix (opencl, ptx): sets up
#                                    TornadoVM, builds with Maven, runs the
#                                    inference steps and uploads the metrics
#                                    JSON files they are told to write.
#   3. publish-performance-history - on pushes to main only: downloads the
#                                    metrics artifacts, appends them to the
#                                    history file and commits it back.
#
# Every job is guarded by a repository check so it does not run on forks.
name: GPULlama3 Build & Run

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
    types: [opened, synchronize, reopened]

env:
  # NOTE(review): the absolute /opt paths are presumably provisioned on the
  # self-hosted runner - confirm before moving to a different runner.
  JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3
  TORNADO_ROOT: ${{ github.workspace }}/GPULlama3.java/external/tornadovm
  LLAMA_ROOT: ${{ github.workspace }}
  GRAAL_JARS: /opt/graalJars
  MODELS_DIR: /opt/models
  # History file committed back to the repo on push to main
  PERF_HISTORY_FILE: docs/perf-history.jsonl

jobs:
  code-quality:
    if: github.repository == 'beehive-lab/GPULlama3.java'
    runs-on: self-hosted
    timeout-minutes: 30
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4

      # The Spotless invocation below is commented out, so this step currently
      # performs no check; re-enable the mvnw line to enforce formatting.
      - name: Check code formatting (Spotless)
        run: |
          cd "$GITHUB_WORKSPACE"
          # ./mvnw -T12C -Pspotless spotless:check

  build-and-run:
    if: github.repository == 'beehive-lab/GPULlama3.java'
    runs-on: self-hosted
    needs: code-quality
    timeout-minutes: 30
    strategy:
      fail-fast: true
      matrix:
        backend:
          - name: opencl
          - name: ptx
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4

      - name: Setup TornadoVM
        uses: ./.github/actions/setup-tornadovm
        with:
          backend: ${{ matrix.backend.name }}

      - name: Build GPULlama3.java
        run: |
          cd "$GITHUB_WORKSPACE"
          echo "Using TORNADOVM_HOME=$TORNADOVM_HOME"
          tornado --version
          ./mvnw clean package -DskipTests

      # ── Llama-3.2-1B: standard + prefill-decode variants, all backends ──────────
      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Standard
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.json

      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: prefill-decode
          flags: --with-prefill-decode
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.json

      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: batch-prefill-decode
          flags: --with-prefill-decode --batch-prefill-size 32
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.json

      # ── PTX-only: CUDA-graph variants ────────────────────────────────────────
      - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode-CUDA-Graphs
        if: matrix.backend.name == 'ptx'
        uses: ./.github/actions/run-inference
        with:
          backend: ptx
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: prefill-decode-cuda-graphs
          flags: --with-prefill-decode --cuda-graphs
          metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.json

      - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode-CUDA-Graphs
        if: matrix.backend.name == 'ptx'
        uses: ./.github/actions/run-inference
        with:
          backend: ptx
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: batch-prefill-decode-cuda-graphs
          flags: --with-prefill-decode --batch-prefill-size 32 --cuda-graphs
          metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json

      # ── Additional models — standard inference, all backends ─────────────────
      - name: FP16 - Run Qwen3-4B-f16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Qwen3-4B-f16.gguf
          model: Qwen3-4B
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.json

      - name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Mistral-7B-Instruct-v0.3.fp16.gguf
          model: Mistral-7B-Instruct-v0.3
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.json

      - name: FP16 - Run Qwen2.5-1.5b-instruct-fp16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: qwen2.5-1.5b-instruct-fp16.gguf
          model: Qwen2.5-1.5B-Instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.json

      - name: FP16 - Run Phi-3-mini-4k-instruct-fp16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Phi-3-mini-4k-instruct-fp16.gguf
          model: Phi-3-mini-4k-instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.json

      - name: FP16 - Run Granite-3.2-2b-instruct-f16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-3.2-2b-instruct-f16.gguf
          model: Granite-3.2-2B-Instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.json

      - name: FP16 - Run Granite-4.0-1b-F16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-4.0-1b-F16.gguf
          model: Granite-4.0-1B
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.json

      - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-Q8_0.gguf
          model: Llama-3.2-1B-Instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.json

      - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Qwen3-0.6B-Q8_0.gguf
          model: Qwen3-0.6B
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.json

      - name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Phi-3-mini-4k-instruct-Q8_0.gguf
          model: Phi-3-mini-4k-instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.json

      - name: Q8 - Run Qwen2.5-1.5b-instruct-q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: qwen2.5-1.5b-instruct-q8_0.gguf
          model: Qwen2.5-1.5B-Instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.json

      - name: Q8 - Run Mistral-7B-Instruct-v0.3.Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Mistral-7B-Instruct-v0.3.Q8_0.gguf
          model: Mistral-7B-Instruct-v0.3
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.json

      - name: Q8 - Run Granite-3.2-2b-instruct-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-3.2-2b-instruct-Q8_0.gguf
          model: Granite-3.2-2B-Instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.json

      - name: Q8 - Run Granite-4.0-1b-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-4.0-1b-Q8_0.gguf
          model: Granite-4.0-1B
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.json

      # ── Upload metrics for the publish job ────────────────────────────────────
      # `if: always()` uploads whatever metrics exist even when an earlier
      # inference step failed.
      - name: Upload metrics artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: metrics-${{ matrix.backend.name }}-${{ github.run_id }}
          path: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-*.json
          if-no-files-found: warn
          # The artifact name is keyed on run_id only; replace an existing
          # artifact of the same name instead of failing the upload.
          overwrite: true

  # ── Separate job: collect all matrix metrics and update history ───────────────
  publish-performance-history:
    # Guard: only commit history on real pushes to main, not on PRs or forks.
    # Prevents duplicate entries from PR runs and avoids push-permission errors on forks.
    if: >-
      github.repository == 'beehive-lab/GPULlama3.java' &&
      github.event_name == 'push' &&
      github.ref == 'refs/heads/main'
    runs-on: self-hosted
    needs: build-and-run
    timeout-minutes: 15
    # This job pushes a commit, so the workflow token needs write access to
    # repository contents regardless of the repository's default setting.
    permissions:
      contents: write
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4

      # Gathers the per-backend artifacts of this run into a single directory.
      - name: Download metrics artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: metrics-*-${{ github.run_id }}
          path: ${{ runner.temp }}/metrics-artifacts
          merge-multiple: true

      # The runner's default environment variables are used instead of inline
      # `${{ }}` expressions so the values reach the script as data and are
      # never spliced into the shell source.
      - name: Append to performance history
        run: |
          python3 scripts/process_metrics.py \
            --metrics-dir "$RUNNER_TEMP/metrics-artifacts" \
            --commit "$GITHUB_SHA" \
            --branch "$GITHUB_REF_NAME" \
            --run-id "$GITHUB_RUN_ID" \
            --run-number "$GITHUB_RUN_NUMBER" \
            --run-attempt "$GITHUB_RUN_ATTEMPT" \
            --workflow "$GITHUB_WORKFLOW" \
            --history "$PERF_HISTORY_FILE"

      - name: Commit performance history
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add "$PERF_HISTORY_FILE"

          # Nothing staged means the history file did not change: succeed early.
          if git diff --cached --quiet; then
            echo "No history changes to commit"
            exit 0
          fi

          git commit -m "perf: record run #${GITHUB_RUN_NUMBER} @ ${GITHUB_SHA::8}"

          # main may have moved while this run was in progress, so rebase onto
          # it and push, retrying up to three times with a growing delay.
          for attempt in 1 2 3; do
            if git pull --rebase origin main && git push origin HEAD:main; then
              exit 0
            fi

            # A failed rebase leaves the checkout mid-rebase, which would make
            # every later attempt fail too; reset to a clean state first.
            git rebase --abort 2>/dev/null || true

            if [ "$attempt" -lt 3 ]; then
              echo "Attempt $attempt failed, retrying in $((attempt * 5))s..."
              sleep $((attempt * 5))
            fi
          done

          echo "::error::Failed to push after 3 attempts"
          exit 1