# Workflow file for the CI run "Several fixes and improvements for CI" (#324).

# Workflow header: display name, triggers, and environment shared by all jobs.
name: GPULlama3 Build & Run

# Runs on pushes to main and on pull requests that target main.
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
    types: [opened, synchronize, reopened]

# Workflow-level environment, visible to every job and step.
# NOTE(review): the absolute /opt paths are presumably pre-provisioned on the
# self-hosted runner; confirm against the runner setup.
env:
  JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3
  TORNADO_ROOT: ${{ github.workspace }}/GPULlama3.java/external/tornadovm
  LLAMA_ROOT: ${{ github.workspace }}
  GRAAL_JARS: /opt/graalJars
  MODELS_DIR: /opt/models
  # History file committed back to the repo on push to main
  PERF_HISTORY_FILE: docs/perf-history.jsonl
jobs:
  # Gate job that the build-and-run job depends on (via `needs`).
  # Only runs in the upstream repository, not in forks.
  code-quality:
    if: github.repository == 'beehive-lab/GPULlama3.java'
    runs-on: self-hosted
    timeout-minutes: 30
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4

      - name: Check code formatting (Spotless)
        run: |
          cd "${{ github.workspace }}"
          # NOTE: the Spotless check below is commented out, so this step is
          # currently a no-op and the job always passes after checkout.
          # ./mvnw -T12C -Pspotless spotless:check
# ── Job: build GPULlama3.java and run inference once per backend ─────────────
  build-and-run:
    if: github.repository == 'beehive-lab/GPULlama3.java'
    runs-on: [self-hosted]
    needs: code-quality
    timeout-minutes: 30
    strategy:
      # Cancel the remaining matrix jobs as soon as one backend fails.
      fail-fast: true
      matrix:
        backend:
          - name: opencl
          - name: ptx
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4

      # Local action from this repository; receives the matrix backend name.
      - name: Setup TornadoVM
        uses: ./.github/actions/setup-tornadovm
        with:
          backend: ${{ matrix.backend.name }}

      # NOTE(review): TORNADOVM_HOME and the `tornado` command are presumably
      # provided by the Setup TornadoVM step; confirm in that action.
      - name: Build GPULlama3.java
        run: |
          cd "${{ github.workspace }}"
          echo "Using TORNADOVM_HOME=$TORNADOVM_HOME"
          tornado --version
          ./mvnw clean package -DskipTests
# ── Llama-3.2-1B: standard + prefill-decode variants, all backends ──────────
      # Each step invokes the local run-inference action for the current
      # matrix backend and names a per-configuration metrics file under the
      # runner's temp directory.
      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Standard
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.json

      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: prefill-decode
          flags: --with-prefill-decode
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.json

      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: batch-prefill-decode
          flags: --with-prefill-decode --batch-prefill-size 32
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.json
# ── PTX-only: CUDA-graph variants ────────────────────────────────────────
      # These steps are skipped on every matrix entry except `ptx`, so the
      # backend and metrics file names are hard-coded to ptx.
      - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode-CUDA-Graphs
        if: matrix.backend.name == 'ptx'
        uses: ./.github/actions/run-inference
        with:
          backend: ptx
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: prefill-decode-cuda-graphs
          flags: --with-prefill-decode --cuda-graphs
          metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.json

      - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode-CUDA-Graphs
        if: matrix.backend.name == 'ptx'
        uses: ./.github/actions/run-inference
        with:
          backend: ptx
          model_file: Llama-3.2-1B-Instruct-F16.gguf
          model: Llama-3.2-1B-Instruct
          quantization: F16
          configuration: batch-prefill-decode-cuda-graphs
          flags: --with-prefill-decode --batch-prefill-size 32 --cuda-graphs
          metrics_file: ${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json
# ── Additional models — standard inference, all backends ─────────────────
      # FP16 models: one run-inference step per model, `standard`
      # configuration, executed for every matrix backend.
      - name: FP16 - Run Qwen3-4B-f16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Qwen3-4B-f16.gguf
          model: Qwen3-4B
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.json

      - name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Mistral-7B-Instruct-v0.3.fp16.gguf
          model: Mistral-7B-Instruct-v0.3
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.json

      - name: FP16 - Run Qwen2.5-1.5b-instruct-fp16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: qwen2.5-1.5b-instruct-fp16.gguf
          model: Qwen2.5-1.5B-Instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.json

      - name: FP16 - Run Phi-3-mini-4k-instruct-fp16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Phi-3-mini-4k-instruct-fp16.gguf
          model: Phi-3-mini-4k-instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.json

      - name: FP16 - Run Granite-3.2-2b-instruct-f16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-3.2-2b-instruct-f16.gguf
          model: Granite-3.2-2B-Instruct
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.json

      - name: FP16 - Run Granite-4.0-1b-F16.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-4.0-1b-F16.gguf
          model: Granite-4.0-1B
          quantization: F16
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.json
# ── Q8_0 models — standard inference, all backends ───────────────────────
      # One run-inference step per Q8_0 model file, `standard` configuration,
      # executed for every matrix backend.
      - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Llama-3.2-1B-Instruct-Q8_0.gguf
          model: Llama-3.2-1B-Instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.json

      - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Qwen3-0.6B-Q8_0.gguf
          model: Qwen3-0.6B
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.json

      - name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Phi-3-mini-4k-instruct-Q8_0.gguf
          model: Phi-3-mini-4k-instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.json

      - name: Q8 - Run Qwen2.5-1.5b-instruct-q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: qwen2.5-1.5b-instruct-q8_0.gguf
          model: Qwen2.5-1.5B-Instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.json

      - name: Q8 - Run Mistral-7B-Instruct-v0.3.Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: Mistral-7B-Instruct-v0.3.Q8_0.gguf
          model: Mistral-7B-Instruct-v0.3
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.json

      - name: Q8 - Run Granite-3.2-2b-instruct-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-3.2-2b-instruct-Q8_0.gguf
          model: Granite-3.2-2B-Instruct
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.json

      - name: Q8 - Run Granite-4.0-1b-Q8_0.gguf
        uses: ./.github/actions/run-inference
        with:
          backend: ${{ matrix.backend.name }}
          model_file: granite-4.0-1b-Q8_0.gguf
          model: Granite-4.0-1B
          quantization: Q8_0
          configuration: standard
          metrics_file: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.json
# ── Upload metrics for the publish job ────────────────────────────────────
      # `always()` uploads whatever metrics files exist even when an earlier
      # step failed; a run that produced no files only logs a warning.
      # The artifact name carries the backend and run id so the publish job
      # can select this run's artifacts with a pattern.
      - name: Upload metrics artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: metrics-${{ matrix.backend.name }}-${{ github.run_id }}
          path: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-*.json
          if-no-files-found: warn
# ── Separate job: collect all matrix metrics and update history ───────────────
  publish-performance-history:
    # Guard: only commit history on real pushes to main, not on PRs or forks.
    # Prevents duplicate entries from PR runs and avoids push-permission errors on forks.
    if: >-
      github.repository == 'beehive-lab/GPULlama3.java' &&
      github.event_name == 'push' &&
      github.ref == 'refs/heads/main'
    runs-on: [self-hosted]
    needs: build-and-run
    timeout-minutes: 15
    # The job pushes a commit to main, so the GITHUB_TOKEN needs write access
    # to repository contents regardless of the repository's default setting.
    permissions:
      contents: write
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4

      # Collect the per-backend artifacts of this run into a single directory.
      - name: Download metrics artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: metrics-*-${{ github.run_id }}
          path: ${{ runner.temp }}/metrics-artifacts
          merge-multiple: true

      # Run metadata is read from the default GITHUB_*/RUNNER_* environment
      # variables instead of being interpolated into the script text, so the
      # values are never parsed as shell code.
      - name: Append to performance history
        run: |
          python3 scripts/process_metrics.py \
            --metrics-dir "$RUNNER_TEMP/metrics-artifacts" \
            --commit "$GITHUB_SHA" \
            --branch "$GITHUB_REF_NAME" \
            --run-id "$GITHUB_RUN_ID" \
            --run-number "$GITHUB_RUN_NUMBER" \
            --run-attempt "$GITHUB_RUN_ATTEMPT" \
            --workflow "$GITHUB_WORKFLOW" \
            --history "$PERF_HISTORY_FILE"

      - name: Commit performance history
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add "$PERF_HISTORY_FILE"
          if git diff --cached --quiet; then
            echo "No history changes to commit"
            exit 0
          fi
          git commit -m "perf: record run #${GITHUB_RUN_NUMBER} @ ${GITHUB_SHA::8}"
          # main may have moved since checkout: rebase onto it and push, with
          # up to three attempts and a growing delay between them.
          for attempt in 1 2 3; do
            if git pull --rebase origin main && git push origin HEAD:main; then
              exit 0
            fi
            # A failed rebase leaves the repository mid-rebase, which would
            # make every later `git pull --rebase` fail; reset that state.
            git rebase --abort 2>/dev/null || true
            if [ "$attempt" -lt 3 ]; then
              echo "Attempt $attempt failed, retrying in $((attempt * 5))s..."
              sleep $((attempt * 5))
            fi
          done
          echo "::error::Failed to push after 3 attempts"
          exit 1