Merge pull request #114 from beehive-lab/ci/metrics-history #322
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow triggered by pushes and pull requests targeting main.
name: GPULlama3 Build & Run

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
    types:
      - opened
      - synchronize
      - reopened

# Workflow-wide environment shared by every job below.
env:
  JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3
  TORNADO_ROOT: ${{ github.workspace }}/GPULlama3.java/external/tornadovm
  LLAMA_ROOT: ${{ github.workspace }}
  GRAAL_JARS: /opt/graalJars
  MODELS_DIR: /opt/models
  # History file committed back to the repo on push to main
  PERF_HISTORY_FILE: docs/perf-history.jsonl

jobs:
| code-quality: | |
| if: github.repository == 'beehive-lab/GPULlama3.java' | |
| runs-on: self-hosted | |
| timeout-minutes: 30 | |
| steps: | |
| - name: Checkout GPULlama3 | |
| uses: actions/checkout@v4 | |
| - name: Check code formatting (Spotless) | |
| run: | | |
| cd ${{ github.workspace }} | |
| # ./mvnw -T12C -Pspotless spotless:check | |
| build-and-run: | |
| if: github.repository == 'beehive-lab/GPULlama3.java' | |
| runs-on: [self-hosted] | |
| needs: code-quality | |
| timeout-minutes: 30 | |
| strategy: | |
| fail-fast: true | |
| matrix: | |
| backend: | |
| - name: opencl | |
| - name: ptx | |
| steps: | |
| - name: Checkout GPULlama3 | |
| uses: actions/checkout@v4 | |
| - name: Clone TornadoVM master | |
| run: | | |
| git clone --depth 1 --branch master \ | |
| https://github.com/beehive-lab/TornadoVM.git \ | |
| $TORNADO_ROOT | |
| - name: Set up Python venv for TornadoVM | |
| run: | | |
| python3 -m venv $TORNADO_ROOT/venv | |
| source $TORNADO_ROOT/venv/bin/activate | |
| python --version | |
| - name: Build TornadoVM | |
| run: | | |
| cd $TORNADO_ROOT | |
| mkdir -p graalJars && cp $GRAAL_JARS/* graalJars/ | |
| source venv/bin/activate | |
| echo "=== Building TornadoVM ===" | |
| make BACKEND=${{ matrix.backend.name }} | |
| echo "=== Searching for TornadoVM SDK directory ===" | |
| SDK_DIR=$(find dist -type d -maxdepth 3 -path "*/tornadovm-*-${{ matrix.backend.name }}" | head -n 1) | |
| if [ -z "$SDK_DIR" ]; then | |
| echo "::error::Could not locate TornadoVM SDK directory!" | |
| find dist -maxdepth 5 -type d | |
| exit 1 | |
| fi | |
| FULL_SDK="${PWD}/${SDK_DIR}" | |
| echo "Detected TornadoVM SDK: $FULL_SDK" | |
| # Export for current shell session | |
| export TORNADOVM_HOME="$FULL_SDK" | |
| export PATH="$FULL_SDK/bin:$JAVA_HOME/bin:$PATH" | |
| # Save for subsequent steps | |
| echo "TORNADOVM_HOME=$FULL_SDK" >> $GITHUB_ENV | |
| echo "PATH=$PATH" >> $GITHUB_ENV | |
| echo "=== Checking tornado CLI ===" | |
| which tornado || { echo "::error::tornado not in PATH"; exit 1; } | |
| tornado --devices | |
| - name: Build GPULlama3.java | |
| run: | | |
| cd ${{ github.workspace }} | |
| echo "Using TORNADOVM_HOME=$TORNADOVM_HOME" | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| tornado --version | |
| ./mvnw clean package -DskipTests | |
| - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Standard | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=Llama-3.2-1B-Instruct-F16.gguf \ | |
| model=Llama-3.2-1B-Instruct \ | |
| quantization=F16 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \ | |
| --prompt "Say hello" \ | |
| --with-prefill-decode | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=Llama-3.2-1B-Instruct-F16.gguf \ | |
| model=Llama-3.2-1B-Instruct \ | |
| quantization=F16 \ | |
| configuration=prefill-decode \ | |
| "flags=--with-prefill-decode" \ | |
| prompt="Say hello" | |
| - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \ | |
| --prompt "Say hello" \ | |
| --with-prefill-decode --batch-prefill-size 32 | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=Llama-3.2-1B-Instruct-F16.gguf \ | |
| model=Llama-3.2-1B-Instruct \ | |
| quantization=F16 \ | |
| configuration=batch-prefill-decode \ | |
| "flags=--with-prefill-decode --batch-prefill-size 32" \ | |
| prompt="Say hello" | |
| # ── PTX-only: CUDA-graph variants ──────────────────────────────────────── | |
| - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode-CUDA-Graphs | |
| if: matrix.backend.name == 'ptx' | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --ptx \ | |
| --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \ | |
| --prompt "Say hello" \ | |
| --with-prefill-decode \ | |
| --cuda-graphs | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.meta.json" \ | |
| backend=ptx \ | |
| task=llama-inference \ | |
| model_file=Llama-3.2-1B-Instruct-F16.gguf \ | |
| model=Llama-3.2-1B-Instruct \ | |
| quantization=F16 \ | |
| configuration=prefill-decode-cuda-graphs \ | |
| "flags=--with-prefill-decode --cuda-graphs" \ | |
| prompt="Say hello" | |
| - name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode-CUDA-Graphs | |
| if: matrix.backend.name == 'ptx' | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --ptx \ | |
| --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \ | |
| --prompt "Say hello" \ | |
| --with-prefill-decode --batch-prefill-size 32 \ | |
| --cuda-graphs | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.meta.json" \ | |
| backend=ptx \ | |
| task=llama-inference \ | |
| model_file=Llama-3.2-1B-Instruct-F16.gguf \ | |
| model=Llama-3.2-1B-Instruct \ | |
| quantization=F16 \ | |
| configuration=batch-prefill-decode-cuda-graphs \ | |
| "flags=--with-prefill-decode --batch-prefill-size 32 --cuda-graphs" \ | |
| prompt="Say hello" | |
| # ── Additional models — standard inference, all backends ───────────────── | |
| - name: FP16 - Run Qwen3-4B-f16.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/Qwen3-4B-f16.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=Qwen3-4B-f16.gguf \ | |
| model=Qwen3-4B \ | |
| quantization=F16 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/Mistral-7B-Instruct-v0.3.fp16.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=Mistral-7B-Instruct-v0.3.fp16.gguf \ | |
| model=Mistral-7B-Instruct-v0.3 \ | |
| quantization=F16 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: FP16 - Run Qwen2.5-1.5b-instruct-fp16.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/qwen2.5-1.5b-instruct-fp16.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=qwen2.5-1.5b-instruct-fp16.gguf \ | |
| model=Qwen2.5-1.5B-Instruct \ | |
| quantization=F16 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: FP16 - Run Phi-3-mini-4k-instruct-fp16.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/Phi-3-mini-4k-instruct-fp16.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=Phi-3-mini-4k-instruct-fp16.gguf \ | |
| model=Phi-3-mini-4k-instruct \ | |
| quantization=F16 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: FP16 - Run Granite-3.2-2b-instruct-f16.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/granite-3.2-2b-instruct-f16.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=granite-3.2-2b-instruct-f16.gguf \ | |
| model=Granite-3.2-2B-Instruct \ | |
| quantization=F16 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: FP16 - Run Granite-4.0-1b-F16.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/granite-4.0-1b-F16.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=granite-4.0-1b-F16.gguf \ | |
| model=Granite-4.0-1B \ | |
| quantization=F16 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \ | |
| model=Llama-3.2-1B-Instruct \ | |
| quantization=Q8_0 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/Qwen3-0.6B-Q8_0.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=Qwen3-0.6B-Q8_0.gguf \ | |
| model=Qwen3-0.6B \ | |
| quantization=Q8_0 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/Phi-3-mini-4k-instruct-Q8_0.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=Phi-3-mini-4k-instruct-Q8_0.gguf \ | |
| model=Phi-3-mini-4k-instruct \ | |
| quantization=Q8_0 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: Q8 - Run Qwen2.5-1.5b-instruct-q8_0.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/qwen2.5-1.5b-instruct-q8_0.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=qwen2.5-1.5b-instruct-q8_0.gguf \ | |
| model=Qwen2.5-1.5B-Instruct \ | |
| quantization=Q8_0 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: Q8 - Mistral-7B-Instruct-v0.3.Q8_0.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/Mistral-7B-Instruct-v0.3.Q8_0.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=Mistral-7B-Instruct-v0.3.Q8_0.gguf \ | |
| model=Mistral-7B-Instruct-v0.3 \ | |
| quantization=Q8_0 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: Q8 - Run Granite-3.2-2b-instruct-Q8_0.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/granite-3.2-2b-instruct-Q8_0.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=granite-3.2-2b-instruct-Q8_0.gguf \ | |
| model=Granite-3.2-2B-Instruct \ | |
| quantization=Q8_0 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| - name: Q8 - Run Granite-4.0-1b-Q8_0.gguf | |
| env: | |
| JAVA_TOOL_OPTIONS: >- | |
| -Dllama.metrics.format=json | |
| -Dllama.metrics.output=file | |
| -Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.json | |
| run: | | |
| cd ${{ github.workspace }} | |
| export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH" | |
| ./llama-tornado --gpu --${{ matrix.backend.name }} \ | |
| --model $MODELS_DIR/granite-4.0-1b-Q8_0.gguf \ | |
| --prompt "Say hello" | |
| python3 scripts/write_metrics_sidecar.py \ | |
| --out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.meta.json" \ | |
| backend="${{ matrix.backend.name }}" \ | |
| task=llama-inference \ | |
| model_file=granite-4.0-1b-Q8_0.gguf \ | |
| model=Granite-4.0-1B \ | |
| quantization=Q8_0 \ | |
| configuration=standard \ | |
| flags="" \ | |
| prompt="Say hello" | |
| # ── Upload metrics for the publish job ──────────────────────────────────── | |
| - name: Upload metrics artifacts | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: metrics-${{ matrix.backend.name }}-${{ github.run_id }} | |
| path: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-*.json | |
| if-no-files-found: warn | |
| # ── Separate job: collect all matrix metrics and update history ─────────────── | |
| publish-performance-history: | |
| # Guard: only commit history on real pushes to main, not on PRs or forks. | |
| # Prevents duplicate entries from PR runs and avoids push-permission errors on forks. | |
| if: >- | |
| github.repository == 'beehive-lab/GPULlama3.java' && | |
| github.event_name == 'push' && | |
| github.ref == 'refs/heads/main' | |
| runs-on: [self-hosted] | |
| needs: build-and-run | |
| timeout-minutes: 15 | |
| steps: | |
| - name: Checkout GPULlama3 | |
| uses: actions/checkout@v4 | |
| - name: Download metrics artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| pattern: metrics-*-${{ github.run_id }} | |
| path: ${{ runner.temp }}/metrics-artifacts | |
| merge-multiple: true | |
| - name: Append to performance history | |
| run: | | |
| python3 scripts/process_metrics.py \ | |
| --metrics-dir "${{ runner.temp }}/metrics-artifacts" \ | |
| --commit "${{ github.sha }}" \ | |
| --branch "${{ github.ref_name }}" \ | |
| --run-id "${{ github.run_id }}" \ | |
| --run-number "${{ github.run_number }}" \ | |
| --run-attempt "${{ github.run_attempt }}" \ | |
| --workflow "${{ github.workflow }}" \ | |
| --history "$PERF_HISTORY_FILE" | |
| - name: Commit performance history | |
| run: | | |
| git config user.name "github-actions[bot]" | |
| git config user.email "github-actions[bot]@users.noreply.github.com" | |
| git add "$PERF_HISTORY_FILE" | |
| git diff --cached --quiet && echo "No history changes to commit" && exit 0 | |
| git commit -m "perf: record run #${{ github.run_number }} @ ${GITHUB_SHA::8}" | |
| for attempt in 1 2 3; do | |
| git pull --rebase origin main && git push && break || { | |
| [ $attempt -lt 3 ] && { echo "Attempt $attempt failed, retrying in $((attempt * 5))s..."; sleep $((attempt * 5)); } \ | |
| || { echo "::error::Failed to push after 3 attempts"; exit 1; } | |
| } | |
| done |