Merge pull request #114 from beehive-lab/ci/metrics-history #322

Workflow file for this run

.github/workflows/build-and-run.yml at 2d442a0

	name: GPULlama3 Build & Run

	on:
	push:
	branches: [ main ]
	pull_request:
	branches: [ main ]
	types: [opened, synchronize, reopened]

	env:
	JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3
	TORNADO_ROOT: ${{ github.workspace }}/GPULlama3.java/external/tornadovm
	LLAMA_ROOT: ${{ github.workspace }}
	GRAAL_JARS: /opt/graalJars
	MODELS_DIR: /opt/models
	# History file committed back to the repo on push to main
	PERF_HISTORY_FILE: docs/perf-history.jsonl

	jobs:
	code-quality:
	if: github.repository == 'beehive-lab/GPULlama3.java'
	runs-on: self-hosted
	timeout-minutes: 30

	steps:
	- name: Checkout GPULlama3
	uses: actions/checkout@v4

	- name: Check code formatting (Spotless)
	run: \|
	cd ${{ github.workspace }}
	# ./mvnw -T12C -Pspotless spotless:check

	build-and-run:
	if: github.repository == 'beehive-lab/GPULlama3.java'
	runs-on: [self-hosted]
	needs: code-quality
	timeout-minutes: 30

	strategy:
	fail-fast: true
	matrix:
	backend:
	- name: opencl
	- name: ptx

	steps:
	- name: Checkout GPULlama3
	uses: actions/checkout@v4

	- name: Clone TornadoVM master
	run: \|
	git clone --depth 1 --branch master \
	https://github.com/beehive-lab/TornadoVM.git \
	$TORNADO_ROOT

	- name: Set up Python venv for TornadoVM
	run: \|
	python3 -m venv $TORNADO_ROOT/venv
	source $TORNADO_ROOT/venv/bin/activate
	python --version

	- name: Build TornadoVM
	run: \|
	cd $TORNADO_ROOT
	mkdir -p graalJars && cp $GRAAL_JARS/* graalJars/
	source venv/bin/activate
	echo "=== Building TornadoVM ==="

	make BACKEND=${{ matrix.backend.name }}

	echo "=== Searching for TornadoVM SDK directory ==="
	SDK_DIR=$(find dist -type d -maxdepth 3 -path "/tornadovm--${{ matrix.backend.name }}" \| head -n 1)
	if [ -z "$SDK_DIR" ]; then
	echo "::error::Could not locate TornadoVM SDK directory!"
	find dist -maxdepth 5 -type d
	exit 1
	fi
	FULL_SDK="${PWD}/${SDK_DIR}"
	echo "Detected TornadoVM SDK: $FULL_SDK"

	# Export for current shell session
	export TORNADOVM_HOME="$FULL_SDK"
	export PATH="$FULL_SDK/bin:$JAVA_HOME/bin:$PATH"

	# Save for subsequent steps
	echo "TORNADOVM_HOME=$FULL_SDK" >> $GITHUB_ENV
	echo "PATH=$PATH" >> $GITHUB_ENV

	echo "=== Checking tornado CLI ==="
	which tornado \|\| { echo "::error::tornado not in PATH"; exit 1; }
	tornado --devices

	- name: Build GPULlama3.java
	run: \|
	cd ${{ github.workspace }}
	echo "Using TORNADOVM_HOME=$TORNADOVM_HOME"
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	tornado --version
	./mvnw clean package -DskipTests

	- name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Standard
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=Llama-3.2-1B-Instruct-F16.gguf \
	model=Llama-3.2-1B-Instruct \
	quantization=F16 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
	--prompt "Say hello" \
	--with-prefill-decode
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-prefill-decode.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=Llama-3.2-1B-Instruct-F16.gguf \
	model=Llama-3.2-1B-Instruct \
	quantization=F16 \
	configuration=prefill-decode \
	"flags=--with-prefill-decode" \
	prompt="Say hello"

	- name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
	--prompt "Say hello" \
	--with-prefill-decode --batch-prefill-size 32
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-f16-batch-prefill-decode.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=Llama-3.2-1B-Instruct-F16.gguf \
	model=Llama-3.2-1B-Instruct \
	quantization=F16 \
	configuration=batch-prefill-decode \
	"flags=--with-prefill-decode --batch-prefill-size 32" \
	prompt="Say hello"

	# ── PTX-only: CUDA-graph variants ────────────────────────────────────────
	- name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Prefill-Decode-CUDA-Graphs
	if: matrix.backend.name == 'ptx'
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --ptx \
	--model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
	--prompt "Say hello" \
	--with-prefill-decode \
	--cuda-graphs
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-ptx-llama-1b-f16-prefill-decode-cuda-graphs.meta.json" \
	backend=ptx \
	task=llama-inference \
	model_file=Llama-3.2-1B-Instruct-F16.gguf \
	model=Llama-3.2-1B-Instruct \
	quantization=F16 \
	configuration=prefill-decode-cuda-graphs \
	"flags=--with-prefill-decode --cuda-graphs" \
	prompt="Say hello"

	- name: PTX - FP16 - Run Llama-3.2-1B-Instruct-F16.gguf - Batch-Prefill-Decode-CUDA-Graphs
	if: matrix.backend.name == 'ptx'
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --ptx \
	--model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
	--prompt "Say hello" \
	--with-prefill-decode --batch-prefill-size 32 \
	--cuda-graphs
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-ptx-llama-1b-f16-batch-prefill-decode-cuda-graphs.meta.json" \
	backend=ptx \
	task=llama-inference \
	model_file=Llama-3.2-1B-Instruct-F16.gguf \
	model=Llama-3.2-1B-Instruct \
	quantization=F16 \
	configuration=batch-prefill-decode-cuda-graphs \
	"flags=--with-prefill-decode --batch-prefill-size 32 --cuda-graphs" \
	prompt="Say hello"

	# ── Additional models — standard inference, all backends ─────────────────
	- name: FP16 - Run Qwen3-4B-f16.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/Qwen3-4B-f16.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-4b-f16-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=Qwen3-4B-f16.gguf \
	model=Qwen3-4B \
	quantization=F16 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/Mistral-7B-Instruct-v0.3.fp16.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-fp16-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=Mistral-7B-Instruct-v0.3.fp16.gguf \
	model=Mistral-7B-Instruct-v0.3 \
	quantization=F16 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: FP16 - Run Qwen2.5-1.5b-instruct-fp16.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/qwen2.5-1.5b-instruct-fp16.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-fp16-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=qwen2.5-1.5b-instruct-fp16.gguf \
	model=Qwen2.5-1.5B-Instruct \
	quantization=F16 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: FP16 - Run Phi-3-mini-4k-instruct-fp16.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/Phi-3-mini-4k-instruct-fp16.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-fp16-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=Phi-3-mini-4k-instruct-fp16.gguf \
	model=Phi-3-mini-4k-instruct \
	quantization=F16 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: FP16 - Run Granite-3.2-2b-instruct-f16.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/granite-3.2-2b-instruct-f16.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-f16-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=granite-3.2-2b-instruct-f16.gguf \
	model=Granite-3.2-2B-Instruct \
	quantization=F16 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: FP16 - Run Granite-4.0-1b-F16.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/granite-4.0-1b-F16.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-f16-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=granite-4.0-1b-F16.gguf \
	model=Granite-4.0-1B \
	quantization=F16 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-llama-1b-q8-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=Llama-3.2-1B-Instruct-Q8_0.gguf \
	model=Llama-3.2-1B-Instruct \
	quantization=Q8_0 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/Qwen3-0.6B-Q8_0.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen3-0-6b-q8-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=Qwen3-0.6B-Q8_0.gguf \
	model=Qwen3-0.6B \
	quantization=Q8_0 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/Phi-3-mini-4k-instruct-Q8_0.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-phi3-mini-q8-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=Phi-3-mini-4k-instruct-Q8_0.gguf \
	model=Phi-3-mini-4k-instruct \
	quantization=Q8_0 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: Q8 - Run Qwen2.5-1.5b-instruct-q8_0.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/qwen2.5-1.5b-instruct-q8_0.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-qwen2-5-1-5b-q8-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=qwen2.5-1.5b-instruct-q8_0.gguf \
	model=Qwen2.5-1.5B-Instruct \
	quantization=Q8_0 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: Q8 - Mistral-7B-Instruct-v0.3.Q8_0.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/Mistral-7B-Instruct-v0.3.Q8_0.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-mistral-7b-q8-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=Mistral-7B-Instruct-v0.3.Q8_0.gguf \
	model=Mistral-7B-Instruct-v0.3 \
	quantization=Q8_0 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: Q8 - Run Granite-3.2-2b-instruct-Q8_0.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/granite-3.2-2b-instruct-Q8_0.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-3-2-2b-q8-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=granite-3.2-2b-instruct-Q8_0.gguf \
	model=Granite-3.2-2B-Instruct \
	quantization=Q8_0 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	- name: Q8 - Run Granite-4.0-1b-Q8_0.gguf
	env:
	JAVA_TOOL_OPTIONS: >-
	-Dllama.metrics.format=json
	-Dllama.metrics.output=file
	-Dllama.metrics.file=${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.json
	run: \|
	cd ${{ github.workspace }}
	export PATH="$TORNADOVM_HOME/bin:$JAVA_HOME/bin:$PATH"
	./llama-tornado --gpu --${{ matrix.backend.name }} \
	--model $MODELS_DIR/granite-4.0-1b-Q8_0.gguf \
	--prompt "Say hello"
	python3 scripts/write_metrics_sidecar.py \
	--out "${{ runner.temp }}/metrics-${{ matrix.backend.name }}-granite-4-0-1b-q8-standard.meta.json" \
	backend="${{ matrix.backend.name }}" \
	task=llama-inference \
	model_file=granite-4.0-1b-Q8_0.gguf \
	model=Granite-4.0-1B \
	quantization=Q8_0 \
	configuration=standard \
	flags="" \
	prompt="Say hello"

	# ── Upload metrics for the publish job ────────────────────────────────────
	- name: Upload metrics artifacts
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: metrics-${{ matrix.backend.name }}-${{ github.run_id }}
	path: ${{ runner.temp }}/metrics-${{ matrix.backend.name }}-*.json
	if-no-files-found: warn

	# ── Separate job: collect all matrix metrics and update history ───────────────
	publish-performance-history:
	# Guard: only commit history on real pushes to main, not on PRs or forks.
	# Prevents duplicate entries from PR runs and avoids push-permission errors on forks.
	if: >-
	github.repository == 'beehive-lab/GPULlama3.java' &&
	github.event_name == 'push' &&
	github.ref == 'refs/heads/main'
	runs-on: [self-hosted]
	needs: build-and-run
	timeout-minutes: 15

	steps:
	- name: Checkout GPULlama3
	uses: actions/checkout@v4

	- name: Download metrics artifacts
	uses: actions/download-artifact@v4
	with:
	pattern: metrics-*-${{ github.run_id }}
	path: ${{ runner.temp }}/metrics-artifacts
	merge-multiple: true

	- name: Append to performance history
	run: \|
	python3 scripts/process_metrics.py \
	--metrics-dir "${{ runner.temp }}/metrics-artifacts" \
	--commit "${{ github.sha }}" \
	--branch "${{ github.ref_name }}" \
	--run-id "${{ github.run_id }}" \
	--run-number "${{ github.run_number }}" \
	--run-attempt "${{ github.run_attempt }}" \
	--workflow "${{ github.workflow }}" \
	--history "$PERF_HISTORY_FILE"

	- name: Commit performance history
	run: \|
	git config user.name "github-actions[bot]"
	git config user.email "github-actions[bot]@users.noreply.github.com"
	git add "$PERF_HISTORY_FILE"
	git diff --cached --quiet && echo "No history changes to commit" && exit 0
	git commit -m "perf: record run #${{ github.run_number }} @ ${GITHUB_SHA::8}"
	for attempt in 1 2 3; do
	git pull --rebase origin main && git push && break \|\| {
	[ $attempt -lt 3 ] && { echo "Attempt $attempt failed, retrying in $((attempt * 5))s..."; sleep $((attempt * 5)); } \
	\|\| { echo "::error::Failed to push after 3 attempts"; exit 1; }
	}
	done

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Merge pull request #114 from beehive-lab/ci/metrics-history #322

Workflow file

Merge pull request #114 from beehive-lab/ci/metrics-history #322

Uh oh!

Workflow file for this run