diff --git a/.github/build.sh b/.github/build.sh
index 23fef1d2..dec29e86 100755
--- a/.github/build.sh
+++ b/.github/build.sh
@@ -21,13 +21,19 @@ fi
 # while macOS installs it via brew in the workflow. Best-effort and inert-safe: any failure
 # leaves sccache absent, so the build just proceeds uncached. The static musl binary runs in
 # any x86_64 Linux container (the cross-compile host is always x86_64).
+#
+# SCCACHE_DL_VERSION is overridable per-job, so a container that crashes one sccache build can
+# try another without editing this script (the in-container panic that stalled phase 2 was on
+# v0.8.2; v0.16.0 is the latest release and the default). A wrong/unavailable version just fails
+# the `curl -f` and falls back to an uncached build, so bumping it can never red a build.
+SCCACHE_DL_VERSION="${SCCACHE_DL_VERSION:-0.16.0}"
 if [ "${USE_CACHE:-true}" = "true" ] && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE_GHA_ENABLED:-}" ] \
    && ! command -v sccache >/dev/null 2>&1 \
    && [ "$(uname -s)" = "Linux" ] && [ "$(uname -m)" = "x86_64" ]; then
-  SCCACHE_REL="sccache-v0.8.2-x86_64-unknown-linux-musl"
+  SCCACHE_REL="sccache-v${SCCACHE_DL_VERSION}-x86_64-unknown-linux-musl"
   echo "build.sh: fetching ${SCCACHE_REL} (no sccache on PATH)..."
   if curl -fsSL --proto =https --proto-redir =https \
-        "https://github.com/mozilla/sccache/releases/download/v0.8.2/${SCCACHE_REL}.tar.gz" \
+        "https://github.com/mozilla/sccache/releases/download/v${SCCACHE_DL_VERSION}/${SCCACHE_REL}.tar.gz" \
         -o /tmp/sccache.tgz && tar -xzf /tmp/sccache.tgz -C /tmp; then
     export PATH="/tmp/${SCCACHE_REL}:$PATH"
     echo "build.sh: sccache -> $(command -v sccache || echo 'still missing')"
@@ -36,14 +42,55 @@ if [ "${USE_CACHE:-true}" = "true" ] && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE
   fi
 fi
 
+# Health-check before trusting sccache as the compiler launcher. Because sccache *is* the
+# launcher (cmake runs `sccache <compiler> ...` for every TU), a present-but-crashing sccache
+# fails every compile and reds the whole build — exactly the in-container panic that stalled
+# phase 2 (the static-musl binary panicked while wrapping the cross-compiler, failing ggml.c.o).
+# The probe runs $CC (else the first of cc/gcc/clang on PATH) through sccache on a trivial C TU;
+# only if that succeeds is the launcher enabled. On failure it logs the captured output (the Rust
+# panic backtrace, plus the detached server's SCCACHE_ERROR_LOG when a job sets one) and the build
+# runs WITHOUT the cache — an uncached -O3 build that still goes green. This closes the gap the old
+# absent-only guard left: it handled sccache *missing*, not sccache *crashing*.
+sccache_can_wrap_compiler() {
+  probe_cc="${CC:-}"
+  if [ -z "$probe_cc" ]; then
+    for c in cc gcc clang; do
+      if command -v "$c" >/dev/null 2>&1; then probe_cc="$c"; break; fi
+    done
+  fi
+  if [ -z "$probe_cc" ]; then
+    echo "build.sh: sccache probe: no C compiler on PATH to probe; building uncached"
+    return 1
+  fi
+  probe_dir="$(mktemp -d 2>/dev/null || echo "/tmp/sccache-probe.$$")"
+  mkdir -p "$probe_dir" || return 1
+  printf 'int main(void){return 0;}\n' > "$probe_dir/probe.c"
+  probe_out="$(sccache "$probe_cc" -c "$probe_dir/probe.c" -o "$probe_dir/probe.o" 2>&1)"
+  probe_rc=$?
+  rm -rf "$probe_dir"
+  if [ "$probe_rc" -ne 0 ]; then
+    echo "build.sh: sccache probe FAILED (rc=${probe_rc}) wrapping '${probe_cc}' — building WITHOUT cache."
+    [ -n "$probe_out" ] && printf '%s\n' "$probe_out" | sed 's/^/build.sh:   sccache-probe| /'
+    if [ -n "${SCCACHE_ERROR_LOG:-}" ] && [ -f "${SCCACHE_ERROR_LOG}" ]; then
+      echo "build.sh:   --- detached server log (${SCCACHE_ERROR_LOG}) ---"
+      sed 's/^/build.sh:   sccache-srv| /' "${SCCACHE_ERROR_LOG}" 2>/dev/null || true
+    fi
+    return 1
+  fi
+  echo "build.sh: sccache probe OK (wrapped '${probe_cc}')"
+  return 0
+}
+
 # Optional shared compiler cache: sccache fronting Depot Cache (WebDAV). Enabled only when
-# USE_CACHE is true AND sccache + a cache token are present, so it stays inert before the
-# DEPOT_TOKEN secret is configured and on fork PRs (secrets hidden) — those just compile
-# normally. sccache is content-addressed, so a cache hit is bit-identical to a fresh -O3
-# compile (release-safe), and it degrades to direct compilation if the cache is unreachable.
+# USE_CACHE is true AND sccache + a cache token are present AND the probe confirms sccache can
+# wrap the compiler — so it stays inert before the DEPOT_TOKEN secret is configured, on fork PRs
+# (secrets hidden), and when sccache would crash; all of those just compile normally. sccache is
+# content-addressed, so a cache hit is bit-identical to a fresh -O3 compile (release-safe), and
+# it degrades to direct compilation if the cache is unreachable.
 LAUNCH=""
 if [ "${USE_CACHE:-true}" = "true" ] && command -v sccache >/dev/null 2>&1 \
-   && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE_GHA_ENABLED:-}" ]; then
+   && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE_GHA_ENABLED:-}" ] \
+   && sccache_can_wrap_compiler; then
   LAUNCH="-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache"
   echo "build.sh: sccache ON (endpoint=${SCCACHE_WEBDAV_ENDPOINT:-default}), building with -j${JOBS}"
 else
@@ -53,6 +100,8 @@ fi
 cmake -Bbuild $LAUNCH $@ || exit 1
 cmake --build build --config Release -j"${JOBS}" || exit 1
 
-if command -v sccache >/dev/null 2>&1; then
+# Only query stats when sccache was actually used as the launcher; if the probe rejected a
+# crashing sccache, re-invoking it here would just repeat the crash output (harmless but noisy).
+if [ -n "$LAUNCH" ] && command -v sccache >/dev/null 2>&1; then
   sccache --show-stats || true
 fi
diff --git a/.github/build_cuda_linux.sh b/.github/build_cuda_linux.sh
index d9acbbf2..bf9bc560 100755
--- a/.github/build_cuda_linux.sh
+++ b/.github/build_cuda_linux.sh
@@ -15,4 +15,26 @@ sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute
 
 sudo dnf install -y cuda-toolkit-13-2
 
-exec .github/build.sh $@ -DGGML_CUDA=1 -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.2/bin/nvcc
+# CUDA target architectures — build-speed knob.
+#
+# Default (CUDA_FAST_BUILD unset or unmatched below, e.g. 0): CMAKE_CUDA_ARCHITECTURES is NOT
+# passed, so ggml/llama.cpp compiles its full default arch set — what release artifacts must ship
+# (every supported GPU generation). It is the slow part of this ~70 min job: nvcc recompiles each .cu
+# kernel once per architecture. sccache caches the gcc C/C++ TUs but NOT the nvcc .cu kernels
+# (sccache's nvcc support is limited/experimental), so the per-arch nvcc passes dominate even
+# with the cache on — which is why this knob exists as the real CUDA build-time lever.
+#
+# Dev fast build (CUDA_FAST_BUILD=1): compile for a SINGLE architecture instead of the full
+# set, removing most of the nvcc time. Defaults to `native` (the build machine's own GPU —
+# needs a GPU present at configure time); override with CUDA_ARCH, e.g. CUDA_ARCH=90. Never
+# ship an artifact built this way: it runs on a single GPU generation. CI (publish.yml) sets it
+# only for non-publishing validation runs. (Direct-cmake equivalent: -DCMAKE_CUDA_ARCHITECTURES=native.)
+CUDA_ARCH_ARGS=""
+case "${CUDA_FAST_BUILD:-}" in
+  1 | true | TRUE | yes | on)
+    CUDA_ARCH_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH:-native}"
+    echo "build_cuda_linux.sh: CUDA_FAST_BUILD set -> ${CUDA_ARCH_ARGS} (DEV ONLY — not release-distributable)"
+    ;;
+esac
+
+exec .github/build.sh $@ -DGGML_CUDA=1 -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.2/bin/nvcc $CUDA_ARCH_ARGS
diff --git a/.github/build_opencl_android.sh b/.github/build_opencl_android.sh
index 33053f4a..efa3789c 100755
--- a/.github/build_opencl_android.sh
+++ b/.github/build_opencl_android.sh
@@ -42,11 +42,11 @@ if [ ! -f "$LOADER_BUILD/libOpenCL.so" ]; then
     cmake --build "$LOADER_BUILD" --config Release -j"$(nproc)"
 fi
 
-mkdir -p build
-# Match .github/build.sh: pass $@ unquoted so the CI's single-string
+# Delegate the jllama cmake configure + build to build.sh so it inherits the
+# sccache probe, Depot cache launcher, and --show-stats output automatically —
+# same as build_cuda_linux.sh. Pass $@ unquoted so the CI's single-string
 # argument is word-split into individual -D flags for cmake.
-cmake -Bbuild \
+exec .github/build.sh \
     -DOpenCL_INCLUDE_DIR="$HEADERS_DIR" \
     -DOpenCL_LIBRARY="$LOADER_BUILD/libOpenCL.so" \
-    $@ || exit 1
-cmake --build build --config Release -j"$(nproc)" || exit 1
+    $@
diff --git a/.github/dockcross/dockcross-android-arm b/.github/dockcross/dockcross-android-arm
index eb90d8a5..70e1466e 100755
--- a/.github/dockcross/dockcross-android-arm
+++ b/.github/dockcross/dockcross-android-arm
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-DEFAULT_DOCKCROSS_IMAGE=dockcross/android-arm:20260312-9b3357c
+DEFAULT_DOCKCROSS_IMAGE=dockcross/android-arm:20260515-5fd14ac
 
 #------------------------------------------------------------------------------
 # Helpers
diff --git a/.github/dockcross/dockcross-android-arm64 b/.github/dockcross/dockcross-android-arm64
index 7cc130dd..6ba9ecdb 100755
--- a/.github/dockcross/dockcross-android-arm64
+++ b/.github/dockcross/dockcross-android-arm64
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-DEFAULT_DOCKCROSS_IMAGE=dockcross/android-arm64:20260312-9b3357c
+DEFAULT_DOCKCROSS_IMAGE=dockcross/android-arm64:20260515-5fd14ac
 
 #------------------------------------------------------------------------------
 # Helpers
diff --git a/.github/dockcross/dockcross-linux-arm64-lts b/.github/dockcross/dockcross-linux-arm64-lts
index 0658411f..49c467c0 100755
--- a/.github/dockcross/dockcross-linux-arm64-lts
+++ b/.github/dockcross/dockcross-linux-arm64-lts
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-DEFAULT_DOCKCROSS_IMAGE=dockcross/linux-arm64-lts:20260313-9b3357c
+DEFAULT_DOCKCROSS_IMAGE=dockcross/linux-arm64-lts:20260515-5fd14ac
 
 #------------------------------------------------------------------------------
 # Helpers
diff --git a/.github/dockcross/dockcross-manylinux2014-x64 b/.github/dockcross/dockcross-manylinux2014-x64
index 75a37ffe..a3aea0f7 100755
--- a/.github/dockcross/dockcross-manylinux2014-x64
+++ b/.github/dockcross/dockcross-manylinux2014-x64
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-DEFAULT_DOCKCROSS_IMAGE=dockcross/manylinux2014-x64:20260312-9b3357c
+DEFAULT_DOCKCROSS_IMAGE=dockcross/manylinux2014-x64:20260515-5fd14ac
 
 #------------------------------------------------------------------------------
 # Helpers
diff --git a/.github/dockcross/dockcross-manylinux_2_28-x64 b/.github/dockcross/dockcross-manylinux_2_28-x64
index 15d4937e..39f4f9db 100755
--- a/.github/dockcross/dockcross-manylinux_2_28-x64
+++ b/.github/dockcross/dockcross-manylinux_2_28-x64
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-DEFAULT_DOCKCROSS_IMAGE=dockcross/manylinux_2_28-x64:20260312-9b3357c
+DEFAULT_DOCKCROSS_IMAGE=dockcross/manylinux_2_28-x64:20260515-5fd14ac
 
 #------------------------------------------------------------------------------
 # Helpers
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 18f566ab..18e15ca3 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -171,6 +171,30 @@ jobs:
     name: Cross-Compile manylinux_2_28 x86_64 (CUDA)
     needs: [startgate, build-webui]
     runs-on: ubuntu-latest
+    # Phase 2 dockcross cache rollout — job 2, enabled after manylinux2014 (job 1) verified green
+    # in CI with sccache v0.16.0 caching to Depot. build_cuda_linux.sh execs build.sh, so the same
+    # probe guards this job: only the gcc C/C++ TUs cache (the nvcc .cu kernels are not wrapped),
+    # still a large partial win on this ~70 min build. Diagnostics are on for its first run on the
+    # manylinux_2_28 image; drop them (and their -e passthroughs) once it is confirmed green with a
+    # cache hit (jobs 3-5 are already enabled). Inert without DEPOT_TOKEN (fork PRs) or use_cache=false.
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+      SCCACHE_LOG: debug
+      SCCACHE_ERROR_LOG: /tmp/sccache_server.log
+      RUST_BACKTRACE: full
+      # CUDA arch policy: FAST single-arch build for validation runs (PR / push / non-publish
+      # dispatch) to cut nvcc time; FULL arch set only when actually publishing to Central
+      # (publish_to_central=true) so the distributed jar runs on every GPU generation. The
+      # publish-snapshot/publish-release jobs require publish_to_central, so any artifact that
+      # reaches Central is always built with the full set. CI has no GPU, so the fast path pins a
+      # fixed CUDA_ARCH ('native' would fail at configure). '0' => full (release-safe), '1' => fast.
+      CUDA_FAST_BUILD: ${{ inputs.publish_to_central && '0' || '1' }}
+      # Newest CUDA 13.2 architecture: sm_120 (consumer Blackwell / RTX 50xx). Only used on the
+      # fast validation path; bump as newer GPU generations ship. Releases ignore it (full set).
+      CUDA_ARCH: '120'
+      DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE -e SCCACHE_LOG -e SCCACHE_ERROR_LOG -e RUST_BACKTRACE -e CUDA_FAST_BUILD -e CUDA_ARCH"
     steps:
       - uses: actions/checkout@v6
       - name: Download shared WebUI assets
@@ -200,6 +224,17 @@ jobs:
     name: Cross-Compile manylinux2014 x86_64
     needs: [startgate, build-webui]
     runs-on: ubuntu-latest
+    # Phase 2 dockcross cache rollout — job 1, VERIFIED green in CI (PR #245): sccache v0.16.0
+    # probe passed in-container (devtoolset-10 gcc), cache ON over Depot WebDAV (cold run: 275
+    # objects stored). Steady-state env below — the first-run diagnostics (SCCACHE_LOG /
+    # SCCACHE_ERROR_LOG / RUST_BACKTRACE) were dropped now that it is proven. Inert without
+    # DEPOT_TOKEN (fork PRs) or with use_cache=false; a crashing sccache still falls back to a
+    # green uncached build via the build.sh probe.
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+      DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE"
     steps:
       - uses: actions/checkout@v6
       - name: Download shared WebUI assets
@@ -229,6 +264,14 @@ jobs:
     name: Cross-Compile Linux aarch64 (LTS)
     needs: [startgate, build-webui]
     runs-on: ubuntu-latest
+    # Phase 2 dockcross cache rollout — job 3. Same steady-state env as manylinux2014 (job 1);
+    # the build.sh probe makes it safe to enable without a separate verification run. Inert
+    # without DEPOT_TOKEN (fork PRs) or use_cache=false.
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+      DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE"
     steps:
       - uses: actions/checkout@v6
       - name: Download shared WebUI assets
@@ -258,6 +301,14 @@ jobs:
     name: Cross-Compile Android aarch64
     needs: [startgate, build-webui]
     runs-on: ubuntu-latest
+    # Phase 2 dockcross cache rollout — job 4. Same steady-state env as manylinux2014 (job 1);
+    # the build.sh probe makes it safe to enable without a separate verification run. Inert
+    # without DEPOT_TOKEN (fork PRs) or use_cache=false.
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+      DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE"
     steps:
       - uses: actions/checkout@v6
       - name: Download shared WebUI assets
@@ -287,6 +338,15 @@ jobs:
     name: Cross-Compile Android aarch64 (OpenCL/Adreno)
     needs: [startgate, build-webui]
     runs-on: ubuntu-latest
+    # Phase 2 dockcross cache rollout — job 5. build_opencl_android.sh stages the OpenCL
+    # headers/loader, then delegates the jllama cmake build to build.sh (which owns the
+    # sccache probe + launcher). Same steady-state env as the other dockcross jobs. Inert
+    # without DEPOT_TOKEN (fork PRs) or use_cache=false.
+    env:
+      USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }}
+      SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev
+      SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}
+      DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE"
     steps:
       - uses: actions/checkout@v6
       - name: Download shared WebUI assets
@@ -561,22 +621,35 @@ jobs:
         with:
           name: Linux-x86_64-libraries
           path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
+      # GGUF model cache — introduced to stop re-downloading ~5 GB of test models from
+      # HuggingFace on every run (also dodges HF rate-limits). Complements the sccache compiler
+      # cache but is always ON: there is intentionally NO on/off flag for it (it uses GitHub's
+      # Actions cache, which is safe and free), whereas sccache is toggled by the `use_cache`
+      # workflow_dispatch input / USE_CACHE env. Not Depot — GB-scale blobs are usage-priced
+      # there and its file cache needs Depot-hosted runners. See CLAUDE.md.
+      - name: Cache GGUF models (GitHub Actions cache; avoids re-downloading from HuggingFace)
+        uses: actions/cache@v5
+        with:
+          path: models/
+          # GGUF is platform-independent, so ubuntu + macOS share one entry; a model missing from it
+          # is re-downloaded by its `test -f` guard. Bump the suffix when the model set / URLs change.
+          key: gguf-models-v1
       - name: Download text generation model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
+        run: test -f models/${MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
       - name: Download reranking model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
+        run: test -f models/${RERANKING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+        run: test -f models/${DRAFT_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
       - name: Download reasoning model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
+        run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: Download tool-calling model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
+        run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
       - name: Download nomic embedding model (issue #98 regression)
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME}
+        run: test -f models/${NOMIC_EMBED_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME}
       - name: Download vision model (issues #103 / #34)
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
+        run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
       - name: Download vision mmproj
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
+        run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -685,20 +758,33 @@ jobs:
         with:
           name: macos-14-libraries
           path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
+      # GGUF model cache — introduced to stop re-downloading ~5 GB of test models from
+      # HuggingFace on every run (also dodges HF rate-limits). Complements the sccache compiler
+      # cache but is always ON: there is intentionally NO on/off flag for it (it uses GitHub's
+      # Actions cache, which is safe and free), whereas sccache is toggled by the `use_cache`
+      # workflow_dispatch input / USE_CACHE env. Not Depot — GB-scale blobs are usage-priced
+      # there and its file cache needs Depot-hosted runners. See CLAUDE.md.
+      - name: Cache GGUF models (GitHub Actions cache; avoids re-downloading from HuggingFace)
+        uses: actions/cache@v5
+        with:
+          path: models/
+          # GGUF is platform-independent, so ubuntu + macOS share one entry;
+          # bump the suffix when the model set / URLs change.
+          key: gguf-models-v1
       - name: Download text generation model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
+        run: test -f models/${MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
       - name: Download reranking model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
+        run: test -f models/${RERANKING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+        run: test -f models/${DRAFT_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
       - name: Download reasoning model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
+        run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: Download tool-calling model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
+        run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
       - name: Download vision model (issues #103 / #34)
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
+        run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
       - name: Download vision mmproj
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
+        run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -752,20 +838,33 @@ jobs:
         with:
           name: macos-15-libraries
           path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
+      # GGUF model cache — introduced to stop re-downloading ~5 GB of test models from
+      # HuggingFace on every run (also dodges HF rate-limits). Complements the sccache compiler
+      # cache but is always ON: there is intentionally NO on/off flag for it (it uses GitHub's
+      # Actions cache, which is safe and free), whereas sccache is toggled by the `use_cache`
+      # workflow_dispatch input / USE_CACHE env. Not Depot — GB-scale blobs are usage-priced
+      # there and its file cache needs Depot-hosted runners. See CLAUDE.md.
+      - name: Cache GGUF models (GitHub Actions cache; avoids re-downloading from HuggingFace)
+        uses: actions/cache@v5
+        with:
+          path: models/
+          # GGUF is platform-independent, so ubuntu + macOS share one entry;
+          # bump the suffix when the model set / URLs change.
+          key: gguf-models-v1
       - name: Download text generation model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
+        run: test -f models/${MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
       - name: Download reranking model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
+        run: test -f models/${RERANKING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+        run: test -f models/${DRAFT_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
       - name: Download reasoning model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
+        run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: Download tool-calling model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
+        run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
       - name: Download vision model (issues #103 / #34)
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
+        run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
       - name: Download vision mmproj
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
+        run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
@@ -819,20 +918,33 @@ jobs:
         with:
           name: macos-15-metal-libraries
           path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/
+      # GGUF model cache — introduced to stop re-downloading ~5 GB of test models from
+      # HuggingFace on every run (also dodges HF rate-limits). Complements the sccache compiler
+      # cache but is always ON: there is intentionally NO on/off flag for it (it uses GitHub's
+      # Actions cache, which is safe and free), whereas sccache is toggled by the `use_cache`
+      # workflow_dispatch input / USE_CACHE env. Not Depot — GB-scale blobs are usage-priced
+      # there and its file cache needs Depot-hosted runners. See CLAUDE.md.
+      - name: Cache GGUF models (GitHub Actions cache; avoids re-downloading from HuggingFace)
+        uses: actions/cache@v5
+        with:
+          path: models/
+          # GGUF is platform-independent, so ubuntu + macOS share one entry;
+          # bump the suffix when the model set / URLs change.
+          key: gguf-models-v1
       - name: Download text generation model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
+        run: test -f models/${MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
       - name: Download reranking model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
+        run: test -f models/${RERANKING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME}
       - name: Download draft model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
+        run: test -f models/${DRAFT_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
       - name: Download reasoning model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
+        run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
       - name: Download tool-calling model
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
+        run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
       - name: Download vision model (issues #103 / #34)
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
+        run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
       - name: Download vision mmproj
-        run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
+        run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME}
       - name: List files in models directory
         run: ls -l models/
       - name: Validate model files
diff --git a/CLAUDE.md b/CLAUDE.md
index 7b66afea..e654df78 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -38,6 +38,45 @@ git add .github/build_cuda_linux.sh pom.xml CLAUDE.md
 git commit -m "Upgrade CUDA from 13.2 to 13.3"
 ```
 
+### Fast local CUDA builds (`CUDA_FAST_BUILD`) — single-arch speed knob
+
+The CUDA artifact must ship kernels for **every supported GPU generation**, so the default
+build — and every CI/release build — compiles the **full `CMAKE_CUDA_ARCHITECTURES` set** that
+ggml/llama.cpp selects. nvcc recompiles each `.cu` kernel once per architecture, which is the
+dominant cost of the ~70 min CUDA job. **`sccache` does not help here:** it caches the gcc
+C/C++ TUs but not the nvcc `.cu` kernels (sccache's nvcc support is limited/experimental), so
+the per-arch nvcc passes remain even with the cache on. The one reliable lever to cut that time
+is to build **fewer architectures**.
+
+`build_cuda_linux.sh` therefore honors an **opt-in** env knob — default **off** (full arch set,
+release-safe):
+
+```bash
+# Full release build (default): all archs — slow, runs on every GPU generation.
+.github/build_cuda_linux.sh "-DOS_NAME=Linux -DOS_ARCH=x86_64"
+
+# Fast local dev build: one arch only. Defaults to `native` (the build machine's own GPU;
+# needs a GPU present at configure time). Override with CUDA_ARCH=<cc>, e.g. CUDA_ARCH=90.
+CUDA_FAST_BUILD=1 .github/build_cuda_linux.sh "-DOS_NAME=Linux -DOS_ARCH=x86_64"
+CUDA_FAST_BUILD=1 CUDA_ARCH=90 .github/build_cuda_linux.sh "-DOS_NAME=Linux -DOS_ARCH=x86_64"
+# Direct-cmake equivalent: cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native
+```
+
+**Default + CI policy (release-safety is the invariant).** An artifact built with `CUDA_FAST_BUILD`
+runs on only the single GPU generation it was compiled for, so the **distributed jar must always be
+the full arch set**. The script default is **off** (full) so any *local/manual* build is
+release-safe. In CI (`publish.yml`, the `crosscompile-linux-x86_64-cuda` job) the flag is **on for
+validation runs** (PR / push / non-publish dispatch) to cut nvcc time, and **off only when actually
+publishing to Central** — it is wired as `CUDA_FAST_BUILD: ${{ inputs.publish_to_central && '0' || '1' }}`
+(`'0'`=full, `'1'`=fast). Because the `publish-snapshot`/`publish-release` jobs require
+`publish_to_central`, **every artifact that reaches Central is built with the full arch set** while
+ordinary PR/push CI stays fast. CI has no GPU, so `native` would fail at configure; the fast path
+instead pins a fixed `CUDA_ARCH` in the job env (default `120`: sm_120 / consumer Blackwell, the
+newest CUDA 13.2 arch). Both `CUDA_FAST_BUILD` and `CUDA_ARCH` are
+forwarded into the dockcross container via `DOCKCROSS_ARGS` `-e`. To cache the nvcc kernels too you
+would add `-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache` (gated behind the same probe), but sccache's nvcc
+caching is unreliable — the arch knob is the better lever and is what this repo ships.
+
 ## Android minimum API level
 
 Current Android minimum API level: **28** (Android 9.0 Pie)
@@ -197,14 +236,50 @@ stays `-O3` and is **bit-identical** to a clean build (release-safe).
 
 **Safety / transparency.** It is **inert** until `DEPOT_TOKEN` is configured and on **fork
 PRs** (secrets are hidden there) — those simply compile normally; the `Install sccache` step
-is `continue-on-error`; and `use_cache=false` forces a pristine, from-scratch build.
-
-**Rollout.** **Phase 1 (current): the 3 macOS build jobs** (slowest + OOM-prone) —
-`brew install sccache` + the env above + `BUILD_JOBS: 2`. **Phase 2 (TODO):** the dockcross
-Linux/Android/CUDA jobs (the `sccache` binary **and** `DEPOT_TOKEN` must be passed *into* the
-container), the Windows jobs (sccache supports MSVC), and the Linux-host `test-cpp` job. To
-extend a job: install `sccache`, set the two `SCCACHE_WEBDAV_*` env vars, and (for
-RAM-limited runners) `BUILD_JOBS`.
+is `continue-on-error`; and `use_cache=false` forces a pristine, from-scratch build. Crucially,
+`build.sh` runs a **probe-compile health-check** (`sccache_can_wrap_compiler`) before trusting
+sccache as the launcher: it compiles a trivial TU *through* sccache, and only sets
+`-DCMAKE_{C,CXX}_COMPILER_LAUNCHER=sccache` if that succeeds. So a sccache that is present but
+**crashes** (the in-container panic that stalled phase 2) also falls back to an uncached, green
+`-O3` build — it logs the Rust panic backtrace (and the detached server's `SCCACHE_ERROR_LOG`,
+when a job sets one) for diagnosis but never reds the build. This closes the gap the original
+absent-only guard left.
+
+**Rollout.** **Phase 1 — DONE & proven: the 3 macOS build jobs** (slowest + OOM-prone) —
+`brew install sccache` + the env above + `BUILD_JOBS: 2`. macOS build dropped **~40 min → ~6 min**
+with a warm cache. **Phase 2 — DONE: all 5 dockcross cross-compile jobs** now have the same
+steady-state env (`USE_CACHE` + `SCCACHE_WEBDAV_*` + `DOCKCROSS_ARGS`). The probe makes it safe
+to enable them all at once — any container where sccache crashes falls back to an uncached green
+build automatically. (The first attempt enabled all four at once without the probe and was
+reverted: the static-musl sccache v0.8.2 panicked in-container and redded the build. With
+v0.16.0 + the probe this is no longer a risk.) Job-by-job status:
+1. `crosscompile-linux-x86_64` (manylinux2014) — ✅ **verified green** in PR #245: sccache
+   **v0.16.0** probe passed in-container (devtoolset-10 gcc), `sccache ON` over Depot WebDAV,
+   warm cache 277/278 hits (99.64%), 1m46s build time.
+2. `crosscompile-linux-x86_64-cuda` (via `build_cuda_linux.sh`, which execs `build.sh`) —
+   🚧 **first run in progress** (diagnostics on). Only the gcc C/C++ TUs cache (134 model files
+   + ggml + httplib); the nvcc `.cu` kernels won't (limited sccache nvcc support) — still a
+   large partial win on the ~70 min full-arch job; the fast single-arch (sm_120) validation path
+   cuts nvcc time independently of sccache.
+3. `crosscompile-linux-aarch64` — ✅ **enabled** (same steady-state env; probe guards it).
+4. `crosscompile-android-aarch64` — ✅ **enabled** (same steady-state env; probe guards it).
+5. `crosscompile-android-aarch64-opencl` — ✅ **enabled**. `build_opencl_android.sh` stages the
+   OpenCL headers/loader, then delegates the jllama cmake build to `build.sh` via `exec`
+   (same pattern as `build_cuda_linux.sh`), so it inherits the probe and launcher automatically.
+
+Per-job recipe: add `env:` { `USE_CACHE`, `SCCACHE_WEBDAV_ENDPOINT`, `SCCACHE_WEBDAV_TOKEN` } and
+`DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE"` — the
+dockcross wrapper only forwards host env it is explicitly told to via `-e`. The fetched sccache
+version is the `SCCACHE_DL_VERSION` knob in `build.sh` (default **0.16.0**; overridable per-job
+to try a different build against a container that crashed another). **Windows** (`build.bat` +
+MSVC) is separate and last: use `mozilla-actions/sccache-action` / sccache's MSVC support, not
+the `build.sh` musl fetch.
+
+**Cross-repo scope.** This Depot/sccache compiler cache makes sense only for java-llama.cpp —
+it is the only sibling repo with a native (C++/JNI) build. It does not apply to the pure-Maven
+siblings; why (and why the `DEPOT_TOKEN` org secret and the README "Build cache by Depot" badge
+are kept jllama-only) is explained in the cross-repo status under "Deliberate non-parity":
+[`../workspace/crossrepostatus.md`](../workspace/crossrepostatus.md).
 
 ## Upgrading/Downgrading llama.cpp Version
 
@@ -701,6 +776,12 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson"
 
 llama.cpp is fetched via CMake FetchContent, pinned to `GIT_TAG b9682`.
 
+**GoogleTest** is a separate `BUILD_TESTING`-only FetchContent (`GIT_TAG v1.17.0`), used solely
+by the `jllama_test` C++ unit-test binary — not by the shipped library, and not coupled to the
+llama.cpp pin or the bundled nlohmann/json. There is **no constraint behind the exact tag**; it
+is just the latest stable at the time it was last touched. Bump it from time to time (nothing
+auto-tracks it), pairing the bump with a green `C++ Tests` CI run.
+
 ```
 build/_deps/llama.cpp-src/tools/server/   ← server-task.h, server-common.h, etc.
 build/_deps/llama.cpp-src/include/        ← llama.h, llama-cpp.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 89d80585..f9cb148d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -383,7 +383,10 @@ if(BUILD_TESTING)
     FetchContent_Declare(
         googletest
         GIT_REPOSITORY https://github.com/google/googletest.git
-        GIT_TAG        v1.15.2
+        # No constraint behind this exact tag — GoogleTest is only used by this repo's own
+        # C++ unit tests (jllama_test), not by the shipped library and not tied to llama.cpp.
+        # It is just "latest stable at the time"; bump it from time to time (see CLAUDE.md).
+        GIT_TAG        v1.17.0
     )
     # Keep GTest on the same CRT as the rest of the project.
     # OFF means GTest respects CMAKE_MSVC_RUNTIME_LIBRARY (static /MT here).
diff --git a/TODO.md b/TODO.md
index 0a9f1342..15233961 100644
--- a/TODO.md
+++ b/TODO.md
@@ -85,6 +85,56 @@ primary goal: agentic tool-calling with Qwen):
 - **Gemma 4 tool-calling validation.** Confirm the pinned llama.cpp (`b9682`) includes the Gemma 4
   tool-call parser fixes; if not, bump per the upgrade procedure.
 
+### Windows compiler cache (sccache) — deferred: needs Ninja; evaluate dual-artifact
+
+The two Windows native build jobs (`build-windows-x86_64`, `build-windows-x86`) are the **only
+remaining uncached** native builds — the 3 macOS jobs and all 5 dockcross jobs now cache via
+sccache + Depot. Windows is not yet wired up because of a hard CMake constraint, and the chosen
+path is to validate it carefully rather than flip the working build in place.
+
+**Why the obvious fix doesn't work.** Our cache mechanism is the CMake *compiler launcher*
+(`-DCMAKE_C_COMPILER_LAUNCHER=sccache`, set by `build.sh`). ggml has its own equivalent
+(`GGML_CCACHE` → `RULE_LAUNCH_COMPILE`). **Both are honored only by the Ninja and Makefile
+generators — the Visual Studio generator ignores them entirely.** Our Windows jobs use
+`-G "Visual Studio 18 2026" -A x64|Win32`, so just adding `mozilla-actions/sccache-action`
+caches nothing. (The CLAUDE.md "use sccache-action / MSVC support" note predates hitting this.)
+
+**Upstream evidence (llama.cpp `b9682`, `.github/workflows/release.yml`).** ggml-org ships its
+Windows artifacts with Ninja, not the VS generator:
+- `windows-cpu` (the main CPU artifact, our analogue) — **Ninja Multi-Config** + clang toolchain
+  (`cmake/x64-windows-llvm.cmake`) + ccache.
+- `windows-cuda` — **Ninja Multi-Config** + MSVC + ccache (proves Ninja Multi-Config + MSVC works
+  on the same llama.cpp + BoringSSL tree we build).
+- `windows-sycl` — Ninja; `windows-hip` — Unix Makefiles; legacy `windows` + `windows-openvino` —
+  Visual Studio 17 2022. All jobs cache via `ggml-org/ccache-action@v1.2.21`.
+- Important detail: it is **"Ninja Multi-Config"**, not plain Ninja — it keeps multi-config
+  semantics, so `cmake --build … --config Release` and our config-specific
+  `RUNTIME_OUTPUT_DIRECTORY_RELEASE` properties (`CMakeLists.txt:363-365`) behave exactly as they
+  do under the VS generator. The diff vs today is small: swap `-G`/`-A` for `-G "Ninja
+  Multi-Config"` + an MSVC env step (`vcvarsall` / `ilammy/msvc-dev-cmd`); `/MT` runtime and the
+  x64-vs-x86 arch gating are unchanged.
+
+**Chosen approach — do NOT switch the working build blindly.** Instead either (a) prove the Ninja
+Multi-Config build in a **separate/experimental job first**, or preferably (b) **ship two Windows
+artifacts in parallel — one Ninja-built, one MSVC(VS-generator)-built — so end users can test both**
+and we can compare them before committing to one. That means the Windows native build runs **twice**
+(once per generator) for a transition period; keep the MSVC/VS artifact as the trusted default and
+add the Ninja one alongside until it's proven equivalent. Only after the Ninja artifact is validated
+should we consider making it the sole Windows build (and retiring the second run).
+
+**Implementation notes for when this is picked up:**
+- Cache backend: prefer **sccache + Depot WebDAV** (consistent with the other 8 jobs — one token,
+  shared cross-branch) over upstream's ccache (GitHub per-branch cache, a second cache system).
+  sccache supports MSVC `cl.exe`; Release config emits no debug info, so the `/Zi`→`/Z7` PDB caveat
+  doesn't apply.
+- `build.bat` needs a Ninja path: pass `-G "Ninja Multi-Config"`; `-DCMAKE_BUILD_TYPE` is *not*
+  needed (multi-config keeps `--config Release`). Also add an sccache presence/probe guard mirroring
+  `build.sh` so a missing/crashing sccache falls back to a green uncached build.
+- Files to touch: `.github/workflows/publish.yml` (the two `build-windows-*` jobs — add the MSVC env
+  step, the cache action, and the second artifact), `.github/build.bat` (generator + launcher wiring).
+- Risk is bounded: a broken Ninja build shows up as a red Windows job, and publishing is gated behind
+  `publish_to_central`, so no broken artifact can reach Central/GitHub Releases.
+
 ### llama.cpp upstream feature exposure (queued, deferred by policy)
 
 These are JNI plumbing items for upstream API additions. Policy: add only after a real user request — they are mostly relevant to specific model families or specialized workflows.
diff --git a/pom.xml b/pom.xml
index 3f1f52c6..3dd10543 100644
--- a/pom.xml
+++ b/pom.xml
@@ -54,7 +54,7 @@ SPDX-License-Identifier: MIT
 		<jspecify.version>1.0.0</jspecify.version>
 		<lombok.version>1.18.46</lombok.version>
 		<errorprone.version>2.50.0</errorprone.version>
-		<nullaway.version>0.13.6</nullaway.version>
+		<nullaway.version>0.13.7</nullaway.version>
 		<checker.version>4.2.0</checker.version>
 		<jackson.version>2.22.0</jackson.version>
 		<reactor.version>3.8.6</reactor.version>
@@ -330,7 +330,7 @@ SPDX-License-Identifier: MIT
 				<plugin>
 					<groupId>org.pitest</groupId>
 					<artifactId>pitest-maven</artifactId>
-					<version>1.25.4</version>
+					<version>1.25.5</version>
 				</plugin>
 				<plugin>
 					<groupId>org.sonatype.central</groupId>
diff --git a/src/main/java/net/ladenthin/llama/server/NativeServer.java b/src/main/java/net/ladenthin/llama/server/NativeServer.java
new file mode 100644
index 00000000..024ac827
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/NativeServer.java
@@ -0,0 +1,109 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import java.util.Objects;
+import lombok.ToString;
+
+/**
+ * Scaffold for the <em>native</em> HTTP server bridge — the planned counterpart to
+ * {@link OpenAiCompatServer}.
+ *
+ * <p>{@link OpenAiCompatServer} implements the HTTP transport in Java (on the JDK's
+ * {@code com.sun.net.httpserver}) and drives the native llama.cpp server <em>core</em> over JNI. This
+ * class is instead the entry point for the upstream <em>native</em> HTTP transport that is already
+ * compiled into {@code libjllama} (llama.cpp's {@code server-http.cpp} plus its {@code cpp-httplib}
+ * backend). That native transport is the only component able to serve the embedded llama.cpp
+ * <strong>WebUI</strong> (the {@code ui.cpp}/{@code ui.h} asset table compiled in behind
+ * {@code LLAMA_UI_HAS_ASSETS}).</p>
+ *
+ * <p><strong>Status: scaffold only.</strong> The route registration that upstream performs in
+ * {@code server.cpp} (deliberately excluded from this build) is not yet wired to a JNI entry point, so
+ * {@link #start()} throws {@link UnsupportedOperationException} for now. This class only fixes the
+ * package structure and the public API shape; the native {@code startServer}/{@code stopServer}
+ * methods, their C++ implementation, the server lifecycle/threading and WebUI serving are a separate,
+ * detailed step (see {@code CLAUDE.md}, "WebUI (llama.cpp Svelte UI) embedding").</p>
+ *
+ * <p>It is {@link AutoCloseable} so that, once implemented, callers can drive it with
+ * try-with-resources exactly like {@link OpenAiCompatServer}.</p>
+ */
+@ToString // Lombok: generates toString() at compile time
+public final class NativeServer implements AutoCloseable {
+
+    /** Detail message of the exception thrown by {@link #start()} until the native route-wiring lands. */
+    static final String NOT_WIRED_MESSAGE =
+            "NativeServer is a scaffold: the upstream native HTTP routes (server-http.cpp) are "
+                    + "not yet wired to JNI. Use OpenAiCompatServer for now; the native server and "
+                    + "embedded WebUI are a planned step.";
+
+    /** Immutable server configuration (bind host, port, ...) shared with {@link OpenAiCompatServer}. */
+    private final OpenAiServerConfig config;
+
+    /**
+     * Creates a native-server bridge for the given configuration.
+     *
+     * <p>Construction performs no native work and binds no socket; it only null-checks and captures the
+     * configuration. Call {@link #start()} to launch the server (not implemented yet).</p>
+     *
+     * @param config the server configuration (host, port, ...); must not be {@code null}
+     */
+    public NativeServer(OpenAiServerConfig config) {
+        this.config = Objects.requireNonNull(config, "config"); // NullPointerException("config") on null
+    }
+
+    /**
+     * Starts the native HTTP server and begins serving the embedded WebUI.
+     *
+     * <p><strong>Not implemented yet</strong> — this is a scaffold. The native route registration and
+     * its JNI binding are a planned step, so this method always throws until then.</p>
+     *
+     * @return this server instance (for fluent / try-with-resources use), once implemented
+     * @throws UnsupportedOperationException always, until the native routes are wired to JNI
+     */
+    // Scaffold: start() intentionally always throws for now, but must stay callable (not @DoNotCall)
+    // so the real implementation and its callers/tests keep the same signature.
+    @SuppressWarnings("DoNotCallSuggester")
+    public NativeServer start() {
+        throw new UnsupportedOperationException(NOT_WIRED_MESSAGE);
+    }
+
+    /**
+     * Reports whether the native server is currently running.
+     *
+     * @return {@code false} — the scaffold never starts a server yet
+     */
+    public boolean isRunning() {
+        return false; // constant: no code path in this scaffold ever starts a server
+    }
+
+    /**
+     * Returns the host the server is configured to bind to.
+     *
+     * @return the configured bind host
+     */
+    public String getHost() {
+        return config.getHost();
+    }
+
+    /**
+     * Returns the port the server is configured to bind to.
+     *
+     * @return the configured port
+     */
+    public int getPort() {
+        return config.getPort();
+    }
+
+    /**
+     * Stops the native server if it is running.
+     *
+     * <p>No-op in the scaffold (nothing is ever started), so it is always safe to call, including from
+     * try-with-resources. Real lifecycle teardown is part of the planned native-server implementation.</p>
+     */
+    @Override
+    public void close() {
+        // Nothing is started yet, so there is nothing to release.
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java b/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java
new file mode 100644
index 00000000..7e74dec4
--- /dev/null
+++ b/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java
@@ -0,0 +1,48 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.is;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Model-free smoke test for the {@link NativeServer} scaffold: it must construct without any native
+ * work, expose its configured host/port, never report itself running, throw a clear
+ * {@link UnsupportedOperationException} from {@link NativeServer#start()} until the native routes are
+ * wired, and be a safe no-op {@link AutoCloseable}. No model and no {@code libjllama} required.
+ */
+public class NativeServerSmokeTest {
+
+    private static OpenAiServerConfig config() {
+        return OpenAiServerConfig.builder().host("127.0.0.1").port(1234).build(); // fixed values the getters must echo back
+    }
+
+    @Test
+    public void exposesConfiguredHostAndPortWithoutStarting() {
+        NativeServer server = new NativeServer(config());
+        assertThat(server.getHost(), is("127.0.0.1"));
+        assertThat(server.getPort(), is(1234));
+        assertThat(server.isRunning(), is(false));
+    }
+
+    @Test
+    public void startThrowsUntilNativeRoutesAreWired() {
+        NativeServer server = new NativeServer(config());
+        UnsupportedOperationException ex = assertThrows(UnsupportedOperationException.class, server::start);
+        assertThat(ex.getMessage(), containsString("not yet wired")); // substring of NativeServer.NOT_WIRED_MESSAGE
+        assertThat(server.isRunning(), is(false)); // the failed start must not flip the running state
+    }
+
+    @Test
+    public void closeIsSafeNoOpEvenViaTryWithResources() {
+        try (NativeServer server = new NativeServer(config())) { // close() runs implicitly when the block exits
+            assertThat(server.isRunning(), is(false));
+        }
+    }
+}