diff --git a/.github/build.sh b/.github/build.sh index 23fef1d2..dec29e86 100755 --- a/.github/build.sh +++ b/.github/build.sh @@ -21,13 +21,19 @@ fi # while macOS installs it via brew in the workflow. Best-effort and inert-safe: any failure # leaves sccache absent, so the build just proceeds uncached. The static musl binary runs in # any x86_64 Linux container (the cross-compile host is always x86_64). +# +# SCCACHE_DL_VERSION is overridable per-job, so a container that crashes one sccache build can +# try another without editing this script (the in-container panic that stalled phase 2 was on +# v0.8.2; v0.16.0 is the latest release and the default). A wrong/unavailable version just fails +# the `curl -f` and falls back to an uncached build, so bumping it can never red a build. +SCCACHE_DL_VERSION="${SCCACHE_DL_VERSION:-0.16.0}" if [ "${USE_CACHE:-true}" = "true" ] && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE_GHA_ENABLED:-}" ] \ && ! command -v sccache >/dev/null 2>&1 \ && [ "$(uname -s)" = "Linux" ] && [ "$(uname -m)" = "x86_64" ]; then - SCCACHE_REL="sccache-v0.8.2-x86_64-unknown-linux-musl" + SCCACHE_REL="sccache-v${SCCACHE_DL_VERSION}-x86_64-unknown-linux-musl" echo "build.sh: fetching ${SCCACHE_REL} (no sccache on PATH)..." if curl -fsSL --proto =https --proto-redir =https \ - "https://github.com/mozilla/sccache/releases/download/v0.8.2/${SCCACHE_REL}.tar.gz" \ + "https://github.com/mozilla/sccache/releases/download/v${SCCACHE_DL_VERSION}/${SCCACHE_REL}.tar.gz" \ -o /tmp/sccache.tgz && tar -xzf /tmp/sccache.tgz -C /tmp; then export PATH="/tmp/${SCCACHE_REL}:$PATH" echo "build.sh: sccache -> $(command -v sccache || echo 'still missing')" @@ -36,14 +42,55 @@ if [ "${USE_CACHE:-true}" = "true" ] && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE fi fi +# Health-check before trusting sccache as the compiler launcher. 
Because sccache *is* the +# launcher (cmake runs `sccache ...` for every TU), a present-but-crashing sccache +# fails every compile and reds the whole build — exactly the in-container panic that stalled +# phase 2 (the static-musl binary panicked while wrapping the cross-compiler, failing ggml.c.o). +# The probe runs the real compiler through sccache on a trivial TU; only if that succeeds is the +# launcher enabled. On any failure it logs the captured output (the Rust panic backtrace, plus +# the detached server's SCCACHE_ERROR_LOG when a job sets one) and the build runs WITHOUT the +# cache — a clean, uncached -O3 build that still goes green. This closes the gap the old +# absent-only guard left: it handled sccache *missing*, not sccache *crashing*. +sccache_can_wrap_compiler() { + probe_cc="${CC:-}" + if [ -z "$probe_cc" ]; then + for c in cc gcc clang; do + if command -v "$c" >/dev/null 2>&1; then probe_cc="$c"; break; fi + done + fi + if [ -z "$probe_cc" ]; then + echo "build.sh: sccache probe: no C compiler on PATH to probe; building uncached" + return 1 + fi + probe_dir="$(mktemp -d 2>/dev/null || echo "/tmp/sccache-probe.$$")" + mkdir -p "$probe_dir" || return 1 + printf 'int main(void){return 0;}\n' > "$probe_dir/probe.c" + probe_out="$(sccache "$probe_cc" -c "$probe_dir/probe.c" -o "$probe_dir/probe.o" 2>&1)" + probe_rc=$? + rm -rf "$probe_dir" + if [ "$probe_rc" -ne 0 ]; then + echo "build.sh: sccache probe FAILED (rc=${probe_rc}) wrapping '${probe_cc}' — building WITHOUT cache." 
+ [ -n "$probe_out" ] && printf '%s\n' "$probe_out" | sed 's/^/build.sh: sccache-probe| /' + if [ -n "${SCCACHE_ERROR_LOG:-}" ] && [ -f "${SCCACHE_ERROR_LOG}" ]; then + echo "build.sh: --- detached server log (${SCCACHE_ERROR_LOG}) ---" + sed 's/^/build.sh: sccache-srv| /' "${SCCACHE_ERROR_LOG}" 2>/dev/null || true + fi + return 1 + fi + echo "build.sh: sccache probe OK (wrapped '${probe_cc}')" + return 0 +} + # Optional shared compiler cache: sccache fronting Depot Cache (WebDAV). Enabled only when -# USE_CACHE is true AND sccache + a cache token are present, so it stays inert before the -# DEPOT_TOKEN secret is configured and on fork PRs (secrets hidden) — those just compile -# normally. sccache is content-addressed, so a cache hit is bit-identical to a fresh -O3 -# compile (release-safe), and it degrades to direct compilation if the cache is unreachable. +# USE_CACHE is true AND sccache + a cache token are present AND the probe confirms sccache can +# wrap the compiler — so it stays inert before the DEPOT_TOKEN secret is configured, on fork PRs +# (secrets hidden), and when sccache would crash; all of those just compile normally. sccache is +# content-addressed, so a cache hit is bit-identical to a fresh -O3 compile (release-safe), and +# it degrades to direct compilation if the cache is unreachable. 
LAUNCH="" if [ "${USE_CACHE:-true}" = "true" ] && command -v sccache >/dev/null 2>&1 \ - && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE_GHA_ENABLED:-}" ]; then + && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE_GHA_ENABLED:-}" ] \ + && sccache_can_wrap_compiler; then LAUNCH="-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache" echo "build.sh: sccache ON (endpoint=${SCCACHE_WEBDAV_ENDPOINT:-default}), building with -j${JOBS}" else @@ -53,6 +100,8 @@ fi cmake -Bbuild $LAUNCH $@ || exit 1 cmake --build build --config Release -j"${JOBS}" || exit 1 -if command -v sccache >/dev/null 2>&1; then +# Only query stats when sccache was actually used as the launcher; if the probe rejected a +# crashing sccache, re-invoking it here would just repeat the crash output (harmless but noisy). +if [ -n "$LAUNCH" ] && command -v sccache >/dev/null 2>&1; then sccache --show-stats || true fi diff --git a/.github/build_cuda_linux.sh b/.github/build_cuda_linux.sh index d9acbbf2..bf9bc560 100755 --- a/.github/build_cuda_linux.sh +++ b/.github/build_cuda_linux.sh @@ -15,4 +15,26 @@ sudo dnf config-manager --add-repo https://developer.download.nvidia.com/compute sudo dnf install -y cuda-toolkit-13-2 -exec .github/build.sh $@ -DGGML_CUDA=1 -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.2/bin/nvcc +# CUDA target architectures — build-speed knob. +# +# Default (CUDA_FAST_BUILD unset): we do NOT pass CMAKE_CUDA_ARCHITECTURES, so ggml/llama.cpp +# compiles its full default arch set. That is exactly what release artifacts must ship (every +# supported GPU generation) and is the slow part of this ~70 min job: nvcc recompiles each .cu +# kernel once per architecture. sccache caches the gcc C/C++ TUs but NOT the nvcc .cu kernels +# (sccache's nvcc support is limited/experimental), so the per-arch nvcc passes dominate even +# with the cache on — which is why this knob exists as the real CUDA build-time lever. 
+# +# Dev fast build (CUDA_FAST_BUILD=1): compile for a SINGLE architecture instead of the full +# set, removing most of the nvcc time. Defaults to `native` (the build machine's own GPU — +# needs a GPU present at configure time); override with CUDA_ARCH, e.g. CUDA_ARCH=90. This +# knob is never set for published/release builds, because an artifact built this way runs on a +# single GPU generation; CI sets it only for validation runs, with a pinned CUDA_ARCH. (Direct-cmake equivalent: -DCMAKE_CUDA_ARCHITECTURES=native.) +CUDA_ARCH_ARGS="" +case "${CUDA_FAST_BUILD:-}" in + 1 | true | TRUE | yes | on) + CUDA_ARCH_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH:-native}" + echo "build_cuda_linux.sh: CUDA_FAST_BUILD set -> ${CUDA_ARCH_ARGS} (DEV ONLY — not release-distributable)" + ;; +esac + +exec .github/build.sh $@ -DGGML_CUDA=1 -DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.2/bin/nvcc $CUDA_ARCH_ARGS diff --git a/.github/build_opencl_android.sh b/.github/build_opencl_android.sh index 33053f4a..efa3789c 100755 --- a/.github/build_opencl_android.sh +++ b/.github/build_opencl_android.sh @@ -42,11 +42,11 @@ if [ ! -f "$LOADER_BUILD/libOpenCL.so" ]; then cmake --build "$LOADER_BUILD" --config Release -j"$(nproc)" fi -mkdir -p build -# Match .github/build.sh: pass $@ unquoted so the CI's single-string +# Delegate the jllama cmake configure + build to build.sh so it inherits the +# sccache probe, Depot cache launcher, and --show-stats output automatically — +# same as build_cuda_linux.sh. Pass $@ unquoted so the CI's single-string # argument is word-split into individual -D flags for cmake. 
-cmake -Bbuild \ +exec .github/build.sh \ -DOpenCL_INCLUDE_DIR="$HEADERS_DIR" \ -DOpenCL_LIBRARY="$LOADER_BUILD/libOpenCL.so" \ - $@ || exit 1 -cmake --build build --config Release -j"$(nproc)" || exit 1 + $@ diff --git a/.github/dockcross/dockcross-android-arm b/.github/dockcross/dockcross-android-arm index eb90d8a5..70e1466e 100755 --- a/.github/dockcross/dockcross-android-arm +++ b/.github/dockcross/dockcross-android-arm @@ -1,6 +1,6 @@ #!/usr/bin/env bash -DEFAULT_DOCKCROSS_IMAGE=dockcross/android-arm:20260312-9b3357c +DEFAULT_DOCKCROSS_IMAGE=dockcross/android-arm:20260515-5fd14ac #------------------------------------------------------------------------------ # Helpers diff --git a/.github/dockcross/dockcross-android-arm64 b/.github/dockcross/dockcross-android-arm64 index 7cc130dd..6ba9ecdb 100755 --- a/.github/dockcross/dockcross-android-arm64 +++ b/.github/dockcross/dockcross-android-arm64 @@ -1,6 +1,6 @@ #!/usr/bin/env bash -DEFAULT_DOCKCROSS_IMAGE=dockcross/android-arm64:20260312-9b3357c +DEFAULT_DOCKCROSS_IMAGE=dockcross/android-arm64:20260515-5fd14ac #------------------------------------------------------------------------------ # Helpers diff --git a/.github/dockcross/dockcross-linux-arm64-lts b/.github/dockcross/dockcross-linux-arm64-lts index 0658411f..49c467c0 100755 --- a/.github/dockcross/dockcross-linux-arm64-lts +++ b/.github/dockcross/dockcross-linux-arm64-lts @@ -1,6 +1,6 @@ #!/usr/bin/env bash -DEFAULT_DOCKCROSS_IMAGE=dockcross/linux-arm64-lts:20260313-9b3357c +DEFAULT_DOCKCROSS_IMAGE=dockcross/linux-arm64-lts:20260515-5fd14ac #------------------------------------------------------------------------------ # Helpers diff --git a/.github/dockcross/dockcross-manylinux2014-x64 b/.github/dockcross/dockcross-manylinux2014-x64 index 75a37ffe..a3aea0f7 100755 --- a/.github/dockcross/dockcross-manylinux2014-x64 +++ b/.github/dockcross/dockcross-manylinux2014-x64 @@ -1,6 +1,6 @@ #!/usr/bin/env bash 
-DEFAULT_DOCKCROSS_IMAGE=dockcross/manylinux2014-x64:20260312-9b3357c +DEFAULT_DOCKCROSS_IMAGE=dockcross/manylinux2014-x64:20260515-5fd14ac #------------------------------------------------------------------------------ # Helpers diff --git a/.github/dockcross/dockcross-manylinux_2_28-x64 b/.github/dockcross/dockcross-manylinux_2_28-x64 index 15d4937e..39f4f9db 100755 --- a/.github/dockcross/dockcross-manylinux_2_28-x64 +++ b/.github/dockcross/dockcross-manylinux_2_28-x64 @@ -1,6 +1,6 @@ #!/usr/bin/env bash -DEFAULT_DOCKCROSS_IMAGE=dockcross/manylinux_2_28-x64:20260312-9b3357c +DEFAULT_DOCKCROSS_IMAGE=dockcross/manylinux_2_28-x64:20260515-5fd14ac #------------------------------------------------------------------------------ # Helpers diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 18f566ab..18e15ca3 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -171,6 +171,30 @@ jobs: name: Cross-Compile manylinux_2_28 x86_64 (CUDA) needs: [startgate, build-webui] runs-on: ubuntu-latest + # Phase 2 dockcross cache rollout — job 2, enabled after manylinux2014 (job 1) verified green + # in CI with sccache v0.16.0 caching to Depot. build_cuda_linux.sh execs build.sh, so the same + # probe guards this job: only the gcc C/C++ TUs cache (the nvcc .cu kernels are not wrapped), + # still a large partial win on this ~70 min build. Diagnostics are on for its first run on the + # manylinux_2_28 image; drop them (and their -e passthroughs) once it is confirmed green with a + # cache hit (jobs 3-5 are already enabled alongside it). Inert without DEPOT_TOKEN (fork PRs) or use_cache=false. 
+ env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + SCCACHE_LOG: debug + SCCACHE_ERROR_LOG: /tmp/sccache_server.log + RUST_BACKTRACE: full + # CUDA arch policy: FAST single-arch build for validation runs (PR / push / non-publish + # dispatch) to cut nvcc time; FULL arch set only when actually publishing to Central + # (publish_to_central=true) so the distributed jar runs on every GPU generation. The + # publish-snapshot/publish-release jobs require publish_to_central, so any artifact that + # reaches Central is always built with the full set. CI has no GPU, so the fast path pins a + # fixed CUDA_ARCH ('native' would fail at configure). '0' => full (release-safe), '1' => fast. + CUDA_FAST_BUILD: ${{ inputs.publish_to_central && '0' || '1' }} + # Newest CUDA 13.2 architecture: sm_120 (consumer Blackwell / RTX 50xx). Only used on the + # fast validation path; bump as newer GPU generations ship. Releases ignore it (full set). + CUDA_ARCH: '120' + DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE -e SCCACHE_LOG -e SCCACHE_ERROR_LOG -e RUST_BACKTRACE -e CUDA_FAST_BUILD -e CUDA_ARCH" steps: - uses: actions/checkout@v6 - name: Download shared WebUI assets @@ -200,6 +224,17 @@ jobs: name: Cross-Compile manylinux2014 x86_64 needs: [startgate, build-webui] runs-on: ubuntu-latest + # Phase 2 dockcross cache rollout — job 1, VERIFIED green in CI (PR #245): sccache v0.16.0 + # probe passed in-container (devtoolset-10 gcc), cache ON over Depot WebDAV (cold run: 275 + # objects stored). Steady-state env below — the first-run diagnostics (SCCACHE_LOG / + # SCCACHE_ERROR_LOG / RUST_BACKTRACE) were dropped now that it is proven. Inert without + # DEPOT_TOKEN (fork PRs) or with use_cache=false; a crashing sccache still falls back to a + # green uncached build via the build.sh probe. 
+ env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE" steps: - uses: actions/checkout@v6 - name: Download shared WebUI assets @@ -229,6 +264,14 @@ jobs: name: Cross-Compile Linux aarch64 (LTS) needs: [startgate, build-webui] runs-on: ubuntu-latest + # Phase 2 dockcross cache rollout — job 3. Same steady-state env as manylinux2014 (job 1); + # the build.sh probe makes it safe to enable without a separate verification run. Inert + # without DEPOT_TOKEN (fork PRs) or use_cache=false. + env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE" steps: - uses: actions/checkout@v6 - name: Download shared WebUI assets @@ -258,6 +301,14 @@ jobs: name: Cross-Compile Android aarch64 needs: [startgate, build-webui] runs-on: ubuntu-latest + # Phase 2 dockcross cache rollout — job 4. Same steady-state env as manylinux2014 (job 1); + # the build.sh probe makes it safe to enable without a separate verification run. Inert + # without DEPOT_TOKEN (fork PRs) or use_cache=false. + env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE" steps: - uses: actions/checkout@v6 - name: Download shared WebUI assets @@ -287,6 +338,15 @@ jobs: name: Cross-Compile Android aarch64 (OpenCL/Adreno) needs: [startgate, build-webui] runs-on: ubuntu-latest + # Phase 2 dockcross cache rollout — job 5. 
build_opencl_android.sh stages the OpenCL + # headers/loader, then delegates the jllama cmake build to build.sh (which owns the + # sccache probe + launcher). Same steady-state env as the other dockcross jobs. Inert + # without DEPOT_TOKEN (fork PRs) or use_cache=false. + env: + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} + DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE" steps: - uses: actions/checkout@v6 - name: Download shared WebUI assets @@ -561,22 +621,35 @@ jobs: with: name: Linux-x86_64-libraries path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/ + # GGUF model cache — introduced to stop re-downloading ~5 GB of test models from + # HuggingFace on every run (also dodges HF rate-limits). Complements the sccache compiler + # cache but is always ON: there is intentionally NO on/off flag for it (it is GitHub's + # free cache, safe + free), whereas the sccache cache is toggled by the `use_cache` + # workflow_dispatch input / USE_CACHE env. Not Depot — GB-scale blobs are usage-priced + # there and its file cache needs Depot-hosted runners. See CLAUDE.md. + - name: Cache GGUF models (GitHub Actions cache; avoids re-downloading from HuggingFace) + uses: actions/cache@v5 + with: + path: models/ + # GGUF is platform-independent, so ubuntu + macOS share one entry; + # bump the suffix when the model set / URLs change. 
+ key: gguf-models-v1 - name: Download text generation model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} + run: test -f models/${MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} - name: Download reranking model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} + run: test -f models/${RERANKING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} - name: Download draft model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} + run: test -f models/${DRAFT_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} - name: Download reasoning model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} + run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} + run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - name: Download nomic embedding model (issue #98 regression) - run: curl -L --proto =https --proto-redir =https 
--fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME} + run: test -f models/${NOMIC_EMBED_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME} - name: Download vision model (issues #103 / #34) - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} + run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -685,20 +758,33 @@ jobs: with: name: macos-14-libraries path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/ + # GGUF model cache — introduced to stop re-downloading ~5 GB of test models from + # HuggingFace on every run (also dodges HF rate-limits). Complements the sccache compiler + # cache but is always ON: there is intentionally NO on/off flag for it (it is GitHub's + # free cache, safe + free), whereas the sccache cache is toggled by the `use_cache` + # workflow_dispatch input / USE_CACHE env. Not Depot — GB-scale blobs are usage-priced + # there and its file cache needs Depot-hosted runners. See CLAUDE.md. 
+ - name: Cache GGUF models (GitHub Actions cache; avoids re-downloading from HuggingFace) + uses: actions/cache@v5 + with: + path: models/ + # GGUF is platform-independent, so ubuntu + macOS share one entry; + # bump the suffix when the model set / URLs change. + key: gguf-models-v1 - name: Download text generation model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} + run: test -f models/${MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} - name: Download reranking model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} + run: test -f models/${RERANKING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} - name: Download draft model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} + run: test -f models/${DRAFT_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} - name: Download reasoning model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} + run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} + run: test -f 
models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - name: Download vision model (issues #103 / #34) - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} + run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -752,20 +838,33 @@ jobs: with: name: macos-15-libraries path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/ + # GGUF model cache — introduced to stop re-downloading ~5 GB of test models from + # HuggingFace on every run (also dodges HF rate-limits). Complements the sccache compiler + # cache but is always ON: there is intentionally NO on/off flag for it (it is GitHub's + # free cache, safe + free), whereas the sccache cache is toggled by the `use_cache` + # workflow_dispatch input / USE_CACHE env. Not Depot — GB-scale blobs are usage-priced + # there and its file cache needs Depot-hosted runners. See CLAUDE.md. + - name: Cache GGUF models (GitHub Actions cache; avoids re-downloading from HuggingFace) + uses: actions/cache@v5 + with: + path: models/ + # GGUF is platform-independent, so ubuntu + macOS share one entry; + # bump the suffix when the model set / URLs change. 
+ key: gguf-models-v1 - name: Download text generation model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} + run: test -f models/${MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} - name: Download reranking model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} + run: test -f models/${RERANKING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} - name: Download draft model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} + run: test -f models/${DRAFT_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} - name: Download reasoning model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} + run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} + run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - name: Download vision model (issues #103 / #34) - run: curl -L --proto =https --proto-redir =https --fail 
--retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} + run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -819,20 +918,33 @@ jobs: with: name: macos-15-metal-libraries path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/ + # GGUF model cache — introduced to stop re-downloading ~5 GB of test models from + # HuggingFace on every run (also dodges HF rate-limits). Complements the sccache compiler + # cache but is always ON: there is intentionally NO on/off flag for it (it is GitHub's + # free cache, safe + free), whereas the sccache cache is toggled by the `use_cache` + # workflow_dispatch input / USE_CACHE env. Not Depot — GB-scale blobs are usage-priced + # there and its file cache needs Depot-hosted runners. See CLAUDE.md. + - name: Cache GGUF models (GitHub Actions cache; avoids re-downloading from HuggingFace) + uses: actions/cache@v5 + with: + path: models/ + # GGUF is platform-independent, so ubuntu + macOS share one entry; + # bump the suffix when the model set / URLs change. 
+ key: gguf-models-v1 - name: Download text generation model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} + run: test -f models/${MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} - name: Download reranking model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} + run: test -f models/${RERANKING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} - name: Download draft model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} + run: test -f models/${DRAFT_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} - name: Download reasoning model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} + run: test -f models/${REASONING_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} + run: test -f models/${TOOL_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - name: Download vision model (issues #103 / #34) - run: curl -L --proto =https --proto-redir =https --fail 
--retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} + run: test -f models/${VISION_MODEL_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj - run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + run: test -f models/${VISION_MMPROJ_NAME} || curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files diff --git a/CLAUDE.md b/CLAUDE.md index 7b66afea..e654df78 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -38,6 +38,45 @@ git add .github/build_cuda_linux.sh pom.xml CLAUDE.md git commit -m "Upgrade CUDA from 13.2 to 13.3" ``` +### Fast local CUDA builds (`CUDA_FAST_BUILD`) — single-arch speed knob + +The CUDA artifact must ship kernels for **every supported GPU generation**, so the default +build — and every build published to Central — compiles the **full `CMAKE_CUDA_ARCHITECTURES` set** that +ggml/llama.cpp selects. nvcc recompiles each `.cu` kernel once per architecture, which is the +dominant cost of the ~70 min CUDA job. **`sccache` does not help here:** it caches the gcc +C/C++ TUs but not the nvcc `.cu` kernels (sccache's nvcc support is limited/experimental), so +the per-arch nvcc passes remain even with the cache on. The one reliable lever to cut that time +is to build **fewer architectures**. + +`build_cuda_linux.sh` therefore honors an **opt-in** env knob — default **off** (full arch set, +release-safe): + +```bash +# Full release build (default): all archs — slow, runs on every GPU generation. +.github/build_cuda_linux.sh "-DOS_NAME=Linux -DOS_ARCH=x86_64" + +# Fast local dev build: one arch only. 
Defaults to `native` (the build machine's own GPU; +# needs a GPU present at configure time). Override with CUDA_ARCH=, e.g. CUDA_ARCH=90. +CUDA_FAST_BUILD=1 .github/build_cuda_linux.sh "-DOS_NAME=Linux -DOS_ARCH=x86_64" +CUDA_FAST_BUILD=1 CUDA_ARCH=90 .github/build_cuda_linux.sh "-DOS_NAME=Linux -DOS_ARCH=x86_64" +# Direct-cmake equivalent: cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native +``` + +**Default + CI policy (release-safety is the invariant).** An artifact built with `CUDA_FAST_BUILD` +runs on only the single GPU generation it was compiled for, so the **distributed jar must always be +the full arch set**. The script default is **off** (full) so any *local/manual* build is +release-safe. In CI (`publish.yml`, the `crosscompile-linux-x86_64-cuda` job) the flag is **on for +validation runs** (PR / push / non-publish dispatch) to cut nvcc time, and **off only when actually +publishing to Central** — it is wired as `CUDA_FAST_BUILD: ${{ inputs.publish_to_central && '0' || '1' }}` +(`'0'`=full, `'1'`=fast). Because the `publish-snapshot`/`publish-release` jobs require +`publish_to_central`, **every artifact that reaches Central is built with the full arch set** while +ordinary PR/push CI stays fast. CI has no GPU, so the fast path pins a fixed `CUDA_ARCH` (default +`120` — the newest CUDA 13.2 arch, sm_120 / consumer Blackwell — in the job env) — `native` +would fail at configure. Both `CUDA_FAST_BUILD` and `CUDA_ARCH` are +forwarded into the dockcross container via `DOCKCROSS_ARGS` `-e`. To cache the nvcc kernels too you +would add `-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache` (gated behind the same probe), but sccache's nvcc +caching is unreliable — the arch knob is the better lever and is what this repo ships. + ## Android minimum API level Current Android minimum API level: **28** (Android 9.0 Pie) @@ -197,14 +236,50 @@ stays `-O3` and is **bit-identical** to a clean build (release-safe). 
**Safety / transparency.** It is **inert** until `DEPOT_TOKEN` is configured and on **fork PRs** (secrets are hidden there) — those simply compile normally; the `Install sccache` step -is `continue-on-error`; and `use_cache=false` forces a pristine, from-scratch build. - -**Rollout.** **Phase 1 (current): the 3 macOS build jobs** (slowest + OOM-prone) — -`brew install sccache` + the env above + `BUILD_JOBS: 2`. **Phase 2 (TODO):** the dockcross -Linux/Android/CUDA jobs (the `sccache` binary **and** `DEPOT_TOKEN` must be passed *into* the -container), the Windows jobs (sccache supports MSVC), and the Linux-host `test-cpp` job. To -extend a job: install `sccache`, set the two `SCCACHE_WEBDAV_*` env vars, and (for -RAM-limited runners) `BUILD_JOBS`. +is `continue-on-error`; and `use_cache=false` forces a pristine, from-scratch build. Crucially, +`build.sh` runs a **probe-compile health-check** (`sccache_can_wrap_compiler`) before trusting +sccache as the launcher: it compiles a trivial TU *through* sccache, and only sets +`-DCMAKE_{C,CXX}_COMPILER_LAUNCHER=sccache` if that succeeds. So a sccache that is present but +**crashes** (the in-container panic that stalled phase 2) also falls back to an uncached, green +`-O3` build — it logs the Rust panic backtrace (and the detached server's `SCCACHE_ERROR_LOG`, +when a job sets one) for diagnosis but never reds the build. This closes the gap the original +absent-only guard left. + +**Rollout.** **Phase 1 — DONE & proven: the 3 macOS build jobs** (slowest + OOM-prone) — +`brew install sccache` + the env above + `BUILD_JOBS: 2`. macOS build dropped **~40 min → ~6 min** +with a warm cache. **Phase 2 — DONE: all 5 dockcross cross-compile jobs** now have the same +steady-state env (`USE_CACHE` + `SCCACHE_WEBDAV_*` + `DOCKCROSS_ARGS`). The probe makes it safe +to enable them all at once — any container where sccache crashes falls back to an uncached green +build automatically. 
(The first attempt enabled all four at once without the probe and was +reverted: the static-musl sccache v0.8.2 panicked in-container and redded the build. With +v0.16.0 + the probe this is no longer a risk.) Job-by-job status: +1. `crosscompile-linux-x86_64` (manylinux2014) — ✅ **verified green** in PR #245: sccache + **v0.16.0** probe passed in-container (devtoolset-10 gcc), `sccache ON` over Depot WebDAV, + warm cache 277/278 hits (99.64%), 1m46s build time. +2. `crosscompile-linux-x86_64-cuda` (via `build_cuda_linux.sh`, which execs `build.sh`) — + 🚧 **first run in progress** (diagnostics on). Only the gcc C/C++ TUs cache (134 model files + + ggml + httplib); the nvcc `.cu` kernels won't (limited sccache nvcc support) — still a + large partial win on the ~70 min full-arch job; the fast single-arch (sm_120) validation path + cuts nvcc time independently of sccache. +3. `crosscompile-linux-aarch64` — ✅ **enabled** (same steady-state env; probe guards it). +4. `crosscompile-android-aarch64` — ✅ **enabled** (same steady-state env; probe guards it). +5. `crosscompile-android-aarch64-opencl` — ✅ **enabled**. `build_opencl_android.sh` stages the + OpenCL headers/loader, then delegates the jllama cmake build to `build.sh` via `exec` + (same pattern as `build_cuda_linux.sh`), so it inherits the probe and launcher automatically. + +Per-job recipe: add `env:` { `USE_CACHE`, `SCCACHE_WEBDAV_ENDPOINT`, `SCCACHE_WEBDAV_TOKEN` } and +`DOCKCROSS_ARGS: "-e SCCACHE_WEBDAV_ENDPOINT -e SCCACHE_WEBDAV_TOKEN -e USE_CACHE"` — the +dockcross wrapper only forwards host env it is explicitly told to via `-e`. The fetched sccache +version is the `SCCACHE_DL_VERSION` knob in `build.sh` (default **0.16.0**; overridable per-job +to try a different build against a container that crashed another). **Windows** (`build.bat` + +MSVC) is separate and last: use `mozilla-actions/sccache-action` / sccache's MSVC support, not +the `build.sh` musl fetch. 
+ +**Cross-repo scope.** This Depot/sccache compiler cache makes sense only for java-llama.cpp — +it is the only sibling repo with a native (C++/JNI) build. It does not apply to the pure-Maven +siblings; why (and why the `DEPOT_TOKEN` org secret and the README "Build cache by Depot" badge +are kept jllama-only) is explained in the cross-repo status under "Deliberate non-parity": +[`../workspace/crossrepostatus.md`](../workspace/crossrepostatus.md). ## Upgrading/Downgrading llama.cpp Version @@ -701,6 +776,12 @@ ctest --test-dir build --output-on-failure -R "ResultsToJson" llama.cpp is fetched via CMake FetchContent, pinned to `GIT_TAG b9682`. +**GoogleTest** is a separate `BUILD_TESTING`-only FetchContent (`GIT_TAG v1.17.0`), used solely +by the `jllama_test` C++ unit-test binary — not by the shipped library, and not coupled to the +llama.cpp pin or the bundled nlohmann/json. There is **no constraint behind the exact tag**; it +is just the latest stable at the time it was last touched. Bump it from time to time (nothing +auto-tracks it), pairing the bump with a green `C++ Tests` CI run. + ``` build/_deps/llama.cpp-src/tools/server/ ← server-task.h, server-common.h, etc. build/_deps/llama.cpp-src/include/ ← llama.h, llama-cpp.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 89d80585..f9cb148d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -383,7 +383,10 @@ if(BUILD_TESTING) FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG v1.15.2 + # No constraint behind this exact tag — GoogleTest is only used by this repo's own + # C++ unit tests (jllama_test), not by the shipped library and not tied to llama.cpp. + # It is just "latest stable at the time"; bump it from time to time (see CLAUDE.md). + GIT_TAG v1.17.0 ) # Keep GTest on the same CRT as the rest of the project. # OFF means GTest respects CMAKE_MSVC_RUNTIME_LIBRARY (static /MT here). 
diff --git a/TODO.md b/TODO.md index 0a9f1342..15233961 100644 --- a/TODO.md +++ b/TODO.md @@ -85,6 +85,56 @@ primary goal: agentic tool-calling with Qwen): - **Gemma 4 tool-calling validation.** Confirm the pinned llama.cpp (`b9682`) includes the Gemma 4 tool-call parser fixes; if not, bump per the upgrade procedure. +### Windows compiler cache (sccache) — deferred: needs Ninja; evaluate dual-artifact + +The two Windows native build jobs (`build-windows-x86_64`, `build-windows-x86`) are the **only +remaining uncached** native builds — the 3 macOS jobs and all 5 dockcross jobs now cache via +sccache + Depot. Windows is not yet wired up because of a hard CMake constraint, and the chosen +path is to validate it carefully rather than flip the working build in place. + +**Why the obvious fix doesn't work.** Our cache mechanism is the CMake *compiler launcher* +(`-DCMAKE_C_COMPILER_LAUNCHER=sccache`, set by `build.sh`). ggml has its own equivalent +(`GGML_CCACHE` → `RULE_LAUNCH_COMPILE`). **Both are honored only by the Ninja and Makefile +generators — the Visual Studio generator ignores them entirely.** Our Windows jobs use +`-G "Visual Studio 18 2026" -A x64|Win32`, so just adding `mozilla-actions/sccache-action` +caches nothing. (The CLAUDE.md "use sccache-action / MSVC support" note predates hitting this.) + +**Upstream evidence (llama.cpp `b9682`, `.github/workflows/release.yml`).** ggml-org ships its +Windows artifacts with Ninja, not the VS generator: +- `windows-cpu` (the main CPU artifact, our analogue) — **Ninja Multi-Config** + clang toolchain + (`cmake/x64-windows-llvm.cmake`) + ccache. +- `windows-cuda` — **Ninja Multi-Config** + MSVC + ccache (proves Ninja Multi-Config + MSVC works + on the same llama.cpp + BoringSSL tree we build). +- `windows-sycl` — Ninja; `windows-hip` — Unix Makefiles; legacy `windows` + `windows-openvino` — + Visual Studio 17 2022. All jobs cache via `ggml-org/ccache-action@v1.2.21`. 
+- Important detail: it is **"Ninja Multi-Config"**, not plain Ninja — it keeps multi-config + semantics, so `cmake --build … --config Release` and our config-specific + `RUNTIME_OUTPUT_DIRECTORY_RELEASE` properties (`CMakeLists.txt:363-365`) behave exactly as they + do under the VS generator. The diff vs today is small: swap `-G`/`-A` for `-G "Ninja + Multi-Config"` + an MSVC env step (`vcvarsall` / `ilammy/msvc-dev-cmd`); `/MT` runtime and the + x64-vs-x86 arch gating are unchanged. + +**Chosen approach — do NOT switch the working build blindly.** Instead either (a) prove the Ninja +Multi-Config build in a **separate/experimental job first**, or preferably (b) **ship two Windows +artifacts in parallel — one Ninja-built, one MSVC(VS-generator)-built — so end users can test both** +and we can compare them before committing to one. That means the Windows native build runs **twice** +(once per generator) for a transition period; keep the MSVC/VS artifact as the trusted default and +add the Ninja one alongside until it's proven equivalent. Only after the Ninja artifact is validated +should we consider making it the sole Windows build (and retiring the second run). + +**Implementation notes for when this is picked up:** +- Cache backend: prefer **sccache + Depot WebDAV** (consistent with the other 8 jobs — one token, + shared cross-branch) over upstream's ccache (GitHub per-branch cache, a second cache system). + sccache supports MSVC `cl.exe`; Release config emits no debug info, so the `/Zi`→`/Z7` PDB caveat + doesn't apply. +- `build.bat` needs a Ninja path: pass `-G "Ninja Multi-Config"`; `-DCMAKE_BUILD_TYPE` is *not* + needed (multi-config keeps `--config Release`); add an sccache presence/probe guard mirroring + `build.sh` so a missing/crashing sccache falls back to a green uncached build. 
+- Files to touch: `.github/workflows/publish.yml` (the two `build-windows-*` jobs — add the MSVC env + step, the cache action, and the second artifact), `.github/build.bat` (generator + launcher wiring). +- Risk is bounded: a broken Ninja build shows up as a red Windows job, and publishing is gated behind + `publish_to_central`, so no broken artifact can reach Central/GitHub Releases. + ### llama.cpp upstream feature exposure (queued, deferred by policy) These are JNI plumbing items for upstream API additions. Policy: add only after a real user request — they are mostly relevant to specific model families or specialized workflows. diff --git a/pom.xml b/pom.xml index 3f1f52c6..3dd10543 100644 --- a/pom.xml +++ b/pom.xml @@ -54,7 +54,7 @@ SPDX-License-Identifier: MIT 1.0.0 1.18.46 2.50.0 - 0.13.6 + 0.13.7 4.2.0 2.22.0 3.8.6 @@ -330,7 +330,7 @@ SPDX-License-Identifier: MIT org.pitest pitest-maven - 1.25.4 + 1.25.5 org.sonatype.central diff --git a/src/main/java/net/ladenthin/llama/server/NativeServer.java b/src/main/java/net/ladenthin/llama/server/NativeServer.java new file mode 100644 index 00000000..024ac827 --- /dev/null +++ b/src/main/java/net/ladenthin/llama/server/NativeServer.java @@ -0,0 +1,109 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import java.util.Objects; +import lombok.ToString; + +/** + * Scaffold for the native HTTP server bridge — the planned counterpart to + * {@link OpenAiCompatServer}. + * + *

<p>{@link OpenAiCompatServer} implements the HTTP transport in Java (on the JDK's + * {@code com.sun.net.httpserver}) and drives the native llama.cpp server core over JNI. This + * class is instead the entry point for the upstream native HTTP transport that is already + * compiled into {@code libjllama} (llama.cpp's {@code server-http.cpp} plus its {@code cpp-httplib} + * backend). That native transport is the only component able to serve the embedded llama.cpp + * WebUI (the {@code ui.cpp}/{@code ui.h} asset table compiled in behind + * {@code LLAMA_UI_HAS_ASSETS}).</p>

+ * + *

<p>Status: scaffold only. The route registration that upstream performs in + * {@code server.cpp} (deliberately excluded from this build) is not yet wired to a JNI entry point, so + * {@link #start()} throws {@link UnsupportedOperationException} for now. This class only fixes the + * package structure and the public API shape; the native {@code startServer}/{@code stopServer} + * methods, their C++ implementation, the server lifecycle/threading and WebUI serving are a separate, + * detailed step (see {@code CLAUDE.md}, "WebUI (llama.cpp Svelte UI) embedding").</p>

+ * + *

<p>It is {@link AutoCloseable} so that, once implemented, callers can drive it with + * try-with-resources exactly like {@link OpenAiCompatServer}.</p>

+ */ +@ToString +public final class NativeServer implements AutoCloseable { + + /** Message thrown by {@link #start()} until the native route-wiring lands. */ + static final String NOT_WIRED_MESSAGE = + "NativeServer is a scaffold: the upstream native HTTP routes (server-http.cpp) are " + + "not yet wired to JNI. Use OpenAiCompatServer for now; the native server and " + + "embedded WebUI are a planned step."; + + /** Immutable server configuration (bind host, port, ...) shared with {@link OpenAiCompatServer}. */ + private final OpenAiServerConfig config; + + /** + * Creates a native-server bridge for the given configuration. + * + *

<p>Construction performs no native work and binds no socket; it only captures the configuration. + * Call {@link #start()} to launch the server (not implemented yet).</p>

+ * + * @param config the server configuration (host, port, ...); must not be {@code null} + */ + public NativeServer(OpenAiServerConfig config) { + this.config = Objects.requireNonNull(config, "config"); + } + + /** + * Starts the native HTTP server and begins serving the embedded WebUI. + * + *

<p>Not implemented yet — this is a scaffold. The native route registration and + * its JNI binding are a planned step, so this method always throws until then.</p>

+ * + * @return this server instance (for fluent / try-with-resources use), once implemented + * @throws UnsupportedOperationException always, until the native routes are wired to JNI + */ + // Scaffold: start() intentionally always throws for now, but must stay callable (not @DoNotCall) + // so the real implementation and its callers/tests keep the same signature. + @SuppressWarnings("DoNotCallSuggester") + public NativeServer start() { + throw new UnsupportedOperationException(NOT_WIRED_MESSAGE); + } + + /** + * Reports whether the native server is currently running. + * + * @return {@code false} — the scaffold never starts a server yet + */ + public boolean isRunning() { + return false; + } + + /** + * Returns the host the server is configured to bind to. + * + * @return the configured bind host + */ + public String getHost() { + return config.getHost(); + } + + /** + * Returns the port the server is configured to bind to. + * + * @return the configured port + */ + public int getPort() { + return config.getPort(); + } + + /** + * Stops the native server if it is running. + * + *

<p>No-op in the scaffold (nothing is ever started), so it is always safe to call, including from + * try-with-resources. Real lifecycle teardown is part of the planned native-server implementation.</p>

+ */ + @Override + public void close() { + // Nothing is started yet, so there is nothing to release. + } +} diff --git a/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java b/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java new file mode 100644 index 00000000..7e74dec4 --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/NativeServerSmokeTest.java @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.is; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.junit.jupiter.api.Test; + +/** + * Model-free smoke test for the {@link NativeServer} scaffold: it must construct without any native + * work, expose its configured host/port, never report itself running, throw a clear + * {@link UnsupportedOperationException} from {@link NativeServer#start()} until the native routes are + * wired, and be a safe no-op {@link AutoCloseable}. No model and no {@code libjllama} required. 
+ */ +public class NativeServerSmokeTest { + + private static OpenAiServerConfig config() { + return OpenAiServerConfig.builder().host("127.0.0.1").port(1234).build(); + } + + @Test + public void exposesConfiguredHostAndPortWithoutStarting() { + NativeServer server = new NativeServer(config()); + assertThat(server.getHost(), is("127.0.0.1")); + assertThat(server.getPort(), is(1234)); + assertThat(server.isRunning(), is(false)); + } + + @Test + public void startThrowsUntilNativeRoutesAreWired() { + NativeServer server = new NativeServer(config()); + UnsupportedOperationException ex = assertThrows(UnsupportedOperationException.class, server::start); + assertThat(ex.getMessage(), containsString("not yet wired")); + assertThat(server.isRunning(), is(false)); + } + + @Test + public void closeIsSafeNoOpEvenViaTryWithResources() { + try (NativeServer server = new NativeServer(config())) { + assertThat(server.isRunning(), is(false)); + } + } +}