diff --git a/.github/build.sh b/.github/build.sh
index 3d0ae778..23fef1d2 100755
--- a/.github/build.sh
+++ b/.github/build.sh
@@ -6,5 +6,57 @@
 # SPDX-License-Identifier: MIT
 
 mkdir -p build
-cmake -Bbuild $@ || exit 1
-cmake --build build --config Release -j$(nproc) || exit 1
+
+# Build parallelism. Defaults to all cores; RAM-limited CI runners (notably GitHub's
+# ~7 GB macOS arm64) export BUILD_JOBS lower (e.g. 2) so the large httplib.cpp + the 134
+# llama.cpp model TUs do not exhaust memory and get the runner OOM-killed mid-compile
+# (which surfaces as a SIGTERM / "runner received a shutdown signal", not a clean timeout).
+JOBS="${BUILD_JOBS:-}"
+if [ -z "$JOBS" ]; then
+  JOBS="$( { command -v nproc >/dev/null 2>&1 && nproc; } || sysctl -n hw.ncpu 2>/dev/null || echo 4 )"
+fi
+
+# Fetch sccache when caching is requested but the runner/container doesn't ship it — the
+# dockcross cross-compile containers (manylinux/Android) and Linux hosts have no sccache,
+# while macOS installs it via brew in the workflow. Best-effort and inert-safe: any failure
+# leaves sccache absent, so the build just proceeds uncached. The static musl binary runs in
+# any x86_64 Linux container (the cross-compile host is always x86_64).
+# NOTE(review): the tarball is not checksum-verified before its directory is put on PATH
+# (and sccache then becomes the compiler launcher) — consider pinning its SHA-256.
+if [ "${USE_CACHE:-true}" = "true" ] && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE_GHA_ENABLED:-}" ] \
+   && ! command -v sccache >/dev/null 2>&1 \
+   && [ "$(uname -s)" = "Linux" ] && [ "$(uname -m)" = "x86_64" ]; then
+  SCCACHE_REL="sccache-v0.8.2-x86_64-unknown-linux-musl"
+  echo "build.sh: fetching ${SCCACHE_REL} (no sccache on PATH)..."
+  if curl -fsSL --proto =https --proto-redir =https \
+       "https://github.com/mozilla/sccache/releases/download/v0.8.2/${SCCACHE_REL}.tar.gz" \
+       -o /tmp/sccache.tgz && tar -xzf /tmp/sccache.tgz -C /tmp; then
+    export PATH="/tmp/${SCCACHE_REL}:$PATH"
+    echo "build.sh: sccache -> $(command -v sccache || echo 'still missing')"
+  else
+    echo "build.sh: sccache fetch failed; continuing without cache"
+  fi
+fi
+
+# Optional shared compiler cache: sccache fronting Depot Cache (WebDAV). Enabled only when
+# USE_CACHE is true AND sccache + a cache token are present, so it stays inert before the
+# DEPOT_TOKEN secret is configured and on fork PRs (secrets hidden) — those just compile
+# normally. sccache is content-addressed, so a cache hit is bit-identical to a fresh -O3
+# compile (release-safe), and it degrades to direct compilation if the cache is unreachable.
+LAUNCH=""
+if [ "${USE_CACHE:-true}" = "true" ] && command -v sccache >/dev/null 2>&1 \
+   && [ -n "${SCCACHE_WEBDAV_TOKEN:-}${SCCACHE_GHA_ENABLED:-}" ]; then
+  LAUNCH="-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache"
+  echo "build.sh: sccache ON (endpoint=${SCCACHE_WEBDAV_ENDPOINT:-default}), building with -j${JOBS}"
+else
+  echo "build.sh: sccache OFF, building with -j${JOBS}"
+fi
+
+# $LAUNCH is deliberately unquoted (it is empty or two space-separated -D flags that must
+# word-split); "$@" is quoted so caller-supplied arguments containing spaces stay intact.
+cmake -Bbuild $LAUNCH "$@" || exit 1
+cmake --build build --config Release -j"${JOBS}" || exit 1
+
+if command -v sccache >/dev/null 2>&1; then
+  sccache --show-stats || true
+fi
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index a9c2b3cf..18f566ab 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -15,6 +15,10 @@ on: description: "Deploy to Maven Central (snapshot if -SNAPSHOT, release if a vX.Y.Z tag)" type: boolean default: false + use_cache: + description: "Use the shared sccache/Depot compiler cache (faster incremental builds)" + type: boolean + default: true env: JAVA_VERSION: '21' MODEL_URL:
"https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q2_K.gguf" @@ -82,12 +86,98 @@ jobs: echo "=== internal package dependency graph (jdeps, bytecode) ===" jdeps -verbose:package target/classes | grep 'net.ladenthin.llama' || true + # --------------------------------------------------------------------------- + # Build the llama.cpp WebUI ONCE, from the same pinned tag CMakeLists.txt fetches, + # and share it to every native build as the generated, platform-independent + # ui.cpp/ui.h ("webui-generated" artifact). The native builds embed it into + # libjllama (CMake's "WebUI assets" block); when this job's artifact is absent the + # build falls back to the empty-asset stub. npm runs only here, in one controlled + # job — never in the dockcross cross-compilers (which have no node) or per-platform. + # --------------------------------------------------------------------------- + build-webui: + name: Build WebUI assets (shared) + needs: startgate + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - name: Resolve pinned llama.cpp tag from CMakeLists.txt + id: tag + shell: bash + run: | + TAG=$(grep -oE 'GIT_TAG[[:space:]]+b[0-9]+' CMakeLists.txt | grep -oE 'b[0-9]+' | head -1) + if [ -z "$TAG" ]; then + echo "could not resolve llama.cpp GIT_TAG (b) from CMakeLists.txt" >&2 + exit 1 + fi + echo "tag=$TAG" >> "$GITHUB_OUTPUT" + echo "Pinned llama.cpp WebUI tag: $TAG" + - name: Checkout llama.cpp tools/ui at the pinned tag + uses: actions/checkout@v6 + with: + repository: ggml-org/llama.cpp + ref: ${{ steps.tag.outputs.tag }} + path: llamacpp-ui + sparse-checkout: tools/ui + sparse-checkout-cone-mode: true + - uses: actions/setup-node@v6 + with: + node-version: '24' + cache: npm + cache-dependency-path: llamacpp-ui/tools/ui/package-lock.json + - name: Build WebUI (Svelte/Vite) + working-directory: llamacpp-ui/tools/ui + env: + HF_UI_VERSION: ${{ steps.tag.outputs.tag }} + LLAMA_BUILD_NUMBER: ${{ steps.tag.outputs.tag }} + run: | + 
npm ci --ignore-scripts + npm run build + test -f dist/index.html + - name: Embed assets into ui.cpp / ui.h (gzip parity with upstream) + working-directory: llamacpp-ui/tools/ui + shell: bash + run: | + set -euo pipefail + # gzip every asset into dist/_gzip/ so llama-ui-embed embeds the + # compressed bytes (LLAMA_UI_GZIP parity); embed auto-detects _gzip. + ( cd dist && find . -type f -not -path './_gzip/*' | while read -r f; do + mkdir -p "_gzip/$(dirname "$f")" + gzip -9 -c "$f" > "_gzip/$f" + done ) + # llama-ui-embed is a self-contained C++17 host tool (no npm) — build + run it. + g++ -O2 -std=c++17 -o llama-ui-embed embed.cpp + mkdir -p "$GITHUB_WORKSPACE/webui-generated" + ./llama-ui-embed \ + "$GITHUB_WORKSPACE/webui-generated/ui.cpp" \ + "$GITHUB_WORKSPACE/webui-generated/ui.h" \ + dist + echo "=== generated WebUI assets ===" + ls -la "$GITHUB_WORKSPACE/webui-generated" + if grep -q LLAMA_UI_HAS_ASSETS "$GITHUB_WORKSPACE/webui-generated/ui.h"; then + echo "LLAMA_UI_HAS_ASSETS: present (real WebUI embedded)" + else + echo "ERROR: embed produced an empty asset table" >&2 + exit 1 + fi + - name: Upload WebUI artifact + uses: actions/upload-artifact@v7 + with: + name: webui-generated + path: ${{ github.workspace }}/webui-generated/ + retention-days: 1 + if-no-files-found: error + crosscompile-linux-x86_64-cuda: name: Cross-Compile manylinux_2_28 x86_64 (CUDA) - needs: startgate + needs: [startgate, build-webui] runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/webui-generated/ - name: Display CPU Info shell: bash run: | @@ -108,10 +198,15 @@ jobs: crosscompile-linux-x86_64: name: Cross-Compile manylinux2014 x86_64 - needs: startgate + needs: [startgate, build-webui] runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: 
webui-generated + path: ${{ github.workspace }}/webui-generated/ - name: Display CPU Info shell: bash run: | @@ -132,10 +227,15 @@ jobs: crosscompile-linux-aarch64: name: Cross-Compile Linux aarch64 (LTS) - needs: startgate + needs: [startgate, build-webui] runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/webui-generated/ - name: Display CPU Info shell: bash run: | @@ -156,10 +256,15 @@ jobs: crosscompile-android-aarch64: name: Cross-Compile Android aarch64 - needs: startgate + needs: [startgate, build-webui] runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/webui-generated/ - name: Display CPU Info shell: bash run: | @@ -180,10 +285,15 @@ jobs: crosscompile-android-aarch64-opencl: name: Cross-Compile Android aarch64 (OpenCL/Adreno) - needs: startgate + needs: [startgate, build-webui] runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/webui-generated/ - name: Build libraries shell: bash run: | @@ -200,10 +310,20 @@ jobs: build-macos-arm64-no-metal: name: Build and Test macOS 15 arm64 (no Metal) - needs: startgate + needs: [startgate, build-webui] runs-on: macos-15 + env: + BUILD_JOBS: 2 + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} steps: - uses: actions/checkout@v6 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/webui-generated/ - uses: actions/setup-java@v5 with: distribution: 'temurin' @@ -216,6 
+336,10 @@ jobs: echo "" echo "=== Processor Details ===" system_profiler SPHardwareDataType + - name: Install sccache (shared compiler cache) + if: env.USE_CACHE == 'true' + continue-on-error: true + run: brew install sccache - name: Build libraries shell: bash run: | @@ -231,10 +355,20 @@ jobs: build-macos-arm64-metal: name: Build and Test macOS 14 arm64 (Metal) - needs: startgate + needs: [startgate, build-webui] runs-on: macos-14 + env: + BUILD_JOBS: 2 + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} steps: - uses: actions/checkout@v6 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/webui-generated/ - uses: actions/setup-java@v5 with: distribution: 'temurin' @@ -247,6 +381,10 @@ jobs: echo "" echo "=== Processor Details ===" system_profiler SPHardwareDataType + - name: Install sccache (shared compiler cache) + if: env.USE_CACHE == 'true' + continue-on-error: true + run: brew install sccache - name: Build libraries shell: bash run: | @@ -262,10 +400,15 @@ jobs: build-windows-x86_64: name: Build and Test Windows 2025 x86_64 (VS 2026) - needs: startgate + needs: [startgate, build-webui] runs-on: windows-2025-vs2026 steps: - uses: actions/checkout@v6 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/webui-generated/ - name: Display CPU Info shell: pwsh run: | @@ -291,10 +434,15 @@ jobs: build-windows-x86: name: Build and Test Windows 2025 x86 (VS 2026) - needs: startgate + needs: [startgate, build-webui] runs-on: windows-2025-vs2026 steps: - uses: actions/checkout@v6 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/webui-generated/ - name: Display CPU Info shell: 
pwsh run: | @@ -348,10 +496,20 @@ jobs: build-macos-arm64-metal-15: name: Build and Test macOS 15 arm64 (Metal) - needs: startgate + needs: [startgate, build-webui] runs-on: macos-15 + env: + BUILD_JOBS: 2 + USE_CACHE: ${{ github.event_name != 'workflow_dispatch' || inputs.use_cache }} + SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev + SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }} steps: - uses: actions/checkout@v6 + - name: Download shared WebUI assets + uses: actions/download-artifact@v8 + with: + name: webui-generated + path: ${{ github.workspace }}/webui-generated/ - uses: actions/setup-java@v5 with: distribution: 'temurin' @@ -364,6 +522,10 @@ jobs: echo "" echo "=== Processor Details ===" system_profiler SPHardwareDataType + - name: Install sccache (shared compiler cache) + if: env.USE_CACHE == 'true' + continue-on-error: true + run: brew install sccache - name: Build libraries shell: bash run: | @@ -400,21 +562,21 @@ jobs: name: Linux-x86_64-libraries path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/ - name: Download text generation model - run: curl -L --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} - name: Download reranking model - run: curl -L --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} - name: Download draft model - run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} - name: Download reasoning model - run: curl -L --fail --retry 5 
--retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model - run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - name: Download nomic embedding model (issue #98 regression) - run: curl -L --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME} - name: Download vision model (issues #103 / #34) - run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj - run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -524,19 +686,19 @@ jobs: name: macos-14-libraries path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/ - name: Download text generation model - run: curl -L --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o 
models/${MODEL_NAME} - name: Download reranking model - run: curl -L --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} - name: Download draft model - run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} - name: Download reasoning model - run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model - run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - name: Download vision model (issues #103 / #34) - run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj - run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -591,19 +753,19 @@ jobs: name: macos-15-libraries 
path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/ - name: Download text generation model - run: curl -L --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} - name: Download reranking model - run: curl -L --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} - name: Download draft model - run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} - name: Download reasoning model - run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model - run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - name: Download vision model (issues #103 / #34) - run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj - run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs 
-o models/${VISION_MMPROJ_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -658,19 +820,19 @@ jobs: name: macos-15-metal-libraries path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/ - name: Download text generation model - run: curl -L --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${MODEL_URL} --create-dirs -o models/${MODEL_NAME} - name: Download reranking model - run: curl -L --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${RERANKING_MODEL_URL} --create-dirs -o models/${RERANKING_MODEL_NAME} - name: Download draft model - run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME} - name: Download reasoning model - run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME} - name: Download tool-calling model - run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME} - name: Download vision model (issues #103 / #34) - run: curl -L --fail --retry 5 --retry-all-errors 
${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME} - name: Download vision mmproj - run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors ${VISION_MMPROJ_URL} --create-dirs -o models/${VISION_MMPROJ_NAME} - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -728,19 +890,19 @@ jobs: name: Windows-x86_64-libraries path: ${{ github.workspace }}/src/main/resources/net/ladenthin/llama/ - name: Download text generation model - run: curl -L --fail --retry 5 --retry-all-errors $env:MODEL_URL --create-dirs -o models/$env:MODEL_NAME + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:MODEL_URL --create-dirs -o models/$env:MODEL_NAME - name: Download reranking model - run: curl -L --fail --retry 5 --retry-all-errors $env:RERANKING_MODEL_URL --create-dirs -o models/$env:RERANKING_MODEL_NAME + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:RERANKING_MODEL_URL --create-dirs -o models/$env:RERANKING_MODEL_NAME - name: Download draft model - run: curl -L --fail --retry 5 --retry-all-errors $env:DRAFT_MODEL_URL --create-dirs -o models/$env:DRAFT_MODEL_NAME + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:DRAFT_MODEL_URL --create-dirs -o models/$env:DRAFT_MODEL_NAME - name: Download reasoning model - run: curl -L --fail --retry 5 --retry-all-errors $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME - name: Download tool-calling 
model - run: curl -L --fail --retry 5 --retry-all-errors $env:TOOL_MODEL_URL --create-dirs -o models/$env:TOOL_MODEL_NAME + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:TOOL_MODEL_URL --create-dirs -o models/$env:TOOL_MODEL_NAME - name: Download vision model (issues #103 / #34) - run: curl -L --fail --retry 5 --retry-all-errors $env:VISION_MODEL_URL --create-dirs -o models/$env:VISION_MODEL_NAME + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:VISION_MODEL_URL --create-dirs -o models/$env:VISION_MODEL_NAME - name: Download vision mmproj - run: curl -L --fail --retry 5 --retry-all-errors $env:VISION_MMPROJ_URL --create-dirs -o models/$env:VISION_MMPROJ_NAME + run: curl -L --proto =https --proto-redir =https --fail --retry 5 --retry-all-errors $env:VISION_MMPROJ_URL --create-dirs -o models/$env:VISION_MMPROJ_NAME - name: List files in models directory run: ls -l models/ - name: Validate model files @@ -838,7 +1000,7 @@ jobs: # `assembly` additionally produces the fat jar-with-dependencies uber JAR # (llama--jar-with-dependencies.jar: library classes + Java runtime deps + # default-platform native libs in one drop-on-classpath JAR, runnable via its - # LlamaServer Main-Class). It lands in target/ and is uploaded in the `llama-jars` + # OpenAiCompatServer Main-Class). It lands in target/ and is uploaded in the `llama-jars` # artifact below - a CI run artifact only, not a Maven Central / GitHub-Release asset. 
run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,assembly -Dmaven.test.skip=true -Dgpg.skip=true package - name: Upload JARs diff --git a/.gitignore b/.gitignore index 7b3a5e04..d22d4433 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,11 @@ src/main/resources/**/*.dll src/main/resources/**/*.metal src/test/resources/**/*.gbnf +# Generated WebUI assets (ui.cpp/ui.h) produced once by the build-webui CI job and +# downloaded into every native build; embedded into libjllama, never committed +# (this repo commits no build outputs — same policy as the native libs above). +/webui-generated/ + **/*.etag **/*.lastModified src/main/cpp/llama.cpp/ diff --git a/CLAUDE.md b/CLAUDE.md index 85b4a1fb..7b66afea 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -121,6 +121,91 @@ At runtime the device must provide its own OpenCL ICD (`libOpenCL.so`); Qualcomm Adreno drivers do. Devices without an ICD should use the default CPU-only Android JAR. +## WebUI (llama.cpp Svelte UI) embedding + +The llama.cpp WebUI is **built once in CI and shared to every native build**, then +compiled into `libjllama` so the embedded server (`server-http.cpp`) can serve it. +This repo commits no build outputs, so the assets are produced per-pipeline, never +checked in (same policy as the native libs). + +Pipeline (`.github/workflows/publish.yml`): + +1. **`build-webui` job** (ubuntu — the *only* job that runs `npm`): resolves the + pinned `bNNNN` tag from `CMakeLists.txt`'s `GIT_TAG`, sparse-checks-out + `ggml-org/llama.cpp@bNNNN` `tools/ui`, runs the upstream Svelte build + (`npm ci --ignore-scripts && npm run build`), gzips `dist/` into `dist/_gzip/` (LLAMA_UI_GZIP + parity), builds the self-contained `llama-ui-embed` host tool (plain C++17, **no + npm**) and runs it to produce the platform-independent **`webui-generated/ui.cpp` + + `ui.h`**, uploaded as the `webui-generated` artifact. +2.
**Every native build job** (`needs: [startgate, build-webui]`) downloads that + artifact into `webui-generated/` before building. npm never runs in the dockcross + cross-compilers (which have no node) or per-platform. +3. **CMake** (the "WebUI assets" block in `CMakeLists.txt`): if + `webui-generated/ui.cpp` + `ui.h` exist, compiles `ui.cpp` in and adds its dir to + the include path — the generated `ui.h` `#define`s `LLAMA_UI_HAS_ASSETS`, which + activates `server-http.cpp`'s static-asset routes. If absent, it falls back to the + empty-asset stub `src/main/cpp/webui_stub/ui.h` (no embedded UI) so local builds — + and any job without the artifact — still build and run. + +The WebUI version **auto-follows** the pinned `GIT_TAG`: a llama.cpp version bump +needs no extra step here, `build-webui` re-reads the tag and rebuilds the matching UI. + +**Building the WebUI locally** (optional — a plain `cmake` build uses the stub and +ships no UI): +```bash +# needs node/npm + network; embed.cpp is plain C++17 (no npm) +git clone --depth 1 --branch b9682 https://github.com/ggml-org/llama.cpp /tmp/lc +( cd /tmp/lc/tools/ui && npm ci && npm run build \ + && ( cd dist && find . -type f -not -path './_gzip/*' \ + | while read -r f; do mkdir -p "_gzip/$(dirname "$f")"; gzip -9 -c "$f" > "_gzip/$f"; done ) \ + && g++ -O2 -std=c++17 -o /tmp/llama-ui-embed embed.cpp ) +mkdir -p webui-generated +/tmp/llama-ui-embed webui-generated/ui.cpp webui-generated/ui.h /tmp/lc/tools/ui/dist +cmake -B build && cmake --build build --target jllama # now embeds the real UI +``` +`webui-generated/` is git-ignored. + +## CI build cache & parallelism (sccache + Depot) + +The native build dominates CI time (134 llama.cpp model TUs + ggml + the 16.6k-line +`httplib.cpp`, all at `-O3`). Two knobs in **`.github/build.sh`** keep it fast and stop the +macOS runners OOM-ing: `BUILD_JOBS` (always honoured) and the `sccache` cache, which alone is +gated by the `use_cache` `workflow_dispatch` input (default **true**).
+ +**`BUILD_JOBS` — compile parallelism.** `build.sh` builds with `cmake --build -j${BUILD_JOBS}` +(default: all cores, via portable `nproc` → `sysctl -n hw.ncpu` → `4` detection). GitHub's +~7 GB **macOS arm64** runners OOM under full `-j` when `httplib.cpp` co-schedules with the +model TUs; the runner is then killed as **SIGTERM / exit 143** ("received a shutdown +signal"), which *looks* like a timeout but is an out-of-memory kill. The three macOS build +jobs therefore set `BUILD_JOBS: 2` to bound peak memory. + +**`sccache` → Depot Cache — shared compiler cache.** When `USE_CACHE=true` **and** `sccache` +plus a cache token are present, `build.sh` adds +`-DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache` and prints +`sccache --show-stats`. The cache lives in **Depot Cache** over sccache's **WebDAV** backend: + +- `SCCACHE_WEBDAV_ENDPOINT: https://cache.depot.dev` +- `SCCACHE_WEBDAV_TOKEN: ${{ secrets.DEPOT_TOKEN }}` — a Depot **organization** token, stored + as the repo secret **`DEPOT_TOKEN`**. + +Because `sccache` is **content-addressed** and llama.cpp is pinned (`GIT_TAG b9682`), the +~280 upstream object files are byte-identical every run, so a warm cache recompiles only the +*changed* files. Depot's cache is **shared across all branches** (unlike GitHub's +per-branch `actions/cache`), so every branch builds incrementally; a `bNNNN` version bump +naturally invalidates the upstream entries (their content changed) with no manual step. It +stays `-O3` and is **bit-identical** to a clean build (release-safe). + +**Safety / transparency.** The cache is **inert** until `DEPOT_TOKEN` is configured, and always on +**fork PRs** (secrets are hidden there) — those simply compile normally; the `Install sccache` step +is `continue-on-error`; and `use_cache=false` forces a pristine, from-scratch build. + +**Rollout.** **Phase 1 (current): the 3 macOS build jobs** (slowest + OOM-prone) — +`brew install sccache` + the env above + `BUILD_JOBS: 2`.
**Phase 2 (TODO):** the dockcross +Linux/Android/CUDA jobs (`USE_CACHE` and the `SCCACHE_WEBDAV_*` env incl. `DEPOT_TOKEN` must be passed *into* the +container; `build.sh` itself fetches a static `sccache` binary on x86_64 Linux when none is on `PATH`), the Windows jobs (sccache supports MSVC), and the Linux-host `test-cpp` job. To +extend a job: make `sccache` available, set the two `SCCACHE_WEBDAV_*` env vars, and (for +RAM-limited runners) `BUILD_JOBS`. + ## Upgrading/Downgrading llama.cpp Version To change the llama.cpp version, update the following **three** files: @@ -471,9 +556,11 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in - `LlamaIterator` / `LlamaIterable` — Streaming generation via Java `Iterator`/`Iterable`. - `LlamaLoader` — Extracts the platform-specific native library from the JAR to a temp directory, or finds it on `java.library.path`. - `OSInfo` — Detects OS and architecture for library resolution. -- **`server` package — OpenAI-compatible HTTP endpoint. NOTE: two implementations coexist on this branch pending a "best of both" consolidation (see [`TODO.md`](TODO.md)).** - - `server.OpenAiCompatServer` — built on the JDK's `com.sun.net.httpserver` (no new dependency). Serves `POST /v1/chat/completions` (streaming via SSE + non-streaming) and `GET /v1/models` by delegating to `LlamaModel.chatComplete` / `LlamaModel.streamChatCompletion`, so editors that speak the OpenAI protocol (e.g. VS Code Copilot "Custom Endpoint") can drive a local model. Streaming uses the native OAI chunk path (`requestChatCompletionStream` / `receiveChatCompletionChunk`), preserving `delta.tool_calls`. - - `server.LlamaServer` — an OpenAI-compatible HTTP server and the fat-jar `Main-Class`. `LlamaServerArgs` parses the CLI; `OaiRouter` / `OaiHttpServer` (NanoHTTPD) map `POST /v1/chat/completions`, `/v1/completions`, `/v1/embeddings` and `GET /v1/models` to the `LlamaModel.handle*` methods. NanoHTTPD is an `<optional>` dependency (bundled only in the fat jar, not inherited by library consumers).
The `server` package is a dedicated top layer in the ArchUnit `layeredArchitecture` rule (the only layer allowed to access the root `Api`). See README "OpenAI-compatible HTTP server". +- **`server` package — OpenAI-compatible HTTP endpoint (a single implementation).** + - `server.OpenAiCompatServer` — built only on the JDK's `com.sun.net.httpserver` (no new dependency), both embeddable and the fat-jar `Main-Class`. Serves `POST /v1/chat/completions` (streaming via SSE + non-streaming), `POST /v1/completions`, `POST /v1/embeddings`, `POST /v1/rerank`, `POST /infill`, `GET /v1/models` and `GET /health` (every route is also reachable without the `/v1` prefix), so editors that speak the OpenAI protocol (e.g. VS Code Copilot "Custom Endpoint", Cline, Roo Code, Continue) can drive a local model. Streaming chat uses the native OAI chunk path (`LlamaModel.streamChatCompletion` → `requestChatCompletionStream` / `receiveChatCompletionChunk` + the C++ `wrap_stream_chunk` helper), preserving `delta.tool_calls`; completions/embeddings/infill forward verbatim to the matching `LlamaModel.handle*`; rerank reshapes `handleRerank` into the OAI `results`/`data` shape. The chat mapper forwards `stream_options` and `response_format` and defaults `cache_prompt=true`; a CORS `Filter` answers `OPTIONS` preflights; `OpenAiSseFormatter.ensureUsageCachedTokens` guarantees `usage.prompt_tokens_details.cached_tokens` on the streamed usage chunk (Copilot crash fix, microsoft/vscode #273482). **Agentic tool-calling is the primary target**; a C++ guard (`test_server.cpp`) pins `tool_calls.function.arguments` as a JSON string (llama.cpp #20198). 
+ - **Alternative protocol surfaces** (pure translation over the OpenAI chat core — no second inference path; each reconstructs streamed tool calls via `ToolCallDeltaAccumulator`): **Ollama-native** (`GET /api/version`, `/api/tags`, `POST /api/show`, `/api/chat` with NDJSON streaming, `/api/generate` prompt-completion/FIM — `OllamaApiSupport`; `/api/show` advertises tools/insert/vision capabilities + context length for Copilot's Ollama provider), **Anthropic Messages** (`POST /v1/messages`, SSE event stream — `AnthropicApiSupport` + `AnthropicStreamTranslator`), and **OpenAI Responses** (`POST /v1/responses`, SSE event stream — `ResponsesApiSupport` + `ResponsesStreamTranslator`). The llama.cpp-native `GET /props` (context length + `modalities`) is served via `OpenAiSseFormatter.propsJson` for autocomplete clients that size their context from it. + - Supporting classes: `OpenAiServerConfig` (builder; optional bearer auth; binds `127.0.0.1`; `corsAllowOrigin`; `supportsVision`), `OpenAiServerCli` (testable CLI arg parser → `ModelParameters` + `OpenAiServerConfig`; flags incl. `--mmproj`/`--embedding`/`--reranking`), `OpenAiRequestMapper` (OAI chat request → `InferenceParameters`), `OpenAiSseFormatter` (SSE/models/error JSON + usage normalization), `OaiRerankSupport` (pure rerank request/response shaping), and the model-free test seam `OpenAiBackend`/`ChunkSink` + `LlamaModelBackend`. The streaming envelope is parsed by `json.ChatStreamChunkParser`. + - The `server` package is a dedicated top layer in the ArchUnit `layeredArchitecture` rule (the only layer allowed to access the root `Api`); `noInternalJdkImports` carries an explicit exception for the supported `com.sun.net.httpserver` (the exported `jdk.httpserver` module, which `module-info.java` `requires`). See README "OpenAI-compatible HTTP server". **Native layer** (`src/main/cpp/`): - `jllama.cpp` — JNI implementation bridging Java calls to llama.cpp. ~1,215 lines; 17 native methods. 
@@ -481,7 +568,7 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in - `json_helpers.hpp` — Pure JSON transformation helpers (no JNI, no llama state). Independently unit-testable. - `jni_helpers.hpp` — JNI bridge helpers (handle management + server orchestration). Includes `json_helpers.hpp`. - Uses `nlohmann/json` for JSON deserialization of parameters. -- The upstream server library (`server-context.cpp`, `server-queue.cpp`, `server-task.cpp`, `server-models.cpp`) is compiled directly into `jllama` via CMake — there is no hand-ported `server.hpp` fork. +- The upstream server library (`server-context.cpp`, `server-queue.cpp`, `server-task.cpp`, `server-models.cpp`) is compiled directly into `jllama` via CMake — there is no hand-ported `server.hpp` fork. **Phase 2:** the upstream HTTP transport (`tools/server/server-http.cpp`) and its `cpp-httplib` backend (`vendor/cpp-httplib/httplib.cpp`) are now compiled into `jllama` too, so the OpenAI-compatible server can be driven natively from JNI *inside* `libjllama` — no separate `llama-server` executable (a JNI shared library loads anywhere a JVM runs, which a standalone binary does not). `server-http.cpp` does `#include "ui.h"` (the WebUI asset table that `tools/ui`/`llama-ui` normally generates); since the Svelte WebUI is not shipped, `src/main/cpp/webui_stub/ui.h` supplies the upstream **empty-asset** interface and leaves `LLAMA_UI_HAS_ASSETS` undefined (all static-asset-serving blocks compile out). `<nlohmann/json.hpp>` already resolves via `llama-common`'s `vendor/` include dir (same nlohmann/json 3.12.0 as the FetchContent copy). No SSL: `CPPHTTPLIB_OPENSSL_SUPPORT` is left undefined (plain-HTTP; bind localhost / front with a TLS proxy). Only `server.cpp` (the standalone `main()` + route wiring) remains excluded — wiring the routes to JNI is the next step.
### Native Helper Architecture diff --git a/CMakeLists.txt b/CMakeLists.txt index d3c688e9..89d80585 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -261,7 +261,6 @@ add_library(jllama SHARED # Phase 1 refactoring: compile upstream server library units directly into jllama # server.hpp has been replaced by direct upstream includes in jllama.cpp. -# server-http.cpp and server.cpp (main) are intentionally excluded. # server-context.cpp, server-queue.cpp, server-task.cpp compile on all platforms # including Android. server-models.cpp is excluded on Android because it pulls # in subprocess.h which calls posix_spawn_*, declared but not implemented by the @@ -278,17 +277,85 @@ if(NOT ANDROID_ABI AND NOT OS_NAME MATCHES "Android") ) endif() +# Phase 2: also compile the upstream HTTP transport (server-http.cpp) and its +# cpp-httplib backend directly into jllama, so the OpenAI-compatible server can be +# driven natively from JNI — shipped inside libjllama, with no separate +# llama-server executable (a JNI .so/.dll/.dylib loads everywhere a JVM runs, +# unlike a standalone binary). Only server.cpp (the standalone main() + route +# wiring) stays excluded for now; this first step just makes the HTTP layer build +# and link. +# +# server-http.cpp does `#include "ui.h"` — the WebUI asset table that tools/ui +# normally GENERATES. The WebUI is built once in CI (the build-webui job) and +# shared to every native build as a generated, platform-independent ui.cpp/ui.h; +# the "WebUI assets" block below compiles it in when present and otherwise falls +# back to the empty-asset stub (src/main/cpp/webui_stub/ui.h). +# <nlohmann/json.hpp> already resolves via llama-common's vendor/ include dir, +# whose bundled nlohmann/json is the same 3.12.0 as our FetchContent copy, so +# adding nothing there shadows it.
+target_sources(jllama PRIVATE + ${llama.cpp_SOURCE_DIR}/tools/server/server-http.cpp + ${llama.cpp_SOURCE_DIR}/vendor/cpp-httplib/httplib.cpp +) + +# cpp-httplib is third-party: silence its warnings (matching upstream's own +# cpp-httplib target, which compiles it with -w / /w). No SSL is enabled — +# CPPHTTPLIB_OPENSSL_SUPPORT is left undefined — so the embedded server is +# plain-HTTP for now (bind to localhost or front it with a TLS proxy). +if(MSVC) + set_source_files_properties( + ${llama.cpp_SOURCE_DIR}/vendor/cpp-httplib/httplib.cpp + PROPERTIES COMPILE_FLAGS "/w") +else() + set_source_files_properties( + ${llama.cpp_SOURCE_DIR}/vendor/cpp-httplib/httplib.cpp + PROPERTIES COMPILE_FLAGS "-w") +endif() + +# MinGW needs ws2_32 explicitly; MSVC auto-links it via a #pragma in httplib.h. +if(WIN32 AND NOT MSVC) + target_link_libraries(jllama PRIVATE ws2_32) +endif() + +# WebUI assets. The llama.cpp WebUI is built once in CI by the build-webui job +# (upstream Svelte build + llama-ui-embed, run against the pinned llama.cpp tag) +# and shared to every native build as the generated, platform-independent +# ui.cpp / ui.h under webui-generated/ (git-ignored; downloaded as the +# "webui-generated" artifact — this repo commits no build outputs). When present, +# compile it in so libjllama serves the real WebUI: the generated ui.h #defines +# LLAMA_UI_HAS_ASSETS, which activates server-http.cpp's static-asset routes. When +# absent (local builds, or any job without the artifact) fall back to the empty- +# asset stub so the server still builds and runs, just without an embedded WebUI. 
+set(JLLAMA_WEBUI_GENERATED_DIR ${CMAKE_SOURCE_DIR}/webui-generated) +if(EXISTS ${JLLAMA_WEBUI_GENERATED_DIR}/ui.cpp AND EXISTS ${JLLAMA_WEBUI_GENERATED_DIR}/ui.h) + message(STATUS "WebUI: embedding generated assets from ${JLLAMA_WEBUI_GENERATED_DIR}") + target_sources(jllama PRIVATE ${JLLAMA_WEBUI_GENERATED_DIR}/ui.cpp) + target_include_directories(jllama PRIVATE ${JLLAMA_WEBUI_GENERATED_DIR}) +else() + message(STATUS "WebUI: no generated assets found; using empty-asset stub (no embedded WebUI)") + target_include_directories(jllama PRIVATE ${CMAKE_SOURCE_DIR}/src/main/cpp/webui_stub) +endif() + set_target_properties(jllama PROPERTIES POSITION_INDEPENDENT_CODE ON) target_include_directories(jllama PRIVATE src/main/cpp ${JNI_INCLUDE_DIRS} ${llama.cpp_SOURCE_DIR}/tools/mtmd ${llama.cpp_SOURCE_DIR}/tools/server) +# Note: the WebUI ui.h include dir (generated webui-generated/ or the empty stub +# src/main/cpp/webui_stub) is added by the "WebUI assets" conditional above. target_link_libraries(jllama PRIVATE llama-common mtmd llama nlohmann_json) target_compile_features(jllama PRIVATE cxx_std_11) target_compile_definitions(jllama PRIVATE SERVER_VERBOSE=$<BOOL:${LLAMA_VERBOSE}> + # cpp-httplib tuning — mirror the defines upstream's cpp-httplib target sets so + # httplib.cpp and every TU that includes httplib.h (server-http.cpp) agree on + # the inline behaviour these macros control.
+ CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH=1048576 + CPPHTTPLIB_LISTEN_BACKLOG=512 + CPPHTTPLIB_REQUEST_URI_MAX_LENGTH=32768 + CPPHTTPLIB_TCP_NODELAY=1 ) if(OS_NAME STREQUAL "Windows") diff --git a/README.md b/README.md index 413a127c..97650d5f 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,10 @@ [![Publish](https://github.com/bernardladenthin/java-llama.cpp/actions/workflows/publish.yml/badge.svg)](https://github.com/bernardladenthin/java-llama.cpp/actions/workflows/publish.yml) [![CodeQL](https://github.com/bernardladenthin/java-llama.cpp/actions/workflows/codeql.yml/badge.svg)](https://github.com/bernardladenthin/java-llama.cpp/actions/workflows/codeql.yml) +**Build cache:** +[![Build cache by Depot](https://img.shields.io/badge/build%20cache-Depot-FF5C35)](https://depot.dev) +_Shared, incremental CI compiler caching (sccache) powered by [Depot](https://depot.dev)._ + **Coverage:** [![Coverage Status](https://coveralls.io/repos/github/bernardladenthin/java-llama.cpp/badge.svg?branch=main)](https://coveralls.io/github/bernardladenthin/java-llama.cpp?branch=main) [![codecov](https://codecov.io/gh/bernardladenthin/java-llama.cpp/graph/badge.svg)](https://codecov.io/gh/bernardladenthin/java-llama.cpp) @@ -97,7 +101,7 @@ Inference of Meta's LLaMA model (and others) in pure C/C++. - **Infilling** (fill-in-the-middle) for code models. - **Tokenize / detokenize** and **JSON-schema → grammar** conversion. - **Raw JSON endpoint handlers** mirroring the upstream llama.cpp HTTP server (`/completions`, `/v1/completions`, `/embeddings`, `/infill`, `/tokenize`, `/detokenize`). -- **Runnable OpenAI-compatible HTTP server** (`LlamaServer`, the fat-jar `Main-Class`): `java -jar …-jar-with-dependencies.jar --model model.gguf --port 8080`. +- **Runnable OpenAI-compatible HTTP server** (`OpenAiCompatServer`, the fat-jar `Main-Class`, streaming SSE, zero extra dependency): `java -jar …-jar-with-dependencies.jar --model model.gguf --port 8080`. 
- **Model metadata** access (`getModelMeta()`) and **server management** (metrics, slot save/restore, runtime thread reconfiguration). - Pre-built native binaries for Linux (x86-64, aarch64), macOS (x86-64, arm64), and Windows (x86-64, x86); CUDA, Metal, and Vulkan supported via local build. @@ -434,20 +438,49 @@ Server state is exposed via `getMetrics()`, `eraseSlot(int)`, `saveSlot(int, Str ### OpenAI-compatible HTTP server -> **Note — two implementations pending consolidation.** This branch currently ships **two** -> independent OpenAI-compatible servers in `net.ladenthin.llama.server`, awaiting a "best of both" -> merge (see [TODO.md](TODO.md)). Both are documented below; they will be unified into one. - -#### Option A — `OpenAiCompatServer` (dependency-free, streaming SSE) — for VS Code Copilot and other OpenAI clients - `net.ladenthin.llama.server.OpenAiCompatServer` turns a loaded model into a local OpenAI-compatible HTTP endpoint using only the JDK's built-in `com.sun.net.httpserver` — no extra -dependency and no separate server process. It serves: +dependency and no separate server process. It is both embeddable and the fat-jar `Main-Class`. It +serves: -- `POST /v1/chat/completions` — streaming (Server-Sent Events) and non-streaming, forwarding - `messages`/`tools` verbatim. The streaming path carries `delta.tool_calls`, so agent/tool-calling - clients work. -- `GET /v1/models` — advertises the configured model id. 
+| Method & path | Backed by | +|---|---| +| `POST /v1/chat/completions` | `LlamaModel.streamChatCompletion` (streaming SSE) / `chatComplete` (blocking) | +| `POST /v1/completions` | `LlamaModel.handleCompletionsOai` | +| `POST /v1/embeddings` (requires `--embedding`) | `LlamaModel.handleEmbeddings` | +| `POST /v1/rerank` (requires `--reranking`) | `LlamaModel.handleRerank` (reshaped to `results`/`data`) | +| `POST /infill` | `LlamaModel.handleInfill` (fill-in-the-middle autocomplete) | +| `GET /v1/models` | the configured model id | +| `GET /health` | static `{"status":"ok"}` (unauthenticated) | + +Chat completions support **streaming via Server-Sent Events** and non-streaming, forwarding +`messages`/`tools` verbatim. The streaming path carries `delta.tool_calls` and (with +`stream_options.include_usage`) a trailing `usage` chunk, so **agent/tool-calling clients work** — +this is the recommended surface for VS Code Copilot agent mode, Cline, Roo Code and Continue. +`response_format` (`json_object` / `json_schema`) is forwarded for structured outputs. Completions, +embeddings, rerank and infill are non-streaming. + +Every route is also reachable **without the `/v1` prefix**, the server answers **CORS preflight** +(`OPTIONS`) and stamps `Access-Control-Allow-Origin` (so browser/webview clients work), and +`POST /infill` is the llama.cpp-native FIM endpoint for local ghost-text autocomplete plugins +(llama.vscode, Twinny, Tabby, Continue's `llama.cpp` provider). Note: GitHub Copilot's **inline** +completions cannot be served by any local endpoint — only its chat/agent surfaces — so use one of +those autocomplete plugins for ghost text. 
+ +**Alternative protocol surfaces.** For clients that don't speak OpenAI Chat Completions, the same model +is exposed through additional protocols (pure translation over the OpenAI core — no extra inference +path), all supporting tools and streaming: + +| Surface | Routes | For | +|---|---|---| +| **Ollama-native** | `GET /api/version`, `GET /api/tags`, `POST /api/show`, `POST /api/chat` (NDJSON streaming), `POST /api/generate` (prompt completion / FIM) | Copilot's built-in **Ollama** provider; Ollama-hardcoded tools | +| **Anthropic Messages** | `POST /v1/messages` (SSE event stream) | Claude-shaped clients (Claude Code); Copilot `messages` apiType | +| **OpenAI Responses** | `POST /v1/responses` (SSE event stream) | Copilot `responses` apiType; Responses-API clients | + +`/api/show` advertises the model's capabilities (`tools`, `insert`, and `vision` when `--mmproj` is set) +and context length, which Copilot's Ollama provider reads to enable agent mode. The llama.cpp-native +`GET /props` reports `default_generation_settings.n_ctx` and a `modalities` block, which autocomplete +clients such as llama.vscode read to size their context window. Embed it in your app: @@ -460,14 +493,25 @@ try (LlamaModel model = new LlamaModel(modelParams); } ``` -…or run it standalone: +…or run it standalone. 
The fat jar built by the `assembly` profile (`mvn -P assembly package`) is +runnable (its `Main-Class` is `net.ladenthin.llama.server.OpenAiCompatServer`); the plain library jar +works too via `-cp`: ```bash +# fat jar (bundles the native lib + Java deps) +java -jar target/llama-<version>-jar-with-dependencies.jar \ + --model models/Qwen3-0.6B-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers 99 + +# or the plain jar java -cp target/llama-<version>.jar net.ladenthin.llama.server.OpenAiCompatServer \ --model models/model.gguf --port 8080 --model-id local-model ``` -Verify with curl: +Run with `--help` for the full option list (`-m/--model`, `--host`, `-p/--port`, `-c/--ctx-size`, +`-ngl/--n-gpu-layers`, `-t/--threads`, `--parallel`, `--model-id`, `--api-key`, `--mmproj`, +`--embedding`, `--reranking`). + +Verify with curl (streaming chat): ```bash curl -N http://127.0.0.1:8080/v1/chat/completions \ @@ -509,37 +553,6 @@ tool calling depends on the model's own tool-calling quality. Pass `--api-key` ( `OpenAiServerConfig.apiKey(...)`) to require an `Authorization: Bearer` token; the server binds to `127.0.0.1` by default.
-#### Option B — `LlamaServer` (NanoHTTPD, fat-jar `Main-Class`) - -The fat jar built by the `assembly` profile (`mvn -P assembly package`) is runnable: its -`Main-Class` is `net.ladenthin.llama.server.LlamaServer`, a small [NanoHTTPD](https://github.com/NanoHttpd/nanohttpd) -server that loads a GGUF model in-process and serves OpenAI-compatible endpoints by forwarding each -request body to the matching `LlamaModel.handle*` method: - -```bash -java -jar target/llama-<version>-jar-with-dependencies.jar \ - --model models/Qwen3-0.6B-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers 99 -``` - -| Method & path | Backed by | -|---|---| -| `POST /v1/chat/completions` | `LlamaModel.handleChatCompletions` | -| `POST /v1/completions` | `LlamaModel.handleCompletionsOai` | -| `POST /v1/embeddings` (requires `--embedding`) | `LlamaModel.handleEmbeddings` | -| `GET /v1/models` | the configured model alias | -| `GET /health` | static `{"status":"ok"}` | - -```bash -curl http://localhost:8080/v1/chat/completions \ - -H 'Content-Type: application/json' \ - -d '{"messages":[{"role":"user","content":"Hello!"}]}' -``` - -Run with `--help` for all options (`--ctx-size`, `--threads`, `--model-alias`, …). Responses are -non-streaming (the full JSON result is returned per request). The NanoHTTPD dependency is declared -`<optional>`, so it is bundled in the fat jar but **not** inherited by projects that use this -library as a Maven dependency; running the server requires the fat jar (or adding NanoHTTPD yourself). - ### Model/Inference Configuration There are two sets of parameters you can configure, `ModelParameters` and `InferenceParameters`. Both provide builder diff --git a/TODO.md b/TODO.md index f195daa1..0a9f1342 100644 --- a/TODO.md +++ b/TODO.md @@ -13,55 +13,77 @@ cross-cutting initiative.
## Open — jllama-specific -### ⚠️ OpenAI server: TWO implementations to consolidate ("best of both") - -Two independent, Claude-generated OpenAI-compatible servers now coexist in -`net.ladenthin.llama.server` after PR #240 was merged on top of the NanoHTTPD server that landed -via #242. **This is a temporary state**; one unified implementation must be chosen. Until then both -compile and are tested side by side. - -| | **Option A — `OpenAiCompatServer`** (from PR #240) | **Option B — `LlamaServer`** (from #242) | -|---|---|---| -| HTTP layer | JDK `com.sun.net.httpserver` (the supported `jdk.httpserver` module — **no dependency**) | NanoHTTPD (`<optional>` dep, bundled only in fat jar) | -| Streaming | **Yes** — SSE with `delta.tool_calls`, heartbeats during prefill | No — blocking, full JSON per request | -| Routes | `POST /v1/chat/completions`, `GET /v1/models` | `POST /v1/chat/completions`, `/v1/completions`, `/v1/embeddings`, `GET /v1/models`, `GET /health` | -| Entry point | CLI launcher + embeddable; `OpenAiServerConfig` builder; optional bearer auth; binds `127.0.0.1` | fat-jar `Main-Class`; `LlamaServerArgs` CLI (`--host/--port/--ctx-size/--threads/…`) | -| Native path | `requestChatCompletionStream` / `receiveChatCompletionChunk` (+ `wrap_stream_chunk` C++ helper) | `LlamaModel.handle*` (blocking) | -| Tests | mapper/SSE/parser unit tests + model-free HTTP test over a socket (`ChatBackend` seam) | `OaiRouterTest`, `LlamaServerArgsTest`, `OaiHttpServerIntegrationTest` | - -**Important cross-insight:** Option B's own follow-up TODO below ("OpenAI-compatible server: token -streaming (SSE) + Java-8 HTTP layer") lists SSE as *the main functional gap* and says to **avoid** -`com.sun.net.httpserver` because it is "ArchUnit-banned".
Option A **already implements that SSE -streaming** with `com.sun.net.httpserver`, and the ban was lifted correctly: `com.sun.net.httpserver` -is a *supported, exported* JDK API (the `jdk.httpserver` module), not an internal `com.sun..` package — -the `noInternalJdkImports` ArchUnit rule now carries an explicit exception for it. So the premise that -blocked the JDK approach on Option B's side does not hold. - -**Consolidation task (separate session — a kickoff prompt accompanies this change):** go through both -implementations, take the best of each, settle on ONE server, delete the other, reconcile the -dependency (`pom.xml` NanoHTTPD + assembly), the ArchUnit `layeredArchitecture` `Server` layer, the -`spotbugs-exclude.xml` entries, `package-info.java`, the README "OpenAI-compatible HTTP server" -section, and this TODO (including the now-partly-moot SSE section below). - ### OpenAI-compatible HTTP endpoint (shipped; follow-ups open) -`net.ladenthin.llama.server.OpenAiCompatServer` exposes `POST /v1/chat/completions` (streaming via -SSE + non-streaming) and `GET /v1/models` over the JDK's built-in `com.sun.net.httpserver` (no new -dependency), so editors that speak the OpenAI protocol (e.g. VS Code Copilot "Custom Endpoint") can -drive a local model. Streaming uses the native OAI chunk path (`requestChatCompletionStream` / -`receiveChatCompletionChunk`), preserving `delta.tool_calls` for agent mode. Follow-ups, deferred -until requested: - -- **Multi-model registry.** Only one model id is advertised/served today; support several models - chosen by the request `model` field (and listed in `/v1/models`). -- **`stream_options.include_usage` passthrough** so the final streamed `usage` chunk is emitted - (needs a generic raw-param passthrough on `InferenceParameters`, or explicit mapping). -- **Additional `apiType`s.** VS Code "Custom Endpoint" also offers Anthropic `messages` and OpenAI - `responses`; only `chat-completions` is implemented. 
Also consider `/v1/completions` and - `/v1/embeddings` routes. +`net.ladenthin.llama.server.OpenAiCompatServer` is the single OpenAI-compatible server (JDK +`com.sun.net.httpserver`, no new dependency, fat-jar `Main-Class`). It exposes the OpenAI routes +`POST /v1/chat/completions` (streaming SSE + non-streaming), `/v1/completions`, `/v1/embeddings`, +`/v1/rerank`, `/infill`, `GET /v1/models`, `GET /health` and `GET /props`, **plus three alternative +protocol surfaces** — Ollama-native (`/api/version`, `/api/tags`, `/api/show`, `/api/chat`, +`/api/generate`), Anthropic Messages (`POST /v1/messages`) and OpenAI Responses (`POST /v1/responses`). +Every route is also reachable without the `/v1` prefix and sits behind a CORS filter. The CLI is parsed +by the testable `OpenAiServerCli`. (Consolidated from PR #240's JDK + streaming server and #242's +NanoHTTPD server; NanoHTTPD + its dependency deleted.) + +**IDE/agent backend hardening — DONE** (from the deep-research investigation +[`docs/feature-investigation-ide-agent-backend.md`](docs/feature-investigation-ide-agent-backend.md); +primary goal: agentic tool-calling with Qwen): + +- Agentic tool-calling verified wire-correct: C++ guard pins `tool_calls.function.arguments` as a JSON + **string** (not object) at b9682 (llama.cpp #20198), plus the existing `finish_reason:"tool_calls"` + test. +- `stream_options.include_usage` forwarded (new `InferenceParameters.withStreamOptions`) so the trailing + usage chunk is emitted, and `OpenAiSseFormatter.ensureUsageCachedTokens` guarantees + `usage.prompt_tokens_details.cached_tokens` (fixes the Copilot custom-endpoint crash, vscode #273482). +- `response_format` (`json_object`/`json_schema`) forwarded for structured outputs. +- `POST /infill` (FIM autocomplete for llama.vscode/Twinny/Tabby/Continue) → native `handleInfill`. +- `POST /v1/rerank` (RAG) → `handleRerank` reshaped to `results`/`data` (`OaiRerankSupport`). 
+- CORS preflight + `Access-Control-Allow-Origin`; bare-path (no `/v1`) aliases; `cache_prompt=true` + default; `--mmproj` (vision), `--embedding`, `--reranking` CLI flags. +- **Alternative protocol surfaces** (pure translation over the OpenAI core; tool calls reconstructed by + `ToolCallDeltaAccumulator`): **Ollama-native** (`/api/version`, `/api/tags`, `/api/show`, `/api/chat` + with NDJSON streaming, `/api/generate` prompt-completion/FIM — `OllamaApiSupport`; `/api/show` + advertises tools/insert/vision + context length); **Anthropic Messages** (`POST /v1/messages`, SSE + events — `AnthropicApiSupport` + `AnthropicStreamTranslator`); **OpenAI Responses** (`POST + /v1/responses`, SSE events — `ResponsesApiSupport` + `ResponsesStreamTranslator`). +- **`GET /props`** (llama.cpp-native): `default_generation_settings.n_ctx` + `modalities` so autocomplete + clients (llama.vscode) size their context window (`OpenAiSseFormatter.propsJson`). +- Gated **integration round-trips** over a real socket, run in CI's `test-java-linux-x86_64` job, + self-skipping when the model is absent — structural assertions only: + - `OpenAiCompatServerIntegrationTest` (Qwen3-0.6B, chat mode): OpenAI chat (non-stream/stream/tools/ + models) plus Ollama `/api/chat` + discovery, Anthropic `/v1/messages`, OpenAI `/v1/responses` + (non-stream + stream) and `/props`. + - `OpenAiServerEmbeddingsIntegrationTest` (CodeLlama-7B + `enableEmbedding`): `/v1/embeddings` (+ bare + alias). + - `OpenAiServerRerankIntegrationTest` (jina-reranker + `enableReranking`): `/v1/rerank` (sorted + `results`/`data`, `top_n` cap). + - `OpenAiServerCompletionIntegrationTest` (CodeLlama-7B): `/v1/completions`, `/infill`, and Ollama + `/api/generate` (plain + FIM via `suffix`). 
+ +**Open follow-ups (deferred):** + +- **Streaming raw-completion path (the shared blocker).** A new native streaming method + (`requestCompletionStream` alongside the existing chat one) is needed before these can be done + token-incrementally: (a) **streaming `/v1/completions`**, (b) **token-streaming `/api/generate`** + (today it computes the full text then emits one NDJSON content line), and (c) **Continue's native + `llama.cpp` provider** which streams `POST /completion` in the native (non-OAI) shape. Until then these + either run non-streaming or emit a single content chunk. JNI + C++ work; the agentic-chat goal does + not need it. +- **Incremental tool-call streaming on the alternative surfaces.** Ollama/Anthropic/Responses emit each + tool call *whole* at end-of-stream (reconstructed by `ToolCallDeltaAccumulator`) rather than streaming + argument fragments. Fine for clients that apply tool calls after generation; revisit if a client needs + incremental `input_json_delta` / `function_call_arguments.delta` fidelity. +- **Per-model FIM template registry** (Qwen/CodeLlama/DeepSeek v1&V2/StarCoder2/Codestral) — only needed + if we also expose `/v1/completions`-with-`suffix` FIM; `/infill` (and Ollama `/api/generate` with a + `suffix`) applies the model's FIM tokens server-side, so this is lower value. +- **Multi-model registry.** Only one model id is advertised/served today; serving several would need + multi-model load + lifecycle management. +- **Manual real-client validation.** Gated server-side round-trips now exist for every surface (above). + What remains is manual validation against the actual editor clients — point Copilot's Ollama provider / + a Custom Endpoint, Claude Code, and a Responses client at the running server — since a server-side + round-trip confirms the wire shapes but not each client's own parser. 
- **Gemma 4 tool-calling validation.** Confirm the pinned llama.cpp (`b9682`) includes the Gemma 4 - tool-call parser fixes (landed upstream ~Apr 2026); if not, bump per the upgrade procedure so - streamed/blocking `tool_calls` come through for Gemma 4 GGUFs. + tool-call parser fixes; if not, bump per the upgrade procedure. ### llama.cpp upstream feature exposure (queued, deferred by policy) @@ -105,40 +127,6 @@ These are JNI plumbing items for upstream API additions. Policy: add only after **Out of scope until evidence supports it**: actually implementing any of the above. This entry exists so that when someone asks "can I ship java-llama.cpp as a single 30 MB binary?" the answer points to a concrete investigation plan rather than restarting from zero. -### OpenAI-compatible server: token streaming (SSE) + Java-8 HTTP layer - -The `net.ladenthin.llama.server.LlamaServer` MVP is **non-streaming**: every request calls -the blocking `LlamaModel.handle*` method and returns the full JSON response in one shot. A -client that sends `"stream": true` still receives a single response, not the incremental -`text/event-stream` (SSE) `data: {chunk}\n\n` events the OpenAI API emits for streaming -chat/completions. This is the main functional gap of the server today. - -The token source already exists — `LlamaModel.generateChat(InferenceParameters)` / -`generate(...)` yield tokens incrementally through a Java `Iterator` (`LlamaIterable`). What -is missing is an HTTP layer that emits SSE. - -**Find a Java-8-compatible HTTP layer with good SSE support (alternative to Javalin), or -implement SSE on NanoHTTPD.** Javalin has a first-class `ctx.sse(...)` API but is **not -usable here**: Javalin 5 requires Java 11 and Javalin 6 requires Java 17, while this repo -targets Java 8; Javalin 4 (the last Java-8 release) is EOL. 
Options, in rough order of -preference: -- **Implement SSE on the existing NanoHTTPD** via `NanoHTTPD.newChunkedResponse(status, - "text/event-stream", InputStream)`, bridging a `LlamaIterable` to an `InputStream` that - writes `data: {chunk}\n\n` frames. No new dependency, stays Java-8 clean; likely the right - answer. Cost: the iterator→SSE bridge plus closing the `LlamaIterable` on client - disconnect. -- **Undertow** — Java-8 compatible, has a server-sent-events handler, but a heavier - dependency tree. -- **Spark Java** (Jetty 9) — Java-8 compatible; SSE support is limited/manual. -- Avoid: Javalin 5/6 (Java 11/17), Javalin 4 (EOL), and the JDK `com.sun.net.httpserver` - (ArchUnit-banned `com.sun..`). - -Scope when implemented: honour `"stream": true` on `POST /v1/chat/completions` and -`POST /v1/completions`, emit OpenAI-style SSE chunks terminated by `data: [DONE]`, close the -underlying `LlamaIterable` on disconnect, and keep the non-streaming path as the default. Add -a model-free routing test plus a real-socket SSE integration test (mirroring -`OaiHttpServerIntegrationTest`). - ## Open — cross-cutting (slice for this repo) - **jqwik pin policy** — see [`../workspace/policies/jqwik-prompt-injection.md`](../workspace/policies/jqwik-prompt-injection.md). `jqwik.version ≤ 1.9.3` is mandatory. 
diff --git a/docs/feature-investigation-ide-agent-backend.md b/docs/feature-investigation-ide-agent-backend.md new file mode 100644 index 00000000..af12ba58 --- /dev/null +++ b/docs/feature-investigation-ide-agent-backend.md @@ -0,0 +1,254 @@ + + +# Feature Investigation — IDE coding/agent backend (2025–2026) + +> **Implementation status (this repo).** The XS/S/M recommendations below are implemented on +> `net.ladenthin.llama.server.OpenAiCompatServer`: `POST /infill` (FIM autocomplete), `POST /v1/rerank` +> (RAG), `stream_options.include_usage` passthrough + a `cached_tokens` safety net, `response_format` +> (structured outputs), CORS/`OPTIONS` preflight, bare-path (`/v1`-less) aliases, a `cache_prompt=true` +> default, and `--mmproj`/`--embedding`/`--reranking` CLI flags. Agentic tool-calling is the primary +> target and is verified wire-correct by a C++ guard pinning `tool_calls.function.arguments` as a JSON +> string (llama.cpp #20198). Open items that need a product decision (Ollama native-API emulation, +> Anthropic `POST /v1/messages` + OpenAI `POST /v1/responses` shims, Continue's native `/completion`, +> a per-model FIM template registry, `/props` capability reporting) are tracked in +> [`../TODO.md`](../TODO.md). The verbatim deep-research report follows. + +--- + +# Making a llama.cpp-Backed Local Server a First-Class IDE Coding/Agent Backend (2025–2026) + +## TL;DR +- **The single highest-leverage change is to add a llama.cpp-native `POST /infill` endpoint (fields `input_prefix`, `input_suffix`, `input_extra`, `prompt`, `n_predict`), because every high-quality local ghost-text client (llama.vscode, llama.vim, Tabby, Twinny, and Continue's `llama.cpp` provider) drives FIM through `/infill` or a raw `/v1/completions` `suffix` template — NOT through `/v1/chat/completions`.** A chat-only server unlocks chat/agent but currently unlocks zero first-class autocomplete. 
+- **For chat + agent (Copilot BYOK, Cline, Roo Code, Continue, ProxyAI, Zed, Aider), your existing `/v1/chat/completions` is already the right surface — but the make-or-break details are: streamed `delta.tool_calls` with correct `index`/`id`/`function.name`/`function.arguments` fragments, `finish_reason:"tool_calls"` on the terminating chunk, a `stream_options.include_usage` final usage chunk with an empty `choices` array, and never emitting `tool_calls.function.arguments` as a JSON object (it must be a JSON-encoded string).** Copilot's VS Code custom-endpoint feature also reads `usage.prompt_tokens_details.cached_tokens` and crashes if it is absent. +- **As of VS Code 1.122 (released May 28, 2026), the generic OpenAI-compatible "Custom Endpoint" provider (apiTypes `chat-completions` / `responses` / `messages`) is now in VS Code Stable — so a plain OpenAI-compatible server is now a first-class Copilot chat/agent backend without Insiders or Ollama emulation.** Copilot inline completions, however, remain closed to all local endpoints ("Inline suggestions and next edit suggestions still require a GitHub sign-in. BYOK powers chat, tools, and MCP servers only"). + +## Key Findings + +1. **Two protocol families, not one.** Autocomplete/FIM and chat/agent are almost entirely disjoint wire contracts. Chat/agent is OpenAI `/v1/chat/completions` (or Anthropic `/v1/messages`, or OpenAI `/v1/responses`). Autocomplete is either llama.cpp `/infill`, Ollama `/api/generate`, or raw `/v1/completions` with a `suffix` field and a model-specific FIM template. Your server implements the chat side well and the FIM side not at all. + +2. **Copilot inline completion is closed to local models.** Per the VS Code 1.122 release notes, "Inline suggestions and next edit suggestions (NES) still require a GitHub sign-in. BYOK powers chat, tools, and MCP servers only." VS Code's language-models docs add: "Currently, you cannot connect to a local model for inline suggestions. 
VS Code provides an extension API `InlineCompletionItemProvider` that enables extensions to contribute a custom completion provider." So no llama.cpp server can power Copilot ghost text — you can only target Copilot's **chat + agent** surfaces (or ship your own inline-completion VS Code extension). + +3. **Copilot's OpenAI-compatible path went Stable in May 2026.** VS Code 1.122 (May 28, 2026) notes: "The Custom Endpoint provider lets you connect models that implement Chat Completions, Responses, or Messages APIs… The Custom Endpoint provider is now available in VS Code Stable." This supersedes the earlier Insiders-only status and reduces the urgency of emulating Ollama's native API. The built-in **Ollama** provider (native `/api/version`, `/api/tags`, `/api/show`) and the deprecated `github.copilot.chat.customOAIModels` settings object remain as alternative paths. BYOK "now works without GitHub sign-in… in air-gapped or restricted environments" (GitHub Changelog, Apr 22, 2026), though model selection in the UI generally still prompts a GitHub login. + +4. **Cline and Roo Code diverge on tool-calling.** Roo Code forces native OpenAI tool calling: per the Roo Code blog ("Sorry we didn't listen sooner: Native Tool Calling"), "In 3.36.0 we introduced native tool calling… In 3.37.0 we made native tool calling the default and removed XML tool calling entirely." If your endpoint doesn't fully implement `tools`/`tool_calls`, Roo (≥3.37) cannot be used unless the user rolls back to 3.36.16 and selects XML in advanced settings. Cline historically inlines XML-style tool instructions into the prompt and parses tool calls out of plain text, so it is tolerant of weak native tool support. This is a critical compatibility fork. + +5. 
**Real-world SSE bugs cluster around three things:** the trailing usage chunk (`stream_options.include_usage`), the `finish_reason` after streamed tool calls (must be `"tool_calls"`, not `"stop"`), and Copilot's hard dependency on `usage.prompt_tokens_details.cached_tokens`. + +6. **KV-cache prefix reuse is a latency feature clients actively rely on.** llama.vscode warms the server with a fire-and-forget `/infill` `n_predict:0` request [DeepWiki](https://deepwiki.com/ggml-org/llama.vscode/3.2-llamaserver) and sets `cache_prompt:true`; [GitHub](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) `--cache-reuse 256` is a standard launch flag. For acceptable repeated-prefix latency you must support `cache_prompt`/prompt-prefix reuse. + +## Details + +### A. Landscape of clients targeting local endpoints + +| Client | IDEs | License | Local-endpoint mechanism | +|---|---|---|---| +| **GitHub Copilot (VS Code)** | VS Code | Proprietary (Copilot sub; BYOK works on Free) | **Chat/agent only.** Generic **Custom Endpoint** (`chat-completions`/`responses`/`messages`) — **Stable since 1.122 (May 28 2026)**; built-in **Ollama** provider (native `/api/*`); legacy `github.copilot.chat.customOAIModels` (OpenAI base URL). Inline completion NOT available locally. | +| **GitHub Copilot (Visual Studio / JetBrains)** | VS, JetBrains | Proprietary | Model picker; local/BYOK parity with VS Code is unverified from a primary source (treat as lagging). | +| **GitHub Copilot CLI** | Terminal | Proprietary | `COPILOT_PROVIDER_BASE_URL` [Ofox](https://ofox.ai/blog/github-copilot-byok-oai-compatible-api-setup/) + `COPILOT_MODEL` (+`COPILOT_PROVIDER_TYPE=azure/anthropic`); any OpenAI-compatible endpoint; requires tool calling + streaming; "for best results, use a model with a context window of at least 128k tokens." 
| +| **Continue.dev** | VS Code, JetBrains | Apache-2.0 | `provider: openai` + `apiBase`; [Continue](https://docs.continue.dev/customize/model-providers/top-level/openai) native `provider: llama.cpp`; `provider: ollama`; roles `chat/edit/apply/autocomplete/embed/rerank`. | +| **Cline** | VS Code | Apache-2.0 | "OpenAI Compatible" base URL + key + model ID; [Cline](https://docs.cline.bot/provider-config/openai-compatible) tolerant XML-ish tool parsing. | +| **Roo Code** | VS Code | Apache-2.0 | "OpenAI Compatible" base URL; **native tool calling only (≥3.37)**. | +| **Kilo Code** | VS Code | Apache-2.0 | OpenAI-compatible; XML tool-call option still present in advanced settings (later versions). | +| **Twinny** | VS Code, VSCodium | MIT | OpenAI-compatible chat; **`/infill` or FIM template** for completion; [Open VSX Registry](https://open-vsx.org/extension/rjmacarthy/twinny/3.17.6) llama.cpp/Ollama/LM Studio/Oobabooga presets. | +| **llama.vscode / llama.vim** | VS Code, Vim | MIT | **llama.cpp `/infill`** for FIM (required); `/v1/chat/completions` for chat/agent; `/v1/embeddings`. | +| **ProxyAI (formerly CodeGPT)** | JetBrains | Apache-2.0 | "Custom OpenAI" provider; FIM template for code completion; dedicated LLaMA C/C++ offline provider. | +| **Cursor** | Cursor (own) | Proprietary | "Override OpenAI Base URL" (+`/v1`); chat/agent; not local-friendly without a public/tunnel URL. | +| **Zed** | Zed (own) | GPL/Apache | `language_models.openai_compatible` with `api_url`, `available_models[].capabilities.{tools,images}`. | +| **Aider** | Terminal | Apache-2.0 | `OPENAI_API_BASE` + `OPENAI_API_KEY`, `--model openai/<model-name>`. [Aider](https://aider.chat/docs/llms/openai-compat.html) | +| **Void** | Void (own) | Apache-2.0 | OpenAI-compatible base URL (detailed behavior unverified). 
| +| **Tabby** | VS Code, JetBrains | Apache-2.0 (core) | `config.toml`: `kind="llama.cpp/completion"` (FIM via `prompt_template`), `kind="openai/chat"`, `kind="llama.cpp/before_b4356_embedding"`. [Tabby](https://tabby.tabbyml.com/docs/references/models-http-api/llamafile/) | +| **Tabnine, Qodo/Codium, Windsurf, Augment, Sourcegraph Cody, Pieces, Refact, Goose, OpenHands** | various | mixed | Most accept an OpenAI-compatible base URL for chat; FIM/autocomplete typically proprietary or model-specific (verify per-tool). Goose & OpenHands are agent frameworks consuming `/v1/chat/completions` + tools. | + +### B. Exact wire contract per client (Copilot first) + +**GitHub Copilot (VS Code).** Three configuration paths today: + +- **Custom Endpoint provider (Stable since 1.122):** added via *Chat: Manage Language Models* → Add Models → Custom Endpoint. Supports per-model `apiType` ∈ `chat-completions` | `responses` | `messages`. The Insiders-era `chatLanguageModels.json` file used `vendor: "customendpoint"` with the same `apiType` selector. +- **Legacy `github.copilot.chat.customOAIModels` (still works in stable),** object keyed by model id: +```json +"github.copilot.chat.customOAIModels": { + "my-model": { + "name": "My Model", + "url": "http://127.0.0.1:8080/v1/chat/completions", + "toolCalling": true, "vision": false, "thinking": false, + "maxInputTokens": 128000, "maxOutputTokens": 16000, + "requiresAPIKey": false + } +} +``` +- **Built-in Ollama provider:** requires native endpoints `GET /api/version`, `GET /api/tags` (model list), `POST /api/show` (capabilities incl. context length, `tools`/`vision`). LM Studio issue #526 documents that emulating these is what unlocks Copilot's "Ollama" provider for non-Ollama servers. [GitHub](https://github.com/lmstudio-ai/lms/issues/526) + +Copilot reads capability flags `toolCalling`, `vision`, `thinking`, `maxInputTokens`/`maxOutputTokens`. 
It sends standard OpenAI chat bodies with `messages`, `tools`, `tool_choice`, `stream:true`. A documented crash — microsoft/vscode issue #273482 ("OpenAI Compatible models return TypeError: Cannot read properties of undefined (reading 'cached_tokens')"), shows `TypeError: Cannot read properties of undefined (reading 'cached_tokens') at SX.push (…github.copilot-chat-0.33.2025102701…)` reproduced with LM Studio models in agent and ask mode — occurs when the streamed `usage` lacks `prompt_tokens_details.cached_tokens`. [GitHub](https://github.com/microsoft/vscode/issues/273482) + +**Continue.dev** (`config.yaml`, schema v1): `provider: openai` + `apiBase: http://127.0.0.1:8080/v1` + `model` + `roles: [chat, edit, apply]`. For OpenAI-compatible non-chat completion: `useLegacyCompletionsEndpoint: true` forces `/v1/completions`. [Continue](https://docs.continue.dev/customize/model-providers/top-level/openai) Continue's native `llama.cpp` provider posts to `/completion` (singular), not `/completions` (issue #4991). `requestOptions.headers` carries auth; `capabilities: [tool_use, image_input]` can be declared. + +**Cline / Roo Code:** Settings → "OpenAI Compatible" → Base URL (must include `/v1`), API key, model ID. Roo internally uses Anthropic message format then transforms to OpenAI `ChatCompletionTool`; it accumulates streamed fragments by `index`; finalizes on `finish_reason:"tool_calls"`. `parallelToolCalls:true` is the default. + +### C. 
Inline autocomplete / FIM (highest priority) + +**llama.cpp `/infill` contract (the target to implement):** +- `POST /infill`, fields: `input_prefix` (string, code before cursor), `input_suffix` (string, code after cursor), [GitLab](https://gitlab.informatik.uni-halle.de/ambcj/llama.cpp/-/blob/b2308/examples/server/README.md) `input_extra` (array of context chunks, prepended toward prompt start), `prompt` (optional raw text appended after the FIM middle marker), plus all `/completion` options; common params `n_predict`, `temperature`, `top_p`, `top_k`, `stop`, `samplers` (e.g. `["top_k","top_p","infill"]`), `cache_prompt:true`. +- Response: JSON with **`content`** (the completion — the only field clients require), plus `stop`, `tokens_predicted`, `timings`, etc. Streaming supported. +- The model's own FIM tokens are applied server-side from GGUF metadata, so clients send raw prefix/suffix. `--spm-infill` toggles SPM vs PSM ordering. [Debian Manpages](https://manpages.debian.org/unstable/llama.cpp-tools/llama-server.1.en.html) + +**FIM control tokens by model family (verbatim — character precision matters):** + +| Model | Tokens (verbatim) | Char notes | Order | +|---|---|---|---| +| **Qwen2.5-Coder** | `<\|fim_prefix\|>` `<\|fim_suffix\|>` `<\|fim_middle\|>` `<\|fim_pad\|>` `<\|repo_name\|>` `<\|file_sep\|>` | ASCII pipes (ids 151659–151664) | PSM: prefix·suffix·middle | +| **Code Llama** | `▁<PRE>` `▁<SUF>` `▁<MID>` `▁<EOT>` | `▁` = U+2581 (not ASCII underscore); ids 32007–32010 | PSM: `<PRE>`pre`<SUF>`suf`<MID>` (paper recommends PSM over SPM) |
+| **DeepSeek-Coder (v1, 6.7b)** | `<｜fim▁begin｜>` `<｜fim▁hole｜>` `<｜fim▁end｜>` | `｜`=U+FF5C full-width pipe; `▁`=U+2581 | PSM: begin·pre·hole·suf·end |
+| **DeepSeek-Coder-V2** | `<\|fim_begin\|>` `<\|fim_hole\|>` `<\|fim_end\|>` | ASCII pipe + ASCII underscore — **NOT byte-compatible with v1** | PSM |
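+| **StarCoder2** | `<fim_prefix>` `<fim_suffix>` `<fim_middle>` `<fim_pad>` `<repo_name>` `<file_sep>` | ASCII `<>` + underscore | PSM: `<fim_prefix>`prefix`<fim_suffix>`suffix`<fim_middle>` |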
+| **Codestral** | `[PREFIX]` `[SUFFIX]` (`[MIDDLE]`) | ASCII brackets; build via `mistral_common.encode_fim`, not by hand | SPM internal: `[SUFFIX]`suf`[PREFIX]`pre; API uses `prompt`+`suffix` |
+
+Character-precision warnings: Code Llama and DeepSeek-Coder-v1 use the SentencePiece `▁` (U+2581) glyph, not an ASCII underscore; DeepSeek-Coder-v1 uses the full-width pipe `｜` (U+FF5C) while DeepSeek-Coder-V2 uses ASCII `|` + ASCII `_` (the two are not interchangeable — match the exact checkpoint); Codestral uses square-bracket `[PREFIX]`/`[SUFFIX]` (the widely-circulated `<PREFIX>`/`<SUFFIX>` angle-bracket claim is incorrect) and its FIM API is `POST /v1/fim/completions` with `prompt`+`suffix`.
+
+**Per-client FIM behavior:**
+- **llama.vscode / llama.vim:** `POST /infill`, reads `content`; defaults `cache_prompt:true`, `samplers:["top_k","top_p","infill"]`, `top_k:40`, `top_p:0.99`, `stream:false`; [DeepWiki](https://deepwiki.com/ggml-org/llama.vscode/3.2-llamaserver) warms cache with a fire-and-forget `n_predict:0` `/infill`. Requires llama.cpp (only server with `/infill`). Recommended launch: `llama-server -hf ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF --port 8012 -ub 1024 -b 1024 --ctx-size 0 --cache-reuse 256`.
+- **Twinny:** OpenAI-compatible; per-model FIM template; CodeLlama uses `<PRE>{prefix}<SUF>{suffix}<MID>`, DeepSeek uses its FIM template; base (not instruct) models for FIM. "Twinny supports the OpenAI API specification so in theory any API should work."
+- **Tabby:** `kind="llama.cpp/completion"`, `prompt_template="<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>"` (Qwen2.5) [Tabby](https://tabby.tabbyml.com/docs/references/models-http-api/llamafile/) or `"<PRE> {prefix} <SUF>{suffix} <MID>"` (CodeLlama); endpoint **must NOT** include the `/v1` suffix.
+- **Continue.dev autocomplete:** `roles:[autocomplete]`; `provider: llama.cpp` drives FIM; or `provider: openai` with a `template` Mustache string (`{{{prefix}}}`,`{{{suffix}}}`,`{{{filename}}}`,`{{{reponame}}}`,`{{{language}}}`). `autocompleteOptions`: `debounceDelay:250`, `maxPromptTokens:1024`, [Continue](https://docs.continue.dev/reference) `modelTimeout`, `maxSuffixPercentage:0.2`, `prefixPercentage:0.3`, `onlyMyCode:true`.
+- **ProxyAI:** Custom OpenAI → Code Completions → "FIM Template (OpenAI)" + URL; [Medium](https://medium.com/@mitrut98/ghost-coding-on-prem-building-a-self-hosted-ai-copilot-for-intellij-or-any-jetbrains-ide-fdac377a10fd) uses `/v1/completions` or `/v1/chat/completions`.
+- **Cline/Roo:** no ghost-text autocomplete; chat/agent only.
+
+### D. Agentic tool-calling
+
+OpenAI shape required: `tools:[{type:"function",function:{name,description,parameters}}]`, [GitHub](https://github.com/ggml-org/llama.cpp/blob/master/docs/function-calling.md) `tool_choice` ∈ `auto|none|required|{type:"function",function:{name}}`. Streaming `delta.tool_calls[]` carry `index`, `id`, `function.name`, and incremental `function.arguments` string fragments; `finish_reason:"tool_calls"` terminates. [OpenAI API Reference](https://developers.openai.com/api/reference/resources/chat/subresources/completions/streaming-events) **`function.arguments` MUST be a JSON-encoded string, not an object.** ggml-org/llama.cpp issue #20198 ("llama-server tool_calls returns arguments as JSON object instead of string, breaking OpenAI compatibility") documents that after the Autoparser refactoring (PR #18675), llama-server returned `arguments` as a parsed object (root cause in `common/chat.cpp` ~line 132: `{"arguments", json::parse(tool_call.arguments)}`), which crashes the official OpenAI Python SDK (Pydantic) with a `TypeError`. Your server must serialize arguments as a string.
+
+- **Roo Code:** native only (≥3.37); transforms to OpenAI `ChatCompletionTool`; `parallelToolCalls:true` default; finalizes on `finish_reason:"tool_calls"` or stream end. Removal of XML tool calling broke some local stacks (issue #10319, SGLang gpt-oss 500 errors); rollback to 3.36.16 restores the XML selector.
+- **Cline / Kilo Code:** historically XML-in-prompt tool calling parsed from text; tolerant of weak native support. The `native_tool_call_adapter` proxy exists specifically to translate Cline/Roo XML into OpenAI `tool_calls`.
+- **Copilot agent:** native OpenAI tools via chat-completions; needs `toolCalling:true` on the model entry (a model that appears in chat but not agent mode usually has `toolCalling` missing/false).
+- **llama.cpp tool support** requires `--jinja` (and often `--chat-template-file` for a tool-capable template; worst case `--chat-template chatml`). `chat_template_kwargs` (e.g. `{"enable_thinking":false}`), `parallel_tool_calls`, and `reasoning_format` (deepseek → `message.reasoning_content`) [Debian Manpages](https://manpages.debian.org/unstable/llama.cpp-tools/llama-server.1.en.html) are supported. [Fossies](https://fossies.org/linux/llama.cpp/tools/server/README.md) No client in scope strictly requires `/v1/responses`; Copilot's custom-endpoint can use `responses` or Anthropic `messages` but `chat-completions` suffices. Structured outputs (`response_format:{type:"json_schema"}`) are supported by llama.cpp via grammar but are not universally required by these clients.
+
+### E. Model discovery & capabilities
+
+- `GET /v1/models` clients read `id`, `object`, `owned_by`. Continue can use the special `AUTODETECT` model name. Roo/Cline mostly take an explicit model ID.
+- Copilot's **Ollama** path reads context length and `tools`/`vision` from `POST /api/show` (microsoft/vscode issue #295659 shows Copilot's Manage Models UI expecting capability + context fields there, e.g. `262144` context, `Tools`/`Vision`). The OpenAI custom path takes `maxInputTokens`/`maxOutputTokens`/`toolCalling`/`vision` from settings, not from `/v1/models`.
+- Zed reads `max_tokens`, `max_output_tokens`, `capabilities.{tools,images}` [Ofox](https://ofox.ai/blog/zed-editor-ai-configuration-guide-2026/) from its own settings, not the server.
+- A single advertised model is fine for most clients; multi-model is optional. Non-standard capability fields on `/v1/models` are largely ignored — capabilities are configured client-side.
+
+### F. Cross-cutting compatibility pitfalls
+
+- **Trailing usage chunk:** when `stream_options:{include_usage:true}`, emit a final chunk with `choices:[]` and a populated `usage`, [LiteLLM](https://docs.litellm.ai/docs/completion/usage) then `data: [DONE]`. All non-final chunks should carry `usage:null` (per OpenAI's documented streaming shape and LiteLLM docs).
+- **Copilot `cached_tokens`:** include `usage.prompt_tokens_details.cached_tokens` or Copilot's custom-OAI path throws `Cannot read properties of undefined (reading 'cached_tokens')` (vscode #273482).
+- **finish_reason after tool calls:** must be `"tool_calls"` on the terminating chunk in streaming, else agent loops terminate early [GitHub](https://github.com/open-webui/open-webui/issues/21768) (the open-webui #21768 pattern: "finish_reason incorrectly returned as 'stop' after streaming tool_calls").
+- **First delta with role:** emit an initial `delta:{role:"assistant",content:""}` chunk (matches OpenAI's documented first streamed event).
+- **`data: [DONE]` terminator** is expected by OpenAI-style consumers; always send it last. LiteLLM #25389 shows consumers that stop at `finish_reason` lose the trailing usage chunk — keep the stream open until `[DONE]`.
+- **CORS / preflight:** browser/webview clients send `OPTIONS` preflights and an `Authorization` header; respond to `OPTIONS` with `Access-Control-Allow-Origin`, `Access-Control-Allow-Methods: GET,POST,OPTIONS`, `Access-Control-Allow-Headers: Content-Type, Authorization`. Ollama's default of restricting origins/headers is a documented friction source ("Request header field 'authorization' is not allowed by Access-Control-Allow-Headers in preflight response").
+- **Path / `/v1` differences:** some clients append `/v1` (Continue `openai`, Cline, Zed), some must NOT (Tabby `llama.cpp/completion`, llama.vscode `endpoint_chat` excludes `v1`). Continue's `llama.cpp` provider uses `/completion` singular. Support both trailing-slash and non-slash forms.
+- **Keep-alive / timeouts:** long prefill needs SSE heartbeats (you already emit these) and generous read timeouts (llama.cpp server default is 600s; Continue defaults `requestOptions.timeout` to tens of seconds — local guides raise it to 60000 ms for CPU).
+- **gzip:** accept but don't require; some clients send `Accept-Encoding: gzip`.
+- **arguments-as-string** (Section D) is the single most damaging non-spec deviation.
+
+### G. Adjacent features
+
+- **Embeddings:** `POST /v1/embeddings` (`input` string or array, `model`, `encoding_format`). Used by Continue (`roles:[embed]`), Twinny (workspace embeddings, default `all-minilm:latest`), llama.vscode (semantic re-rank). Response must be OpenAI `data:[{embedding,...}]` shaped; llama.cpp's native `/embedding` is non-OAI [GitHub](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md), so clients want `/v1/embeddings`.
+- **Reranking:** llama.cpp exposes `POST /v1/rerank` [GitHub](https://github.com/ggml-org/llama.cpp/blob/master/tools/server/README.md) (also `/rerank`, `/reranking`) requiring `--reranking`/`--pooling rank`; request `{model,query,documents,top_n}`, response `{results:[{index,relevance_score}]}`. **Continue expects a `data` array and errors on llama.cpp's `results` shape** (continue #6478: "Expected 'data' array but got: ['id','model','usage','results']") — a real interop gap to document or shim.
+- **Prompt / KV cache:** `cache_prompt:true` and `--cache-reuse N` reuse common prefixes — essential for repeated-prefix latency (chat turns, FIM). `--cache-reuse` has regressed before (llama.cpp #15082) — pin a known-good build.
+- **Vision:** chat `content` parts with `image_url`/base64; llama.cpp multimodal needs `--mmproj`; [GitHub](https://github.com/DelftSolutions/vscode-llama-copilot) clients gate on `vision:true` / `capabilities.images`. Used by Cline/Roo screenshots and ProxyAI image chat. Per the llama.cpp server README, "A client must not specify [media] unless the server has the multimodal capability. Clients should check /models or /v1/models for the multimodal capability before a multimodal request."
+
+## Recommendations
+
+**Must-have for broad compatibility (do these first):**
+1. **Implement `POST /infill`** with `input_prefix`/`input_suffix`/`input_extra`/`prompt`/`n_predict`/`cache_prompt`, returning `content`. Unlocks llama.vscode, llama.vim, Twinny (llama.cpp mode), Tabby, Continue `llama.cpp` autocomplete. *This is your biggest gap.*
+2. **Serialize `tool_calls.function.arguments` as a JSON string** (never an object). Unlocks Roo Code, Copilot agent, any OpenAI-SDK consumer. Acceptance benchmark: the OpenAI Python SDK must parse your tool response without `TypeError`.
+3. **Streaming tool-call correctness:** `delta.tool_calls` with `index`/`id`/`function.name`/incremental `arguments`, terminating `finish_reason:"tool_calls"`. Unlocks all agent modes.
+4. **`stream_options.include_usage` trailing chunk** with empty `choices` + populated `usage`, always ending with `data: [DONE]`, and include `usage.prompt_tokens_details.cached_tokens`. Unlocks Copilot custom-endpoint without crashes.
+5. **CORS/OPTIONS handling** allowing `Authorization` + `Content-Type`. Unlocks webview/browser clients.
+6. **Tolerant routing:** accept both `/v1/...` and bare paths, with and without trailing slash; accept `/completion` and `/completions`.
+
+**High-value:**
+7. **Emulate the Ollama native API** (`GET /api/version`, `GET /api/tags`, `POST /api/show` with `capabilities` incl. tools/vision + context length, and `POST /api/chat`/`/api/generate`). *Downgraded from earlier priority:* because the OpenAI Custom Endpoint provider reached VS Code Stable in 1.122 (May 28, 2026), a clean OpenAI-compatible surface now covers Copilot chat/agent. Ollama emulation is still worthwhile to support older VS Code versions and tools hard-coded to Ollama's native endpoints, but it is no longer on the critical path for current Copilot.
+8. **`POST /v1/rerank`** (`{query,documents,top_n}` → `{results:[{index,relevance_score}]}`) and consider also returning a `data` array alias for Continue. Unlocks RAG / "chat with codebase."
+9. **`cache_prompt` + prefix reuse** and SSE heartbeats during prefill (you have heartbeats; add prefix reuse).
+10. **Advertise capabilities** in `/v1/models` and (if Ollama-emulating) `/api/show` so agent modes light up.
+
+**Nice-to-have:**
+11. **Vision** via `image_url` content parts (needs mmproj).
+12. **Anthropic `/v1/messages`** and **OpenAI `/v1/responses`** shims for Copilot's other apiTypes and for Claude-shaped clients (Claude Code, etc.).
+13. **Per-model FIM template registry** (Qwen / CodeLlama / DeepSeek v1 & V2 / StarCoder2 / Codestral) if you also expose `/v1/completions`-with-`suffix` for clients that don't use `/infill`.
+14. **`/props` / `/v1/models` context-length reporting** so clients auto-size prompts.
+
+**Staged rollout:** Ship (1)–(6) → validate against Continue (autocomplete + chat + agent), Twinny (FIM), Roo Code (native tools), Copilot Custom Endpoint (chat/agent). Then (7)–(10) → validate Copilot Ollama provider + RAG. Then (11)–(14).
+
+## Exact config snippets
+
+**GitHub Copilot (VS Code ≥1.122) — Custom Endpoint (preferred):** *Chat: Manage Language Models* → Add Models → **Custom Endpoint** → display name, API key (any string if none), Base URL `http://127.0.0.1:8080/v1`, model ID, apiType `chat-completions`. For older stable builds, use the legacy object in `settings.json`:
+```json
+"github.copilot.chat.customOAIModels": {
+  "local-qwen": {
+    "name": "Local Qwen (llama.cpp)",
+    "url": "http://127.0.0.1:8080/v1/chat/completions",
+    "toolCalling": true,
+    "vision": false,
+    "thinking": false,
+    "maxInputTokens": 32768,
+    "maxOutputTokens": 8192,
+    "requiresAPIKey": false
+  }
+}
+```
+(For Copilot's built-in **Ollama** provider, run an Ollama-emulation layer on `:11434` and use Chat: Manage Language Models → Add → Ollama. Inline completions cannot be served locally in any case.)
+
+**Continue.dev — `~/.continue/config.yaml`:**
+```yaml
+name: Local llama.cpp
+version: 1.0.0
+schema: v1
+models:
+  - name: local-fim
+    provider: llama.cpp
+    model: your-model.gguf
+    apiBase: http://127.0.0.1:8080
+    roles: [autocomplete]
+    autocompleteOptions:
+      debounceDelay: 250
+      maxPromptTokens: 1024
+  - name: local-chat
+    provider: openai
+    model: your-model
+    apiBase: http://127.0.0.1:8080/v1
+    apiKey: sk-local
+    roles: [chat, edit, apply]
+```
+
+**Cline / Roo Code (VS Code settings UI):**
+- API Provider: **OpenAI Compatible**
+- Base URL: `http://127.0.0.1:8080/v1`
+- API Key: `sk-local` (any string if no auth)
+- Model ID: `your-model`
+- (Roo Code ≥3.37: model must support native tool calling, or roll back to 3.36.16 for the XML option.)
+
+**Twinny (Providers → Code/FIM provider):**
+- Provider: `llamacpp`
+- Hostname/port: `127.0.0.1` / `8080`
+- FIM endpoint path: `/infill`
+- FIM Template: match the model (e.g. Qwen2.5-Coder / DeepSeek / CodeLlama)
+
+**Tabby — `~/.tabby/config.toml`:**
+```toml
+[model.completion.http]
+kind = "llama.cpp/completion"
+api_endpoint = "http://127.0.0.1:8080"   # no /v1
+prompt_template = "<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>"
+
+[model.chat.http]
+kind = "openai/chat"
+api_endpoint = "http://127.0.0.1:8080/v1"
+```
+
+## Caveats / open risks (2026 moving targets)
+- **Copilot Custom Endpoint (OpenAI) reached Stable in VS Code 1.122 (May 28, 2026).** This is recent; behavior on older VS Code (and the exact apiType handling) may differ. The legacy `github.copilot.chat.customOAIModels` object is slated to change to an array form (microsoft/vscode issue #277102) — track this.
+- **Copilot inline completion remains closed to local models** — a stable, documented limitation as of 2026 ("Inline suggestions and next edit suggestions still require a GitHub sign-in"). The only local autocomplete in VS Code is via third-party `InlineCompletionItemProvider` extensions (Continue, Twinny, llama.vscode).
+- **Roo Code removed the XML tool-calling selector in 3.37** and forces native; this broke some local stacks (issue #10319). A fallback may return in future versions — verify per release.
+- **llama.cpp build-dependent behavior:** the `tool_calls` arguments-as-object regression (#20198, from PR #18675) and `--cache-reuse` regressions (#15082) mean you should pin a known-good commit and add regression tests for both.
+- **llama.cpp rerank path aliases** (`/rerank` vs `/v1/rerank` vs `/reranking`) have shifted across releases; Continue expects a `data` array, not `results` [GitHub](https://github.com/continuedev/continue/issues/6478) (#6478). Reranker score quality also varies with GGUF conversion/quantization (#16407).
+- **Visual Studio / JetBrains Copilot** local-model parity with VS Code is unverified from a primary source — treat as "lagging/uncertain" and test directly before claiming support.
+- **Copilot CLI BYOK** (GitHub Changelog, Apr 7 2026) requires tool calling + streaming and recommends ≥128k context; `COPILOT_OFFLINE=true` enables air-gapped use with a local provider.
+
+*Primary sources cited inline include: VS Code docs (code.visualstudio.com/docs/agent-customization/language-models) and the v1.122 release notes; GitHub Changelog (Apr 7 & Apr 22, 2026) and GitHub Docs BYOK pages; ggml-org/llama.cpp `tools/server/README.md`, `docs/function-calling.md`, and issues #20198, #15082, #16407, #16498, #21415; ggml-org/llama.vscode repo/wiki and DeepWiki; Continue.dev docs (autocomplete, yaml-reference, openai provider) and issues #4991, #2330, #6478; Roo Code docs and issues #4047, #10319; Cline docs; Tabby docs (llama.cpp/llamafile/model config); Twinny repo/docs; ProxyAI repo/docs; microsoft/vscode issues #273482, #277102, #295659; Hugging Face model cards/papers for Qwen2.5-Coder, Code Llama, DeepSeek-Coder, StarCoder2, and Codestral. Claims about Visual Studio/JetBrains Copilot parity and the Void editor are flagged as unverified.*
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 49a03a0c..3f1f52c6 100644
--- a/pom.xml
+++ b/pom.xml
@@ -58,7 +58,6 @@ SPDX-License-Identifier: MIT
 		4.2.0
 		2.22.0
 		3.8.6
-		2.3.1
 		2.0.18
 		1.5.34
 		1.27
@@ -149,20 +148,6 @@ SPDX-License-Identifier: MIT
 			jackson-databind
 			${jackson.version}
 		
-		
-		
-			org.nanohttpd
-			nanohttpd
-			${nanohttpd.version}
-			true
-		
 		
 		
 			org.slf4j
@@ -992,7 +977,7 @@ SPDX-License-Identifier: MIT
 			
     
-        
+        
         
         
     
 
     
     
-        
-        
-        
+        
+        
+    
+
+    
+    
+        
+        
+        
+    
+
+    
+    
+        
+        
+        
+            
+            
+        
+    
+    
+        
+        
+        
+    
+
+    
+    
+        
+        
+        
+    
+
+    
+    
+        
+        
+        
+            
+            
+        
+    
+
+    
+    
+        
+        
+        
+    
+
+    
+    
+        
+        
+    
+
+    
+    
+        
+            
+            
+        
+        
     
 
 
diff --git a/src/main/cpp/webui_stub/ui.h b/src/main/cpp/webui_stub/ui.h
new file mode 100644
index 00000000..feb15889
--- /dev/null
+++ b/src/main/cpp/webui_stub/ui.h
@@ -0,0 +1,52 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin 
+//
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+// ui.h — minimal stand-in for the WebUI asset interface that llama.cpp's
+// tools/ui (CMake target "llama-ui") normally GENERATES into ui.h / ui.cpp at
+// build time via the llama-ui-embed host tool.
+//
+// The upstream HTTP transport (tools/server/server-http.cpp) does
+//     #include "ui.h"
+// and references llama_ui_get_assets() / llama_ui_find_asset() /
+// llama_ui_use_gzip().  We compile server-http.cpp directly into libjllama but do
+// NOT ship the Svelte WebUI assets (building them needs npm, or a prebuilt-asset
+// download from Hugging Face) — so we provide the exact "empty asset table"
+// interface that embed.cpp emits for its n_assets == 0 branch: the struct plus
+// the three functions, returning nothing.
+//
+// LLAMA_UI_HAS_ASSETS is intentionally left UNDEFINED.  Every static-asset-serving
+// block in server-http.cpp is guarded by `#if defined(LLAMA_UI_HAS_ASSETS)`, so
+// all of them compile out; the single unguarded use — iterating the asset list to
+// collect public endpoint paths — simply iterates this empty array.
+//
+// To actually ship the WebUI later: remove this stub directory from jllama's
+// include path, build the real llama-ui target (assets on), and add its
+// generated-header directory instead.
+
+#include <array>
+#include <cstddef>
+#include <string>
+
+struct llama_ui_asset {  // one row of the WebUI asset table returned by llama_ui_get_assets()
+    std::string name;            // asset name; llama_ui_find_asset() takes a name as its lookup argument
+    const unsigned char * data;  // NOTE(review): presumably a non-owning pointer to the embedded bytes — confirm against embed.cpp
+    std::size_t size;            // NOTE(review): presumably the byte length of `data` — confirm against embed.cpp
+    std::string etag;            // NOTE(review): presumably the HTTP ETag sent for this asset — confirm
+    std::string type;            // NOTE(review): presumably the MIME content type — confirm
+};
+
+inline const llama_ui_asset * llama_ui_find_asset(const std::string & /*name*/) {  // lookup by name; argument unused in the stub
+    return nullptr;  // the stub ships an empty asset table, so no name ever resolves
+}
+
+inline bool llama_ui_use_gzip() {  // NOTE(review): presumably tells the server whether embedded assets are gzip-compressed — confirm against embed.cpp
+    return false;  // constant in this stub: there are no assets to serve
+}
+
+inline const std::array<llama_ui_asset, 0> & llama_ui_get_assets() {  // the (empty) asset table of this stub
+    static const std::array<llama_ui_asset, 0> empty{};  // zero-length: the unguarded endpoint-collecting loop in server-http.cpp iterates nothing
+    return empty;  // function-local static, so the returned reference stays valid for the program's lifetime
+}
diff --git a/src/main/java/net/ladenthin/llama/LlamaModel.java b/src/main/java/net/ladenthin/llama/LlamaModel.java
index 2a1e131f..012090c6 100644
--- a/src/main/java/net/ladenthin/llama/LlamaModel.java
+++ b/src/main/java/net/ladenthin/llama/LlamaModel.java
@@ -472,7 +472,19 @@ public LlamaOutput rerank(String query, String... documents) {
         return new LlamaOutput(query, probabilities, true, StopReason.EOS);
     }
 
-    native String handleRerank(String query, String... documents);
+    /**
+     * Rerank {@code documents} against {@code query} and return the raw native JSON array of results.
+     * Each element carries {@code "document"}, {@code "index"} (the position in the input array) and
+     * {@code "score"}. This is the JSON-in/JSON-out form behind
+     * {@link #rerank(boolean, String, String...)}; the OpenAI-compatible server reshapes it into the
+     * {@code POST /v1/rerank} response. Requires the model loaded in reranking mode
+     * ({@link net.ladenthin.llama.parameters.ModelParameters#enableReranking()}).
+     *
+     * @param query the query string
+     * @param documents the documents to score against the query
+     * @return a JSON array of {@code {document, index, score}} objects
+     */
+    public native String handleRerank(String query, String... documents);
 
     /**
      * Applies the chat template to the given inference parameters and returns the formatted string.
@@ -853,7 +865,7 @@ public String restoreSlot(int slotId, String filepath) {
      * result in OAI format with a {@code "choices"} array. This is the raw JSON-in/JSON-out
      * form used by {@link #chatComplete(net.ladenthin.llama.parameters.InferenceParameters)}
      * and by the embedded OpenAI-compatible server
-     * ({@link net.ladenthin.llama.server.LlamaServer}); it is the chat counterpart of
+     * ({@link net.ladenthin.llama.server.OpenAiCompatServer}); it is the chat counterpart of
      * {@link #handleCompletionsOai(String)} and {@link #handleEmbeddings(String, boolean)}.
      *
      * @param params JSON string with OAI-compatible chat-completion parameters (incl. {@code "messages"})
diff --git a/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java b/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java
index 6da0c49d..b10a6b87 100644
--- a/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java
+++ b/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java
@@ -58,6 +58,8 @@ public final class InferenceParameters extends JsonParameters {
     private static final String PARAM_INPUT_PREFIX = "input_prefix";
     private static final String PARAM_INPUT_SUFFIX = "input_suffix";
     private static final String PARAM_CACHE_PROMPT = "cache_prompt";
+    private static final String PARAM_STREAM_OPTIONS = "stream_options";
+    private static final String PARAM_RESPONSE_FORMAT = "response_format";
     private static final String PARAM_N_PREDICT = "n_predict";
     private static final String PARAM_TOP_K = "top_k";
     private static final String PARAM_TOP_P = "top_p";
@@ -439,6 +441,32 @@ public InferenceParameters withJsonSchema(String schema) {
         return withRaw(PARAM_JSON_SCHEMA, schema);
     }
 
+    /**
+     * Produces a copy of this request whose OpenAI streaming {@code stream_options} object is replaced.
+     * With {@code {"include_usage":true}} the native server appends one final {@code usage} chunk (its
+     * {@code choices} array empty) once the stream has completed; OpenAI clients — the VS Code Copilot
+     * custom endpoint in particular — depend on that chunk for token accounting.
+     *
+     * @param streamOptionsJson JSON-encoded text of the {@code stream_options} object
+     * @return the modified copy; the receiver itself is not altered
+     */
+    public InferenceParameters withStreamOptions(String streamOptionsJson) {
+        return this.withRaw(PARAM_STREAM_OPTIONS, streamOptionsJson);
+    }
+
+    /**
+     * Produces a copy of this request whose OpenAI {@code response_format} object is replaced. Given
+     * {@code {"type":"json_object"}} or {@code {"type":"json_schema","json_schema":{...}}}, the native
+     * server derives a GBNF grammar constraint internally and the model can only emit conforming JSON;
+     * this is the OpenAI "structured outputs" feature that strict agent clients use.
+     *
+     * @param responseFormatJson JSON-encoded text of the {@code response_format} object
+     * @return the modified copy; the receiver itself is not altered
+     */
+    public InferenceParameters withResponseFormat(String responseFormatJson) {
+        return this.withRaw(PARAM_RESPONSE_FORMAT, responseFormatJson);
+    }
+
     /**
      * Returns a new request with the repetition-penalty prompt-portion override replaced.
      *
diff --git a/src/main/java/net/ladenthin/llama/server/AnthropicApiSupport.java b/src/main/java/net/ladenthin/llama/server/AnthropicApiSupport.java
new file mode 100644
index 00000000..c48f8dc7
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/AnthropicApiSupport.java
@@ -0,0 +1,407 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin 
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import java.io.IOException;
+
+/**
+ * Pure translators between the Anthropic Messages API ({@code POST /v1/messages}) and the internal
+ * OpenAI chat shape, plus builders for the Anthropic streaming SSE events. Lets clients that speak the
+ * Anthropic protocol (Claude Code, Copilot's {@code messages} apiType) drive the local model without a
+ * second inference path.
+ *
+ * 

Request mapping covers Anthropic's content-block model: a {@code system} string/blocks becomes an + * OpenAI system message; message {@code content} that is a string or an array of {@code text} / + * {@code tool_use} / {@code tool_result} blocks is flattened to OpenAI messages (a user message's + * {@code tool_result} blocks become separate {@code role:"tool"} messages); Anthropic {@code tools} + * ({@code name}/{@code description}/{@code input_schema}) become OpenAI function tools. Responses map the + * other way: OpenAI {@code content} + {@code tool_calls} become Anthropic {@code text} + + * {@code tool_use} content blocks. + * + *

Stateless and free of JNI / model dependencies; unit-testable with JSON literals. Streaming state + * is held by {@link AnthropicStreamTranslator}. + */ +final class AnthropicApiSupport { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private AnthropicApiSupport() {} + + /** + * Whether the Anthropic request asks for a streamed response ({@code "stream"} defaults to false). + * + * @param request the parsed Anthropic request + * @return {@code true} if {@code "stream"} is explicitly true + */ + static boolean isStreaming(JsonNode request) { + return request.path("stream").asBoolean(false); + } + + /** + * Translate an Anthropic {@code /v1/messages} request into the internal OpenAI chat request shape. + * + * @param request the parsed Anthropic request + * @return an OpenAI {@code /v1/chat/completions} request object + */ + static ObjectNode toOpenAiChatRequest(JsonNode request) { + ObjectNode openAi = OBJECT_MAPPER.createObjectNode(); + if (request.path("model").isTextual()) { + openAi.put("model", request.path("model").asText()); + } + + ArrayNode messages = openAi.putArray("messages"); + String system = systemText(request.path("system")); + if (!system.isEmpty()) { + ObjectNode systemMessage = messages.addObject(); + systemMessage.put("role", "system"); + systemMessage.put("content", system); + } + for (JsonNode message : request.path("messages")) { + appendOpenAiMessages(messages, message); + } + + JsonNode tools = request.path("tools"); + if (tools.isArray() && tools.size() > 0) { + ArrayNode openAiTools = openAi.putArray("tools"); + for (JsonNode tool : tools) { + ObjectNode openAiTool = openAiTools.addObject(); + openAiTool.put("type", "function"); + ObjectNode function = openAiTool.putObject("function"); + function.put("name", tool.path("name").asText("")); + if (tool.path("description").isTextual()) { + function.put("description", tool.path("description").asText()); + } + if (tool.path("input_schema").isObject()) { + 
function.set("parameters", tool.path("input_schema").deepCopy()); + } + } + String toolChoice = toOpenAiToolChoice(request.path("tool_choice")); + if (toolChoice != null) { + openAi.put("tool_choice", toolChoice); + } + // Anthropic expresses "no parallel tool use" via tool_choice.disable_parallel_tool_use; + // OpenAI's equivalent is parallel_tool_calls=false. Map it so the shared chat core honors + // a client's request to serialize tool calls (default stays parallel when unset/false). + if (request.path("tool_choice").path("disable_parallel_tool_use").asBoolean(false)) { + openAi.put("parallel_tool_calls", false); + } + } + + copyNumber(request, "max_tokens", openAi, "max_tokens"); + copyNumber(request, "temperature", openAi, "temperature"); + copyNumber(request, "top_p", openAi, "top_p"); + copyNumber(request, "top_k", openAi, "top_k"); + if (request.path("stop_sequences").isArray()) { + openAi.set("stop", request.path("stop_sequences").deepCopy()); + } + return openAi; + } + + private static String systemText(JsonNode system) { + if (system.isTextual()) { + return system.asText(); + } + if (system.isArray()) { + StringBuilder sb = new StringBuilder(); + for (JsonNode block : system) { + if (block.path("text").isTextual()) { + sb.append(block.path("text").asText()); + } + } + return sb.toString(); + } + return ""; + } + + private static void appendOpenAiMessages(ArrayNode out, JsonNode anthropicMessage) { + String role = anthropicMessage.path("role").asText("user"); + JsonNode content = anthropicMessage.path("content"); + if (content.isTextual()) { + ObjectNode message = out.addObject(); + message.put("role", role); + message.put("content", content.asText()); + return; + } + if (!content.isArray()) { + return; + } + + StringBuilder text = new StringBuilder(); + ArrayNode toolCalls = OBJECT_MAPPER.createArrayNode(); + boolean hadToolResult = false; + for (JsonNode block : content) { + String type = block.path("type").asText(""); + switch (type) { + case "text": 
+ text.append(block.path("text").asText("")); + break; + case "tool_use": + // Assistant tool call: Anthropic input (object) -> OpenAI arguments (JSON string). + ObjectNode toolCall = toolCalls.addObject(); + toolCall.put("id", block.path("id").asText("")); + toolCall.put("type", "function"); + ObjectNode function = toolCall.putObject("function"); + function.put("name", block.path("name").asText("")); + function.put("arguments", block.path("input").toString()); + break; + case "tool_result": + // A user-message tool_result becomes a separate OpenAI role:"tool" message. + ObjectNode toolMessage = out.addObject(); + toolMessage.put("role", "tool"); + toolMessage.put("tool_call_id", block.path("tool_use_id").asText("")); + toolMessage.put("content", toolResultText(block.path("content"))); + hadToolResult = true; + break; + default: + break; + } + } + if (text.length() > 0 || toolCalls.size() > 0) { + ObjectNode message = out.addObject(); + message.put("role", role); + if (toolCalls.size() > 0 && text.length() == 0) { + message.putNull("content"); // assistant tool-call turn carries null content + } else { + message.put("content", text.toString()); + } + if (toolCalls.size() > 0) { + message.set("tool_calls", toolCalls); + } + } else if (!hadToolResult) { + // Genuinely empty/plain content (no text, no tool calls, no tool_result) — keep a slot. + // A content array of only tool_result blocks emits no extra message (they became tool messages). 
+ ObjectNode message = out.addObject(); + message.put("role", role); + message.put("content", ""); + } + } + + private static String toolResultText(JsonNode content) { + if (content.isTextual()) { + return content.asText(); + } + if (content.isArray()) { + StringBuilder sb = new StringBuilder(); + for (JsonNode block : content) { + if (block.path("text").isTextual()) { + sb.append(block.path("text").asText()); + } + } + return sb.toString(); + } + return content.toString(); + } + + private static @org.jspecify.annotations.Nullable String toOpenAiToolChoice(JsonNode toolChoice) { + String type = toolChoice.path("type").asText(""); + if ("auto".equals(type)) { + return "auto"; + } + if ("any".equals(type) || "tool".equals(type)) { + // OpenAI's textual tool_choice cannot name a specific function; "required" is the closest. + return "required"; + } + return null; + } + + private static void copyNumber(JsonNode from, String fromKey, ObjectNode to, String toKey) { + JsonNode value = from.path(fromKey); + if (value.isNumber()) { + to.set(toKey, value); + } + } + + /** + * Translate a non-streaming OpenAI {@code chat.completion} into an Anthropic message response. 
+ * + * @param openAiCompletionJson the OpenAI completion body + * @param model the model id to echo + * @return the Anthropic message serialized as JSON + */ + static String toAnthropicResponse(String openAiCompletionJson, String model) { + ObjectNode root = OBJECT_MAPPER.createObjectNode(); + root.put("id", "msg_" + Integer.toHexString(openAiCompletionJson.hashCode())); + root.put("type", "message"); + root.put("role", "assistant"); + root.put("model", model); + ArrayNode content = root.putArray("content"); + String stopReason = "end_turn"; + ObjectNode usage = root.putObject("usage"); + usage.put("input_tokens", 0); + usage.put("output_tokens", 0); + try { + JsonNode completion = OBJECT_MAPPER.readTree(openAiCompletionJson); + JsonNode choice = completion.path("choices").path(0); + JsonNode message = choice.path("message"); + String text = message.path("content").asText(""); + if (!text.isEmpty()) { + ObjectNode textBlock = content.addObject(); + textBlock.put("type", "text"); + textBlock.put("text", text); + } + JsonNode toolCalls = message.path("tool_calls"); + if (toolCalls.isArray()) { + for (JsonNode toolCall : toolCalls) { + content.add(toolUseBlock(toolCall)); + } + } + stopReason = anthropicStopReason(choice.path("finish_reason").asText("stop")); + JsonNode openAiUsage = completion.path("usage"); + if (openAiUsage.isObject()) { + usage.put("input_tokens", openAiUsage.path("prompt_tokens").asInt(0)); + usage.put("output_tokens", openAiUsage.path("completion_tokens").asInt(0)); + } + } catch (IOException e) { + stopReason = "end_turn"; + } + root.put("stop_reason", stopReason); + root.putNull("stop_sequence"); + return root.toString(); + } + + /** Build an Anthropic {@code tool_use} content block from an OpenAI tool call. 
*/ + static ObjectNode toolUseBlock(JsonNode openAiToolCall) { + JsonNode function = openAiToolCall.path("function"); + ObjectNode block = OBJECT_MAPPER.createObjectNode(); + block.put("type", "tool_use"); + block.put("id", openAiToolCall.path("id").asText("")); + block.put("name", function.path("name").asText("")); + block.set("input", parseToObject(function.path("arguments"))); + return block; + } + + private static JsonNode parseToObject(JsonNode arguments) { + if (arguments.isObject() || arguments.isArray()) { + return arguments; + } + if (arguments.isTextual()) { + try { + return OBJECT_MAPPER.readTree(arguments.asText()); + } catch (IOException e) { + return OBJECT_MAPPER.createObjectNode(); + } + } + return OBJECT_MAPPER.createObjectNode(); + } + + /** Map an OpenAI finish_reason to an Anthropic stop_reason. */ + static String anthropicStopReason(String openAiFinishReason) { + switch (openAiFinishReason) { + case "length": + return "max_tokens"; + case "tool_calls": + return "tool_use"; + case "stop": + default: + return "end_turn"; + } + } + + // ----- streaming SSE event builders ----- + + /** + * Frame an Anthropic SSE event: {@code event: \ndata: \n\n}. + * + * @param type the event type + * @param dataJson the event data object serialized as JSON + * @return the framed SSE event + */ + static String sseEvent(String type, String dataJson) { + return "event: " + type + "\ndata: " + dataJson + "\n\n"; + } + + /** {@code message_start} event for a new assistant message. 
*/ + static String messageStartEvent(String id, String model) { + ObjectNode message = OBJECT_MAPPER.createObjectNode(); + message.put("id", id); + message.put("type", "message"); + message.put("role", "assistant"); + message.put("model", model); + message.putArray("content"); + message.putNull("stop_reason"); + message.putNull("stop_sequence"); + ObjectNode usage = message.putObject("usage"); + usage.put("input_tokens", 0); + usage.put("output_tokens", 0); + ObjectNode data = OBJECT_MAPPER.createObjectNode(); + data.put("type", "message_start"); + data.set("message", message); + return sseEvent("message_start", data.toString()); + } + + /** {@code content_block_start} event opening a text block at {@code index}. */ + static String textBlockStartEvent(int index) { + ObjectNode data = OBJECT_MAPPER.createObjectNode(); + data.put("type", "content_block_start"); + data.put("index", index); + ObjectNode block = data.putObject("content_block"); + block.put("type", "text"); + block.put("text", ""); + return sseEvent("content_block_start", data.toString()); + } + + /** {@code content_block_delta} event appending {@code text} to the block at {@code index}. */ + static String textDeltaEvent(int index, String text) { + ObjectNode data = OBJECT_MAPPER.createObjectNode(); + data.put("type", "content_block_delta"); + data.put("index", index); + ObjectNode delta = data.putObject("delta"); + delta.put("type", "text_delta"); + delta.put("text", text); + return sseEvent("content_block_delta", data.toString()); + } + + /** {@code content_block_start} event opening a {@code tool_use} block at {@code index}. 
*/ + static String toolUseBlockStartEvent(int index, String id, String name) { + ObjectNode data = OBJECT_MAPPER.createObjectNode(); + data.put("type", "content_block_start"); + data.put("index", index); + ObjectNode block = data.putObject("content_block"); + block.put("type", "tool_use"); + block.put("id", id); + block.put("name", name); + block.putObject("input"); + return sseEvent("content_block_start", data.toString()); + } + + /** {@code content_block_delta} event carrying the tool-call arguments as an {@code input_json_delta}. */ + static String inputJsonDeltaEvent(int index, String partialJson) { + ObjectNode data = OBJECT_MAPPER.createObjectNode(); + data.put("type", "content_block_delta"); + data.put("index", index); + ObjectNode delta = data.putObject("delta"); + delta.put("type", "input_json_delta"); + delta.put("partial_json", partialJson); + return sseEvent("content_block_delta", data.toString()); + } + + /** {@code content_block_stop} event closing the block at {@code index}. */ + static String blockStopEvent(int index) { + ObjectNode data = OBJECT_MAPPER.createObjectNode(); + data.put("type", "content_block_stop"); + data.put("index", index); + return sseEvent("content_block_stop", data.toString()); + } + + /** {@code message_delta} event carrying the final stop reason. */ + static String messageDeltaEvent(String stopReason) { + ObjectNode data = OBJECT_MAPPER.createObjectNode(); + data.put("type", "message_delta"); + ObjectNode delta = data.putObject("delta"); + delta.put("stop_reason", stopReason); + delta.putNull("stop_sequence"); + data.putObject("usage").put("output_tokens", 0); + return sseEvent("message_delta", data.toString()); + } + + /** {@code message_stop} event ending the stream. 
*/ + static String messageStopEvent() { + return sseEvent("message_stop", "{\"type\":\"message_stop\"}"); + } +} diff --git a/src/main/java/net/ladenthin/llama/server/AnthropicStreamTranslator.java b/src/main/java/net/ladenthin/llama/server/AnthropicStreamTranslator.java new file mode 100644 index 00000000..f5cfc2ff --- /dev/null +++ b/src/main/java/net/ladenthin/llama/server/AnthropicStreamTranslator.java @@ -0,0 +1,109 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import lombok.ToString; + +/** + * Stateful translator that turns the OpenAI streaming chat chunks into the Anthropic Messages SSE event + * sequence: {@code message_start} → (a {@code text} content block with {@code content_block_start} + + * {@code content_block_delta}s + {@code content_block_stop}) → one {@code tool_use} block per tool call + * (start + {@code input_json_delta} + stop) → {@code message_delta} (stop reason) → {@code message_stop}. + * + *

Text deltas are emitted live; tool calls are reconstructed via {@link ToolCallDeltaAccumulator} and + * emitted as whole blocks at the end (Anthropic expects each tool_use block's input as one + * {@code input_json_delta}). Free of JNI / model dependencies; unit-testable by feeding chunk JSON. + */ +@ToString +final class AnthropicStreamTranslator { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private final String id; + private final String model; + private final ToolCallDeltaAccumulator accumulator = new ToolCallDeltaAccumulator(); + + private boolean textBlockOpen; + private int textBlockIndex = -1; + private int nextIndex; + private String finishReason = "stop"; + + AnthropicStreamTranslator(String id, String model) { + this.id = id; + this.model = model; + } + + /** + * The opening {@code message_start} event. + * + * @return the framed SSE event + */ + String begin() { + return AnthropicApiSupport.messageStartEvent(id, model); + } + + /** + * Translate one OpenAI chunk into the SSE events it produces (text block start/delta), accumulating + * tool-call fragments and capturing the finish reason. Returns an empty string when the chunk yields + * no event (role-only / finish-only / tool-call-only chunks). 
+ * + * @param openAiChunkJson one OpenAI {@code chat.completion.chunk} + * @return zero or more framed SSE events, concatenated + */ + String onChunk(String openAiChunkJson) { + StringBuilder out = new StringBuilder(); + try { + JsonNode chunk = OBJECT_MAPPER.readTree(openAiChunkJson); + accumulator.accept(chunk); + JsonNode choice = chunk.path("choices").path(0); + JsonNode content = choice.path("delta").path("content"); + if (content.isTextual() && !content.asText().isEmpty()) { + if (!textBlockOpen) { + textBlockIndex = nextIndex++; + out.append(AnthropicApiSupport.textBlockStartEvent(textBlockIndex)); + textBlockOpen = true; + } + out.append(AnthropicApiSupport.textDeltaEvent(textBlockIndex, content.asText())); + } + if (choice.path("finish_reason").isTextual()) { + finishReason = choice.path("finish_reason").asText(); + } + } catch (IOException e) { + // A malformed chunk produces no events. + } + return out.toString(); + } + + /** + * The closing events: stop the open text block, emit a {@code tool_use} block per accumulated tool + * call, then {@code message_delta} (mapped stop reason) and {@code message_stop}. 
+ * + * @return the framed SSE events, concatenated + */ + String end() { + StringBuilder out = new StringBuilder(); + if (textBlockOpen) { + out.append(AnthropicApiSupport.blockStopEvent(textBlockIndex)); + textBlockOpen = false; + } + if (accumulator.hasToolCalls()) { + for (JsonNode toolCall : accumulator.toOpenAiToolCalls()) { + int index = nextIndex++; + String callId = toolCall.path("id").asText(""); + String name = toolCall.path("function").path("name").asText(""); + String arguments = toolCall.path("function").path("arguments").asText(""); + out.append(AnthropicApiSupport.toolUseBlockStartEvent(index, callId, name)); + out.append(AnthropicApiSupport.inputJsonDeltaEvent(index, arguments)); + out.append(AnthropicApiSupport.blockStopEvent(index)); + } + } + out.append(AnthropicApiSupport.messageDeltaEvent(AnthropicApiSupport.anthropicStopReason(finishReason))); + out.append(AnthropicApiSupport.messageStopEvent()); + return out.toString(); + } +} diff --git a/src/main/java/net/ladenthin/llama/server/ChatBackend.java b/src/main/java/net/ladenthin/llama/server/ChatBackend.java deleted file mode 100644 index cd16a233..00000000 --- a/src/main/java/net/ladenthin/llama/server/ChatBackend.java +++ /dev/null @@ -1,40 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import com.fasterxml.jackson.databind.JsonNode; -import java.io.IOException; - -/** - * The chat engine seam behind {@link OpenAiCompatServer}. - * - *

Decoupling the HTTP layer from {@link net.ladenthin.llama.LlamaModel} lets the whole server — - * routing, authentication, Server-Sent-Events framing, heartbeats — be exercised by tests with a fake - * backend, with no native library and no model loaded. The production implementation is - * {@link LlamaModelChatBackend}. - * - *

Both methods receive the parsed OpenAI request object (already validated as JSON by the handler). - */ -interface ChatBackend { - - /** - * Run a non-streaming chat completion. - * - * @param request the parsed OpenAI {@code /v1/chat/completions} request - * @return the complete OpenAI {@code chat.completion} response serialized as JSON - * @throws IOException if generation fails in a way the caller should surface as a server error - */ - String complete(JsonNode request) throws IOException; - - /** - * Run a streaming chat completion, delivering each {@code chat.completion.chunk} to {@code sink} - * in order. Implementations must not emit the terminating {@code [DONE]} marker; the caller adds it. - * - * @param request the parsed OpenAI {@code /v1/chat/completions} request - * @param sink receiver for each streamed chunk's JSON - * @throws IOException if a chunk cannot be delivered or generation fails - */ - void stream(JsonNode request, ChunkSink sink) throws IOException; -} diff --git a/src/main/java/net/ladenthin/llama/server/LlamaModelChatBackend.java b/src/main/java/net/ladenthin/llama/server/LlamaModelBackend.java similarity index 52% rename from src/main/java/net/ladenthin/llama/server/LlamaModelChatBackend.java rename to src/main/java/net/ladenthin/llama/server/LlamaModelBackend.java index 3d418a8b..de289d41 100644 --- a/src/main/java/net/ladenthin/llama/server/LlamaModelChatBackend.java +++ b/src/main/java/net/ladenthin/llama/server/LlamaModelBackend.java @@ -10,19 +10,21 @@ import net.ladenthin.llama.parameters.InferenceParameters; /** - * Production {@link ChatBackend} that runs requests against a loaded {@link LlamaModel}. + * Production {@link OpenAiBackend} that runs requests against a loaded {@link LlamaModel}. * - *

Non-streaming requests reuse {@link LlamaModel#chatComplete(InferenceParameters)}, whose return - * value is already a verbatim OpenAI {@code chat.completion} body. Streaming requests use - * {@link LlamaModel#streamChatCompletion(InferenceParameters, java.util.function.Consumer)}, which - * emits OpenAI {@code chat.completion.chunk} objects (including {@code delta.tool_calls}). + *

Non-streaming chat reuses {@link LlamaModel#chatComplete(InferenceParameters)}, whose return value + * is already a verbatim OpenAI {@code chat.completion} body. Streaming chat uses + * {@link LlamaModel#streamChatCompletion(InferenceParameters, java.util.function.Consumer)}, which emits + * OpenAI {@code chat.completion.chunk} objects (including {@code delta.tool_calls}). Text completions and + * embeddings forward the request body verbatim to {@link LlamaModel#handleCompletionsOai(String)} / + * {@link LlamaModel#handleEmbeddings(String, boolean)}, which already return OpenAI-shaped JSON. * *

The streaming sink may fail with {@link IOException} (client disconnect); because the underlying * model API takes a {@link java.util.function.Consumer} (no checked exceptions), that failure is * relayed across the boundary via {@link java.io.UncheckedIOException} and unwrapped here so the * in-flight native task is cancelled. */ -final class LlamaModelChatBackend implements ChatBackend { +final class LlamaModelBackend implements OpenAiBackend { private final LlamaModel model; private final OpenAiRequestMapper mapper; @@ -33,7 +35,7 @@ final class LlamaModelChatBackend implements ChatBackend { * @param model the loaded model to run completions against * @param mapper the OpenAI-request to {@link InferenceParameters} mapper */ - LlamaModelChatBackend(LlamaModel model, OpenAiRequestMapper mapper) { + LlamaModelBackend(LlamaModel model, OpenAiRequestMapper mapper) { this.model = model; this.mapper = mapper; } @@ -66,4 +68,33 @@ public void stream(JsonNode request, ChunkSink sink) throws IOException { throw e; } } + + @Override + public String completions(JsonNode request) { + // The native /v1/completions handler parses the OpenAI body itself; forward it verbatim. + return model.handleCompletionsOai(request.toString()); + } + + @Override + public String embeddings(JsonNode request) { + // oaiCompat=true so the response uses the OpenAI {"object":"list","data":[{embedding}]} shape. + return model.handleEmbeddings(request.toString(), true); + } + + @Override + public String infill(JsonNode request) { + // The native /infill handler parses the body itself (input_prefix/input_suffix/...) and applies + // the model's FIM tokens from GGUF metadata; forward verbatim. 
+ return model.handleInfill(request.toString()); + } + + @Override + public String rerank(JsonNode request) { + final String query = OaiRerankSupport.readQuery(request); + final String[] documents = OaiRerankSupport.readDocuments(request); + final int topN = OaiRerankSupport.readTopN(request); + final String requestModel = request.path("model").asText(""); + final String nativeJson = model.handleRerank(query, documents); + return OaiRerankSupport.toOaiResponse(nativeJson, requestModel, topN); + } } diff --git a/src/main/java/net/ladenthin/llama/server/LlamaModelOaiBackend.java b/src/main/java/net/ladenthin/llama/server/LlamaModelOaiBackend.java deleted file mode 100644 index 7c1a85a5..00000000 --- a/src/main/java/net/ladenthin/llama/server/LlamaModelOaiBackend.java +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.node.ArrayNode; -import com.fasterxml.jackson.databind.node.ObjectNode; -import lombok.ToString; -import net.ladenthin.llama.LlamaModel; - -/** - * {@link OaiBackend} backed by a loaded {@link LlamaModel}. Each operation forwards the raw request - * JSON to the matching {@code LlamaModel.handle*} method, which already produces - * OpenAI-compatible response JSON, so no per-field marshalling happens here. - * - *

The model is owned by the caller ({@link LlamaServer}); this class neither closes it nor holds - * any other resource.

- */ -@ToString -public final class LlamaModelOaiBackend implements OaiBackend { - - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - private final LlamaModel model; - private final String modelId; - - /** - * Create a backend over a loaded model. - * - * @param model the loaded model to serve requests with - * @param modelId the identifier reported by {@link #listModels()} and echoed in responses - */ - public LlamaModelOaiBackend(LlamaModel model, String modelId) { - this.model = model; - this.modelId = modelId; - } - - @Override - public String chatCompletions(String requestJson) { - return model.handleChatCompletions(requestJson); - } - - @Override - public String completions(String requestJson) { - return model.handleCompletionsOai(requestJson); - } - - @Override - public String embeddings(String requestJson) { - return model.handleEmbeddings(requestJson, true); - } - - @Override - public String listModels() { - final ObjectNode root = OBJECT_MAPPER.createObjectNode(); - root.put("object", "list"); - final ArrayNode data = root.putArray("data"); - final ObjectNode entry = data.addObject(); - entry.put("id", modelId); - entry.put("object", "model"); - entry.put("owned_by", "llamacpp"); - // ObjectNode.toString() emits valid JSON without a checked exception. 
- return root.toString(); - } -} diff --git a/src/main/java/net/ladenthin/llama/server/LlamaServer.java b/src/main/java/net/ladenthin/llama/server/LlamaServer.java deleted file mode 100644 index 2f630513..00000000 --- a/src/main/java/net/ladenthin/llama/server/LlamaServer.java +++ /dev/null @@ -1,93 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import fi.iki.elonen.NanoHTTPD; -import java.io.IOException; -import net.ladenthin.llama.LlamaModel; -import net.ladenthin.llama.parameters.ModelParameters; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Entry point for the optional OpenAI-compatible HTTP server, and the {@code Main-Class} of the - * {@code -jar-with-dependencies} assembly. - * - *

It parses the command line ({@link LlamaServerArgs}), loads a GGUF model into a - * {@link LlamaModel}, and serves OpenAI-compatible endpoints over NanoHTTPD via {@link OaiRouter} / - * {@link OaiHttpServer}. A shutdown hook stops the server and closes the model on JVM exit - * (e.g. Ctrl-C / SIGTERM). Run {@code --help} for the full option list.

- * - *

Example:

- * - *
{@code
- * java -jar llama--jar-with-dependencies.jar \
- *     --model models/Qwen3-0.6B-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers 99
- * }
- * - *

Responses are non-streaming: the full JSON result is returned per request.

- */ -public final class LlamaServer { - - private static final Logger LOG = LoggerFactory.getLogger(LlamaServer.class); - - private LlamaServer() {} - - /** - * Start the server (blocks the JVM alive on a non-daemon listener thread), or print help. - * - * @param args command-line arguments; see {@link LlamaServerArgs#usage()} - * @throws IOException if the HTTP server cannot bind the configured host/port - */ - public static void main(String[] args) throws IOException { - if (LlamaServerArgs.isHelpRequested(args)) { - LOG.info("{}{}", System.lineSeparator(), LlamaServerArgs.usage()); - return; - } - - final LlamaServerConfig config = LlamaServerArgs.parse(args); - final LlamaModel model = loadModel(config); - final OaiBackend backend = new LlamaModelOaiBackend(model, config.getModelAlias()); - final OaiHttpServer server = new OaiHttpServer(config.getHost(), config.getPort(), new OaiRouter(backend)); - - Runtime.getRuntime().addShutdownHook(new Thread(() -> shutdown(server, model), "llama-server-shutdown")); - - try { - // daemon=false: the non-daemon listener thread keeps the JVM alive after main() returns. - server.start(NanoHTTPD.SOCKET_READ_TIMEOUT, false); - } catch (IOException e) { - // Close the just-loaded native model before propagating the bind failure. 
- model.close(); - throw e; - } - - LOG.info( - "LlamaServer listening on http://{}:{} (model={})", - config.getHost(), - config.getPort(), - config.getModelAlias()); - } - - private static LlamaModel loadModel(LlamaServerConfig config) { - final ModelParameters params = - new ModelParameters().setModel(config.getModelPath()).setGpuLayers(config.getGpuLayers()); - if (config.getCtxSize() > 0) { - params.setCtxSize(config.getCtxSize()); - } - if (config.getThreads() > 0) { - params.setThreads(config.getThreads()); - } - if (config.isEmbedding()) { - params.enableEmbedding(); - } - LOG.info("Loading model {} ...", config.getModelPath()); - return new LlamaModel(params); - } - - private static void shutdown(OaiHttpServer server, LlamaModel model) { - server.stop(); - model.close(); - } -} diff --git a/src/main/java/net/ladenthin/llama/server/LlamaServerArgs.java b/src/main/java/net/ladenthin/llama/server/LlamaServerArgs.java deleted file mode 100644 index 1bfcef71..00000000 --- a/src/main/java/net/ladenthin/llama/server/LlamaServerArgs.java +++ /dev/null @@ -1,175 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import java.nio.file.Path; -import java.nio.file.Paths; -import org.jspecify.annotations.Nullable; - -/** - * Command-line argument parser for {@link LlamaServer}. Pure and free of any native dependency, so - * it can be unit-tested in isolation (no socket, no model). - * - *

{@link #parse(String[])} returns a validated {@link LlamaServerConfig} or throws - * {@link IllegalArgumentException} (whose message includes the {@link #usage()} text) for unknown - * flags, missing values or a missing required {@code --model}. {@code -h}/{@code --help} is detected - * separately via {@link #isHelpRequested(String[])} so callers can print help without it being - * treated as an error.

- */ -public final class LlamaServerArgs { - - /** Default bind interface (loopback only; pass {@code --host 0.0.0.0} to expose on the LAN). */ - public static final String DEFAULT_HOST = "127.0.0.1"; - - /** Default TCP port. */ - public static final int DEFAULT_PORT = 8080; - - private LlamaServerArgs() {} - - /** - * Whether the arguments request the help text. - * - * @param args the raw command-line arguments - * @return {@code true} if {@code -h} or {@code --help} is present - */ - public static boolean isHelpRequested(String... args) { - for (final String arg : args) { - if ("-h".equals(arg) || "--help".equals(arg)) { - return true; - } - } - return false; - } - - /** - * Parse the command-line arguments into a {@link LlamaServerConfig}. - * - * @param args the raw command-line arguments - * @return the validated configuration - * @throws IllegalArgumentException if an argument is unknown, a value is missing or malformed, - * or the required {@code --model} is absent - */ - public static LlamaServerConfig parse(String... 
args) { - String host = DEFAULT_HOST; - int port = DEFAULT_PORT; - @Nullable String modelPath = null; - @Nullable String modelAlias = null; - int ctxSize = 0; - int gpuLayers = 0; - int threads = 0; - boolean embedding = false; - - for (int i = 0; i < args.length; i++) { - final String arg = args[i]; - switch (arg) { - case "-m": - case "--model": - modelPath = nextValue(args, ++i, arg); - break; - case "--host": - host = nextValue(args, ++i, arg); - break; - case "-p": - case "--port": - port = intValue(args, ++i, arg); - break; - case "-c": - case "--ctx-size": - ctxSize = intValue(args, ++i, arg); - break; - case "-ngl": - case "--n-gpu-layers": - gpuLayers = intValue(args, ++i, arg); - break; - case "-t": - case "--threads": - threads = intValue(args, ++i, arg); - break; - case "--model-alias": - modelAlias = nextValue(args, ++i, arg); - break; - case "--embedding": - case "--embeddings": - embedding = true; - break; - case "-h": - case "--help": - // Detected by isHelpRequested(); accepted here so parse() still succeeds. - break; - default: - throw error("Unknown argument: " + arg); - } - } - - if (modelPath == null) { - throw error("Missing required argument: -m/--model "); - } - final String alias = modelAlias != null ? modelAlias : deriveAlias(modelPath); - return new LlamaServerConfig(host, port, modelPath, alias, ctxSize, gpuLayers, threads, embedding); - } - - /** - * The human-readable usage / help text. 
- * - * @return the usage text - */ - public static String usage() { - return String.join( - System.lineSeparator(), - "LlamaServer - OpenAI-compatible HTTP server for java-llama.cpp", - "", - "Usage:", - " java -jar llama--jar-with-dependencies.jar --model [options]", - "", - "Required:", - " -m, --model Path to the GGUF model file", - "", - "Options:", - " --host Interface to bind (default: " + DEFAULT_HOST + ")", - " -p, --port TCP port to listen on (default: " + DEFAULT_PORT + ")", - " -c, --ctx-size Context window size (default: llama.cpp default)", - " -ngl,--n-gpu-layers Layers to offload to GPU (default: 0 = CPU only)", - " -t, --threads Inference thread count (default: llama.cpp default)", - " --model-alias Model id reported by /v1/models (default: file name)", - " --embedding Load in embedding mode (enables POST /v1/embeddings)", - " -h, --help Show this help and exit", - "", - "Endpoints:", - " POST /v1/chat/completions", - " POST /v1/completions", - " POST /v1/embeddings (requires --embedding)", - " GET /v1/models", - " GET /health"); - } - - private static String nextValue(String[] args, int valueIndex, String flag) { - if (valueIndex >= args.length) { - throw error("Missing value for " + flag); - } - return args[valueIndex]; - } - - private static int intValue(String[] args, int valueIndex, String flag) { - final String raw = nextValue(args, valueIndex, flag); - try { - return Integer.parseInt(raw.trim()); - } catch (NumberFormatException e) { - throw error(flag + " expects an integer, got: " + raw, e); - } - } - - private static String deriveAlias(String modelPath) { - final Path name = Paths.get(modelPath).getFileName(); - return name != null ? 
name.toString() : modelPath; - } - - private static IllegalArgumentException error(String message) { - return error(message, null); - } - - private static IllegalArgumentException error(String message, @Nullable Throwable cause) { - return new IllegalArgumentException(message + System.lineSeparator() + System.lineSeparator() + usage(), cause); - } -} diff --git a/src/main/java/net/ladenthin/llama/server/LlamaServerConfig.java b/src/main/java/net/ladenthin/llama/server/LlamaServerConfig.java deleted file mode 100644 index f5f37a2e..00000000 --- a/src/main/java/net/ladenthin/llama/server/LlamaServerConfig.java +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.ToString; - -/** - * Immutable, parsed configuration for {@link LlamaServer}, produced by - * {@link LlamaServerArgs#parse(String[])}. - * - *

{@code ctxSize} and {@code threads} use {@code 0} as a sentinel meaning "leave the llama.cpp - * default" — they are only applied to {@link net.ladenthin.llama.parameters.ModelParameters} when - * positive. {@code gpuLayers} is always applied (its own default of {@code 0} already means - * CPU-only).

- * - *

Value equality / {@code toString} are generated by Lombok over all fields.

- */ -@Getter -@ToString -@EqualsAndHashCode -public final class LlamaServerConfig { - - private final String host; - private final int port; - private final String modelPath; - private final String modelAlias; - private final int ctxSize; - private final int gpuLayers; - private final int threads; - private final boolean embedding; - - /** - * Create a server configuration. - * - * @param host the interface to bind (e.g. {@code "127.0.0.1"} or {@code "0.0.0.0"}) - * @param port the TCP port to listen on - * @param modelPath the path to the GGUF model file to load - * @param modelAlias the identifier reported by {@code /v1/models} - * @param ctxSize context window size, or {@code 0} to use the llama.cpp default - * @param gpuLayers number of layers to offload to the GPU ({@code 0} = CPU-only) - * @param threads inference thread count, or {@code 0} to use the llama.cpp default - * @param embedding whether to load the model in embedding mode (enables {@code /v1/embeddings}) - */ - public LlamaServerConfig( - String host, - int port, - String modelPath, - String modelAlias, - int ctxSize, - int gpuLayers, - int threads, - boolean embedding) { - this.host = host; - this.port = port; - this.modelPath = modelPath; - this.modelAlias = modelAlias; - this.ctxSize = ctxSize; - this.gpuLayers = gpuLayers; - this.threads = threads; - this.embedding = embedding; - } -} diff --git a/src/main/java/net/ladenthin/llama/server/OaiBackend.java b/src/main/java/net/ladenthin/llama/server/OaiBackend.java deleted file mode 100644 index f8b57ca2..00000000 --- a/src/main/java/net/ladenthin/llama/server/OaiBackend.java +++ /dev/null @@ -1,50 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -/** - * The inference operations the {@link OaiRouter} forwards HTTP requests to, abstracted behind an - * interface so the router can be unit-tested without loading a native model. 
The production - * implementation is {@link LlamaModelOaiBackend}, which delegates to - * {@link net.ladenthin.llama.LlamaModel}. - * - *

Each request method takes the raw OpenAI-compatible request body and returns the raw - * OpenAI-compatible response JSON. Implementations may throw a {@link RuntimeException} (e.g. - * {@link net.ladenthin.llama.exception.LlamaException}) on inference failure; the router converts - * that into an HTTP {@code 500} error response.

- */ -public interface OaiBackend { - - /** - * Run a chat completion ({@code POST /v1/chat/completions}). - * - * @param requestJson the OAI chat-completion request body (must contain {@code "messages"}) - * @return the OAI chat-completion response JSON - */ - String chatCompletions(String requestJson); - - /** - * Run a text completion ({@code POST /v1/completions}). - * - * @param requestJson the OAI completion request body (must contain {@code "prompt"}) - * @return the OAI completion response JSON - */ - String completions(String requestJson); - - /** - * Generate embeddings ({@code POST /v1/embeddings}). - * - * @param requestJson the OAI embeddings request body (must contain {@code "input"}) - * @return the OAI embeddings response JSON - */ - String embeddings(String requestJson); - - /** - * List the available model(s) ({@code GET /v1/models}). - * - * @return the OAI model-list response JSON - */ - String listModels(); -} diff --git a/src/main/java/net/ladenthin/llama/server/OaiHttpServer.java b/src/main/java/net/ladenthin/llama/server/OaiHttpServer.java deleted file mode 100644 index f8567a41..00000000 --- a/src/main/java/net/ladenthin/llama/server/OaiHttpServer.java +++ /dev/null @@ -1,83 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import fi.iki.elonen.NanoHTTPD; -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import lombok.ToString; -import org.jspecify.annotations.Nullable; - -/** - * Thin NanoHTTPD adapter: reads the method, path and (for body-bearing methods) the raw request - * body from each session, hands them to an {@link OaiRouter}, and converts the resulting - * {@link OaiResponse} into a fixed-length {@code application/json} NanoHTTPD response. - * - *

All request-shaping decisions live in {@link OaiRouter}; this class only bridges NanoHTTPD's - * session API to that router so the routing logic stays unit-testable without a socket.

- */ -@ToString -public final class OaiHttpServer extends NanoHTTPD { - - private static final String MIME_JSON = "application/json"; - - private static final String MALFORMED_BODY_JSON = - "{\"error\":{\"message\":\"Malformed request body\",\"type\":\"invalid_request_error\"}}"; - - private final OaiRouter router; - - /** - * Create (but do not start) the server. - * - * @param host the interface to bind, e.g. {@code "127.0.0.1"} or {@code "0.0.0.0"} - * @param port the TCP port to listen on - * @param router the router that turns requests into responses - */ - public OaiHttpServer(String host, int port, OaiRouter router) { - super(host, port); - this.router = router; - } - - @Override - public Response serve(IHTTPSession session) { - final String method = session.getMethod().name(); - final String uri = session.getUri(); - - @Nullable String body = null; - if (bodyBearing(method)) { - final Map files = new HashMap<>(); - try { - session.parseBody(files); - } catch (IOException | ResponseException e) { - return newFixedLengthResponse(Response.Status.BAD_REQUEST, MIME_JSON, MALFORMED_BODY_JSON); - } - // For non-multipart bodies NanoHTTPD stores the raw payload under "postData". 
- body = files.get("postData"); - } - - final OaiResponse routed = router.route(method, uri, body); - return newFixedLengthResponse(statusFor(routed.getStatus()), MIME_JSON, routed.getBody()); - } - - private static boolean bodyBearing(String method) { - return "POST".equals(method) || "PUT".equals(method) || "PATCH".equals(method); - } - - private static Response.IStatus statusFor(int code) { - switch (code) { - case 200: - return Response.Status.OK; - case 400: - return Response.Status.BAD_REQUEST; - case 404: - return Response.Status.NOT_FOUND; - case 405: - return Response.Status.METHOD_NOT_ALLOWED; - default: - return Response.Status.INTERNAL_ERROR; - } - } -} diff --git a/src/main/java/net/ladenthin/llama/server/OaiRerankSupport.java b/src/main/java/net/ladenthin/llama/server/OaiRerankSupport.java new file mode 100644 index 00000000..b578b985 --- /dev/null +++ b/src/main/java/net/ladenthin/llama/server/OaiRerankSupport.java @@ -0,0 +1,126 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Pure request-parsing and response-shaping helpers for the {@code POST /v1/rerank} route. + * + *

Reads the Jina/Cohere-style rerank request ({@code query} + {@code documents} [+ {@code top_n}]) and + * reshapes the native llama.cpp rerank array ({@code [{document,index,score}]}) into the OpenAI-style + * rerank response. The response carries both a {@code results} array (the standard llama.cpp/Jina shape) + * and a {@code data} alias of the same entries, because Continue expects {@code data} and errors on + * {@code results} (continuedev/continue #6478). + * + *

Stateless and free of JNI / model dependencies, so each helper is unit-testable with JSON literals. + */ +final class OaiRerankSupport { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private OaiRerankSupport() {} + + /** + * Read the required {@code query} string. + * + * @param request the parsed rerank request + * @return the query text + * @throws IllegalArgumentException if {@code query} is missing or not a string + */ + static String readQuery(JsonNode request) { + JsonNode query = request.path("query"); + if (!query.isTextual()) { + throw new IllegalArgumentException("'query' must be a string"); + } + return query.asText(); + } + + /** + * Read the required {@code documents} array. Each entry may be a plain string or an object carrying a + * {@code "text"} string (the Cohere/Jina document shape). + * + * @param request the parsed rerank request + * @return the documents, in request order + * @throws IllegalArgumentException if {@code documents} is absent, empty, or holds an unsupported entry + */ + static String[] readDocuments(JsonNode request) { + JsonNode documents = request.path("documents"); + if (!documents.isArray() || documents.size() == 0) { + throw new IllegalArgumentException("'documents' must be a non-empty array"); + } + List out = new ArrayList<>(documents.size()); + for (JsonNode document : documents) { + if (document.isTextual()) { + out.add(document.asText()); + } else if (document.isObject() && document.path("text").isTextual()) { + out.add(document.path("text").asText()); + } else { + throw new IllegalArgumentException("each document must be a string or an object with a 'text' string"); + } + } + return out.toArray(new String[0]); + } + + /** + * Read the optional {@code top_n} cap. 
+ * + * @param request the parsed rerank request + * @return the requested cap, or {@code -1} when absent or not an integer + */ + static int readTopN(JsonNode request) { + JsonNode topN = request.path("top_n"); + return topN.isInt() ? topN.asInt() : -1; + } + + /** + * Reshape the native rerank array into the OpenAI-style rerank response. + * + * @param nativeArrayJson the native {@code [{document,index,score}]} array as JSON + * @param model the model id to echo (omitted when empty) + * @param topN keep only the top-N highest-scoring entries, or {@code <= 0} to keep all + * @return the rerank response JSON with sorted {@code results} and a {@code data} alias + */ + static String toOaiResponse(String nativeArrayJson, String model, int topN) { + final List entries = new ArrayList<>(); + try { + JsonNode arr = OBJECT_MAPPER.readTree(nativeArrayJson); + if (arr.isArray()) { + for (JsonNode entry : arr) { + ObjectNode result = OBJECT_MAPPER.createObjectNode(); + result.put("index", entry.path("index").asInt()); + result.put("relevance_score", entry.path("score").asDouble()); + entries.add(result); + } + } + } catch (IOException e) { + // The native call already succeeded; an unexpected body just yields empty results. + entries.clear(); + } + entries.sort((a, b) -> Double.compare( + b.path("relevance_score").asDouble(), a.path("relevance_score").asDouble())); + + final ArrayNode results = OBJECT_MAPPER.createArrayNode(); + final int limit = topN > 0 ? 
Math.min(topN, entries.size()) : entries.size(); + for (int i = 0; i < limit; i++) { + results.add(entries.get(i)); + } + + final ObjectNode root = OBJECT_MAPPER.createObjectNode(); + root.put("object", "list"); + if (!model.isEmpty()) { + root.put("model", model); + } + root.set("results", results); + root.set("data", results.deepCopy()); // alias for Continue (#6478), independent copy + return root.toString(); + } +} diff --git a/src/main/java/net/ladenthin/llama/server/OaiResponse.java b/src/main/java/net/ladenthin/llama/server/OaiResponse.java deleted file mode 100644 index f772525d..00000000 --- a/src/main/java/net/ladenthin/llama/server/OaiResponse.java +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import lombok.EqualsAndHashCode; -import lombok.ToString; - -/** - * Immutable result of routing one HTTP request: an HTTP status code and a JSON body. - * - *

Produced by {@link OaiRouter#route(String, String, String)} and adapted to a NanoHTTPD - * response by {@link OaiHttpServer}. Keeping it independent of NanoHTTPD lets the routing logic be - * unit-tested without binding a socket. The body is always JSON (the server always replies with - * {@code application/json}).

- * - *

Value equality / {@code toString} are generated by Lombok over the status and body.

- */ -@ToString -@EqualsAndHashCode -public final class OaiResponse { - - private final int status; - private final String body; - - /** - * Create a routed response. - * - * @param status the HTTP status code (e.g. {@code 200}, {@code 400}, {@code 404}, {@code 500}) - * @param body the JSON response body - */ - public OaiResponse(int status, String body) { - this.status = status; - this.body = body; - } - - /** - * The HTTP status code. - * - * @return the status code - */ - public int getStatus() { - return status; - } - - /** - * The JSON response body. - * - * @return the body - */ - public String getBody() { - return body; - } -} diff --git a/src/main/java/net/ladenthin/llama/server/OaiRouter.java b/src/main/java/net/ladenthin/llama/server/OaiRouter.java deleted file mode 100644 index 1524c408..00000000 --- a/src/main/java/net/ladenthin/llama/server/OaiRouter.java +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.node.ObjectNode; -import java.util.function.Function; -import lombok.ToString; -import org.jspecify.annotations.Nullable; - -/** - * Maps an HTTP method + path + body to an {@link OaiResponse} by dispatching to an - * {@link OaiBackend}. This is the testable core of the server: it is independent of NanoHTTPD and - * of {@link net.ladenthin.llama.LlamaModel}, so it can be exercised with a fake backend and plain - * strings (no socket, no native library, no GGUF model). - * - *

Supported routes:

- *
    - *
  • {@code POST /v1/chat/completions} → {@link OaiBackend#chatCompletions(String)}
  • - *
  • {@code POST /v1/completions} → {@link OaiBackend#completions(String)}
  • - *
  • {@code POST /v1/embeddings} → {@link OaiBackend#embeddings(String)}
  • - *
  • {@code GET /v1/models} → {@link OaiBackend#listModels()}
  • - *
  • {@code GET /health} and {@code GET /} → a static {@code {"status":"ok"}}
  • - *
- * - *

Unknown paths yield {@code 404}; a known path with the wrong method yields {@code 405}; an - * empty body on a {@code POST} route yields {@code 400}; any {@link RuntimeException} thrown by the - * backend (e.g. inference failure) is converted to {@code 500}. Error bodies use the OpenAI error - * envelope {@code {"error":{"message":...,"type":...}}}.

- */ -@ToString -public final class OaiRouter { - - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - private static final String METHOD_GET = "GET"; - private static final String METHOD_POST = "POST"; - - private static final String HEALTH_BODY = "{\"status\":\"ok\"}"; - - private final OaiBackend backend; - - /** - * Create a router over a backend. - * - * @param backend the inference backend requests are dispatched to - */ - public OaiRouter(OaiBackend backend) { - this.backend = backend; - } - - /** - * Route a single request. - * - * @param method the HTTP method (e.g. {@code "GET"}, {@code "POST"}) - * @param rawPath the request path, optionally including a {@code ?query} suffix - * @param body the request body, or {@code null} when there is none (e.g. for {@code GET}) - * @return the status code and JSON body to return to the client - */ - public OaiResponse route(String method, String rawPath, @Nullable String body) { - final String path = stripQuery(rawPath); - try { - switch (path) { - case "/v1/chat/completions": - return post(method, body, backend::chatCompletions); - case "/v1/completions": - return post(method, body, backend::completions); - case "/v1/embeddings": - return post(method, body, backend::embeddings); - case "/v1/models": - return get(method, backend::listModels); - case "/health": - case "/": - return get(method, () -> HEALTH_BODY); - default: - return error(404, "not_found", "Unknown endpoint: " + path); - } - } catch (RuntimeException e) { - return error(500, "internal_error", describe(e)); - } - } - - private OaiResponse post(String method, @Nullable String body, Function handler) { - if (!METHOD_POST.equals(method)) { - return methodNotAllowed(method); - } - if (body == null || body.trim().isEmpty()) { - return error(400, "invalid_request_error", "Request body is required"); - } - return new OaiResponse(200, handler.apply(body)); - } - - private OaiResponse get(String method, java.util.function.Supplier 
handler) { - if (!METHOD_GET.equals(method)) { - return methodNotAllowed(method); - } - return new OaiResponse(200, handler.get()); - } - - private OaiResponse methodNotAllowed(String method) { - return error(405, "method_not_allowed", "Method not allowed: " + method); - } - - private static String stripQuery(String rawPath) { - final int q = rawPath.indexOf('?'); - return q >= 0 ? rawPath.substring(0, q) : rawPath; - } - - private static String describe(RuntimeException e) { - final String message = e.getMessage(); - return message != null ? message : e.getClass().getSimpleName(); - } - - private static OaiResponse error(int status, String type, String message) { - final ObjectNode root = OBJECT_MAPPER.createObjectNode(); - final ObjectNode err = root.putObject("error"); - err.put("message", message); - err.put("type", type); - String json; - try { - json = OBJECT_MAPPER.writeValueAsString(root); - } catch (JsonProcessingException e) { - json = "{\"error\":{\"message\":\"serialization failed\",\"type\":\"internal_error\"}}"; - } - return new OaiResponse(status, json); - } -} diff --git a/src/main/java/net/ladenthin/llama/server/OllamaApiSupport.java b/src/main/java/net/ladenthin/llama/server/OllamaApiSupport.java new file mode 100644 index 00000000..4ec6748d --- /dev/null +++ b/src/main/java/net/ladenthin/llama/server/OllamaApiSupport.java @@ -0,0 +1,435 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.io.IOException; +import java.time.Instant; +import org.jspecify.annotations.Nullable; + +/** + * Pure translators between Ollama's native HTTP API and the OpenAI chat shape used internally, so the + * server can present an Ollama-compatible surface (which 
Copilot's built-in Ollama provider and + * Ollama-hardcoded tools target) without a second inference path. + * + *

Covers the discovery endpoints ({@code /api/version}, {@code /api/tags}, {@code /api/show}) and the + * {@code /api/chat} request/response translation, including the NDJSON streaming shape (one JSON object + * per line, terminated by a {@code "done":true} line). Tool-call {@code arguments} are objects in Ollama + * but JSON-encoded strings in OpenAI, so they are converted on the way in and out. + * + *

Stateless and free of JNI / model dependencies; unit-testable with JSON literals. + */ +final class OllamaApiSupport { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + /** Advertised Ollama API version (clients only check that {@code /api/version} responds with one). */ + static final String OLLAMA_VERSION = "0.1.0"; + + private OllamaApiSupport() {} + + private static String nowIso() { + return Instant.now().toString(); + } + + /** + * The {@code GET /api/version} body. + * + * @return {@code {"version":""}} + */ + static String versionJson() { + return OBJECT_MAPPER.createObjectNode().put("version", OLLAMA_VERSION).toString(); + } + + /** + * The {@code GET /api/tags} body advertising the single served model. + * + * @param modelId the model id + * @return an Ollama model-list object serialized as JSON + */ + static String tagsJson(String modelId) { + ObjectNode root = OBJECT_MAPPER.createObjectNode(); + ArrayNode models = root.putArray("models"); + ObjectNode model = models.addObject(); + model.put("name", modelId); + model.put("model", modelId); + model.put("modified_at", nowIso()); + model.put("size", 0L); + model.put("digest", ""); + ObjectNode details = model.putObject("details"); + details.put("family", "llama"); + details.put("parameter_size", ""); + details.put("quantization_level", ""); + return root.toString(); + } + + /** + * The {@code POST /api/show} body advertising the model's capabilities and context length — the + * fields Copilot's Ollama provider reads to enable tools/vision and size prompts. 
+ * + * @param modelId the model id + * @param contextLength the advertised context window + * @param vision whether image input is supported + * @return an Ollama show object serialized as JSON + */ + static String showJson(String modelId, int contextLength, boolean vision) { + ObjectNode root = OBJECT_MAPPER.createObjectNode(); + root.put("license", ""); + root.put("modelfile", ""); + root.put("parameters", ""); + root.put("template", ""); + ObjectNode details = root.putObject("details"); + details.put("family", "llama"); + details.putArray("families").add("llama"); + details.put("parameter_size", ""); + details.put("quantization_level", ""); + ObjectNode modelInfo = root.putObject("model_info"); + modelInfo.put("general.architecture", "llama"); + modelInfo.put("llama.context_length", contextLength); + ArrayNode capabilities = root.putArray("capabilities"); + capabilities.add("completion"); + capabilities.add("tools"); + capabilities.add("insert"); // fill-in-the-middle via /infill + if (vision) { + capabilities.add("vision"); + } + root.put("model", modelId); + return root.toString(); + } + + /** + * Whether the Ollama request asks for a streamed response. Ollama defaults {@code stream} to + * {@code true} when the field is absent. + * + * @param ollamaRequest the parsed Ollama request + * @return {@code true} unless {@code "stream"} is explicitly {@code false} + */ + static boolean isStreaming(JsonNode ollamaRequest) { + JsonNode stream = ollamaRequest.path("stream"); + return !stream.isBoolean() || stream.asBoolean(); + } + + /** + * Translate an Ollama {@code /api/chat} request into the internal OpenAI chat request shape. 
+ * + * @param ollamaRequest the parsed Ollama request + * @return an OpenAI {@code /v1/chat/completions} request object + */ + static ObjectNode toOpenAiChatRequest(JsonNode ollamaRequest) { + ObjectNode openAi = OBJECT_MAPPER.createObjectNode(); + if (ollamaRequest.path("model").isTextual()) { + openAi.put("model", ollamaRequest.path("model").asText()); + } + + // Messages: copy through, converting any assistant tool_calls.arguments object to the OpenAI + // JSON-encoded string form. + ArrayNode messages = openAi.putArray("messages"); + for (JsonNode message : ollamaRequest.path("messages")) { + messages.add(toOpenAiMessage(message)); + } + + JsonNode tools = ollamaRequest.path("tools"); + if (tools.isArray() && tools.size() > 0) { + openAi.set("tools", tools.deepCopy()); + } + + // Ollama nests sampling under "options"; map the common knobs onto OpenAI top-level fields. + JsonNode options = ollamaRequest.path("options"); + copyNumber(options, "temperature", openAi, "temperature"); + copyNumber(options, "top_p", openAi, "top_p"); + copyNumber(options, "top_k", openAi, "top_k"); + copyNumber(options, "seed", openAi, "seed"); + copyNumber(options, "num_predict", openAi, "max_tokens"); + if (options.path("stop").isArray()) { + openAi.set("stop", options.path("stop").deepCopy()); + } + + // Ollama "format": "json" or a JSON schema → OpenAI response_format. 
+ JsonNode format = ollamaRequest.path("format"); + if (format.isTextual() && "json".equals(format.asText())) { + openAi.putObject("response_format").put("type", "json_object"); + } else if (format.isObject()) { + ObjectNode responseFormat = openAi.putObject("response_format"); + responseFormat.put("type", "json_schema"); + responseFormat.putObject("json_schema").set("schema", format.deepCopy()); + } + return openAi; + } + + private static ObjectNode toOpenAiMessage(JsonNode ollamaMessage) { + ObjectNode message = ollamaMessage.deepCopy(); + JsonNode toolCalls = message.path("tool_calls"); + if (toolCalls.isArray()) { + for (JsonNode toolCall : toolCalls) { + JsonNode arguments = toolCall.path("function").path("arguments"); + if (arguments.isObject() || arguments.isArray()) { + ((ObjectNode) toolCall.path("function")).put("arguments", arguments.toString()); + } + } + } + return message; + } + + private static void copyNumber(JsonNode from, String fromKey, ObjectNode to, String toKey) { + JsonNode value = from.path(fromKey); + if (value.isNumber()) { + to.set(toKey, value); + } + } + + /** + * Translate a non-streaming OpenAI {@code chat.completion} into an Ollama {@code /api/chat} response. 
+ * + * @param openAiCompletionJson the OpenAI completion body + * @param model the model id to echo + * @return the Ollama chat response serialized as JSON + */ + static String toOllamaChatResponse(String openAiCompletionJson, String model) { + ObjectNode root = OBJECT_MAPPER.createObjectNode(); + root.put("model", model); + root.put("created_at", nowIso()); + ObjectNode message = root.putObject("message"); + message.put("role", "assistant"); + message.put("content", ""); + String doneReason = "stop"; + try { + JsonNode completion = OBJECT_MAPPER.readTree(openAiCompletionJson); + JsonNode choice = completion.path("choices").path(0); + JsonNode openAiMessage = choice.path("message"); + message.put("content", openAiMessage.path("content").asText("")); + ArrayNode ollamaToolCalls = toOllamaToolCalls(openAiMessage.path("tool_calls")); + if (ollamaToolCalls.size() > 0) { + message.set("tool_calls", ollamaToolCalls); + } + if (choice.path("finish_reason").isTextual()) { + doneReason = choice.path("finish_reason").asText(); + } + JsonNode usage = completion.path("usage"); + if (usage.isObject()) { + root.put("prompt_eval_count", usage.path("prompt_tokens").asInt(0)); + root.put("eval_count", usage.path("completion_tokens").asInt(0)); + } + } catch (IOException e) { + // Defensive: an unexpected body still yields a valid, empty Ollama "done" response. + doneReason = "stop"; + } + root.put("done", true); + root.put("done_reason", doneReason); + return root.toString(); + } + + /** + * Translate one streamed OpenAI chunk into an Ollama NDJSON content line, or return {@code null} when + * the chunk carries no assistant text to emit (role-only, finish-only or usage-only chunks). 
+ * + * @param openAiChunkJson one OpenAI {@code chat.completion.chunk} + * @param model the model id to echo + * @return the Ollama NDJSON line (with trailing newline), or {@code null} if nothing to emit + */ + static @Nullable String toOllamaContentLine(String openAiChunkJson, String model) { + try { + JsonNode chunk = OBJECT_MAPPER.readTree(openAiChunkJson); + JsonNode content = chunk.path("choices").path(0).path("delta").path("content"); + if (!content.isTextual() || content.asText().isEmpty()) { + return null; + } + ObjectNode root = OBJECT_MAPPER.createObjectNode(); + root.put("model", model); + root.put("created_at", nowIso()); + ObjectNode message = root.putObject("message"); + message.put("role", "assistant"); + message.put("content", content.asText()); + root.put("done", false); + return root.toString() + "\n"; + } catch (IOException e) { + return null; + } + } + + /** + * Build the terminating Ollama NDJSON line ({@code "done":true}), attaching any tool calls + * reconstructed from the stream. + * + * @param model the model id to echo + * @param accumulator the tool-call accumulator fed with the stream's chunks + * @return the final Ollama NDJSON line (with trailing newline) + */ + static String toOllamaDoneLine(String model, ToolCallDeltaAccumulator accumulator) { + ObjectNode root = OBJECT_MAPPER.createObjectNode(); + root.put("model", model); + root.put("created_at", nowIso()); + ObjectNode message = root.putObject("message"); + message.put("role", "assistant"); + message.put("content", ""); + if (accumulator.hasToolCalls()) { + ArrayNode ollamaToolCalls = toOllamaToolCalls(accumulator.toOpenAiToolCalls()); + if (ollamaToolCalls.size() > 0) { + message.set("tool_calls", ollamaToolCalls); + } + } + root.put("done", true); + root.put("done_reason", "stop"); + return root.toString() + "\n"; + } + + /** Convert OpenAI tool calls (arguments = JSON string) to Ollama tool calls (arguments = object). 
*/ + private static ArrayNode toOllamaToolCalls(JsonNode openAiToolCalls) { + ArrayNode out = OBJECT_MAPPER.createArrayNode(); + if (!openAiToolCalls.isArray()) { + return out; + } + for (JsonNode openAiToolCall : openAiToolCalls) { + JsonNode function = openAiToolCall.path("function"); + ObjectNode ollamaToolCall = out.addObject(); + ObjectNode ollamaFunction = ollamaToolCall.putObject("function"); + ollamaFunction.put("name", function.path("name").asText("")); + ollamaFunction.set("arguments", parseArgumentsToObject(function.path("arguments"))); + } + return out; + } + + private static JsonNode parseArgumentsToObject(JsonNode arguments) { + if (arguments.isObject() || arguments.isArray()) { + return arguments; + } + if (arguments.isTextual()) { + try { + return OBJECT_MAPPER.readTree(arguments.asText()); + } catch (IOException e) { + // Fall through to an empty object on unparseable arguments. + return OBJECT_MAPPER.createObjectNode(); + } + } + return OBJECT_MAPPER.createObjectNode(); + } + + // ----- /api/generate (prompt completion / FIM) ----- + + /** + * Whether the {@code /api/generate} request carries a {@code suffix} (a fill-in-the-middle request). + * + * @param request the parsed Ollama generate request + * @return {@code true} if a textual {@code suffix} is present + */ + static boolean hasSuffix(JsonNode request) { + return request.path("suffix").isTextual(); + } + + /** + * Translate an Ollama {@code /api/generate} request into the internal OpenAI {@code /v1/completions} + * request shape ({@code prompt} + sampling). Used when there is no {@code suffix}. 
+ * + * @param request the parsed Ollama generate request + * @return an OpenAI completion request object + */ + static ObjectNode toOpenAiCompletionRequest(JsonNode request) { + ObjectNode openAi = OBJECT_MAPPER.createObjectNode(); + openAi.put("prompt", request.path("prompt").asText("")); + JsonNode options = request.path("options"); + copyNumber(options, "temperature", openAi, "temperature"); + copyNumber(options, "top_p", openAi, "top_p"); + copyNumber(options, "top_k", openAi, "top_k"); + copyNumber(options, "seed", openAi, "seed"); + copyNumber(options, "num_predict", openAi, "max_tokens"); + if (options.path("stop").isArray()) { + openAi.set("stop", options.path("stop").deepCopy()); + } + return openAi; + } + + /** + * Translate an Ollama {@code /api/generate} request with a {@code suffix} into the native infill + * request shape ({@code input_prefix} / {@code input_suffix}). + * + * @param request the parsed Ollama generate request + * @return a native {@code /infill} request object + */ + static ObjectNode toInfillRequest(JsonNode request) { + ObjectNode infill = OBJECT_MAPPER.createObjectNode(); + infill.put("input_prefix", request.path("prompt").asText("")); + infill.put("input_suffix", request.path("suffix").asText("")); + JsonNode options = request.path("options"); + copyNumber(options, "temperature", infill, "temperature"); + copyNumber(options, "num_predict", infill, "n_predict"); + return infill; + } + + /** + * Extract the generated text from an OpenAI completion body ({@code choices[0].text}). 
+ * + * @param openAiCompletionJson the OpenAI {@code /v1/completions} response + * @return the completion text, or empty on an unexpected body + */ + static String extractCompletionText(String openAiCompletionJson) { + try { + return OBJECT_MAPPER + .readTree(openAiCompletionJson) + .path("choices") + .path(0) + .path("text") + .asText(""); + } catch (IOException e) { + return ""; + } + } + + /** + * Extract the generated text from a native infill body ({@code content}). + * + * @param infillJson the native {@code /infill} response + * @return the infill content, or empty on an unexpected body + */ + static String extractInfillContent(String infillJson) { + try { + return OBJECT_MAPPER.readTree(infillJson).path("content").asText(""); + } catch (IOException e) { + return ""; + } + } + + /** + * Build a non-streaming Ollama {@code /api/generate} response wrapping {@code text}. + * + * @param text the generated text + * @param model the model id to echo + * @return the Ollama generate response serialized as JSON + */ + static String toOllamaGenerateResponse(String text, String model) { + ObjectNode root = OBJECT_MAPPER.createObjectNode(); + root.put("model", model); + root.put("created_at", nowIso()); + root.put("response", text); + root.put("done", true); + root.put("done_reason", "stop"); + return root.toString(); + } + + /** + * Build the streamed Ollama {@code /api/generate} NDJSON: a single response line carrying {@code text} + * followed by a terminating {@code "done":true} line. (Generation completes before emission, so this + * is one content chunk rather than token-by-token streaming.) 
+ * + * @param text the generated text + * @param model the model id to echo + * @return the two NDJSON lines, concatenated (each with a trailing newline) + */ + static String toOllamaGenerateStream(String text, String model) { + ObjectNode line = OBJECT_MAPPER.createObjectNode(); + line.put("model", model); + line.put("created_at", nowIso()); + line.put("response", text); + line.put("done", false); + ObjectNode done = OBJECT_MAPPER.createObjectNode(); + done.put("model", model); + done.put("created_at", nowIso()); + done.put("response", ""); + done.put("done", true); + done.put("done_reason", "stop"); + return line + "\n" + done + "\n"; + } +} diff --git a/src/main/java/net/ladenthin/llama/server/OpenAiBackend.java b/src/main/java/net/ladenthin/llama/server/OpenAiBackend.java new file mode 100644 index 00000000..5080ff2a --- /dev/null +++ b/src/main/java/net/ladenthin/llama/server/OpenAiBackend.java @@ -0,0 +1,90 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import com.fasterxml.jackson.databind.JsonNode; +import java.io.IOException; + +/** + * The inference engine seam behind {@link OpenAiCompatServer}. + * + *

Decoupling the HTTP layer from {@link net.ladenthin.llama.LlamaModel} lets the whole server — + * routing, authentication, Server-Sent-Events framing, heartbeats — be exercised by tests with a fake + * backend, with no native library and no model loaded. The production implementation is + * {@link LlamaModelBackend}. + * + *

Every method receives the parsed OpenAI request object (already validated as a JSON object by the + * handler) and returns the OpenAI-shaped response JSON, except {@link #stream} which delivers chunks + * incrementally. The {@code GET /v1/models} response is built from configuration alone and so is not + * part of this seam. + */ +interface OpenAiBackend { + + /** + * Run a non-streaming chat completion ({@code POST /v1/chat/completions}). + * + * @param request the parsed OpenAI {@code /v1/chat/completions} request + * @return the complete OpenAI {@code chat.completion} response serialized as JSON + * @throws IOException if generation fails in a way the caller should surface as a server error + */ + String complete(JsonNode request) throws IOException; + + /** + * Run a streaming chat completion, delivering each {@code chat.completion.chunk} to {@code sink} + * in order. Implementations must not emit the terminating {@code [DONE]} marker; the caller adds it. + * + * @param request the parsed OpenAI {@code /v1/chat/completions} request + * @param sink receiver for each streamed chunk's JSON + * @throws IOException if a chunk cannot be delivered or generation fails + */ + void stream(JsonNode request, ChunkSink sink) throws IOException; + + /** + * Run a (non-streaming) text completion ({@code POST /v1/completions}). The request body is + * forwarded verbatim to the native OpenAI-compatible completion handler. + * + * @param request the parsed OpenAI {@code /v1/completions} request (must contain {@code "prompt"}) + * @return the OpenAI {@code text_completion} response serialized as JSON + * @throws IOException if generation fails in a way the caller should surface as a server error + */ + String completions(JsonNode request) throws IOException; + + /** + * Generate embeddings ({@code POST /v1/embeddings}). Requires the model to have been loaded in + * embedding mode; otherwise the native call fails and the caller surfaces a server error. 
+ * + * @param request the parsed OpenAI {@code /v1/embeddings} request (must contain {@code "input"}) + * @return the OpenAI embeddings response serialized as JSON + * @throws IOException if generation fails in a way the caller should surface as a server error + */ + String embeddings(JsonNode request) throws IOException; + + /** + * Rerank documents against a query ({@code POST /v1/rerank}). Requires the model to have been loaded + * in reranking mode; otherwise the native call fails and the caller surfaces a server error. + * + * @param request the parsed rerank request ({@code query} string + {@code documents} array, optional + * {@code top_n}) + * @return the rerank response serialized as JSON ({@code results}/{@code data} of + * {@code {index, relevance_score}}) + * @throws IOException if reranking fails in a way the caller should surface as a server error + */ + String rerank(JsonNode request) throws IOException; + + /** + * Run a (non-streaming) fill-in-the-middle completion ({@code POST /infill}). The request body is + * forwarded verbatim to the native llama.cpp infill handler, which applies the model's FIM control + * tokens server-side from GGUF metadata — so callers send raw {@code input_prefix} / + * {@code input_suffix} (and optional {@code input_extra} / {@code prompt}). This is the endpoint + * that drives local ghost-text autocomplete clients (llama.vscode, llama.vim, Twinny, Tabby, + * Continue's {@code llama.cpp} provider). 
+ * + * @param request the parsed llama.cpp {@code /infill} request (typically {@code input_prefix} + + * {@code input_suffix}) + * @return the infill response serialized as JSON (clients read the {@code "content"} field) + * @throws IOException if generation fails in a way the caller should surface as a server error + */ + String infill(JsonNode request) throws IOException; +} diff --git a/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java b/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java index ac5d6178..98af69db 100644 --- a/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java +++ b/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java @@ -7,15 +7,17 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.sun.net.httpserver.Filter; import com.sun.net.httpserver.HttpExchange; +import com.sun.net.httpserver.HttpHandler; import com.sun.net.httpserver.HttpServer; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.InetSocketAddress; import java.nio.charset.StandardCharsets; -import java.util.HashMap; -import java.util.Map; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; @@ -24,21 +26,25 @@ import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import net.ladenthin.llama.LlamaModel; -import net.ladenthin.llama.parameters.ModelParameters; import org.jspecify.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** - * A minimal OpenAI-compatible HTTP endpoint over a loaded {@link LlamaModel}, built only on the JDK's - * {@code com.sun.net.httpserver.HttpServer} (no new runtime dependency). 
+ * An OpenAI-compatible HTTP endpoint over a loaded {@link LlamaModel}, built only on the JDK's + * {@code com.sun.net.httpserver.HttpServer} (no new runtime dependency). It is both embeddable and the + * {@code Main-Class} of the {@code -jar-with-dependencies} assembly. * *

Routes: *

    *
  • {@code POST /v1/chat/completions} — streaming (Server-Sent Events) and non-streaming chat * completions, forwarded faithfully (messages/tools verbatim; streamed {@code delta.tool_calls} * preserved).
  • + *
  • {@code POST /v1/completions} — non-streaming text completion.
  • + *
  • {@code POST /v1/embeddings} — embeddings (requires the model to be loaded in embedding + * mode).
  • *
  • {@code GET /v1/models} — advertises the single configured model.
  • + *
  • {@code GET /health} — liveness probe returning {@code {"status":"ok"}} (no authentication).
  • *
* *

During streaming, the server emits SSE comment heartbeats on a timer so a long prompt prefill on @@ -63,9 +69,53 @@ public final class OpenAiCompatServer implements AutoCloseable { /** The chat-completions route. */ public static final String PATH_CHAT_COMPLETIONS = "/v1/chat/completions"; + /** The text-completions route. */ + public static final String PATH_COMPLETIONS = "/v1/completions"; + + /** The embeddings route. */ + public static final String PATH_EMBEDDINGS = "/v1/embeddings"; + + /** The rerank route (requires the model loaded in reranking mode). */ + public static final String PATH_RERANK = "/v1/rerank"; + + /** The Anthropic Messages API route. */ + public static final String PATH_MESSAGES = "/v1/messages"; + + /** The OpenAI Responses API route. */ + public static final String PATH_RESPONSES = "/v1/responses"; + + /** + * The fill-in-the-middle (autocomplete) route. Deliberately the llama.cpp-native bare path (no + * {@code /v1}) so ghost-text clients such as llama.vscode and Tabby reach it unchanged. + */ + public static final String PATH_INFILL = "/infill"; + /** The model-list route. */ public static final String PATH_MODELS = "/v1/models"; + /** The liveness-probe route. */ + public static final String PATH_HEALTH = "/health"; + + /** The llama.cpp-native server-properties route (context length + modalities). */ + public static final String PATH_PROPS = "/props"; + + /** Ollama-native discovery route (version). */ + public static final String PATH_OLLAMA_VERSION = "/api/version"; + + /** Ollama-native discovery route (model list). */ + public static final String PATH_OLLAMA_TAGS = "/api/tags"; + + /** Ollama-native discovery route (model capabilities). */ + public static final String PATH_OLLAMA_SHOW = "/api/show"; + + /** Ollama-native chat route. */ + public static final String PATH_OLLAMA_CHAT = "/api/chat"; + + /** Ollama-native generate route (prompt completion / fill-in-the-middle). 
*/ + public static final String PATH_OLLAMA_GENERATE = "/api/generate"; + + private static final String CONTENT_TYPE_NDJSON = "application/x-ndjson"; + private static final int HTTP_OK = 200; private static final int HTTP_BAD_REQUEST = 400; private static final int HTTP_UNAUTHORIZED = 401; @@ -78,10 +128,12 @@ public final class OpenAiCompatServer implements AutoCloseable { private static final String BEARER_PREFIX = "Bearer "; private static final String ERROR_TYPE_REQUEST = "invalid_request_error"; private static final String ERROR_TYPE_SERVER = "server_error"; + private static final String HEALTH_BODY = "{\"status\":\"ok\"}"; private final OpenAiServerConfig config; - private final ChatBackend backend; + private final OpenAiBackend backend; private final HttpServer http; + private final Filter corsFilter; private final ExecutorService requestExecutor; private final ScheduledExecutorService heartbeatExecutor; @@ -93,26 +145,52 @@ public final class OpenAiCompatServer implements AutoCloseable { * @throws IOException if the listening socket cannot be bound */ public OpenAiCompatServer(LlamaModel model, OpenAiServerConfig config) throws IOException { - this(new LlamaModelChatBackend(model, new OpenAiRequestMapper()), config); + this(new LlamaModelBackend(model, new OpenAiRequestMapper()), config); } /** - * Create a server backed by an arbitrary {@link ChatBackend}. Used by tests to drive the full HTTP + * Create a server backed by an arbitrary {@link OpenAiBackend}. Used by tests to drive the full HTTP * surface without a native library or model. 
* - * @param backend the chat engine seam + * @param backend the inference engine seam * @param config the server configuration * @throws IOException if the listening socket cannot be bound */ - OpenAiCompatServer(ChatBackend backend, OpenAiServerConfig config) throws IOException { + OpenAiCompatServer(OpenAiBackend backend, OpenAiServerConfig config) throws IOException { this.config = config; this.backend = backend; this.requestExecutor = Executors.newCachedThreadPool(namedFactory("jllama-openai-http")); this.heartbeatExecutor = Executors.newScheduledThreadPool(1, namedFactory("jllama-openai-hb")); this.http = HttpServer.create(new InetSocketAddress(config.getHost(), config.getPort()), 0); - http.createContext("/", this::handleNotFound); - http.createContext(PATH_MODELS, this::handleModels); - http.createContext(PATH_CHAT_COMPLETIONS, this::handleChatCompletions); + this.corsFilter = buildCorsFilter(config.getCorsAllowOrigin()); + register("/", this::handleNotFound); + register(PATH_HEALTH, this::handleHealth); + register(PATH_PROPS, this::handleProps); + // Each route is registered under its canonical path and a bare alias (clients disagree on + // whether to include the /v1 prefix), so both forms resolve to the same handler. 
+ register(PATH_MODELS, this::handleModels); + register("/models", this::handleModels); + register(PATH_CHAT_COMPLETIONS, this::handleChatCompletions); + register("/chat/completions", this::handleChatCompletions); + register(PATH_COMPLETIONS, this::handleCompletions); + register("/completions", this::handleCompletions); + register(PATH_EMBEDDINGS, this::handleEmbeddings); + register("/embeddings", this::handleEmbeddings); + register(PATH_RERANK, this::handleRerank); + register("/rerank", this::handleRerank); + register("/reranking", this::handleRerank); + register(PATH_INFILL, this::handleInfill); + register("/v1/infill", this::handleInfill); + register(PATH_MESSAGES, this::handleAnthropicMessages); + register("/messages", this::handleAnthropicMessages); + register(PATH_RESPONSES, this::handleResponses); + register("/responses", this::handleResponses); + // Ollama-native surface (Copilot's built-in Ollama provider + Ollama-hardcoded tools). + register(PATH_OLLAMA_VERSION, this::handleOllamaVersion); + register(PATH_OLLAMA_TAGS, this::handleOllamaTags); + register(PATH_OLLAMA_SHOW, this::handleOllamaShow); + register(PATH_OLLAMA_CHAT, this::handleOllamaChat); + register(PATH_OLLAMA_GENERATE, this::handleOllamaGenerate); http.setExecutor(requestExecutor); } @@ -144,21 +222,48 @@ public void close() { heartbeatExecutor.shutdownNow(); } + /** + * Register {@code handler} for {@code path} with the CORS filter attached. Centralised so the + * cross-cutting CORS/preflight wiring applies uniformly to every route (including the catch-all). + */ + private void register(String path, HttpHandler handler) { + http.createContext(path, handler).getFilters().add(corsFilter); + } + + /** + * Build a CORS filter that stamps {@code Access-Control-Allow-Origin} on every response and answers + * {@code OPTIONS} preflights with {@code 204} + the allowed methods/headers — so browser- and + * webview-based clients (which preflight an {@code Authorization} header) are not blocked. 
+ */ + private static Filter buildCorsFilter(String allowOrigin) { + return new Filter() { + @Override + public void doFilter(HttpExchange exchange, Chain chain) throws IOException { + exchange.getResponseHeaders().set("Access-Control-Allow-Origin", allowOrigin); + if ("OPTIONS".equalsIgnoreCase(exchange.getRequestMethod())) { + exchange.getResponseHeaders().set("Access-Control-Allow-Methods", "GET, POST, OPTIONS"); + exchange.getResponseHeaders().set("Access-Control-Allow-Headers", "Content-Type, Authorization"); + exchange.getResponseHeaders().set("Access-Control-Max-Age", "86400"); + exchange.sendResponseHeaders(204, -1); + exchange.close(); + return; + } + chain.doFilter(exchange); + } + + @Override + public String description() { + return "CORS preflight + Access-Control-Allow-Origin"; + } + }; + } + // ----- handlers ----- private void handleChatCompletions(HttpExchange exchange) throws IOException { try { - if (!"POST".equalsIgnoreCase(exchange.getRequestMethod())) { - sendError(exchange, HTTP_METHOD_NOT_ALLOWED, ERROR_TYPE_REQUEST, "Only POST is supported"); - return; - } - if (!authorized(exchange)) { - sendError(exchange, HTTP_UNAUTHORIZED, ERROR_TYPE_REQUEST, "Missing or invalid API key"); - return; - } - JsonNode request = readBody(exchange); - if (request == null || !request.isObject()) { - sendError(exchange, HTTP_BAD_REQUEST, ERROR_TYPE_REQUEST, "Request body must be a JSON object"); + JsonNode request = requirePostJson(exchange); + if (request == null) { return; } JsonNode messages = request.path("messages"); @@ -169,22 +274,71 @@ private void handleChatCompletions(HttpExchange exchange) throws IOException { if (request.path("stream").asBoolean(false)) { streamChat(exchange, request); } else { - completeChat(exchange, request); + completeNonStreaming(exchange, request, backend::complete); + } + } finally { + exchange.close(); + } + } + + private void handleCompletions(HttpExchange exchange) throws IOException { + try { + JsonNode request = 
requirePostJson(exchange); + if (request != null) { + completeNonStreaming(exchange, request, backend::completions); + } + } finally { + exchange.close(); + } + } + + private void handleEmbeddings(HttpExchange exchange) throws IOException { + try { + JsonNode request = requirePostJson(exchange); + if (request != null) { + completeNonStreaming(exchange, request, backend::embeddings); + } + } finally { + exchange.close(); + } + } + + private void handleInfill(HttpExchange exchange) throws IOException { + try { + JsonNode request = requirePostJson(exchange); + if (request != null) { + completeNonStreaming(exchange, request, backend::infill); + } + } finally { + exchange.close(); + } + } + + private void handleRerank(HttpExchange exchange) throws IOException { + try { + JsonNode request = requirePostJson(exchange); + if (request != null) { + completeNonStreaming(exchange, request, backend::rerank); } } finally { exchange.close(); } } - private void completeChat(HttpExchange exchange, JsonNode request) throws IOException { + /** + * Run a non-streaming request through {@code producer} and write its JSON body, translating an + * {@link IllegalArgumentException} to {@code 400} and any other failure to {@code 500}. 
+ */ + private void completeNonStreaming(HttpExchange exchange, JsonNode request, BodyProducer producer) + throws IOException { final String body; try { - body = backend.complete(request); + body = producer.produce(request); } catch (IllegalArgumentException e) { sendError(exchange, HTTP_BAD_REQUEST, ERROR_TYPE_REQUEST, message(e)); return; } catch (IOException | RuntimeException e) { - LOG.warn("chat completion failed", e); + LOG.warn("request failed", e); sendError(exchange, HTTP_SERVER_ERROR, ERROR_TYPE_SERVER, message(e)); return; } @@ -195,32 +349,330 @@ private void streamChat(HttpExchange exchange, JsonNode request) throws IOExcept exchange.getResponseHeaders().set("Content-Type", CONTENT_TYPE_SSE); exchange.getResponseHeaders().set("Cache-Control", "no-cache"); exchange.sendResponseHeaders(HTTP_OK, 0); - final OutputStream os = exchange.getResponseBody(); - final Object writeLock = new Object(); - final ScheduledFuture heartbeat = heartbeatExecutor.scheduleAtFixedRate( - () -> writeQuietly(os, writeLock, OpenAiSseFormatter.heartbeat()), - config.getHeartbeatMillis(), - config.getHeartbeatMillis(), - TimeUnit.MILLISECONDS); + try (ResponseStream out = new ResponseStream(exchange.getResponseBody())) { + ScheduledFuture heartbeat = null; + try { + heartbeat = heartbeatExecutor.scheduleAtFixedRate( + () -> out.writeQuietly(OpenAiSseFormatter.heartbeat()), + config.getHeartbeatMillis(), + config.getHeartbeatMillis(), + TimeUnit.MILLISECONDS); + backend.stream( + request, + chunkJson -> out.writeStrict( + OpenAiSseFormatter.sseData(OpenAiSseFormatter.ensureUsageCachedTokens(chunkJson)))); + out.writeStrict(OpenAiSseFormatter.sseDone()); + } catch (IllegalArgumentException e) { + out.writeQuietly( + OpenAiSseFormatter.sseData(OpenAiSseFormatter.errorJson(message(e), ERROR_TYPE_REQUEST, null))); + } catch (IOException e) { + LOG.debug("client disconnected during stream", e); + } catch (RuntimeException e) { + LOG.warn("streaming chat completion failed", e); + 
out.writeQuietly( + OpenAiSseFormatter.sseData(OpenAiSseFormatter.errorJson(message(e), ERROR_TYPE_SERVER, null))); + } finally { + // try-with-resources closes the stream (under its lock) after the heartbeat is cancelled, + // so the close never races a still-in-flight heartbeat write. + if (heartbeat != null) { + heartbeat.cancel(false); + } + } + } + } + + // ----- Ollama-native surface ----- + + private void handleOllamaVersion(HttpExchange exchange) throws IOException { try { - backend.stream(request, chunkJson -> writeStrict(os, writeLock, OpenAiSseFormatter.sseData(chunkJson))); - writeStrict(os, writeLock, OpenAiSseFormatter.sseDone()); - } catch (IllegalArgumentException e) { - writeQuietly( - os, - writeLock, - OpenAiSseFormatter.sseData(OpenAiSseFormatter.errorJson(message(e), ERROR_TYPE_REQUEST, null))); - } catch (IOException e) { - LOG.debug("client disconnected during stream", e); - } catch (RuntimeException e) { - LOG.warn("streaming chat completion failed", e); - writeQuietly( - os, - writeLock, - OpenAiSseFormatter.sseData(OpenAiSseFormatter.errorJson(message(e), ERROR_TYPE_SERVER, null))); + if (!"GET".equalsIgnoreCase(exchange.getRequestMethod())) { + sendError(exchange, HTTP_METHOD_NOT_ALLOWED, ERROR_TYPE_REQUEST, "Only GET is supported"); + return; + } + sendJson(exchange, HTTP_OK, OllamaApiSupport.versionJson()); + } finally { + exchange.close(); + } + } + + private void handleOllamaTags(HttpExchange exchange) throws IOException { + try { + if (!"GET".equalsIgnoreCase(exchange.getRequestMethod())) { + sendError(exchange, HTTP_METHOD_NOT_ALLOWED, ERROR_TYPE_REQUEST, "Only GET is supported"); + return; + } + sendJson(exchange, HTTP_OK, OllamaApiSupport.tagsJson(config.getModelId())); } finally { - heartbeat.cancel(false); - closeQuietly(os, writeLock); + exchange.close(); + } + } + + private void handleOllamaShow(HttpExchange exchange) throws IOException { + try { + if (!"POST".equalsIgnoreCase(exchange.getRequestMethod())) { + 
sendError(exchange, HTTP_METHOD_NOT_ALLOWED, ERROR_TYPE_REQUEST, "Only POST is supported"); + return; + } + // The request body (optionally {"model":...}) is ignored: this server serves one model. + int contextLength = config.getMaxInputTokens() + config.getMaxOutputTokens(); + sendJson( + exchange, + HTTP_OK, + OllamaApiSupport.showJson(config.getModelId(), contextLength, config.isSupportsVision())); + } finally { + exchange.close(); + } + } + + private void handleOllamaChat(HttpExchange exchange) throws IOException { + try { + JsonNode request = requirePostJson(exchange); + if (request == null) { + return; + } + JsonNode openAiRequest = OllamaApiSupport.toOpenAiChatRequest(request); + String model = request.path("model").asText(config.getModelId()); + if (OllamaApiSupport.isStreaming(request)) { + streamOllamaChat(exchange, openAiRequest, model); + } else { + final String body; + try { + body = backend.complete(openAiRequest); + } catch (IllegalArgumentException e) { + sendJson(exchange, HTTP_BAD_REQUEST, ollamaError(message(e))); + return; + } catch (IOException | RuntimeException e) { + LOG.warn("ollama chat failed", e); + sendJson(exchange, HTTP_SERVER_ERROR, ollamaError(message(e))); + return; + } + sendJson(exchange, HTTP_OK, OllamaApiSupport.toOllamaChatResponse(body, model)); + } + } finally { + exchange.close(); + } + } + + /** Stream an Ollama {@code /api/chat} response as newline-delimited JSON, ending with a done line. 
*/ + private void streamOllamaChat(HttpExchange exchange, JsonNode openAiRequest, String model) throws IOException { + exchange.getResponseHeaders().set("Content-Type", CONTENT_TYPE_NDJSON); + exchange.getResponseHeaders().set("Cache-Control", "no-cache"); + exchange.sendResponseHeaders(HTTP_OK, 0); + final ToolCallDeltaAccumulator accumulator = new ToolCallDeltaAccumulator(); + try (ResponseStream out = new ResponseStream(exchange.getResponseBody())) { + try { + backend.stream(openAiRequest, chunkJson -> { + accumulator.accept(chunkJson); + String line = OllamaApiSupport.toOllamaContentLine(chunkJson, model); + if (line != null) { + out.writeStrict(line); + } + }); + out.writeStrict(OllamaApiSupport.toOllamaDoneLine(model, accumulator)); + } catch (IllegalArgumentException e) { + out.writeQuietly(ollamaError(message(e)) + "\n"); + } catch (IOException e) { + LOG.debug("ollama client disconnected during stream", e); + } catch (RuntimeException e) { + LOG.warn("ollama streaming chat failed", e); + out.writeQuietly(ollamaError(message(e)) + "\n"); + } + } + } + + private void handleOllamaGenerate(HttpExchange exchange) throws IOException { + try { + JsonNode request = requirePostJson(exchange); + if (request == null) { + return; + } + String model = request.path("model").asText(config.getModelId()); + // Generation runs to completion first (there is no streaming raw-completion path), then the + // text is wrapped — as a single NDJSON content line + done line when stream is requested. 
+ final String text; + try { + if (OllamaApiSupport.hasSuffix(request)) { + text = OllamaApiSupport.extractInfillContent( + backend.infill(OllamaApiSupport.toInfillRequest(request))); + } else { + text = OllamaApiSupport.extractCompletionText( + backend.completions(OllamaApiSupport.toOpenAiCompletionRequest(request))); + } + } catch (IllegalArgumentException e) { + sendJson(exchange, HTTP_BAD_REQUEST, ollamaError(message(e))); + return; + } catch (IOException | RuntimeException e) { + LOG.warn("ollama generate failed", e); + sendJson(exchange, HTTP_SERVER_ERROR, ollamaError(message(e))); + return; + } + if (OllamaApiSupport.isStreaming(request)) { + byte[] bytes = + OllamaApiSupport.toOllamaGenerateStream(text, model).getBytes(StandardCharsets.UTF_8); + exchange.getResponseHeaders().set("Content-Type", CONTENT_TYPE_NDJSON); + exchange.sendResponseHeaders(HTTP_OK, bytes.length); + try (OutputStream os = exchange.getResponseBody()) { + os.write(bytes); + } + } else { + sendJson(exchange, HTTP_OK, OllamaApiSupport.toOllamaGenerateResponse(text, model)); + } + } finally { + exchange.close(); + } + } + + private static String ollamaError(String message) { + return OBJECT_MAPPER.createObjectNode().put("error", message).toString(); + } + + // ----- Anthropic Messages API ----- + + private void handleAnthropicMessages(HttpExchange exchange) throws IOException { + try { + JsonNode request = requirePostJson(exchange); + if (request == null) { + return; + } + JsonNode openAiRequest = AnthropicApiSupport.toOpenAiChatRequest(request); + String model = request.path("model").asText(config.getModelId()); + if (AnthropicApiSupport.isStreaming(request)) { + streamAnthropic(exchange, openAiRequest, model); + } else { + final String body; + try { + body = backend.complete(openAiRequest); + } catch (IllegalArgumentException e) { + sendJson(exchange, HTTP_BAD_REQUEST, anthropicError(message(e))); + return; + } catch (IOException | RuntimeException e) { + LOG.warn("anthropic messages 
failed", e); + sendJson(exchange, HTTP_SERVER_ERROR, anthropicError(message(e))); + return; + } + sendJson(exchange, HTTP_OK, AnthropicApiSupport.toAnthropicResponse(body, model)); + } + } finally { + exchange.close(); + } + } + + /** Stream an Anthropic {@code /v1/messages} response as the Anthropic SSE event sequence. */ + private void streamAnthropic(HttpExchange exchange, JsonNode openAiRequest, String model) throws IOException { + exchange.getResponseHeaders().set("Content-Type", CONTENT_TYPE_SSE); + exchange.getResponseHeaders().set("Cache-Control", "no-cache"); + exchange.sendResponseHeaders(HTTP_OK, 0); + final AnthropicStreamTranslator translator = + new AnthropicStreamTranslator("msg_" + Long.toHexString(System.nanoTime()), model); + try (ResponseStream out = new ResponseStream(exchange.getResponseBody())) { + ScheduledFuture heartbeat = null; + try { + heartbeat = heartbeatExecutor.scheduleAtFixedRate( + () -> out.writeQuietly(OpenAiSseFormatter.heartbeat()), + config.getHeartbeatMillis(), + config.getHeartbeatMillis(), + TimeUnit.MILLISECONDS); + out.writeStrict(translator.begin()); + backend.stream(openAiRequest, chunkJson -> { + String events = translator.onChunk(chunkJson); + if (!events.isEmpty()) { + out.writeStrict(events); + } + }); + out.writeStrict(translator.end()); + } catch (IllegalArgumentException e) { + out.writeQuietly(AnthropicApiSupport.sseEvent("error", anthropicError(message(e)))); + } catch (IOException e) { + LOG.debug("anthropic client disconnected during stream", e); + } catch (RuntimeException e) { + LOG.warn("anthropic streaming failed", e); + out.writeQuietly(AnthropicApiSupport.sseEvent("error", anthropicError(message(e)))); + } finally { + if (heartbeat != null) { + heartbeat.cancel(false); + } + } + } + } + + private static String anthropicError(String message) { + ObjectNode root = OBJECT_MAPPER.createObjectNode(); + root.put("type", "error"); + ObjectNode error = root.putObject("error"); + error.put("type", 
"invalid_request_error"); + error.put("message", message); + return root.toString(); + } + + // ----- OpenAI Responses API ----- + + private void handleResponses(HttpExchange exchange) throws IOException { + try { + JsonNode request = requirePostJson(exchange); + if (request == null) { + return; + } + JsonNode openAiRequest = ResponsesApiSupport.toOpenAiChatRequest(request); + String model = request.path("model").asText(config.getModelId()); + String responseId = "resp_" + Long.toHexString(System.nanoTime()); + if (ResponsesApiSupport.isStreaming(request)) { + streamResponses(exchange, openAiRequest, model, responseId); + } else { + final String body; + try { + body = backend.complete(openAiRequest); + } catch (IllegalArgumentException e) { + sendError(exchange, HTTP_BAD_REQUEST, ERROR_TYPE_REQUEST, message(e)); + return; + } catch (IOException | RuntimeException e) { + LOG.warn("responses failed", e); + sendError(exchange, HTTP_SERVER_ERROR, ERROR_TYPE_SERVER, message(e)); + return; + } + sendJson(exchange, HTTP_OK, ResponsesApiSupport.toResponsesResponse(body, model, responseId)); + } + } finally { + exchange.close(); + } + } + + /** Stream a Responses {@code /v1/responses} reply as the Responses SSE event sequence. 
*/ + private void streamResponses(HttpExchange exchange, JsonNode openAiRequest, String model, String responseId) + throws IOException { + exchange.getResponseHeaders().set("Content-Type", CONTENT_TYPE_SSE); + exchange.getResponseHeaders().set("Cache-Control", "no-cache"); + exchange.sendResponseHeaders(HTTP_OK, 0); + final ResponsesStreamTranslator translator = new ResponsesStreamTranslator(model, responseId); + try (ResponseStream out = new ResponseStream(exchange.getResponseBody())) { + ScheduledFuture heartbeat = null; + try { + heartbeat = heartbeatExecutor.scheduleAtFixedRate( + () -> out.writeQuietly(OpenAiSseFormatter.heartbeat()), + config.getHeartbeatMillis(), + config.getHeartbeatMillis(), + TimeUnit.MILLISECONDS); + out.writeStrict(translator.begin()); + backend.stream(openAiRequest, chunkJson -> { + String events = translator.onChunk(chunkJson); + if (!events.isEmpty()) { + out.writeStrict(events); + } + }); + out.writeStrict(translator.end()); + } catch (IllegalArgumentException e) { + out.writeQuietly("event: error\ndata: " + + OpenAiSseFormatter.errorJson(message(e), ERROR_TYPE_REQUEST, null) + "\n\n"); + } catch (IOException e) { + LOG.debug("responses client disconnected during stream", e); + } catch (RuntimeException e) { + LOG.warn("responses streaming failed", e); + out.writeQuietly("event: error\ndata: " + + OpenAiSseFormatter.errorJson(message(e), ERROR_TYPE_SERVER, null) + "\n\n"); + } finally { + if (heartbeat != null) { + heartbeat.cancel(false); + } + } } } @@ -240,6 +692,35 @@ private void handleModels(HttpExchange exchange) throws IOException { } } + private void handleHealth(HttpExchange exchange) throws IOException { + try { + // Liveness probe: deliberately unauthenticated so orchestrators can poll it without a key. 
+ if (!"GET".equalsIgnoreCase(exchange.getRequestMethod())) { + sendError(exchange, HTTP_METHOD_NOT_ALLOWED, ERROR_TYPE_REQUEST, "Only GET is supported"); + return; + } + sendJson(exchange, HTTP_OK, HEALTH_BODY); + } finally { + exchange.close(); + } + } + + private void handleProps(HttpExchange exchange) throws IOException { + try { + if (!"GET".equalsIgnoreCase(exchange.getRequestMethod())) { + sendError(exchange, HTTP_METHOD_NOT_ALLOWED, ERROR_TYPE_REQUEST, "Only GET is supported"); + return; + } + int contextLength = config.getMaxInputTokens() + config.getMaxOutputTokens(); + sendJson( + exchange, + HTTP_OK, + OpenAiSseFormatter.propsJson(config.getModelId(), contextLength, config.isSupportsVision())); + } finally { + exchange.close(); + } + } + private void handleNotFound(HttpExchange exchange) throws IOException { try { sendError(exchange, HTTP_NOT_FOUND, ERROR_TYPE_REQUEST, "Not found: " + exchange.getRequestURI()); @@ -250,6 +731,27 @@ private void handleNotFound(HttpExchange exchange) throws IOException { // ----- helpers ----- + /** + * Shared preamble for the {@code POST} JSON routes: enforce the method, authentication and a JSON + * object body, sending the matching error and returning {@code null} when any precondition fails. 
+ */ + private @Nullable JsonNode requirePostJson(HttpExchange exchange) throws IOException { + if (!"POST".equalsIgnoreCase(exchange.getRequestMethod())) { + sendError(exchange, HTTP_METHOD_NOT_ALLOWED, ERROR_TYPE_REQUEST, "Only POST is supported"); + return null; + } + if (!authorized(exchange)) { + sendError(exchange, HTTP_UNAUTHORIZED, ERROR_TYPE_REQUEST, "Missing or invalid API key"); + return null; + } + JsonNode request = readBody(exchange); + if (request == null || !request.isObject()) { + sendError(exchange, HTTP_BAD_REQUEST, ERROR_TYPE_REQUEST, "Request body must be a JSON object"); + return null; + } + return request; + } + private boolean authorized(HttpExchange exchange) { if (!config.isAuthenticationEnabled()) { return true; @@ -287,32 +789,51 @@ private void sendError(HttpExchange exchange, int status, String type, String me sendJson(exchange, status, OpenAiSseFormatter.errorJson(message, type, null)); } - /** Write under the response lock, propagating failures so a streaming generation can be cancelled. */ - private void writeStrict(OutputStream os, Object writeLock, String text) throws IOException { - synchronized (writeLock) { - os.write(text.getBytes(StandardCharsets.UTF_8)); - os.flush(); + /** + * Per-request, thread-safe wrapper over a streaming HTTP response body. Every write and the close are + * serialized on a {@code private final} lock, so the generation thread and the heartbeat-timer task + * never write to (or close) the same stream concurrently. The lock is owned by this per-request + * instance rather than shared, so independent concurrent streams never serialize against each other. + * It is {@link AutoCloseable} so callers drive it with try-with-resources, which closes the stream + * (under the lock) on every exit path. 
+ */ + private static final class ResponseStream implements AutoCloseable { + + private final OutputStream os; + private final Object lock = new Object(); + + ResponseStream(OutputStream os) { + this.os = os; } - } - /** Write under the response lock, swallowing failures (used for heartbeats and best-effort events). */ - private void writeQuietly(OutputStream os, Object writeLock, String text) { - synchronized (writeLock) { - try { + /** Write under the lock, propagating failures so a streaming generation can be cancelled. */ + void writeStrict(String text) throws IOException { + synchronized (lock) { os.write(text.getBytes(StandardCharsets.UTF_8)); os.flush(); - } catch (IOException e) { - LOG.trace("stream write failed (client likely disconnected)", e); } } - } - private void closeQuietly(OutputStream os, Object writeLock) { - synchronized (writeLock) { - try { - os.close(); - } catch (IOException e) { - LOG.trace("stream close failed", e); + /** Write under the lock, swallowing failures (used for heartbeats and best-effort events). */ + void writeQuietly(String text) { + synchronized (lock) { + try { + os.write(text.getBytes(StandardCharsets.UTF_8)); + os.flush(); + } catch (IOException e) { + LOG.trace("stream write failed (client likely disconnected)", e); + } + } + } + + @Override + public void close() { + synchronized (lock) { + try { + os.close(); + } catch (IOException e) { + LOG.trace("stream close failed", e); + } } } } @@ -331,101 +852,72 @@ private static ThreadFactory namedFactory(String prefix) { }; } + /** Produces a non-streaming response body from a parsed request; may fail with {@link IOException}. */ + @FunctionalInterface + private interface BodyProducer { + String produce(JsonNode request) throws IOException; + } + // ----- standalone launcher ----- /** - * Command-line launcher: load a GGUF model and serve it over the OpenAI-compatible endpoint. + * Command-line launcher: load a GGUF model and serve it over the OpenAI-compatible endpoint. 
This is + * the {@code Main-Class} of the {@code -jar-with-dependencies} assembly. * - *

Options: {@code --model } (required), {@code --host}, {@code --port}, - * {@code --api-key}, {@code --model-id}, {@code --ctx}, {@code --gpu-layers}, {@code --parallel}. + *

Parsing, validation and the option list live in {@link OpenAiServerCli}; run with + * {@code --help} for the full usage text. No {@code System.exit} is used (the {@code noSystemExit} + * architecture rule forbids it): a usage error prints to stderr and returns. * * @param args command-line options * @throws IOException if the listening socket cannot be bound */ public static void main(String[] args) throws IOException { - Map opts = parseArgs(args); - String modelPath = opts.get("model"); - if (modelPath == null) { - System.err.println("Usage: OpenAiCompatServer --model [--host 127.0.0.1] [--port 8080]" - + " [--api-key KEY] [--model-id ID] [--ctx 8192] [--gpu-layers N] [--parallel N]"); + if (OpenAiServerCli.isHelpRequested(args)) { + System.out.println(OpenAiServerCli.usage()); return; } - ModelParameters modelParams = new ModelParameters().setModel(modelPath); - OpenAiServerConfig.Builder cfg = OpenAiServerConfig.builder(); - - String host = opts.get("host"); - if (host != null) { - cfg.host(host); - } - String apiKey = opts.get("api-key"); - if (apiKey != null) { - cfg.apiKey(apiKey); - } - String modelId = opts.get("model-id"); - if (modelId != null) { - cfg.modelId(modelId); - } - - // Parse all numeric options in one place so a non-numeric value (e.g. "--port abc") yields a - // clear message instead of an uncaught NumberFormatException stack trace. No System.exit here - // — the noSystemExit architecture rule forbids it; print to stderr and return like the - // missing-"--model" path above. 
+ final OpenAiServerCli.Options options; try { - String ctx = opts.get("ctx"); - if (ctx != null) { - int ctxSize = Integer.parseInt(ctx); - modelParams.setCtxSize(ctxSize); - cfg.maxOutputTokens(Math.min(OpenAiServerConfig.DEFAULT_MAX_OUTPUT_TOKENS, Math.max(1, ctxSize / 2))); - cfg.maxInputTokens(Math.max(1, ctxSize - OpenAiServerConfig.DEFAULT_MAX_OUTPUT_TOKENS)); - } - String gpuLayers = opts.get("gpu-layers"); - if (gpuLayers != null) { - modelParams.setGpuLayers(Integer.parseInt(gpuLayers)); - } - String parallel = opts.get("parallel"); - if (parallel != null) { - modelParams.setParallel(Integer.parseInt(parallel)); - } - String port = opts.get("port"); - if (port != null) { - cfg.port(Integer.parseInt(port)); - } - } catch (NumberFormatException e) { - System.err.println("Invalid numeric option (expected an integer): " + e.getMessage()); + options = OpenAiServerCli.parse(args); + } catch (IllegalArgumentException e) { + System.err.println(e.getMessage()); return; } - OpenAiServerConfig config = cfg.build(); + OpenAiServerConfig config = options.toServerConfig(); - LlamaModel model = new LlamaModel(modelParams); - OpenAiCompatServer server = new OpenAiCompatServer(model, config); + // The server runs on daemon threads, so the main thread blocks until the JVM is asked to + // shut down (Ctrl-C / SIGTERM); the try-with-resources then closes the server and model. + // Two latches keep that shutdown graceful and race-free: the hook signals stopRequested and + // then waits on cleanedUp, so the JVM — which blocks until shutdown hooks return — does not + // halt until the close has actually run. 
+ final CountDownLatch stopRequested = new CountDownLatch(1); + final CountDownLatch cleanedUp = new CountDownLatch(1); Runtime.getRuntime() .addShutdownHook(new Thread( () -> { - server.close(); - model.close(); + stopRequested.countDown(); + try { + cleanedUp.await(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } }, "jllama-openai-shutdown")); - server.start(); - printReady(config, server.getPort()); - try { - Thread.currentThread().join(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - } - private static Map parseArgs(String[] args) { - Map opts = new HashMap<>(); - for (int i = 0; i < args.length; i++) { - String arg = args[i]; - if (arg.startsWith("--") && i + 1 < args.length) { - opts.put(arg.substring(2), args[i + 1]); - i++; + try (LlamaModel model = new LlamaModel(options.toModelParameters()); + OpenAiCompatServer server = new OpenAiCompatServer(model, config)) { + server.start(); + printReady(config, server.getPort()); + try { + stopRequested.await(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); } + } finally { + cleanedUp.countDown(); } - return opts; } private static void printReady(OpenAiServerConfig config, int port) { @@ -433,7 +925,7 @@ private static void printReady(OpenAiServerConfig config, int port) { System.out.println(); System.out.println("OpenAI-compatible endpoint ready: " + url); System.out.println("Add this to VS Code's chatLanguageModels.json (Chat: Manage Language Models):"); - System.out.println("["); + System.out.println('['); System.out.println(" {"); System.out.println(" \"name\": \"Local llama.cpp (java-llama.cpp)\","); System.out.println(" \"vendor\": \"customendpoint\","); @@ -452,6 +944,6 @@ private static void printReady(OpenAiServerConfig config, int port) { System.out.println(" }"); System.out.println(" ]"); System.out.println(" }"); - System.out.println("]"); + System.out.println(']'); } } diff --git 
a/src/main/java/net/ladenthin/llama/server/OpenAiRequestMapper.java b/src/main/java/net/ladenthin/llama/server/OpenAiRequestMapper.java index e4424c91..fd12a617 100644 --- a/src/main/java/net/ladenthin/llama/server/OpenAiRequestMapper.java +++ b/src/main/java/net/ladenthin/llama/server/OpenAiRequestMapper.java @@ -41,7 +41,12 @@ InferenceParameters toInferenceParameters(JsonNode request) { throw new IllegalArgumentException("'messages' must be a non-empty array"); } - InferenceParameters params = InferenceParameters.empty().withMessagesJson(messages.toString()); + // cache_prompt=true reuses the slot's KV prefix across turns — the standard llama.cpp-server + // default and what IDE clients rely on for acceptable repeated-prefix latency. OpenAI requests + // never carry this llama.cpp-specific flag, so defaulting it here is safe. + InferenceParameters params = InferenceParameters.empty() + .withMessagesJson(messages.toString()) + .withCachePrompt(true); JsonNode tools = request.path("tools"); if (tools.isArray() && tools.size() > 0) { @@ -91,6 +96,20 @@ InferenceParameters toInferenceParameters(JsonNode request) { params = params.withStopStrings(stops); } + // Forward stream_options verbatim (e.g. {"include_usage":true}) so the native server emits the + // trailing usage chunk the OpenAI streaming protocol — and the Copilot custom endpoint — expect. + JsonNode streamOptions = request.path("stream_options"); + if (streamOptions.isObject()) { + params = params.withStreamOptions(streamOptions.toString()); + } + + // Forward response_format verbatim (json_object / json_schema) so the native server applies the + // matching grammar constraint — the OpenAI "structured outputs" feature used by strict clients. 
+ JsonNode responseFormat = request.path("response_format"); + if (responseFormat.isObject()) { + params = params.withResponseFormat(responseFormat.toString()); + } + return params; } diff --git a/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java b/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java new file mode 100644 index 00000000..b6b609fa --- /dev/null +++ b/src/main/java/net/ladenthin/llama/server/OpenAiServerCli.java @@ -0,0 +1,414 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import java.nio.file.Path; +import java.nio.file.Paths; +import net.ladenthin.llama.parameters.ModelParameters; +import org.jspecify.annotations.Nullable; + +/** + * Command-line argument parser for {@link OpenAiCompatServer}'s standalone launcher. Pure and free of + * any native dependency, so it can be unit-tested in isolation (no socket, no model). + * + *

{@link #parse(String[])} returns an immutable {@link Options} or throws + * {@link IllegalArgumentException} (whose message embeds the {@link #usage()} text) for unknown flags, + * missing values or a missing required {@code --model}. {@code -h}/{@code --help} is detected separately + * via {@link #isHelpRequested(String[])} so callers can print help without it being treated as an error. + * + *

Flags mirror llama.cpp's own server where they overlap ({@code -m}, {@code -p}, {@code -c}, + * {@code -ngl}, {@code -t}); a few legacy spellings are accepted as aliases so earlier documented + * invocations keep working. + */ +public final class OpenAiServerCli { + + /** Default bind interface (loopback only; pass {@code --host 0.0.0.0} to expose on the LAN). */ + public static final String DEFAULT_HOST = OpenAiServerConfig.DEFAULT_HOST; + + /** Default TCP port. */ + public static final int DEFAULT_PORT = OpenAiServerConfig.DEFAULT_PORT; + + private OpenAiServerCli() {} + + /** + * Whether the arguments request the help text. + * + * @param args the raw command-line arguments + * @return {@code true} if {@code -h} or {@code --help} is present + */ + public static boolean isHelpRequested(String... args) { + for (final String arg : args) { + if ("-h".equals(arg) || "--help".equals(arg)) { + return true; + } + } + return false; + } + + /** + * Parse the command-line arguments into validated {@link Options}. + * + * @param args the raw command-line arguments + * @return the parsed options + * @throws IllegalArgumentException if an argument is unknown, a value is missing or malformed, + * or the required {@code --model} is absent + */ + public static Options parse(String... 
args) { + String host = DEFAULT_HOST; + int port = DEFAULT_PORT; + @Nullable String modelPath = null; + @Nullable String modelId = null; + @Nullable String apiKey = null; + @Nullable String mmproj = null; + int ctxSize = 0; + int gpuLayers = 0; + int threads = 0; + int parallel = 0; + boolean embedding = false; + boolean reranking = false; + + for (int i = 0; i < args.length; i++) { + final String arg = args[i]; + switch (arg) { + case "-m": + case "--model": + modelPath = nextValue(args, ++i, arg); + break; + case "--host": + host = nextValue(args, ++i, arg); + break; + case "-p": + case "--port": + port = intValue(args, ++i, arg); + break; + case "-c": + case "--ctx-size": + case "--ctx": + ctxSize = intValue(args, ++i, arg); + break; + case "-ngl": + case "--n-gpu-layers": + case "--gpu-layers": + gpuLayers = intValue(args, ++i, arg); + break; + case "-t": + case "--threads": + threads = intValue(args, ++i, arg); + break; + case "--parallel": + parallel = intValue(args, ++i, arg); + break; + case "--model-id": + case "--model-alias": + modelId = nextValue(args, ++i, arg); + break; + case "--api-key": + apiKey = nextValue(args, ++i, arg); + break; + case "--mmproj": + mmproj = nextValue(args, ++i, arg); + break; + case "--embedding": + case "--embeddings": + embedding = true; + break; + case "--reranking": + case "--rerank": + reranking = true; + break; + case "-h": + case "--help": + // Detected by isHelpRequested(); accepted here so parse() still succeeds. + break; + default: + throw error("Unknown argument: " + arg); + } + } + + if (modelPath == null) { + throw error("Missing required argument: -m/--model "); + } + return new Options( + host, port, modelPath, modelId, apiKey, mmproj, ctxSize, gpuLayers, threads, parallel, embedding, + reranking); + } + + /** + * The human-readable usage / help text. 
+ * + * @return the usage text + */ + public static String usage() { + return String.join( + System.lineSeparator(), + "OpenAiCompatServer - OpenAI-compatible HTTP server for java-llama.cpp", + "", + "Usage:", + " java -jar llama--jar-with-dependencies.jar --model [options]", + "", + "Required:", + " -m, --model Path to the GGUF model file", + "", + "Options:", + " --host Interface to bind (default: " + DEFAULT_HOST + ")", + " -p, --port TCP port to listen on (default: " + DEFAULT_PORT + ")", + " -c, --ctx-size Context window size (default: llama.cpp default)", + " -ngl,--n-gpu-layers Layers to offload to GPU (default: 0 = CPU only)", + " -t, --threads Inference thread count (default: llama.cpp default)", + " --parallel Parallel inference slots (default: llama.cpp default)", + " --model-id Model id reported by /v1/models (default: file name)", + " --api-key Require an 'Authorization: Bearer ' header", + " --mmproj Multimodal projector for vision models (enables image input)", + " --embedding Load in embedding mode (enables POST /v1/embeddings)", + " --reranking Load in reranking mode (enables POST /v1/rerank)", + " -h, --help Show this help and exit", + "", + "Endpoints:", + " POST /v1/chat/completions (streaming via SSE + non-streaming)", + " POST /v1/completions", + " POST /v1/embeddings (requires --embedding)", + " POST /v1/rerank (requires --reranking)", + " POST /infill (fill-in-the-middle / autocomplete)", + " GET /v1/models", + " GET /health"); + } + + private static String nextValue(String[] args, int valueIndex, String flag) { + if (valueIndex >= args.length) { + throw error("Missing value for " + flag); + } + return args[valueIndex]; + } + + private static int intValue(String[] args, int valueIndex, String flag) { + final String raw = nextValue(args, valueIndex, flag); + try { + return Integer.parseInt(raw.trim()); + } catch (NumberFormatException e) { + throw error(flag + " expects an integer, got: " + raw, e); + } + } + + private static 
IllegalArgumentException error(String message) { + return error(message, null); + } + + private static IllegalArgumentException error(String message, @Nullable Throwable cause) { + return new IllegalArgumentException(message + System.lineSeparator() + System.lineSeparator() + usage(), cause); + } + + /** + * Immutable, parsed launcher options. {@code ctxSize}, {@code threads} and {@code parallel} use + * {@code 0} as a sentinel meaning "leave the llama.cpp default" — they are only applied to + * {@link ModelParameters} when positive. {@code gpuLayers} is always applied (its own default of + * {@code 0} already means CPU-only). + */ + public static final class Options { + + private final String host; + private final int port; + private final String modelPath; + private final @Nullable String modelId; + private final @Nullable String apiKey; + private final @Nullable String mmproj; + private final int ctxSize; + private final int gpuLayers; + private final int threads; + private final int parallel; + private final boolean embedding; + private final boolean reranking; + + private Options( + String host, + int port, + String modelPath, + @Nullable String modelId, + @Nullable String apiKey, + @Nullable String mmproj, + int ctxSize, + int gpuLayers, + int threads, + int parallel, + boolean embedding, + boolean reranking) { + this.host = host; + this.port = port; + this.modelPath = modelPath; + this.modelId = modelId; + this.apiKey = apiKey; + this.mmproj = mmproj; + this.ctxSize = ctxSize; + this.gpuLayers = gpuLayers; + this.threads = threads; + this.parallel = parallel; + this.embedding = embedding; + this.reranking = reranking; + } + + /** + * The interface to bind. + * + * @return the bind host + */ + public String getHost() { + return host; + } + + /** + * The TCP port to listen on. + * + * @return the port + */ + public int getPort() { + return port; + } + + /** + * The path to the GGUF model file to load. 
+ * + * @return the model path + */ + public String getModelPath() { + return modelPath; + } + + /** + * The advertised model id, resolved from {@code --model-id} or derived from the model file name. + * + * @return the model id reported by {@code GET /v1/models} + */ + public String getModelId() { + if (modelId != null) { + return modelId; + } + final Path name = Paths.get(modelPath).getFileName(); + return name != null ? name.toString() : OpenAiServerConfig.DEFAULT_MODEL_ID; + } + + /** + * The optional bearer API key. + * + * @return the API key, or {@code null} when authentication is disabled + */ + public @Nullable String getApiKey() { + return apiKey; + } + + /** + * The optional multimodal projector path for vision models. + * + * @return the mmproj path, or {@code null} when no vision projector is configured + */ + public @Nullable String getMmproj() { + return mmproj; + } + + /** + * The context window size, or {@code 0} for the llama.cpp default. + * + * @return the context size + */ + public int getCtxSize() { + return ctxSize; + } + + /** + * The number of layers to offload to the GPU ({@code 0} = CPU-only). + * + * @return the GPU layer count + */ + public int getGpuLayers() { + return gpuLayers; + } + + /** + * The inference thread count, or {@code 0} for the llama.cpp default. + * + * @return the thread count + */ + public int getThreads() { + return threads; + } + + /** + * The number of parallel inference slots, or {@code 0} for the llama.cpp default. + * + * @return the parallel slot count + */ + public int getParallel() { + return parallel; + } + + /** + * Whether to load the model in embedding mode. + * + * @return {@code true} if embedding mode is requested + */ + public boolean isEmbedding() { + return embedding; + } + + /** + * Whether to load the model in reranking mode. 
+         *
+         * @return {@code true} if reranking mode is requested
+         */
+        public boolean isReranking() {
+            return reranking;
+        }
+
+        /**
+         * Build the {@link ModelParameters} for loading the model described by these options.
+         *
+         * @return the model parameters
+         */
+        public ModelParameters toModelParameters() {
+            final ModelParameters params =
+                    new ModelParameters().setModel(modelPath).setGpuLayers(gpuLayers);
+            if (mmproj != null) {
+                params.setMmproj(mmproj);
+            }
+            if (ctxSize > 0) {
+                params.setCtxSize(ctxSize);
+            }
+            if (threads > 0) {
+                params.setThreads(threads);
+            }
+            if (parallel > 0) {
+                params.setParallel(parallel);
+            }
+            if (embedding) {
+                params.enableEmbedding();
+            }
+            if (reranking) {
+                params.enableReranking();
+            }
+            return params;
+        }
+
+        /**
+         * Build the {@link OpenAiServerConfig} for these options. For a positive context size the output budget
+         * is half the context (capped at the default maximum) and the input budget is the remaining context.
+         *
+         * @return the server configuration
+         */
+        public OpenAiServerConfig toServerConfig() {
+            final OpenAiServerConfig.Builder builder = OpenAiServerConfig.builder()
+                    .host(host)
+                    .port(port)
+                    .modelId(getModelId())
+                    .supportsVision(mmproj != null);
+            if (apiKey != null) {
+                builder.apiKey(apiKey);
+            }
+            if (ctxSize > 0) {
+                final int maxOutput = Math.min(OpenAiServerConfig.DEFAULT_MAX_OUTPUT_TOKENS, Math.max(1, ctxSize / 2));
+                builder.maxOutputTokens(maxOutput);
+                builder.maxInputTokens(Math.max(1, ctxSize - maxOutput));
+            }
+            return builder.build();
+        }
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java b/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java
index 098512aa..285cf23e 100644
--- a/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java
+++ b/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java
@@ -4,14 +4,17 @@
 
 package net.ladenthin.llama.server;
 
+import lombok.EqualsAndHashCode;
 import org.jspecify.annotations.Nullable;
 
 /**
  * Immutable
configuration for {@link OpenAiCompatServer}. * *

Sensible localhost defaults are provided; build instances with {@link #builder()}. The API key is - * deliberately excluded from {@link #toString()} so it is never written to logs. + * deliberately excluded from {@link #toString()} so it is never written to logs (but it is part + * of {@link #equals(Object)}/{@link #hashCode()}, which are never logged). */ +@EqualsAndHashCode public final class OpenAiServerConfig { /** Default bind address: loopback only, so the endpoint is not exposed off-host. */ @@ -32,6 +35,13 @@ public final class OpenAiServerConfig { /** Default Server-Sent-Events heartbeat interval, in milliseconds. */ public static final long DEFAULT_HEARTBEAT_MILLIS = 15_000L; + /** + * Default {@code Access-Control-Allow-Origin} value: {@code "*"}. Browser- and webview-based clients + * send a CORS preflight and require this header; {@code "*"} is the pragmatic default for a server + * that binds loopback and authenticates with a bearer token (not cookies). + */ + public static final String DEFAULT_CORS_ALLOW_ORIGIN = "*"; + private final String host; private final int port; private final @Nullable String apiKey; @@ -39,6 +49,8 @@ public final class OpenAiServerConfig { private final int maxInputTokens; private final int maxOutputTokens; private final long heartbeatMillis; + private final String corsAllowOrigin; + private final boolean supportsVision; private OpenAiServerConfig(Builder builder) { this.host = builder.host; @@ -48,6 +60,8 @@ private OpenAiServerConfig(Builder builder) { this.maxInputTokens = builder.maxInputTokens; this.maxOutputTokens = builder.maxOutputTokens; this.heartbeatMillis = builder.heartbeatMillis; + this.corsAllowOrigin = builder.corsAllowOrigin; + this.supportsVision = builder.supportsVision; } /** @@ -122,6 +136,25 @@ public long getHeartbeatMillis() { return heartbeatMillis; } + /** + * The {@code Access-Control-Allow-Origin} value sent on every response and CORS preflight. 
+ * + * @return the allowed CORS origin + */ + public String getCorsAllowOrigin() { + return corsAllowOrigin; + } + + /** + * Whether the served model supports image input (a multimodal projector was configured). Advertised + * to clients that gate on a vision capability (e.g. Copilot's Ollama provider via {@code /api/show}). + * + * @return {@code true} if vision/image input is available + */ + public boolean isSupportsVision() { + return supportsVision; + } + /** * Whether bearer-token authentication is enabled (an API key is configured). * @@ -152,6 +185,8 @@ public String toString() { + maxOutputTokens + ", heartbeatMillis=" + heartbeatMillis + + ", corsAllowOrigin=" + + corsAllowOrigin + '}'; } @@ -165,6 +200,8 @@ public static final class Builder { private int maxInputTokens = DEFAULT_MAX_INPUT_TOKENS; private int maxOutputTokens = DEFAULT_MAX_OUTPUT_TOKENS; private long heartbeatMillis = DEFAULT_HEARTBEAT_MILLIS; + private String corsAllowOrigin = DEFAULT_CORS_ALLOW_ORIGIN; + private boolean supportsVision; private Builder() {} @@ -245,6 +282,28 @@ public Builder heartbeatMillis(long heartbeatMillis) { return this; } + /** + * Sets the {@code Access-Control-Allow-Origin} value (CORS). + * + * @param corsAllowOrigin the allowed origin (e.g. {@code "*"} or a specific scheme/host/port) + * @return this builder + */ + public Builder corsAllowOrigin(String corsAllowOrigin) { + this.corsAllowOrigin = corsAllowOrigin; + return this; + } + + /** + * Sets whether the served model supports image input (a multimodal projector is configured). + * + * @param supportsVision {@code true} if vision/image input is available + * @return this builder + */ + public Builder supportsVision(boolean supportsVision) { + this.supportsVision = supportsVision; + return this; + } + /** * Builds the immutable configuration. 
* diff --git a/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java b/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java index 9a9d4b5f..937af691 100644 --- a/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java +++ b/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java @@ -4,9 +4,11 @@ package net.ladenthin.llama.server; +import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; +import java.io.IOException; import org.jspecify.annotations.Nullable; /** @@ -73,6 +75,46 @@ static String errorJson(String message, String type, @Nullable String code) { return root.toString(); } + /** + * Guarantee a streamed chunk's usage object carries {@code usage.prompt_tokens_details.cached_tokens}. + * + *

When {@code stream_options.include_usage} is set, the OpenAI streaming protocol emits a trailing
+     * usage chunk. The VS Code Copilot custom endpoint throws
+     * {@code Cannot read properties of undefined (reading 'cached_tokens')} (microsoft/vscode #273482) if
+     * {@code usage.prompt_tokens_details.cached_tokens} is missing, and upstream llama.cpp does not always
+     * populate it. This fills a default of {@code 0} when absent. Token-delta chunks (which carry no
+     * non-null usage object) are returned unchanged and unparsed, so the streaming hot path is untouched.
+     *
+     * @param chunkJson one {@code chat.completion.chunk} serialized as JSON
+     * @return the chunk JSON with {@code cached_tokens} guaranteed present inside any non-null usage object
+     */
+    static String ensureUsageCachedTokens(String chunkJson) {
+        // Cheap textual pre-check: no usage key, or an explicit null usage, means nothing to patch.
+        if (!chunkJson.contains("\"usage\"") || chunkJson.contains("\"usage\":null")) {
+            return chunkJson;
+        }
+        final JsonNode parsed;
+        try {
+            parsed = OBJECT_MAPPER.readTree(chunkJson);
+        } catch (IOException e) {
+            // An unparseable chunk is forwarded untouched rather than interrupting a live stream.
+            return chunkJson;
+        }
+        final JsonNode usageNode = parsed.path("usage");
+        if (!parsed.isObject() || !usageNode.isObject()) {
+            return chunkJson;
+        }
+        final ObjectNode usage = (ObjectNode) usageNode;
+        final JsonNode promptDetails = usage.path("prompt_tokens_details");
+        if (promptDetails.isObject() && promptDetails.has("cached_tokens")) {
+            return chunkJson; // field already present: emit the upstream bytes verbatim
+        }
+        final ObjectNode target =
+                promptDetails.isObject() ? (ObjectNode) promptDetails : usage.putObject("prompt_tokens_details");
+        target.put("cached_tokens", 0);
+        return parsed.toString();
+    }
+
     /**
      * Build the {@code GET /v1/models} body advertising a single model.
      *
@@ -91,4 +133,28 @@ static String modelsJson(String modelId) {
         root.set("data", data);
         return root.toString();
     }
+
+    /**
+     * Build the llama.cpp-native {@code GET /props} body. Autocomplete clients (e.g.
llama.vscode) read + * {@code default_generation_settings.n_ctx} from here to size their context window, and newer clients + * read the {@code modalities} block to gate image input. + * + * @param modelId the served model id + * @param nCtx the advertised context length + * @param vision whether image input is supported + * @return the props object serialized as JSON + */ + static String propsJson(String modelId, int nCtx, boolean vision) { + ObjectNode root = OBJECT_MAPPER.createObjectNode(); + ObjectNode defaults = root.putObject("default_generation_settings"); + defaults.put("n_ctx", nCtx); + defaults.put("model", modelId); + root.put("total_slots", 1); + root.put("model_alias", modelId); + root.put("chat_template", ""); + ObjectNode modalities = root.putObject("modalities"); + modalities.put("vision", vision); + modalities.put("audio", false); + return root.toString(); + } } diff --git a/src/main/java/net/ladenthin/llama/server/ResponsesApiSupport.java b/src/main/java/net/ladenthin/llama/server/ResponsesApiSupport.java new file mode 100644 index 00000000..65abf7a2 --- /dev/null +++ b/src/main/java/net/ladenthin/llama/server/ResponsesApiSupport.java @@ -0,0 +1,261 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.io.IOException; + +/** + * Pure translators between the OpenAI Responses API ({@code POST /v1/responses}) and the internal OpenAI + * chat shape, plus builders for the Responses streaming SSE events. Lets clients/editors that use the + * newer Responses protocol (e.g. Copilot's {@code responses} apiType) drive the local model. + * + *

Request mapping: {@code instructions} becomes a system message; {@code input} (a string, or an array + * of {@code message} / {@code function_call} / {@code function_call_output} items) is flattened to OpenAI + * messages; Responses function tools ({@code {type:"function",name,description,parameters}}) become OpenAI + * function tools. Responses replies wrap the assistant turn in a {@code response} object whose + * {@code output} array holds a {@code message} item (with {@code output_text} content) and one + * {@code function_call} item per tool call. + * + *

Stateless and free of JNI / model dependencies; unit-testable with JSON literals. Streaming state is + * held by {@link ResponsesStreamTranslator}. + */ +final class ResponsesApiSupport { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private ResponsesApiSupport() {} + + /** + * Whether the Responses request asks for a streamed response ({@code "stream"} defaults to false). + * + * @param request the parsed Responses request + * @return {@code true} if {@code "stream"} is explicitly true + */ + static boolean isStreaming(JsonNode request) { + return request.path("stream").asBoolean(false); + } + + /** + * Translate an OpenAI Responses request into the internal OpenAI chat request shape. + * + * @param request the parsed Responses request + * @return an OpenAI {@code /v1/chat/completions} request object + */ + static ObjectNode toOpenAiChatRequest(JsonNode request) { + ObjectNode openAi = OBJECT_MAPPER.createObjectNode(); + if (request.path("model").isTextual()) { + openAi.put("model", request.path("model").asText()); + } + + ArrayNode messages = openAi.putArray("messages"); + if (request.path("instructions").isTextual()) { + ObjectNode system = messages.addObject(); + system.put("role", "system"); + system.put("content", request.path("instructions").asText()); + } + appendInput(messages, request.path("input")); + + JsonNode tools = request.path("tools"); + if (tools.isArray() && tools.size() > 0) { + ArrayNode openAiTools = openAi.putArray("tools"); + for (JsonNode tool : tools) { + if (!"function".equals(tool.path("type").asText(""))) { + continue; + } + ObjectNode openAiTool = openAiTools.addObject(); + openAiTool.put("type", "function"); + ObjectNode function = openAiTool.putObject("function"); + function.put("name", tool.path("name").asText("")); + if (tool.path("description").isTextual()) { + function.put("description", tool.path("description").asText()); + } + if (tool.path("parameters").isObject()) { + 
function.set("parameters", tool.path("parameters").deepCopy()); + } + } + // The Responses API uses the same tool_choice + parallel_tool_calls fields as chat; forward + // them so the shared chat core honors them. The mapper consumes the string form of + // tool_choice ("auto"/"none"/"required"), which is what we forward here. + if (request.path("tool_choice").isTextual()) { + openAi.put("tool_choice", request.path("tool_choice").asText()); + } + if (request.path("parallel_tool_calls").isBoolean()) { + openAi.put( + "parallel_tool_calls", + request.path("parallel_tool_calls").asBoolean()); + } + } + + copyNumber(request, "temperature", openAi, "temperature"); + copyNumber(request, "top_p", openAi, "top_p"); + copyNumber(request, "max_output_tokens", openAi, "max_tokens"); + return openAi; + } + + private static void appendInput(ArrayNode messages, JsonNode input) { + if (input.isTextual()) { + ObjectNode message = messages.addObject(); + message.put("role", "user"); + message.put("content", input.asText()); + return; + } + if (!input.isArray()) { + return; + } + for (JsonNode item : input) { + String type = item.path("type").asText("message"); + switch (type) { + case "function_call": + ObjectNode assistant = messages.addObject(); + assistant.put("role", "assistant"); + assistant.putNull("content"); + ArrayNode toolCalls = assistant.putArray("tool_calls"); + ObjectNode toolCall = toolCalls.addObject(); + toolCall.put( + "id", item.path("call_id").asText(item.path("id").asText(""))); + toolCall.put("type", "function"); + ObjectNode function = toolCall.putObject("function"); + function.put("name", item.path("name").asText("")); + function.put("arguments", item.path("arguments").asText("")); + break; + case "function_call_output": + ObjectNode toolMessage = messages.addObject(); + toolMessage.put("role", "tool"); + toolMessage.put("tool_call_id", item.path("call_id").asText("")); + toolMessage.put("content", item.path("output").asText("")); + break; + case "message": 
+ default: + ObjectNode message = messages.addObject(); + message.put("role", item.path("role").asText("user")); + message.put("content", inputContentText(item.path("content"))); + break; + } + } + } + + private static String inputContentText(JsonNode content) { + if (content.isTextual()) { + return content.asText(); + } + if (content.isArray()) { + StringBuilder sb = new StringBuilder(); + for (JsonNode part : content) { + if (part.path("text").isTextual()) { + sb.append(part.path("text").asText()); + } + } + return sb.toString(); + } + return ""; + } + + private static void copyNumber(JsonNode from, String fromKey, ObjectNode to, String toKey) { + JsonNode value = from.path(fromKey); + if (value.isNumber()) { + to.set(toKey, value); + } + } + + /** + * Translate a non-streaming OpenAI {@code chat.completion} into a Responses API response object. + * + * @param openAiCompletionJson the OpenAI completion body + * @param model the model id to echo + * @param responseId the response id to assign + * @return the Responses object serialized as JSON + */ + static String toResponsesResponse(String openAiCompletionJson, String model, String responseId) { + ObjectNode root = newResponseShell(model, responseId, "completed"); + ArrayNode output = root.putArray("output"); + ObjectNode usage = root.putObject("usage"); + usage.put("input_tokens", 0); + usage.put("output_tokens", 0); + usage.put("total_tokens", 0); + try { + JsonNode completion = OBJECT_MAPPER.readTree(openAiCompletionJson); + JsonNode message = completion.path("choices").path(0).path("message"); + String text = message.path("content").asText(""); + ObjectNode messageItem = output.addObject(); + messageItem.put("type", "message"); + messageItem.put("id", "msg_" + responseId); + messageItem.put("status", "completed"); + messageItem.put("role", "assistant"); + ArrayNode content = messageItem.putArray("content"); + ObjectNode textPart = content.addObject(); + textPart.put("type", "output_text"); + 
textPart.put("text", text); + textPart.putArray("annotations"); + JsonNode toolCalls = message.path("tool_calls"); + if (toolCalls.isArray()) { + for (JsonNode toolCall : toolCalls) { + output.add(functionCallItem(toolCall)); + } + } + JsonNode openAiUsage = completion.path("usage"); + if (openAiUsage.isObject()) { + int in = openAiUsage.path("prompt_tokens").asInt(0); + int out = openAiUsage.path("completion_tokens").asInt(0); + usage.put("input_tokens", in); + usage.put("output_tokens", out); + usage.put("total_tokens", in + out); + } + } catch (IOException e) { + // Defensive: an unexpected body still yields a valid, empty completed response. + output.removeAll(); + } + return root.toString(); + } + + /** Build a Responses {@code function_call} output item from an OpenAI tool call. */ + static ObjectNode functionCallItem(JsonNode openAiToolCall) { + JsonNode function = openAiToolCall.path("function"); + ObjectNode item = OBJECT_MAPPER.createObjectNode(); + item.put("type", "function_call"); + item.put("id", "fc_" + openAiToolCall.path("id").asText("")); + item.put("call_id", openAiToolCall.path("id").asText("")); + item.put("name", function.path("name").asText("")); + item.put("arguments", function.path("arguments").asText("")); + item.put("status", "completed"); + return item; + } + + /** A bare Responses API object shell (no output/usage), used by both the final reply and events. */ + static ObjectNode newResponseShell(String model, String responseId, String status) { + ObjectNode root = OBJECT_MAPPER.createObjectNode(); + root.put("id", responseId); + root.put("object", "response"); + root.put("created_at", 0); + root.put("status", status); + root.put("model", model); + return root; + } + + // ----- streaming SSE event builders ----- + + /** + * Frame a Responses SSE event: {@code event: \ndata: \n\n}, where the data object carries + * the event {@code type} and {@code sequence_number}. + * + * @param type the event type (e.g. 
{@code response.output_text.delta}) + * @param sequenceNumber the monotonic event sequence number + * @param data the event payload (the {@code type}/{@code sequence_number} are added here) + * @return the framed SSE event + */ + static String sseEvent(String type, int sequenceNumber, ObjectNode data) { + data.put("type", type); + data.put("sequence_number", sequenceNumber); + return "event: " + type + "\ndata: " + data + "\n\n"; + } + + /** New empty data object for an event payload. */ + static ObjectNode dataObject() { + return OBJECT_MAPPER.createObjectNode(); + } +} diff --git a/src/main/java/net/ladenthin/llama/server/ResponsesStreamTranslator.java b/src/main/java/net/ladenthin/llama/server/ResponsesStreamTranslator.java new file mode 100644 index 00000000..17d928f7 --- /dev/null +++ b/src/main/java/net/ladenthin/llama/server/ResponsesStreamTranslator.java @@ -0,0 +1,197 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import java.io.IOException; +import lombok.ToString; + +/** + * Stateful translator that turns the OpenAI streaming chat chunks into the OpenAI Responses SSE event + * sequence: {@code response.created} → (for a text message) {@code response.output_item.added} + + * {@code response.content_part.added} + {@code response.output_text.delta}* + + * {@code response.output_text.done} + {@code response.content_part.done} + + * {@code response.output_item.done} → (per tool call) a {@code function_call} item with + * {@code response.function_call_arguments.done} → {@code response.completed}. Each event carries a + * monotonic {@code sequence_number}. + * + *

Text deltas are emitted live; tool calls are reconstructed via {@link ToolCallDeltaAccumulator} and + * emitted as whole {@code function_call} items at the end. Free of JNI / model dependencies; + * unit-testable by feeding chunk JSON. + */ +@ToString +final class ResponsesStreamTranslator { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private final String model; + private final String responseId; + private final String messageItemId; + private final ToolCallDeltaAccumulator accumulator = new ToolCallDeltaAccumulator(); + private final StringBuilder text = new StringBuilder(); + + private int sequence; + private boolean messageOpen; + private int nextOutputIndex; + private int messageOutputIndex = -1; + + ResponsesStreamTranslator(String model, String responseId) { + this.model = model; + this.responseId = responseId; + this.messageItemId = "msg_" + responseId; + } + + /** + * The opening {@code response.created} event. + * + * @return the framed SSE event + */ + String begin() { + ObjectNode data = ResponsesApiSupport.dataObject(); + data.set("response", ResponsesApiSupport.newResponseShell(model, responseId, "in_progress")); + return ResponsesApiSupport.sseEvent("response.created", sequence++, data); + } + + /** + * Translate one OpenAI chunk into the Responses events it produces (opening the message item and + * content part on first text, then text deltas), accumulating tool-call fragments. Returns an empty + * string when the chunk yields no event. 
+ * + * @param openAiChunkJson one OpenAI {@code chat.completion.chunk} + * @return zero or more framed SSE events, concatenated + */ + String onChunk(String openAiChunkJson) { + StringBuilder out = new StringBuilder(); + try { + JsonNode chunk = OBJECT_MAPPER.readTree(openAiChunkJson); + accumulator.accept(chunk); + JsonNode content = chunk.path("choices").path(0).path("delta").path("content"); + if (content.isTextual() && !content.asText().isEmpty()) { + if (!messageOpen) { + messageOutputIndex = nextOutputIndex++; + out.append(outputItemAdded(messageOutputIndex, messageItemShell())); + out.append(contentPartAdded()); + messageOpen = true; + } + String delta = content.asText(); + text.append(delta); + ObjectNode data = ResponsesApiSupport.dataObject(); + data.put("item_id", messageItemId); + data.put("output_index", messageOutputIndex); + data.put("content_index", 0); + data.put("delta", delta); + out.append(ResponsesApiSupport.sseEvent("response.output_text.delta", sequence++, data)); + } + } catch (IOException e) { + // A malformed chunk produces no events. + } + return out.toString(); + } + + /** + * The closing events: finish the text content part / message item, emit a {@code function_call} item + * per accumulated tool call, then {@code response.completed} carrying the assembled output and usage. 
+ * + * @return the framed SSE events, concatenated + */ + String end() { + StringBuilder out = new StringBuilder(); + ArrayNode output = OBJECT_MAPPER.createArrayNode(); + + if (messageOpen) { + ObjectNode textDone = ResponsesApiSupport.dataObject(); + textDone.put("item_id", messageItemId); + textDone.put("output_index", messageOutputIndex); + textDone.put("content_index", 0); + textDone.put("text", text.toString()); + out.append(ResponsesApiSupport.sseEvent("response.output_text.done", sequence++, textDone)); + + ObjectNode partDone = ResponsesApiSupport.dataObject(); + partDone.put("item_id", messageItemId); + partDone.put("output_index", messageOutputIndex); + partDone.put("content_index", 0); + ObjectNode part = partDone.putObject("part"); + part.put("type", "output_text"); + part.put("text", text.toString()); + out.append(ResponsesApiSupport.sseEvent("response.content_part.done", sequence++, partDone)); + + ObjectNode messageItem = completedMessageItem(); + output.add(messageItem); + out.append(outputItemDone(messageOutputIndex, messageItem)); + } + + for (JsonNode toolCall : accumulator.toOpenAiToolCalls()) { + int index = nextOutputIndex++; + ObjectNode functionCall = ResponsesApiSupport.functionCallItem(toolCall); + out.append(outputItemAdded(index, functionCall)); + ObjectNode argsDone = ResponsesApiSupport.dataObject(); + argsDone.put("item_id", functionCall.path("id").asText()); + argsDone.put("output_index", index); + argsDone.put( + "arguments", toolCall.path("function").path("arguments").asText("")); + out.append(ResponsesApiSupport.sseEvent("response.function_call_arguments.done", sequence++, argsDone)); + out.append(outputItemDone(index, functionCall)); + output.add(functionCall); + } + + ObjectNode completed = ResponsesApiSupport.dataObject(); + ObjectNode response = ResponsesApiSupport.newResponseShell(model, responseId, "completed"); + response.set("output", output); + completed.set("response", response); + 
out.append(ResponsesApiSupport.sseEvent("response.completed", sequence++, completed)); + return out.toString(); + } + + private ObjectNode messageItemShell() { + ObjectNode item = OBJECT_MAPPER.createObjectNode(); + item.put("type", "message"); + item.put("id", messageItemId); + item.put("status", "in_progress"); + item.put("role", "assistant"); + item.putArray("content"); + return item; + } + + private ObjectNode completedMessageItem() { + ObjectNode item = OBJECT_MAPPER.createObjectNode(); + item.put("type", "message"); + item.put("id", messageItemId); + item.put("status", "completed"); + item.put("role", "assistant"); + ObjectNode textPart = item.putArray("content").addObject(); + textPart.put("type", "output_text"); + textPart.put("text", text.toString()); + textPart.putArray("annotations"); + return item; + } + + private String outputItemAdded(int outputIndex, ObjectNode item) { + ObjectNode data = ResponsesApiSupport.dataObject(); + data.put("output_index", outputIndex); + data.set("item", item); + return ResponsesApiSupport.sseEvent("response.output_item.added", sequence++, data); + } + + private String outputItemDone(int outputIndex, ObjectNode item) { + ObjectNode data = ResponsesApiSupport.dataObject(); + data.put("output_index", outputIndex); + data.set("item", item); + return ResponsesApiSupport.sseEvent("response.output_item.done", sequence++, data); + } + + private String contentPartAdded() { + ObjectNode data = ResponsesApiSupport.dataObject(); + data.put("item_id", messageItemId); + data.put("output_index", messageOutputIndex); + data.put("content_index", 0); + ObjectNode part = data.putObject("part"); + part.put("type", "output_text"); + part.put("text", ""); + return ResponsesApiSupport.sseEvent("response.content_part.added", sequence++, data); + } +} diff --git a/src/main/java/net/ladenthin/llama/server/ToolCallDeltaAccumulator.java b/src/main/java/net/ladenthin/llama/server/ToolCallDeltaAccumulator.java new file mode 100644 index 
00000000..58a9bab2
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/ToolCallDeltaAccumulator.java
@@ -0,0 +1,108 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import java.util.Map;
+import java.util.TreeMap;
+import lombok.ToString;
+
+/**
+ * Reconstructs whole tool calls from the incremental {@code delta.tool_calls} fragments of an OpenAI
+ * streaming chat completion. Across a stream, the first fragment for a given {@code index} carries the
+ * call {@code id} and {@code function.name}, and subsequent fragments append {@code function.arguments}
+ * string pieces. This accumulator merges them by index so the non-OpenAI protocol shims (Ollama,
+ * Anthropic, OpenAI Responses) — which deliver tool calls whole rather than fragmented — can emit a
+ * complete tool-call list once the stream finishes.
+ *
+ * <p>Stateful but free of JNI / model dependencies; unit-testable by feeding chunk JSON literals.
+ */
+@ToString
+final class ToolCallDeltaAccumulator {
+
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    /** Per-index partial tool call: id/name captured once, argument fragments concatenated. */
+    private static final class Partial {
+        private String id = "";
+        private String name = "";
+        private final StringBuilder arguments = new StringBuilder();
+    }
+
+    private final Map<Integer, Partial> byIndex = new TreeMap<>(); // sorted map: values() iterate in index order
+
+    /**
+     * Feed one OpenAI chunk from its raw JSON; unparseable chunks are ignored. Convenience for streaming
+     * sinks that hold the chunk as a string.
+     *
+     * @param openAiChunkJson one OpenAI {@code chat.completion.chunk} serialized as JSON
+     */
+    void accept(String openAiChunkJson) {
+        try {
+            accept(OBJECT_MAPPER.readTree(openAiChunkJson));
+        } catch (java.io.IOException e) {
+            // A malformed chunk simply contributes no tool-call fragments.
+        }
+    }
+
+    /**
+     * Feed one OpenAI {@code chat.completion.chunk}; merges any {@code delta.tool_calls} fragments it
+     * carries. Chunks without tool-call deltas are ignored.
+     *
+     * @param openAiChunk a parsed OpenAI streaming chunk
+     */
+    void accept(JsonNode openAiChunk) {
+        JsonNode toolCalls = openAiChunk.path("choices").path(0).path("delta").path("tool_calls");
+        if (!toolCalls.isArray()) {
+            return;
+        }
+        for (JsonNode toolCall : toolCalls) {
+            int index = toolCall.path("index").asInt(0);
+            Partial partial = byIndex.computeIfAbsent(index, k -> new Partial());
+            if (toolCall.path("id").isTextual()) {
+                partial.id = toolCall.path("id").asText();
+            }
+            JsonNode function = toolCall.path("function");
+            if (function.path("name").isTextual()) {
+                partial.name = function.path("name").asText();
+            }
+            if (function.path("arguments").isTextual()) {
+                partial.arguments.append(function.path("arguments").asText());
+            }
+        }
+    }
+
+    /**
+     * Whether any tool-call fragments were accumulated.
+     *
+     * @return {@code true} if at least one tool call was seen
+     */
+    boolean hasToolCalls() {
+        return !byIndex.isEmpty();
+    }
+
+    /**
+     * The reconstructed tool calls as an OpenAI-shaped array
+     * ({@code [{id,type:"function",function:{name,arguments}}]}), in index order. {@code arguments}
+     * is the concatenated JSON-encoded string, exactly as the OpenAI non-streaming message carries it.
+     *
+     * @return the reconstructed tool-call array (empty when none were seen)
+     */
+    ArrayNode toOpenAiToolCalls() {
+        ArrayNode out = OBJECT_MAPPER.createArrayNode();
+        for (Partial partial : byIndex.values()) {
+            ObjectNode toolCall = out.addObject();
+            toolCall.put("id", partial.id);
+            toolCall.put("type", "function");
+            ObjectNode function = toolCall.putObject("function");
+            function.put("name", partial.name);
+            function.put("arguments", partial.arguments.toString());
+        }
+        return out;
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/server/package-info.java b/src/main/java/net/ladenthin/llama/server/package-info.java
index ff1eb971..6b8b145c 100644
--- a/src/main/java/net/ladenthin/llama/server/package-info.java
+++ b/src/main/java/net/ladenthin/llama/server/package-info.java
@@ -5,31 +5,60 @@
 /**
  * Optional OpenAI-compatible HTTP server over a loaded {@link net.ladenthin.llama.LlamaModel}.
  *
- *

Interim state — two implementations pending consolidation. This package - * currently contains two independent OpenAI-compatible server implementations that landed - * on separate branches and are awaiting a "best of both" merge (tracked in {@code TODO.md}). Both - * let editors and tools that speak the OpenAI Chat Completions protocol (for example a VS Code - * Copilot "Custom Endpoint") drive a local GGUF model running in-process through the JNI binding, - * and both are faithful pass-throughs that do not implement or execute tools themselves.

+ *

{@link net.ladenthin.llama.server.OpenAiCompatServer} is a dependency-free server built only on + * the JDK's {@code com.sun.net.httpserver.HttpServer} (the supported, exported {@code jdk.httpserver} + * module — no web-framework dependency). It is both embeddable and the {@code Main-Class} of the + * {@code -jar-with-dependencies} assembly, so editors and tools that speak the OpenAI protocol (for + * example a VS Code Copilot "Custom Endpoint") can drive a local GGUF model running in-process + * through the JNI binding. It is a faithful pass-through that does not implement or execute tools + * itself.

* + *

Routes:

*
    - *
  • {@link net.ladenthin.llama.server.LlamaServer} / - * {@link net.ladenthin.llama.server.OaiHttpServer} — a NanoHTTPD-based server. - * {@code LlamaServer} is a {@code main} entry point (and the {@code Main-Class} of the - * {@code -jar-with-dependencies} assembly). It exposes {@code POST /v1/chat/completions}, - * {@code POST /v1/completions}, {@code POST /v1/embeddings} and {@code GET /v1/models} by - * forwarding the request body to the matching {@code LlamaModel.handle*} method, which already - * returns OpenAI-shaped JSON. Routing ({@link net.ladenthin.llama.server.OaiRouter}) is - * decoupled from NanoHTTPD so it is unit-testable without binding a socket or loading a model. - * NanoHTTPD is an {@code <optional>} dependency (bundled only in the fat jar).
  • - *
  • {@link net.ladenthin.llama.server.OpenAiCompatServer} — a dependency-free server built only - * on the JDK's {@code com.sun.net.httpserver.HttpServer}. It serves - * {@code POST /v1/chat/completions} (streaming via Server-Sent Events and non-streaming) and - * {@code GET /v1/models}. Streaming comes straight from the native OpenAI chunk formatter (see + *
  • {@code POST /v1/chat/completions} — streaming (Server-Sent Events) and non-streaming. Streaming + * comes straight from the native OpenAI chunk formatter (see * {@link net.ladenthin.llama.LlamaModel#streamChatCompletion(net.ladenthin.llama.parameters.InferenceParameters, java.util.function.Consumer)}), * so streamed {@code delta.tool_calls} are preserved for agent-mode tool use.
  • + *
  • {@code POST /v1/completions} and {@code POST /v1/embeddings} — non-streaming, forwarding the + * request body to the matching {@code LlamaModel.handle*} method.
  • + *
  • {@code POST /v1/rerank} — document reranking for RAG (requires the model loaded in reranking + * mode); the native result array is reshaped to {@code results}/{@code data} of + * {@code {index, relevance_score}}.
  • + *
  • {@code POST /infill} — non-streaming fill-in-the-middle for local ghost-text autocomplete + * clients (llama.vscode, Twinny, Tabby); the model's FIM tokens are applied server-side.
  • + *
  • {@code GET /v1/models} — advertises the configured model id.
  • + *
  • {@code GET /health} — unauthenticated liveness probe.
  • + *
  • {@code GET /props} — llama.cpp-native server properties (context length + modalities) that + * autocomplete clients read to size their context window.
  • *
* + *

Every route is also reachable without the {@code /v1} prefix, answers CORS preflight + * ({@code OPTIONS}) requests, and stamps {@code Access-Control-Allow-Origin} on responses so + * browser/webview clients are not blocked.

+ * + *

Alternative protocol surfaces let non-OpenAI clients drive the same model without a + * second inference path — each is a pure translation over the OpenAI chat core:

+ *
    + *
  • Ollama-native ({@code GET /api/version}, {@code GET /api/tags}, + * {@code POST /api/show}, {@code POST /api/chat} with NDJSON streaming, {@code POST /api/generate} + * for prompt completion / fill-in-the-middle) — for Copilot's built-in Ollama provider; see + * {@link net.ladenthin.llama.server.OllamaApiSupport}.
  • + *
  • Anthropic Messages ({@code POST /v1/messages}, SSE event stream) — see + * {@link net.ladenthin.llama.server.AnthropicApiSupport} / + * {@link net.ladenthin.llama.server.AnthropicStreamTranslator}.
  • + *
  • OpenAI Responses ({@code POST /v1/responses}, SSE event stream) — see + * {@link net.ladenthin.llama.server.ResponsesApiSupport} / + * {@link net.ladenthin.llama.server.ResponsesStreamTranslator}.
  • + *
+ *

Streamed tool calls on these surfaces are reconstructed from the OpenAI {@code delta.tool_calls} + * fragments by {@link net.ladenthin.llama.server.ToolCallDeltaAccumulator}.

+ * + *

The HTTP surface is decoupled from the model behind {@link net.ladenthin.llama.server.OpenAiBackend} + * (production implementation {@link net.ladenthin.llama.server.LlamaModelBackend}) so routing, + * authentication, SSE framing and heartbeats are unit-testable with a fake backend — no socket and no + * native model. The standalone launcher's command line is parsed by + * {@link net.ladenthin.llama.server.OpenAiServerCli}.

+ * *

JSpecify {@code @NullMarked} is applied module-wide (see {@code module-info.java}) and applies * to this package transitively.

*/ diff --git a/src/test/cpp/test_server.cpp b/src/test/cpp/test_server.cpp index 872f66d3..a25ab965 100644 --- a/src/test/cpp/test_server.cpp +++ b/src/test/cpp/test_server.cpp @@ -1294,6 +1294,25 @@ TEST(CmplFinalOaicompatChat, WithToolCalls_MessageHasToolCallsArray) { EXPECT_EQ(msg.at("tool_calls")[0].at("function").at("name").get<std::string>(), "search"); } +TEST(CmplFinalOaicompatChat, WithToolCalls_ArgumentsIsJsonStringNotObject) { + // Regression guard for ggml-org/llama.cpp #20198 (introduced by the Autoparser refactor in + // PR #18675): function.arguments MUST be a JSON-encoded *string*, never a parsed object. The + // official OpenAI SDK (Pydantic) and native-tool-calling agent clients (Roo Code >=3.37, + // Copilot agent) raise a TypeError when arguments is an object — breaking agentic mode. This + // pins the wire shape at the pinned llama.cpp build; if an upgrade reintroduces the regression + // this test fails in CI before it can ship. + auto f = make_oai_final(""); + common_chat_tool_call tc; + tc.id = "call_1"; + tc.name = "search"; + tc.arguments = R"({"q":"test"})"; + f.oaicompat_msg.tool_calls.push_back(tc); + const json j = f.to_json_oaicompat_chat(); + const json &args = j.at("choices")[0].at("message").at("tool_calls")[0].at("function").at("arguments"); + ASSERT_TRUE(args.is_string()); + EXPECT_EQ(args.get<std::string>(), R"({"q":"test"})"); +} + // ============================================================ // server_task_result_cmpl_final::to_json_anthropic // Anthropic Messages API response shape.
diff --git a/src/test/java/net/ladenthin/llama/RerankingModelTest.java b/src/test/java/net/ladenthin/llama/RerankingModelTest.java index b6f11776..57de6c32 100644 --- a/src/test/java/net/ladenthin/llama/RerankingModelTest.java +++ b/src/test/java/net/ladenthin/llama/RerankingModelTest.java @@ -33,12 +33,11 @@ public class RerankingModelTest { @BeforeAll public static void setup() { Assumptions.assumeTrue( - new File("models/jina-reranker-v1-tiny-en-Q4_0.gguf").exists(), - "Reranking model not available, skipping tests"); + new File(TestConstants.RERANKING_MODEL_PATH).exists(), "Reranking model not available, skipping tests"); int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL); model = new LlamaModel(new ModelParameters() .setCtxSize(128) - .setModel("models/jina-reranker-v1-tiny-en-Q4_0.gguf") + .setModel(TestConstants.RERANKING_MODEL_PATH) .setGpuLayers(gpuLayers) .enableReranking() .enableLogTimestamps() diff --git a/src/test/java/net/ladenthin/llama/TestConstants.java b/src/test/java/net/ladenthin/llama/TestConstants.java index a4976a6a..57aa218a 100644 --- a/src/test/java/net/ladenthin/llama/TestConstants.java +++ b/src/test/java/net/ladenthin/llama/TestConstants.java @@ -23,6 +23,9 @@ public class TestConstants { /** Path to the Qwen3 thinking model used for reasoning budget tests. */ public static final String REASONING_MODEL_PATH = "models/Qwen3-0.6B-Q4_K_M.gguf"; + /** Path to the reranking model used in tests (loaded with {@code enableReranking()}). */ + public static final String RERANKING_MODEL_PATH = "models/jina-reranker-v1-tiny-en-Q4_0.gguf"; + /** System property overriding the GGUF used by the real tool-calling integration tests. 
*/ public static final String PROP_TOOL_MODEL_PATH = LlamaSystemProperties.PREFIX + ".tool.model"; diff --git a/src/test/java/net/ladenthin/llama/server/AnthropicApiSupportTest.java b/src/test/java/net/ladenthin/llama/server/AnthropicApiSupportTest.java new file mode 100644 index 00000000..29f79ed3 --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/AnthropicApiSupportTest.java @@ -0,0 +1,184 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.is; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link AnthropicApiSupport}: the Anthropic Messages ↔ OpenAI chat request/response + * translation (content blocks, tool_use/tool_result, tools, stop reasons) and the SSE event builders. + * Pure — no model. + */ +public class AnthropicApiSupportTest { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private static JsonNode read(String json) throws IOException { + return MAPPER.readTree(json); + } + + @Test + public void isStreamingDefaultsFalse() throws IOException { + assertThat(AnthropicApiSupport.isStreaming(read("{}")), is(false)); + assertThat(AnthropicApiSupport.isStreaming(read("{\"stream\":true}")), is(true)); + } + + @Test + public void requestMapsSystemMessagesToolsAndSampling() throws IOException { + JsonNode openAi = AnthropicApiSupport.toOpenAiChatRequest(read("{\"model\":\"m\",\"max_tokens\":64," + + "\"system\":\"be brief\",\"temperature\":0.3," + + "\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]," + + "\"tools\":[{\"name\":\"get_weather\",\"description\":\"d\"," + + "\"input_schema\":{\"type\":\"object\"}}],\"tool_choice\":{\"type\":\"auto\"}}")); + // system becomes the first OpenAI message. 
+ assertThat(openAi.path("messages").get(0).path("role").asText(), is("system")); + assertThat(openAi.path("messages").get(0).path("content").asText(), is("be brief")); + assertThat(openAi.path("messages").get(1).path("content").asText(), is("hi")); + assertThat(openAi.path("max_tokens").asInt(), is(64)); + assertThat(openAi.path("temperature").asDouble(), is(0.3)); + // Anthropic tool input_schema -> OpenAI function parameters. + assertThat(openAi.path("tools").get(0).path("function").path("name").asText(), is("get_weather")); + assertThat( + openAi.path("tools") + .get(0) + .path("function") + .path("parameters") + .path("type") + .asText(), + is("object")); + assertThat(openAi.path("tool_choice").asText(), is("auto")); + } + + @Test + public void requestFlattensToolUseAndToolResultBlocks() throws IOException { + String anthropic = "{\"messages\":[" + + "{\"role\":\"assistant\",\"content\":[{\"type\":\"tool_use\",\"id\":\"c1\"," + + "\"name\":\"get_weather\",\"input\":{\"city\":\"Paris\"}}]}," + + "{\"role\":\"user\",\"content\":[{\"type\":\"tool_result\",\"tool_use_id\":\"c1\"," + + "\"content\":\"sunny\"}]}]}"; + JsonNode openAi = AnthropicApiSupport.toOpenAiChatRequest(read(anthropic)); + // assistant tool_use -> OpenAI tool_calls with arguments as a JSON string. + JsonNode toolCall = openAi.path("messages").get(0).path("tool_calls").get(0); + assertThat(toolCall.path("id").asText(), is("c1")); + assertThat(toolCall.path("function").path("arguments").isTextual(), is(true)); + assertThat( + read(toolCall.path("function").path("arguments").asText()) + .path("city") + .asText(), + is("Paris")); + // user tool_result -> separate OpenAI role:"tool" message. 
+ JsonNode toolMessage = openAi.path("messages").get(1); + assertThat(toolMessage.path("role").asText(), is("tool")); + assertThat(toolMessage.path("tool_call_id").asText(), is("c1")); + assertThat(toolMessage.path("content").asText(), is("sunny")); + } + + @Test + public void requestConcatenatesSystemBlocksAndMapsStopSequences() throws IOException { + JsonNode openAi = AnthropicApiSupport.toOpenAiChatRequest( + read("{\"system\":[{\"type\":\"text\",\"text\":\"a\"},{\"type\":\"text\",\"text\":\"b\"}]," + + "\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]," + + "\"stop_sequences\":[\"X\",\"Y\"]}")); + // system blocks are concatenated into one system message. + assertThat(openAi.path("messages").get(0).path("role").asText(), is("system")); + assertThat(openAi.path("messages").get(0).path("content").asText(), is("ab")); + // stop_sequences -> OpenAI stop. + assertThat(openAi.path("stop").size(), is(2)); + assertThat(openAi.path("stop").get(0).asText(), is("X")); + } + + @Test + public void toolChoiceAnyMapsToRequired() throws IOException { + JsonNode openAi = + AnthropicApiSupport.toOpenAiChatRequest(read("{\"messages\":[{\"role\":\"user\",\"content\":\"x\"}]," + + "\"tools\":[{\"name\":\"f\",\"input_schema\":{\"type\":\"object\"}}]," + + "\"tool_choice\":{\"type\":\"any\"}}")); + assertThat(openAi.path("tool_choice").asText(), is("required")); + } + + @Test + public void toolResultOnlyUserMessageEmitsOnlyToolMessage() throws IOException { + // A user turn that carries only tool_result blocks must become exactly one role:"tool" + // message — not a tool message plus a spurious empty user message. 
+ JsonNode openAi = AnthropicApiSupport.toOpenAiChatRequest( + read("{\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"tool_result\"," + + "\"tool_use_id\":\"c1\",\"content\":[{\"type\":\"text\",\"text\":\"su\"}," + + "{\"type\":\"text\",\"text\":\"nny\"}]}]}]}")); + assertThat(openAi.path("messages").size(), is(1)); + JsonNode toolMessage = openAi.path("messages").get(0); + assertThat(toolMessage.path("role").asText(), is("tool")); + assertThat(toolMessage.path("tool_call_id").asText(), is("c1")); + // tool_result content blocks are flattened to text. + assertThat(toolMessage.path("content").asText(), is("sunny")); + } + + @Test + public void responseEmitsTextAndToolUseBlocksAndStopReason() throws IOException { + String openAi = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"hi\"," + + "\"tool_calls\":[{\"id\":\"c1\",\"type\":\"function\",\"function\":{\"name\":\"f\"," + + "\"arguments\":\"{\\\"a\\\":1}\"}}]},\"finish_reason\":\"tool_calls\"}]," + + "\"usage\":{\"prompt_tokens\":5,\"completion_tokens\":2}}"; + JsonNode out = read(AnthropicApiSupport.toAnthropicResponse(openAi, "m")); + assertThat(out.path("type").asText(), is("message")); + assertThat(out.path("role").asText(), is("assistant")); + assertThat(out.path("content").get(0).path("type").asText(), is("text")); + assertThat(out.path("content").get(0).path("text").asText(), is("hi")); + JsonNode toolUse = out.path("content").get(1); + assertThat(toolUse.path("type").asText(), is("tool_use")); + assertThat(toolUse.path("name").asText(), is("f")); + assertThat(toolUse.path("input").path("a").asInt(), is(1)); + // finish_reason "tool_calls" -> stop_reason "tool_use". 
+ assertThat(out.path("stop_reason").asText(), is("tool_use")); + assertThat(out.path("usage").path("input_tokens").asInt(), is(5)); + assertThat(out.path("usage").path("output_tokens").asInt(), is(2)); + } + + @Test + public void stopReasonMapping() { + assertThat(AnthropicApiSupport.anthropicStopReason("stop"), is("end_turn")); + assertThat(AnthropicApiSupport.anthropicStopReason("length"), is("max_tokens")); + assertThat(AnthropicApiSupport.anthropicStopReason("tool_calls"), is("tool_use")); + } + + @Test + public void sseEventBuildersAreWellFormed() throws IOException { + String start = AnthropicApiSupport.messageStartEvent("msg_1", "m"); + assertThat(start.startsWith("event: message_start\ndata: "), is(true)); + assertThat( + read(start.substring(start.indexOf('{'))) + .path("message") + .path("role") + .asText(), + is("assistant")); + assertThat(AnthropicApiSupport.messageStopEvent().startsWith("event: message_stop"), is(true)); + } + + @Test + public void requestMapsDisableParallelToolUseToParallelToolCallsFalse() throws IOException { + // Anthropic tool_choice.disable_parallel_tool_use=true -> OpenAI parallel_tool_calls=false. + JsonNode openAi = AnthropicApiSupport.toOpenAiChatRequest(read("{\"model\":\"m\"," + + "\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]," + + "\"tools\":[{\"name\":\"get_weather\",\"input_schema\":{\"type\":\"object\"}}]," + + "\"tool_choice\":{\"type\":\"auto\",\"disable_parallel_tool_use\":true}}")); + assertThat(openAi.path("parallel_tool_calls").isBoolean(), is(true)); + assertThat(openAi.path("parallel_tool_calls").asBoolean(), is(false)); + } + + @Test + public void requestOmitsParallelToolCallsWhenParallelToolUseAllowed() throws IOException { + // disable_parallel_tool_use absent -> default (parallel allowed) -> no override emitted. 
+ JsonNode openAi = AnthropicApiSupport.toOpenAiChatRequest(read("{\"model\":\"m\"," + + "\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]," + + "\"tools\":[{\"name\":\"get_weather\",\"input_schema\":{\"type\":\"object\"}}]," + + "\"tool_choice\":{\"type\":\"auto\"}}")); + assertThat(openAi.has("parallel_tool_calls"), is(false)); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/AnthropicStreamTranslatorTest.java b/src/test/java/net/ladenthin/llama/server/AnthropicStreamTranslatorTest.java new file mode 100644 index 00000000..e77f271d --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/AnthropicStreamTranslatorTest.java @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.is; + +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link AnthropicStreamTranslator}: the OpenAI-chunk to Anthropic-SSE-event sequence + * (message_start → text content block → tool_use blocks → message_delta → message_stop). Pure. 
+ */ +public class AnthropicStreamTranslatorTest { + + @Test + public void beginEmitsMessageStart() { + AnthropicStreamTranslator translator = new AnthropicStreamTranslator("msg_1", "m"); + assertThat(translator.begin(), containsString("event: message_start")); + } + + @Test + public void firstTextDeltaOpensBlockThenSubsequentDeltasAppend() { + AnthropicStreamTranslator translator = new AnthropicStreamTranslator("msg_1", "m"); + String first = translator.onChunk("{\"choices\":[{\"delta\":{\"content\":\"he\"}}]}"); + assertThat(first, containsString("event: content_block_start")); + assertThat(first, containsString("event: content_block_delta")); + assertThat(first, containsString("\"text\":\"he\"")); + String second = translator.onChunk("{\"choices\":[{\"delta\":{\"content\":\"llo\"}}]}"); + // No second block start; just another delta. + assertThat(second.contains("content_block_start"), is(false)); + assertThat(second, containsString("\"text\":\"llo\"")); + } + + @Test + public void endClosesTextBlockAndEmitsStopReasonAndMessageStop() { + AnthropicStreamTranslator translator = new AnthropicStreamTranslator("msg_1", "m"); + translator.onChunk("{\"choices\":[{\"delta\":{\"content\":\"hi\"},\"finish_reason\":\"stop\"}]}"); + String end = translator.end(); + assertThat(end, containsString("event: content_block_stop")); + assertThat(end, containsString("event: message_delta")); + assertThat(end, containsString("\"stop_reason\":\"end_turn\"")); + assertThat(end, containsString("event: message_stop")); + } + + @Test + public void accumulatedToolCallsBecomeToolUseBlocksAtEnd() { + AnthropicStreamTranslator translator = new AnthropicStreamTranslator("msg_1", "m"); + translator.onChunk("{\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"c1\"," + + "\"function\":{\"name\":\"get_weather\",\"arguments\":\"{\\\"city\\\":\\\"Paris\\\"}\"}}]}," + + "\"finish_reason\":\"tool_calls\"}]}"); + String end = translator.end(); + assertThat(end, containsString("event: 
content_block_start")); + assertThat(end, containsString("\"type\":\"tool_use\"")); + assertThat(end, containsString("\"name\":\"get_weather\"")); + assertThat(end, containsString("event: content_block_delta")); + assertThat(end, containsString("input_json_delta")); + assertThat(end, containsString("\"stop_reason\":\"tool_use\"")); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/LlamaServerArgsTest.java b/src/test/java/net/ladenthin/llama/server/LlamaServerArgsTest.java deleted file mode 100644 index bca12fd9..00000000 --- a/src/test/java/net/ladenthin/llama/server/LlamaServerArgsTest.java +++ /dev/null @@ -1,115 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.containsString; -import static org.hamcrest.Matchers.is; -import static org.junit.jupiter.api.Assertions.assertThrows; - -import net.ladenthin.llama.ClaudeGenerated; -import org.junit.jupiter.api.Test; - -@ClaudeGenerated( - purpose = "Verify LlamaServerArgs parses long/short flags, applies defaults, derives the model alias from the " - + "model path, and rejects unknown flags, missing values, malformed integers and a missing --model.") -public class LlamaServerArgsTest { - - @Test - public void minimalArgsApplyDefaults() { - LlamaServerConfig config = LlamaServerArgs.parse(new String[] {"--model", "models/Qwen3-0.6B.gguf"}); - assertThat(config.getModelPath(), is("models/Qwen3-0.6B.gguf")); - assertThat(config.getHost(), is(LlamaServerArgs.DEFAULT_HOST)); - assertThat(config.getPort(), is(LlamaServerArgs.DEFAULT_PORT)); - assertThat(config.getCtxSize(), is(0)); - assertThat(config.getGpuLayers(), is(0)); - assertThat(config.getThreads(), is(0)); - assertThat(config.isEmbedding(), is(false)); - // Alias defaults to the model file name. 
- assertThat(config.getModelAlias(), is("Qwen3-0.6B.gguf")); - } - - @Test - public void allLongFlagsParsed() { - LlamaServerConfig config = LlamaServerArgs.parse(new String[] { - "--model", "m.gguf", - "--host", "0.0.0.0", - "--port", "9090", - "--ctx-size", "4096", - "--n-gpu-layers", "99", - "--threads", "8", - "--model-alias", "my-model", - "--embedding" - }); - assertThat(config.getModelPath(), is("m.gguf")); - assertThat(config.getHost(), is("0.0.0.0")); - assertThat(config.getPort(), is(9090)); - assertThat(config.getCtxSize(), is(4096)); - assertThat(config.getGpuLayers(), is(99)); - assertThat(config.getThreads(), is(8)); - assertThat(config.getModelAlias(), is("my-model")); - assertThat(config.isEmbedding(), is(true)); - } - - @Test - public void shortFlagsParsed() { - LlamaServerConfig config = LlamaServerArgs.parse( - new String[] {"-m", "m.gguf", "-p", "1234", "-c", "512", "-ngl", "10", "-t", "4"}); - assertThat(config.getPort(), is(1234)); - assertThat(config.getCtxSize(), is(512)); - assertThat(config.getGpuLayers(), is(10)); - assertThat(config.getThreads(), is(4)); - } - - @Test - public void aliasDerivedFromNestedPath() { - LlamaServerConfig config = LlamaServerArgs.parse(new String[] {"-m", "/opt/models/Llama-3.gguf"}); - assertThat(config.getModelAlias(), is("Llama-3.gguf")); - } - - @Test - public void missingModelThrows() { - IllegalArgumentException ex = - assertThrows(IllegalArgumentException.class, () -> LlamaServerArgs.parse(new String[] {})); - assertThat(ex.getMessage(), containsString("--model")); - } - - @Test - public void unknownFlagThrows() { - IllegalArgumentException ex = assertThrows( - IllegalArgumentException.class, () -> LlamaServerArgs.parse(new String[] {"-m", "m.gguf", "--bogus"})); - assertThat(ex.getMessage(), containsString("Unknown argument: --bogus")); - } - - @Test - public void missingValueThrows() { - IllegalArgumentException ex = assertThrows( - IllegalArgumentException.class, () -> LlamaServerArgs.parse(new 
String[] {"-m", "m.gguf", "--port"})); - assertThat(ex.getMessage(), containsString("Missing value for --port")); - } - - @Test - public void nonIntegerPortThrows() { - IllegalArgumentException ex = assertThrows( - IllegalArgumentException.class, - () -> LlamaServerArgs.parse(new String[] {"-m", "m.gguf", "--port", "abc"})); - assertThat(ex.getMessage(), containsString("expects an integer")); - } - - @Test - public void helpRequestedDetection() { - assertThat(LlamaServerArgs.isHelpRequested(new String[] {"-h"}), is(true)); - assertThat(LlamaServerArgs.isHelpRequested(new String[] {"--help"}), is(true)); - assertThat(LlamaServerArgs.isHelpRequested(new String[] {"--model", "m.gguf"}), is(false)); - } - - @Test - public void usageMentionsEndpointsAndRequiredFlag() { - String usage = LlamaServerArgs.usage(); - assertThat(usage, containsString("--model")); - assertThat(usage, containsString("/v1/chat/completions")); - assertThat(usage, containsString("/v1/embeddings")); - } -} diff --git a/src/test/java/net/ladenthin/llama/server/OaiHttpServerIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OaiHttpServerIntegrationTest.java deleted file mode 100644 index 63af9c47..00000000 --- a/src/test/java/net/ladenthin/llama/server/OaiHttpServerIntegrationTest.java +++ /dev/null @@ -1,128 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.containsString; -import static org.hamcrest.Matchers.is; - -import fi.iki.elonen.NanoHTTPD; -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.HttpURLConnection; -import java.net.URL; -import java.nio.charset.StandardCharsets; -import net.ladenthin.llama.ClaudeGenerated; -import org.junit.jupiter.api.Test; - -@ClaudeGenerated( - purpose = - "End-to-end exercise of 
OaiHttpServer over a real loopback socket (ephemeral port, fake backend, no " - + "native model): confirms the NanoHTTPD adapter extracts the method/URI, reads the JSON POST body via " - + "the 'postData' idiom, forwards it to the router, and maps the routed status/body back to the client.") -public class OaiHttpServerIntegrationTest { - - /** Fake backend that echoes the received chat body so the test can assert it round-tripped. */ - private static final class EchoBackend implements OaiBackend { - private String lastChatBody = ""; - - @Override - public String chatCompletions(String requestJson) { - lastChatBody = requestJson; - return "{\"object\":\"chat.completion\",\"echo\":" + requestJson + "}"; - } - - @Override - public String completions(String requestJson) { - return "{\"object\":\"text_completion\"}"; - } - - @Override - public String embeddings(String requestJson) { - return "{\"object\":\"list\"}"; - } - - @Override - public String listModels() { - return "{\"object\":\"list\",\"data\":[]}"; - } - - String lastChatBody() { - return lastChatBody; - } - } - - @Test - public void servesHealthAndChatOverRealSocket() throws IOException { - EchoBackend backend = new EchoBackend(); - OaiHttpServer server = new OaiHttpServer("127.0.0.1", 0, new OaiRouter(backend)); - // daemon=true so a failed assertion never leaves a non-daemon listener thread behind. 
- server.start(NanoHTTPD.SOCKET_READ_TIMEOUT, true); - try { - final int port = server.getListeningPort(); - final String base = "http://127.0.0.1:" + port; - - Response health = httpGet(base + "/health"); - assertThat(health.status, is(200)); - assertThat(health.body, containsString("\"status\":\"ok\"")); - - final String chatRequest = "{\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}"; - Response chat = httpPost(base + "/v1/chat/completions", chatRequest); - assertThat(chat.status, is(200)); - assertThat(chat.body, containsString("chat.completion")); - // The JSON POST body reached the backend intact (validates the parseBody/postData path). - assertThat(backend.lastChatBody(), is(chatRequest)); - - Response notFound = httpGet(base + "/v1/nope"); - assertThat(notFound.status, is(404)); - } finally { - server.stop(); - } - } - - private static final class Response { - private final int status; - private final String body; - - Response(int status, String body) { - this.status = status; - this.body = body; - } - } - - private static Response httpGet(String url) throws IOException { - final HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection(); - conn.setRequestMethod("GET"); - return readResponse(conn); - } - - private static Response httpPost(String url, String body) throws IOException { - final HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection(); - conn.setRequestMethod("POST"); - conn.setDoOutput(true); - conn.setRequestProperty("Content-Type", "application/json"); - try (OutputStream os = conn.getOutputStream()) { - os.write(body.getBytes(StandardCharsets.UTF_8)); - } - return readResponse(conn); - } - - private static Response readResponse(HttpURLConnection conn) throws IOException { - final int status = conn.getResponseCode(); - try (InputStream in = status < 400 ? 
conn.getInputStream() : conn.getErrorStream()) { - final ByteArrayOutputStream out = new ByteArrayOutputStream(); - final byte[] buffer = new byte[1024]; - int read; - while ((read = in.read(buffer)) != -1) { - out.write(buffer, 0, read); - } - return new Response(status, new String(out.toByteArray(), StandardCharsets.UTF_8)); - } finally { - conn.disconnect(); - } - } -} diff --git a/src/test/java/net/ladenthin/llama/server/OaiRerankSupportTest.java b/src/test/java/net/ladenthin/llama/server/OaiRerankSupportTest.java new file mode 100644 index 00000000..3bad7134 --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/OaiRerankSupportTest.java @@ -0,0 +1,107 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.is; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link OaiRerankSupport}: request parsing and the native-array to OpenAI-rerank + * reshape (sorting, {@code top_n}, the {@code results}/{@code data} alias). Pure — no model. 
+ */ +public class OaiRerankSupportTest { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private static JsonNode read(String json) throws IOException { + return MAPPER.readTree(json); + } + + @Test + public void readQueryReturnsText() throws IOException { + assertThat(OaiRerankSupport.readQuery(read("{\"query\":\"hello\"}")), is("hello")); + } + + @Test + public void readQueryThrowsWhenMissing() throws IOException { + JsonNode request = read("{\"documents\":[\"a\"]}"); + assertThrows(IllegalArgumentException.class, () -> OaiRerankSupport.readQuery(request)); + } + + @Test + public void readDocumentsAcceptsStrings() throws IOException { + String[] docs = OaiRerankSupport.readDocuments(read("{\"documents\":[\"a\",\"b\"]}")); + assertThat(docs.length, is(2)); + assertThat(docs[0], is("a")); + assertThat(docs[1], is("b")); + } + + @Test + public void readDocumentsAcceptsTextObjects() throws IOException { + String[] docs = OaiRerankSupport.readDocuments(read("{\"documents\":[{\"text\":\"x\"},{\"text\":\"y\"}]}")); + assertThat(docs.length, is(2)); + assertThat(docs[0], is("x")); + assertThat(docs[1], is("y")); + } + + @Test + public void readDocumentsThrowsWhenEmptyOrMissing() throws IOException { + JsonNode empty = read("{\"documents\":[]}"); + assertThrows(IllegalArgumentException.class, () -> OaiRerankSupport.readDocuments(empty)); + JsonNode missing = read("{\"query\":\"q\"}"); + assertThrows(IllegalArgumentException.class, () -> OaiRerankSupport.readDocuments(missing)); + } + + @Test + public void readDocumentsThrowsOnUnsupportedEntry() throws IOException { + JsonNode request = read("{\"documents\":[123]}"); + assertThrows(IllegalArgumentException.class, () -> OaiRerankSupport.readDocuments(request)); + } + + @Test + public void readTopNReturnsValueOrNegativeOne() throws IOException { + assertThat(OaiRerankSupport.readTopN(read("{\"top_n\":3}")), is(3)); + assertThat(OaiRerankSupport.readTopN(read("{}")), is(-1)); + } + + @Test + public void 
toOaiResponseSortsByScoreDescendingWithRelevanceScoreAndDataAlias() throws IOException { + String nativeJson = + "[{\"document\":\"a\",\"index\":0,\"score\":0.2}," + "{\"document\":\"b\",\"index\":1,\"score\":0.9}]"; + JsonNode out = read(OaiRerankSupport.toOaiResponse(nativeJson, "rr", -1)); + assertThat(out.path("object").asText(), is("list")); + assertThat(out.path("model").asText(), is("rr")); + // Highest score first; score is renamed to relevance_score; index preserved. + assertThat(out.path("results").get(0).path("index").asInt(), is(1)); + assertThat(out.path("results").get(0).path("relevance_score").asDouble(), is(0.9)); + assertThat(out.path("results").get(1).path("index").asInt(), is(0)); + // data is an alias of results (Continue #6478). + assertThat(out.path("data").get(0).path("index").asInt(), is(1)); + } + + @Test + public void toOaiResponseAppliesTopN() throws IOException { + String nativeJson = "[{\"index\":0,\"score\":0.2},{\"index\":1,\"score\":0.9},{\"index\":2,\"score\":0.5}]"; + JsonNode out = read(OaiRerankSupport.toOaiResponse(nativeJson, "", 2)); + assertThat(out.path("results").size(), is(2)); + assertThat(out.path("results").get(0).path("index").asInt(), is(1)); // 0.9 + assertThat(out.path("results").get(1).path("index").asInt(), is(2)); // 0.5 + // Empty model id is omitted. 
+ assertThat(out.has("model"), is(false)); + } + + @Test + public void toOaiResponseOnMalformedNativeBodyYieldsEmptyResults() throws IOException { + JsonNode out = read(OaiRerankSupport.toOaiResponse("not json", "m", -1)); + assertThat(out.path("results").size(), is(0)); + assertThat(out.path("data").size(), is(0)); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/OaiRouterTest.java b/src/test/java/net/ladenthin/llama/server/OaiRouterTest.java deleted file mode 100644 index dd189ea0..00000000 --- a/src/test/java/net/ladenthin/llama/server/OaiRouterTest.java +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-FileCopyrightText: 2026 Bernard Ladenthin -// -// SPDX-License-Identifier: MIT - -package net.ladenthin.llama.server; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.hamcrest.Matchers.containsString; -import static org.hamcrest.Matchers.is; - -import net.ladenthin.llama.ClaudeGenerated; -import org.junit.jupiter.api.Test; - -@ClaudeGenerated( - purpose = - "Verify OaiRouter dispatches each OAI endpoint to the backend, forwards the request body, enforces " - + "method/body preconditions (405/400), returns 404 for unknown paths, strips query strings, and " - + "converts backend exceptions into a 500 OpenAI error envelope. Uses a fake backend (no native model).") -public class OaiRouterTest { - - private static final String CHAT_RESPONSE = "{\"object\":\"chat.completion\"}"; - private static final String COMPLETION_RESPONSE = "{\"object\":\"text_completion\"}"; - private static final String EMBED_RESPONSE = "{\"object\":\"list\",\"data\":[]}"; - private static final String MODELS_RESPONSE = "{\"object\":\"list\",\"data\":[{\"id\":\"m\"}]}"; - - /** Records the last forwarded body and returns canned per-endpoint JSON. 
*/ - private static final class RecordingBackend implements OaiBackend { - private String lastBody = ""; - - @Override - public String chatCompletions(String requestJson) { - lastBody = requestJson; - return CHAT_RESPONSE; - } - - @Override - public String completions(String requestJson) { - lastBody = requestJson; - return COMPLETION_RESPONSE; - } - - @Override - public String embeddings(String requestJson) { - lastBody = requestJson; - return EMBED_RESPONSE; - } - - @Override - public String listModels() { - return MODELS_RESPONSE; - } - - String lastBody() { - return lastBody; - } - } - - private static final class ThrowingBackend implements OaiBackend { - @Override - public String chatCompletions(String requestJson) { - throw new IllegalStateException("boom"); - } - - @Override - public String completions(String requestJson) { - throw new IllegalStateException("boom"); - } - - @Override - public String embeddings(String requestJson) { - throw new IllegalStateException("boom"); - } - - @Override - public String listModels() { - throw new IllegalStateException("boom"); - } - } - - @Test - public void chatCompletionsForwardsBodyAndReturnsResponse() { - RecordingBackend backend = new RecordingBackend(); - OaiRouter router = new OaiRouter(backend); - OaiResponse resp = router.route("POST", "/v1/chat/completions", "{\"messages\":[]}"); - assertThat(resp.getStatus(), is(200)); - assertThat(resp.getBody(), is(CHAT_RESPONSE)); - assertThat(backend.lastBody(), is("{\"messages\":[]}")); - } - - @Test - public void completionsRoute() { - OaiResponse resp = - new OaiRouter(new RecordingBackend()).route("POST", "/v1/completions", "{\"prompt\":\"hi\"}"); - assertThat(resp.getStatus(), is(200)); - assertThat(resp.getBody(), is(COMPLETION_RESPONSE)); - } - - @Test - public void embeddingsRoute() { - OaiResponse resp = new OaiRouter(new RecordingBackend()).route("POST", "/v1/embeddings", "{\"input\":\"hi\"}"); - assertThat(resp.getStatus(), is(200)); - assertThat(resp.getBody(), 
is(EMBED_RESPONSE)); - } - - @Test - public void modelsRoute() { - OaiResponse resp = new OaiRouter(new RecordingBackend()).route("GET", "/v1/models", null); - assertThat(resp.getStatus(), is(200)); - assertThat(resp.getBody(), is(MODELS_RESPONSE)); - } - - @Test - public void modelsRouteIgnoresQueryString() { - OaiResponse resp = new OaiRouter(new RecordingBackend()).route("GET", "/v1/models?limit=1", null); - assertThat(resp.getStatus(), is(200)); - assertThat(resp.getBody(), is(MODELS_RESPONSE)); - } - - @Test - public void healthRoutes() { - OaiRouter router = new OaiRouter(new RecordingBackend()); - assertThat(router.route("GET", "/health", null).getStatus(), is(200)); - assertThat(router.route("GET", "/health", null).getBody(), containsString("\"status\":\"ok\"")); - assertThat(router.route("GET", "/", null).getStatus(), is(200)); - } - - @Test - public void wrongMethodYields405() { - OaiRouter router = new OaiRouter(new RecordingBackend()); - assertThat(router.route("GET", "/v1/chat/completions", null).getStatus(), is(405)); - assertThat(router.route("POST", "/v1/models", "{}").getStatus(), is(405)); - } - - @Test - public void emptyOrNullBodyYields400() { - OaiRouter router = new OaiRouter(new RecordingBackend()); - assertThat(router.route("POST", "/v1/chat/completions", null).getStatus(), is(400)); - assertThat(router.route("POST", "/v1/chat/completions", " ").getStatus(), is(400)); - } - - @Test - public void unknownPathYields404() { - OaiResponse resp = new OaiRouter(new RecordingBackend()).route("GET", "/v1/nope", null); - assertThat(resp.getStatus(), is(404)); - assertThat(resp.getBody(), containsString("\"type\":\"not_found\"")); - assertThat(resp.getBody(), containsString("/v1/nope")); - } - - @Test - public void backendExceptionYields500() { - OaiResponse resp = new OaiRouter(new ThrowingBackend()).route("POST", "/v1/chat/completions", "{}"); - assertThat(resp.getStatus(), is(500)); - assertThat(resp.getBody(), 
containsString("\"type\":\"internal_error\"")); - assertThat(resp.getBody(), containsString("boom")); - } -} diff --git a/src/test/java/net/ladenthin/llama/server/OllamaApiSupportTest.java b/src/test/java/net/ladenthin/llama/server/OllamaApiSupportTest.java new file mode 100644 index 00000000..660d3cb9 --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/OllamaApiSupportTest.java @@ -0,0 +1,221 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.is; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link OllamaApiSupport}: discovery bodies and the Ollama↔OpenAI chat request, + * response and NDJSON streaming translation. Pure — no model. + */ +public class OllamaApiSupportTest { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private static JsonNode read(String json) throws IOException { + return MAPPER.readTree(json); + } + + @Test + public void versionJsonCarriesAVersion() throws IOException { + assertThat(read(OllamaApiSupport.versionJson()).path("version").asText(), is(OllamaApiSupport.OLLAMA_VERSION)); + } + + @Test + public void tagsJsonListsTheModel() throws IOException { + JsonNode out = read(OllamaApiSupport.tagsJson("local-qwen")); + assertThat(out.path("models").get(0).path("name").asText(), is("local-qwen")); + assertThat(out.path("models").get(0).path("model").asText(), is("local-qwen")); + } + + @Test + public void showJsonAdvertisesCapabilitiesAndContextLength() throws IOException { + JsonNode out = read(OllamaApiSupport.showJson("local-qwen", 8192, false)); + assertThat(out.path("model_info").path("llama.context_length").asInt(), is(8192)); + String capabilities = out.path("capabilities").toString(); + 
assertThat(capabilities.contains("completion"), is(true)); + assertThat(capabilities.contains("tools"), is(true)); + assertThat(capabilities.contains("vision"), is(false)); + } + + @Test + public void showJsonAddsVisionCapabilityWhenEnabled() throws IOException { + JsonNode out = read(OllamaApiSupport.showJson("m", 4096, true)); + assertThat(out.path("capabilities").toString().contains("vision"), is(true)); + } + + @Test + public void isStreamingDefaultsTrueAndHonoursExplicitFalse() throws IOException { + assertThat(OllamaApiSupport.isStreaming(read("{}")), is(true)); + assertThat(OllamaApiSupport.isStreaming(read("{\"stream\":false}")), is(false)); + assertThat(OllamaApiSupport.isStreaming(read("{\"stream\":true}")), is(true)); + } + + @Test + public void toOpenAiChatRequestMapsMessagesToolsAndOptions() throws IOException { + JsonNode openAi = OllamaApiSupport.toOpenAiChatRequest(read("{\"model\":\"m\"," + + "\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]," + + "\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"f\"}}]," + + "\"options\":{\"temperature\":0.4,\"num_predict\":64}}")); + assertThat(openAi.path("messages").get(0).path("content").asText(), is("hi")); + assertThat(openAi.path("tools").get(0).path("function").path("name").asText(), is("f")); + assertThat(openAi.path("temperature").asDouble(), is(0.4)); + // num_predict maps to OpenAI max_tokens. + assertThat(openAi.path("max_tokens").asInt(), is(64)); + } + + @Test + public void toOpenAiChatRequestStringifiesAssistantToolCallArguments() throws IOException { + JsonNode openAi = OllamaApiSupport.toOpenAiChatRequest( + read( + "{\"messages\":[" + + "{\"role\":\"assistant\",\"tool_calls\":[{\"function\":{\"name\":\"f\",\"arguments\":{\"a\":1}}}]}]}")); + JsonNode arguments = openAi.path("messages") + .get(0) + .path("tool_calls") + .get(0) + .path("function") + .path("arguments"); + // Ollama sends an object; OpenAI requires a JSON-encoded string. 
+ assertThat(arguments.isTextual(), is(true)); + assertThat(read(arguments.asText()).path("a").asInt(), is(1)); + } + + @Test + public void toOpenAiChatRequestMapsFormatJsonToResponseFormat() throws IOException { + JsonNode openAi = OllamaApiSupport.toOpenAiChatRequest( + read("{\"messages\":[{\"role\":\"user\",\"content\":\"x\"}],\"format\":\"json\"}")); + assertThat(openAi.path("response_format").path("type").asText(), is("json_object")); + } + + @Test + public void toOpenAiChatRequestMapsFormatSchemaToJsonSchema() throws IOException { + // Ollama `format` as a JSON Schema object -> OpenAI response_format json_schema. + JsonNode openAi = + OllamaApiSupport.toOpenAiChatRequest(read("{\"messages\":[{\"role\":\"user\",\"content\":\"x\"}]," + + "\"format\":{\"type\":\"object\",\"properties\":{\"a\":{\"type\":\"string\"}}}}")); + assertThat(openAi.path("response_format").path("type").asText(), is("json_schema")); + assertThat( + openAi.path("response_format") + .path("json_schema") + .path("schema") + .path("type") + .asText(), + is("object")); + } + + @Test + public void toOpenAiChatRequestForwardsStopFromOptions() throws IOException { + JsonNode openAi = + OllamaApiSupport.toOpenAiChatRequest(read("{\"messages\":[{\"role\":\"user\",\"content\":\"x\"}]," + + "\"options\":{\"stop\":[\"\\n\",\"END\"]}}")); + assertThat(openAi.path("stop").size(), is(2)); + assertThat(openAi.path("stop").get(1).asText(), is("END")); + } + + @Test + public void toOllamaChatResponseExtractsContentAndCountsAndDone() throws IOException { + String openAi = "{\"choices\":[{\"index\":0,\"message\":{\"role\":\"assistant\",\"content\":\"hello\"}," + + "\"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":7,\"completion_tokens\":3}}"; + JsonNode out = read(OllamaApiSupport.toOllamaChatResponse(openAi, "m")); + assertThat(out.path("message").path("content").asText(), is("hello")); + assertThat(out.path("done").asBoolean(), is(true)); + assertThat(out.path("done_reason").asText(), is("stop")); 
+ assertThat(out.path("prompt_eval_count").asInt(), is(7)); + assertThat(out.path("eval_count").asInt(), is(3)); + } + + @Test + public void toOllamaChatResponseConvertsToolCallArgumentsToObject() throws IOException { + String openAi = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":null," + + "\"tool_calls\":[{\"id\":\"c1\",\"type\":\"function\"," + + "\"function\":{\"name\":\"get_weather\",\"arguments\":\"{\\\"city\\\":\\\"Paris\\\"}\"}}]}}]}"; + JsonNode out = read(OllamaApiSupport.toOllamaChatResponse(openAi, "m")); + JsonNode arguments = + out.path("message").path("tool_calls").get(0).path("function").path("arguments"); + // OpenAI string arguments become an Ollama object. + assertThat(arguments.isObject(), is(true)); + assertThat(arguments.path("city").asText(), is("Paris")); + } + + @Test + public void toOllamaContentLineEmitsDeltaAndSkipsEmpty() throws IOException { + String line = OllamaApiSupport.toOllamaContentLine("{\"choices\":[{\"delta\":{\"content\":\"he\"}}]}", "m"); + assertThat(line.endsWith("\n"), is(true)); + JsonNode parsed = read(line.trim()); + assertThat(parsed.path("message").path("content").asText(), is("he")); + assertThat(parsed.path("done").asBoolean(), is(false)); + // Role-only / empty-content chunks emit nothing. 
+ assertThat( + OllamaApiSupport.toOllamaContentLine("{\"choices\":[{\"delta\":{\"role\":\"assistant\"}}]}", "m"), + is((String) null)); + } + + @Test + public void generateHasSuffixDetectsFimRequests() throws IOException { + assertThat(OllamaApiSupport.hasSuffix(read("{\"prompt\":\"a\",\"suffix\":\"b\"}")), is(true)); + assertThat(OllamaApiSupport.hasSuffix(read("{\"prompt\":\"a\"}")), is(false)); + } + + @Test + public void generateMapsPromptAndOptionsToCompletionRequest() throws IOException { + JsonNode openAi = OllamaApiSupport.toOpenAiCompletionRequest( + read("{\"prompt\":\"once\",\"options\":{\"num_predict\":32,\"temperature\":0.5}}")); + assertThat(openAi.path("prompt").asText(), is("once")); + assertThat(openAi.path("max_tokens").asInt(), is(32)); + assertThat(openAi.path("temperature").asDouble(), is(0.5)); + } + + @Test + public void generateWithSuffixMapsToInfillRequest() throws IOException { + JsonNode infill = OllamaApiSupport.toInfillRequest( + read("{\"prompt\":\"pre\",\"suffix\":\"suf\",\"options\":{\"num_predict\":16}}")); + assertThat(infill.path("input_prefix").asText(), is("pre")); + assertThat(infill.path("input_suffix").asText(), is("suf")); + assertThat(infill.path("n_predict").asInt(), is(16)); + } + + @Test + public void generateTextExtractors() { + assertThat(OllamaApiSupport.extractCompletionText("{\"choices\":[{\"text\":\"hi\"}]}"), is("hi")); + assertThat(OllamaApiSupport.extractInfillContent("{\"content\":\"world\"}"), is("world")); + // Unexpected bodies yield empty text rather than throwing. 
+ assertThat(OllamaApiSupport.extractCompletionText("not json"), is("")); + } + + @Test + public void generateResponseAndStreamShapes() throws IOException { + JsonNode nonStream = read(OllamaApiSupport.toOllamaGenerateResponse("hello", "m")); + assertThat(nonStream.path("response").asText(), is("hello")); + assertThat(nonStream.path("done").asBoolean(), is(true)); + + String stream = OllamaApiSupport.toOllamaGenerateStream("hello", "m"); + String[] lines = stream.trim().split("\n"); + assertThat(lines.length, is(2)); + JsonNode first = read(lines[0]); + assertThat(first.path("response").asText(), is("hello")); + assertThat(first.path("done").asBoolean(), is(false)); + assertThat(read(lines[1]).path("done").asBoolean(), is(true)); + } + + @Test + public void toOllamaDoneLineCarriesAccumulatedToolCalls() throws IOException { + ToolCallDeltaAccumulator accumulator = new ToolCallDeltaAccumulator(); + accumulator.accept("{\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"c1\"," + + "\"function\":{\"name\":\"f\",\"arguments\":\"{\\\"a\\\":1}\"}}]}}]}"); + JsonNode out = read(OllamaApiSupport.toOllamaDoneLine("m", accumulator).trim()); + assertThat(out.path("done").asBoolean(), is(true)); + JsonNode arguments = + out.path("message").path("tool_calls").get(0).path("function").path("arguments"); + assertThat(arguments.isObject(), is(true)); + assertThat(arguments.path("a").asInt(), is(1)); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiCompatServerHttpTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiCompatServerHttpTest.java index 77c41b14..e74652ff 100644 --- a/src/test/java/net/ladenthin/llama/server/OpenAiCompatServerHttpTest.java +++ b/src/test/java/net/ladenthin/llama/server/OpenAiCompatServerHttpTest.java @@ -14,8 +14,9 @@ /** * End-to-end HTTP tests for {@link OpenAiCompatServer} driven over a real socket with a - * {@link FakeChatBackend} — no native library and no model are loaded. 
Exercises routing, - * authentication, the non-streaming and Server-Sent-Events paths, heartbeats, and error statuses. + * {@link FakeBackend} — no native library and no model are loaded. Exercises routing, authentication, + * the non-streaming and Server-Sent-Events paths, heartbeats, the completions/embeddings/health routes, + * and error statuses. * *

HTTP request plumbing is inherited from {@link OpenAiServerTestSupport}. */ @@ -33,7 +34,7 @@ private static OpenAiServerConfig config() { @Test public void nonStreamingReturnsTheCompletionBody() throws IOException { - try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeChatBackend(), config()).start()) { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { Response response = post(server.getPort(), "/v1/chat/completions", CHAT_BODY, ""); assertThat(response.code, is(200)); assertThat(response.body, containsString("chat.completion")); @@ -43,7 +44,7 @@ public void nonStreamingReturnsTheCompletionBody() throws IOException { @Test public void streamingReturnsSseChunksThenDone() throws IOException { - try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeChatBackend(), config()).start()) { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { String body = "{\"stream\":true,\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}"; Response response = post(server.getPort(), "/v1/chat/completions", body, ""); assertThat(response.code, is(200)); @@ -60,7 +61,7 @@ public void streamingEmitsHeartbeatsDuringAGap() throws IOException { .port(0) .heartbeatMillis(50L) .build(); - try (OpenAiCompatServer server = new OpenAiCompatServer(new SlowFakeChatBackend(), cfg).start()) { + try (OpenAiCompatServer server = new OpenAiCompatServer(new SlowFakeBackend(), cfg).start()) { String body = "{\"stream\":true,\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}"; Response response = post(server.getPort(), "/v1/chat/completions", body, ""); assertThat(response.code, is(200)); @@ -69,9 +70,207 @@ public void streamingEmitsHeartbeatsDuringAGap() throws IOException { } } + @Test + public void completionsRouteReturnsTextCompletionBody() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + Response response = 
post(server.getPort(), "/v1/completions", "{\"prompt\":\"hi\"}", ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("text_completion")); + } + } + + @Test + public void embeddingsRouteReturnsEmbeddingList() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + Response response = post(server.getPort(), "/v1/embeddings", "{\"input\":\"hi\"}", ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("embedding")); + } + } + + @Test + public void infillRouteReturnsContent() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + String body = "{\"input_prefix\":\"def add(a,b):\\n return \",\"input_suffix\":\"\"}"; + Response response = post(server.getPort(), "/infill", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("content")); + } + } + + @Test + public void getOnInfillReturns405() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + Response response = get(server.getPort(), "/infill", ""); + assertThat(response.code, is(405)); + } + } + + @Test + public void barePathAliasesResolveToTheSameHandlers() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + int port = server.getPort(); + // Clients disagree on the /v1 prefix; the bare aliases must reach the same handlers. 
+ assertThat(get(port, "/models", "").code, is(200)); + assertThat(post(port, "/completions", "{\"prompt\":\"hi\"}", "").code, is(200)); + assertThat(post(port, "/embeddings", "{\"input\":\"hi\"}", "").code, is(200)); + assertThat(post(port, "/chat/completions", CHAT_BODY, "").code, is(200)); + } + } + + @Test + public void rerankRouteReturnsResults() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + String body = "{\"query\":\"q\",\"documents\":[\"a\",\"b\"]}"; + Response response = post(server.getPort(), "/v1/rerank", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("relevance_score")); + } + } + + @Test + public void ollamaVersionTagsAndShowRespond() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + int port = server.getPort(); + Response version = get(port, "/api/version", ""); + assertThat(version.code, is(200)); + assertThat(version.body, containsString("version")); + Response tags = get(port, "/api/tags", ""); + assertThat(tags.code, is(200)); + assertThat(tags.body, containsString("test-model")); + Response show = post(port, "/api/show", "{\"model\":\"test-model\"}", ""); + assertThat(show.code, is(200)); + assertThat(show.body, containsString("capabilities")); + } + } + + @Test + public void ollamaChatNonStreamingReturnsDone() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + String body = "{\"model\":\"test-model\",\"stream\":false," + + "\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}"; + Response response = post(server.getPort(), "/api/chat", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("hello")); // from FakeBackend.complete + assertThat(response.body, containsString("\"done\":true")); + } + } + + @Test + public void 
ollamaChatStreamingReturnsNdjsonEndingWithDone() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + // stream defaults to true for Ollama. + String body = "{\"model\":\"test-model\",\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}"; + Response response = post(server.getPort(), "/api/chat", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("llo")); // streamed content delta + assertThat(response.body, containsString("\"done\":true")); + } + } + + @Test + public void anthropicMessagesNonStreamingReturnsMessage() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + String body = "{\"model\":\"test-model\",\"max_tokens\":16," + + "\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}"; + Response response = post(server.getPort(), "/v1/messages", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("\"type\":\"message\"")); + assertThat(response.body, containsString("hello")); // FakeBackend.complete text + } + } + + @Test + public void anthropicMessagesStreamingEmitsEventSequence() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + String body = "{\"model\":\"test-model\",\"stream\":true,\"max_tokens\":16," + + "\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}"; + Response response = post(server.getPort(), "/v1/messages", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("event: message_start")); + assertThat(response.body, containsString("event: content_block_delta")); + assertThat(response.body, containsString("event: message_stop")); + } + } + + @Test + public void responsesNonStreamingReturnsResponseObject() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + 
String body = "{\"model\":\"test-model\",\"input\":\"hi\"}"; + Response response = post(server.getPort(), "/v1/responses", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("\"object\":\"response\"")); + assertThat(response.body, containsString("output_text")); + assertThat(response.body, containsString("hello")); + } + } + + @Test + public void responsesStreamingEmitsEventSequence() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + String body = "{\"model\":\"test-model\",\"stream\":true,\"input\":\"hi\"}"; + Response response = post(server.getPort(), "/v1/responses", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("event: response.created")); + assertThat(response.body, containsString("event: response.output_text.delta")); + assertThat(response.body, containsString("event: response.completed")); + } + } + + @Test + public void propsEndpointReportsContextLength() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + Response response = get(server.getPort(), "/props", ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("n_ctx")); + assertThat(response.body, containsString("modalities")); + } + } + + @Test + public void ollamaGenerateNonStreamingReturnsResponse() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + String body = "{\"model\":\"test-model\",\"prompt\":\"once upon\",\"stream\":false}"; + Response response = post(server.getPort(), "/api/generate", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("\"response\":\"hello\"")); // FakeBackend.completions text + assertThat(response.body, containsString("\"done\":true")); + } + } + + @Test + public void ollamaGenerateWithSuffixUsesInfill() throws 
IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + String body = "{\"model\":\"test-model\",\"prompt\":\"def f():\",\"suffix\":\"\",\"stream\":false}"; + Response response = post(server.getPort(), "/api/generate", body, ""); + assertThat(response.code, is(200)); + // FakeBackend.infill returns content " world". + assertThat(response.body, containsString("world")); + } + } + + @Test + public void ollamaGenerateStreamingReturnsNdjsonDone() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + String body = "{\"model\":\"test-model\",\"prompt\":\"hi\",\"stream\":true}"; + Response response = post(server.getPort(), "/api/generate", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("\"done\":true")); + } + } + + @Test + public void healthEndpointReturnsOk() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + Response response = get(server.getPort(), "/health", ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("\"status\":\"ok\"")); + } + } + @Test public void modelsEndpointAdvertisesConfiguredModel() throws IOException { - try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeChatBackend(), config()).start()) { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { Response response = get(server.getPort(), "/v1/models", ""); assertThat(response.code, is(200)); assertThat(response.body, containsString("test-model")); @@ -80,15 +279,15 @@ public void modelsEndpointAdvertisesConfiguredModel() throws IOException { @Test public void unknownPathReturns404() throws IOException { - try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeChatBackend(), config()).start()) { - Response response = get(server.getPort(), "/v1/embeddings", ""); + try 
(OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + Response response = get(server.getPort(), "/v1/does-not-exist", ""); assertThat(response.code, is(404)); } } @Test public void missingMessagesReturns400() throws IOException { - try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeChatBackend(), config()).start()) { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { Response response = post(server.getPort(), "/v1/chat/completions", "{}", ""); assertThat(response.code, is(400)); } @@ -96,7 +295,7 @@ public void missingMessagesReturns400() throws IOException { @Test public void malformedJsonReturns400() throws IOException { - try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeChatBackend(), config()).start()) { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { Response response = post(server.getPort(), "/v1/chat/completions", "not json", ""); assertThat(response.code, is(400)); } @@ -104,12 +303,51 @@ public void malformedJsonReturns400() throws IOException { @Test public void getOnChatCompletionsReturns405() throws IOException { - try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeChatBackend(), config()).start()) { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { Response response = get(server.getPort(), "/v1/chat/completions", ""); assertThat(response.code, is(405)); } } + @Test + public void getOnEmbeddingsReturns405() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + Response response = get(server.getPort(), "/v1/embeddings", ""); + assertThat(response.code, is(405)); + } + } + + @Test + public void getOnNewPostRoutesReturns405() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + int port = 
server.getPort(); + // The protocol-shim POST routes go through the shared requirePostJson preamble. + assertThat(get(port, "/v1/rerank", "").code, is(405)); + assertThat(get(port, "/v1/messages", "").code, is(405)); + assertThat(get(port, "/v1/responses", "").code, is(405)); + assertThat(get(port, "/api/chat", "").code, is(405)); + assertThat(get(port, "/api/generate", "").code, is(405)); + } + } + + @Test + public void optionsPreflightReturns204WithCorsHeaders() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + Response response = options(server.getPort(), "/v1/chat/completions"); + assertThat(response.code, is(204)); + assertThat(response.corsAllowOrigin, is("*")); + } + } + + @Test + public void normalResponsesCarryCorsAllowOrigin() throws IOException { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), config()).start()) { + Response response = post(server.getPort(), "/v1/chat/completions", CHAT_BODY, ""); + assertThat(response.code, is(200)); + assertThat(response.corsAllowOrigin, is("*")); + } + } + @Test public void authRequiredWhenApiKeyConfigured() throws IOException { OpenAiServerConfig cfg = OpenAiServerConfig.builder() @@ -117,7 +355,7 @@ public void authRequiredWhenApiKeyConfigured() throws IOException { .port(0) .apiKey("secret") .build(); - try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeChatBackend(), cfg).start()) { + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), cfg).start()) { int port = server.getPort(); assertThat(post(port, "/v1/chat/completions", CHAT_BODY, "").code, is(401)); assertThat(post(port, "/v1/chat/completions", CHAT_BODY, "Bearer wrong").code, is(401)); @@ -125,8 +363,21 @@ public void authRequiredWhenApiKeyConfigured() throws IOException { } } - /** Deterministic backend that returns canned OpenAI shapes. 
*/ - static final class FakeChatBackend implements ChatBackend { + @Test + public void healthEndpointIsUnauthenticated() throws IOException { + OpenAiServerConfig cfg = OpenAiServerConfig.builder() + .host("127.0.0.1") + .port(0) + .apiKey("secret") + .build(); + try (OpenAiCompatServer server = new OpenAiCompatServer(new FakeBackend(), cfg).start()) { + // No Authorization header, yet /health must still answer 200 for orchestrator probes. + assertThat(get(server.getPort(), "/health", "").code, is(200)); + } + } + + /** Deterministic backend that returns canned OpenAI shapes for every operation. */ + static final class FakeBackend implements OpenAiBackend { @Override public String complete(JsonNode request) { return "{\"object\":\"chat.completion\",\"choices\":[{\"index\":0," @@ -139,10 +390,31 @@ public void stream(JsonNode request, ChunkSink sink) throws IOException { sink.accept("{\"object\":\"chat.completion.chunk\"," + "\"choices\":[{\"delta\":{\"content\":\"llo\"},\"finish_reason\":\"stop\"}]}"); } + + @Override + public String completions(JsonNode request) { + return "{\"object\":\"text_completion\",\"choices\":[{\"text\":\"hello\"}]}"; + } + + @Override + public String embeddings(JsonNode request) { + return "{\"object\":\"list\",\"data\":[{\"object\":\"embedding\",\"embedding\":[0.1,0.2]}]}"; + } + + @Override + public String infill(JsonNode request) { + return "{\"content\":\" world\",\"stop\":true}"; + } + + @Override + public String rerank(JsonNode request) { + return "{\"object\":\"list\",\"results\":[{\"index\":0,\"relevance_score\":0.9}]," + + "\"data\":[{\"index\":0,\"relevance_score\":0.9}]}"; + } } /** Backend that stalls before emitting, so the server's heartbeat fires during the gap. 
*/ - static final class SlowFakeChatBackend implements ChatBackend { + static final class SlowFakeBackend implements OpenAiBackend { @Override public String complete(JsonNode request) { return "{\"object\":\"chat.completion\",\"choices\":[]}"; @@ -159,5 +431,25 @@ public void stream(JsonNode request, ChunkSink sink) throws IOException { sink.accept("{\"object\":\"chat.completion.chunk\"," + "\"choices\":[{\"delta\":{\"content\":\"done\"},\"finish_reason\":\"stop\"}]}"); } + + @Override + public String completions(JsonNode request) { + return "{\"object\":\"text_completion\",\"choices\":[]}"; + } + + @Override + public String embeddings(JsonNode request) { + return "{\"object\":\"list\",\"data\":[]}"; + } + + @Override + public String infill(JsonNode request) { + return "{\"content\":\"\"}"; + } + + @Override + public String rerank(JsonNode request) { + return "{\"object\":\"list\",\"results\":[],\"data\":[]}"; + } } } diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiCompatServerIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiCompatServerIntegrationTest.java index c460f662..645bfd03 100644 --- a/src/test/java/net/ladenthin/llama/server/OpenAiCompatServerIntegrationTest.java +++ b/src/test/java/net/ladenthin/llama/server/OpenAiCompatServerIntegrationTest.java @@ -116,4 +116,86 @@ public void modelsEndpointAdvertisesTheServedModel() throws IOException { assertThat(response.code, is(200)); assertThat(response.body, containsString(MODEL_ID)); } + + // ----- alternative protocol surfaces (same Qwen3 model, structural assertions only) ----- + + @Test + public void ollamaChatNonStreamingRoundTrip() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"stream\":false," + + "\"messages\":[{\"role\":\"user\",\"content\":\"Say hello in one word.\"}]," + + "\"options\":{\"num_predict\":16}}"; + Response response = post(port, "/api/chat", body, ""); + assertThat(response.code, is(200)); + JsonNode json = 
MAPPER.readTree(response.body); + assertThat(json.path("done").asBoolean(), is(true)); + assertThat(json.path("message").path("role").asText(), is("assistant")); + } + + @Test + public void ollamaChatStreamingRoundTrip() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\"," + + "\"messages\":[{\"role\":\"user\",\"content\":\"Say hello in one word.\"}]," + + "\"options\":{\"num_predict\":16}}"; + Response response = post(port, "/api/chat", body, ""); + assertThat(response.code, is(200)); + // NDJSON terminates with a done line regardless of the model's wording. + assertThat(response.body, containsString("\"done\":true")); + } + + @Test + public void ollamaDiscoveryEndpointsRespond() throws IOException { + assertThat(get(port, "/api/version", "").code, is(200)); + assertThat(get(port, "/api/tags", "").body, containsString(MODEL_ID)); + Response show = post(port, "/api/show", "{\"model\":\"" + MODEL_ID + "\"}", ""); + assertThat(show.code, is(200)); + assertThat(show.body, containsString("capabilities")); + } + + @Test + public void anthropicMessagesNonStreamingRoundTrip() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"max_tokens\":16," + + "\"messages\":[{\"role\":\"user\",\"content\":\"Say hello in one word.\"}]}"; + Response response = post(port, "/v1/messages", body, ""); + assertThat(response.code, is(200)); + JsonNode json = MAPPER.readTree(response.body); + assertThat(json.path("type").asText(), is("message")); + assertThat(json.path("role").asText(), is("assistant")); + } + + @Test + public void anthropicMessagesStreamingRoundTrip() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"stream\":true,\"max_tokens\":16," + + "\"messages\":[{\"role\":\"user\",\"content\":\"Say hello in one word.\"}]}"; + Response response = post(port, "/v1/messages", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("event: message_start")); + assertThat(response.body, 
containsString("event: message_stop")); + } + + @Test + public void responsesNonStreamingRoundTrip() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"max_output_tokens\":16,\"input\":\"Say hello in one word.\"}"; + Response response = post(port, "/v1/responses", body, ""); + assertThat(response.code, is(200)); + JsonNode json = MAPPER.readTree(response.body); + assertThat(json.path("object").asText(), is("response")); + assertThat(json.path("status").asText(), is("completed")); + } + + @Test + public void responsesStreamingRoundTrip() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"stream\":true,\"max_output_tokens\":16," + + "\"input\":\"Say hello in one word.\"}"; + Response response = post(port, "/v1/responses", body, ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("event: response.created")); + assertThat(response.body, containsString("event: response.completed")); + } + + @Test + public void propsEndpointReportsContextLength() throws IOException { + Response response = get(port, "/props", ""); + assertThat(response.code, is(200)); + assertThat(response.body, containsString("n_ctx")); + } } diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiRequestMapperTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiRequestMapperTest.java index e9a3e27f..36302f98 100644 --- a/src/test/java/net/ladenthin/llama/server/OpenAiRequestMapperTest.java +++ b/src/test/java/net/ladenthin/llama/server/OpenAiRequestMapperTest.java @@ -119,6 +119,29 @@ public void stopAsArrayMapped() throws IOException { assertThat(out.path("stop").size(), is(2)); } + @Test + public void streamOptionsForwardedVerbatim() throws IOException { + // include_usage must reach the native layer so the trailing usage chunk is emitted. 
+ JsonNode out = mapAndSerialize("{\"messages\":[{\"role\":\"user\",\"content\":\"x\"}]," + + "\"stream_options\":{\"include_usage\":true}}"); + assertThat(out.path("stream_options").path("include_usage").asBoolean(), is(true)); + } + + @Test + public void responseFormatForwardedVerbatim() throws IOException { + // Structured outputs: json_object / json_schema must reach the native grammar constraint. + JsonNode out = mapAndSerialize("{\"messages\":[{\"role\":\"user\",\"content\":\"x\"}]," + + "\"response_format\":{\"type\":\"json_object\"}}"); + assertThat(out.path("response_format").path("type").asText(), is("json_object")); + } + + @Test + public void cachePromptDefaultedTrue() throws IOException { + // The mapper defaults cache_prompt=true so the slot KV prefix is reused across IDE turns. + JsonNode out = mapAndSerialize("{\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}"); + assertThat(out.path("cache_prompt").asBoolean(), is(true)); + } + @Test public void unknownFieldsIgnored() throws IOException { JsonNode out = mapAndSerialize( diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java new file mode 100644 index 00000000..14714735 --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/OpenAiServerCliTest.java @@ -0,0 +1,203 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.is; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link OpenAiServerCli}: parsing of long/short/alias flags, defaults, model-id + * derivation, rejection of bad input, and the {@link OpenAiServerCli.Options#toServerConfig()} + * projection (auth + context-derived token budgets). 
Pure — no socket and no native model. + */ +public class OpenAiServerCliTest { + + @Test + public void minimalArgsApplyDefaults() { + OpenAiServerCli.Options options = OpenAiServerCli.parse("--model", "models/Qwen3-0.6B.gguf"); + assertThat(options.getModelPath(), is("models/Qwen3-0.6B.gguf")); + assertThat(options.getHost(), is(OpenAiServerCli.DEFAULT_HOST)); + assertThat(options.getPort(), is(OpenAiServerCli.DEFAULT_PORT)); + assertThat(options.getCtxSize(), is(0)); + assertThat(options.getGpuLayers(), is(0)); + assertThat(options.getThreads(), is(0)); + assertThat(options.getParallel(), is(0)); + assertThat(options.isEmbedding(), is(false)); + assertThat(options.getApiKey(), is((String) null)); + // Model id defaults to the model file name. + assertThat(options.getModelId(), is("Qwen3-0.6B.gguf")); + } + + @Test + public void allLongFlagsParsed() { + OpenAiServerCli.Options options = OpenAiServerCli.parse( + "--model", + "m.gguf", + "--host", + "0.0.0.0", + "--port", + "9090", + "--ctx-size", + "4096", + "--n-gpu-layers", + "99", + "--threads", + "8", + "--parallel", + "2", + "--model-id", + "my-model", + "--api-key", + "secret", + "--embedding"); + assertThat(options.getModelPath(), is("m.gguf")); + assertThat(options.getHost(), is("0.0.0.0")); + assertThat(options.getPort(), is(9090)); + assertThat(options.getCtxSize(), is(4096)); + assertThat(options.getGpuLayers(), is(99)); + assertThat(options.getThreads(), is(8)); + assertThat(options.getParallel(), is(2)); + assertThat(options.getModelId(), is("my-model")); + assertThat(options.getApiKey(), is("secret")); + assertThat(options.isEmbedding(), is(true)); + } + + @Test + public void shortFlagsParsed() { + OpenAiServerCli.Options options = + OpenAiServerCli.parse("-m", "m.gguf", "-p", "1234", "-c", "512", "-ngl", "10", "-t", "4"); + assertThat(options.getPort(), is(1234)); + assertThat(options.getCtxSize(), is(512)); + assertThat(options.getGpuLayers(), is(10)); + assertThat(options.getThreads(), is(4)); + } + 
+ @Test + public void legacyAliasFlagsAccepted() { + OpenAiServerCli.Options options = OpenAiServerCli.parse( + "-m", "m.gguf", "--ctx", "256", "--gpu-layers", "5", "--model-alias", "aliased", "--embeddings"); + assertThat(options.getCtxSize(), is(256)); + assertThat(options.getGpuLayers(), is(5)); + assertThat(options.getModelId(), is("aliased")); + assertThat(options.isEmbedding(), is(true)); + } + + @Test + public void mmprojFlagParsed() { + OpenAiServerCli.Options options = OpenAiServerCli.parse("-m", "m.gguf", "--mmproj", "proj.gguf"); + assertThat(options.getMmproj(), is("proj.gguf")); + assertThat(OpenAiServerCli.parse("-m", "m.gguf").getMmproj(), is((String) null)); + } + + @Test + public void mmprojEnablesVisionCapabilityInServerConfig() { + assertThat( + OpenAiServerCli.parse("-m", "m.gguf", "--mmproj", "proj.gguf") + .toServerConfig() + .isSupportsVision(), + is(true)); + assertThat(OpenAiServerCli.parse("-m", "m.gguf").toServerConfig().isSupportsVision(), is(false)); + } + + @Test + public void rerankingFlagParsed() { + assertThat(OpenAiServerCli.parse("-m", "m.gguf", "--reranking").isReranking(), is(true)); + assertThat(OpenAiServerCli.parse("-m", "m.gguf", "--rerank").isReranking(), is(true)); + assertThat(OpenAiServerCli.parse("-m", "m.gguf").isReranking(), is(false)); + } + + @Test + public void modelIdDerivedFromNestedPath() { + OpenAiServerCli.Options options = OpenAiServerCli.parse("-m", "/opt/models/Llama-3.gguf"); + assertThat(options.getModelId(), is("Llama-3.gguf")); + } + + @Test + public void explicitModelIdOverridesDerivation() { + OpenAiServerCli.Options options = OpenAiServerCli.parse("-m", "/opt/models/Llama-3.gguf", "--model-id", "x"); + assertThat(options.getModelId(), is("x")); + } + + @Test + public void missingModelThrows() { + IllegalArgumentException ex = assertThrows(IllegalArgumentException.class, OpenAiServerCli::parse); + assertThat(ex.getMessage(), containsString("--model")); + } + + @Test + public void unknownFlagThrows() { 
+ IllegalArgumentException ex = + assertThrows(IllegalArgumentException.class, () -> OpenAiServerCli.parse("-m", "m.gguf", "--bogus")); + assertThat(ex.getMessage(), containsString("Unknown argument: --bogus")); + } + + @Test + public void missingValueThrows() { + IllegalArgumentException ex = + assertThrows(IllegalArgumentException.class, () -> OpenAiServerCli.parse("-m", "m.gguf", "--port")); + assertThat(ex.getMessage(), containsString("Missing value for --port")); + } + + @Test + public void nonIntegerPortThrows() { + IllegalArgumentException ex = assertThrows( + IllegalArgumentException.class, () -> OpenAiServerCli.parse("-m", "m.gguf", "--port", "abc")); + assertThat(ex.getMessage(), containsString("expects an integer")); + } + + @Test + public void helpRequestedDetection() { + assertThat(OpenAiServerCli.isHelpRequested("-h"), is(true)); + assertThat(OpenAiServerCli.isHelpRequested("--help"), is(true)); + assertThat(OpenAiServerCli.isHelpRequested("--model", "m.gguf"), is(false)); + } + + @Test + public void usageMentionsEndpointsAndRequiredFlag() { + String usage = OpenAiServerCli.usage(); + assertThat(usage, containsString("--model")); + assertThat(usage, containsString("/v1/chat/completions")); + assertThat(usage, containsString("/v1/embeddings")); + assertThat(usage, containsString("/health")); + } + + @Test + public void serverConfigCarriesHostPortAndModelId() { + OpenAiServerConfig config = OpenAiServerCli.parse("-m", "m.gguf", "--host", "0.0.0.0", "-p", "1234") + .toServerConfig(); + assertThat(config.getHost(), is("0.0.0.0")); + assertThat(config.getPort(), is(1234)); + assertThat(config.getModelId(), is("m.gguf")); + assertThat(config.isAuthenticationEnabled(), is(false)); + } + + @Test + public void apiKeyEnablesAuthInServerConfig() { + OpenAiServerConfig config = + OpenAiServerCli.parse("-m", "m.gguf", "--api-key", "secret").toServerConfig(); + assertThat(config.isAuthenticationEnabled(), is(true)); + assertThat(config.getApiKey(), is("secret")); + 
} + + @Test + public void contextSizeDerivesAdvertisedTokenBudgets() { + // ctx 8192 -> output capped at the 2048 default, input = 8192 - 2048 = 6144 (README example). + OpenAiServerConfig config = + OpenAiServerCli.parse("-m", "m.gguf", "-c", "8192").toServerConfig(); + assertThat(config.getMaxOutputTokens(), is(2048)); + assertThat(config.getMaxInputTokens(), is(6144)); + } + + @Test + public void modelParametersIncludeModelPath() { + String json = + OpenAiServerCli.parse("-m", "models/m.gguf").toModelParameters().toString(); + assertThat(json, containsString("models/m.gguf")); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerCompletionIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerCompletionIntegrationTest.java new file mode 100644 index 00000000..c5601acf --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/OpenAiServerCompletionIntegrationTest.java @@ -0,0 +1,104 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.is; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.File; +import java.io.IOException; +import net.ladenthin.llama.LlamaModel; +import net.ladenthin.llama.TestConstants; +import net.ladenthin.llama.parameters.ModelParameters; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** + * End-to-end integration test for the completion-family routes — {@code POST /v1/completions}, + * {@code POST /infill} (fill-in-the-middle) and the Ollama {@code POST /api/generate} (plain + FIM via a + * {@code suffix}) — against a real model over a real socket. 
Reuses the CI text model (CodeLlama-7B, + * {@link TestConstants#MODEL_PATH}), which is FIM-capable (see {@code LlamaModelTest#testGenerateInfill}). + * Self-skips when the model file is absent. Assertions are structural (valid response envelopes) rather + * than value-specific. HTTP plumbing is inherited from {@link OpenAiServerTestSupport}. + */ +public class OpenAiServerCompletionIntegrationTest extends OpenAiServerTestSupport { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final String MODEL_ID = "completion-local"; + + private static LlamaModel model; + private static OpenAiCompatServer server; + private static int port; + + @BeforeAll + public static void setup() throws IOException { + Assumptions.assumeTrue( + new File(TestConstants.MODEL_PATH).exists(), + "Text model (CodeLlama-7B) not found, skipping completion server integration test"); + int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL); + model = new LlamaModel(new ModelParameters() + .setModel(TestConstants.MODEL_PATH) + .setCtxSize(512) + .setGpuLayers(gpuLayers)); + server = new OpenAiCompatServer( + model, + OpenAiServerConfig.builder().port(0).modelId(MODEL_ID).build()) + .start(); + port = server.getPort(); + } + + @AfterAll + public static void tearDown() { + if (server != null) { + server.close(); + } + if (model != null) { + model.close(); + } + } + + @Test + public void completionsReturnsTextChoice() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"max_tokens\":16,\"prompt\":\"def add(a, b):\\n return\"}"; + Response response = post(port, "/v1/completions", body, ""); + assertThat(response.code, is(200)); + JsonNode json = MAPPER.readTree(response.body); + assertThat(json.path("object").asText(), is("text_completion")); + assertThat(json.path("choices").get(0).path("text").isTextual(), is(true)); + } + + @Test + public void infillReturnsContent() throws IOException { + String body = 
"{\"input_prefix\":\"def add(a, b):\\n return \",\"input_suffix\":\"\\n\",\"n_predict\":16}"; + Response response = post(port, "/infill", body, ""); + assertThat(response.code, is(200)); + // The native infill response carries the generated middle under "content". + assertThat(MAPPER.readTree(response.body).path("content").isTextual(), is(true)); + } + + @Test + public void ollamaGenerateNonStreamingRoundTrip() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"stream\":false," + + "\"prompt\":\"def add(a, b):\\n return\",\"options\":{\"num_predict\":16}}"; + Response response = post(port, "/api/generate", body, ""); + assertThat(response.code, is(200)); + JsonNode json = MAPPER.readTree(response.body); + assertThat(json.path("response").isTextual(), is(true)); + assertThat(json.path("done").asBoolean(), is(true)); + } + + @Test + public void ollamaGenerateWithSuffixUsesInfill() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"stream\":false," + + "\"prompt\":\"def add(a, b):\\n return \",\"suffix\":\"\\n\",\"options\":{\"num_predict\":16}}"; + Response response = post(port, "/api/generate", body, ""); + assertThat(response.code, is(200)); + assertThat(MAPPER.readTree(response.body).path("response").isTextual(), is(true)); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java new file mode 100644 index 00000000..43c43ddb --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/OpenAiServerConfigTest.java @@ -0,0 +1,76 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.not; + +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link 
OpenAiServerConfig}: builder defaults, the authentication predicate, the CORS / + * vision knobs, and the security contract that {@link OpenAiServerConfig#toString()} never leaks the API + * key. Pure — no socket, no model. + */ +public class OpenAiServerConfigTest { + + @Test + public void builderAppliesLocalhostDefaults() { + OpenAiServerConfig config = OpenAiServerConfig.builder().build(); + assertThat(config.getHost(), is(OpenAiServerConfig.DEFAULT_HOST)); + assertThat(config.getPort(), is(OpenAiServerConfig.DEFAULT_PORT)); + assertThat(config.getModelId(), is(OpenAiServerConfig.DEFAULT_MODEL_ID)); + assertThat(config.getMaxInputTokens(), is(OpenAiServerConfig.DEFAULT_MAX_INPUT_TOKENS)); + assertThat(config.getMaxOutputTokens(), is(OpenAiServerConfig.DEFAULT_MAX_OUTPUT_TOKENS)); + assertThat(config.getHeartbeatMillis(), is(OpenAiServerConfig.DEFAULT_HEARTBEAT_MILLIS)); + assertThat(config.getCorsAllowOrigin(), is(OpenAiServerConfig.DEFAULT_CORS_ALLOW_ORIGIN)); + assertThat(config.isSupportsVision(), is(false)); + assertThat(config.getApiKey(), is((String) null)); + assertThat(config.isAuthenticationEnabled(), is(false)); + } + + @Test + public void authenticationEnabledOnlyForNonEmptyKey() { + assertThat(OpenAiServerConfig.builder().build().isAuthenticationEnabled(), is(false)); + assertThat(OpenAiServerConfig.builder().apiKey("").build().isAuthenticationEnabled(), is(false)); + assertThat(OpenAiServerConfig.builder().apiKey("secret").build().isAuthenticationEnabled(), is(true)); + } + + @Test + public void toStringNeverLeaksTheApiKey() { + String secret = "sk-super-secret-value-1234567890"; + OpenAiServerConfig config = OpenAiServerConfig.builder().apiKey(secret).build(); + String rendered = config.toString(); + // The key value must not appear; only the boolean auth state is exposed. 
+ assertThat(rendered, not(containsString(secret))); + assertThat(rendered, containsString("authEnabled=true")); + } + + @Test + public void corsAndVisionAreConfigurable() { + OpenAiServerConfig config = OpenAiServerConfig.builder() + .corsAllowOrigin("https://editor.example") + .supportsVision(true) + .build(); + assertThat(config.getCorsAllowOrigin(), is("https://editor.example")); + assertThat(config.isSupportsVision(), is(true)); + } + + @Test + public void tokenBudgetsAndPortAreConfigurable() { + OpenAiServerConfig config = OpenAiServerConfig.builder() + .port(0) + .modelId("local-qwen") + .maxInputTokens(6144) + .maxOutputTokens(2048) + .build(); + assertThat(config.getPort(), is(0)); + assertThat(config.getModelId(), is("local-qwen")); + assertThat(config.getMaxInputTokens(), is(6144)); + assertThat(config.getMaxOutputTokens(), is(2048)); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerEmbeddingsIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerEmbeddingsIntegrationTest.java new file mode 100644 index 00000000..9bebae78 --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/OpenAiServerEmbeddingsIntegrationTest.java @@ -0,0 +1,101 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.is; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.File; +import java.io.IOException; +import net.ladenthin.llama.LlamaModel; +import net.ladenthin.llama.TestConstants; +import net.ladenthin.llama.args.PoolingType; +import net.ladenthin.llama.parameters.ModelParameters; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** + 
* End-to-end integration test for the {@code POST /v1/embeddings} route against a real model loaded in + * embedding mode ({@code enableEmbedding()}), served over a real socket. Reuses the CI text model + * (CodeLlama-7B, {@link TestConstants#MODEL_PATH}) — the same model {@code LlamaEmbeddingsTest} drives in + * embedding mode. Self-skips when the model file is absent (e.g. a local checkout without models), so it + * never breaks a model-free run. Assertions are structural (valid OpenAI embeddings shape) rather than + * value-specific. HTTP plumbing is inherited from {@link OpenAiServerTestSupport}. + */ +public class OpenAiServerEmbeddingsIntegrationTest extends OpenAiServerTestSupport { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final String MODEL_ID = "embed-local"; + + private static LlamaModel model; + private static OpenAiCompatServer server; + private static int port; + + @BeforeAll + public static void setup() throws IOException { + Assumptions.assumeTrue( + new File(TestConstants.MODEL_PATH).exists(), + "Text model (CodeLlama-7B) not found, skipping embeddings server integration test"); + int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL); + // The OpenAI /v1/embeddings path (oaicompat=true) requires a real pooling type: llama.cpp rejects + // pooling type NONE there ("pooling type 'none' is not OAI compatible"). CodeLlama's GGUF reports + // pooling = -1 (NONE), so an explicit MEAN pooling is set here — MEAN/LAST both produce a single + // pooled sentence vector for decoder-only models (see LlamaEmbeddingsTest). enableEmbedding() + // alone (as the low-level LlamaModel#embed path uses) leaves pooling NONE and would 500 here. 
+ model = new LlamaModel(new ModelParameters() + .setModel(TestConstants.MODEL_PATH) + .setCtxSize(512) + .setGpuLayers(gpuLayers) + .enableEmbedding() + .setPoolingType(PoolingType.MEAN)); + server = new OpenAiCompatServer( + model, + OpenAiServerConfig.builder().port(0).modelId(MODEL_ID).build()) + .start(); + port = server.getPort(); + } + + @AfterAll + public static void tearDown() { + if (server != null) { + server.close(); + } + if (model != null) { + model.close(); + } + } + + @Test + public void embeddingsReturnsAVector() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"input\":\"hello world\"}"; + Response response = post(port, "/v1/embeddings", body, ""); + assertThat(response.code, is(200)); + JsonNode json = MAPPER.readTree(response.body); + assertThat(json.path("object").asText(), is("list")); + JsonNode first = json.path("data").get(0); + assertThat(first.path("object").asText(), is("embedding")); + assertThat(first.path("embedding").isArray(), is(true)); + assertThat(first.path("embedding").size(), greaterThan(0)); + } + + @Test + public void embeddingsReachableWithoutV1Prefix() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"input\":\"alias check\"}"; + Response response = post(port, "/embeddings", body, ""); + assertThat(response.code, is(200)); + assertThat( + MAPPER.readTree(response.body) + .path("data") + .get(0) + .path("embedding") + .isArray(), + is(true)); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerRerankIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerRerankIntegrationTest.java new file mode 100644 index 00000000..8ed936cb --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/OpenAiServerRerankIntegrationTest.java @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static 
org.hamcrest.Matchers.greaterThan; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.lessThanOrEqualTo; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.File; +import java.io.IOException; +import net.ladenthin.llama.LlamaModel; +import net.ladenthin.llama.TestConstants; +import net.ladenthin.llama.parameters.ModelParameters; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** + * End-to-end integration test for the {@code POST /v1/rerank} route against a real model loaded in + * reranking mode ({@code enableReranking()}), served over a real socket. Reuses the CI reranking model + * (jina-reranker, {@link TestConstants#RERANKING_MODEL_PATH}). Self-skips when the model file is absent. + * Assertions are structural (sorted {@code results}/{@code data} of {@code index}+{@code relevance_score}) + * and check the {@code top_n} cap; exact scores are model-dependent. HTTP plumbing is inherited from + * {@link OpenAiServerTestSupport}. 
+ */ +public class OpenAiServerRerankIntegrationTest extends OpenAiServerTestSupport { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final String MODEL_ID = "rerank-local"; + + private static LlamaModel model; + private static OpenAiCompatServer server; + private static int port; + + @BeforeAll + public static void setup() throws IOException { + Assumptions.assumeTrue( + new File(TestConstants.RERANKING_MODEL_PATH).exists(), + "Reranking model (jina-reranker) not found, skipping rerank server integration test"); + int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL); + model = new LlamaModel(new ModelParameters() + .setModel(TestConstants.RERANKING_MODEL_PATH) + .setCtxSize(512) + .setGpuLayers(gpuLayers) + .enableReranking() + .skipWarmup()); + server = new OpenAiCompatServer( + model, + OpenAiServerConfig.builder().port(0).modelId(MODEL_ID).build()) + .start(); + port = server.getPort(); + } + + @AfterAll + public static void tearDown() { + if (server != null) { + server.close(); + } + if (model != null) { + model.close(); + } + } + + @Test + public void rerankReturnsScoredResultsCappedByTopN() throws IOException { + String body = "{\"model\":\"" + MODEL_ID + "\",\"query\":\"Machine learning is\"," + + "\"documents\":[\"A machine applies forces to perform an action.\"," + + "\"Machine learning is a field of artificial intelligence.\"," + + "\"Paris is the capital of France.\"],\"top_n\":2}"; + Response response = post(port, "/v1/rerank", body, ""); + assertThat(response.code, is(200)); + JsonNode json = MAPPER.readTree(response.body); + assertThat(json.path("object").asText(), is("list")); + JsonNode results = json.path("results"); + assertThat(results.isArray(), is(true)); + assertThat(results.size(), greaterThan(0)); + assertThat(results.size(), lessThanOrEqualTo(2)); // top_n cap + assertThat(results.get(0).path("index").isInt(), is(true)); + 
assertThat(results.get(0).path("relevance_score").isNumber(), is(true)); + // `data` is an alias of `results` for Continue (#6478). + assertThat(json.path("data").size(), is(results.size())); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerTestSupport.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerTestSupport.java index 5d0faba7..7493e4d9 100644 --- a/src/test/java/net/ladenthin/llama/server/OpenAiServerTestSupport.java +++ b/src/test/java/net/ladenthin/llama/server/OpenAiServerTestSupport.java @@ -12,6 +12,7 @@ import java.io.OutputStream; import java.net.HttpURLConnection; import java.net.URL; +import org.jspecify.annotations.Nullable; /** * Shared HTTP plumbing for {@link OpenAiCompatServer} tests: tiny helpers that POST/GET against a @@ -59,6 +60,20 @@ Response get(int port, String path, String auth) throws IOException { return read(conn); } + /** + * Send a CORS preflight ({@code OPTIONS}) to {@code path}. + * + * @param port the server port + * @param path the request path + * @return the captured response (status + {@code Access-Control-*} headers) + * @throws IOException on transport failure + */ + Response options(int port, String path) throws IOException { + HttpURLConnection conn = open(port, path, ""); + conn.setRequestMethod("OPTIONS"); + return read(conn); + } + private static HttpURLConnection open(int port, String path, String auth) throws IOException { HttpURLConnection conn = (HttpURLConnection) new URL("http://127.0.0.1:" + port + path).openConnection(); if (!auth.isEmpty()) { @@ -69,9 +84,10 @@ private static HttpURLConnection open(int port, String path, String auth) throws private static Response read(HttpURLConnection conn) throws IOException { int code = conn.getResponseCode(); + String corsAllowOrigin = conn.getHeaderField("Access-Control-Allow-Origin"); InputStream is = code < 400 ? conn.getInputStream() : conn.getErrorStream(); String body = is == null ? 
"" : readAll(is); - return new Response(code, body); + return new Response(code, body, corsAllowOrigin); } private static String readAll(InputStream is) throws IOException { @@ -84,14 +100,16 @@ private static String readAll(InputStream is) throws IOException { return new String(buffer.toByteArray(), UTF_8); } - /** Captured HTTP response: status code and body text. */ + /** Captured HTTP response: status code, body text, and the {@code Access-Control-Allow-Origin} header. */ static final class Response { final int code; final String body; + final @Nullable String corsAllowOrigin; - Response(int code, String body) { + Response(int code, String body, @Nullable String corsAllowOrigin) { this.code = code; this.body = body; + this.corsAllowOrigin = corsAllowOrigin; } } } diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerToolCallingIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerToolCallingIntegrationTest.java new file mode 100644 index 00000000..de60c8e0 --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/OpenAiServerToolCallingIntegrationTest.java @@ -0,0 +1,125 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.is; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.File; +import java.io.IOException; +import net.ladenthin.llama.LlamaModel; +import net.ladenthin.llama.TestConstants; +import net.ladenthin.llama.parameters.ModelParameters; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +/** + * End-to-end tool-calling integration test for {@link OpenAiCompatServer}, driven over a real socket + * against the 
Qwen2.5-1.5B-Instruct tool model — a stronger tool-calling family than the 0.6B reasoning + * model used by {@link OpenAiCompatServerIntegrationTest}, so it actually emits tool calls. The model is + * resolved from {@link TestConstants#PROP_TOOL_MODEL_PATH} (CI sets it; otherwise + * {@link TestConstants#DEFAULT_TOOL_MODEL_PATH}) and the test self-skips when the GGUF is absent, so a + * model-free checkout is never broken. + * + *

Where {@link OpenAiCompatServerIntegrationTest}'s tool test can only assert a structurally valid + * message (the 0.6B model may not elect to call), these force a call via {@code tool_choice:"required"} + * so the native grammar must emit one — letting us assert, deterministically, that the HTTP server + * returns a well-formed OpenAI {@code tool_calls} array with {@code arguments} carried as a JSON + * string (the agentic-client invariant, llama.cpp #20198), and that #244's + * {@code parallel_tool_calls} flag travels HTTP → mapper → native without breaking the request. + */ +public class OpenAiServerToolCallingIntegrationTest extends OpenAiServerTestSupport { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final String MODEL_ID = "qwen25-tools"; + + /** A trivial single-required-argument function; {@code tool_choice:"required"} forces a call. */ + private static final String TOOLS = "\"tools\":[{\"type\":\"function\",\"function\":{" + + "\"name\":\"get_weather\",\"description\":\"Get the weather for a city\"," + + "\"parameters\":{\"type\":\"object\",\"properties\":{\"city\":{\"type\":\"string\"}}," + + "\"required\":[\"city\"]}}}]"; + + private static LlamaModel model; + private static OpenAiCompatServer server; + private static int port; + + @BeforeAll + public static void setup() throws IOException { + String modelPath = + System.getProperty(TestConstants.PROP_TOOL_MODEL_PATH, TestConstants.DEFAULT_TOOL_MODEL_PATH); + Assumptions.assumeTrue( + new File(modelPath).exists(), + "Tool-calling model (Qwen2.5-1.5B) not found, skipping server tool-calling test: " + modelPath); + int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL); + model = new LlamaModel(new ModelParameters() + .setModel(modelPath) + .setCtxSize(4096) + .setGpuLayers(gpuLayers) + .setFit(false) + .setParallel(1)); + server = new OpenAiCompatServer( + model, + 
OpenAiServerConfig.builder().port(0).modelId(MODEL_ID).build()) + .start(); + port = server.getPort(); + } + + @AfterAll + public static void tearDown() { + if (server != null) { + server.close(); + } + if (model != null) { + model.close(); + } + } + + @Test + public void requiredToolChoiceReturnsWellFormedToolCalls() throws IOException { + // tool_choice=required forces a function call, so a capable model deterministically returns a + // structurally valid OpenAI tool_calls array regardless of its exact wording. + String body = "{\"model\":\"" + MODEL_ID + "\",\"max_tokens\":64,\"tool_choice\":\"required\"," + + "\"messages\":[{\"role\":\"user\",\"content\":\"What is the weather in Paris?\"}]," + + TOOLS + "}"; + Response response = post(port, "/v1/chat/completions", body, ""); + assertThat(response.code, is(200)); + JsonNode toolCalls = MAPPER.readTree(response.body) + .path("choices") + .get(0) + .path("message") + .path("tool_calls"); + assertThat(toolCalls.isArray(), is(true)); + assertThat(toolCalls.size(), greaterThanOrEqualTo(1)); + JsonNode function = toolCalls.get(0).path("function"); + assertThat(function.path("name").asText(), is("get_weather")); + // arguments must be a JSON *string* (not an inlined object) — the agentic-client invariant. + assertThat(function.path("arguments").isTextual(), is(true)); + assertThat(MAPPER.readTree(function.path("arguments").asText()).isObject(), is(true)); + } + + @Test + public void parallelToolCallsFalseIsAcceptedEndToEnd() throws IOException { + // parallel_tool_calls=false must flow HTTP -> OpenAiRequestMapper -> native without breaking the + // request; tool_choice=required still yields a well-formed tool call. 
+ String body = "{\"model\":\"" + MODEL_ID + "\",\"max_tokens\":64,\"tool_choice\":\"required\"," + + "\"parallel_tool_calls\":false," + + "\"messages\":[{\"role\":\"user\",\"content\":\"What is the weather in Paris?\"}]," + + TOOLS + "}"; + Response response = post(port, "/v1/chat/completions", body, ""); + assertThat(response.code, is(200)); + JsonNode toolCalls = MAPPER.readTree(response.body) + .path("choices") + .get(0) + .path("message") + .path("tool_calls"); + assertThat(toolCalls.isArray(), is(true)); + assertThat(toolCalls.size(), greaterThanOrEqualTo(1)); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java index 7c3bcffd..866efd25 100644 --- a/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java +++ b/src/test/java/net/ladenthin/llama/server/OpenAiSseFormatterTest.java @@ -50,6 +50,53 @@ public void errorJsonIncludesCodeWhenProvided() throws IOException { assertThat(error.path("code").asText(), is("E42")); } + @Test + public void ensureUsageCachedTokens_injectsWhenDetailsMissing() throws IOException { + String chunk = "{\"object\":\"chat.completion.chunk\",\"choices\":[]," + + "\"usage\":{\"prompt_tokens\":5,\"completion_tokens\":3,\"total_tokens\":8}}"; + JsonNode out = MAPPER.readTree(OpenAiSseFormatter.ensureUsageCachedTokens(chunk)); + assertThat( + out.path("usage") + .path("prompt_tokens_details") + .path("cached_tokens") + .asInt(), + is(0)); + } + + @Test + public void ensureUsageCachedTokens_injectsWhenDetailsPresentButNoCachedTokens() throws IOException { + String chunk = "{\"usage\":{\"prompt_tokens\":5,\"prompt_tokens_details\":{\"audio_tokens\":0}}}"; + JsonNode out = MAPPER.readTree(OpenAiSseFormatter.ensureUsageCachedTokens(chunk)); + JsonNode details = out.path("usage").path("prompt_tokens_details"); + assertThat(details.path("cached_tokens").asInt(), is(0)); + assertThat(details.path("audio_tokens").asInt(), is(0)); 
// pre-existing detail preserved + } + + @Test + public void ensureUsageCachedTokens_leavesAlreadyCorrectChunkUnchanged() { + String chunk = "{\"usage\":{\"prompt_tokens_details\":{\"cached_tokens\":4}}}"; + assertThat(OpenAiSseFormatter.ensureUsageCachedTokens(chunk), is(chunk)); + } + + @Test + public void ensureUsageCachedTokens_passesThroughDeltaChunkWithNullUsage() { + String chunk = "{\"choices\":[{\"delta\":{\"content\":\"hi\"}}],\"usage\":null}"; + assertThat(OpenAiSseFormatter.ensureUsageCachedTokens(chunk), is(chunk)); + } + + @Test + public void ensureUsageCachedTokens_passesThroughChunkWithoutUsage() { + String chunk = "{\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}"; + assertThat(OpenAiSseFormatter.ensureUsageCachedTokens(chunk), is(chunk)); + } + + @Test + public void ensureUsageCachedTokens_malformedUsageChunkReturnedUnchanged() { + // Contains a quoted "usage" (so it passes the fast path) but is not parseable — must not throw. + String chunk = "{\"usage\":{ broken"; + assertThat(OpenAiSseFormatter.ensureUsageCachedTokens(chunk), is(chunk)); + } + @Test public void modelsJsonAdvertisesTheConfiguredModel() throws IOException { JsonNode root = MAPPER.readTree(OpenAiSseFormatter.modelsJson("gemma-local")); @@ -57,4 +104,13 @@ public void modelsJsonAdvertisesTheConfiguredModel() throws IOException { assertThat(root.path("data").get(0).path("id").asText(), is("gemma-local")); assertThat(root.path("data").get(0).path("object").asText(), is("model")); } + + @Test + public void propsJsonReportsContextLengthAndModalities() throws IOException { + JsonNode root = MAPPER.readTree(OpenAiSseFormatter.propsJson("local", 8192, true)); + assertThat(root.path("default_generation_settings").path("n_ctx").asInt(), is(8192)); + assertThat(root.path("model_alias").asText(), is("local")); + assertThat(root.path("modalities").path("vision").asBoolean(), is(true)); + assertThat(root.path("modalities").path("audio").asBoolean(), is(false)); + } } diff --git 
a/src/test/java/net/ladenthin/llama/server/ResponsesApiSupportTest.java b/src/test/java/net/ladenthin/llama/server/ResponsesApiSupportTest.java new file mode 100644 index 00000000..213b711c --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/ResponsesApiSupportTest.java @@ -0,0 +1,143 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.is; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import java.io.IOException; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link ResponsesApiSupport}: the OpenAI Responses ↔ OpenAI chat request/response + * translation (input items, instructions, function tools, function_call output items). Pure — no model. + */ +public class ResponsesApiSupportTest { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private static JsonNode read(String json) throws IOException { + return MAPPER.readTree(json); + } + + @Test + public void isStreamingDefaultsFalse() throws IOException { + assertThat(ResponsesApiSupport.isStreaming(read("{}")), is(false)); + assertThat(ResponsesApiSupport.isStreaming(read("{\"stream\":true}")), is(true)); + } + + @Test + public void requestMapsInstructionsStringInputAndTools() throws IOException { + JsonNode openAi = ResponsesApiSupport.toOpenAiChatRequest(read("{\"model\":\"m\"," + + "\"instructions\":\"be brief\",\"input\":\"hi\",\"max_output_tokens\":32," + + "\"tools\":[{\"type\":\"function\",\"name\":\"f\",\"parameters\":{\"type\":\"object\"}}]}")); + assertThat(openAi.path("messages").get(0).path("role").asText(), is("system")); + assertThat(openAi.path("messages").get(0).path("content").asText(), is("be brief")); + assertThat(openAi.path("messages").get(1).path("role").asText(), is("user")); + 
assertThat(openAi.path("messages").get(1).path("content").asText(), is("hi")); + // max_output_tokens -> max_tokens. + assertThat(openAi.path("max_tokens").asInt(), is(32)); + // Responses function tool (flat) -> OpenAI nested function tool. + assertThat(openAi.path("tools").get(0).path("function").path("name").asText(), is("f")); + assertThat( + openAi.path("tools") + .get(0) + .path("function") + .path("parameters") + .path("type") + .asText(), + is("object")); + } + + @Test + public void requestMapsInputArrayMessageAndFunctionCallItems() throws IOException { + String responses = "{\"input\":[" + + "{\"type\":\"message\",\"role\":\"user\",\"content\":[{\"type\":\"input_text\",\"text\":\"q\"}]}," + + "{\"type\":\"function_call\",\"call_id\":\"c1\",\"name\":\"f\",\"arguments\":\"{}\"}," + + "{\"type\":\"function_call_output\",\"call_id\":\"c1\",\"output\":\"ok\"}]}"; + JsonNode openAi = ResponsesApiSupport.toOpenAiChatRequest(read(responses)); + assertThat(openAi.path("messages").get(0).path("content").asText(), is("q")); + // function_call -> assistant tool_calls + JsonNode toolCall = openAi.path("messages").get(1).path("tool_calls").get(0); + assertThat(toolCall.path("id").asText(), is("c1")); + assertThat(toolCall.path("function").path("name").asText(), is("f")); + // function_call_output -> role:"tool" message + assertThat(openAi.path("messages").get(2).path("role").asText(), is("tool")); + assertThat(openAi.path("messages").get(2).path("tool_call_id").asText(), is("c1")); + assertThat(openAi.path("messages").get(2).path("content").asText(), is("ok")); + } + + @Test + public void requestWithoutInstructionsHasNoSystemMessage() throws IOException { + JsonNode openAi = ResponsesApiSupport.toOpenAiChatRequest(read("{\"input\":\"hi\"}")); + // First (and only) message is the user input — no leading system message. 
+ assertThat(openAi.path("messages").size(), is(1)); + assertThat(openAi.path("messages").get(0).path("role").asText(), is("user")); + assertThat(openAi.path("messages").get(0).path("content").asText(), is("hi")); + } + + @Test + public void requestMapsAssistantMessageItemAndSkipsNonFunctionTools() throws IOException { + JsonNode openAi = ResponsesApiSupport.toOpenAiChatRequest(read("{\"input\":[" + + "{\"type\":\"message\",\"role\":\"assistant\"," + + "\"content\":[{\"type\":\"output_text\",\"text\":\"prior\"}]}]," + + "\"tools\":[{\"type\":\"web_search\"},{\"type\":\"function\",\"name\":\"f\"}]}")); + assertThat(openAi.path("messages").get(0).path("role").asText(), is("assistant")); + assertThat(openAi.path("messages").get(0).path("content").asText(), is("prior")); + // Only the function tool survives; the non-function (web_search) tool is dropped. + assertThat(openAi.path("tools").size(), is(1)); + assertThat(openAi.path("tools").get(0).path("function").path("name").asText(), is("f")); + } + + @Test + public void responseWrapsOutputMessageWithOutputTextAndUsage() throws IOException { + String openAi = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"hello\"}," + + "\"finish_reason\":\"stop\"}],\"usage\":{\"prompt_tokens\":4,\"completion_tokens\":1}}"; + JsonNode out = read(ResponsesApiSupport.toResponsesResponse(openAi, "m", "resp_1")); + assertThat(out.path("object").asText(), is("response")); + assertThat(out.path("status").asText(), is("completed")); + JsonNode messageItem = out.path("output").get(0); + assertThat(messageItem.path("type").asText(), is("message")); + assertThat(messageItem.path("content").get(0).path("type").asText(), is("output_text")); + assertThat(messageItem.path("content").get(0).path("text").asText(), is("hello")); + assertThat(out.path("usage").path("input_tokens").asInt(), is(4)); + assertThat(out.path("usage").path("total_tokens").asInt(), is(5)); + } + + @Test + public void 
responseEmitsFunctionCallItemsForToolCalls() throws IOException { + String openAi = "{\"choices\":[{\"message\":{\"role\":\"assistant\",\"content\":\"\"," + + "\"tool_calls\":[{\"id\":\"c1\",\"type\":\"function\",\"function\":{\"name\":\"f\"," + + "\"arguments\":\"{\\\"a\\\":1}\"}}]},\"finish_reason\":\"tool_calls\"}]}"; + JsonNode out = read(ResponsesApiSupport.toResponsesResponse(openAi, "m", "resp_1")); + // output[0] is the (empty) message, output[1] is the function_call item. + JsonNode functionCall = out.path("output").get(1); + assertThat(functionCall.path("type").asText(), is("function_call")); + assertThat(functionCall.path("call_id").asText(), is("c1")); + assertThat(functionCall.path("name").asText(), is("f")); + assertThat(functionCall.path("arguments").asText(), is("{\"a\":1}")); + } + + @Test + public void requestForwardsToolChoiceAndParallelToolCalls() throws IOException { + // The Responses API uses the same tool_choice + parallel_tool_calls fields as chat. + JsonNode openAi = ResponsesApiSupport.toOpenAiChatRequest(read("{\"model\":\"m\",\"input\":\"hi\"," + + "\"tools\":[{\"type\":\"function\",\"name\":\"f\",\"parameters\":{\"type\":\"object\"}}]," + + "\"tool_choice\":\"required\",\"parallel_tool_calls\":false}")); + assertThat(openAi.path("tool_choice").asText(), is("required")); + assertThat(openAi.path("parallel_tool_calls").isBoolean(), is(true)); + assertThat(openAi.path("parallel_tool_calls").asBoolean(), is(false)); + } + + @Test + public void requestOmitsToolChoiceAndParallelToolCallsWhenAbsent() throws IOException { + JsonNode openAi = ResponsesApiSupport.toOpenAiChatRequest(read("{\"model\":\"m\",\"input\":\"hi\"," + + "\"tools\":[{\"type\":\"function\",\"name\":\"f\",\"parameters\":{\"type\":\"object\"}}]}")); + assertThat(openAi.has("tool_choice"), is(false)); + assertThat(openAi.has("parallel_tool_calls"), is(false)); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/ResponsesStreamTranslatorTest.java 
b/src/test/java/net/ladenthin/llama/server/ResponsesStreamTranslatorTest.java new file mode 100644 index 00000000..d4288370 --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/ResponsesStreamTranslatorTest.java @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.is; + +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link ResponsesStreamTranslator}: the OpenAI-chunk to Responses-SSE-event sequence + * (response.created → output_item/content_part → output_text.delta* → done events → response.completed). + * Pure. + */ +public class ResponsesStreamTranslatorTest { + + @Test + public void beginEmitsResponseCreated() { + ResponsesStreamTranslator translator = new ResponsesStreamTranslator("m", "resp_1"); + String begin = translator.begin(); + assertThat(begin, containsString("event: response.created")); + assertThat(begin, containsString("\"sequence_number\":0")); + } + + @Test + public void firstTextDeltaOpensItemAndPartThenStreamsDelta() { + ResponsesStreamTranslator translator = new ResponsesStreamTranslator("m", "resp_1"); + String first = translator.onChunk("{\"choices\":[{\"delta\":{\"content\":\"he\"}}]}"); + assertThat(first, containsString("event: response.output_item.added")); + assertThat(first, containsString("event: response.content_part.added")); + assertThat(first, containsString("event: response.output_text.delta")); + assertThat(first, containsString("\"delta\":\"he\"")); + String second = translator.onChunk("{\"choices\":[{\"delta\":{\"content\":\"llo\"}}]}"); + assertThat(second.contains("output_item.added"), is(false)); + assertThat(second, containsString("\"delta\":\"llo\"")); + } + + @Test + public void endEmitsDoneEventsAndCompleted() { + ResponsesStreamTranslator translator = new 
ResponsesStreamTranslator("m", "resp_1"); + translator.onChunk("{\"choices\":[{\"delta\":{\"content\":\"hi\"},\"finish_reason\":\"stop\"}]}"); + String end = translator.end(); + assertThat(end, containsString("event: response.output_text.done")); + assertThat(end, containsString("event: response.content_part.done")); + assertThat(end, containsString("event: response.output_item.done")); + assertThat(end, containsString("event: response.completed")); + assertThat(end, containsString("\"text\":\"hi\"")); + } + + @Test + public void toolCallsBecomeFunctionCallItemsBeforeCompleted() { + ResponsesStreamTranslator translator = new ResponsesStreamTranslator("m", "resp_1"); + translator.onChunk("{\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"c1\"," + + "\"function\":{\"name\":\"f\",\"arguments\":\"{}\"}}]},\"finish_reason\":\"tool_calls\"}]}"); + String end = translator.end(); + assertThat(end, containsString("\"type\":\"function_call\"")); + assertThat(end, containsString("event: response.function_call_arguments.done")); + assertThat(end, containsString("event: response.completed")); + } +} diff --git a/src/test/java/net/ladenthin/llama/server/ToolCallDeltaAccumulatorTest.java b/src/test/java/net/ladenthin/llama/server/ToolCallDeltaAccumulatorTest.java new file mode 100644 index 00000000..2a82d37f --- /dev/null +++ b/src/test/java/net/ladenthin/llama/server/ToolCallDeltaAccumulatorTest.java @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin +// +// SPDX-License-Identifier: MIT + +package net.ladenthin.llama.server; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.is; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ArrayNode; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link ToolCallDeltaAccumulator}: reconstructing whole tool calls from fragmented + * OpenAI streaming {@code delta.tool_calls}. Pure — JSON literals only. 
+ */ +public class ToolCallDeltaAccumulatorTest { + + @Test + public void mergesNameAndArgumentFragmentsByIndex() { + ToolCallDeltaAccumulator accumulator = new ToolCallDeltaAccumulator(); + accumulator.accept("{\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"id\":\"call_1\"," + + "\"function\":{\"name\":\"get_weather\",\"arguments\":\"{\\\"city\\\":\"}}]}}]}"); + accumulator.accept( + "{\"choices\":[{\"delta\":{\"tool_calls\":[{\"index\":0,\"function\":{\"arguments\":\"\\\"Paris\\\"}\"}}]}}]}"); + assertThat(accumulator.hasToolCalls(), is(true)); + ArrayNode toolCalls = accumulator.toOpenAiToolCalls(); + assertThat(toolCalls.size(), is(1)); + JsonNode toolCall = toolCalls.get(0); + assertThat(toolCall.path("id").asText(), is("call_1")); + assertThat(toolCall.path("type").asText(), is("function")); + assertThat(toolCall.path("function").path("name").asText(), is("get_weather")); + // Arguments are the concatenated JSON-encoded string (not a parsed object). + assertThat(toolCall.path("function").path("arguments").isTextual(), is(true)); + assertThat(toolCall.path("function").path("arguments").asText(), is("{\"city\":\"Paris\"}")); + } + + @Test + public void tracksMultipleParallelToolCallsInIndexOrder() { + ToolCallDeltaAccumulator accumulator = new ToolCallDeltaAccumulator(); + accumulator.accept("{\"choices\":[{\"delta\":{\"tool_calls\":[" + + "{\"index\":1,\"id\":\"b\",\"function\":{\"name\":\"two\",\"arguments\":\"{}\"}}," + + "{\"index\":0,\"id\":\"a\",\"function\":{\"name\":\"one\",\"arguments\":\"{}\"}}]}}]}"); + ArrayNode toolCalls = accumulator.toOpenAiToolCalls(); + assertThat(toolCalls.size(), is(2)); + // Emitted in index order (0 then 1), regardless of arrival order. 
+ assertThat(toolCalls.get(0).path("id").asText(), is("a")); + assertThat(toolCalls.get(1).path("id").asText(), is("b")); + } + + @Test + public void ignoresChunksWithoutToolCalls() { + ToolCallDeltaAccumulator accumulator = new ToolCallDeltaAccumulator(); + accumulator.accept("{\"choices\":[{\"delta\":{\"content\":\"hello\"}}]}"); + accumulator.accept("not json"); + assertThat(accumulator.hasToolCalls(), is(false)); + assertThat(accumulator.toOpenAiToolCalls().size(), is(0)); + } +}