bernardladenthin · bernardladenthin · Jun 20, 2026 · Jun 20, 2026
@@ -9,7 +9,7 @@ REM GGUF files start with magic bytes: 0x47 0x47 0x55 0x46 ("GGUF")
 
 setlocal enabledelayedexpansion
 
-set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf"
+set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf" "models\Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
 
 REM Vision GGUFs are validated only when present (the Windows job downloads
 REM them too, but the validation step must not fail when a future job opts

@@ -15,6 +15,7 @@ MODELS=(
   "models/jina-reranker-v1-tiny-en-Q4_0.gguf"
   "models/AMD-Llama-135m-code.Q2_K.gguf"
   "models/Qwen3-0.6B-Q4_K_M.gguf"
+  "models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
 )
 
 # Optional GGUFs validated only when present so jobs that do not download

@@ -25,6 +25,8 @@ env:
   DRAFT_MODEL_NAME: "AMD-Llama-135m-code.Q2_K.gguf"
   REASONING_MODEL_URL: "https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
   REASONING_MODEL_NAME: "Qwen3-0.6B-Q4_K_M.gguf"
+  TOOL_MODEL_URL: "https://huggingface.co/bartowski/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
+  TOOL_MODEL_NAME: "Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
   NOMIC_EMBED_MODEL_URL: "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf"
   NOMIC_EMBED_MODEL_NAME: "nomic-embed-text-v1.5.f16.gguf"
   # Vision model + mmproj for MultimodalIntegrationTest (issues #103 / #34).
@@ -405,6 +407,8 @@ jobs:
         run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
       - name: Download reasoning model
         run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
+      - name: Download tool-calling model
+        run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
       - name: Download nomic embedding model (issue #98 regression)
         run: curl -L --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME}
       - name: Download vision model (issues #103 / #34)
@@ -428,6 +432,7 @@ jobs:
       - name: Run tests
         run: |
           mvn -e --no-transfer-progress -P jcstress test \
+            -Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \
             -Dnet.ladenthin.llama.nomic.path=models/${NOMIC_EMBED_MODEL_NAME} \
             -Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \
             -Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \
@@ -526,6 +531,8 @@ jobs:
         run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
       - name: Download reasoning model
         run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
+      - name: Download tool-calling model
+        run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
       - name: Download vision model (issues #103 / #34)
         run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
       - name: Download vision mmproj
@@ -545,6 +552,7 @@ jobs:
       - name: Run tests
         run: |
           mvn -e --no-transfer-progress -Dnet.ladenthin.llama.test.ngl=0 test \
+            -Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \
             -Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \
             -Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \
             -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH}
@@ -590,6 +598,8 @@ jobs:
         run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
       - name: Download reasoning model
         run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
+      - name: Download tool-calling model
+        run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
       - name: Download vision model (issues #103 / #34)
         run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
       - name: Download vision mmproj
@@ -609,6 +619,7 @@ jobs:
       - name: Run tests
         run: |
           mvn -e --no-transfer-progress test \
+            -Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \
             -Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \
             -Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \
             -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH}
@@ -654,6 +665,8 @@ jobs:
         run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
       - name: Download reasoning model
         run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
+      - name: Download tool-calling model
+        run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
       - name: Download vision model (issues #103 / #34)
         run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
       - name: Download vision mmproj
@@ -673,6 +686,7 @@ jobs:
       - name: Run tests
         run: |
           mvn -e --no-transfer-progress test \
+            -Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \
             -Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \
             -Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \
             -Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH}
@@ -721,6 +735,8 @@ jobs:
         run: curl -L --fail --retry 5 --retry-all-errors $env:DRAFT_MODEL_URL --create-dirs -o models/$env:DRAFT_MODEL_NAME
       - name: Download reasoning model
         run: curl -L --fail --retry 5 --retry-all-errors $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME
+      - name: Download tool-calling model
+        run: curl -L --fail --retry 5 --retry-all-errors $env:TOOL_MODEL_URL --create-dirs -o models/$env:TOOL_MODEL_NAME
       - name: Download vision model (issues #103 / #34)
         run: curl -L --fail --retry 5 --retry-all-errors $env:VISION_MODEL_URL --create-dirs -o models/$env:VISION_MODEL_NAME
       - name: Download vision mmproj
@@ -756,6 +772,7 @@ jobs:
       - name: Run tests
         run: |
           mvn -e --no-transfer-progress test `
+            "-Dnet.ladenthin.llama.tool.model=models/$env:TOOL_MODEL_NAME" `
             "-Dnet.ladenthin.llama.vision.model=models/$env:VISION_MODEL_NAME" `
             "-Dnet.ladenthin.llama.vision.mmproj=models/$env:VISION_MMPROJ_NAME" `
             "-Dnet.ladenthin.llama.vision.image=$env:VISION_IMAGE_PATH"

@@ -51,4 +51,8 @@ src/test/resources/**/*.gbnf
 src/main/cpp/llama.cpp/
 
 # jcstress / jqwik test outputs (generated in repo root)
-/.jqwik-database
+/.jqwik-database
+
+# Local AI agent tooling (not part of the project)
+AGENTS.md
+.agents/
@@ -13,13 +13,16 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by
 - `CODE_OF_CONDUCT.md` (Contributor Covenant 2.0).
 - `docs/RELEASE.md` capturing the maintainer-facing release procedure (moved out of CHANGELOG).
 - OpenSSF Best Practices badge (project 12862) on README.
+- OpenAI-compatible `parallel_tool_calls` support: `ChatRequest.withParallelToolCalls(Boolean)` / `getParallelToolCalls()`, `InferenceParameters.withParallelToolCalls(boolean)`, and pass-through in the `/v1/chat/completions` server mapper.
+- Real-model tool-calling integration tests for blocking and streaming required tool calls (`ToolCallingIntegrationTest`, Qwen2.5-1.5B-Instruct), wired into CI and `validate-models`.
 
 ### Changed
 - Unified `CONTRIBUTING.md` and `SECURITY.md` structure with sibling repositories in the project family.
 - Reconciled Java baseline to **11+** across `pom.xml`, README badge, `CLAUDE.md`, and `CONTRIBUTING.md`.
 - README license badge corrected from "Apache 2.0" to "MIT" (matches `LICENSE` file and `pom.xml`).
 - `pom.xml` SCM URL: `tree/master` → `tree/main` (default branch renamed).
 - Upgraded llama.cpp from b9151 to b9172.
+- Extracted the `chatWithTools` agent loop into `ToolCallingAgent`; tool-result errors (unknown tool / handler exception) are now JSON-serialized so tool names containing special characters remain valid JSON.
 
 ### Added
 - Reasoning-budget tests (Qwen3-0.6B).

@@ -259,7 +259,8 @@ Every `net.ladenthin.llama.*` system property recognised by the library, deep-sc
 | `net.ladenthin.llama.lib.path` | unset (falls back to `java.library.path`) | runtime | `LlamaLoader` | Directory containing the native `jllama` shared library. Checked first, before `java.library.path`. Set with `-Dnet.ladenthin.llama.lib.path=/path/to/dir`. |
 | `net.ladenthin.llama.tmpdir` | unset (falls back to `java.io.tmpdir`) | runtime | `LlamaLoader` | Custom temporary directory used when extracting the native library from the JAR. |
 | `net.ladenthin.llama.osinfo.architecture` | unset (uses `os.arch`) | runtime | `OSInfo` | Override for the architecture string used to locate the bundled library inside the JAR. Useful when `os.arch` reports an unexpected value (e.g. inside dockcross / chrooted environments). |
-| `net.ladenthin.llama.test.ngl` | `43` | test | `LlamaModelTest`, `RerankingModelTest`, `ChatScenarioTest`, `ChatAdvancedTest`, `ErrorHandlingTest`, `SessionConcurrencyTest`, `ConfigureParallelInferenceTest`, `MultimodalIntegrationTest` (via `Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL)`) | Number of GPU layers used during testing. Pin to `0` on CPU-only hosts: `mvn test -Dnet.ladenthin.llama.test.ngl=0`. |
+| `net.ladenthin.llama.test.ngl` | `43` for the general suite; `0` for `ToolCallingIntegrationTest` | test | Model-backed integration tests | Number of GPU layers used during testing. Pin to `0` on CPU-only hosts: `mvn test -Dnet.ladenthin.llama.test.ngl=0`. The tool test also selects device `none` at zero layers so Metal/CUDA is not initialized. |
+| `net.ladenthin.llama.tool.model` | `models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf` (test self-skips if missing) | test | `ToolCallingIntegrationTest` | Path to a tool-capable GGUF used to verify required blocking and streaming tool calls. The default matches the Qwen2.5 model in upstream llama.cpp's tool-call test matrix. |
 | `net.ladenthin.llama.nomic.path` | unset (test self-skips) | test | `LlamaEmbeddingsTest#testNomicEmbedLoads` | Path to a Nomic embedding model (`nomic-embed-text-v1.5.f16.gguf` or a compatible BERT-family encoder). Regression test for upstream issue #98 (BERT-encoder `result_output` assertion). |
 | `net.ladenthin.llama.vision.model` | unset (test self-skips) | test | `MultimodalIntegrationTest` (closes #103 / #34) | Path to a vision-capable model GGUF. Any vision-capable GGUF works; CI default is `SmolVLM-500M-Instruct-Q8_0.gguf`. |
 | `net.ladenthin.llama.vision.mmproj` | unset (test self-skips) | test | `MultimodalIntegrationTest` | Matching mmproj GGUF for the vision model. |
@@ -368,6 +369,40 @@ try (LlamaModel model = new LlamaModel(modelParams)) {
 Reasoning/thinking models can receive custom Jinja template variables via
 `ModelParameters#setChatTemplateKwargs(Map)`.
 
+### Tool Calling
+
+Use a tool-aware instruct model and enable Jinja when loading it. With a typed `ChatRequest`, `chat`
+returns the model's tool calls for you to handle yourself, while `chatWithTools` runs the registered
+handlers and re-queries the model until it produces a normal assistant response or the request's
+tool-round limit is reached:
+
+```java
+ToolDefinition weather = new ToolDefinition(
+        "get_weather",
+        "Get the current weather for a city",
+        "{\"type\":\"object\",\"properties\":{\"city\":{\"type\":\"string\"}},"
+                + "\"required\":[\"city\"]}");
+
+ChatRequest request = ChatRequest.empty()
+        .appendMessage("user", "What is the weather in Paris?")
+        .appendTool(weather)
+        .withToolChoice("auto")
+        .withParallelToolCalls(Boolean.FALSE);
+
+Map<String, ToolHandler> handlers = Collections.singletonMap(
+        "get_weather", argumentsJson -> "{\"temperature_c\":21,\"condition\":\"sunny\"}");
+
+try (LlamaModel model = new LlamaModel(new ModelParameters()
+        .setModel("models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf")
+        .enableJinja())) {
+    ChatResponse response = model.chatWithTools(request, handlers);
+    System.out.println(response.getFirstContent());
+}
+```
+
+`tool_choice` is the OpenAI-compatible string form (`auto`, `none`, or `required`). Set
+`parallel_tool_calls` to `false` (`withParallelToolCalls(Boolean.FALSE)`) when the model should request
+tool calls one at a time. Handler failures and unknown tool names are returned to the model as valid
+`{"error":"..."}` tool-result JSON.
+
 ### Infilling
 
 You can simply set `InferenceParameters#setInputPrefix(String)` and `InferenceParameters#setInputSuffix(String)`.

@@ -29,7 +29,6 @@
 import net.ladenthin.llama.parameters.ChatRequest;
 import net.ladenthin.llama.parameters.InferenceParameters;
 import net.ladenthin.llama.parameters.ModelParameters;
-import net.ladenthin.llama.value.ChatMessage;
 import net.ladenthin.llama.value.ChatResponse;
 import net.ladenthin.llama.value.CompletionResult;
 import net.ladenthin.llama.value.LlamaOutput;
@@ -38,7 +37,6 @@
 import net.ladenthin.llama.value.Pair;
 import net.ladenthin.llama.value.ServerMetrics;
 import net.ladenthin.llama.value.StopReason;
-import net.ladenthin.llama.value.ToolCall;
 import org.jspecify.annotations.Nullable;
 
 /**
@@ -551,6 +549,10 @@ public ChatResponse chat(ChatRequest request) {
             if (toolChoice.isPresent()) {
                 params = params.withToolChoice(toolChoice.get());
             }
+            Optional<Boolean> parallelToolCalls = request.getParallelToolCalls();
+            if (parallelToolCalls.isPresent()) {
+                params = params.withParallelToolCalls(parallelToolCalls.get());
+            }
         }
         params = request.applyCustomizer(params);
         String raw = chatComplete(params);
@@ -575,42 +577,7 @@ public ChatResponse chat(ChatRequest request) {
      *         (or the last response when the round cap is hit)
      */
     public ChatResponse chatWithTools(ChatRequest request, java.util.Map<String, ToolHandler> handlers) {
-        final int maxRounds = request.getMaxToolRounds();
-        if (maxRounds < 1) {
-            throw new IllegalArgumentException("ChatRequest.maxToolRounds must be >= 1 (got " + maxRounds + "); "
-                    + "chatWithTools always issues at least one chat call.");
-        }
-        ChatRequest current = request;
-        ChatResponse last = chat(current);
-        for (int round = 1; round < maxRounds; round++) {
-            Optional<ChatMessage> assistantOpt = last.getFirstMessage();
-            // NOTE: inline !isPresent() here (not compatibilityHelper.isEmpty) so NullAway's
-            //       CheckOptionalEmptiness recognises this as null-narrowing for the .get() below.
-            if (!assistantOpt.isPresent() || assistantOpt.get().getToolCalls().isEmpty()) {
-                return last;
-            }
-            ChatMessage assistant = assistantOpt.get();
-            current = current.appendMessage(assistant);
-            for (ToolCall call : assistant.getToolCalls()) {
-                ToolHandler handler = handlers.get(call.getName());
-                String result;
-                if (handler == null) {
-                    result = "{\"error\":\"unknown tool: " + call.getName() + "\"}";
-                } else {
-                    try {
-                        result = handler.invoke(call.getArgumentsJson());
-                    } catch (Exception e) {
-                        result = "{\"error\":"
-                                + net.ladenthin.llama.json.ChatResponseParser.OBJECT_MAPPER.valueToTree(
-                                        e.getClass().getSimpleName() + ": " + e.getMessage())
-                                + "}";
-                    }
-                }
-                current = current.appendMessage(ChatMessage.toolResult(call.getId(), result));
-            }
-            last = chat(current);
-        }
-        return last;
+        return ToolCallingAgent.run(request, handlers, this::chat);
     }
 
     /**

@@ -0,0 +1,65 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.util.Map;
+import java.util.Optional;
+import java.util.function.Function;
+import net.ladenthin.llama.callback.ToolHandler;
+import net.ladenthin.llama.parameters.ChatRequest;
+import net.ladenthin.llama.value.ChatMessage;
+import net.ladenthin.llama.value.ChatResponse;
+import net.ladenthin.llama.value.ToolCall;
+
+/**
+ * Model-independent orchestration for the tool-calling agent loop.
+ *
+ * <p>The loop only needs a way to turn a {@link ChatRequest} into a {@link ChatResponse}, so the
+ * chat call is injected as a {@link Function}; nothing in this class talks to a model directly.
+ */
+final class ToolCallingAgent {
+
+    /** Used solely to serialize error tool results with correct JSON escaping. */
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+
+    private ToolCallingAgent() {}
+
+    /**
+     * Runs the chat / tool-execution loop until the model stops requesting tools or the round cap
+     * is reached.
+     *
+     * <p>Each round issues one chat call. While the first message of the response carries tool
+     * calls, that message and one tool-result message per call are appended to the conversation
+     * and the model is asked again. {@code request.getMaxToolRounds()} bounds the total number of
+     * chat calls, so tool calls contained in the final permitted response are returned to the
+     * caller without being executed.
+     *
+     * @param request initial conversation; its {@code maxToolRounds} must be at least 1
+     * @param handlers tool handlers keyed by tool name; a missing entry yields an error tool result
+     * @param chatCall performs a single chat completion for the given request
+     * @return the first response without tool calls, or the last response when the round cap is hit
+     * @throws IllegalArgumentException if {@code request.getMaxToolRounds()} is less than 1
+     */
+    static ChatResponse run(
+            ChatRequest request, Map<String, ToolHandler> handlers, Function<ChatRequest, ChatResponse> chatCall) {
+        final int maxRounds = request.getMaxToolRounds();
+        if (maxRounds < 1) {
+            throw new IllegalArgumentException("ChatRequest.maxToolRounds must be >= 1 (got " + maxRounds + "); "
+                    + "chatWithTools always issues at least one chat call.");
+        }
+
+        ChatRequest current = request;
+        // This first call counts as a round, so at most maxRounds - 1 follow-up calls remain.
+        ChatResponse last = chatCall.apply(current);
+        for (int round = 1; round < maxRounds; round++) {
+            Optional<ChatMessage> assistantOpt = last.getFirstMessage();
+            // NOTE: keep the inline !isPresent() check so NullAway's CheckOptionalEmptiness
+            //       recognises this as null-narrowing for the .get() calls below.
+            if (!assistantOpt.isPresent() || assistantOpt.get().getToolCalls().isEmpty()) {
+                return last;
+            }
+
+            ChatMessage assistant = assistantOpt.get();
+            // The assistant message must precede the tool results that answer its tool calls.
+            current = current.appendMessage(assistant);
+            for (ToolCall call : assistant.getToolCalls()) {
+                current = current.appendMessage(ChatMessage.toolResult(call.getId(), invoke(call, handlers)));
+            }
+            last = chatCall.apply(current);
+        }
+        return last;
+    }
+
+    /**
+     * Executes a single tool call and always yields a tool-result string.
+     *
+     * <p>An unregistered tool name and any exception thrown by the handler are both converted into
+     * an error JSON object instead of being propagated; the caller hands that object back to the
+     * model as the tool result.
+     *
+     * @param call the tool call requested by the model
+     * @param handlers tool handlers keyed by tool name
+     * @return the handler's result, or an error JSON object describing the failure
+     */
+    private static String invoke(ToolCall call, Map<String, ToolHandler> handlers) {
+        ToolHandler handler = handlers.get(call.getName());
+        if (handler == null) {
+            return errorJson("unknown tool: " + call.getName());
+        }
+        try {
+            return handler.invoke(call.getArgumentsJson());
+        } catch (Exception e) {
+            if (e instanceof InterruptedException) {
+                // The interrupt flag is cleared when InterruptedException is thrown; restore it so
+                // the caller's cancellation request is not lost while the failure is reported.
+                Thread.currentThread().interrupt();
+            }
+            return errorJson(e.getClass().getSimpleName() + ": " + e.getMessage());
+        }
+    }
+
+    /** Builds an object with a single {@code error} field; Jackson escapes the message text. */
+    private static String errorJson(String message) {
+        return MAPPER.createObjectNode().put("error", message).toString();
+    }
+}