Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/validate-models.bat
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ REM GGUF files start with magic bytes: 0x47 0x47 0x55 0x46 ("GGUF")

setlocal enabledelayedexpansion

set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf"
set "MODELS=models\codellama-7b.Q2_K.gguf" "models\jina-reranker-v1-tiny-en-Q4_0.gguf" "models\AMD-Llama-135m-code.Q2_K.gguf" "models\Qwen3-0.6B-Q4_K_M.gguf" "models\Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"

REM Vision GGUFs are validated only when present (the Windows job downloads
REM them too, but the validation step must not fail when a future job opts
Expand Down
1 change: 1 addition & 0 deletions .github/validate-models.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ MODELS=(
"models/jina-reranker-v1-tiny-en-Q4_0.gguf"
"models/AMD-Llama-135m-code.Q2_K.gguf"
"models/Qwen3-0.6B-Q4_K_M.gguf"
"models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
)

# Optional GGUFs validated only when present so jobs that do not download
Expand Down
17 changes: 17 additions & 0 deletions .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ env:
DRAFT_MODEL_NAME: "AMD-Llama-135m-code.Q2_K.gguf"
REASONING_MODEL_URL: "https://huggingface.co/unsloth/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q4_K_M.gguf"
REASONING_MODEL_NAME: "Qwen3-0.6B-Q4_K_M.gguf"
TOOL_MODEL_URL: "https://huggingface.co/bartowski/Qwen2.5-1.5B-Instruct-GGUF/resolve/main/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
TOOL_MODEL_NAME: "Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
NOMIC_EMBED_MODEL_URL: "https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/nomic-embed-text-v1.5.f16.gguf"
NOMIC_EMBED_MODEL_NAME: "nomic-embed-text-v1.5.f16.gguf"
# Vision model + mmproj for MultimodalIntegrationTest (issues #103 / #34).
Expand Down Expand Up @@ -405,6 +407,8 @@ jobs:
run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
- name: Download reasoning model
run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
- name: Download tool-calling model
run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
- name: Download nomic embedding model (issue #98 regression)
run: curl -L --fail --retry 5 --retry-all-errors ${NOMIC_EMBED_MODEL_URL} --create-dirs -o models/${NOMIC_EMBED_MODEL_NAME}
- name: Download vision model (issues #103 / #34)
Expand All @@ -428,6 +432,7 @@ jobs:
- name: Run tests
run: |
mvn -e --no-transfer-progress -P jcstress test \
-Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \
-Dnet.ladenthin.llama.nomic.path=models/${NOMIC_EMBED_MODEL_NAME} \
-Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \
-Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \
Expand Down Expand Up @@ -526,6 +531,8 @@ jobs:
run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
- name: Download reasoning model
run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
- name: Download tool-calling model
run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
- name: Download vision model (issues #103 / #34)
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
- name: Download vision mmproj
Expand All @@ -545,6 +552,7 @@ jobs:
- name: Run tests
run: |
mvn -e --no-transfer-progress -Dnet.ladenthin.llama.test.ngl=0 test \
-Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \
-Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \
-Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \
-Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH}
Expand Down Expand Up @@ -590,6 +598,8 @@ jobs:
run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
- name: Download reasoning model
run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
- name: Download tool-calling model
run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
- name: Download vision model (issues #103 / #34)
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
- name: Download vision mmproj
Expand All @@ -609,6 +619,7 @@ jobs:
- name: Run tests
run: |
mvn -e --no-transfer-progress test \
-Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \
-Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \
-Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \
-Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH}
Expand Down Expand Up @@ -654,6 +665,8 @@ jobs:
run: curl -L --fail --retry 5 --retry-all-errors ${DRAFT_MODEL_URL} --create-dirs -o models/${DRAFT_MODEL_NAME}
- name: Download reasoning model
run: curl -L --fail --retry 5 --retry-all-errors ${REASONING_MODEL_URL} --create-dirs -o models/${REASONING_MODEL_NAME}
- name: Download tool-calling model
run: curl -L --fail --retry 5 --retry-all-errors ${TOOL_MODEL_URL} --create-dirs -o models/${TOOL_MODEL_NAME}
- name: Download vision model (issues #103 / #34)
run: curl -L --fail --retry 5 --retry-all-errors ${VISION_MODEL_URL} --create-dirs -o models/${VISION_MODEL_NAME}
- name: Download vision mmproj
Expand All @@ -673,6 +686,7 @@ jobs:
- name: Run tests
run: |
mvn -e --no-transfer-progress test \
-Dnet.ladenthin.llama.tool.model=models/${TOOL_MODEL_NAME} \
-Dnet.ladenthin.llama.vision.model=models/${VISION_MODEL_NAME} \
-Dnet.ladenthin.llama.vision.mmproj=models/${VISION_MMPROJ_NAME} \
-Dnet.ladenthin.llama.vision.image=${VISION_IMAGE_PATH}
Expand Down Expand Up @@ -721,6 +735,8 @@ jobs:
run: curl -L --fail --retry 5 --retry-all-errors $env:DRAFT_MODEL_URL --create-dirs -o models/$env:DRAFT_MODEL_NAME
- name: Download reasoning model
run: curl -L --fail --retry 5 --retry-all-errors $env:REASONING_MODEL_URL --create-dirs -o models/$env:REASONING_MODEL_NAME
- name: Download tool-calling model
run: curl -L --fail --retry 5 --retry-all-errors $env:TOOL_MODEL_URL --create-dirs -o models/$env:TOOL_MODEL_NAME
- name: Download vision model (issues #103 / #34)
run: curl -L --fail --retry 5 --retry-all-errors $env:VISION_MODEL_URL --create-dirs -o models/$env:VISION_MODEL_NAME
- name: Download vision mmproj
Expand Down Expand Up @@ -756,6 +772,7 @@ jobs:
- name: Run tests
run: |
mvn -e --no-transfer-progress test `
"-Dnet.ladenthin.llama.tool.model=models/$env:TOOL_MODEL_NAME" `
"-Dnet.ladenthin.llama.vision.model=models/$env:VISION_MODEL_NAME" `
"-Dnet.ladenthin.llama.vision.mmproj=models/$env:VISION_MMPROJ_NAME" `
"-Dnet.ladenthin.llama.vision.image=$env:VISION_IMAGE_PATH"
Expand Down
6 changes: 5 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,8 @@ src/test/resources/**/*.gbnf
src/main/cpp/llama.cpp/

# jcstress / jqwik test outputs (generated in repo root)
/.jqwik-database
/.jqwik-database

# Local AI agent tooling (not part of the project)
AGENTS.md
.agents/
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@ from version 5.0.0 onward. Pre-fork releases (`1.x`–`4.2.0`) were authored by
- `CODE_OF_CONDUCT.md` (Contributor Covenant 2.0).
- `docs/RELEASE.md` capturing the maintainer-facing release procedure (moved out of CHANGELOG).
- OpenSSF Best Practices badge (project 12862) on README.
- OpenAI-compatible `parallel_tool_calls` support: `ChatRequest.withParallelToolCalls(Boolean)` / `getParallelToolCalls()`, `InferenceParameters.withParallelToolCalls(boolean)`, and pass-through in the `/v1/chat/completions` server mapper.
- Real-model tool-calling integration tests for blocking and streaming required tool calls (`ToolCallingIntegrationTest`, Qwen2.5-1.5B-Instruct), wired into CI and `validate-models`.

### Changed
- Unified `CONTRIBUTING.md` and `SECURITY.md` structure with sibling repositories in the project family.
- Reconciled Java baseline to **11+** across `pom.xml`, README badge, `CLAUDE.md`, and `CONTRIBUTING.md`.
- README license badge corrected from "Apache 2.0" to "MIT" (matches `LICENSE` file and `pom.xml`).
- `pom.xml` SCM URL: `tree/master` → `tree/main` (default branch renamed).
- Upgraded llama.cpp from b9151 to b9172.
- Extracted the `chatWithTools` agent loop into `ToolCallingAgent`; tool-result errors (unknown tool / handler exception) are now JSON-serialized so tool names containing special characters remain valid JSON.

### Added
- Reasoning-budget tests (Qwen3-0.6B).
Expand Down
37 changes: 36 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,8 @@ Every `net.ladenthin.llama.*` system property recognised by the library, deep-sc
| `net.ladenthin.llama.lib.path` | unset (falls back to `java.library.path`) | runtime | `LlamaLoader` | Directory containing the native `jllama` shared library. Checked first, before `java.library.path`. Set with `-Dnet.ladenthin.llama.lib.path=/path/to/dir`. |
| `net.ladenthin.llama.tmpdir` | unset (falls back to `java.io.tmpdir`) | runtime | `LlamaLoader` | Custom temporary directory used when extracting the native library from the JAR. |
| `net.ladenthin.llama.osinfo.architecture` | unset (uses `os.arch`) | runtime | `OSInfo` | Override for the architecture string used to locate the bundled library inside the JAR. Useful when `os.arch` reports an unexpected value (e.g. inside dockcross / chrooted environments). |
| `net.ladenthin.llama.test.ngl` | `43` | test | `LlamaModelTest`, `RerankingModelTest`, `ChatScenarioTest`, `ChatAdvancedTest`, `ErrorHandlingTest`, `SessionConcurrencyTest`, `ConfigureParallelInferenceTest`, `MultimodalIntegrationTest` (via `Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL)`) | Number of GPU layers used during testing. Pin to `0` on CPU-only hosts: `mvn test -Dnet.ladenthin.llama.test.ngl=0`. |
| `net.ladenthin.llama.test.ngl` | `43` for the general suite; `0` for `ToolCallingIntegrationTest` | test | Model-backed integration tests | Number of GPU layers used during testing. Pin to `0` on CPU-only hosts: `mvn test -Dnet.ladenthin.llama.test.ngl=0`. The tool test also selects device `none` at zero layers so Metal/CUDA is not initialized. |
| `net.ladenthin.llama.tool.model` | `models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf` (test self-skips if missing) | test | `ToolCallingIntegrationTest` | Path to a tool-capable GGUF used to verify required blocking and streaming tool calls. The default matches the Qwen2.5 model in upstream llama.cpp's tool-call test matrix. |
| `net.ladenthin.llama.nomic.path` | unset (test self-skips) | test | `LlamaEmbeddingsTest#testNomicEmbedLoads` | Path to a Nomic embedding model (`nomic-embed-text-v1.5.f16.gguf` or a compatible BERT-family encoder). Regression test for upstream issue #98 (BERT-encoder `result_output` assertion). |
| `net.ladenthin.llama.vision.model` | unset (test self-skips) | test | `MultimodalIntegrationTest` (closes #103 / #34) | Path to a vision-capable model GGUF. Any vision-capable GGUF works; CI default is `SmolVLM-500M-Instruct-Q8_0.gguf`. |
| `net.ladenthin.llama.vision.mmproj` | unset (test self-skips) | test | `MultimodalIntegrationTest` | Matching mmproj GGUF for the vision model. |
Expand Down Expand Up @@ -368,6 +369,40 @@ try (LlamaModel model = new LlamaModel(modelParams)) {
Reasoning/thinking models can receive custom Jinja template variables via
`ModelParameters#setChatTemplateKwargs(Map)`.

### Tool Calling

Use a tool-aware instruct model and enable Jinja when loading it. With a typed request, `chat`
returns the model's tool calls to the caller, while `chatWithTools` executes the registered handlers
and re-prompts until the model produces a normal assistant response (or the request's tool-round cap is reached):

```java
ToolDefinition weather = new ToolDefinition(
"get_weather",
"Get the current weather for a city",
"{\"type\":\"object\",\"properties\":{\"city\":{\"type\":\"string\"}},"
+ "\"required\":[\"city\"]}");

ChatRequest request = ChatRequest.empty()
.appendMessage("user", "What is the weather in Paris?")
.appendTool(weather)
.withToolChoice("auto")
.withParallelToolCalls(Boolean.FALSE);

Map<String, ToolHandler> handlers = Collections.singletonMap(
"get_weather", argumentsJson -> "{\"temperature_c\":21,\"condition\":\"sunny\"}");

try (LlamaModel model = new LlamaModel(new ModelParameters()
.setModel("models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf")
.enableJinja())) {
ChatResponse response = model.chatWithTools(request, handlers);
System.out.println(response.getFirstContent());
}
```

`tool_choice` is the OpenAI-compatible string form (`auto`, `none`, or `required`). Set
`parallel_tool_calls` to `false` when the model should request one tool call at a time. Handler failures and
unknown tool names are returned to the model as valid `{"error":"..."}` tool-result JSON.

### Infilling

You can simply set `InferenceParameters#setInputPrefix(String)` and `InferenceParameters#setInputSuffix(String)`.
Expand Down
43 changes: 5 additions & 38 deletions src/main/java/net/ladenthin/llama/LlamaModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
import net.ladenthin.llama.parameters.ChatRequest;
import net.ladenthin.llama.parameters.InferenceParameters;
import net.ladenthin.llama.parameters.ModelParameters;
import net.ladenthin.llama.value.ChatMessage;
import net.ladenthin.llama.value.ChatResponse;
import net.ladenthin.llama.value.CompletionResult;
import net.ladenthin.llama.value.LlamaOutput;
Expand All @@ -38,7 +37,6 @@
import net.ladenthin.llama.value.Pair;
import net.ladenthin.llama.value.ServerMetrics;
import net.ladenthin.llama.value.StopReason;
import net.ladenthin.llama.value.ToolCall;
import org.jspecify.annotations.Nullable;

/**
Expand Down Expand Up @@ -551,6 +549,10 @@ public ChatResponse chat(ChatRequest request) {
if (toolChoice.isPresent()) {
params = params.withToolChoice(toolChoice.get());
}
Optional<Boolean> parallelToolCalls = request.getParallelToolCalls();
if (parallelToolCalls.isPresent()) {
params = params.withParallelToolCalls(parallelToolCalls.get());
}
}
params = request.applyCustomizer(params);
String raw = chatComplete(params);
Expand All @@ -575,42 +577,7 @@ public ChatResponse chat(ChatRequest request) {
* (or the last response when the round cap is hit)
*/
public ChatResponse chatWithTools(ChatRequest request, java.util.Map<String, ToolHandler> handlers) {
final int maxRounds = request.getMaxToolRounds();
if (maxRounds < 1) {
throw new IllegalArgumentException("ChatRequest.maxToolRounds must be >= 1 (got " + maxRounds + "); "
+ "chatWithTools always issues at least one chat call.");
}
ChatRequest current = request;
ChatResponse last = chat(current);
for (int round = 1; round < maxRounds; round++) {
Optional<ChatMessage> assistantOpt = last.getFirstMessage();
// NOTE: inline !isPresent() here (not compatibilityHelper.isEmpty) so NullAway's
// CheckOptionalEmptiness recognises this as null-narrowing for the .get() below.
if (!assistantOpt.isPresent() || assistantOpt.get().getToolCalls().isEmpty()) {
return last;
}
ChatMessage assistant = assistantOpt.get();
current = current.appendMessage(assistant);
for (ToolCall call : assistant.getToolCalls()) {
ToolHandler handler = handlers.get(call.getName());
String result;
if (handler == null) {
result = "{\"error\":\"unknown tool: " + call.getName() + "\"}";
} else {
try {
result = handler.invoke(call.getArgumentsJson());
} catch (Exception e) {
result = "{\"error\":"
+ net.ladenthin.llama.json.ChatResponseParser.OBJECT_MAPPER.valueToTree(
e.getClass().getSimpleName() + ": " + e.getMessage())
+ "}";
}
}
current = current.appendMessage(ChatMessage.toolResult(call.getId(), result));
}
last = chat(current);
}
return last;
return ToolCallingAgent.run(request, handlers, this::chat);
}

/**
Expand Down
65 changes: 65 additions & 0 deletions src/main/java/net/ladenthin/llama/ToolCallingAgent.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
//
// SPDX-License-Identifier: MIT

package net.ladenthin.llama;

import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import net.ladenthin.llama.callback.ToolHandler;
import net.ladenthin.llama.parameters.ChatRequest;
import net.ladenthin.llama.value.ChatMessage;
import net.ladenthin.llama.value.ChatResponse;
import net.ladenthin.llama.value.ToolCall;

/**
 * Model-independent orchestration for the tool-calling agent loop.
 *
 * <p>The loop talks to the model only through a {@code Function<ChatRequest, ChatResponse>}, so the
 * same orchestration can be driven by any chat implementation. The class is a stateless utility
 * holder and cannot be instantiated.
 */
final class ToolCallingAgent {

    /** Used to build error payloads so that arbitrary message text is escaped into valid JSON. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    private ToolCallingAgent() {}

    /**
     * Runs the chat / tool-execution loop.
     *
     * <p>One chat call is always issued. While the first message of the latest response carries tool
     * calls and the round cap has not been reached, the assistant message and one tool-result message
     * per tool call are appended to the request and the chat call is repeated.
     *
     * @param request the initial request; its {@code getMaxToolRounds()} value caps the total number
     *     of chat calls
     * @param handlers tool handlers keyed by tool name; a tool call whose name has no entry yields an
     *     {@code {"error":"unknown tool: ..."}} tool result instead of failing the loop
     * @param chatCall performs a single chat call for the given request
     * @return the first response whose first message is absent or has no tool calls, or the last
     *     response obtained when the round cap is hit
     * @throws IllegalArgumentException if {@code request.getMaxToolRounds()} is less than 1
     */
    static ChatResponse run(
            ChatRequest request, Map<String, ToolHandler> handlers, Function<ChatRequest, ChatResponse> chatCall) {
        final int maxRounds = request.getMaxToolRounds();
        if (maxRounds < 1) {
            throw new IllegalArgumentException("ChatRequest.maxToolRounds must be >= 1 (got " + maxRounds + "); "
                    + "chatWithTools always issues at least one chat call.");
        }

        ChatRequest current = request;
        ChatResponse last = chatCall.apply(current);
        // The initial call above counts as round 0, so at most maxRounds chat calls are issued.
        for (int round = 1; round < maxRounds; round++) {
            Optional<ChatMessage> assistantOpt = last.getFirstMessage();
            // NOTE: keep the inline !isPresent() check (rather than a helper) so NullAway's
            // CheckOptionalEmptiness recognises it as null-narrowing for the .get() calls below.
            if (!assistantOpt.isPresent() || assistantOpt.get().getToolCalls().isEmpty()) {
                return last;
            }

            ChatMessage assistant = assistantOpt.get();
            // The assistant turn is appended before its tool results.
            current = current.appendMessage(assistant);
            for (ToolCall call : assistant.getToolCalls()) {
                current = current.appendMessage(ChatMessage.toolResult(call.getId(), invoke(call, handlers)));
            }
            last = chatCall.apply(current);
        }
        return last;
    }

    /**
     * Executes the handler registered for a single tool call.
     *
     * <p>Failures never propagate: an unknown tool name or an exception thrown by the handler is
     * converted into an {@code {"error":"..."}} JSON string that is handed back to the model as the
     * tool result.
     *
     * @param call the tool call requested by the model
     * @param handlers tool handlers keyed by tool name
     * @return the handler's result, or an error JSON object describing the failure
     */
    private static String invoke(ToolCall call, Map<String, ToolHandler> handlers) {
        ToolHandler handler = handlers.get(call.getName());
        if (handler == null) {
            return errorJson("unknown tool: " + call.getName());
        }
        try {
            return handler.invoke(call.getArgumentsJson());
        } catch (Exception e) {
            // The broad catch is deliberate (handlers are caller-supplied code), but it must not
            // swallow an interruption: restore the flag so code further up the stack can observe it.
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            return errorJson(e.getClass().getSimpleName() + ": " + e.getMessage());
        }
    }

    /**
     * Builds a JSON object with a single {@code error} field.
     *
     * @param message the error text; it is escaped by the JSON serializer
     * @return the serialized JSON object
     */
    private static String errorJson(String message) {
        return MAPPER.createObjectNode().put("error", message).toString();
    }
}
Loading
Loading