feat(server): propagate parallel_tool_calls across all OpenAI-compatible surfaces

claude · claude · commit fae2b90e4abb · 2026-06-20T09:54:31.000Z
#244 made the chat core honor parallel_tool_calls, but only the OpenAI /v1/chat/completions surface forwarded it; the alternative protocol surfaces (which translate into that same chat core) silently dropped the equivalent flag. Close the gap: - Anthropic /v1/messages (AnthropicApiSupport.toOpenAiChatRequest): map tool_choice.disable_parallel_tool_use=true -> parallel_tool_calls=false (default stays parallel when unset/false). - OpenAI Responses /v1/responses (ResponsesApiSupport.toOpenAiChatRequest): forward parallel_tool_calls, and also forward tool_choice (string form), which was being dropped entirely — both now reach the shared OpenAiRequestMapper. Tests: - AnthropicApiSupportTest / ResponsesApiSupportTest: unit-cover the new mappings (set, and omitted-when-absent). - OpenAiServerToolCallingIntegrationTest (new): real-model end-to-end over HTTP using the Qwen2.5-1.5B tool model #244 wired into CI. tool_choice="required" forces a call, so it deterministically asserts the server returns a well-formed tool_calls array (arguments as a JSON string, llama.cpp #20198) and that parallel_tool_calls=false travels HTTP -> mapper -> native intact. Self-skips when the model is absent. Verified locally: spotless, compile, spotbugs clean; model-free translator tests pass. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01JdLpWD8nedY7LwNnHefZLF
diff --git a/src/main/java/net/ladenthin/llama/server/AnthropicApiSupport.java b/src/main/java/net/ladenthin/llama/server/AnthropicApiSupport.java
@@ -85,6 +85,12 @@ static ObjectNode toOpenAiChatRequest(JsonNode request) {
             if (toolChoice != null) {
                 openAi.put("tool_choice", toolChoice);
             }
+            // Anthropic expresses "no parallel tool use" via tool_choice.disable_parallel_tool_use;
+            // OpenAI's equivalent is parallel_tool_calls=false. Map it so the shared chat core honors
+            // a client's request to serialize tool calls (default stays parallel when unset/false).
+            if (request.path("tool_choice").path("disable_parallel_tool_use").asBoolean(false)) {
+                openAi.put("parallel_tool_calls", false);
+            }
         }
 
         copyNumber(request, "max_tokens", openAi, "max_tokens");
diff --git a/src/main/java/net/ladenthin/llama/server/ResponsesApiSupport.java b/src/main/java/net/ladenthin/llama/server/ResponsesApiSupport.java
@@ -79,6 +79,17 @@ static ObjectNode toOpenAiChatRequest(JsonNode request) {
                     function.set("parameters", tool.path("parameters").deepCopy());
                 }
             }
+            // The Responses API uses the same tool_choice + parallel_tool_calls fields as chat; forward
+            // them so the shared chat core honors them. The mapper consumes the string form of
+            // tool_choice ("auto"/"none"/"required"), which is what we forward here.
+            if (request.path("tool_choice").isTextual()) {
+                openAi.put("tool_choice", request.path("tool_choice").asText());
+            }
+            if (request.path("parallel_tool_calls").isBoolean()) {
+                openAi.put(
+                        "parallel_tool_calls",
+                        request.path("parallel_tool_calls").asBoolean());
+            }
         }
 
         copyNumber(request, "temperature", openAi, "temperature");
diff --git a/src/test/java/net/ladenthin/llama/server/AnthropicApiSupportTest.java b/src/test/java/net/ladenthin/llama/server/AnthropicApiSupportTest.java
@@ -160,4 +160,25 @@ public void sseEventBuildersAreWellFormed() throws IOException {
                 is("assistant"));
         assertThat(AnthropicApiSupport.messageStopEvent().startsWith("event: message_stop"), is(true));
     }
+
+    @Test
+    public void requestMapsDisableParallelToolUseToParallelToolCallsFalse() throws IOException {
+        // disable_parallel_tool_use=true must come out as an explicit boolean parallel_tool_calls=false.
+        JsonNode request = read("{\"model\":\"m\",\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}],"
+                + "\"tools\":[{\"name\":\"get_weather\",\"input_schema\":{\"type\":\"object\"}}],"
+                + "\"tool_choice\":{\"type\":\"auto\",\"disable_parallel_tool_use\":true}}");
+        JsonNode parallelFlag = AnthropicApiSupport.toOpenAiChatRequest(request).path("parallel_tool_calls");
+        assertThat(parallelFlag.isBoolean(), is(true));
+        assertThat(parallelFlag.asBoolean(), is(false));
+    }
+
+    @Test
+    public void requestOmitsParallelToolCallsWhenParallelToolUseAllowed() throws IOException {
+        // No disable_parallel_tool_use in tool_choice: the translated request must carry no override key.
+        JsonNode request = read("{\"model\":\"m\",\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}],"
+                + "\"tools\":[{\"name\":\"get_weather\",\"input_schema\":{\"type\":\"object\"}}],"
+                + "\"tool_choice\":{\"type\":\"auto\"}}");
+        JsonNode parallelFlag = AnthropicApiSupport.toOpenAiChatRequest(request).path("parallel_tool_calls");
+        assertThat(parallelFlag.isMissingNode(), is(true));
+    }
 }
diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerToolCallingIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerToolCallingIntegrationTest.java
@@ -0,0 +1,125 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
+import static org.hamcrest.Matchers.is;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.File;
+import java.io.IOException;
+import net.ladenthin.llama.LlamaModel;
+import net.ladenthin.llama.TestConstants;
+import net.ladenthin.llama.parameters.ModelParameters;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assumptions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+/**
+ * End-to-end tool-calling integration test for {@link OpenAiCompatServer}, driven over a real socket
+ * against the Qwen2.5-1.5B-Instruct tool model — a stronger tool-calling family than the 0.6B reasoning
+ * model used by {@link OpenAiCompatServerIntegrationTest}, so it actually emits tool calls. The model is
+ * resolved from {@link TestConstants#PROP_TOOL_MODEL_PATH} (CI sets it; otherwise
+ * {@link TestConstants#DEFAULT_TOOL_MODEL_PATH}) and the test self-skips when the GGUF is absent, so a
+ * model-free checkout is never broken.
+ *
+ * <p>Where {@link OpenAiCompatServerIntegrationTest}'s tool test can only assert a structurally valid
+ * message (the 0.6B model may not elect to call), these force a call via {@code tool_choice:"required"}
+ * so the native grammar must emit one — letting us assert, deterministically, that the HTTP server
+ * returns a well-formed OpenAI {@code tool_calls} array with {@code arguments} carried as a JSON
+ * <em>string</em> (the agentic-client invariant, llama.cpp #20198), and that #244's
+ * {@code parallel_tool_calls} flag travels HTTP &rarr; mapper &rarr; native without breaking the request.
+ */
+public class OpenAiServerToolCallingIntegrationTest extends OpenAiServerTestSupport {
+
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+    private static final String MODEL_ID = "qwen25-tools";
+
+    /** A trivial single-required-argument function; {@code tool_choice:"required"} forces a call. */
+    private static final String TOOLS = "\"tools\":[{\"type\":\"function\",\"function\":{"
+            + "\"name\":\"get_weather\",\"description\":\"Get the weather for a city\","
+            + "\"parameters\":{\"type\":\"object\",\"properties\":{\"city\":{\"type\":\"string\"}},"
+            + "\"required\":[\"city\"]}}}]";
+
+    private static LlamaModel model;
+    private static OpenAiCompatServer server;
+    private static int port;
+
+    @BeforeAll
+    public static void setup() throws IOException {
+        String modelPath =
+                System.getProperty(TestConstants.PROP_TOOL_MODEL_PATH, TestConstants.DEFAULT_TOOL_MODEL_PATH);
+        Assumptions.assumeTrue(
+                new File(modelPath).exists(),
+                "Tool-calling model (Qwen2.5-1.5B) not found, skipping server tool-calling test: " + modelPath);
+        int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
+        model = new LlamaModel(new ModelParameters()
+                .setModel(modelPath)
+                .setCtxSize(4096)
+                .setGpuLayers(gpuLayers)
+                .setFit(false)
+                .setParallel(1));
+        server = new OpenAiCompatServer(
+                        model,
+                        OpenAiServerConfig.builder().port(0).modelId(MODEL_ID).build())
+                .start();
+        port = server.getPort();
+    }
+
+    @AfterAll
+    public static void tearDown() {
+        // finally: the model must still be released when closing the server throws.
+        try {
+            if (server != null) {
+                server.close();
+            }
+        } finally {
+            if (model != null) {
+                model.close();
+            }
+        }
+    }
+
+    @Test
+    public void requiredToolChoiceReturnsWellFormedToolCalls() throws IOException {
+        // tool_choice=required forces a function call, so a capable model deterministically returns a
+        // structurally valid OpenAI tool_calls array regardless of its exact wording.
+        String body = "{\"model\":\"" + MODEL_ID + "\",\"max_tokens\":64,\"tool_choice\":\"required\","
+                + "\"messages\":[{\"role\":\"user\",\"content\":\"What is the weather in Paris?\"}],"
+                + TOOLS + "}";
+        JsonNode toolCalls = postForToolCalls(body);
+        assertThat(toolCalls.isArray(), is(true));
+        assertThat(toolCalls.size(), greaterThanOrEqualTo(1));
+        JsonNode function = toolCalls.path(0).path("function");
+        assertThat(function.path("name").asText(), is("get_weather"));
+        // arguments must be a JSON *string* (not an inlined object) — the agentic-client invariant.
+        assertThat(function.path("arguments").isTextual(), is(true));
+        assertThat(MAPPER.readTree(function.path("arguments").asText()).isObject(), is(true));
+    }
+
+    @Test
+    public void parallelToolCallsFalseIsAcceptedEndToEnd() throws IOException {
+        // parallel_tool_calls=false must flow HTTP -> OpenAiRequestMapper -> native without breaking the
+        // request; tool_choice=required still yields a well-formed tool call.
+        String body = "{\"model\":\"" + MODEL_ID + "\",\"max_tokens\":64,\"tool_choice\":\"required\","
+                + "\"parallel_tool_calls\":false,"
+                + "\"messages\":[{\"role\":\"user\",\"content\":\"What is the weather in Paris?\"}],"
+                + TOOLS + "}";
+        JsonNode toolCalls = postForToolCalls(body);
+        assertThat(toolCalls.isArray(), is(true));
+        assertThat(toolCalls.size(), greaterThanOrEqualTo(1));
+    }
+
+    /** POSTs {@code body} to chat completions, asserts HTTP 200, returns choices[0].message.tool_calls. */
+    private JsonNode postForToolCalls(String body) throws IOException {
+        Response response = post(port, "/v1/chat/completions", body, "");
+        assertThat(response.code, is(200));
+        // path(0), not get(0): an absent/empty choices array yields a missing node instead of an NPE.
+        return MAPPER.readTree(response.body).path("choices").path(0).path("message").path("tool_calls");
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/server/ResponsesApiSupportTest.java b/src/test/java/net/ladenthin/llama/server/ResponsesApiSupportTest.java
@@ -121,4 +121,23 @@ public void responseEmitsFunctionCallItemsForToolCalls() throws IOException {
         assertThat(functionCall.path("name").asText(), is("f"));
         assertThat(functionCall.path("arguments").asText(), is("{\"a\":1}"));
     }
+
+    @Test
+    public void requestForwardsToolChoiceAndParallelToolCalls() throws IOException {
+        // A string tool_choice and a boolean parallel_tool_calls must both reach the chat request as-is.
+        JsonNode chatRequest = ResponsesApiSupport.toOpenAiChatRequest(read("{\"model\":\"m\",\"input\":\"hi\","
+                + "\"tools\":[{\"type\":\"function\",\"name\":\"f\",\"parameters\":{\"type\":\"object\"}}],"
+                + "\"tool_choice\":\"required\",\"parallel_tool_calls\":false}"));
+        assertThat(chatRequest.path("parallel_tool_calls").isBoolean(), is(true));
+        assertThat(chatRequest.path("parallel_tool_calls").asBoolean(), is(false));
+        assertThat(chatRequest.path("tool_choice").asText(), is("required"));
+    }
+
+    @Test
+    public void requestOmitsToolChoiceAndParallelToolCallsWhenAbsent() throws IOException {
+        JsonNode chatRequest = ResponsesApiSupport.toOpenAiChatRequest(read("{\"model\":\"m\",\"input\":\"hi\","
+                + "\"tools\":[{\"type\":\"function\",\"name\":\"f\",\"parameters\":{\"type\":\"object\"}}]}"));
+        assertThat(chatRequest.path("tool_choice").isMissingNode(), is(true));
+        assertThat(chatRequest.path("parallel_tool_calls").isMissingNode(), is(true));
+    }
 }

Original file line number	Diff line number	Diff line change
`@@ -85,6 +85,12 @@ static ObjectNode toOpenAiChatRequest(JsonNode request) {`
`85`	`85`	`if (toolChoice != null) {`
`86`	`86`	`openAi.put("tool_choice", toolChoice);`
`87`	`87`	`}`
	`88`	`+ // Anthropic expresses "no parallel tool use" via tool_choice.disable_parallel_tool_use;`
	`89`	`+ // OpenAI's equivalent is parallel_tool_calls=false. Map it so the shared chat core honors`
	`90`	`+ // a client's request to serialize tool calls (default stays parallel when unset/false).`
	`91`	`+ if (request.path("tool_choice").path("disable_parallel_tool_use").asBoolean(false)) {`
	`92`	`+ openAi.put("parallel_tool_calls", false);`
	`93`	`+ }`
`88`	`94`	`}`
`89`	`95`
`90`	`96`	`copyNumber(request, "max_tokens", openAi, "max_tokens");`
Original file line number	Diff line number	Diff line change
`@@ -79,6 +79,17 @@ static ObjectNode toOpenAiChatRequest(JsonNode request) {`
`79`	`79`	`function.set("parameters", tool.path("parameters").deepCopy());`
`80`	`80`	`}`
`81`	`81`	`}`
	`82`	`+ // The Responses API uses the same tool_choice + parallel_tool_calls fields as chat; forward`
	`83`	`+ // them so the shared chat core honors them. The mapper consumes the string form of`
	`84`	`+ // tool_choice ("auto"/"none"/"required"), which is what we forward here.`
	`85`	`+ if (request.path("tool_choice").isTextual()) {`
	`86`	`+ openAi.put("tool_choice", request.path("tool_choice").asText());`
	`87`	`+ }`
	`88`	`+ if (request.path("parallel_tool_calls").isBoolean()) {`
	`89`	`+ openAi.put(`
	`90`	`+ "parallel_tool_calls",`
	`91`	`+ request.path("parallel_tool_calls").asBoolean());`
	`92`	`+ }`
`82`	`93`	`}`
`83`	`94`
`84`	`95`	`copyNumber(request, "temperature", openAi, "temperature");`