test(server): gated integration round-trips for embeddings, rerank, completion/infill/generate

claude · claude · commit 2987cf755e86 · 2026-06-19T07:55:40.000Z
Completes the live end-to-end coverage of the IDE-backend surfaces. Each fixture boots a real server over a real socket in the matching model mode, reuses a model CI already downloads, self-skips when absent, and asserts structural shapes only: - OpenAiServerEmbeddingsIntegrationTest (CodeLlama-7B + enableEmbedding): POST /v1/embeddings returns an OpenAI {object:list, data:[{object:embedding, embedding:[…]}]} shape; also covers the bare /embeddings alias. - OpenAiServerRerankIntegrationTest (jina-reranker + enableReranking): POST /v1/rerank returns sorted {index, relevance_score} results capped by top_n, with the `data` alias. - OpenAiServerCompletionIntegrationTest (CodeLlama-7B): POST /v1/completions, /infill, and Ollama /api/generate (plain + FIM via `suffix`) — CodeLlama is FIM-capable per LlamaModelTest#testGenerateInfill. Also: add TestConstants.RERANKING_MODEL_PATH and route RerankingModelTest through it (removes the duplicated literal). Used Java-8-safe idioms throughout. These run in the same CI job that already round-trips the OpenAI chat path, so the Ollama/Anthropic/Responses/embeddings/rerank/completion surfaces are now all validated end-to-end against real models; only manual editor-client validation remains (TODO). Server + arch suite green (integration fixtures self-skip without models locally); javadoc + spotless clean. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com> Claude-Session: https://claude.ai/code/session_01JdLpWD8nedY7LwNnHefZLF
diff --git a/TODO.md b/TODO.md
@@ -48,10 +48,17 @@ primary goal: agentic tool-calling with Qwen):
   /v1/responses`, SSE events — `ResponsesApiSupport` + `ResponsesStreamTranslator`).
 - **`GET /props`** (llama.cpp-native): `default_generation_settings.n_ctx` + `modalities` so autocomplete
   clients (llama.vscode) size their context window (`OpenAiSseFormatter.propsJson`).
-- Gated **integration round-trips** (`OpenAiCompatServerIntegrationTest`, Qwen3-0.6B over a real socket;
-  runs in CI's `test-java-linux-x86_64` job, self-skips when the model is absent): OpenAI chat
-  (non-stream/stream/tools/models) plus Ollama `/api/chat` + discovery, Anthropic `/v1/messages`, OpenAI
-  `/v1/responses` (non-stream + stream) and `/props` — structural assertions only.
+- Gated **integration round-trips** over a real socket, run in CI's `test-java-linux-x86_64` job,
+  self-skipping when the model is absent — structural assertions only:
+  - `OpenAiCompatServerIntegrationTest` (Qwen3-0.6B, chat mode): OpenAI chat (non-stream/stream/tools/
+    models) plus Ollama `/api/chat` + discovery, Anthropic `/v1/messages`, OpenAI `/v1/responses`
+    (non-stream + stream) and `/props`.
+  - `OpenAiServerEmbeddingsIntegrationTest` (CodeLlama-7B + `enableEmbedding`): `/v1/embeddings` (+ bare
+    alias).
+  - `OpenAiServerRerankIntegrationTest` (jina-reranker + `enableReranking`): `/v1/rerank` (sorted
+    `results`/`data`, `top_n` cap).
+  - `OpenAiServerCompletionIntegrationTest` (CodeLlama-7B): `/v1/completions`, `/infill`, and Ollama
+    `/api/generate` (plain + FIM via `suffix`).
 
 **Open follow-ups (deferred):**
 
@@ -71,12 +78,10 @@ primary goal: agentic tool-calling with Qwen):
   `suffix`) applies the model's FIM tokens server-side, so this is lower value.
 - **Multi-model registry.** Only one model id is advertised/served today; serving several would need
   multi-model load + lifecycle management.
-- **Remaining live validation.** Gated server-side round-trips now exist for all four protocols (above).
-  Still open: (a) manual validation against the actual editor clients — point Copilot's Ollama provider /
-  a Custom Endpoint, Claude Code, and a Responses client at the running server; (b) gated round-trips for
-  `/v1/embeddings`, `/v1/rerank` and `/infill`, which need their own server fixtures in the matching mode
-  (`enableEmbedding` / `enableReranking` / a FIM-capable model). The models are already downloaded in CI
-  (nomic-embed, jina-reranker, CodeLlama-7B), so only the test fixtures are missing.
+- **Manual real-client validation.** Gated server-side round-trips now exist for every surface (above).
+  What remains is manual validation against the actual editor clients — point Copilot's Ollama provider /
+  a Custom Endpoint, Claude Code, and a Responses client at the running server — since a server-side
+  round-trip confirms the wire shapes but not each client's own parser.
 - **Gemma 4 tool-calling validation.** Confirm the pinned llama.cpp (`b9682`) includes the Gemma 4
   tool-call parser fixes; if not, bump per the upgrade procedure.
 
diff --git a/src/test/java/net/ladenthin/llama/RerankingModelTest.java b/src/test/java/net/ladenthin/llama/RerankingModelTest.java
@@ -33,12 +33,11 @@ public class RerankingModelTest {
     @BeforeAll
     public static void setup() {
         Assumptions.assumeTrue(
-                new File("models/jina-reranker-v1-tiny-en-Q4_0.gguf").exists(),
-                "Reranking model not available, skipping tests");
+                new File(TestConstants.RERANKING_MODEL_PATH).exists(), "Reranking model not available, skipping tests");
         int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
         model = new LlamaModel(new ModelParameters()
                 .setCtxSize(128)
-                .setModel("models/jina-reranker-v1-tiny-en-Q4_0.gguf")
+                .setModel(TestConstants.RERANKING_MODEL_PATH)
                 .setGpuLayers(gpuLayers)
                 .enableReranking()
                 .enableLogTimestamps()
diff --git a/src/test/java/net/ladenthin/llama/TestConstants.java b/src/test/java/net/ladenthin/llama/TestConstants.java
@@ -23,6 +23,9 @@ public class TestConstants {
     /** Path to the Qwen3 thinking model used for reasoning budget tests. */
     public static final String REASONING_MODEL_PATH = "models/Qwen3-0.6B-Q4_K_M.gguf";
 
+    /** Path to the reranking model used in tests (loaded with {@code enableReranking()}). */
+    public static final String RERANKING_MODEL_PATH = "models/jina-reranker-v1-tiny-en-Q4_0.gguf";
+
     /**
      * System property holding a path to a Nomic embedding model
      * ({@code nomic-embed-text-v1.5.f16.gguf} or a compatible BERT-family encoder).
diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerCompletionIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerCompletionIntegrationTest.java
@@ -0,0 +1,124 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.is;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.File;
+import java.io.IOException;
+import net.ladenthin.llama.LlamaModel;
+import net.ladenthin.llama.TestConstants;
+import net.ladenthin.llama.parameters.ModelParameters;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assumptions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+/**
+ * End-to-end integration test for the completion-family routes — {@code POST /v1/completions},
+ * {@code POST /infill} (fill-in-the-middle) and the Ollama {@code POST /api/generate} (plain + FIM via a
+ * {@code suffix}) — against a real model over a real socket. Reuses the CI text model (CodeLlama-7B,
+ * {@link TestConstants#MODEL_PATH}), which is FIM-capable (see {@code LlamaModelTest#testGenerateInfill}).
+ * Self-skips when the model file is absent. Assertions are structural (valid response envelopes) rather
+ * than value-specific. HTTP plumbing is inherited from {@link OpenAiServerTestSupport}.
+ */
+public class OpenAiServerCompletionIntegrationTest extends OpenAiServerTestSupport {
+
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+    private static final String MODEL_ID = "completion-local";
+
+    private static LlamaModel model;
+    private static OpenAiCompatServer server;
+    private static int port;
+
+    /**
+     * Loads the CI text model and starts the server. Port {@code 0} is requested and the port the server
+     * actually reports is read back, so the fixture does not depend on a fixed port. The whole class is
+     * skipped when the model file is absent.
+     *
+     * @throws IOException if model loading or server start-up reports an I/O failure
+     */
+    @BeforeAll
+    public static void setup() throws IOException {
+        Assumptions.assumeTrue(
+                new File(TestConstants.MODEL_PATH).exists(),
+                "Text model (CodeLlama-7B) not found, skipping completion server integration test");
+        int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
+        model = new LlamaModel(new ModelParameters()
+                .setModel(TestConstants.MODEL_PATH)
+                .setCtxSize(512)
+                .setGpuLayers(gpuLayers));
+        server = new OpenAiCompatServer(
+                        model,
+                        OpenAiServerConfig.builder().port(0).modelId(MODEL_ID).build())
+                .start();
+        port = server.getPort();
+    }
+
+    /**
+     * Stops the server, then releases the model. The model is closed in a {@code finally} block so an
+     * exception thrown while closing the server cannot leak it. Either field is still {@code null} when
+     * {@link #setup()} did not get that far (for example when the class was skipped).
+     */
+    @AfterAll
+    public static void tearDown() {
+        try {
+            if (server != null) {
+                server.close();
+            }
+        } finally {
+            if (model != null) {
+                model.close();
+            }
+        }
+    }
+
+    /** {@code POST /v1/completions} returns a {@code text_completion} whose first choice carries text. */
+    @Test
+    public void completionsReturnsTextChoice() throws IOException {
+        String body = "{\"model\":\"" + MODEL_ID + "\",\"max_tokens\":16,\"prompt\":\"def add(a, b):\\n    return\"}";
+        Response response = post(port, "/v1/completions", body, "");
+        assertThat(response.code, is(200));
+        JsonNode json = MAPPER.readTree(response.body);
+        assertThat(json.path("object").asText(), is("text_completion"));
+        // path(0), not get(0): a missing or empty "choices" fails the assertion instead of throwing an NPE.
+        assertThat(json.path("choices").path(0).path("text").isTextual(), is(true));
+    }
+
+    /** {@code POST /infill} returns the generated middle as a text {@code content} field. */
+    @Test
+    public void infillReturnsContent() throws IOException {
+        String body = "{\"input_prefix\":\"def add(a, b):\\n    return \",\"input_suffix\":\"\\n\",\"n_predict\":16}";
+        Response response = post(port, "/infill", body, "");
+        assertThat(response.code, is(200));
+        // The native infill response carries the generated middle under "content".
+        assertThat(MAPPER.readTree(response.body).path("content").isTextual(), is(true));
+    }
+
+    /** Non-streaming Ollama {@code POST /api/generate} returns a text {@code response} with {@code done=true}. */
+    @Test
+    public void ollamaGenerateNonStreamingRoundTrip() throws IOException {
+        String body = "{\"model\":\"" + MODEL_ID + "\",\"stream\":false,"
+                + "\"prompt\":\"def add(a, b):\\n    return\",\"options\":{\"num_predict\":16}}";
+        Response response = post(port, "/api/generate", body, "");
+        assertThat(response.code, is(200));
+        JsonNode json = MAPPER.readTree(response.body);
+        assertThat(json.path("response").isTextual(), is(true));
+        assertThat(json.path("done").asBoolean(), is(true));
+    }
+
+    /** Ollama {@code POST /api/generate} with a {@code suffix} (FIM) returns a text {@code response}. */
+    @Test
+    public void ollamaGenerateWithSuffixUsesInfill() throws IOException {
+        String body = "{\"model\":\"" + MODEL_ID + "\",\"stream\":false,"
+                + "\"prompt\":\"def add(a, b):\\n    return \",\"suffix\":\"\\n\",\"options\":{\"num_predict\":16}}";
+        Response response = post(port, "/api/generate", body, "");
+        assertThat(response.code, is(200));
+        assertThat(MAPPER.readTree(response.body).path("response").isTextual(), is(true));
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerEmbeddingsIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerEmbeddingsIntegrationTest.java
@@ -0,0 +1,107 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.is;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.File;
+import java.io.IOException;
+import net.ladenthin.llama.LlamaModel;
+import net.ladenthin.llama.TestConstants;
+import net.ladenthin.llama.parameters.ModelParameters;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assumptions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+/**
+ * End-to-end integration test for the {@code POST /v1/embeddings} route against a real model loaded in
+ * embedding mode ({@code enableEmbedding()}), served over a real socket. Reuses the CI text model
+ * (CodeLlama-7B, {@link TestConstants#MODEL_PATH}) — the same model {@code LlamaEmbeddingsTest} drives in
+ * embedding mode. Self-skips when the model file is absent (e.g. a local checkout without models), so it
+ * never breaks a model-free run. Assertions are structural (valid OpenAI embeddings shape) rather than
+ * value-specific. HTTP plumbing is inherited from {@link OpenAiServerTestSupport}.
+ */
+public class OpenAiServerEmbeddingsIntegrationTest extends OpenAiServerTestSupport {
+
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+    private static final String MODEL_ID = "embed-local";
+
+    private static LlamaModel model;
+    private static OpenAiCompatServer server;
+    private static int port;
+
+    /**
+     * Loads the CI text model with {@code enableEmbedding()} and starts the server. Port {@code 0} is
+     * requested and the port the server actually reports is read back, so the fixture does not depend on
+     * a fixed port. The whole class is skipped when the model file is absent.
+     *
+     * @throws IOException if model loading or server start-up reports an I/O failure
+     */
+    @BeforeAll
+    public static void setup() throws IOException {
+        Assumptions.assumeTrue(
+                new File(TestConstants.MODEL_PATH).exists(),
+                "Text model (CodeLlama-7B) not found, skipping embeddings server integration test");
+        int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
+        model = new LlamaModel(new ModelParameters()
+                .setModel(TestConstants.MODEL_PATH)
+                .setCtxSize(512)
+                .setGpuLayers(gpuLayers)
+                .enableEmbedding());
+        server = new OpenAiCompatServer(
+                        model,
+                        OpenAiServerConfig.builder().port(0).modelId(MODEL_ID).build())
+                .start();
+        port = server.getPort();
+    }
+
+    /**
+     * Stops the server, then releases the model. The model is closed in a {@code finally} block so an
+     * exception thrown while closing the server cannot leak it. Either field is still {@code null} when
+     * {@link #setup()} did not get that far (for example when the class was skipped).
+     */
+    @AfterAll
+    public static void tearDown() {
+        try {
+            if (server != null) {
+                server.close();
+            }
+        } finally {
+            if (model != null) {
+                model.close();
+            }
+        }
+    }
+
+    /** {@code POST /v1/embeddings} returns an OpenAI-style list whose first entry holds a non-empty vector. */
+    @Test
+    public void embeddingsReturnsAVector() throws IOException {
+        String body = "{\"model\":\"" + MODEL_ID + "\",\"input\":\"hello world\"}";
+        Response response = post(port, "/v1/embeddings", body, "");
+        assertThat(response.code, is(200));
+        JsonNode json = MAPPER.readTree(response.body);
+        assertThat(json.path("object").asText(), is("list"));
+        // path(0), not get(0): a missing or empty "data" fails the assertions below instead of throwing an NPE.
+        JsonNode first = json.path("data").path(0);
+        assertThat(first.path("object").asText(), is("embedding"));
+        assertThat(first.path("embedding").isArray(), is(true));
+        assertThat(first.path("embedding").size(), greaterThan(0));
+    }
+
+    /** The bare {@code POST /embeddings} alias (no {@code /v1} prefix) serves the same embeddings shape. */
+    @Test
+    public void embeddingsReachableWithoutV1Prefix() throws IOException {
+        String body = "{\"model\":\"" + MODEL_ID + "\",\"input\":\"alias check\"}";
+        Response response = post(port, "/embeddings", body, "");
+        assertThat(response.code, is(200));
+        JsonNode embedding = MAPPER.readTree(response.body).path("data").path(0).path("embedding");
+        assertThat(embedding.isArray(), is(true));
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/server/OpenAiServerRerankIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OpenAiServerRerankIntegrationTest.java
@@ -0,0 +1,117 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.is;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.File;
+import java.io.IOException;
+import net.ladenthin.llama.LlamaModel;
+import net.ladenthin.llama.TestConstants;
+import net.ladenthin.llama.parameters.ModelParameters;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assumptions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+
+/**
+ * End-to-end integration test for the {@code POST /v1/rerank} route against a real model loaded in
+ * reranking mode ({@code enableReranking()}), served over a real socket. Reuses the CI reranking model
+ * (jina-reranker, {@link TestConstants#RERANKING_MODEL_PATH}). Self-skips when the model file is absent.
+ * Assertions are structural (sorted {@code results}/{@code data} of {@code index}+{@code relevance_score})
+ * and check the {@code top_n} cap; exact scores are model-dependent. HTTP plumbing is inherited from
+ * {@link OpenAiServerTestSupport}.
+ */
+public class OpenAiServerRerankIntegrationTest extends OpenAiServerTestSupport {
+
+    private static final ObjectMapper MAPPER = new ObjectMapper();
+    private static final String MODEL_ID = "rerank-local";
+
+    private static LlamaModel model;
+    private static OpenAiCompatServer server;
+    private static int port;
+
+    /**
+     * Loads the CI reranking model with {@code enableReranking()} and {@code skipWarmup()} and starts the
+     * server. Port {@code 0} is requested and the port the server actually reports is read back, so the
+     * fixture does not depend on a fixed port. The whole class is skipped when the model file is absent.
+     *
+     * @throws IOException if model loading or server start-up reports an I/O failure
+     */
+    @BeforeAll
+    public static void setup() throws IOException {
+        Assumptions.assumeTrue(
+                new File(TestConstants.RERANKING_MODEL_PATH).exists(),
+                "Reranking model (jina-reranker) not found, skipping rerank server integration test");
+        int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
+        model = new LlamaModel(new ModelParameters()
+                .setModel(TestConstants.RERANKING_MODEL_PATH)
+                .setCtxSize(512)
+                .setGpuLayers(gpuLayers)
+                .enableReranking()
+                .skipWarmup());
+        server = new OpenAiCompatServer(
+                        model,
+                        OpenAiServerConfig.builder().port(0).modelId(MODEL_ID).build())
+                .start();
+        port = server.getPort();
+    }
+
+    /**
+     * Stops the server, then releases the model. The model is closed in a {@code finally} block so an
+     * exception thrown while closing the server cannot leak it. Either field is still {@code null} when
+     * {@link #setup()} did not get that far (for example when the class was skipped).
+     */
+    @AfterAll
+    public static void tearDown() {
+        try {
+            if (server != null) {
+                server.close();
+            }
+        } finally {
+            if (model != null) {
+                model.close();
+            }
+        }
+    }
+
+    /**
+     * {@code POST /v1/rerank} with three documents and {@code top_n=2} returns at most two results, each
+     * with an integer {@code index} and a numeric {@code relevance_score}, in non-increasing score order,
+     * and mirrors them under the {@code data} alias.
+     */
+    @Test
+    public void rerankReturnsScoredResultsCappedByTopN() throws IOException {
+        String body = "{\"model\":\"" + MODEL_ID + "\",\"query\":\"Machine learning is\","
+                + "\"documents\":[\"A machine applies forces to perform an action.\","
+                + "\"Machine learning is a field of artificial intelligence.\","
+                + "\"Paris is the capital of France.\"],\"top_n\":2}";
+        Response response = post(port, "/v1/rerank", body, "");
+        assertThat(response.code, is(200));
+        JsonNode json = MAPPER.readTree(response.body);
+        assertThat(json.path("object").asText(), is("list"));
+        JsonNode results = json.path("results");
+        assertThat(results.isArray(), is(true));
+        assertThat(results.size(), greaterThan(0));
+        assertThat(results.size(), lessThanOrEqualTo(2)); // top_n cap
+        // Check every returned entry, not only the first, and that the list really is sorted: the route is
+        // expected to return the most relevant document first (NOTE(review): confirm descending order).
+        double previousScore = Double.POSITIVE_INFINITY;
+        for (JsonNode result : results) {
+            assertThat(result.path("index").isInt(), is(true));
+            assertThat(result.path("relevance_score").isNumber(), is(true));
+            double score = result.path("relevance_score").asDouble();
+            assertThat(score, lessThanOrEqualTo(previousScore));
+            previousScore = score;
+        }
+        // `data` is an alias of `results` for Continue (#6478).
+        assertThat(json.path("data").size(), is(results.size()));
+    }
+}