Skip to content

Commit f5158a3

Browse files
committed
test: add gated OpenAI server integration test reusing Qwen3-0.6B
Reuse the reasoning model the CI pipeline already downloads (TestConstants.REASONING_MODEL_PATH = models/Qwen3-0.6B-Q4_K_M.gguf) — no extra download. Qwen3-0.6B is instruct-tuned and tool-calling capable, so it exercises the real native chat + streaming path (including the tools/use_jinja path) end-to-end over a socket.

Self-skips via Assume when the model file is absent, matching the existing model-gated tests, so a model-free `mvn test` is unaffected.

Assertions are structural (valid chat.completion, stream emits chunks + [DONE], a tools request returns a valid message object) because a 0.6B model's wording and whether it elects to call a tool are non-deterministic; the deterministic chunk and tool-call plumbing stays covered by OpenAiCompatServerHttpTest with a fake backend.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_014L2dLbAtwdq7C6a2gFRsQQ
1 parent 0bb02e1 commit f5158a3

1 file changed

Lines changed: 171 additions & 0 deletions

File tree

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
2+
//
3+
// SPDX-License-Identifier: MIT
4+
5+
package net.ladenthin.llama.server;
6+
7+
import static java.nio.charset.StandardCharsets.UTF_8;
8+
import static org.hamcrest.MatcherAssert.assertThat;
9+
import static org.hamcrest.Matchers.containsString;
10+
import static org.hamcrest.Matchers.greaterThan;
11+
import static org.hamcrest.Matchers.is;
12+
13+
import com.fasterxml.jackson.databind.JsonNode;
14+
import com.fasterxml.jackson.databind.ObjectMapper;
15+
import java.io.ByteArrayOutputStream;
16+
import java.io.File;
17+
import java.io.IOException;
18+
import java.io.InputStream;
19+
import java.io.OutputStream;
20+
import java.net.HttpURLConnection;
21+
import java.net.URL;
22+
import net.ladenthin.llama.LlamaModel;
23+
import net.ladenthin.llama.TestConstants;
24+
import net.ladenthin.llama.parameters.ModelParameters;
25+
import org.junit.jupiter.api.AfterAll;
26+
import org.junit.jupiter.api.Assumptions;
27+
import org.junit.jupiter.api.BeforeAll;
28+
import org.junit.jupiter.api.Test;
29+
30+
/**
31+
* End-to-end integration test for {@link OpenAiCompatServer} against a real model served over a real
32+
* socket. Reuses the Qwen3-0.6B GGUF that the CI pipeline already downloads as the reasoning model
33+
* ({@link TestConstants#REASONING_MODEL_PATH}); it is instruct-tuned (has a chat template) and one of
34+
* llama.cpp's better tool-calling families, so no extra download is needed. Self-skips when the model
35+
* file is absent (e.g. a local checkout without models), so it never breaks a model-free run.
36+
*
37+
* <p>Assertions are deliberately structural (valid OpenAI shapes, stream terminates) rather than
38+
* content-specific — a 0.6B model's exact wording and whether it elects to call a tool are not
39+
* deterministic. The deterministic chunk/tool-call plumbing is covered by
40+
* {@link OpenAiCompatServerHttpTest} with a fake backend.
41+
*/
42+
public class OpenAiCompatServerIntegrationTest {
43+
44+
private static final ObjectMapper MAPPER = new ObjectMapper();
45+
private static final String MODEL_ID = "qwen3-local";
46+
47+
private static LlamaModel model;
48+
private static OpenAiCompatServer server;
49+
private static int port;
50+
51+
@BeforeAll
52+
public static void setup() throws IOException {
53+
Assumptions.assumeTrue(
54+
new File(TestConstants.REASONING_MODEL_PATH).exists(),
55+
"Reasoning model (Qwen3-0.6B) not found, skipping OpenAI server integration test");
56+
int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
57+
model = new LlamaModel(new ModelParameters()
58+
.setModel(TestConstants.REASONING_MODEL_PATH)
59+
.setCtxSize(1024)
60+
.setGpuLayers(gpuLayers)
61+
.setFit(false)
62+
.setParallel(2));
63+
server = new OpenAiCompatServer(
64+
model,
65+
OpenAiServerConfig.builder().port(0).modelId(MODEL_ID).build())
66+
.start();
67+
port = server.getPort();
68+
}
69+
70+
@AfterAll
71+
public static void tearDown() {
72+
if (server != null) {
73+
server.close();
74+
}
75+
if (model != null) {
76+
model.close();
77+
}
78+
}
79+
80+
@Test
81+
public void nonStreamingChatReturnsValidCompletion() throws IOException {
82+
String body = "{\"model\":\"" + MODEL_ID + "\",\"max_tokens\":16,"
83+
+ "\"messages\":[{\"role\":\"user\",\"content\":\"Say hello in one word.\"}]}";
84+
Response response = post("/v1/chat/completions", body);
85+
assertThat(response.code, is(200));
86+
JsonNode json = MAPPER.readTree(response.body);
87+
assertThat(json.path("object").asText(), is("chat.completion"));
88+
assertThat(json.path("choices").size(), greaterThan(0));
89+
assertThat(json.path("choices").get(0).path("message").path("role").asText(), is("assistant"));
90+
}
91+
92+
@Test
93+
public void streamingChatEmitsChunksAndDone() throws IOException {
94+
String body = "{\"model\":\"" + MODEL_ID + "\",\"stream\":true,\"max_tokens\":16,"
95+
+ "\"messages\":[{\"role\":\"user\",\"content\":\"Say hello in one word.\"}]}";
96+
Response response = post("/v1/chat/completions", body);
97+
assertThat(response.code, is(200));
98+
assertThat(response.body, containsString("chat.completion.chunk"));
99+
assertThat(response.body, containsString("data: [DONE]"));
100+
}
101+
102+
@Test
103+
public void toolRequestRoundTripsThroughTheJinjaPath() throws IOException {
104+
// Forwards an OpenAI tools array; the mapper enables use_jinja so the native parser applies
105+
// Qwen3's tool-aware template. We assert the request is accepted and returns a structurally
106+
// valid OpenAI message (content and/or tool_calls) — not that this tiny model elects to call.
107+
String body = "{\"model\":\"" + MODEL_ID + "\",\"max_tokens\":48,"
108+
+ "\"messages\":[{\"role\":\"user\",\"content\":\"What is the weather in Paris?\"}],"
109+
+ "\"tools\":[{\"type\":\"function\",\"function\":{\"name\":\"get_weather\","
110+
+ "\"description\":\"Get the weather for a city\",\"parameters\":{\"type\":\"object\","
111+
+ "\"properties\":{\"city\":{\"type\":\"string\"}},\"required\":[\"city\"]}}}]}";
112+
Response response = post("/v1/chat/completions", body);
113+
assertThat(response.code, is(200));
114+
JsonNode message = MAPPER.readTree(response.body).path("choices").get(0).path("message");
115+
assertThat(message.isObject(), is(true));
116+
}
117+
118+
@Test
119+
public void modelsEndpointAdvertisesTheServedModel() throws IOException {
120+
Response response = get("/v1/models");
121+
assertThat(response.code, is(200));
122+
assertThat(response.body, containsString(MODEL_ID));
123+
}
124+
125+
// ----- HTTP helpers -----
126+
127+
private static Response post(String path, String body) throws IOException {
128+
HttpURLConnection conn = (HttpURLConnection) new URL("http://127.0.0.1:" + port + path).openConnection();
129+
conn.setRequestMethod("POST");
130+
conn.setDoOutput(true);
131+
conn.setRequestProperty("Content-Type", "application/json");
132+
try (OutputStream os = conn.getOutputStream()) {
133+
os.write(body.getBytes(UTF_8));
134+
}
135+
return read(conn);
136+
}
137+
138+
private static Response get(String path) throws IOException {
139+
HttpURLConnection conn = (HttpURLConnection) new URL("http://127.0.0.1:" + port + path).openConnection();
140+
conn.setRequestMethod("GET");
141+
return read(conn);
142+
}
143+
144+
private static Response read(HttpURLConnection conn) throws IOException {
145+
int code = conn.getResponseCode();
146+
InputStream is = code < 400 ? conn.getInputStream() : conn.getErrorStream();
147+
String body = is == null ? "" : readAll(is);
148+
return new Response(code, body);
149+
}
150+
151+
private static String readAll(InputStream is) throws IOException {
152+
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
153+
byte[] chunk = new byte[1024];
154+
int read;
155+
while ((read = is.read(chunk)) != -1) {
156+
buffer.write(chunk, 0, read);
157+
}
158+
return new String(buffer.toByteArray(), UTF_8);
159+
}
160+
161+
/** Minimal HTTP response holder. */
162+
private static final class Response {
163+
private final int code;
164+
private final String body;
165+
166+
Response(int code, String body) {
167+
this.code = code;
168+
this.body = body;
169+
}
170+
}
171+
}

0 commit comments

Comments
 (0)