Skip to content

Commit 1e673a9

Browse files
committed
Add CompletableFuture async wrappers for complete/chatComplete (§2.3)
LlamaModel gains completeAsync, chatCompleteAsync, and chatCompleteTextAsync — thin wrappers that dispatch the existing blocking methods through ForkJoinPool.commonPool(). The completeAsync(params, token) overload bridges future.cancel(true) to CancellationToken.cancel() so cancellation propagates into the inference loop. Reactive Flow.Publisher streaming (M-effort) is intentionally deferred to a follow-up; this PR delivers only the S-effort portion of §2.3. https://claude.ai/code/session_01R4ZrEy3ptJDLuUgUKuM4Gy
1 parent ad66e3a commit 1e673a9

2 files changed

Lines changed: 89 additions & 0 deletions

File tree

src/main/java/net/ladenthin/llama/LlamaModel.java

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import java.util.HashMap;
1515
import java.util.List;
1616
import java.util.Map;
17+
import java.util.concurrent.CompletableFuture;
1718
import java.util.function.BiConsumer;
1819

1920
/**
@@ -84,6 +85,58 @@ public String complete(InferenceParameters parameters) {
8485
* @param token cancellation handle; {@link CancellationToken#cancel()} aborts the loop
8586
* @return the text generated up to the point of stop or cancellation
8687
*/
88+
/**
89+
* Asynchronous variant of {@link #complete(InferenceParameters)}. Runs the inference on
90+
* the common {@link java.util.concurrent.ForkJoinPool} so it does not block the calling
91+
* thread. The native worker thread inside the JNI context still serializes the actual
92+
* model work — this wrapper only moves the blocking Java call off the caller.
93+
*
94+
* @param parameters the inference configuration
95+
* @return a future completed with the generated text
96+
*/
97+
public CompletableFuture<String> completeAsync(InferenceParameters parameters) {
98+
return CompletableFuture.supplyAsync(() -> complete(parameters));
99+
}
100+
101+
/**
102+
* Cancellable async variant. The returned future is wired to the supplied
103+
* {@link CancellationToken}: calling {@code future.cancel(true)} also invokes
104+
* {@link CancellationToken#cancel()} so the inference loop returns early.
105+
*
106+
* @param parameters the inference configuration
107+
* @param token cancellation handle bound to the underlying inference loop
108+
* @return a future completed with whatever text was generated up to the point of stop or cancellation
109+
*/
110+
public CompletableFuture<String> completeAsync(InferenceParameters parameters, CancellationToken token) {
111+
CompletableFuture<String> future = CompletableFuture.supplyAsync(() -> complete(parameters, token));
112+
future.whenComplete((result, ex) -> {
113+
if (ex instanceof java.util.concurrent.CancellationException) {
114+
token.cancel();
115+
}
116+
});
117+
return future;
118+
}
119+
120+
/**
121+
* Asynchronous variant of {@link #chatComplete(InferenceParameters)}.
122+
*
123+
* @param parameters the inference parameters including messages
124+
* @return a future completed with the raw OAI-format JSON response
125+
*/
126+
public CompletableFuture<String> chatCompleteAsync(InferenceParameters parameters) {
127+
return CompletableFuture.supplyAsync(() -> chatComplete(parameters));
128+
}
129+
130+
/**
131+
* Asynchronous variant of {@link #chatCompleteText(InferenceParameters)}.
132+
*
133+
* @param parameters the inference parameters including messages
134+
* @return a future completed with the assistant's reply text
135+
*/
136+
public CompletableFuture<String> chatCompleteTextAsync(InferenceParameters parameters) {
137+
return CompletableFuture.supplyAsync(() -> chatCompleteText(parameters));
138+
}
139+
87140
public String complete(InferenceParameters parameters, CancellationToken token) {
88141
token.reset();
89142
parameters.setStream(true);

src/test/java/net/ladenthin/llama/LlamaModelTest.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,42 @@ public void testCompleteWithCancellationToken() throws Exception {
278278
Assert.assertNotNull(model.complete(new InferenceParameters(prefix).setNPredict(3)));
279279
}
280280

281+
/**
282+
* Regression: {@link LlamaModel#completeAsync(InferenceParameters)} must
283+
* complete with the same text {@link LlamaModel#complete(InferenceParameters)}
284+
* would have produced, on a background thread.
285+
*/
286+
@Test
287+
public void testCompleteAsync() throws Exception {
288+
InferenceParameters params = new InferenceParameters(prefix).setNPredict(8).setSeed(42);
289+
String sync = model.complete(new InferenceParameters(prefix).setNPredict(8).setSeed(42));
290+
String async = model.completeAsync(params).get(30, java.util.concurrent.TimeUnit.SECONDS);
291+
Assert.assertEquals(sync, async);
292+
}
293+
294+
/**
295+
* Regression: cancelling the future from {@link LlamaModel#completeAsync(InferenceParameters, CancellationToken)}
296+
* must propagate to the underlying inference loop via the token.
297+
*/
298+
@Test
299+
public void testCompleteAsyncCancelPropagates() throws Exception {
300+
InferenceParameters params = new InferenceParameters(prefix).setNPredict(512);
301+
CancellationToken token = new CancellationToken();
302+
java.util.concurrent.CompletableFuture<String> future = model.completeAsync(params, token);
303+
304+
Thread.sleep(200);
305+
future.cancel(true);
306+
307+
// give the propagation a moment
308+
for (int i = 0; i < 50 && !token.isCancelled() && i < 50; i++) {
309+
Thread.sleep(20);
310+
}
311+
Assert.assertTrue("cancel(true) on the future should flip the token", token.isCancelled());
312+
313+
// Model is still usable
314+
Assert.assertNotNull(model.complete(new InferenceParameters(prefix).setNPredict(3)));
315+
}
316+
281317
@Test
282318
public void testEmbedding() {
283319
float[] embedding = model.embed(prefix);

0 commit comments

Comments
 (0)