Add CompletableFuture async wrappers for complete/chatComplete (§2.3)

claude · claude · commit 1e673a92e9c8 · 2026-05-22T22:12:24.000Z
LlamaModel gains completeAsync, chatCompleteAsync, and chatCompleteTextAsync — thin wrappers that dispatch the existing blocking methods through ForkJoinPool.commonPool(). The completeAsync(params, token) overload bridges future.cancel(true) to CancellationToken.cancel() so cancellation propagates into the inference loop. Reactive Flow.Publisher streaming (M-effort) is intentionally deferred to a follow-up; this PR delivers only the S-effort portion of §2.3. https://claude.ai/code/session_01R4ZrEy3ptJDLuUgUKuM4Gy
diff --git a/src/main/java/net/ladenthin/llama/LlamaModel.java b/src/main/java/net/ladenthin/llama/LlamaModel.java
@@ -14,6 +14,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.concurrent.CompletableFuture;
 import java.util.function.BiConsumer;
 
 /**
@@ -84,6 +85,58 @@ public String complete(InferenceParameters parameters) {
 	 * @param token cancellation handle; {@link CancellationToken#cancel()} aborts the loop
 	 * @return the text generated up to the point of stop or cancellation
 	 */
+	/**
+	 * Non-blocking counterpart of {@link #complete(InferenceParameters)}. The blocking call
+	 * is submitted via {@link CompletableFuture#supplyAsync(java.util.function.Supplier)} and
+	 * therefore runs on the common {@link java.util.concurrent.ForkJoinPool}, not the caller's
+	 * thread. Model work is still serialized by the native worker thread in the JNI context.
+	 *
+	 * @param parameters the inference configuration
+	 * @return a future that yields the generated text
+	 */
+	public CompletableFuture<String> completeAsync(InferenceParameters parameters) {
+		return CompletableFuture.supplyAsync(() -> this.complete(parameters));
+	}
+
+	/**
+	 * Cancellable non-blocking counterpart of {@link #complete(InferenceParameters, CancellationToken)}.
+	 * A completion hook on the returned future calls {@link CancellationToken#cancel()} once the
+	 * future ends with a {@link java.util.concurrent.CancellationException}, e.g. after {@code future.cancel(true)}.
+	 *
+	 * @param parameters the inference configuration
+	 * @param token cancellation handle bound to the underlying inference loop
+	 * @return a future that yields the text generated up to the point of stop or cancellation
+	 */
+	public CompletableFuture<String> completeAsync(InferenceParameters parameters, CancellationToken token) {
+		final CompletableFuture<String> pending = CompletableFuture.supplyAsync(() -> this.complete(parameters, token));
+		pending.whenComplete((ignoredText, failure) -> {
+			if (failure instanceof java.util.concurrent.CancellationException) {
+				token.cancel();
+			}
+		});
+		return pending;
+	}
+
+	/**
+	 * Non-blocking counterpart of {@link #chatComplete(InferenceParameters)}.
+	 *
+	 * @param parameters the inference parameters including messages
+	 * @return a future that yields the raw OAI-format JSON response
+	 */
+	public CompletableFuture<String> chatCompleteAsync(InferenceParameters parameters) {
+		return CompletableFuture.supplyAsync(() -> this.chatComplete(parameters));
+	}
+
+	/**
+	 * Non-blocking counterpart of {@link #chatCompleteText(InferenceParameters)}.
+	 *
+	 * @param parameters the inference parameters including messages
+	 * @return a future that yields the assistant's reply text
+	 */
+	public CompletableFuture<String> chatCompleteTextAsync(InferenceParameters parameters) {
+		return CompletableFuture.supplyAsync(() -> this.chatCompleteText(parameters));
+	}
+
 	public String complete(InferenceParameters parameters, CancellationToken token) {
 		token.reset();
 		parameters.setStream(true);
diff --git a/src/test/java/net/ladenthin/llama/LlamaModelTest.java b/src/test/java/net/ladenthin/llama/LlamaModelTest.java
@@ -278,6 +278,42 @@ public void testCompleteWithCancellationToken() throws Exception {
 		Assert.assertNotNull(model.complete(new InferenceParameters(prefix).setNPredict(3)));
 	}
 
+	/**
+	 * Regression: the future returned by {@link LlamaModel#completeAsync(InferenceParameters)}
+	 * must yield exactly the text that the blocking
+	 * {@link LlamaModel#complete(InferenceParameters)} call returns for the same prompt and seed.
+	 */
+	@Test
+	public void testCompleteAsync() throws Exception {
+		InferenceParameters asyncParams = new InferenceParameters(prefix).setNPredict(8).setSeed(42);
+		String expected = model.complete(new InferenceParameters(prefix).setNPredict(8).setSeed(42));
+		String actual = model.completeAsync(asyncParams).get(30, java.util.concurrent.TimeUnit.SECONDS);
+		Assert.assertEquals(expected, actual);
+	}
+
+	/**
+	 * Regression: cancelling the future from {@link LlamaModel#completeAsync(InferenceParameters, CancellationToken)}
+	 * must propagate to the underlying inference loop via the token.
+	 */
+	@Test
+	public void testCompleteAsyncCancelPropagates() throws Exception {
+		InferenceParameters params = new InferenceParameters(prefix).setNPredict(512);
+		CancellationToken token = new CancellationToken();
+		java.util.concurrent.CompletableFuture<String> future = model.completeAsync(params, token);
+
+		Thread.sleep(200);
+		future.cancel(true);
+
+		// Poll for up to ~1 s (50 x 20 ms) in case the token flips slightly after cancel() returns.
+		for (int i = 0; i < 50 && !token.isCancelled(); i++) {
+			Thread.sleep(20);
+		}
+		Assert.assertTrue("cancel(true) on the future should flip the token", token.isCancelled());
+
+		// Model is still usable
+		Assert.assertNotNull(model.complete(new InferenceParameters(prefix).setNPredict(3)));
+	}
+
 	@Test
 	public void testEmbedding() {
 		float[] embedding = model.embed(prefix);