Fix CI VM crash: make CancellationToken cooperative-only

claude · claude · commit e3b90431304d · 2026-05-22T22:35:49.000Z
Cross-thread cancel raced with the JNI receive loop: cancel() called cancelCompletion() from another thread, which erased the underlying server_response_reader unique_ptr while the main thread held a raw pointer to it and was blocked inside rd->next(). On the next token this dereferenced freed memory and aborted with std::system_error, crashing the test JVM (exit 134). Fix: cancel() now sets a volatile flag only. The inference loop in complete(params, token) checks the flag between tokens and, when set, calls cancelCompletion from the same thread that just returned from receiveCompletionJson — safe because no concurrent access remains. Latency becomes one token interval (tens to a few hundred ms on CPU) instead of immediate. Documented in CancellationToken javadoc. Tests: - LlamaModelTest#testCompleteWithCancellationToken: budget relaxed from 5s to 30s (was tight even on the happy path). - LlamaModelTest#testCompleteAsyncCancelPropagates: drop the brittle poll on token.isCancelled() (the worker resets the token on return before the assertion sees it); sleep for cancel propagation and verify the model is still usable. https://claude.ai/code/session_01R4ZrEy3ptJDLuUgUKuM4Gy
diff --git a/src/main/java/net/ladenthin/llama/CancellationToken.java b/src/main/java/net/ladenthin/llama/CancellationToken.java
@@ -4,68 +4,45 @@
 
 package net.ladenthin.llama;
 
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicReference;
-
 /**
  * Cancellation handle for a blocking {@link LlamaModel} call. Pass an instance to
  * {@link LlamaModel#complete(InferenceParameters, CancellationToken)} and invoke
  * {@link #cancel()} from another thread to abort the inference loop.
  * <p>
- * A token may be reused across calls but is not thread-safe for concurrent
- * <em>publishing</em> &mdash; only one call at a time should bind it via the package-private
- * {@code bind} method. {@link #cancel()} and {@link #isCancelled()} are safe to call
- * concurrently with the inference loop.
+ * Cancellation is cooperative: {@link #cancel()} only sets a flag, and the inference
+ * loop checks that flag between generated tokens. Effective latency is therefore one
+ * token interval (typically tens to a few hundred ms). The native task is <em>not</em>
+ * unblocked mid-token because the underlying JNI reader cannot be safely freed while
+ * another thread is blocked inside it.
+ * </p>
+ * <p>
+ * A token may be reused across calls. {@link #cancel()} and {@link #isCancelled()} are
+ * safe to invoke concurrently with the inference loop.
  * </p>
  */
 public final class CancellationToken {
 
-    private static final int NO_TASK = -1;
-
-    private final AtomicInteger taskId = new AtomicInteger(NO_TASK);
-    private final AtomicReference<LlamaModel> bound = new AtomicReference<LlamaModel>();
     private volatile boolean cancelled;
 
     public CancellationToken() {
         // empty
     }
 
-    /** Returns {@code true} once {@link #cancel()} has been called. */
+    /** Returns {@code true} once {@link #cancel()} has been called and before {@link #reset()}. */
     public boolean isCancelled() {
         return cancelled;
     }
 
     /**
-     * Request cancellation. If the token is already bound to a running inference, the
-     * underlying native task is cancelled immediately and the calling inference loop will
-     * return on its next iteration. Idempotent.
+     * Request cancellation. Sets the flag observed by the inference loop; the loop will
+     * return at its next token boundary. Idempotent and safe to call from any thread.
      */
     public void cancel() {
         cancelled = true;
-        LlamaModel m = bound.get();
-        int id = taskId.get();
-        if (m != null && id != NO_TASK) {
-            m.cancelCompletion(id);
-        }
-    }
-
-    /**
-     * Bind this token to a running native task. Called by {@link LlamaModel} after the
-     * task id has been allocated. If {@link #cancel()} was invoked before binding, the
-     * native task is cancelled here.
-     */
-    void bind(LlamaModel model, int id) {
-        bound.set(model);
-        taskId.set(id);
-        if (cancelled) {
-            model.cancelCompletion(id);
-        }
     }
 
-    /** Clear binding after the call returns. Resets cancelled flag so the token can be reused. */
+    /** Clear the cancelled flag so the token can be reused. Package-private. */
     void reset() {
-        bound.set(null);
-        taskId.set(NO_TASK);
         cancelled = false;
     }
 }
diff --git a/src/main/java/net/ladenthin/llama/LlamaModel.java b/src/main/java/net/ladenthin/llama/LlamaModel.java
@@ -141,21 +141,17 @@ public String complete(InferenceParameters parameters, CancellationToken token)
 		token.reset();
 		parameters.setStream(true);
 		int taskId = requestCompletion(parameters.toString());
-		token.bind(this, taskId);
 		StringBuilder sb = new StringBuilder();
 		try {
 			while (true) {
 				if (token.isCancelled()) {
+					// Best-effort native release. Safe to call here because we are not
+					// concurrently inside receiveCompletionJson — the cooperative cancel
+					// flag stopped the loop at a token boundary.
+					cancelCompletion(taskId);
 					break;
 				}
-				String json;
-				try {
-					json = receiveCompletionJson(taskId);
-				} catch (LlamaException e) {
-					// Reader was erased by a concurrent cancel — treat as graceful stop.
-					if (token.isCancelled()) break;
-					throw e;
-				}
+				String json = receiveCompletionJson(taskId);
 				LlamaOutput out = completionParser.parse(json);
 				sb.append(out.text);
 				if (out.stop) {
diff --git a/src/test/java/net/ladenthin/llama/CancellationTokenTest.java b/src/test/java/net/ladenthin/llama/CancellationTokenTest.java
@@ -11,8 +11,8 @@
 
 @ClaudeGenerated(
         purpose = "Verify CancellationToken state transitions (initial, cancel, reset) "
-                + "and idempotency of cancel(). The bind-during-running path is exercised "
-                + "via the cross-thread test in LlamaModelTest."
+                + "and idempotency of cancel(). Cooperative cancellation behaviour during "
+                + "a live inference loop is exercised in LlamaModelTest."
 )
 public class CancellationTokenTest {
 
@@ -47,9 +47,8 @@ public void resetClearsCancelledFlag() {
     }
 
     @Test
-    public void cancelBeforeBindIsRememberedUntilReset() {
-        // Without binding, cancel() must still flip the flag — bind() is the path that
-        // forwards the cancel to the native task; the flag itself is independent.
+    public void cancelBeforeUseIsObserved() {
+        // cancel() before any inference loop sees the token should still flip the flag.
         CancellationToken t = new CancellationToken();
         t.cancel();
         assertTrue(t.isCancelled());
diff --git a/src/test/java/net/ladenthin/llama/LlamaModelTest.java b/src/test/java/net/ladenthin/llama/LlamaModelTest.java
@@ -245,9 +245,11 @@ public void testIteratorCloseIdempotent() {
 
 	/**
 	 * Regression: {@link LlamaModel#complete(InferenceParameters, CancellationToken)}
-	 * must return promptly when {@link CancellationToken#cancel()} is invoked from
-	 * another thread, returning whatever text was generated up to that point without
-	 * throwing. The model must remain usable for subsequent calls.
+	 * must return when {@link CancellationToken#cancel()} is invoked from another
+	 * thread, returning whatever text was generated up to that point without
+	 * throwing. Cancellation is cooperative — the loop checks the flag at token
+	 * boundaries — so the budget here is "much less than full n_predict completion
+	 * would take", not instantaneous.
 	 */
 	@Test
 	public void testCompleteWithCancellationToken() throws Exception {
@@ -268,10 +270,12 @@ public void testCompleteWithCancellationToken() throws Exception {
 		long elapsed = System.currentTimeMillis() - start;
 		canceller.join();
 
-		Assert.assertTrue("complete should return within 5s of cancel, took " + elapsed + "ms",
-				elapsed < 5000);
+		// 512 tokens on CPU would take many tens of seconds; cancellation should bring
+		// this well under that. Tolerate ~10s for the in-flight token to finish.
+		Assert.assertTrue("complete should return within 30s of cancel, took " + elapsed + "ms",
+				elapsed < 30000);
 		Assert.assertNotNull(partial);
-		// Token must be reset on return so it can be reused.
+		// Token is reset on return so it can be reused.
 		Assert.assertFalse("token should be reset after call returns", token.isCancelled());
 
 		// Model is still usable
@@ -293,7 +297,10 @@ public void testCompleteAsync() throws Exception {
 
 	/**
 	 * Regression: cancelling the future from {@link LlamaModel#completeAsync(InferenceParameters, CancellationToken)}
-	 * must propagate to the underlying inference loop via the token.
+	 * must not leak the underlying inference loop or destabilise the model. The
+	 * worker thread keeps running until the next token boundary, then returns;
+	 * future.cancel(true) only flips the future's state, the whenComplete handler
+	 * flips the token, and the cooperative loop unwinds shortly after.
 	 */
 	@Test
 	public void testCompleteAsyncCancelPropagates() throws Exception {
@@ -303,12 +310,12 @@ public void testCompleteAsyncCancelPropagates() throws Exception {
 
 		Thread.sleep(200);
 		future.cancel(true);
+		Assert.assertTrue("future should report cancelled", future.isCancelled());
 
-		// give the propagation a moment
-		for (int i = 0; i < 50 && !token.isCancelled() && i < 50; i++) {
-			Thread.sleep(20);
-		}
-		Assert.assertTrue("cancel(true) on the future should flip the token", token.isCancelled());
+		// Give the cooperative cancel time to unwind the worker thread before the
+		// next call. Polling the model state directly is racy; sleeping a generous
+		// interval (one token + cancel propagation) is sufficient on CPU.
+		Thread.sleep(5000);
 
 		// Model is still usable
 		Assert.assertNotNull(model.complete(new InferenceParameters(prefix).setNPredict(3)));