Add LlamaPublisher reactive-streams token publisher (§2.3 follow-up)

claude · claude · commit afa4f6529901 · 2026-05-23T07:47:25.000Z
Backpressure-aware Publisher<LlamaOutput> on top of the existing streaming iterator. Reactor / RxJava / Kotlin coroutines all bridge to the Reactive Streams interface natively, so consumers wrap with Flux.from(...) / Flowable.fromPublisher(...) / asFlow() in one line.

LlamaPublisher:
- Single-subscriber (second subscribe signals onError per RS spec).
- Each subscribe starts a dedicated emitter daemon thread.
- Demand honoured via AtomicLong + monitor: emitter blocks while demand == 0 and only calls iterator.next() when demand > 0.
- request(n <= 0) signals onError with IllegalArgumentException per reactive-streams §3.9.
- cancel() closes the underlying iterator (cooperative, same path as LlamaIterator.close); idempotent.
- onComplete fires on stop token, onError on any throwable from the iterator path.

LlamaModel:
- streamPublisher(InferenceParameters) and streamChatPublisher(InferenceParameters) factories.

Dependency: adds org.reactivestreams:reactive-streams 1.0.4 (~5 KB, Java 8 compatible) to pom.xml.

Tests in LlamaPublisherTest:
- nullSubscriberThrows (model-free).
- backpressureAndCancel, singleSubscriberContract, invalidRequestSignalsError (model-gated).

mvn javadoc:jar BUILD SUCCESS, no new warnings.

https://claude.ai/code/session_01R4ZrEy3ptJDLuUgUKuM4Gy
diff --git a/pom.xml b/pom.xml
@@ -73,6 +73,14 @@ SPDX-License-Identifier: MIT
 			<artifactId>jackson-databind</artifactId>
 			<version>2.21.3</version>
 		</dependency>
+		<!-- Reactive Streams API used by LlamaPublisher to expose token streams as a
+		     Publisher<LlamaOutput>. Java 8 compatible, ~5 KB, supplies the standard
+		     interfaces that Reactor / RxJava / Kotlin coroutines bridge to. -->
+		<dependency>
+			<groupId>org.reactivestreams</groupId>
+			<artifactId>reactive-streams</artifactId>
+			<version>1.0.4</version>
+		</dependency>
 		<!-- Required by OSInfo (vendored from xerial/sqlite-jdbc) for log emission. -->
 		<dependency>
 			<groupId>org.slf4j</groupId>
diff --git a/src/main/java/net/ladenthin/llama/LlamaModel.java b/src/main/java/net/ladenthin/llama/LlamaModel.java
@@ -181,6 +181,30 @@ public java.util.List<ChatResponse> chatBatch(java.util.List<ChatRequest> reques
 		return out;
 	}
 
+	/**
+	 * Exposes {@link #generate(InferenceParameters)} as a Reactive Streams
+	 * {@link org.reactivestreams.Publisher} of {@link LlamaOutput} tokens. The returned
+	 * publisher accepts a single subscriber; subscribing starts the streaming inference
+	 * on a dedicated background thread, and tokens are only emitted against demand
+	 * signalled through {@code request(n)}. Call
+	 * {@link org.reactivestreams.Subscription#cancel()} to stop the inference early.
+	 *
+	 * @param parameters the inference configuration
+	 * @return a single-subscriber {@link org.reactivestreams.Publisher} of tokens
+	 */
+	public LlamaPublisher streamPublisher(InferenceParameters parameters) {
+		// Plain completion mode: the publisher will stream from generate(...).
+		final boolean chatMode = false;
+		return new LlamaPublisher(this, parameters, chatMode);
+	}
+
+	/**
+	 * Exposes {@link #generateChat(InferenceParameters)} as a Reactive Streams
+	 * {@link org.reactivestreams.Publisher} of {@link LlamaOutput} tokens.
+	 *
+	 * @param parameters the inference parameters including messages
+	 * @return a single-subscriber {@link org.reactivestreams.Publisher} of tokens
+	 */
+	public LlamaPublisher streamChatPublisher(InferenceParameters parameters) {
+		// Chat mode: the publisher will stream from generateChat(...).
+		final boolean chatMode = true;
+		return new LlamaPublisher(this, parameters, chatMode);
+	}
+
 	/**
 	 * Asynchronous variant of {@link #complete(InferenceParameters)}. Runs the inference on
 	 * the common {@link java.util.concurrent.ForkJoinPool} so it does not block the calling
diff --git a/src/main/java/net/ladenthin/llama/LlamaPublisher.java b/src/main/java/net/ladenthin/llama/LlamaPublisher.java
@@ -0,0 +1,175 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama;
+
+import org.reactivestreams.Publisher;
+import org.reactivestreams.Subscriber;
+import org.reactivestreams.Subscription;
+
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+
+/**
+ * Reactive Streams {@link Publisher} that emits {@link LlamaOutput} tokens from a
+ * llama.cpp streaming completion. Bridges to Reactor / RxJava / Kotlin coroutines via
+ * the standard {@code reactive-streams} interface.
+ * <p>
+ * The first {@link #subscribe(Subscriber)} starts an inference task on a dedicated
+ * daemon thread and honours {@code Subscription.request(n)} for backpressure: the
+ * emitter only calls {@code iterator.next()} while there is outstanding demand. Once
+ * the subscription is active, every {@code onNext}, {@code onComplete} and
+ * {@code onError} is issued from that emitter thread. When the iterator's stop token
+ * arrives the publisher calls {@code onComplete}; on cancellation it closes the
+ * iterator and stops emitting.
+ * </p>
+ * <p>
+ * Construct via {@link LlamaModel#streamPublisher(InferenceParameters)} or
+ * {@link LlamaModel#streamChatPublisher(InferenceParameters)}. The publisher is
+ * single-subscriber: a second {@link #subscribe(Subscriber)} call signals
+ * {@code onError(IllegalStateException)}.
+ * </p>
+ */
+public final class LlamaPublisher implements Publisher<LlamaOutput> {
+
+    private final LlamaModel model;
+    private final InferenceParameters parameters;
+    private final boolean chat;
+    private final AtomicBoolean subscribed = new AtomicBoolean(false);
+
+    LlamaPublisher(LlamaModel model, InferenceParameters parameters, boolean chat) {
+        this.model = model;
+        this.parameters = parameters;
+        this.chat = chat;
+    }
+
+    /**
+     * Starts the inference for the first subscriber and rejects any later one with
+     * {@code onError(IllegalStateException)}. A {@link RuntimeException} raised while
+     * creating the token stream is reported through {@code onError} rather than thrown.
+     *
+     * @param subscriber the consumer of the token stream
+     * @throws NullPointerException if {@code subscriber} is {@code null}
+     */
+    @Override
+    public void subscribe(Subscriber<? super LlamaOutput> subscriber) {
+        if (subscriber == null) {
+            throw new NullPointerException("subscriber");
+        }
+        if (!subscribed.compareAndSet(false, true)) {
+            EmptySubscription.signalError(subscriber,
+                    new IllegalStateException("LlamaPublisher is single-subscriber; already subscribed"));
+            return;
+        }
+        final LlamaIterable iterable;
+        try {
+            iterable = chat ? model.generateChat(parameters) : model.generate(parameters);
+        } catch (RuntimeException e) {
+            // Reactive Streams §1.9: subscribe must return normally, so report via onError.
+            EmptySubscription.signalError(subscriber, e);
+            return;
+        }
+        LlamaSubscription sub = new LlamaSubscription(iterable, subscriber);
+        subscriber.onSubscribe(sub);
+        sub.start();
+    }
+
+    /** Subscription that honours backpressure and pumps tokens on a dedicated thread. */
+    private static final class LlamaSubscription implements Subscription {
+        private final LlamaIterable iterable;
+        private final Subscriber<? super LlamaOutput> subscriber;
+        private final AtomicLong demand = new AtomicLong(0);
+        private final AtomicBoolean cancelled = new AtomicBoolean(false);
+        private final AtomicBoolean started = new AtomicBoolean(false);
+        private final Object monitor = new Object();
+        /** Pending rejection of a {@code request(n <= 0)}; delivered by the emitter thread. */
+        private volatile Throwable invalidRequest;
+
+        LlamaSubscription(LlamaIterable iterable, Subscriber<? super LlamaOutput> subscriber) {
+            this.iterable = iterable;
+            this.subscriber = subscriber;
+        }
+
+        /** Launches the emitter daemon thread; repeated calls are ignored. */
+        void start() {
+            if (!started.compareAndSet(false, true)) {
+                return;
+            }
+            Thread worker = new Thread(this::pump, "LlamaPublisher-emitter");
+            worker.setDaemon(true);
+            worker.start();
+        }
+
+        @Override
+        public void request(long n) {
+            if (n <= 0) {
+                // Only recorded here: the emitter thread delivers it, so onError can neither
+                // overlap an in-flight onNext nor be repeated or follow a terminal signal.
+                if (invalidRequest == null) {
+                    invalidRequest = new IllegalArgumentException(
+                            "reactive-streams §3.9: request must be > 0, got " + n);
+                    // Stop the inference right away, exactly as cancel() would.
+                    closeQuietly();
+                }
+            } else {
+                // Saturating add: an overflowing sum clamps to Long.MAX_VALUE.
+                demand.updateAndGet(cur -> cur + n < 0 ? Long.MAX_VALUE : cur + n);
+            }
+            wakeEmitter();
+        }
+
+        @Override
+        public void cancel() {
+            if (cancelled.compareAndSet(false, true)) {
+                closeQuietly();
+                wakeEmitter();
+            }
+        }
+
+        /** Emitter loop: one token per unit of demand, then at most one terminal signal. */
+        private void pump() {
+            Throwable failure = null;
+            try {
+                // Skip creating the iterator when the subscriber already cancelled or sent
+                // an invalid request from within onSubscribe.
+                if (active()) {
+                    LlamaIterator iterator = iterable.iterator();
+                    while (active() && iterator.hasNext()) {
+                        if (!awaitDemand()) {
+                            break;
+                        }
+                        LlamaOutput next = iterator.next();
+                        demand.decrementAndGet();
+                        subscriber.onNext(next);
+                        if (next.stop) {
+                            break;
+                        }
+                    }
+                }
+            } catch (Throwable t) {
+                failure = t;
+            } finally {
+                closeQuietly();
+            }
+            if (cancelled.get()) {
+                return; // a cancelled subscription gets no terminal signal
+            }
+            // A rejected request wins: an iterator failure seen at this point may merely be
+            // a consequence of the close performed by request(n <= 0).
+            Throwable rejected = invalidRequest;
+            Throwable error = rejected != null ? rejected : failure;
+            try {
+                if (error != null) {
+                    subscriber.onError(error);
+                } else {
+                    subscriber.onComplete();
+                }
+            } catch (Throwable ignored) {
+                // subscriber threw from a terminal signal; nothing more we can do
+            }
+        }
+
+        /** Returns true while neither a cancel nor an invalid request has been seen. */
+        private boolean active() {
+            return !cancelled.get() && invalidRequest == null;
+        }
+
+        /** Blocks until demand exists; returns false if emission has to stop instead. */
+        private boolean awaitDemand() {
+            boolean interrupted = false;
+            synchronized (monitor) {
+                while (demand.get() == 0 && active()) {
+                    try {
+                        monitor.wait();
+                    } catch (InterruptedException e) {
+                        interrupted = true;
+                        break;
+                    }
+                }
+            }
+            if (interrupted) {
+                // Restore the flag and treat the interrupt as a cancellation; done outside
+                // the monitor so the iterable is not closed while holding the lock.
+                Thread.currentThread().interrupt();
+                cancel();
+            }
+            return active();
+        }
+
+        /** Wakes the emitter if it is parked waiting for demand. */
+        private void wakeEmitter() {
+            synchronized (monitor) {
+                monitor.notifyAll();
+            }
+        }
+
+        /** Closes the underlying iterable, ignoring any failure. */
+        private void closeQuietly() {
+            try {
+                iterable.close();
+            } catch (Throwable ignored) {
+                // best-effort
+            }
+        }
+    }
+
+    /** No-op subscription used to signal onError on rejected subscriptions. */
+    private static final class EmptySubscription implements Subscription {
+        @Override public void request(long n) { }
+        @Override public void cancel() { }
+
+        /** Hands the subscriber a no-op subscription, then immediately fails it. */
+        static void signalError(Subscriber<?> subscriber, Throwable error) {
+            subscriber.onSubscribe(new EmptySubscription());
+            subscriber.onError(error);
+        }
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/LlamaPublisherTest.java b/src/test/java/net/ladenthin/llama/LlamaPublisherTest.java
@@ -0,0 +1,153 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama;
+
+import org.junit.Assume;
+import org.junit.Test;
+import org.reactivestreams.Subscriber;
+import org.reactivestreams.Subscription;
+
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+@ClaudeGenerated(
+        purpose = "Verify LlamaPublisher honours Reactive Streams contracts: backpressure via request(n), "
+                + "stops on cancel, signals onError for invalid demand, and rejects a second subscriber."
+)
+public class LlamaPublisherTest {
+
+    /**
+     * Model-gated: subscribe, request a small batch with backpressure, observe tokens, cancel early.
+     */
+    @Test
+    public void backpressureAndCancel() throws Exception {
+        Assume.assumeTrue("Model file not found", new java.io.File(TestConstants.MODEL_PATH).exists());
+        int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
+
+        try (LlamaModel model = new LlamaModel(new ModelParameters()
+                .setCtxSize(128)
+                .setModel(TestConstants.MODEL_PATH)
+                .setGpuLayers(gpuLayers)
+                .setFit(false))) {
+
+            LlamaPublisher pub = model.streamPublisher(
+                    new InferenceParameters("def hello():").setNPredict(20).setSeed(1));
+
+            CountDownLatch done = new CountDownLatch(1);
+            AtomicReference<Subscription> subRef = new AtomicReference<>();
+            AtomicInteger received = new AtomicInteger();
+
+            pub.subscribe(new Subscriber<LlamaOutput>() {
+                @Override public void onSubscribe(Subscription s) {
+                    subRef.set(s);
+                    s.request(2); // initial demand
+                }
+                @Override public void onNext(LlamaOutput o) {
+                    int n = received.incrementAndGet();
+                    if (n == 2) {
+                        // Verify backpressure: with demand=0 we should pause until next request.
+                        // Request one more to trigger another emission.
+                        subRef.get().request(1);
+                    } else if (n == 3) {
+                        // Cancel after the third token; subsequent onNext must not occur.
+                        subRef.get().cancel();
+                        done.countDown();
+                    }
+                }
+                @Override public void onError(Throwable t) { done.countDown(); }
+                @Override public void onComplete() { done.countDown(); }
+            });
+
+            assertTrue("subscriber did not terminate in 30s", done.await(30, TimeUnit.SECONDS));
+            // After cancel we may receive 3-4 in-flight tokens; should not be far above the
+            // demand actually requested (3 here).
+            int got = received.get();
+            assertTrue("expected ~3 tokens, got " + got, got >= 3 && got <= 6);
+        }
+    }
+
+    @Test
+    public void singleSubscriberContract() throws Exception {
+        Assume.assumeTrue("Model file not found", new java.io.File(TestConstants.MODEL_PATH).exists());
+        int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
+
+        try (LlamaModel model = new LlamaModel(new ModelParameters()
+                .setCtxSize(128)
+                .setModel(TestConstants.MODEL_PATH)
+                .setGpuLayers(gpuLayers)
+                .setFit(false))) {
+
+            LlamaPublisher pub = model.streamPublisher(
+                    new InferenceParameters("def f():").setNPredict(2).setSeed(1));
+
+            CountDownLatch first = new CountDownLatch(1);
+            pub.subscribe(new Subscriber<LlamaOutput>() {
+                @Override public void onSubscribe(Subscription s) { s.request(Long.MAX_VALUE); }
+                @Override public void onNext(LlamaOutput o) { }
+                @Override public void onError(Throwable t) { first.countDown(); }
+                @Override public void onComplete() { first.countDown(); }
+            });
+            assertTrue(first.await(30, TimeUnit.SECONDS));
+
+            // Second subscribe must signal onError.
+            AtomicReference<Throwable> err = new AtomicReference<>();
+            CountDownLatch second = new CountDownLatch(1);
+            pub.subscribe(new Subscriber<LlamaOutput>() {
+                @Override public void onSubscribe(Subscription s) { }
+                @Override public void onNext(LlamaOutput o) { }
+                @Override public void onError(Throwable t) { err.set(t); second.countDown(); }
+                @Override public void onComplete() { second.countDown(); }
+            });
+            assertTrue(second.await(5, TimeUnit.SECONDS));
+            assertNotNull("expected onError on second subscribe", err.get());
+            assertTrue(err.get() instanceof IllegalStateException);
+        }
+    }
+
+    @Test
+    public void invalidRequestSignalsError() throws Exception {
+        Assume.assumeTrue("Model file not found", new java.io.File(TestConstants.MODEL_PATH).exists());
+        int gpuLayers = Integer.getInteger(TestConstants.PROP_TEST_NGL, TestConstants.DEFAULT_TEST_NGL);
+
+        try (LlamaModel model = new LlamaModel(new ModelParameters()
+                .setCtxSize(128)
+                .setModel(TestConstants.MODEL_PATH)
+                .setGpuLayers(gpuLayers)
+                .setFit(false))) {
+
+            LlamaPublisher pub = model.streamPublisher(
+                    new InferenceParameters("def f():").setNPredict(5).setSeed(1));
+
+            AtomicReference<Throwable> err = new AtomicReference<>();
+            CountDownLatch done = new CountDownLatch(1);
+            pub.subscribe(new Subscriber<LlamaOutput>() {
+                @Override public void onSubscribe(Subscription s) { s.request(0); }
+                @Override public void onNext(LlamaOutput o) { }
+                @Override public void onError(Throwable t) { err.set(t); done.countDown(); }
+                @Override public void onComplete() { done.countDown(); }
+            });
+            assertTrue(done.await(10, TimeUnit.SECONDS));
+            assertNotNull("expected onError for request(0)", err.get());
+            assertTrue(err.get() instanceof IllegalArgumentException);
+        }
+    }
+
+    @Test
+    public void nullSubscriberThrows() {
+        // Construct a publisher without a model — subscribe(null) must NPE before any model use.
+        try {
+            new LlamaPublisher(null, null, false).subscribe(null);
+            org.junit.Assert.fail("expected NPE");
+        } catch (NullPointerException expected) {
+            assertEquals("subscriber", expected.getMessage());
+        }
+    }
+}