From f3ee50c36669a401b26c3b2ec19964dd8f580bf2 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 18 Jun 2026 22:04:32 +0000
Subject: [PATCH 1/3] CI: produce a jar-with-dependencies uber JAR as a build
 artifact

Add a managed maven-assembly-plugin (3.8.0) and an `assembly` profile
that builds llama-<version>-jar-with-dependencies.jar: the library
classes, all Java runtime dependencies, and the default-platform native
libs from src/main/resources in one drop-on-classpath JAR (no
Main-Class - it is a library). Activate it in the package job
(-P release,cuda,opencl-android,assembly) so the uber JAR rides along in
the existing `llama-jars` upload-artifact (a CI run artifact only, not a
Maven Central / GitHub-Release asset). Document the command in CLAUDE.md.

Recorded as deliberate cross-repo non-parity (BAF + jllama only) in
workspace/crossrepostatus.md.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01UZbmBX5CjqVwPcaTS61im6
---
 .github/workflows/publish.yml |  8 ++++++-
 CLAUDE.md                     |  1 +
 pom.xml                       | 40 +++++++++++++++++++++++++++++++++++
 3 files changed, 48 insertions(+), 1 deletion(-)
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 973042f4..cc160814 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -818,7 +818,13 @@ jobs:
           distribution: 'temurin'
           java-version: ${{ env.JAVA_VERSION }}
       - name: Build JARs
-        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android -Dmaven.test.skip=true -Dgpg.skip=true package
+        # `assembly` additionally produces the fat jar-with-dependencies uber JAR
+        # (llama-<version>-jar-with-dependencies.jar: library classes + Java runtime deps +
+        # default-platform native libs in one drop-on-classpath JAR; no Main-Class - it is a
+        # library). It lands in target/ and is uploaded in the `llama-jars` artifact below - a
+        # CI run artifact only, NOT a Maven Central / GitHub-Release asset. Documented as
+        # deliberate non-parity (BAF + jllama only) in workspace/crossrepostatus.md.
+        run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,assembly -Dmaven.test.skip=true -Dgpg.skip=true package
       - name: Upload JARs
         uses: actions/upload-artifact@v7
         with:
diff --git a/CLAUDE.md b/CLAUDE.md
index af16e6f1..0214dc6b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -229,6 +229,7 @@ For the full record of upstream API breaks across version ranges (b5022 &#x2192;
 mvn compile          # Compiles Java and generates JNI headers
 mvn test             # Run all tests (requires native library and model files)
 mvn package          # Build JAR
+mvn -P assembly package  # Also build the fat jar-with-dependencies uber JAR (library + Java deps + native libs); CI builds it and uploads it in the `llama-jars` artifact
 mvn test -Dtest=LlamaModelTest#testGenerate  # Run a single test method
 ```
 
diff --git a/pom.xml b/pom.xml
index 885efe02..063a6e71 100644
--- a/pom.xml
+++ b/pom.xml
@@ -259,6 +259,11 @@ SPDX-License-Identifier: MIT
 					<artifactId>git-commit-id-maven-plugin</artifactId>
 					<version>10.0.0</version>
 				</plugin>
+				<plugin>
+					<groupId>org.apache.maven.plugins</groupId>
+					<artifactId>maven-assembly-plugin</artifactId>
+					<version>3.8.0</version>
+				</plugin>
 				<plugin>
 					<groupId>org.apache.maven.plugins</groupId>
 					<artifactId>maven-compiler-plugin</artifactId>
@@ -968,5 +973,40 @@ SPDX-License-Identifier: MIT
 				</plugins>
 			</build>
 		</profile>
+		<profile>
+			<!--
+				Builds the fat jar-with-dependencies uber JAR: the library classes, the
+				default-platform native libs from src/main/resources, and all runtime Java
+				dependencies in one drop-on-classpath JAR. No Main-Class (this is a library,
+				not a CLI). Off by default; the CI `package` job activates it so the uber JAR
+				rides along in the `llama-jars` upload-artifact bundle (a CI run artifact only,
+				not a Maven Central / GitHub-Release asset). Documented in CLAUDE.md
+				"Build Commands" as `mvn -P assembly package` and as deliberate cross-repo
+				non-parity (BAF + jllama only) in workspace/crossrepostatus.md.
+			-->
+			<id>assembly</id>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-assembly-plugin</artifactId>
+						<configuration>
+							<descriptorRefs>
+								<descriptorRef>jar-with-dependencies</descriptorRef>
+							</descriptorRefs>
+						</configuration>
+						<executions>
+							<execution>
+								<id>build-fat-jar</id>
+								<phase>package</phase>
+								<goals>
+									<goal>single</goal>
+								</goals>
+							</execution>
+						</executions>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
 	</profiles>
 </project>

From 18e3008cca6b5f789c8f5c8d2ca84bd7badc0250 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 18 Jun 2026 23:02:44 +0000
Subject: [PATCH 2/3] Add an OpenAI-compatible HTTP server entry point
 (NanoHTTPD)

Introduce net.ladenthin.llama.server.LlamaServer, a runnable main class (and the
fat-jar Main-Class) that loads a GGUF model in-process and serves the
OpenAI-compatible endpoints over a tiny NanoHTTPD server:

  POST /v1/chat/completions  -> LlamaModel.handleChatCompletions
  POST /v1/completions       -> LlamaModel.handleCompletionsOai
  POST /v1/embeddings        -> LlamaModel.handleEmbeddings (needs --embedding)
  GET  /v1/models            -> configured model alias
  GET  /health               -> {"status":"ok"}

The handle* methods already return OAI-shaped JSON, so the server only forwards
request bodies. Design:
- OaiRouter (model-free, unit-tested) maps method+path+body to a response;
  OaiHttpServer is the thin NanoHTTPD adapter; LlamaModelOaiBackend bridges to
  LlamaModel; LlamaServerArgs parses --model/--host/--port/--ctx-size/
  --n-gpu-layers/--threads/--embedding/--model-alias/--help.
- handleChatCompletions widened to public to match the other raw OAI handlers.
- NanoHTTPD is an <optional> compile dependency: bundled in the fat jar, not
  inherited by library consumers (Java-8 clean, zero transitive deps).
- New `server` ArchUnit layer (the only layer allowed to access the Api root).
- spotbugs-exclude: PATH_TRAVERSAL_IN + CRLF_INJECTION_LOGS on the server
  package (operator-supplied CLI input; same threat model as LlamaLoader), CC on
  the flag switch (desugared String-switch artifact), EI_EXPOSE_REP2 on the
  backend (non-owning model wrapper, mirrors Session).

Tests (model-free): LlamaServerArgsTest (10), OaiRouterTest (10),
OaiHttpServerIntegrationTest (real loopback socket + fake backend, 1). Verified:
spotless, compile (Error Prone/NullAway/Checker), spotbugs Max+Low, javadoc, and
the assembly fat jar (Main-Class set, NanoHTTPD bundled) all clean.

Docs: README "OpenAI-compatible HTTP server" + Features bullet; CLAUDE.md note.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01UZbmBX5CjqVwPcaTS61im6
---
 CLAUDE.md                                     |   1 +
 README.md                                     |  32 ++++
 pom.xml                                       |  20 ++
 spotbugs-exclude.xml                          |  47 +++++
 .../java/net/ladenthin/llama/LlamaModel.java  |  15 +-
 .../llama/server/LlamaModelOaiBackend.java    |  67 +++++++
 .../ladenthin/llama/server/LlamaServer.java   |  93 ++++++++++
 .../llama/server/LlamaServerArgs.java         | 175 ++++++++++++++++++
 .../llama/server/LlamaServerConfig.java       |  66 +++++++
 .../ladenthin/llama/server/OaiBackend.java    |  50 +++++
 .../ladenthin/llama/server/OaiHttpServer.java |  83 +++++++++
 .../ladenthin/llama/server/OaiResponse.java   |  55 ++++++
 .../net/ladenthin/llama/server/OaiRouter.java | 130 +++++++++++++
 .../ladenthin/llama/server/package-info.java  |  23 +++
 .../llama/LlamaArchitectureTest.java          |  17 +-
 .../llama/server/LlamaServerArgsTest.java     | 115 ++++++++++++
 .../server/OaiHttpServerIntegrationTest.java  | 128 +++++++++++++
 .../ladenthin/llama/server/OaiRouterTest.java | 156 ++++++++++++++++
 18 files changed, 1267 insertions(+), 6 deletions(-)
 create mode 100644 src/main/java/net/ladenthin/llama/server/LlamaModelOaiBackend.java
 create mode 100644 src/main/java/net/ladenthin/llama/server/LlamaServer.java
 create mode 100644 src/main/java/net/ladenthin/llama/server/LlamaServerArgs.java
 create mode 100644 src/main/java/net/ladenthin/llama/server/LlamaServerConfig.java
 create mode 100644 src/main/java/net/ladenthin/llama/server/OaiBackend.java
 create mode 100644 src/main/java/net/ladenthin/llama/server/OaiHttpServer.java
 create mode 100644 src/main/java/net/ladenthin/llama/server/OaiResponse.java
 create mode 100644 src/main/java/net/ladenthin/llama/server/OaiRouter.java
 create mode 100644 src/main/java/net/ladenthin/llama/server/package-info.java
 create mode 100644 src/test/java/net/ladenthin/llama/server/LlamaServerArgsTest.java
 create mode 100644 src/test/java/net/ladenthin/llama/server/OaiHttpServerIntegrationTest.java
 create mode 100644 src/test/java/net/ladenthin/llama/server/OaiRouterTest.java

diff --git a/CLAUDE.md b/CLAUDE.md
index 0214dc6b..84edd5a8 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -453,6 +453,7 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in
 - `LlamaIterator` / `LlamaIterable` — Streaming generation via Java `Iterator`/`Iterable`.
 - `LlamaLoader` — Extracts the platform-specific native library from the JAR to a temp directory, or finds it on `java.library.path`.
 - `OSInfo` — Detects OS and architecture for library resolution.
+- `server.LlamaServer` — Optional OpenAI-compatible HTTP server and the fat-jar `Main-Class`. `LlamaServerArgs` parses the CLI; `OaiRouter` / `OaiHttpServer` (NanoHTTPD) map `POST /v1/chat/completions`, `/v1/completions`, `/v1/embeddings` and `GET /v1/models` to the `LlamaModel.handle*` methods. NanoHTTPD is an `<optional>` dependency (bundled only in the fat jar, not inherited by library consumers). The `server` package is a dedicated top layer in the ArchUnit `layeredArchitecture` rule (the only layer allowed to access the root `Api`). See README "OpenAI-compatible HTTP server".
 
 **Native layer** (`src/main/cpp/`):
 - `jllama.cpp` — JNI implementation bridging Java calls to llama.cpp. ~1,215 lines; 17 native methods.
diff --git a/README.md b/README.md
index 802cbf75..ec7bce30 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,7 @@ Inference of Meta's LLaMA model (and others) in pure C/C++.
 - **Infilling** (fill-in-the-middle) for code models.
 - **Tokenize / detokenize** and **JSON-schema → grammar** conversion.
 - **Raw JSON endpoint handlers** mirroring the upstream llama.cpp HTTP server (`/completions`, `/v1/completions`, `/embeddings`, `/infill`, `/tokenize`, `/detokenize`).
+- **Runnable OpenAI-compatible HTTP server** (`LlamaServer`, the fat-jar `Main-Class`): `java -jar …-jar-with-dependencies.jar --model model.gguf --port 8080`.
 - **Model metadata** access (`getModelMeta()`) and **server management** (metrics, slot save/restore, runtime thread reconfiguration).
 - Pre-built native binaries for Linux (x86-64, aarch64), macOS (x86-64, arm64), and Windows (x86-64, x86); CUDA, Metal, and Vulkan supported via local build.
 
@@ -396,6 +397,37 @@ a JSON response, matching the HTTP server's contract:
 Server state is exposed via `getMetrics()`, `eraseSlot(int)`, `saveSlot(int, String)`,
 `restoreSlot(int, String)`, and `getModelMeta()`.
 
+### OpenAI-compatible HTTP server
+
+The fat jar built by the `assembly` profile (`mvn -P assembly package`) is runnable: its
+`Main-Class` is `net.ladenthin.llama.server.LlamaServer`, a small [NanoHTTPD](https://github.com/NanoHttpd/nanohttpd)
+server that loads a GGUF model in-process and serves OpenAI-compatible endpoints by forwarding each
+request body to the matching `LlamaModel.handle*` method:
+
+```bash
+java -jar target/llama-<version>-jar-with-dependencies.jar \
+    --model models/Qwen3-0.6B-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+```
+
+| Method &amp; path | Backed by |
+|---|---|
+| `POST /v1/chat/completions` | `LlamaModel.handleChatCompletions` |
+| `POST /v1/completions` | `LlamaModel.handleCompletionsOai` |
+| `POST /v1/embeddings` (requires `--embedding`) | `LlamaModel.handleEmbeddings` |
+| `GET /v1/models` | the configured model alias |
+| `GET /health` | static `{"status":"ok"}` |
+
+```bash
+curl http://localhost:8080/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{"messages":[{"role":"user","content":"Hello!"}]}'
+```
+
+Run with `--help` for all options (`--ctx-size`, `--threads`, `--model-alias`, …). Responses are
+non-streaming (the full JSON result is returned per request). The NanoHTTPD dependency is declared
+`<optional>`, so it is bundled in the fat jar but **not** inherited by projects that use this
+library as a Maven dependency; running the server requires the fat jar (or adding NanoHTTPD yourself).
+
 ### Model/Inference Configuration
 
 There are two sets of parameters you can configure, `ModelParameters` and `InferenceParameters`. Both provide builder 
diff --git a/pom.xml b/pom.xml
index 063a6e71..dfd0b278 100644
--- a/pom.xml
+++ b/pom.xml
@@ -58,6 +58,7 @@ SPDX-License-Identifier: MIT
 		<checker.version>4.2.0</checker.version>
 		<jackson.version>2.22.0</jackson.version>
 		<reactor.version>3.8.6</reactor.version>
+		<nanohttpd.version>2.3.1</nanohttpd.version>
 		<slf4j.version>2.0.18</slf4j.version>
 		<logback.version>1.5.34</logback.version>
 		<animal-sniffer.version>1.27</animal-sniffer.version>
@@ -148,6 +149,20 @@ SPDX-License-Identifier: MIT
 			<artifactId>jackson-databind</artifactId>
 			<version>${jackson.version}</version>
 		</dependency>
+		<!--
+			Embedded HTTP server for the optional OpenAI-compatible server entry point
+			(net.ladenthin.llama.server.LlamaServer, the fat-jar Main-Class). Declared
+			<optional> so library consumers do NOT inherit it on their classpath; the
+			assembly (jar-with-dependencies) profile still bundles it so the fat jar can
+			run the server. Pure Java, zero transitive deps (Java-8 clean), so it does
+			not perturb the enforcer dependencyConvergence rule.
+		-->
+		<dependency>
+			<groupId>org.nanohttpd</groupId>
+			<artifactId>nanohttpd</artifactId>
+			<version>${nanohttpd.version}</version>
+			<optional>true</optional>
+		</dependency>
 		<!-- Required by OSInfo (vendored from xerial/sqlite-jdbc) for log emission. -->
 		<dependency>
 			<groupId>org.slf4j</groupId>
@@ -994,6 +1009,11 @@ SPDX-License-Identifier: MIT
 							<descriptorRefs>
 								<descriptorRef>jar-with-dependencies</descriptorRef>
 							</descriptorRefs>
+							<archive>
+								<manifest>
+									<mainClass>net.ladenthin.llama.server.LlamaServer</mainClass>
+								</manifest>
+							</archive>
 						</configuration>
 						<executions>
 							<execution>
diff --git a/spotbugs-exclude.xml b/spotbugs-exclude.xml
index 09d420ff..3dbffe91 100644
--- a/spotbugs-exclude.xml
+++ b/spotbugs-exclude.xml
@@ -360,4 +360,51 @@ SPDX-License-Identifier: MIT
         <Method name="requireNonNull"/>
     </Match>
 
+    <!--
+        The OpenAI-compatible server (net.ladenthin.llama.server.*) is a CLI entry point:
+        the model path, host, port and alias all come from command-line arguments supplied
+        by whoever launches the process. findsecbugs flags Paths.get on the model path
+        (PATH_TRAVERSAL_IN) and the startup log lines that echo these values
+        (CRLF_INJECTION_LOGS) because they are non-literal, but the threat model is identical
+        to the LlamaLoader PATH_TRAVERSAL suppression above: an attacker who can set the
+        server's command line has already won, and there is no untrusted end-user input
+        reaching these paths or log statements. There is also no meaningful "allowed root"
+        to canonicalise the operator-chosen model path against.
+    -->
+    <Match>
+        <Class name="~net\.ladenthin\.llama\.server\..*"/>
+        <Or>
+            <Bug pattern="PATH_TRAVERSAL_IN"/>
+            <Bug pattern="CRLF_INJECTION_LOGS"/>
+        </Or>
+    </Match>
+
+    <!--
+        LlamaServerArgs.parse is a flat command-line flag dispatcher: a single switch over
+        the known flags, one case per option, read top to bottom. javac desugars a String
+        switch into a hashCode lookup plus an equals chain (two branches per case), which
+        fb-contrib's bytecode-level CC_CYCLOMATIC_COMPLEXITY counts as a very high score.
+        The source complexity is low and table-flat; extracting the cases into a dispatch
+        map would not make it clearer, so we accept the detector artifact here.
+    -->
+    <Match>
+        <Class name="net.ladenthin.llama.server.LlamaServerArgs"/>
+        <Bug pattern="CC_CYCLOMATIC_COMPLEXITY"/>
+        <Method name="parse"/>
+    </Match>
+
+    <!--
+        LlamaModelOaiBackend is a thin non-owning wrapper around a LlamaModel (the same
+        deliberate dependency-injection contract as Session above): the server owns the one
+        LlamaModel and its native context, and the backend holds the passed-in reference to
+        serve requests. The model must NOT be defensively copied, so storing the reference is
+        by design; spotbugs flags it as EI_EXPOSE_REP2 because the constructor stores an
+        externally-mutable object, which is true but intended.
+    -->
+    <Match>
+        <Class name="net.ladenthin.llama.server.LlamaModelOaiBackend"/>
+        <Bug pattern="EI_EXPOSE_REP2"/>
+        <Method name="&lt;init&gt;"/>
+    </Match>
+
 </FindBugsFilter>
diff --git a/src/main/java/net/ladenthin/llama/LlamaModel.java b/src/main/java/net/ladenthin/llama/LlamaModel.java
index 71ee21c1..3e958b03 100644
--- a/src/main/java/net/ladenthin/llama/LlamaModel.java
+++ b/src/main/java/net/ladenthin/llama/LlamaModel.java
@@ -835,7 +835,20 @@ public String restoreSlot(int slotId, String filepath) {
 
     native String handleSlotAction(int action, int slotId, @Nullable String filename);
 
-    native String handleChatCompletions(String params);
+    /**
+     * Run an OpenAI-compatible chat completion (mirrors the {@code /v1/chat/completions}
+     * endpoint). The request JSON must contain a {@code "messages"} array in the standard
+     * OpenAI chat format; the model's chat template is applied automatically. Returns the
+     * result in OAI format with a {@code "choices"} array. This is the raw JSON-in/JSON-out
+     * form used by {@link #chatComplete(net.ladenthin.llama.parameters.InferenceParameters)}
+     * and by the embedded OpenAI-compatible server
+     * ({@link net.ladenthin.llama.server.LlamaServer}); it is the chat counterpart of
+     * {@link #handleCompletionsOai(String)} and {@link #handleEmbeddings(String, boolean)}.
+     *
+     * @param params JSON string with OAI-compatible chat-completion parameters (incl. {@code "messages"})
+     * @return JSON response in OAI chat-completion format
+     */
+    public native String handleChatCompletions(String params);
 
     native int requestChatCompletion(String params);
 }
diff --git a/src/main/java/net/ladenthin/llama/server/LlamaModelOaiBackend.java b/src/main/java/net/ladenthin/llama/server/LlamaModelOaiBackend.java
new file mode 100644
index 00000000..7c1a85a5
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/LlamaModelOaiBackend.java
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import lombok.ToString;
+import net.ladenthin.llama.LlamaModel;
+
+/**
+ * {@link OaiBackend} backed by a loaded {@link LlamaModel}. Each operation forwards the raw request
+ * JSON to the matching {@code LlamaModel.handle*} method, which already produces
+ * OpenAI-compatible response JSON, so no per-field marshalling happens here.
+ *
+ * <p>The model is owned by the caller ({@link LlamaServer}); this class neither closes it nor holds
+ * any other resource.</p>
+ */
+@ToString
+public final class LlamaModelOaiBackend implements OaiBackend {
+
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    private final LlamaModel model;
+    private final String modelId;
+
+    /**
+     * Create a backend over a loaded model.
+     *
+     * @param model   the loaded model to serve requests with
+     * @param modelId the identifier reported by {@link #listModels()}
+     */
+    public LlamaModelOaiBackend(LlamaModel model, String modelId) {
+        this.model = model;
+        this.modelId = modelId;
+    }
+
+    @Override
+    public String chatCompletions(String requestJson) {
+        return model.handleChatCompletions(requestJson);
+    }
+
+    @Override
+    public String completions(String requestJson) {
+        return model.handleCompletionsOai(requestJson);
+    }
+
+    @Override
+    public String embeddings(String requestJson) {
+        return model.handleEmbeddings(requestJson, true);
+    }
+
+    @Override
+    public String listModels() {
+        final ObjectNode root = OBJECT_MAPPER.createObjectNode();
+        root.put("object", "list");
+        final ArrayNode data = root.putArray("data");
+        final ObjectNode entry = data.addObject();
+        entry.put("id", modelId);
+        entry.put("object", "model");
+        entry.put("owned_by", "llamacpp");
+        // ObjectNode.toString() emits valid JSON without a checked exception.
+        return root.toString();
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/server/LlamaServer.java b/src/main/java/net/ladenthin/llama/server/LlamaServer.java
new file mode 100644
index 00000000..2f630513
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/LlamaServer.java
@@ -0,0 +1,93 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import fi.iki.elonen.NanoHTTPD;
+import java.io.IOException;
+import net.ladenthin.llama.LlamaModel;
+import net.ladenthin.llama.parameters.ModelParameters;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Entry point for the optional OpenAI-compatible HTTP server, and the {@code Main-Class} of the
+ * {@code -jar-with-dependencies} assembly.
+ *
+ * <p>It parses the command line ({@link LlamaServerArgs}), loads a GGUF model into a
+ * {@link LlamaModel}, and serves OpenAI-compatible endpoints over NanoHTTPD via {@link OaiRouter} /
+ * {@link OaiHttpServer}. A shutdown hook stops the server and closes the model on JVM exit
+ * (e.g. Ctrl-C / SIGTERM). Run {@code --help} for the full option list.</p>
+ *
+ * <p>Example:</p>
+ *
+ * <pre>{@code
+ * java -jar llama-<version>-jar-with-dependencies.jar \
+ *     --model models/Qwen3-0.6B-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+ * }</pre>
+ *
+ * <p>Responses are non-streaming: the full JSON result is returned per request.</p>
+ */
+public final class LlamaServer {
+
+    private static final Logger LOG = LoggerFactory.getLogger(LlamaServer.class);
+
+    private LlamaServer() {}
+
+    /**
+     * Start the server (a non-daemon listener thread keeps the JVM alive after main returns), or print help.
+     *
+     * @param args command-line arguments; see {@link LlamaServerArgs#usage()}
+     * @throws IOException if the HTTP server cannot bind the configured host/port
+     */
+    public static void main(String[] args) throws IOException {
+        if (LlamaServerArgs.isHelpRequested(args)) {
+            LOG.info("{}{}", System.lineSeparator(), LlamaServerArgs.usage());
+            return;
+        }
+
+        final LlamaServerConfig config = LlamaServerArgs.parse(args);
+        final LlamaModel model = loadModel(config);
+        final OaiBackend backend = new LlamaModelOaiBackend(model, config.getModelAlias());
+        final OaiHttpServer server = new OaiHttpServer(config.getHost(), config.getPort(), new OaiRouter(backend));
+
+        try {
+            // daemon=false: the non-daemon listener thread keeps the JVM alive after main() returns.
+            server.start(NanoHTTPD.SOCKET_READ_TIMEOUT, false);
+        } catch (IOException e) {
+            // Bind failed: the shutdown hook is not registered yet, so this is the only model.close().
+            model.close();
+            throw e;
+        }
+
+        Runtime.getRuntime().addShutdownHook(new Thread(() -> shutdown(server, model), "llama-server-shutdown"));
+
+        LOG.info(
+                "LlamaServer listening on http://{}:{} (model={})",
+                config.getHost(),
+                config.getPort(),
+                config.getModelAlias());
+    }
+
+    private static LlamaModel loadModel(LlamaServerConfig config) {
+        final ModelParameters params =
+                new ModelParameters().setModel(config.getModelPath()).setGpuLayers(config.getGpuLayers());
+        if (config.getCtxSize() > 0) {
+            params.setCtxSize(config.getCtxSize());
+        }
+        if (config.getThreads() > 0) {
+            params.setThreads(config.getThreads());
+        }
+        if (config.isEmbedding()) {
+            params.enableEmbedding();
+        }
+        LOG.info("Loading model {} ...", config.getModelPath());
+        return new LlamaModel(params);
+    }
+
+    private static void shutdown(OaiHttpServer server, LlamaModel model) {
+        server.stop();
+        model.close();
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/server/LlamaServerArgs.java b/src/main/java/net/ladenthin/llama/server/LlamaServerArgs.java
new file mode 100644
index 00000000..1bfcef71
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/LlamaServerArgs.java
@@ -0,0 +1,190 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import org.jspecify.annotations.Nullable;
+
+/**
+ * Command-line argument parser for {@link LlamaServer}. Pure and free of any native dependency, so
+ * it can be unit-tested in isolation (no socket, no model).
+ *
+ * <p>{@link #parse(String[])} returns a validated {@link LlamaServerConfig} or throws
+ * {@link IllegalArgumentException} (whose message includes the {@link #usage()} text) for unknown
+ * flags, missing or malformed values, a port outside 0-65535, or a missing required
+ * {@code --model}. {@code -h}/{@code --help} is detected separately via
+ * {@link #isHelpRequested(String[])} so callers can print help without it being treated as an
+ * error.</p>
+ */
+public final class LlamaServerArgs {
+
+    /** Default bind interface (loopback only; pass {@code --host 0.0.0.0} to expose on the LAN). */
+    public static final String DEFAULT_HOST = "127.0.0.1";
+
+    /** Default TCP port. */
+    public static final int DEFAULT_PORT = 8080;
+
+    /** Highest valid TCP port number; {@code --port} values above it (or below 0) are rejected. */
+    private static final int MAX_PORT = 65535;
+
+    private LlamaServerArgs() {}
+
+    /**
+     * Whether the arguments request the help text.
+     *
+     * @param args the raw command-line arguments
+     * @return {@code true} if {@code -h} or {@code --help} is present
+     */
+    public static boolean isHelpRequested(String... args) {
+        for (final String arg : args) {
+            if ("-h".equals(arg) || "--help".equals(arg)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Parse the command-line arguments into a {@link LlamaServerConfig}.
+     *
+     * @param args the raw command-line arguments
+     * @return the validated configuration
+     * @throws IllegalArgumentException if an argument is unknown, a value is missing or malformed,
+     *                                  the port is outside 0-65535, or the required {@code --model}
+     *                                  is absent
+     */
+    public static LlamaServerConfig parse(String... args) {
+        String host = DEFAULT_HOST;
+        int port = DEFAULT_PORT;
+        @Nullable String modelPath = null;
+        @Nullable String modelAlias = null;
+        int ctxSize = 0;
+        int gpuLayers = 0;
+        int threads = 0;
+        boolean embedding = false;
+
+        for (int i = 0; i < args.length; i++) {
+            final String arg = args[i];
+            switch (arg) {
+                case "-m":
+                case "--model":
+                    modelPath = nextValue(args, ++i, arg);
+                    break;
+                case "--host":
+                    host = nextValue(args, ++i, arg);
+                    break;
+                case "-p":
+                case "--port":
+                    port = portValue(args, ++i, arg);
+                    break;
+                case "-c":
+                case "--ctx-size":
+                    ctxSize = intValue(args, ++i, arg);
+                    break;
+                case "-ngl":
+                case "--n-gpu-layers":
+                    gpuLayers = intValue(args, ++i, arg);
+                    break;
+                case "-t":
+                case "--threads":
+                    threads = intValue(args, ++i, arg);
+                    break;
+                case "--model-alias":
+                    modelAlias = nextValue(args, ++i, arg);
+                    break;
+                case "--embedding":
+                case "--embeddings":
+                    embedding = true;
+                    break;
+                case "-h":
+                case "--help":
+                    // Detected by isHelpRequested(); accepted here so parse() still succeeds.
+                    break;
+                default:
+                    throw error("Unknown argument: " + arg);
+            }
+        }
+
+        if (modelPath == null) {
+            throw error("Missing required argument: -m/--model <path-to-gguf>");
+        }
+        final String alias = modelAlias != null ? modelAlias : deriveAlias(modelPath);
+        return new LlamaServerConfig(host, port, modelPath, alias, ctxSize, gpuLayers, threads, embedding);
+    }
+
+    /**
+     * The human-readable usage / help text.
+     *
+     * @return the usage text
+     */
+    public static String usage() {
+        return String.join(
+                System.lineSeparator(),
+                "LlamaServer - OpenAI-compatible HTTP server for java-llama.cpp",
+                "",
+                "Usage:",
+                "  java -jar llama-<version>-jar-with-dependencies.jar --model <path.gguf> [options]",
+                "",
+                "Required:",
+                "  -m,  --model <path>        Path to the GGUF model file",
+                "",
+                "Options:",
+                "  --host <host>              Interface to bind (default: " + DEFAULT_HOST + ")",
+                "  -p,  --port <port>         TCP port to listen on (default: " + DEFAULT_PORT + ")",
+                "  -c,  --ctx-size <n>        Context window size (default: llama.cpp default)",
+                "  -ngl,--n-gpu-layers <n>    Layers to offload to GPU (default: 0 = CPU only)",
+                "  -t,  --threads <n>         Inference thread count (default: llama.cpp default)",
+                "  --model-alias <name>       Model id reported by /v1/models (default: file name)",
+                "  --embedding                Load in embedding mode (enables POST /v1/embeddings)",
+                "  -h,  --help                Show this help and exit",
+                "",
+                "Endpoints:",
+                "  POST /v1/chat/completions",
+                "  POST /v1/completions",
+                "  POST /v1/embeddings        (requires --embedding)",
+                "  GET  /v1/models",
+                "  GET  /health");
+    }
+
+    private static String nextValue(String[] args, int valueIndex, String flag) {
+        if (valueIndex >= args.length) {
+            throw error("Missing value for " + flag);
+        }
+        return args[valueIndex];
+    }
+
+    private static int intValue(String[] args, int valueIndex, String flag) {
+        final String raw = nextValue(args, valueIndex, flag);
+        try {
+            return Integer.parseInt(raw.trim());
+        } catch (NumberFormatException e) {
+            throw error(flag + " expects an integer, got: " + raw, e);
+        }
+    }
+
+    // Reads an integer flag value and rejects anything outside the TCP port range, so a typo
+    // such as --port 80800 fails here with the usage text instead of surfacing later at bind time.
+    private static int portValue(String[] args, int valueIndex, String flag) {
+        final int port = intValue(args, valueIndex, flag);
+        if (port < 0 || port > MAX_PORT) {
+            throw error(flag + " expects a port in the range 0-" + MAX_PORT + ", got: " + port);
+        }
+        return port;
+    }
+
+    private static String deriveAlias(String modelPath) {
+        final Path name = Paths.get(modelPath).getFileName();
+        return name != null ? name.toString() : modelPath;
+    }
+
+    private static IllegalArgumentException error(String message) {
+        return error(message, null);
+    }
+
+    private static IllegalArgumentException error(String message, @Nullable Throwable cause) {
+        return new IllegalArgumentException(message + System.lineSeparator() + System.lineSeparator() + usage(), cause);
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/server/LlamaServerConfig.java b/src/main/java/net/ladenthin/llama/server/LlamaServerConfig.java
new file mode 100644
index 00000000..f5f37a2e
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/LlamaServerConfig.java
@@ -0,0 +1,66 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.ToString;
+
+/**
+ * Immutable, parsed configuration for {@link LlamaServer}, produced by
+ * {@link LlamaServerArgs#parse(String[])}.
+ *
+ * <p>{@code ctxSize} and {@code threads} use {@code 0} as a sentinel meaning "leave the llama.cpp
+ * default" — they are only applied to {@link net.ladenthin.llama.parameters.ModelParameters} when
+ * positive. {@code gpuLayers} is always applied (its own default of {@code 0} already means
+ * CPU-only).</p>
+ *
+ * <p>Getters, value equality and {@code toString} are generated by Lombok over all fields.</p>
+ */
+@Getter
+@ToString
+@EqualsAndHashCode
+public final class LlamaServerConfig {
+
+    private final String host;
+    private final int port;
+    private final String modelPath;
+    private final String modelAlias;
+    private final int ctxSize; // 0 = leave the llama.cpp default
+    private final int gpuLayers; // 0 = CPU-only
+    private final int threads; // 0 = leave the llama.cpp default
+    private final boolean embedding;
+
+    /**
+     * Create a server configuration.
+     *
+     * @param host       the interface to bind (e.g. {@code "127.0.0.1"} or {@code "0.0.0.0"})
+     * @param port       the TCP port to listen on
+     * @param modelPath  the path to the GGUF model file to load
+     * @param modelAlias the identifier reported by {@code /v1/models}
+     * @param ctxSize    context window size, or {@code 0} to use the llama.cpp default
+     * @param gpuLayers  number of layers to offload to the GPU ({@code 0} = CPU-only)
+     * @param threads    inference thread count, or {@code 0} to use the llama.cpp default
+     * @param embedding  whether to load the model in embedding mode (enables {@code /v1/embeddings})
+     */
+    public LlamaServerConfig(
+            String host,
+            int port,
+            String modelPath,
+            String modelAlias,
+            int ctxSize,
+            int gpuLayers,
+            int threads,
+            boolean embedding) {
+        this.host = host;
+        this.port = port;
+        this.modelPath = modelPath;
+        this.modelAlias = modelAlias;
+        this.ctxSize = ctxSize;
+        this.gpuLayers = gpuLayers;
+        this.threads = threads;
+        this.embedding = embedding;
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/server/OaiBackend.java b/src/main/java/net/ladenthin/llama/server/OaiBackend.java
new file mode 100644
index 00000000..f8b57ca2
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/OaiBackend.java
@@ -0,0 +1,50 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+/**
+ * The inference operations the {@link OaiRouter} forwards HTTP requests to, abstracted behind an
+ * interface so the router can be unit-tested without loading a native model. The production
+ * implementation is {@link LlamaModelOaiBackend}, which delegates to
+ * {@link net.ladenthin.llama.LlamaModel}.
+ *
+ * <p>Each request method takes the raw OpenAI-compatible request body and returns the raw
+ * OpenAI-compatible response JSON. Implementations may throw a {@link RuntimeException} (e.g.
+ * {@link net.ladenthin.llama.exception.LlamaException}) on inference failure; the router converts
+ * that into an HTTP {@code 500} error response.</p>
+ */
+public interface OaiBackend {
+
+    /**
+     * Run a chat completion ({@code POST /v1/chat/completions}).
+     *
+     * @param requestJson the OAI chat-completion request body (must contain {@code "messages"})
+     * @return the OAI chat-completion response JSON
+     */
+    String chatCompletions(String requestJson);
+
+    /**
+     * Run a text completion ({@code POST /v1/completions}).
+     *
+     * @param requestJson the OAI completion request body (must contain {@code "prompt"})
+     * @return the OAI completion response JSON
+     */
+    String completions(String requestJson);
+
+    /**
+     * Generate embeddings ({@code POST /v1/embeddings}; the server usage text requires {@code --embedding}).
+     *
+     * @param requestJson the OAI embeddings request body (must contain {@code "input"})
+     * @return the OAI embeddings response JSON
+     */
+    String embeddings(String requestJson);
+
+    /**
+     * List the available model(s) ({@code GET /v1/models}).
+     *
+     * @return the OAI model-list response JSON
+     */
+    String listModels();
+}
diff --git a/src/main/java/net/ladenthin/llama/server/OaiHttpServer.java b/src/main/java/net/ladenthin/llama/server/OaiHttpServer.java
new file mode 100644
index 00000000..f8567a41
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/OaiHttpServer.java
@@ -0,0 +1,83 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import fi.iki.elonen.NanoHTTPD;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import lombok.ToString;
+import org.jspecify.annotations.Nullable;
+
+/**
+ * Thin NanoHTTPD adapter: reads the method, path and (for body-bearing methods) the raw request
+ * body from each session, hands them to an {@link OaiRouter}, and converts the resulting
+ * {@link OaiResponse} into a fixed-length {@code application/json} NanoHTTPD response.
+ *
+ * <p>All request-shaping decisions live in {@link OaiRouter}; this class only bridges NanoHTTPD's
+ * session API to that router so the routing logic stays unit-testable without a socket.</p>
+ */
+@ToString
+public final class OaiHttpServer extends NanoHTTPD {
+
+    private static final String MIME_JSON = "application/json";
+
+    private static final String MALFORMED_BODY_JSON =
+            "{\"error\":{\"message\":\"Malformed request body\",\"type\":\"invalid_request_error\"}}";
+
+    private final OaiRouter router;
+
+    /**
+     * Create (but do not start) the server.
+     *
+     * @param host   the interface to bind, e.g. {@code "127.0.0.1"} or {@code "0.0.0.0"}
+     * @param port   the TCP port to listen on
+     * @param router the router that turns requests into responses
+     */
+    public OaiHttpServer(String host, int port, OaiRouter router) {
+        super(host, port);
+        this.router = router;
+    }
+
+    /** Reads the body of body-bearing requests, routes the request and wraps the result as JSON. */
+    @Override
+    public Response serve(IHTTPSession session) {
+        final String method = session.getMethod().name();
+        final String uri = session.getUri();
+
+        @Nullable String body = null;
+        if (bodyBearing(method)) {
+            // NanoHTTPD decodes the body with the Content-Type charset and assumes US-ASCII when none
+            // is declared, which garbles non-ASCII JSON sent with a bare "application/json" header.
+            // JSON is UTF-8 on the wire, so declare that charset before the body is parsed.
+            final Map<String, String> headers = session.getHeaders();
+            final String contentType = headers.getOrDefault("content-type", MIME_JSON);
+            if (!contentType.toLowerCase(java.util.Locale.ROOT).contains("charset")) {
+                headers.put("content-type", contentType + "; charset=UTF-8");
+            }
+            final Map<String, String> files = new HashMap<>();
+            try {
+                session.parseBody(files);
+            } catch (IOException | ResponseException e) {
+                return newFixedLengthResponse(Response.Status.BAD_REQUEST, MIME_JSON, MALFORMED_BODY_JSON);
+            }
+            // For non-multipart bodies NanoHTTPD stores the raw payload under "postData".
+            body = files.get("postData");
+        }
+
+        final OaiResponse routed = router.route(method, uri, body);
+        return newFixedLengthResponse(statusFor(routed.getStatus()), MIME_JSON, routed.getBody());
+    }
+
+    private static boolean bodyBearing(String method) {
+        return "POST".equals(method) || "PUT".equals(method) || "PATCH".equals(method);
+    }
+
+    /** Resolves a numeric code via NanoHTTPD's status table; codes it does not list become 500. */
+    private static Response.IStatus statusFor(int code) {
+        final Response.Status known = Response.Status.lookup(code);
+        return known != null ? known : Response.Status.INTERNAL_ERROR;
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/server/OaiResponse.java b/src/main/java/net/ladenthin/llama/server/OaiResponse.java
new file mode 100644
index 00000000..f772525d
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/OaiResponse.java
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import lombok.EqualsAndHashCode;
+import lombok.ToString;
+
+/**
+ * Immutable result of routing one HTTP request: an HTTP status code and a JSON body.
+ *
+ * <p>Produced by {@link OaiRouter#route(String, String, String)} and adapted to a NanoHTTPD
+ * response by {@link OaiHttpServer}. Keeping it independent of NanoHTTPD lets the routing logic be
+ * unit-tested without binding a socket. The body is always JSON (the server always replies with
+ * {@code application/json}).</p>
+ *
+ * <p>Value equality / {@code toString} are generated by Lombok over the status and body.</p>
+ */
+@ToString
+@EqualsAndHashCode
+public final class OaiResponse {
+
+    private final int status;
+    private final String body;
+
+    /**
+     * Create a routed response.
+     *
+     * @param status the HTTP status code (e.g. {@code 200}, {@code 400}, {@code 404}, {@code 405}, {@code 500})
+     * @param body   the JSON response body
+     */
+    public OaiResponse(int status, String body) {
+        this.status = status;
+        this.body = body;
+    }
+
+    /**
+     * The HTTP status code.
+     *
+     * @return the status code
+     */
+    public int getStatus() {
+        return status;
+    }
+
+    /**
+     * The JSON response body.
+     *
+     * @return the body
+     */
+    public String getBody() {
+        return body;
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/server/OaiRouter.java b/src/main/java/net/ladenthin/llama/server/OaiRouter.java
new file mode 100644
index 00000000..1524c408
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/OaiRouter.java
@@ -0,0 +1,130 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import java.util.function.Function;
+import lombok.ToString;
+import org.jspecify.annotations.Nullable;
+
+/**
+ * Maps an HTTP method + path + body to an {@link OaiResponse} by dispatching to an
+ * {@link OaiBackend}. This is the testable core of the server: it is independent of NanoHTTPD and
+ * of {@link net.ladenthin.llama.LlamaModel}, so it can be exercised with a fake backend and plain
+ * strings (no socket, no native library, no GGUF model).
+ *
+ * <p>Supported routes:</p>
+ * <ul>
+ *   <li>{@code POST /v1/chat/completions} &rarr; {@link OaiBackend#chatCompletions(String)}</li>
+ *   <li>{@code POST /v1/completions} &rarr; {@link OaiBackend#completions(String)}</li>
+ *   <li>{@code POST /v1/embeddings} &rarr; {@link OaiBackend#embeddings(String)}</li>
+ *   <li>{@code GET /v1/models} &rarr; {@link OaiBackend#listModels()}</li>
+ *   <li>{@code GET /health} and {@code GET /} &rarr; a static {@code {"status":"ok"}}</li>
+ * </ul>
+ *
+ * <p>Unknown paths yield {@code 404}; a known path with the wrong method yields {@code 405}; an
+ * empty body on a {@code POST} route yields {@code 400}; any {@link RuntimeException} thrown by the
+ * backend (e.g. inference failure) is converted to {@code 500}. Error bodies use the OpenAI error
+ * envelope {@code {"error":{"message":...,"type":...}}}.</p>
+ */
+@ToString
+public final class OaiRouter {
+
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    private static final String METHOD_GET = "GET";
+    private static final String METHOD_POST = "POST";
+
+    private static final String HEALTH_BODY = "{\"status\":\"ok\"}";
+
+    private final OaiBackend backend;
+
+    /**
+     * Build a router that forwards every matched request to the given backend.
+     *
+     * @param backend the backend that performs the actual inference calls
+     */
+    public OaiRouter(OaiBackend backend) {
+        this.backend = backend;
+    }
+
+    /**
+     * Resolve one request to a status code and JSON body.
+     *
+     * @param method  the HTTP verb, compared exactly against {@code "GET"} / {@code "POST"}
+     * @param rawPath the request path; anything from the first {@code ?} onwards is ignored
+     * @param body    the request body, or {@code null} if the request carried none
+     * @return the routed response; backend {@link RuntimeException}s surface as status {@code 500}
+     */
+    public OaiResponse route(String method, String rawPath, @Nullable String body) {
+        final String endpoint = stripQuery(rawPath);
+        try {
+            // Only exact paths match; each helper validates the method before touching the backend,
+            // and anything the backend throws is reported as a 500 by the catch below.
+            if ("/v1/chat/completions".equals(endpoint)) {
+                return post(method, body, backend::chatCompletions);
+            } else if ("/v1/completions".equals(endpoint)) {
+                return post(method, body, backend::completions);
+            } else if ("/v1/embeddings".equals(endpoint)) {
+                return post(method, body, backend::embeddings);
+            } else if ("/v1/models".equals(endpoint)) {
+                return get(method, backend::listModels);
+            } else if ("/health".equals(endpoint) || "/".equals(endpoint)) {
+                return get(method, () -> HEALTH_BODY);
+            } else {
+                return error(404, "not_found", "Unknown endpoint: " + endpoint);
+            }
+        } catch (RuntimeException e) {
+            return error(500, "internal_error", describe(e));
+        }
+    }
+
+    private OaiResponse post(String method, @Nullable String body, Function<String, String> handler) {
+        if (METHOD_POST.equals(method)) {
+            if (body != null && !body.trim().isEmpty()) {
+                return new OaiResponse(200, handler.apply(body));
+            }
+            return error(400, "invalid_request_error", "Request body is required");
+        }
+        return methodNotAllowed(method); // a wrong verb is reported before a missing body
+    }
+
+    private OaiResponse get(String method, java.util.function.Supplier<String> handler) {
+        if (METHOD_GET.equals(method)) {
+            return new OaiResponse(200, handler.get());
+        }
+        return methodNotAllowed(method);
+    }
+
+    private OaiResponse methodNotAllowed(String verb) {
+        return error(405, "method_not_allowed", "Method not allowed: " + verb);
+    }
+
+    private static String stripQuery(String rawPath) {
+        final int queryStart = rawPath.indexOf('?');
+        return queryStart < 0 ? rawPath : rawPath.substring(0, queryStart);
+    }
+
+    private static String describe(RuntimeException e) {
+        final String detail = e.getMessage();
+        return detail == null ? e.getClass().getSimpleName() : detail;
+    }
+
+    private static OaiResponse error(int status, String type, String message) {
+        // Assemble the envelope with Jackson so the message text is JSON-escaped.
+        final ObjectNode envelope = OBJECT_MAPPER.createObjectNode();
+        envelope.putObject("error").put("message", message).put("type", type);
+        try {
+            return new OaiResponse(status, OBJECT_MAPPER.writeValueAsString(envelope));
+        } catch (JsonProcessingException e) {
+            // Fall back to a fixed envelope, still carrying the requested status.
+            final String fallback =
+                    "{\"error\":{\"message\":\"serialization failed\",\"type\":\"internal_error\"}}";
+            return new OaiResponse(status, fallback);
+        }
+    }
+}
diff --git a/src/main/java/net/ladenthin/llama/server/package-info.java b/src/main/java/net/ladenthin/llama/server/package-info.java
new file mode 100644
index 00000000..4c642b28
--- /dev/null
+++ b/src/main/java/net/ladenthin/llama/server/package-info.java
@@ -0,0 +1,23 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+/**
+ * Optional, self-contained OpenAI-compatible HTTP server built on the in-process
+ * {@link net.ladenthin.llama.LlamaModel} API.
+ *
+ * <p>{@link net.ladenthin.llama.server.LlamaServer} is the {@code main} entry point (and the
+ * {@code Main-Class} of the {@code -jar-with-dependencies} assembly). It loads a GGUF model and
+ * exposes {@code POST /v1/chat/completions}, {@code POST /v1/completions},
+ * {@code POST /v1/embeddings} and {@code GET /v1/models} by forwarding the request body to the
+ * matching {@code LlamaModel.handle*} method, which already returns OpenAI-shaped JSON.</p>
+ *
+ * <p>The HTTP layer is NanoHTTPD (a tiny, dependency-free, Java&nbsp;8 server). The dependency is
+ * declared {@code <optional>} so it is bundled in the fat jar but not inherited by library
+ * consumers. The routing logic ({@link net.ladenthin.llama.server.OaiRouter}) is decoupled from
+ * NanoHTTPD so it can be unit-tested without binding a socket or loading a model.</p>
+ *
+ * <p>JSpecify {@code @NullMarked} is applied module-wide; everything is non-null unless annotated
+ * {@code @Nullable}.</p>
+ */
+package net.ladenthin.llama.server;
diff --git a/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java b/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java
index 17f21566..667b1be5 100644
--- a/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java
+++ b/src/test/java/net/ladenthin/llama/LlamaArchitectureTest.java
@@ -88,8 +88,11 @@ public class LlamaArchitectureTest {
      * {@code mayOnlyBeAccessedByLayers} lists the EXACT set of packages that reference it today
      * (verified against the compiled bytecode graph), so even intra-tier edges are governed: a
      * new dependency between any two packages fails the build unless this rule is updated to
-     * intend it. Conceptual tiers (informational): {@code Api} (root) &gt; {@code Loader} &gt;
-     * {@code Json}/{@code Parameters} &gt; {@code Value}/{@code Callback}/{@code Exception}/{@code Args}.
+     * intend it. Conceptual tiers (informational): {@code Server} &gt; {@code Api} (root) &gt;
+     * {@code Loader} &gt; {@code Json}/{@code Parameters} &gt;
+     * {@code Value}/{@code Callback}/{@code Exception}/{@code Args}. The {@code Server} layer is the
+     * optional OpenAI-compatible HTTP entry point; it is the only layer permitted to access the
+     * {@code Api} root.
      */
     @ArchTest
     static final ArchRule layeredArchitecture = layeredArchitecture()
@@ -110,14 +113,16 @@ public class LlamaArchitectureTest {
             .definedBy("net.ladenthin.llama.exception..")
             .layer("Args")
             .definedBy("net.ladenthin.llama.args..")
+            .layer("Server")
+            .definedBy("net.ladenthin.llama.server..")
             .whereLayer("Api")
-            .mayNotBeAccessedByAnyLayer()
+            .mayOnlyBeAccessedByLayers("Server")
             .whereLayer("Loader")
             .mayOnlyBeAccessedByLayers("Api")
             .whereLayer("Json")
             .mayOnlyBeAccessedByLayers("Api")
             .whereLayer("Parameters")
-            .mayOnlyBeAccessedByLayers("Api", "Loader")
+            .mayOnlyBeAccessedByLayers("Api", "Loader", "Server")
             .whereLayer("Value")
             .mayOnlyBeAccessedByLayers("Api", "Json", "Parameters")
             .whereLayer("Callback")
@@ -125,7 +130,9 @@ public class LlamaArchitectureTest {
             .whereLayer("Exception")
             .mayOnlyBeAccessedByLayers("Api", "Loader")
             .whereLayer("Args")
-            .mayOnlyBeAccessedByLayers("Api", "Loader", "Parameters");
+            .mayOnlyBeAccessedByLayers("Api", "Loader", "Parameters")
+            .whereLayer("Server")
+            .mayNotBeAccessedByAnyLayer();
 
     /**
      * Production code must not import unsupported / internal JDK packages.
diff --git a/src/test/java/net/ladenthin/llama/server/LlamaServerArgsTest.java b/src/test/java/net/ladenthin/llama/server/LlamaServerArgsTest.java
new file mode 100644
index 00000000..bca12fd9
--- /dev/null
+++ b/src/test/java/net/ladenthin/llama/server/LlamaServerArgsTest.java
@@ -0,0 +1,115 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.is;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import net.ladenthin.llama.ClaudeGenerated;
+import org.junit.jupiter.api.Test;
+
+@ClaudeGenerated(
+        purpose = "Verify LlamaServerArgs parses long/short flags, applies defaults, derives the model alias from the "
+                + "model path, and rejects unknown flags, missing values, malformed integers and a missing --model.")
+public class LlamaServerArgsTest {
+
+    @Test
+    public void minimalArgsApplyDefaults() {
+        LlamaServerConfig parsed = LlamaServerArgs.parse(new String[] {"--model", "models/Qwen3-0.6B.gguf"});
+        assertThat(parsed.getModelPath(), is("models/Qwen3-0.6B.gguf"));
+        assertThat(parsed.getHost(), is(LlamaServerArgs.DEFAULT_HOST));
+        assertThat(parsed.getPort(), is(LlamaServerArgs.DEFAULT_PORT));
+        assertThat(parsed.getCtxSize(), is(0));
+        assertThat(parsed.getGpuLayers(), is(0));
+        assertThat(parsed.getThreads(), is(0));
+        assertThat(parsed.isEmbedding(), is(false));
+        // With no --model-alias the alias is the last element of the model path.
+        assertThat(parsed.getModelAlias(), is("Qwen3-0.6B.gguf"));
+    }
+
+    @Test
+    public void allLongFlagsParsed() {
+        LlamaServerConfig parsed = LlamaServerArgs.parse(new String[] {
+            "--model", "m.gguf",
+            "--host", "0.0.0.0",
+            "--port", "9090",
+            "--ctx-size", "4096",
+            "--n-gpu-layers", "99",
+            "--threads", "8",
+            "--model-alias", "my-model",
+            "--embedding"
+        });
+        assertThat(parsed.getModelPath(), is("m.gguf"));
+        assertThat(parsed.getHost(), is("0.0.0.0"));
+        assertThat(parsed.getPort(), is(9090));
+        assertThat(parsed.getCtxSize(), is(4096));
+        assertThat(parsed.getGpuLayers(), is(99));
+        assertThat(parsed.getThreads(), is(8));
+        assertThat(parsed.getModelAlias(), is("my-model"));
+        assertThat(parsed.isEmbedding(), is(true));
+    }
+
+    @Test
+    public void shortFlagsParsed() {
+        LlamaServerConfig parsed = LlamaServerArgs.parse(
+                new String[] {"-m", "m.gguf", "-p", "1234", "-c", "512", "-ngl", "10", "-t", "4"});
+        assertThat(parsed.getPort(), is(1234));
+        assertThat(parsed.getCtxSize(), is(512));
+        assertThat(parsed.getGpuLayers(), is(10));
+        assertThat(parsed.getThreads(), is(4));
+    }
+
+    @Test
+    public void aliasDerivedFromNestedPath() {
+        LlamaServerConfig parsed = LlamaServerArgs.parse(new String[] {"-m", "/opt/models/Llama-3.gguf"});
+        assertThat(parsed.getModelAlias(), is("Llama-3.gguf"));
+    }
+
+    @Test
+    public void missingModelThrows() {
+        IllegalArgumentException thrown =
+                assertThrows(IllegalArgumentException.class, () -> LlamaServerArgs.parse(new String[] {}));
+        assertThat(thrown.getMessage(), containsString("--model"));
+    }
+
+    @Test
+    public void unknownFlagThrows() {
+        IllegalArgumentException thrown = assertThrows(
+                IllegalArgumentException.class, () -> LlamaServerArgs.parse(new String[] {"-m", "m.gguf", "--bogus"}));
+        assertThat(thrown.getMessage(), containsString("Unknown argument: --bogus"));
+    }
+
+    @Test
+    public void missingValueThrows() {
+        IllegalArgumentException thrown = assertThrows(
+                IllegalArgumentException.class, () -> LlamaServerArgs.parse(new String[] {"-m", "m.gguf", "--port"}));
+        assertThat(thrown.getMessage(), containsString("Missing value for --port"));
+    }
+
+    @Test
+    public void nonIntegerPortThrows() {
+        IllegalArgumentException thrown = assertThrows(
+                IllegalArgumentException.class,
+                () -> LlamaServerArgs.parse(new String[] {"-m", "m.gguf", "--port", "abc"}));
+        assertThat(thrown.getMessage(), containsString("expects an integer"));
+    }
+
+    @Test
+    public void helpRequestedDetection() {
+        assertThat(LlamaServerArgs.isHelpRequested(new String[] {"-h"}), is(true));
+        assertThat(LlamaServerArgs.isHelpRequested(new String[] {"--help"}), is(true));
+        assertThat(LlamaServerArgs.isHelpRequested(new String[] {"--model", "m.gguf"}), is(false));
+    }
+
+    @Test
+    public void usageMentionsEndpointsAndRequiredFlag() {
+        String help = LlamaServerArgs.usage();
+        assertThat(help, containsString("--model"));
+        assertThat(help, containsString("/v1/chat/completions"));
+        assertThat(help, containsString("/v1/embeddings"));
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/server/OaiHttpServerIntegrationTest.java b/src/test/java/net/ladenthin/llama/server/OaiHttpServerIntegrationTest.java
new file mode 100644
index 00000000..63af9c47
--- /dev/null
+++ b/src/test/java/net/ladenthin/llama/server/OaiHttpServerIntegrationTest.java
@@ -0,0 +1,128 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.is;
+
+import fi.iki.elonen.NanoHTTPD;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import net.ladenthin.llama.ClaudeGenerated;
+import org.junit.jupiter.api.Test;
+
+@ClaudeGenerated(
+        purpose =
+                "End-to-end exercise of OaiHttpServer over a real loopback socket (ephemeral port, fake backend, no "
+                        + "native model): confirms the NanoHTTPD adapter extracts the method/URI, reads the JSON POST body via "
+                        + "the 'postData' idiom, forwards it to the router, and maps the routed status/body back to the client.")
+public class OaiHttpServerIntegrationTest {
+
+    /** Fake backend that echoes the received chat body so the test can assert it round-tripped. */
+    private static final class EchoBackend implements OaiBackend {
+        private String lastChatBody = "";
+
+        @Override
+        public String chatCompletions(String requestJson) {
+            lastChatBody = requestJson;
+            return "{\"object\":\"chat.completion\",\"echo\":" + requestJson + "}";
+        }
+
+        @Override
+        public String completions(String requestJson) {
+            return "{\"object\":\"text_completion\"}";
+        }
+
+        @Override
+        public String embeddings(String requestJson) {
+            return "{\"object\":\"list\"}";
+        }
+
+        @Override
+        public String listModels() {
+            return "{\"object\":\"list\",\"data\":[]}";
+        }
+
+        String lastChatBody() {
+            return lastChatBody;
+        }
+    }
+
+    @Test
+    public void servesHealthAndChatOverRealSocket() throws IOException {
+        EchoBackend backend = new EchoBackend();
+        OaiHttpServer server = new OaiHttpServer("127.0.0.1", 0, new OaiRouter(backend));
+        // daemon=true so a failed assertion never leaves a non-daemon listener thread behind.
+        server.start(NanoHTTPD.SOCKET_READ_TIMEOUT, true);
+        try {
+            final int port = server.getListeningPort();
+            final String base = "http://127.0.0.1:" + port;
+
+            Response health = httpGet(base + "/health");
+            assertThat(health.status, is(200));
+            assertThat(health.body, containsString("\"status\":\"ok\""));
+
+            final String chatRequest = "{\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}";
+            Response chat = httpPost(base + "/v1/chat/completions", chatRequest);
+            assertThat(chat.status, is(200));
+            assertThat(chat.body, containsString("chat.completion"));
+            // The JSON POST body reached the backend intact (validates the parseBody/postData path).
+            assertThat(backend.lastChatBody(), is(chatRequest));
+
+            Response notFound = httpGet(base + "/v1/nope");
+            assertThat(notFound.status, is(404));
+        } finally {
+            server.stop();
+        }
+    }
+
+    private static final class Response {
+        private final int status;
+        private final String body;
+
+        Response(int status, String body) {
+            this.status = status;
+            this.body = body;
+        }
+    }
+
+    private static Response httpGet(String url) throws IOException {
+        final HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
+        conn.setRequestMethod("GET");
+        return readResponse(conn);
+    }
+
+    private static Response httpPost(String url, String body) throws IOException {
+        final HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
+        conn.setRequestMethod("POST");
+        conn.setDoOutput(true);
+        conn.setRequestProperty("Content-Type", "application/json");
+        try (OutputStream os = conn.getOutputStream()) {
+            os.write(body.getBytes(StandardCharsets.UTF_8));
+        }
+        return readResponse(conn);
+    }
+
+    private static Response readResponse(HttpURLConnection conn) throws IOException {
+        final int status = conn.getResponseCode();
+        // getErrorStream() returns null when an error response carries no body; read that as "".
+        try (InputStream in = status < 400 ? conn.getInputStream() : conn.getErrorStream()) {
+            final ByteArrayOutputStream out = new ByteArrayOutputStream();
+            final byte[] buffer = new byte[1024];
+            for (int read; in != null && (read = in.read(buffer)) != -1; ) {
+                out.write(buffer, 0, read);
+            }
+            return new Response(status, new String(out.toByteArray(), StandardCharsets.UTF_8));
+        } finally {
+            conn.disconnect();
+        }
+    }
+}
diff --git a/src/test/java/net/ladenthin/llama/server/OaiRouterTest.java b/src/test/java/net/ladenthin/llama/server/OaiRouterTest.java
new file mode 100644
index 00000000..dd189ea0
--- /dev/null
+++ b/src/test/java/net/ladenthin/llama/server/OaiRouterTest.java
@@ -0,0 +1,156 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.containsString;
+import static org.hamcrest.Matchers.is;
+
+import net.ladenthin.llama.ClaudeGenerated;
+import org.junit.jupiter.api.Test;
+
+@ClaudeGenerated(
+        purpose =
+                "Verify OaiRouter dispatches each OAI endpoint to the backend, forwards the request body, enforces "
+                        + "method/body preconditions (405/400), returns 404 for unknown paths, strips query strings, and "
+                        + "converts backend exceptions into a 500 OpenAI error envelope. Uses a fake backend (no native model).")
+public class OaiRouterTest {
+
+    // Canned per-endpoint payloads; the tests expect the router to hand them back verbatim.
+    private static final String CHAT_RESPONSE = "{\"object\":\"chat.completion\"}";
+    private static final String COMPLETION_RESPONSE = "{\"object\":\"text_completion\"}";
+    private static final String EMBED_RESPONSE = "{\"object\":\"list\",\"data\":[]}";
+    private static final String MODELS_RESPONSE = "{\"object\":\"list\",\"data\":[{\"id\":\"m\"}]}";
+
+    /** Fake backend that keeps the most recently forwarded request body and replies with canned JSON. */
+    private static final class RecordingBackend implements OaiBackend {
+        private String lastBody = "";
+
+        @Override
+        public String chatCompletions(String requestJson) {
+            return remember(requestJson, CHAT_RESPONSE);
+        }
+
+        @Override
+        public String completions(String requestJson) {
+            return remember(requestJson, COMPLETION_RESPONSE);
+        }
+
+        @Override
+        public String embeddings(String requestJson) {
+            return remember(requestJson, EMBED_RESPONSE);
+        }
+
+        @Override
+        public String listModels() {
+            return MODELS_RESPONSE;
+        }
+
+        private String remember(String forwardedBody, String cannedReply) {
+            lastBody = forwardedBody;
+            return cannedReply;
+        }
+    }
+
+    /** Fake backend whose every endpoint fails with {@code IllegalStateException("boom")}. */
+    private static final class ThrowingBackend implements OaiBackend {
+        @Override
+        public String chatCompletions(String requestJson) {
+            throw new IllegalStateException("boom");
+        }
+
+        @Override
+        public String completions(String requestJson) {
+            throw new IllegalStateException("boom");
+        }
+
+        @Override
+        public String embeddings(String requestJson) {
+            throw new IllegalStateException("boom");
+        }
+
+        @Override
+        public String listModels() {
+            throw new IllegalStateException("boom");
+        }
+    }
+
+    @Test
+    public void chatCompletionsForwardsBodyAndReturnsResponse() {
+        RecordingBackend backend = new RecordingBackend();
+        OaiResponse response = new OaiRouter(backend).route("POST", "/v1/chat/completions", "{\"messages\":[]}");
+        assertThat(response.getStatus(), is(200));
+        assertThat(response.getBody(), is(CHAT_RESPONSE));
+        assertThat(backend.lastBody, is("{\"messages\":[]}"));
+    }
+
+    @Test
+    public void completionsRoute() {
+        OaiRouter router = new OaiRouter(new RecordingBackend());
+        OaiResponse response = router.route("POST", "/v1/completions", "{\"prompt\":\"hi\"}");
+        assertThat(response.getStatus(), is(200));
+        assertThat(response.getBody(), is(COMPLETION_RESPONSE));
+    }
+
+    @Test
+    public void embeddingsRoute() {
+        OaiRouter router = new OaiRouter(new RecordingBackend());
+        OaiResponse response = router.route("POST", "/v1/embeddings", "{\"input\":\"hi\"}");
+        assertThat(response.getStatus(), is(200));
+        assertThat(response.getBody(), is(EMBED_RESPONSE));
+    }
+
+    @Test
+    public void modelsRoute() {
+        OaiResponse response = new OaiRouter(new RecordingBackend()).route("GET", "/v1/models", null);
+        assertThat(response.getStatus(), is(200));
+        assertThat(response.getBody(), is(MODELS_RESPONSE));
+    }
+
+    @Test
+    public void modelsRouteIgnoresQueryString() {
+        OaiResponse response = new OaiRouter(new RecordingBackend()).route("GET", "/v1/models?limit=1", null);
+        assertThat(response.getStatus(), is(200));
+        assertThat(response.getBody(), is(MODELS_RESPONSE));
+    }
+
+    @Test
+    public void healthRoutes() {
+        OaiRouter router = new OaiRouter(new RecordingBackend());
+        assertThat(router.route("GET", "/health", null).getStatus(), is(200));
+        assertThat(router.route("GET", "/health", null).getBody(), containsString("\"status\":\"ok\""));
+        assertThat(router.route("GET", "/", null).getStatus(), is(200));
+    }
+
+    @Test
+    public void wrongMethodYields405() {
+        OaiRouter router = new OaiRouter(new RecordingBackend());
+        assertThat(router.route("GET", "/v1/chat/completions", null).getStatus(), is(405));
+        assertThat(router.route("POST", "/v1/models", "{}").getStatus(), is(405));
+    }
+
+    @Test
+    public void emptyOrNullBodyYields400() {
+        OaiRouter router = new OaiRouter(new RecordingBackend());
+        assertThat(router.route("POST", "/v1/chat/completions", null).getStatus(), is(400));
+        assertThat(router.route("POST", "/v1/chat/completions", "   ").getStatus(), is(400));
+    }
+
+    @Test
+    public void unknownPathYields404() {
+        OaiResponse response = new OaiRouter(new RecordingBackend()).route("GET", "/v1/nope", null);
+        assertThat(response.getStatus(), is(404));
+        assertThat(response.getBody(), containsString("\"type\":\"not_found\""));
+        assertThat(response.getBody(), containsString("/v1/nope"));
+    }
+
+    @Test
+    public void backendExceptionYields500() {
+        OaiResponse response = new OaiRouter(new ThrowingBackend()).route("POST", "/v1/chat/completions", "{}");
+        assertThat(response.getStatus(), is(500));
+        assertThat(response.getBody(), containsString("\"type\":\"internal_error\""));
+        assertThat(response.getBody(), containsString("boom"));
+    }
+}

From e595af66703b5f6201e666efe515f3a5e6a0fc18 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 18 Jun 2026 23:10:44 +0000
Subject: [PATCH 3/3] Trim cross-repo note from assembly comments; add SSE
 server TODO

Build comments: drop the 'deliberate non-parity (BAF + jllama only)' restatement and the crossrepostatus.md pointer from the package job + assembly profile comments (that lives only in the cross-repo doc). Also correct the now-stale 'no Main-Class' wording in both: the assembly fat jar is runnable via its LlamaServer Main-Class.

TODO: add an item to implement OpenAI-style SSE token streaming for the server (stream:true) and to find a Java-8-compatible HTTP layer with SSE support, or implement SSE on the existing NanoHTTPD via chunked responses. Javalin (the SSE-capable option) is unusable here: v5 needs Java 11, v6 needs Java 17, v4 is EOL.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01UZbmBX5CjqVwPcaTS61im6
---
 .github/workflows/publish.yml |  7 +++----
 TODO.md                       | 34 ++++++++++++++++++++++++++++++++++
 pom.xml                       | 11 +++++------
 3 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index cc160814..d6bccb4e 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -820,10 +820,9 @@ jobs:
       - name: Build JARs
         # `assembly` additionally produces the fat jar-with-dependencies uber JAR
         # (llama-<version>-jar-with-dependencies.jar: library classes + Java runtime deps +
-        # default-platform native libs in one drop-on-classpath JAR; no Main-Class - it is a
-        # library). It lands in target/ and is uploaded in the `llama-jars` artifact below - a
-        # CI run artifact only, NOT a Maven Central / GitHub-Release asset. Documented as
-        # deliberate non-parity (BAF + jllama only) in workspace/crossrepostatus.md.
+        # default-platform native libs in one drop-on-classpath JAR, runnable via its
+        # LlamaServer Main-Class). It lands in target/ and is uploaded in the `llama-jars`
+        # artifact below - a CI run artifact only, not a Maven Central / GitHub-Release asset.
         run: mvn --batch-mode --no-transfer-progress -P release,cuda,opencl-android,assembly -Dmaven.test.skip=true -Dgpg.skip=true package
       - name: Upload JARs
         uses: actions/upload-artifact@v7
diff --git a/TODO.md b/TODO.md
index f802f0a8..635a2d41 100644
--- a/TODO.md
+++ b/TODO.md
@@ -55,6 +55,40 @@ These are JNI plumbing items for upstream API additions. Policy: add only after
 
   **Out of scope until evidence supports it**: actually implementing any of the above. This entry exists so that when someone asks "can I ship java-llama.cpp as a single 30 MB binary?" the answer points to a concrete investigation plan rather than restarting from zero.
 
+### OpenAI-compatible server: token streaming (SSE) + Java-8 HTTP layer
+
+The `net.ladenthin.llama.server.LlamaServer` MVP is **non-streaming**: every request calls
+the blocking `LlamaModel.handle*` method and returns the full JSON response in one shot. A
+client that sends `"stream": true` still receives a single response, not the incremental
+`text/event-stream` (SSE) `data: {chunk}\n\n` events the OpenAI API emits for streaming
+chat/completions. This is the main functional gap of the server today.
+
+The token source already exists — `LlamaModel.generateChat(InferenceParameters)` /
+`generate(...)` yield tokens incrementally through a Java `Iterator` (`LlamaIterable`). What
+is missing is an HTTP layer that emits SSE.
+
+**Find a Java-8-compatible HTTP layer with good SSE support (alternative to Javalin), or
+implement SSE on NanoHTTPD.** Javalin has a first-class `ctx.sse(...)` API but is **not
+usable here**: Javalin 5 requires Java 11 and Javalin 6 requires Java 17, while this repo
+targets Java 8; Javalin 4 (the last Java-8 release) is EOL. Options, in rough order of
+preference:
+- **Implement SSE on the existing NanoHTTPD** via `NanoHTTPD.newChunkedResponse(status,
+  "text/event-stream", InputStream)`, bridging a `LlamaIterable` to an `InputStream` that
+  yields `data: {chunk}\n\n` frames. No new dependency, stays Java-8 clean; likely the right
+  answer. Cost: the iterator→SSE bridge plus closing the `LlamaIterable` on client
+  disconnect.
+- **Undertow** — Java-8 compatible, has a server-sent-events handler, but a heavier
+  dependency tree.
+- **Spark Java** (Jetty 9) — Java-8 compatible; SSE support is limited/manual.
+- Avoid: Javalin 5/6 (Java 11/17), Javalin 4 (EOL), and the JDK `com.sun.net.httpserver`
+  (ArchUnit-banned `com.sun..`).
+
+Scope when implemented: honour `"stream": true` on `POST /v1/chat/completions` and
+`POST /v1/completions`, emit OpenAI-style SSE chunks terminated by `data: [DONE]`, close the
+underlying `LlamaIterable` on disconnect, and keep the non-streaming path as the default. Add
+a model-free routing test plus a real-socket SSE integration test (mirroring
+`OaiHttpServerIntegrationTest`).
+
 ## Open — cross-cutting (slice for this repo)
 
 - **jqwik pin policy** — see [`../workspace/policies/jqwik-prompt-injection.md`](../workspace/policies/jqwik-prompt-injection.md). `jqwik.version ≤ 1.9.3` is mandatory.
diff --git a/pom.xml b/pom.xml
index dfd0b278..49a03a0c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -992,12 +992,11 @@ SPDX-License-Identifier: MIT
 			<!--
 				Builds the fat jar-with-dependencies uber JAR: the library classes, the
 				default-platform native libs from src/main/resources, and all runtime Java
-				dependencies in one drop-on-classpath JAR. No Main-Class (this is a library,
-				not a CLI). Off by default; the CI `package` job activates it so the uber JAR
-				rides along in the `llama-jars` upload-artifact bundle (a CI run artifact only,
-				not a Maven Central / GitHub-Release asset). Documented in CLAUDE.md
-				"Build Commands" as `mvn -P assembly package` and as deliberate cross-repo
-				non-parity (BAF + jllama only) in workspace/crossrepostatus.md.
+				dependencies in one drop-on-classpath JAR, runnable via the LlamaServer
+				Main-Class (set below) to start the OpenAI-compatible HTTP server. Off by
+				default; the CI `package` job activates it so the uber JAR rides along in the
+				`llama-jars` upload-artifact bundle. Documented in CLAUDE.md "Build Commands"
+				as `mvn -P assembly package`.
 			-->
 			<id>assembly</id>
 			<build>