bernardladenthin
diff --git a/‎CLAUDE.md‎
Lines changed: 1 addition & 0 deletions b/‎CLAUDE.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎README.md‎
Lines changed: 32 additions & 0 deletions b/‎README.md‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎pom.xml‎
Lines changed: 20 additions & 0 deletions b/‎pom.xml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎spotbugs-exclude.xml‎
Lines changed: 47 additions & 0 deletions b/‎spotbugs-exclude.xml‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎src/main/java/net/ladenthin/llama/LlamaModel.java‎
Lines changed: 14 additions & 1 deletion b/‎src/main/java/net/ladenthin/llama/LlamaModel.java‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎src/main/java/net/ladenthin/llama/server/LlamaModelOaiBackend.java‎
Lines changed: 67 additions & 0 deletions b/‎src/main/java/net/ladenthin/llama/server/LlamaModelOaiBackend.java‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎src/main/java/net/ladenthin/llama/server/LlamaServer.java‎
Lines changed: 93 additions & 0 deletions b/‎src/main/java/net/ladenthin/llama/server/LlamaServer.java‎
Lines changed: 93 additions & 0 deletions
@@ -453,6 +453,7 @@ If the local check passes (`BUILD SUCCESS`), the `mvn package` job in
 - `LlamaIterator` / `LlamaIterable` — Streaming generation via Java `Iterator`/`Iterable`.
 - `LlamaLoader` — Extracts the platform-specific native library from the JAR to a temp directory, or finds it on `java.library.path`.
 - `OSInfo` — Detects OS and architecture for library resolution.
+- `server.LlamaServer` — Optional OpenAI-compatible HTTP server and the fat-jar `Main-Class`. `LlamaServerArgs` parses the CLI; `OaiRouter` / `OaiHttpServer` (NanoHTTPD) map `POST /v1/chat/completions`, `/v1/completions`, `/v1/embeddings` and `GET /v1/models` to the `LlamaModel.handle*` methods. NanoHTTPD is an `<optional>` dependency (bundled only in the fat jar, not inherited by library consumers). The `server` package is a dedicated top layer in the ArchUnit `layeredArchitecture` rule (the only layer allowed to access the root `Api`). See README "OpenAI-compatible HTTP server".
 
 **Native layer** (`src/main/cpp/`):
 - `jllama.cpp` — JNI implementation bridging Java calls to llama.cpp. ~1,215 lines; 17 native methods.
 
@@ -97,6 +97,7 @@ Inference of Meta's LLaMA model (and others) in pure C/C++.
 - **Infilling** (fill-in-the-middle) for code models.
 - **Tokenize / detokenize** and **JSON-schema → grammar** conversion.
 - **Raw JSON endpoint handlers** mirroring the upstream llama.cpp HTTP server (`/completions`, `/v1/completions`, `/embeddings`, `/infill`, `/tokenize`, `/detokenize`).
+- **Runnable OpenAI-compatible HTTP server** (`LlamaServer`, the fat-jar `Main-Class`): `java -jar …-jar-with-dependencies.jar --model model.gguf --port 8080`.
 - **Model metadata** access (`getModelMeta()`) and **server management** (metrics, slot save/restore, runtime thread reconfiguration).
 - Pre-built native binaries for Linux (x86-64, aarch64), macOS (x86-64, arm64), and Windows (x86-64, x86); CUDA, Metal, and Vulkan supported via local build.
 
@@ -396,6 +397,37 @@ a JSON response, matching the HTTP server's contract:
 Server state is exposed via `getMetrics()`, `eraseSlot(int)`, `saveSlot(int, String)`,
 `restoreSlot(int, String)`, and `getModelMeta()`.
 
+### OpenAI-compatible HTTP server
+
+The fat jar built by the `assembly` profile (`mvn -P assembly package`) is runnable: its
+`Main-Class` is `net.ladenthin.llama.server.LlamaServer`, a small [NanoHTTPD](https://github.com/NanoHttpd/nanohttpd)
+server that loads a GGUF model in-process and serves OpenAI-compatible endpoints by forwarding each
+request body to the matching `LlamaModel.handle*` method:
+
+```bash
+java -jar target/llama-<version>-jar-with-dependencies.jar \
+    --model models/Qwen3-0.6B-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+```
+
+| Method & path | Backed by |
+|---|---|
+| `POST /v1/chat/completions` | `LlamaModel.handleChatCompletions` |
+| `POST /v1/completions` | `LlamaModel.handleCompletionsOai` |
+| `POST /v1/embeddings` (requires `--embedding`) | `LlamaModel.handleEmbeddings` |
+| `GET /v1/models` | the configured model alias |
+| `GET /health` | static `{"status":"ok"}` |
+
+```bash
+curl http://localhost:8080/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{"messages":[{"role":"user","content":"Hello!"}]}'
+```
+
+Run with `--help` for all options (`--ctx-size`, `--threads`, `--model-alias`, …). Responses are
+non-streaming (the full JSON result is returned per request). The NanoHTTPD dependency is declared
+`<optional>`, so it is bundled in the fat jar but **not** inherited by projects that use this
+library as a Maven dependency; running the server requires the fat jar (or adding NanoHTTPD yourself).
+
 ### Model/Inference Configuration
 
 There are two sets of parameters you can configure, `ModelParameters` and `InferenceParameters`. Both provide builder 
 
@@ -58,6 +58,7 @@ SPDX-License-Identifier: MIT
 		<checker.version>4.2.0</checker.version>
 		<jackson.version>2.22.0</jackson.version>
 		<reactor.version>3.8.6</reactor.version>
+		<nanohttpd.version>2.3.1</nanohttpd.version>
 		<slf4j.version>2.0.18</slf4j.version>
 		<logback.version>1.5.34</logback.version>
 		<animal-sniffer.version>1.27</animal-sniffer.version>
@@ -148,6 +149,20 @@ SPDX-License-Identifier: MIT
 			<artifactId>jackson-databind</artifactId>
 			<version>${jackson.version}</version>
 		</dependency>
+		<!--
+			Embedded HTTP server for the optional OpenAI-compatible server entry point
+			(net.ladenthin.llama.server.LlamaServer, the fat-jar Main-Class). Declared
+			<optional> so library consumers do NOT inherit it on their classpath; the
+			assembly (jar-with-dependencies) profile still bundles it so the fat jar can
+			run the server. Pure Java, zero transitive deps (Java-8 clean), so it does
+			not perturb the enforcer dependencyConvergence rule.
+		-->
+		<dependency>
+			<groupId>org.nanohttpd</groupId>
+			<artifactId>nanohttpd</artifactId>
+			<version>${nanohttpd.version}</version>
+			<optional>true</optional>
+		</dependency>
 		<!-- Required by OSInfo (vendored from xerial/sqlite-jdbc) for log emission. -->
 		<dependency>
 			<groupId>org.slf4j</groupId>
@@ -994,6 +1009,11 @@ SPDX-License-Identifier: MIT
 							<descriptorRefs>
 								<descriptorRef>jar-with-dependencies</descriptorRef>
 							</descriptorRefs>
+							<archive>
+								<manifest>
+									<mainClass>net.ladenthin.llama.server.LlamaServer</mainClass>
+								</manifest>
+							</archive>
 						</configuration>
 						<executions>
 							<execution>
 
@@ -360,4 +360,51 @@ SPDX-License-Identifier: MIT
         <Method name="requireNonNull"/>
     </Match>
 
+    <!--
+        The OpenAI-compatible server entry point (LlamaServer, LlamaServerArgs,
+        LlamaServerConfig in net.ladenthin.llama.server) is driven by the command line:
+        the model path, host, port and alias all come from command-line arguments supplied
+        by whoever launches the process. findsecbugs flags Paths.get on the model path
+        (PATH_TRAVERSAL_IN) and the startup log lines that echo these values
+        (CRLF_INJECTION_LOGS) because they are non-literal, but the threat model is identical
+        to the LlamaLoader PATH_TRAVERSAL suppression above: an attacker who can set the
+        server's command line has already won. There is also no meaningful "allowed root"
+        to canonicalise the operator-chosen model path against.
+
+        The match is deliberately limited to those command-line-driven classes (and their
+        nested classes). The rest of the server package handles HTTP request data, which IS
+        untrusted, so both detectors stay active there. Widen the pattern only for a finding
+        that is shown to be fed by operator-controlled values.
+    -->
+    <Match>
+        <Class name="~net\.ladenthin\.llama\.server\.LlamaServer(Args|Config)?(\$.*)?"/>
+        <Or>
+            <Bug pattern="PATH_TRAVERSAL_IN"/>
+            <Bug pattern="CRLF_INJECTION_LOGS"/>
+        </Or>
+    </Match>
+
+    <!--
+        LlamaServerArgs.parse is a flat command-line flag dispatcher: a single switch over
+        the known flags, one case per option, read top to bottom. javac desugars a String
+        switch into a hashCode lookup plus an equals chain (two branches per case), which
+        fb-contrib's bytecode-level CC_CYCLOMATIC_COMPLEXITY counts as a very high score.
+        The source complexity is low and table-flat; extracting the cases into a dispatch
+        map would not make it clearer, so we accept the detector artifact here.
+    -->
+    <Match>
+        <Class name="net.ladenthin.llama.server.LlamaServerArgs"/>
+        <Bug pattern="CC_CYCLOMATIC_COMPLEXITY"/>
+        <Method name="parse"/>
+    </Match>
+
+    <!--
+        LlamaModelOaiBackend is a thin non-owning wrapper around a LlamaModel (the same
+        deliberate dependency-injection contract as Session above): the server owns the one
+        LlamaModel and its native context, and the backend holds the passed-in reference to
+        serve requests. The model must NOT be defensively copied, so storing the reference is
+        by design; spotbugs flags it as EI_EXPOSE_REP2 because the constructor stores an
+        externally-mutable object, which is true but intended.
+    -->
+    <Match>
+        <Class name="net.ladenthin.llama.server.LlamaModelOaiBackend"/>
+        <Bug pattern="EI_EXPOSE_REP2"/>
+        <Method name="&lt;init&gt;"/>
+    </Match>
+
 </FindBugsFilter>
@@ -835,7 +835,20 @@ public String restoreSlot(int slotId, String filepath) {
 
     native String handleSlotAction(int action, int slotId, @Nullable String filename);
 
-    native String handleChatCompletions(String params);
+    /**
+     * Run an OpenAI-compatible chat completion (mirrors the {@code /v1/chat/completions}
+     * endpoint). The request JSON must contain a {@code "messages"} array in the standard
+     * OpenAI chat format; the model's chat template is applied automatically. Returns the
+     * result in OAI format with a {@code "choices"} array. This is the raw JSON-in/JSON-out
+     * form used by {@link #chatComplete(net.ladenthin.llama.parameters.InferenceParameters)}
+     * and by the embedded OpenAI-compatible server
+     * ({@link net.ladenthin.llama.server.LlamaServer}); it is the chat counterpart of
+     * {@link #handleCompletionsOai(String)} and {@link #handleEmbeddings(String, boolean)}.
+     *
+     * @param params JSON string with OAI-compatible chat-completion parameters (incl. {@code "messages"})
+     * @return JSON response in OAI chat-completion format
+     */
+    public native String handleChatCompletions(String params);
 
     native int requestChatCompletion(String params);
 }
@@ -0,0 +1,67 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import lombok.ToString;
+import net.ladenthin.llama.LlamaModel;
+
+/**
+ * {@link OaiBackend} backed by a loaded {@link LlamaModel}. Each operation forwards the raw request
+ * JSON to the matching {@code LlamaModel.handle*} method, which already produces
+ * OpenAI-compatible response JSON, so no per-field marshalling happens here.
+ *
+ * <p>The model is owned by the caller ({@link LlamaServer}); this class neither closes it nor holds
+ * any other resource.</p>
+ */
+@ToString
+public final class LlamaModelOaiBackend implements OaiBackend {
+
+    /** Shared mapper, used here only as a factory for the JSON nodes of {@link #listModels()}. */
+    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+    /** Model that every request is delegated to; this class never closes it. */
+    private final LlamaModel model;
+
+    /** Identifier published as the single entry of the model list. */
+    private final String modelId;
+
+    /**
+     * Wrap an already loaded model.
+     *
+     * @param model   the loaded model that answers the requests
+     * @param modelId the identifier returned by {@link #listModels()}
+     */
+    public LlamaModelOaiBackend(LlamaModel model, String modelId) {
+        this.modelId = modelId;
+        this.model = model;
+    }
+
+    /** Forwards the raw request JSON to {@code LlamaModel.handleChatCompletions}. */
+    @Override
+    public String chatCompletions(String requestJson) {
+        final String responseJson = model.handleChatCompletions(requestJson);
+        return responseJson;
+    }
+
+    /** Forwards the raw request JSON to {@code LlamaModel.handleCompletionsOai}. */
+    @Override
+    public String completions(String requestJson) {
+        final String responseJson = model.handleCompletionsOai(requestJson);
+        return responseJson;
+    }
+
+    /** Forwards the raw request JSON to {@code LlamaModel.handleEmbeddings} with the flag set to {@code true}. */
+    @Override
+    public String embeddings(String requestJson) {
+        final String responseJson = model.handleEmbeddings(requestJson, true);
+        return responseJson;
+    }
+
+    /**
+     * Build the model list: an object with {@code "object":"list"} and a {@code "data"} array
+     * holding exactly one entry for the configured model id.
+     */
+    @Override
+    public String listModels() {
+        final ObjectNode descriptor = OBJECT_MAPPER
+                .createObjectNode()
+                .put("id", modelId)
+                .put("object", "model")
+                .put("owned_by", "llamacpp");
+
+        final ObjectNode listing = OBJECT_MAPPER.createObjectNode().put("object", "list");
+        final ArrayNode models = listing.putArray("data");
+        models.add(descriptor);
+
+        // Serialising through toString() yields valid JSON and declares no checked exception.
+        return listing.toString();
+    }
+}
@@ -0,0 +1,93 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.server;
+
+import fi.iki.elonen.NanoHTTPD;
+import java.io.IOException;
+import net.ladenthin.llama.LlamaModel;
+import net.ladenthin.llama.parameters.ModelParameters;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Entry point for the optional OpenAI-compatible HTTP server, and the {@code Main-Class} of the
+ * {@code -jar-with-dependencies} assembly.
+ *
+ * <p>It parses the command line ({@link LlamaServerArgs}), loads a GGUF model into a
+ * {@link LlamaModel}, and serves OpenAI-compatible endpoints over NanoHTTPD via {@link OaiRouter} /
+ * {@link OaiHttpServer}. A shutdown hook stops the server and closes the model on JVM exit
+ * (e.g. Ctrl-C / SIGTERM). Run {@code --help} for the full option list.</p>
+ *
+ * <p>Example:</p>
+ *
+ * <pre>{@code
+ * java -jar llama-<version>-jar-with-dependencies.jar \
+ *     --model models/Qwen3-0.6B-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+ * }</pre>
+ *
+ * <p>Responses are non-streaming: the full JSON result is returned per request.</p>
+ */
+public final class LlamaServer {
+
+    private static final Logger LOG = LoggerFactory.getLogger(LlamaServer.class);
+
+    /** Not instantiable: this class only hosts the static entry point. */
+    private LlamaServer() {}
+
+    /**
+     * Start the server, or print the usage text when help is requested. After a successful start
+     * the non-daemon listener thread keeps the JVM alive once this method has returned.
+     *
+     * @param args command-line arguments; see {@link LlamaServerArgs#usage()}
+     * @throws IOException if the HTTP server cannot bind the configured host/port
+     */
+    public static void main(String[] args) throws IOException {
+        if (LlamaServerArgs.isHelpRequested(args)) {
+            LOG.info("{}{}", System.lineSeparator(), LlamaServerArgs.usage());
+            return;
+        }
+
+        final LlamaServerConfig config = LlamaServerArgs.parse(args);
+        final LlamaModel model = loadModel(config);
+        final OaiBackend backend = new LlamaModelOaiBackend(model, config.getModelAlias());
+        final OaiHttpServer server = new OaiHttpServer(config.getHost(), config.getPort(), new OaiRouter(backend));
+
+        try {
+            // daemon=false: the non-daemon listener thread keeps the JVM alive after main() returns.
+            server.start(NanoHTTPD.SOCKET_READ_TIMEOUT, false);
+        } catch (IOException e) {
+            // Close the just-loaded native model before propagating the bind failure.
+            model.close();
+            throw e;
+        }
+
+        // Registered only after a successful start: the failure path above has already closed the
+        // model, and a hook registered earlier would close it a second time during JVM exit.
+        Runtime.getRuntime().addShutdownHook(new Thread(() -> shutdown(server, model), "llama-server-shutdown"));
+
+        LOG.info(
+                "LlamaServer listening on http://{}:{} (model={})",
+                config.getHost(),
+                config.getPort(),
+                config.getModelAlias());
+    }
+
+    /**
+     * Translate the parsed configuration into {@link ModelParameters} and load the model.
+     *
+     * <p>The model path and GPU layer count are always applied. Context size and thread count are
+     * applied only when positive, and embedding mode only when enabled in the configuration.</p>
+     *
+     * @param config the parsed server configuration
+     * @return the loaded model; the caller is responsible for closing it
+     */
+    private static LlamaModel loadModel(LlamaServerConfig config) {
+        final ModelParameters params =
+                new ModelParameters().setModel(config.getModelPath()).setGpuLayers(config.getGpuLayers());
+        if (config.getCtxSize() > 0) {
+            params.setCtxSize(config.getCtxSize());
+        }
+        if (config.getThreads() > 0) {
+            params.setThreads(config.getThreads());
+        }
+        if (config.isEmbedding()) {
+            params.enableEmbedding();
+        }
+        LOG.info("Loading model {} ...", config.getModelPath());
+        return new LlamaModel(params);
+    }
+
+    /**
+     * Shutdown-hook body: stop the HTTP server, then close the model.
+     *
+     * @param server the running HTTP server
+     * @param model  the model to close
+     */
+    private static void shutdown(OaiHttpServer server, LlamaModel model) {
+        try {
+            server.stop();
+        } finally {
+            // Close the model even if stopping the listener fails.
+            model.close();
+        }
+    }
+}