bernardladenthin · bernardladenthin · Jul 1, 2026 · Jun 30, 2026
@@ -24,6 +24,7 @@ path = [
     ".github/ISSUE_TEMPLATE/bug_report.md",
     ".github/ISSUE_TEMPLATE/feature_request.md",
     ".claude/commands/find-cpp-duplication.md",
+    "langchain4j-jllama/README.md",
 ]
 SPDX-FileCopyrightText = [
     "2023-2025 Konstantin Herud",

@@ -0,0 +1,110 @@
+# langchain4j-jllama
+
+[LangChain4j](https://github.com/langchain4j/langchain4j) adapters backed by an **in-process**
+[java-llama.cpp](https://github.com/bernardladenthin/java-llama.cpp) model over JNI — no HTTP server,
+no separate process.
+
+This is a **separate Maven artifact** on purpose: it depends on `langchain4j-core`, but the core
+`net.ladenthin:llama` binding does **not** depend on langchain4j, so plain java-llama.cpp users never
+pull langchain4j (or its Java 17 floor) transitively.
+
+> **Already have an OpenAI-compatible setup?** java-llama.cpp also ships
+> `net.ladenthin.llama.server.OpenAiCompatServer`, so you can point langchain4j's `langchain4j-open-ai`
+> client at a running server with zero code from this module. Use *this* module when you want the
+> in-process path (no HTTP hop, single process — e.g. desktop/Android/embedded).
+
+## Adapters
+
+| Class | langchain4j interface | java-llama.cpp call |
+|-------|-----------------------|---------------------|
+| `JllamaChatModel` | `ChatModel` | `LlamaModel.chat(...)` |
+| `JllamaStreamingChatModel` | `StreamingChatModel` | `LlamaModel.generateChat(...)` (token streaming) |
+| `JllamaEmbeddingModel` | `EmbeddingModel` | `LlamaModel.embed(...)` |
+| `JllamaScoringModel` | `ScoringModel` (re-ranking) | `LlamaModel.handleRerank(...)` |
+
+## Lifecycle: the model is *borrowed*
+
+Every adapter takes a `LlamaModel` that you have already loaded and that you **keep owning**. The
+adapter never loads or closes the native model — you manage it (try-with-resources or explicit
+`close()`). One `LlamaModel` can back several adapters at once.
+
+```java
+try (LlamaModel llama = new LlamaModel(new ModelParameters().setModel("models/qwen3-0.6b.gguf"))) {
+    ChatModel chat = new JllamaChatModel(llama);
+
+    String reply = chat.chat("Write a haiku about lazy senior devs.");
+    System.out.println(reply);
+}
+```
+
+Streaming:
+
+```java
+StreamingChatModel chat = new JllamaStreamingChatModel(llama);
+chat.chat("Tell me a story.", new StreamingChatResponseHandler() {
+    @Override public void onPartialResponse(String token) { System.out.print(token); }
+    @Override public void onCompleteResponse(ChatResponse response) { /* done */ }
+    @Override public void onError(Throwable error) { error.printStackTrace(); }
+});
+```
+
+Embeddings (model loaded with `enableEmbedding()`) and re-ranking
+(`enableReranking()`) plug straight into langchain4j RAG:
+
+```java
+EmbeddingModel embeddings = new JllamaEmbeddingModel(embeddingLlama);
+ScoringModel reranker     = new JllamaScoringModel(rerankLlama);
+```
+
+## Dependency
+
+```xml
+<dependency>
+    <groupId>net.ladenthin</groupId>
+    <artifactId>langchain4j-jllama</artifactId>
+    <version>5.0.4-SNAPSHOT</version>
+</dependency>
+```
+
+`langchain4j-core` is pulled transitively. You still supply a java-llama.cpp native library for your
+platform the usual way (bundled in the `net.ladenthin:llama` JAR or on `java.library.path`).
+
+## Building
+
+This is a **sibling module**, not part of the root reactor. Install the core artifact first, then
+build here:
+
+```bash
+# from the repo root: install the core net.ladenthin:llama jar into your local ~/.m2 repository
+mvn -DskipTests install
+
+# then build/test this module
+cd langchain4j-jllama
+mvn test
+```
+
+The end-to-end test (`JllamaChatModelIntegrationTest`) self-skips unless you pass a model:
+
+```bash
+mvn test -Dnet.ladenthin.llama.model.path=/abs/path/to/model.gguf
+```
+
+## Not mapped yet
+
+- **Tool calling.** `ChatRequest.toolSpecifications()` are not forwarded, so the chat adapters return
+  assistant *text*, not `AiMessage.toolExecutionRequests()`. (java-llama.cpp itself supports tool
+  calling via `LlamaModel.chatWithTools` / typed `ToolDefinition`; bridging that to langchain4j
+  `ToolSpecification` is the planned next step.)
+- **Multimodal user input.** A multi-content `UserMessage` is flattened to its text parts; image/audio
+  content is dropped.
+- **Per-token tool-call / thinking stream events.** Streaming forwards plain text via
+  `onPartialResponse`.
+- **`response_format` (JSON mode).** `ChatRequest.responseFormat()` (json_object / json_schema) is not
+  forwarded; `modelName()` is ignored since one model is bound per adapter.
+
+Mapped request parameters: `temperature`, `topP`, `topK`, `maxOutputTokens`, `frequencyPenalty`,
+`presencePenalty`, `stopSequences`. The non-streaming chat response carries the model's real finish
+reason (`stop`/`length`/`tool_calls`) and token usage; the streaming completion carries assembled text
+(no per-token usage).
+
+Requires Java 17+ (langchain4j 1.x baseline). Targets `langchain4j-core` 1.17.1.
@@ -0,0 +1,94 @@
+<!--
+SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+
+SPDX-License-Identifier: MIT
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+
+	<groupId>net.ladenthin</groupId>
+	<artifactId>langchain4j-jllama</artifactId>
+	<version>5.0.4-SNAPSHOT</version>
+	<packaging>jar</packaging>
+
+	<name>${project.groupId}:${project.artifactId}</name>
+	<description>LangChain4j integration for java-llama.cpp: in-process ChatModel,
+		StreamingChatModel, EmbeddingModel and ScoringModel adapters backed by a
+		llama.cpp model over JNI (no HTTP hop).</description>
+	<url>https://github.com/bernardladenthin/java-llama.cpp</url>
+
+	<licenses>
+		<license>
+			<name>MIT License</name>
+			<url>https://www.opensource.org/licenses/mit-license.php</url>
+			<distribution>repo</distribution>
+		</license>
+	</licenses>
+
+	<developers>
+		<developer>
+			<name>Bernard Ladenthin</name>
+			<organizationUrl>https://github.com/bernardladenthin</organizationUrl>
+		</developer>
+	</developers>
+
+	<scm>
+		<connection>scm:git:https://github.com/bernardladenthin/java-llama.cpp.git</connection>
+		<developerConnection>scm:git:https://github.com/bernardladenthin/java-llama.cpp.git</developerConnection>
+		<url>https://github.com/bernardladenthin/java-llama.cpp/tree/main</url>
+	</scm>
+
+	<properties>
+		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+		<maven.compiler.release>17</maven.compiler.release>
+		<!-- Keep in lockstep with the core java-llama.cpp artifact version. -->
+		<jllama.version>5.0.4-SNAPSHOT</jllama.version>
+		<langchain4j.version>1.17.1</langchain4j.version>
+		<junit.version>6.1.1</junit.version>
+		<hamcrest.version>3.0</hamcrest.version>
+		<surefire.version>3.5.5</surefire.version>
+	</properties>
+
+	<dependencies>
+		<!-- The JNI binding we adapt. Provided-by-the-consumer in spirit, but compile
+		     scope so a consumer that only declares langchain4j-jllama still gets it. -->
+		<dependency>
+			<groupId>net.ladenthin</groupId>
+			<artifactId>llama</artifactId>
+			<version>${jllama.version}</version>
+		</dependency>
+
+		<!-- The interfaces we implement (ChatModel/StreamingChatModel/EmbeddingModel/ScoringModel). -->
+		<dependency>
+			<groupId>dev.langchain4j</groupId>
+			<artifactId>langchain4j-core</artifactId>
+			<version>${langchain4j.version}</version>
+		</dependency>
+
+		<dependency>
+			<groupId>org.junit.jupiter</groupId>
+			<artifactId>junit-jupiter</artifactId>
+			<version>${junit.version}</version>
+			<scope>test</scope>
+		</dependency>
+		<dependency>
+			<groupId>org.hamcrest</groupId>
+			<artifactId>hamcrest</artifactId>
+			<version>${hamcrest.version}</version>
+			<scope>test</scope>
+		</dependency>
+	</dependencies>
+
+	<build>
+		<plugins>
+			<plugin>
+				<groupId>org.apache.maven.plugins</groupId>
+				<artifactId>maven-surefire-plugin</artifactId>
+				<version>${surefire.version}</version>
+			</plugin>
+		</plugins>
+	</build>
+</project>
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.langchain4j;
+
+import dev.langchain4j.model.chat.ChatModel;
+import dev.langchain4j.model.chat.request.ChatRequest;
+import dev.langchain4j.model.chat.response.ChatResponse;
+import java.util.Objects;
+import net.ladenthin.llama.LlamaModel;
+
+/**
+ * Adapts an in-process java-llama.cpp {@link LlamaModel} (JNI, no HTTP hop) to langchain4j's
+ * {@link ChatModel} interface.
+ *
+ * <p><strong>Ownership.</strong> The model is <em>borrowed</em>: this adapter neither loads nor
+ * closes it. Pass in a {@link LlamaModel} you already own and keep managing its lifecycle yourself
+ * (try-with-resources or an explicit {@code close()}). A single {@code LlamaModel} may back several
+ * adapters at the same time.
+ *
+ * <p><strong>What is mapped.</strong> Messages (system/user/assistant/tool-result) and the sampling
+ * parameters {@code temperature}/{@code topP}/{@code topK}/{@code maxOutputTokens}/
+ * {@code stopSequences}. The module README additionally lists {@code frequencyPenalty} and
+ * {@code presencePenalty}; the actual conversion lives in {@code LangChain4jMapping}. Tool
+ * <em>specifications</em> on the request are not yet forwarded, so the result is assistant text
+ * rather than tool calls — see the module README for the planned tool-calling bridge.
+ */
+public final class JllamaChatModel implements ChatModel {
+
+    /** Borrowed native model; owned and closed by whoever constructed this adapter. */
+    private final LlamaModel delegate;
+
+    /**
+     * Wraps a borrowed {@link LlamaModel} as a langchain4j chat model.
+     *
+     * @param model the already-loaded model to drive; this adapter never closes it
+     * @throws NullPointerException if {@code model} is {@code null}
+     */
+    public JllamaChatModel(LlamaModel model) {
+        this.delegate = Objects.requireNonNull(model, "model");
+    }
+
+    /**
+     * Runs one blocking chat completion: converts the langchain4j request, calls
+     * {@code LlamaModel.chat(...)}, and converts the answer back.
+     *
+     * @param chatRequest the langchain4j request to execute
+     * @return the langchain4j view of the java-llama.cpp chat response
+     */
+    @Override
+    public ChatResponse doChat(ChatRequest chatRequest) {
+        final var jllamaRequest = LangChain4jMapping.toJllamaRequest(chatRequest);
+        final net.ladenthin.llama.value.ChatResponse jllamaResponse = delegate.chat(jllamaRequest);
+        return LangChain4jMapping.toLangChainResponse(jllamaResponse);
+    }
+}
@@ -0,0 +1,44 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.langchain4j;
+
+import dev.langchain4j.data.embedding.Embedding;
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.embedding.EmbeddingModel;
+import dev.langchain4j.model.output.Response;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import net.ladenthin.llama.LlamaModel;
+
+/**
+ * Adapts an in-process java-llama.cpp {@link LlamaModel} to langchain4j's {@link EmbeddingModel}.
+ *
+ * <p>The backing {@link LlamaModel} has to be loaded in embedding mode
+ * ({@code ModelParameters.enableEmbedding()}). As with {@link JllamaChatModel}, the model is only
+ * <em>borrowed</em>: this adapter never closes it.
+ */
+public final class JllamaEmbeddingModel implements EmbeddingModel {
+
+    /** Borrowed native model; owned and closed by whoever constructed this adapter. */
+    private final LlamaModel delegate;
+
+    /**
+     * Wraps a borrowed {@link LlamaModel} as a langchain4j embedding model.
+     *
+     * @param model the already-loaded, embedding-mode model to drive; this adapter never closes it
+     * @throws NullPointerException if {@code model} is {@code null}
+     */
+    public JllamaEmbeddingModel(LlamaModel model) {
+        this.delegate = Objects.requireNonNull(model, "model");
+    }
+
+    /**
+     * Embeds every segment's text, issuing one {@code LlamaModel.embed(...)} call per segment.
+     *
+     * @param textSegments the segments to embed
+     * @return a response holding one embedding per input segment, in input order
+     */
+    @Override
+    public Response<List<Embedding>> embedAll(List<TextSegment> textSegments) {
+        final List<Embedding> vectors = new ArrayList<>(textSegments.size());
+        for (TextSegment textSegment : textSegments) {
+            final String text = textSegment.text();
+            vectors.add(Embedding.from(delegate.embed(text)));
+        }
+        return Response.from(vectors);
+    }
+}
@@ -0,0 +1,49 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama.langchain4j;
+
+import dev.langchain4j.data.segment.TextSegment;
+import dev.langchain4j.model.output.Response;
+import dev.langchain4j.model.scoring.ScoringModel;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import net.ladenthin.llama.LlamaModel;
+
+/**
+ * langchain4j {@link ScoringModel} (re-ranker) backed by an in-process java-llama.cpp model.
+ *
+ * <p>Maps onto java-llama.cpp's native rerank endpoint, so the backing {@link LlamaModel} must be
+ * loaded in reranking mode ({@code ModelParameters.enableReranking()}). Scores are returned in the
+ * same order as the input segments. The model is <em>borrowed</em> (never closed here) — see
+ * {@link JllamaChatModel}.
+ */
+public final class JllamaScoringModel implements ScoringModel {
+
+    /** Borrowed native model; owned and closed by whoever constructed this adapter. */
+    private final LlamaModel model;
+
+    /**
+     * Creates a scoring model over a borrowed {@link LlamaModel}.
+     *
+     * @param model the loaded reranking-mode model to drive; not closed by this adapter
+     * @throws NullPointerException if {@code model} is {@code null}
+     */
+    public JllamaScoringModel(LlamaModel model) {
+        this.model = Objects.requireNonNull(model, "model");
+    }
+
+    /**
+     * Scores each segment against {@code query} with a single {@code LlamaModel.handleRerank(...)}
+     * call.
+     *
+     * <p>An empty {@code segments} list is answered with an empty score list without touching the
+     * native model.
+     *
+     * @param segments the candidate segments to score
+     * @param query the query the segments are ranked against
+     * @return a response holding one score per input segment
+     */
+    @Override
+    public Response<List<Double>> scoreAll(List<TextSegment> segments, String query) {
+        // Nothing to rank: skip the native rerank call and answer with an empty score list.
+        if (segments.isEmpty()) {
+            List<Double> none = new ArrayList<>();
+            return Response.from(none);
+        }
+
+        String[] documents = segments.stream().map(TextSegment::text).toArray(String[]::new);
+        double[] scores = LangChain4jMapping.parseRerankScores(
+                model.handleRerank(query, documents), documents.length);
+
+        // Box the primitive scores into the List<Double> shape langchain4j expects.
+        List<Double> result = new ArrayList<>(scores.length);
+        for (double score : scores) {
+            result.add(score);
+        }
+        return Response.from(result);
+    }
+}