Add in-process fine-tuning proof of concept (LlamaTrainer)

vaiju1981 · vaiju1981 · commit 2b9fc24be154 · 2026-07-01T12:21:17.000-07:00
Wire llama.cpp's ggml-opt training path into the JNI layer, mirroring upstream
examples/training/finetune.cpp: load a model, tokenize a text corpus into a
ggml-opt dataset, run llama_opt_init + llama_opt_epoch for N epochs, and write
the fine-tuned GGUF via llama_model_save_to_file.

- train_engine.{h,cpp} - self-contained native finetune(), independent of the
  inference server_context (loads its own model + context; forces no-mmap and an
  f32 KV cache, as training requires)
- LlamaTrainer - minimal Java entry point (static finetune(...) overloads)
- CMakeLists.txt - compile train_engine.cpp into libjllama

The ggml-opt / llama_opt symbols already link into the static libjllama with no
build-system change (verified with nm), so this is pure JNI + C++ wiring. The
finetuneNative symbol is exported, the library links and loads cleanly, and the
Java layer compiles through the strict Error Prone / NullAway pipeline.

Scope is deliberately a proof of concept: full-model fine-tuning is compute- and
memory-intensive and upstream training support is experimental. The actual
training run is exercised by a model-gated integration test that self-skips
unless -Dnet.ladenthin.llama.train.model is set. A richer FineTuner API (dataset
handling, optimizer / LoRA options, progress callbacks) can build on this base.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -308,6 +308,7 @@ endif()
 add_library(jllama SHARED
     src/main/cpp/jllama.cpp
     src/main/cpp/tts_engine.cpp
+    src/main/cpp/train_engine.cpp
     ${JLLAMA_TTS_GEN_CPP}
     src/main/cpp/utils.hpp
     ${llama.cpp_SOURCE_DIR}/tools/server/server-common.cpp
diff --git a/src/main/cpp/train_engine.cpp b/src/main/cpp/train_engine.cpp
@@ -0,0 +1,126 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+#include "train_engine.h"
+
+#include "common.h"
+#include "ggml-opt.h"
+#include "llama.h"
+
+#include <jni.h>
+
+#include <exception>
+#include <string>
+#include <vector>
+
+namespace jllama_train {
+
+namespace {
+
+// Owns the ggml-opt handles of one training run so that they are released on every exit path,
+// including when an exception unwinds out of the training loop.
+struct opt_run_handles {
+    ggml_opt_dataset_t dataset = nullptr;
+    ggml_opt_result_t result_train = nullptr;
+    ggml_opt_result_t result_eval = nullptr;
+
+    opt_run_handles() = default;
+    opt_run_handles(const opt_run_handles &) = delete;
+    opt_run_handles &operator=(const opt_run_handles &) = delete;
+
+    ~opt_run_handles() {
+        if (result_train != nullptr) {
+            ggml_opt_result_free(result_train);
+        }
+        if (result_eval != nullptr) {
+            ggml_opt_result_free(result_eval);
+        }
+        if (dataset != nullptr) {
+            ggml_opt_dataset_free(dataset);
+        }
+    }
+};
+
+} // namespace
+
+// Run one fine-tuning job end to end: load the model named by `cfg.model_path`, tokenize
+// `cfg.training_text`, train for `cfg.epochs` epochs (values <= 0 are treated as 1) and write the
+// resulting model to `cfg.output_path`.
+//
+// Returns true on success. On a detected failure returns false and stores a human-readable reason
+// in `err`; `err` is left untouched on success. Exceptions thrown by the underlying libraries are
+// not caught here and propagate to the caller.
+bool finetune(const finetune_config &cfg, std::string &err) {
+    // Reject inputs that can never work before paying for backend initialization and a model load.
+    if (cfg.model_path.empty()) {
+        err = "model path must not be empty";
+        return false;
+    }
+    if (cfg.output_path.empty()) {
+        err = "output path must not be empty";
+        return false;
+    }
+    if (cfg.n_ctx < 0) {
+        err = "context size must not be negative (0 selects the model's trained context)";
+        return false;
+    }
+
+    common_params params;
+    params.escape = false;
+    params.model.path = cfg.model_path;
+    params.prompt = cfg.training_text;
+    params.out_file = cfg.output_path;
+    params.n_ctx = cfg.n_ctx;
+    params.n_gpu_layers = cfg.n_gpu_layers;
+    params.lr.lr0 = cfg.learning_rate;
+    params.lr.epochs = static_cast<unsigned>(cfg.epochs > 0 ? cfg.epochs : 1);
+    params.lr.init(); // required after setting lr fields, before get_lr() is used by the optimizer
+
+    // Training needs writable weights (mmap yields read-only pointers) and an f32 KV cache
+    // (OUT_PROD has no f16 support) — same forced settings as upstream finetune.cpp.
+    params.use_mmap = false;
+    params.cache_type_k = GGML_TYPE_F32;
+    params.cache_type_v = GGML_TYPE_F32;
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // `llama_init` owns the model and the context; both are released when it goes out of scope.
+    common_init_result_ptr llama_init = common_init_from_params(params);
+    if (llama_init == nullptr) {
+        err = "failed to load model for training: " + cfg.model_path;
+        return false;
+    }
+    llama_model *model = llama_init->model();
+    llama_context *ctx = llama_init->context();
+    if (model == nullptr || ctx == nullptr) {
+        err = "failed to load model for training: " + cfg.model_path;
+        return false;
+    }
+
+    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
+    if (tokens.size() < 2) {
+        err = "training text produced too few tokens (need at least 2)";
+        return false;
+    }
+
+    // Consecutive datapoints start `stride` tokens apart (half a context window).
+    const int64_t n_ctx_tokens = static_cast<int64_t>(llama_n_ctx(ctx));
+    const int64_t stride = n_ctx_tokens / 2;
+    if (stride < 1) {
+        err = "context size " + std::to_string(n_ctx_tokens) + " is too small for training (need at least 2)";
+        return false;
+    }
+
+    // A datapoint is a full context window of tokens plus the same window shifted by one token as
+    // labels, so the corpus must cover at least n_ctx + 1 tokens plus one stride for a single
+    // datapoint to exist.
+    // NOTE(review): upstream common_opt_dataset_init appears to derive the datapoint count with
+    // unsigned arithmetic and ggml asserts on an empty dataset, so a shorter corpus would abort
+    // the whole JVM instead of failing cleanly — confirm against the pinned llama.cpp revision.
+    const int64_t n_tokens = static_cast<int64_t>(tokens.size());
+    const int64_t n_tokens_min = n_ctx_tokens + 1 + stride;
+    if (n_tokens < n_tokens_min) {
+        err = "training text is too short: " + std::to_string(n_tokens) + " tokens, but a context size of " +
+              std::to_string(n_ctx_tokens) + " needs at least " + std::to_string(n_tokens_min) +
+              " (use a longer corpus or a smaller context size)";
+        return false;
+    }
+
+    {
+        // Scoped so the dataset and result buffers are released before the model is written out.
+        opt_run_handles run;
+        run.dataset = common_opt_dataset_init(ctx, tokens, stride);
+
+        llama_opt_params lopt_params = {
+            /*n_ctx_train     =*/0,
+            /*param_filter    =*/llama_opt_param_filter_all,
+            /*param_filter_ud =*/nullptr,
+            /*get_opt_pars    =*/common_opt_lr_pars,
+            /*get_opt_pars_ud =*/&params.lr,
+            /*optimizer_type  =*/params.optimizer,
+        };
+        llama_opt_init(ctx, model, lopt_params);
+
+        // Datapoints [0, idata_split) are used for training, the remainder for evaluation. With a
+        // tiny dataset the float product can truncate to 0, which would report success without
+        // training on anything, so keep at least one training datapoint.
+        const int64_t ndata = ggml_opt_dataset_ndata(run.dataset);
+        const int64_t idata_split_raw = ndata * (1.0f - params.val_split);
+        const int64_t idata_split = (idata_split_raw < 1 && ndata >= 1) ? 1 : idata_split_raw;
+
+        run.result_train = ggml_opt_result_init();
+        run.result_eval = ggml_opt_result_init();
+
+        for (params.lr.epoch = 0; params.lr.epoch < params.lr.epochs; ++params.lr.epoch) {
+            llama_opt_epoch(ctx, run.dataset, run.result_train, run.result_eval, idata_split,
+                            ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
+            ggml_opt_result_reset(run.result_train);
+            ggml_opt_result_reset(run.result_eval);
+        }
+    }
+
+    llama_model_save_to_file(model, params.out_file.c_str());
+
+    // Deliberately NOT calling llama_backend_free(): other live llama contexts in this JVM
+    // (e.g. an inference LlamaModel) may still depend on the initialized backend.
+    return true;
+}
+
+} // namespace jllama_train
+
+// JNI entry point behind LlamaTrainer.finetuneNative.
+//
+// Copies the Java arguments into a finetune_config, runs jllama_train::finetune and reports the
+// outcome as a string: an empty string means success, any other text is the failure reason.
+// Returns nullptr only when a Java exception is already pending (raised while reading the string
+// arguments); that exception is then thrown in Java as soon as this function returns.
+// No C++ exception is allowed to escape into the JVM.
+extern "C" JNIEXPORT jstring JNICALL Java_net_ladenthin_llama_LlamaTrainer_finetuneNative(
+    JNIEnv *env, jclass, jstring jmodel, jstring jtext, jstring jout, jint epochs, jfloat learningRate,
+    jint nCtx, jint nGpuLayers) {
+    std::string err;
+    try {
+        // Copies a Java string into a std::string; a null reference becomes "".
+        const auto to_str = [env](jstring s) -> std::string {
+            if (s == nullptr) {
+                return "";
+            }
+            const char *c = env->GetStringUTFChars(s, nullptr);
+            if (c == nullptr) {
+                return ""; // a Java exception is pending; checked after all conversions
+            }
+            std::string out;
+            try {
+                out = c;
+            } catch (...) {
+                env->ReleaseStringUTFChars(s, c); // do not leak the JNI buffer if the copy fails
+                throw;
+            }
+            env->ReleaseStringUTFChars(s, c);
+            return out;
+        };
+
+        jllama_train::finetune_config cfg;
+        cfg.model_path = to_str(jmodel);
+        cfg.training_text = to_str(jtext);
+        cfg.output_path = to_str(jout);
+        cfg.epochs = static_cast<int>(epochs);
+        cfg.learning_rate = static_cast<float>(learningRate);
+        cfg.n_ctx = static_cast<int>(nCtx);
+        cfg.n_gpu_layers = static_cast<int>(nGpuLayers);
+
+        // With an exception pending, most further JNI calls (including NewStringUTF) are not
+        // permitted; hand control back so the JVM can raise it.
+        if (env->ExceptionCheck()) {
+            return nullptr;
+        }
+
+        if (jllama_train::finetune(cfg, err)) {
+            return env->NewStringUTF(""); // empty == success
+        }
+    } catch (const std::exception &e) {
+        err = e.what();
+    } catch (...) {
+        err = "unknown C++ exception during fine-tuning";
+    }
+
+    // An empty string is the success marker, so a failure must never be reported with empty text.
+    if (err.empty()) {
+        err = "fine-tuning failed";
+    }
+    return env->NewStringUTF(err.c_str());
+}
diff --git a/src/main/cpp/train_engine.h b/src/main/cpp/train_engine.h
@@ -0,0 +1,36 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+//
+// Native fine-tuning engine (proof-of-concept): a self-contained wrapper over llama.cpp's
+// ggml-opt training path (llama_opt_init / llama_opt_epoch), mirroring upstream
+// examples/training/finetune.cpp. Loads its own model + context (independent of the inference
+// server_context in jllama.cpp), fine-tunes on a text corpus, and writes a new GGUF via
+// llama_model_save_to_file. Kept out of jllama.cpp so the JNI layer stays thin.
+
+#ifndef JLLAMA_TRAIN_ENGINE_H
+#define JLLAMA_TRAIN_ENGINE_H
+
+#include <string>
+
+namespace jllama_train {
+
+// One fine-tuning run's inputs. The scalar members carry default values so that a
+// default-constructed config never holds indeterminate data; callers normally overwrite them.
+struct finetune_config {
+    std::string model_path;             // base GGUF to fine-tune
+    std::string training_text;          // corpus (tokenized in-process)
+    std::string output_path;            // where the fine-tuned GGUF is written
+    int         epochs = 1;             // number of passes over the corpus (>= 1)
+    float       learning_rate = 1e-5f;  // AdamW lr at the first epoch
+    int         n_ctx = 0;              // context size; 0 = the model's trained context
+    int         n_gpu_layers = -1;      // layers offloaded to the GPU; -1 = auto
+};
+
+// Run one fine-tuning job end to end. Returns true on success; on failure returns false and sets
+// `err`. Not re-entrant; intended to be called off the JVM's critical threads (it blocks for the
+// full training run).
+bool finetune(const finetune_config &cfg, std::string &err);
+
+} // namespace jllama_train
+
+#endif // JLLAMA_TRAIN_ENGINE_H
diff --git a/src/main/java/net/ladenthin/llama/LlamaTrainer.java b/src/main/java/net/ladenthin/llama/LlamaTrainer.java
@@ -0,0 +1,76 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama;
+
+import java.nio.file.Path;
+import net.ladenthin.llama.exception.LlamaException;
+import net.ladenthin.llama.loader.LlamaLoader;
+
+/**
+ * Proof-of-concept in-process fine-tuning entry point, wrapping llama.cpp's ggml-opt training path
+ * ({@code llama_opt_init} / {@code llama_opt_epoch}) the same way the upstream
+ * {@code examples/training/finetune.cpp} tool does. Loads its own model and context (independent of
+ * {@link LlamaModel}), fine-tunes on a text corpus, and writes a new GGUF.
+ *
+ * <p><strong>Status: proof of concept.</strong> Full-model fine-tuning is compute- and
+ * memory-intensive and blocks for the whole run; upstream training support is itself experimental.
+ * This surface is intentionally minimal so the native path (which links ggml-opt into
+ * {@code libjllama} with no extra dependency) can be exercised end to end before a richer
+ * {@code FineTuner} API is designed.
+ */
+public final class LlamaTrainer {
+
+    // Make sure the native library is initialized before the first native call.
+    static {
+        LlamaLoader.initialize();
+    }
+
+    /** Static utility holder; not instantiable. */
+    private LlamaTrainer() {}
+
+    /**
+     * Convenience overload: fine-tunes {@code model} on {@code trainingText} and writes the result
+     * to {@code output}, passing {@code 0} as the context size (the model's trained context) and
+     * {@code -1} as the GPU-layer count (automatic selection).
+     *
+     * @param model the base GGUF model to fine-tune
+     * @param trainingText the training corpus (tokenized in-process)
+     * @param output the path the fine-tuned GGUF is written to
+     * @param epochs number of passes over the corpus (at least 1)
+     * @param learningRate the AdamW learning rate at the first epoch (e.g. {@code 1e-5f})
+     * @throws LlamaException if the native layer reports a failure
+     */
+    public static void finetune(Path model, String trainingText, Path output, int epochs, float learningRate) {
+        final int trainedContext = 0;
+        final int automaticGpuLayers = -1;
+        finetune(model, trainingText, output, epochs, learningRate, trainedContext, automaticGpuLayers);
+    }
+
+    /**
+     * Fine-tunes {@code model} on {@code trainingText} with an explicit context size and GPU
+     * offload, writing the result to {@code output}. The call blocks until the native run returns.
+     *
+     * @param model the base GGUF model to fine-tune
+     * @param trainingText the training corpus (tokenized in-process)
+     * @param output the path the fine-tuned GGUF is written to
+     * @param epochs number of passes over the corpus (at least 1)
+     * @param learningRate the AdamW learning rate at the first epoch (e.g. {@code 1e-5f})
+     * @param nCtx context size in tokens, or {@code 0} to use the model's trained context
+     * @param nGpuLayers number of layers to offload to the GPU, or {@code -1} for automatic
+     * @throws LlamaException if the native layer reports a failure
+     */
+    public static void finetune(
+            Path model, String trainingText, Path output, int epochs, float learningRate, int nCtx, int nGpuLayers) {
+        final String modelPath = model.toString();
+        final String outputPath = output.toString();
+        final String failure =
+                finetuneNative(modelPath, trainingText, outputPath, epochs, learningRate, nCtx, nGpuLayers);
+        // The native side signals success with an empty (or absent) message.
+        if (failure == null || failure.isEmpty()) {
+            return;
+        }
+        throw new LlamaException(failure);
+    }
+
+    /**
+     * Native bridge to the fine-tuning engine.
+     *
+     * @return an empty string on success, otherwise the failure reason
+     */
+    private static native String finetuneNative(
+            String modelPath,
+            String trainingText,
+            String outputPath,
+            int epochs,
+            float learningRate,
+            int nCtx,
+            int nGpuLayers);
+}
diff --git a/src/test/java/net/ladenthin/llama/LlamaTrainerIntegrationTest.java b/src/test/java/net/ladenthin/llama/LlamaTrainerIntegrationTest.java
@@ -0,0 +1,43 @@
+// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
+//
+// SPDX-License-Identifier: MIT
+
+package net.ladenthin.llama;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.is;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import org.junit.jupiter.api.Assumptions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+/**
+ * End-to-end fine-tuning smoke over a real model. Self-skips unless a (small) GGUF is provided via
+ * {@code -Dnet.ladenthin.llama.train.model=/abs/path/to/model.gguf}. Full-model fine-tuning is
+ * compute- and memory-intensive, so this is opt-in and never runs in a default build.
+ */
+class LlamaTrainerIntegrationTest {
+
+    @Test
+    void finetuneWritesAnOutputModel(@TempDir Path tmp) throws Exception {
+        String modelPath = System.getProperty("net.ladenthin.llama.train.model");
+        Assumptions.assumeTrue(
+                modelPath != null && !modelPath.isEmpty() && Files.exists(Paths.get(modelPath)),
+                "set -Dnet.ladenthin.llama.train.model=/path/to/small.gguf to run the fine-tune smoke");
+
+        StringBuilder corpus = new StringBuilder();
+        for (int i = 0; i < 64; i++) {
+            corpus.append("The quick brown fox jumps over the lazy dog. ");
+        }
+
+        Path output = tmp.resolve("finetuned.gguf");
+        LlamaTrainer.finetune(Paths.get(modelPath), corpus.toString(), output, 1, 1e-5f);
+
+        assertThat(Files.exists(output), is(true));
+        assertThat(Files.size(output), greaterThan(0L));
+    }
+}