Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ endif()
add_library(jllama SHARED
src/main/cpp/jllama.cpp
src/main/cpp/tts_engine.cpp
src/main/cpp/train_engine.cpp
${JLLAMA_TTS_GEN_CPP}
src/main/cpp/utils.hpp
${llama.cpp_SOURCE_DIR}/tools/server/server-common.cpp
Expand Down
126 changes: 126 additions & 0 deletions src/main/cpp/train_engine.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
//
// SPDX-License-Identifier: MIT

#include "train_engine.h"

#include "common.h"
#include "ggml-opt.h"
#include "llama.h"

#include <jni.h>

#include <exception>
#include <string>
#include <vector>

namespace jllama_train {

bool finetune(const finetune_config &cfg, std::string &err) {
common_params params;
params.escape = false;
params.model.path = cfg.model_path;
params.prompt = cfg.training_text;
params.out_file = cfg.output_path;
params.n_ctx = cfg.n_ctx;
params.n_gpu_layers = cfg.n_gpu_layers;
params.lr.lr0 = cfg.learning_rate;
params.lr.epochs = static_cast<unsigned>(cfg.epochs > 0 ? cfg.epochs : 1);
params.lr.init(); // required after setting lr fields, before get_lr() is used by the optimizer

// Training needs writable weights (mmap yields read-only pointers) and an f32 KV cache
// (OUT_PROD has no f16 support) — same forced settings as upstream finetune.cpp.
params.use_mmap = false;
params.cache_type_k = GGML_TYPE_F32;
params.cache_type_v = GGML_TYPE_F32;

llama_backend_init();
llama_numa_init(params.numa);

common_init_result_ptr llama_init = common_init_from_params(params);
llama_model *model = llama_init->model();
llama_context *ctx = llama_init->context();
if (model == nullptr || ctx == nullptr) {
err = "failed to load model for training: " + cfg.model_path;
return false;
}

std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
if (tokens.size() < 2) {
err = "training text produced too few tokens (need at least 2)";
return false;
}

ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx, tokens, llama_n_ctx(ctx) / 2);

llama_opt_params lopt_params = {
/*n_ctx_train =*/0,
/*param_filter =*/llama_opt_param_filter_all,
/*param_filter_ud =*/nullptr,
/*get_opt_pars =*/common_opt_lr_pars,
/*get_opt_pars_ud =*/&params.lr,
/*optimizer_type =*/params.optimizer,
};
llama_opt_init(ctx, model, lopt_params);

const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - params.val_split);

ggml_opt_result_t result_train = ggml_opt_result_init();
ggml_opt_result_t result_eval = ggml_opt_result_init();

for (params.lr.epoch = 0; params.lr.epoch < params.lr.epochs; ++params.lr.epoch) {
llama_opt_epoch(ctx, dataset, result_train, result_eval, idata_split,
ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
ggml_opt_result_reset(result_train);
ggml_opt_result_reset(result_eval);
}

ggml_opt_result_free(result_train);
ggml_opt_result_free(result_eval);
ggml_opt_dataset_free(dataset);

llama_model_save_to_file(model, params.out_file.c_str());

// Deliberately NOT calling llama_backend_free(): other live llama contexts in this JVM
// (e.g. an inference LlamaModel) may still depend on the initialized backend.
return true;
}

} // namespace jllama_train

// JNI entry point for LlamaTrainer.finetuneNative.
//
// Converts the Java arguments into a jllama_train::finetune_config, runs the fine-tuning job and
// reports the outcome as a Java string: the empty string means success, any other value is the
// error message. Returns nullptr only when a Java exception is already pending, in which case
// that exception is raised in the caller once this function returns.
//
// NOTE(review): GetStringUTFChars yields *modified* UTF-8, so supplementary characters (e.g.
// emoji) in the training text or in paths reach llama.cpp as surrogate pairs — consider passing
// standard UTF-8 bytes from Java if that matters.
extern "C" JNIEXPORT jstring JNICALL Java_net_ladenthin_llama_LlamaTrainer_finetuneNative(
    JNIEnv *env, jclass, jstring jmodel, jstring jtext, jstring jout, jint epochs, jfloat learningRate,
    jint nCtx, jint nGpuLayers) {
    // Copies a Java string into `out` (a null reference becomes the empty string). Returns false
    // when the JVM could not provide the characters; no further JNI string calls should be made
    // in that case because an exception may be pending.
    const auto copy_jstring = [env](jstring s, std::string &out) -> bool {
        out.clear();
        if (s == nullptr) {
            return true;
        }
        const char *chars = env->GetStringUTFChars(s, nullptr);
        if (chars == nullptr) {
            return false;
        }
        try {
            out.assign(chars);
        } catch (...) {
            // Do not leak the JVM-owned buffer if the copy itself runs out of memory.
            env->ReleaseStringUTFChars(s, chars);
            throw;
        }
        env->ReleaseStringUTFChars(s, chars);
        return true;
    };

    std::string err;
    // Everything that can throw stays inside the try block: a C++ exception must never unwind
    // across the JNI boundary.
    try {
        jllama_train::finetune_config cfg;
        const bool args_ok = copy_jstring(jmodel, cfg.model_path) &&
                             copy_jstring(jtext, cfg.training_text) &&
                             copy_jstring(jout, cfg.output_path);
        if (!args_ok) {
            if (env->ExceptionCheck()) {
                return nullptr; // let the pending Java exception propagate
            }
            err = "failed to read a string argument from the JVM";
        } else {
            cfg.epochs = static_cast<int>(epochs);
            cfg.learning_rate = static_cast<float>(learningRate);
            cfg.n_ctx = static_cast<int>(nCtx);
            cfg.n_gpu_layers = static_cast<int>(nGpuLayers);

            if (jllama_train::finetune(cfg, err)) {
                return env->NewStringUTF(""); // empty == success
            }
        }
    } catch (const std::exception &e) {
        err = e.what();
    } catch (...) {
        err = "unknown C++ exception during fine-tuning";
    }

    // The Java side treats an empty string as success, so a failure must never report one.
    if (err.empty()) {
        err = "fine-tuning failed without an error message";
    }
    return env->NewStringUTF(err.c_str());
}
36 changes: 36 additions & 0 deletions src/main/cpp/train_engine.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
//
// SPDX-License-Identifier: MIT
//
// Native fine-tuning engine (proof-of-concept): a self-contained wrapper over llama.cpp's
// ggml-opt training path (llama_opt_init / llama_opt_epoch), mirroring upstream
// examples/training/finetune.cpp. Loads its own model + context (independent of the inference
// server_context in jllama.cpp), fine-tunes on a text corpus, and writes a new GGUF via
// llama_model_save_to_file. Kept out of jllama.cpp so the JNI layer stays thin.

#ifndef JLLAMA_TRAIN_ENGINE_H
#define JLLAMA_TRAIN_ENGINE_H

#include <string>

namespace jllama_train {

// Inputs of one fine-tuning run. The scalar fields carry default member initializers so that a
// default-constructed config never holds indeterminate values; the defaults match the sentinels
// documented on each field.
struct finetune_config {
    std::string model_path;       // base GGUF to fine-tune
    std::string training_text;    // corpus (tokenized in-process)
    std::string output_path;      // where the fine-tuned GGUF is written
    int epochs = 1;               // number of passes over the corpus (>= 1)
    float learning_rate = 1e-5f;  // AdamW lr at the first epoch
    int n_ctx = 0;                // context size; 0 = the model's trained context
    int n_gpu_layers = -1;        // layers offloaded to the GPU; -1 = auto
};

// Run one fine-tuning job end to end: load the model at `cfg.model_path`, tokenize
// `cfg.training_text`, train for `cfg.epochs` passes and write the result to `cfg.output_path`.
//
// Returns true on success; on failure returns false and sets `err` to a human-readable reason.
// NOTE(review): exceptions from the underlying libraries are presumably not caught here — the JNI
// caller wraps this call in try/catch.
//
// Not re-entrant; intended to be called off the JVM's critical threads (it blocks for the
// full training run).
bool finetune(const finetune_config &cfg, std::string &err);

} // namespace jllama_train

#endif // JLLAMA_TRAIN_ENGINE_H
76 changes: 76 additions & 0 deletions src/main/java/net/ladenthin/llama/LlamaTrainer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
//
// SPDX-License-Identifier: MIT

package net.ladenthin.llama;

import java.nio.file.Path;
import net.ladenthin.llama.exception.LlamaException;
import net.ladenthin.llama.loader.LlamaLoader;

/**
 * Proof-of-concept entry point for in-process fine-tuning. It wraps llama.cpp's ggml-opt training
 * path ({@code llama_opt_init} / {@code llama_opt_epoch}) in the same way as the upstream
 * {@code examples/training/finetune.cpp} tool: a dedicated model and context are loaded
 * (independent of {@link LlamaModel}), trained on a text corpus, and written out as a new GGUF.
 *
 * <p><strong>Status: proof of concept.</strong> Full-model fine-tuning is compute- and
 * memory-intensive and blocks for the whole run; upstream training support is itself experimental.
 * The surface is kept deliberately small so that the native path (which links ggml-opt into
 * {@code libjllama} with no extra dependency) can be exercised end to end before a richer
 * {@code FineTuner} API is designed.
 */
public final class LlamaTrainer {

    /** Context-size value that selects the model's trained context. */
    private static final int MODEL_TRAINED_CONTEXT = 0;

    /** GPU-layer value that selects automatic offloading. */
    private static final int AUTOMATIC_GPU_LAYERS = -1;

    static {
        LlamaLoader.initialize();
    }

    private LlamaTrainer() {}

    /**
     * Fine-tunes {@code model} on {@code trainingText} for {@code epochs} passes and writes the
     * result to {@code output}, using the model's trained context size and automatic GPU-layer
     * selection.
     *
     * @param model the base GGUF model to fine-tune
     * @param trainingText the training corpus (tokenized in-process)
     * @param output the path the fine-tuned GGUF is written to
     * @param epochs number of passes over the corpus (at least 1)
     * @param learningRate the AdamW learning rate at the first epoch (e.g. {@code 1e-5f})
     * @throws LlamaException if the model cannot be loaded or training fails
     */
    public static void finetune(Path model, String trainingText, Path output, int epochs, float learningRate) {
        finetune(model, trainingText, output, epochs, learningRate, MODEL_TRAINED_CONTEXT, AUTOMATIC_GPU_LAYERS);
    }

    /**
     * Fine-tunes {@code model} on {@code trainingText} with an explicit context size and GPU
     * offload setting.
     *
     * @param model the base GGUF model to fine-tune
     * @param trainingText the training corpus (tokenized in-process)
     * @param output the path the fine-tuned GGUF is written to
     * @param epochs number of passes over the corpus (at least 1)
     * @param learningRate the AdamW learning rate at the first epoch (e.g. {@code 1e-5f})
     * @param nCtx context size in tokens, or {@code 0} to use the model's trained context
     * @param nGpuLayers number of layers to offload to the GPU, or {@code -1} for automatic
     * @throws LlamaException if the model cannot be loaded or training fails
     */
    public static void finetune(
            Path model, String trainingText, Path output, int epochs, float learningRate, int nCtx, int nGpuLayers) {
        final String modelPath = model.toString();
        final String outputPath = output.toString();

        // The native layer reports success as a null or empty string; anything else is the error text.
        final String failure =
                finetuneNative(modelPath, trainingText, outputPath, epochs, learningRate, nCtx, nGpuLayers);
        if (failure == null || failure.isEmpty()) {
            return;
        }
        throw new LlamaException(failure);
    }

    private static native String finetuneNative(
            String modelPath,
            String trainingText,
            String outputPath,
            int epochs,
            float learningRate,
            int nCtx,
            int nGpuLayers);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
//
// SPDX-License-Identifier: MIT

package net.ladenthin.llama;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.is;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.junit.jupiter.api.Assumptions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

/**
 * Opt-in end-to-end smoke test that fine-tunes a real model. It skips itself unless a (small)
 * GGUF is supplied via {@code -Dnet.ladenthin.llama.train.model=/abs/path/to/model.gguf}, because
 * full-model fine-tuning is compute- and memory-intensive and must never run in a default build.
 */
class LlamaTrainerIntegrationTest {

    /** System property naming the GGUF model to fine-tune. */
    private static final String MODEL_PROPERTY = "net.ladenthin.llama.train.model";

    /** Sentence repeated to form the training corpus. */
    private static final String SENTENCE = "The quick brown fox jumps over the lazy dog. ";

    /** How many times {@link #SENTENCE} is repeated. */
    private static final int SENTENCE_REPEATS = 64;

    @Test
    void finetuneWritesAnOutputModel(@TempDir Path tmp) throws Exception {
        final String modelPath = System.getProperty(MODEL_PROPERTY);
        final boolean modelAvailable =
                modelPath != null && !modelPath.isEmpty() && Files.exists(Paths.get(modelPath));
        Assumptions.assumeTrue(
                modelAvailable,
                "set -Dnet.ladenthin.llama.train.model=/path/to/small.gguf to run the fine-tune smoke");

        final Path output = tmp.resolve("finetuned.gguf");
        LlamaTrainer.finetune(Paths.get(modelPath), trainingCorpus(), output, 1, 1e-5f);

        assertThat(Files.exists(output), is(true));
        assertThat(Files.size(output), greaterThan(0L));
    }

    /** Builds the corpus by concatenating the sample sentence a fixed number of times. */
    private static String trainingCorpus() {
        final StringBuilder text = new StringBuilder();
        int remaining = SENTENCE_REPEATS;
        while (remaining-- > 0) {
            text.append(SENTENCE);
        }
        return text.toString();
    }
}
Loading