|
| 1 | +// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com> |
| 2 | +// |
| 3 | +// SPDX-License-Identifier: MIT |
| 4 | + |
| 5 | +package net.ladenthin.llama.server; |
| 6 | + |
| 7 | +import fi.iki.elonen.NanoHTTPD; |
| 8 | +import java.io.IOException; |
| 9 | +import net.ladenthin.llama.LlamaModel; |
| 10 | +import net.ladenthin.llama.parameters.ModelParameters; |
| 11 | +import org.slf4j.Logger; |
| 12 | +import org.slf4j.LoggerFactory; |
| 13 | + |
| 14 | +/** |
| 15 | + * Entry point for the optional OpenAI-compatible HTTP server, and the {@code Main-Class} of the |
| 16 | + * {@code -jar-with-dependencies} assembly. |
| 17 | + * |
| 18 | + * <p>It parses the command line ({@link LlamaServerArgs}), loads a GGUF model into a |
| 19 | + * {@link LlamaModel}, and serves OpenAI-compatible endpoints over NanoHTTPD via {@link OaiRouter} / |
| 20 | + * {@link OaiHttpServer}. A shutdown hook stops the server and closes the model on JVM exit |
| 21 | + * (e.g. Ctrl-C / SIGTERM). Run {@code --help} for the full option list.</p> |
| 22 | + * |
| 23 | + * <p>Example:</p> |
| 24 | + * |
| 25 | + * <pre>{@code |
| 26 | + * java -jar llama-<version>-jar-with-dependencies.jar \ |
| 27 | + * --model models/Qwen3-0.6B-Q4_K_M.gguf --host 0.0.0.0 --port 8080 --n-gpu-layers 99 |
| 28 | + * }</pre> |
| 29 | + * |
| 30 | + * <p>Responses are non-streaming: the full JSON result is returned per request.</p> |
| 31 | + */ |
| 32 | +public final class LlamaServer { |
| 33 | + |
| 34 | + private static final Logger LOG = LoggerFactory.getLogger(LlamaServer.class); |
| 35 | + |
| 36 | + private LlamaServer() {} |
| 37 | + |
| 38 | + /** |
| 39 | + * Start the server (blocks the JVM alive on a non-daemon listener thread), or print help. |
| 40 | + * |
| 41 | + * @param args command-line arguments; see {@link LlamaServerArgs#usage()} |
| 42 | + * @throws IOException if the HTTP server cannot bind the configured host/port |
| 43 | + */ |
| 44 | + public static void main(String[] args) throws IOException { |
| 45 | + if (LlamaServerArgs.isHelpRequested(args)) { |
| 46 | + LOG.info("{}{}", System.lineSeparator(), LlamaServerArgs.usage()); |
| 47 | + return; |
| 48 | + } |
| 49 | + |
| 50 | + final LlamaServerConfig config = LlamaServerArgs.parse(args); |
| 51 | + final LlamaModel model = loadModel(config); |
| 52 | + final OaiBackend backend = new LlamaModelOaiBackend(model, config.getModelAlias()); |
| 53 | + final OaiHttpServer server = new OaiHttpServer(config.getHost(), config.getPort(), new OaiRouter(backend)); |
| 54 | + |
| 55 | + Runtime.getRuntime().addShutdownHook(new Thread(() -> shutdown(server, model), "llama-server-shutdown")); |
| 56 | + |
| 57 | + try { |
| 58 | + // daemon=false: the non-daemon listener thread keeps the JVM alive after main() returns. |
| 59 | + server.start(NanoHTTPD.SOCKET_READ_TIMEOUT, false); |
| 60 | + } catch (IOException e) { |
| 61 | + // Close the just-loaded native model before propagating the bind failure. |
| 62 | + model.close(); |
| 63 | + throw e; |
| 64 | + } |
| 65 | + |
| 66 | + LOG.info( |
| 67 | + "LlamaServer listening on http://{}:{} (model={})", |
| 68 | + config.getHost(), |
| 69 | + config.getPort(), |
| 70 | + config.getModelAlias()); |
| 71 | + } |
| 72 | + |
| 73 | + private static LlamaModel loadModel(LlamaServerConfig config) { |
| 74 | + final ModelParameters params = |
| 75 | + new ModelParameters().setModel(config.getModelPath()).setGpuLayers(config.getGpuLayers()); |
| 76 | + if (config.getCtxSize() > 0) { |
| 77 | + params.setCtxSize(config.getCtxSize()); |
| 78 | + } |
| 79 | + if (config.getThreads() > 0) { |
| 80 | + params.setThreads(config.getThreads()); |
| 81 | + } |
| 82 | + if (config.isEmbedding()) { |
| 83 | + params.enableEmbedding(); |
| 84 | + } |
| 85 | + LOG.info("Loading model {} ...", config.getModelPath()); |
| 86 | + return new LlamaModel(params); |
| 87 | + } |
| 88 | + |
| 89 | + private static void shutdown(OaiHttpServer server, LlamaModel model) { |
| 90 | + server.stop(); |
| 91 | + model.close(); |
| 92 | + } |
| 93 | +} |
0 commit comments