java-llama.cpp/src/main/java/net/ladenthin/llama/LlamaModel.java at 0a56e495d2a075d635b3d534df162a083dc37bfd · vaiju1981/java-llama.cpp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
// SPDX-FileCopyrightText: 2023-2025 Konstantin Herud
//
// SPDX-License-Identifier: MIT

package net.ladenthin.llama;

import java.lang.annotation.Native;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import lombok.ToString;
import net.ladenthin.llama.args.LogFormat;
import net.ladenthin.llama.callback.CancellationToken;
import net.ladenthin.llama.callback.LoadProgressCallback;
import net.ladenthin.llama.callback.ToolHandler;
import net.ladenthin.llama.exception.LlamaException;
import net.ladenthin.llama.json.ChatResponseParser;
import net.ladenthin.llama.json.ChatStreamChunkParser;
import net.ladenthin.llama.json.CompletionResponseParser;
import net.ladenthin.llama.json.RerankResponseParser;
import net.ladenthin.llama.loader.LlamaLoader;
import net.ladenthin.llama.loader.SkipDownloadFailureTranslator;
import net.ladenthin.llama.parameters.ChatRequest;
import net.ladenthin.llama.parameters.InferenceParameters;
import net.ladenthin.llama.parameters.ModelParameters;
import net.ladenthin.llama.value.ChatResponse;
import net.ladenthin.llama.value.CompletionResult;
import net.ladenthin.llama.value.LlamaOutput;
import net.ladenthin.llama.value.LogLevel;
import net.ladenthin.llama.value.ModelMeta;
import net.ladenthin.llama.value.Pair;
import net.ladenthin.llama.value.ServerMetrics;
import net.ladenthin.llama.value.StopReason;
import org.jspecify.annotations.Nullable;

/**
 * This class is a wrapper around the llama.cpp functionality.
 * Upon being created, it natively allocates memory for the model context.
 * Thus, this class is an {@link AutoCloseable}, in order to de-allocate the memory when it is no longer being needed.
 * <p>
 * The main functionality of this class is:
 * <ul>
 *     <li>Streaming answers (and probabilities) via {@link #generate(InferenceParameters)}</li>
 *     <li>Creating whole responses to prompts via {@link #complete(InferenceParameters)}</li>
 *     <li>Creating embeddings via {@link #embed(String)} (make sure to configure {@link net.ladenthin.llama.parameters.ModelParameters#enableEmbedding()})</li>
 *     <li>Accessing the tokenizer via {@link #encode(String)} and {@link #decode(int[])}</li>
 * </ul>
 *
 * <p>{@code toString} is generated by Lombok over the native context handle ({@code ctx})
 * plus the parser collaborator references; that gives logs and debuggers a useful
 * "{@code LlamaModel(ctx=12345..., ...)}" identity dump.
 * {@code equals}/{@code hashCode} are intentionally NOT generated: model instances own
 * a native context and are managed by reference identity, not by value.</p>
 */
@ToString
public class LlamaModel implements AutoCloseable {

    // Shared Jackson mapper. NOTE(review): no use of it is visible in this part of the file;
    // presumably methods further down rely on it — confirm before touching it.
    private static final com.fasterxml.jackson.databind.ObjectMapper OBJECT_MAPPER =
            new com.fasterxml.jackson.databind.ObjectMapper();

    // Runs once when the class is initialized, i.e. before any constructor or static native
    // method of this class executes. Presumably this is where the JNI library gets loaded —
    // see LlamaLoader to confirm.
    static {
        LlamaLoader.initialize();
    }

    // Native context handle (see the class Javadoc). It is never assigned by the Java code
    // visible here; presumably the native load/delete methods maintain it — confirm in the
    // JNI sources.
    @Native
    private long ctx;

    // Per-instance parsers for the JSON strings returned by the native bridge methods:
    // raw completions, OpenAI-style chat responses, rerank results and streamed chat chunks.
    private final CompletionResponseParser completionParser = new CompletionResponseParser();
    private final ChatResponseParser chatParser = new ChatResponseParser();
    private final RerankResponseParser rerankParser = new RerankResponseParser();
    private final ChatStreamChunkParser chatStreamParser = new ChatStreamChunkParser();

    /**
     * Load with the given {@link net.ladenthin.llama.parameters.ModelParameters}. Make sure to either set
     * <ul>
     *     <li>{@link net.ladenthin.llama.parameters.ModelParameters#setModel(String)}</li>
     *     <li>{@link net.ladenthin.llama.parameters.ModelParameters#setModelUrl(String)}</li>
     *     <li>{@link net.ladenthin.llama.parameters.ModelParameters#setHfRepo(String)}, {@link net.ladenthin.llama.parameters.ModelParameters#setHfFile(String)}</li>
     * </ul>
     *
     * @param parameters the set of options
     * @throws net.ladenthin.llama.exception.ModelUnavailableException if {@link net.ladenthin.llama.parameters.ModelParameters#setSkipDownload(boolean)
     *                                   setSkipDownload(true)} (or
     *                                   {@link net.ladenthin.llama.args.ModelFlag#SKIP_DOWNLOAD})
     *                                   is set and the configured model file is missing or invalid
     * @throws net.ladenthin.llama.exception.LlamaException            for any other load failure
     */
    // loadModel is a native method; it does not call back into Java with this,
    // so the @UnderInitialization receiver warning is a CF false positive.
    @SuppressWarnings("method.invocation")
    public LlamaModel(ModelParameters parameters) {
        try {
            loadModel(parameters.toArray());
        } catch (LlamaException e) {
            throw SkipDownloadFailureTranslator.translate(parameters, e);
        }
    }

    /**
     * Load the model and forward progress updates to {@code progress}. The callback is
     * invoked synchronously on the constructor thread by the native loader and may
     * return {@code false} to abort the load (in which case this constructor throws
     * {@link net.ladenthin.llama.exception.LlamaException}).
     *
     * @param parameters the set of options
     * @param progress   load progress sink; {@code null} disables the callback
     * @throws net.ladenthin.llama.exception.LlamaException if loading fails or the callback aborts
     */
    // loadModel / loadModelWithProgress are native methods; they do not call back
    // into Java with this, so the @UnderInitialization receiver warning is a CF
    // false positive.
    @SuppressWarnings("method.invocation")
    public LlamaModel(ModelParameters parameters, LoadProgressCallback progress) {
        try {
            if (progress == null) {
                loadModel(parameters.toArray());
            } else {
                loadModelWithProgress(parameters.toArray(), progress);
            }
        } catch (LlamaException e) {
            throw SkipDownloadFailureTranslator.translate(parameters, e);
        }
    }

    /**
     * Runs a blocking, non-streaming completion and returns the generated text. The
     * prompt is used verbatim: no prefix such as "User: " or "###Instruction" is added.
     *
     * @param parameters the inference configuration; streaming is switched off on the
     *                   value returned by {@code withStream(false)} before it is serialized
     * @return an LLM response
     */
    public String complete(InferenceParameters parameters) {
        final String requestJson = parameters.withStream(false).toString();
        final int taskId = requestCompletion(requestJson);
        final LlamaOutput output = completionParser.parse(receiveCompletionJson(taskId));
        return output.text;
    }

    /**
     * Same request flow as {@link #complete(InferenceParameters)}, but the native JSON is
     * parsed into a {@link net.ladenthin.llama.value.CompletionResult} that exposes
     * {@link net.ladenthin.llama.value.Usage}, {@link net.ladenthin.llama.value.Timings},
     * {@link net.ladenthin.llama.value.TokenLogprob} entries and the
     * {@link net.ladenthin.llama.value.StopReason}.
     * <p>
     * Logprobs are populated only when {@link net.ladenthin.llama.parameters.InferenceParameters#withNProbs(int)} is &gt; 0.
     * The raw native JSON is preserved on {@link net.ladenthin.llama.value.CompletionResult#getRawJson()}.
     *
     * @param parameters the inference configuration
     * @return a populated {@link net.ladenthin.llama.value.CompletionResult}
     */
    public CompletionResult completeWithStats(InferenceParameters parameters) {
        final String requestJson = parameters.withStream(false).toString();
        final int taskId = requestCompletion(requestJson);
        final String responseJson = receiveCompletionJson(taskId);
        return completionParser.parseCompletionResult(responseJson);
    }

    /**
     * Runs several completion requests concurrently and returns their texts in input
     * order. Every request is submitted through {@link #completeAsync(InferenceParameters)}
     * before the first result is awaited. How many of them actually execute in parallel
     * is decided by the native scheduler and the slot count configured via
     * {@link net.ladenthin.llama.parameters.ModelParameters#setParallel(int)}; with a
     * default single-slot model the requests still run, but sequentially.
     *
     * @param requests the inference parameter blocks (must be distinct instances)
     * @return the generated texts in input order
     */
    public java.util.List<String> completeBatch(java.util.Collection<InferenceParameters> requests) {
        // Phase 1: submit everything so the work can overlap.
        final List<CompletableFuture<String>> pending = new java.util.ArrayList<>(requests.size());
        requests.forEach(request -> pending.add(completeAsync(request)));

        // Phase 2: wait in submission order, which keeps the output aligned with the input.
        final List<String> texts = new java.util.ArrayList<>(pending.size());
        pending.forEach(future -> texts.add(future.join()));
        return texts;
    }

    /**
     * Batch counterpart of {@link #completeWithStats(InferenceParameters)}: works like
     * {@link #completeBatch(java.util.Collection)}, but every entry is a
     * {@link net.ladenthin.llama.value.CompletionResult} with typed Usage, Timings,
     * logprobs, and stop reason.
     *
     * @param requests the inference parameter blocks (must be distinct instances)
     * @return parsed completion results in input order
     */
    public java.util.List<CompletionResult> completeBatchWithStats(java.util.Collection<InferenceParameters> requests) {
        // Submit all requests first ...
        final List<CompletableFuture<CompletionResult>> pending = new java.util.ArrayList<>(requests.size());
        requests.forEach(request ->
                pending.add(CompletableFuture.supplyAsync(() -> completeWithStats(request))));

        // ... then collect the results in submission order.
        final List<CompletionResult> results = new java.util.ArrayList<>(pending.size());
        pending.forEach(future -> results.add(future.join()));
        return results;
    }

    /**
     * Runs several typed chat requests concurrently via {@link #chat(ChatRequest)} and
     * returns the parsed responses in input order. Actual parallelism requires
     * {@link net.ladenthin.llama.parameters.ModelParameters#setParallel(int)} &gt; 1;
     * otherwise the calls run sequentially on the single slot.
     *
     * @param requests the typed chat requests (must be distinct instances)
     * @return parsed responses in input order
     */
    public java.util.List<ChatResponse> chatBatch(java.util.Collection<ChatRequest> requests) {
        // Submit all requests first ...
        final List<CompletableFuture<ChatResponse>> pending = new java.util.ArrayList<>(requests.size());
        requests.forEach(request -> pending.add(CompletableFuture.supplyAsync(() -> chat(request))));

        // ... then collect the responses in submission order.
        final List<ChatResponse> responses = new java.util.ArrayList<>(pending.size());
        pending.forEach(future -> responses.add(future.join()));
        return responses;
    }

    /**
     * Non-blocking wrapper around {@link #complete(InferenceParameters)}. The blocking
     * call is handed to {@link CompletableFuture#supplyAsync(java.util.function.Supplier)},
     * i.e. it runs on the common {@link java.util.concurrent.ForkJoinPool} instead of the
     * calling thread. The native worker thread inside the JNI context still serializes
     * the actual model work &mdash; this wrapper only moves the blocking Java call off
     * the caller.
     *
     * @param parameters the inference configuration
     * @return a future completed with the generated text
     */
    public CompletableFuture<String> completeAsync(InferenceParameters parameters) {
        final java.util.function.Supplier<String> blockingCall = () -> complete(parameters);
        return CompletableFuture.supplyAsync(blockingCall);
    }

    /**
     * Cancellable asynchronous completion. The returned future is linked to the given
     * {@link net.ladenthin.llama.callback.CancellationToken}: when the future completes
     * with a {@link java.util.concurrent.CancellationException} (for example after
     * {@code future.cancel(true)}), {@link net.ladenthin.llama.callback.CancellationToken#cancel()}
     * is invoked as well so the inference loop returns early.
     *
     * @param parameters the inference configuration
     * @param token cancellation handle bound to the underlying inference loop
     * @return a future completed with whatever text was generated up to the point of stop or cancellation
     */
    // The stage returned by whenComplete is dropped on purpose: it is a fire-and-forget
    // hook on `future`, and `future` (not the chained stage) is what the caller observes.
    // The suppression is placed on the method rather than on a local variable because the
    // local-variable form triggered fb-contrib DLS_DEAD_LOCAL_STORE — see
    // workspace/crossrepostatus.md, "FireAndForget DLS reckoning" row, for the cross-repo policy.
    @SuppressWarnings("FutureReturnValueIgnored")
    public CompletableFuture<String> completeAsync(InferenceParameters parameters, CancellationToken token) {
        final CompletableFuture<String> future =
                CompletableFuture.supplyAsync(() -> complete(parameters, token));
        future.whenComplete((ignoredText, failure) -> {
            // Only a cancellation of the future is forwarded to the token; other failures are not.
            final boolean futureCancelled = failure instanceof java.util.concurrent.CancellationException;
            if (futureCancelled) {
                token.cancel();
            }
        });
        return future;
    }

    /**
     * Non-blocking wrapper around {@link #chatComplete(InferenceParameters)}; the blocking
     * call is executed through {@link CompletableFuture#supplyAsync(java.util.function.Supplier)}.
     *
     * @param parameters the inference parameters including messages
     * @return a future completed with the raw OAI-format JSON response
     */
    public CompletableFuture<String> chatCompleteAsync(InferenceParameters parameters) {
        final java.util.function.Supplier<String> blockingCall = () -> chatComplete(parameters);
        return CompletableFuture.supplyAsync(blockingCall);
    }

    /**
     * Non-blocking wrapper around {@link #chatCompleteText(InferenceParameters)}; the blocking
     * call is executed through {@link CompletableFuture#supplyAsync(java.util.function.Supplier)}.
     *
     * @param parameters the inference parameters including messages
     * @return a future completed with the assistant's reply text
     */
    public CompletableFuture<String> chatCompleteTextAsync(InferenceParameters parameters) {
        final java.util.function.Supplier<String> blockingCall = () -> chatCompleteText(parameters);
        return CompletableFuture.supplyAsync(blockingCall);
    }

    /**
     * Cancellable form of {@link #complete(InferenceParameters)}. The request is issued in
     * streaming mode so that the loop below gets control back between chunks and can check
     * {@link net.ladenthin.llama.callback.CancellationToken#isCancelled()}; a
     * {@link net.ladenthin.llama.callback.CancellationToken#cancel()} issued from another
     * thread therefore ends the call early with the text accumulated so far.
     * <p>
     * The token is reset on entry and again when the loop is left (normally or through an
     * exception), so a cancellation requested before this method is entered is discarded.
     *
     * @param parameters the inference configuration (its {@code stream} flag is set to {@code true})
     * @param token cancellation handle observed at each token boundary
     * @return the text generated up to the point of stop or cancellation
     */
    public String complete(InferenceParameters parameters, CancellationToken token) {
        token.reset();
        final int taskId = requestCompletion(parameters.withStream(true).toString());
        final StringBuilder generated = new StringBuilder();
        try {
            boolean finished = false;
            while (!finished) {
                if (token.isCancelled()) {
                    // Best-effort native release. Safe at this point because this thread is
                    // not inside receiveCompletionJson: the cooperative flag is only checked
                    // between two receive calls.
                    cancelCompletion(taskId);
                    finished = true;
                } else {
                    final LlamaOutput chunk = completionParser.parse(receiveCompletionJson(taskId));
                    generated.append(chunk.text);
                    finished = chunk.stop;
                }
            }
            // NOTE(review): an exception thrown by the receive/parse step leaves this loop
            // without calling cancelCompletion — confirm the native side releases the task
            // by itself in that case.
        } finally {
            token.reset();
        }
        return generated.toString();
    }

    /**
     * Starts a streaming completion and exposes its outputs as an iterable. The prompt is
     * used verbatim: no prefix such as "User: " or "###Instruction" is added.
     *
     * <p>The returned {@link LlamaIterable} implements {@link AutoCloseable}. Use it in a
     * try-with-resources block so the native task slot is released even when the consumer
     * leaves the loop early:
     * <pre>{@code
     * try (LlamaIterable it = model.generate(params)) {
     *     for (LlamaOutput out : it) {
     *         if (shouldStop(out)) break;   // close() cancels the native task automatically
     *     }
     * }
     * }</pre>
     *
     * @param parameters the inference configuration
     * @return iterable LLM outputs
     */
    public LlamaIterable generate(InferenceParameters parameters) {
        final LlamaIterator outputIterator = new LlamaIterator(this, parameters);
        return new LlamaIterable(outputIterator);
    }

    /**
     * Computes the embedding of a string. The prompt is used verbatim: no prefix such as
     * "User: " or "###Instruction" is added. Implemented natively.
     *
     * @param prompt the string to embed
     * @return an embedding float array
     * @throws IllegalStateException if embedding mode was not activated (see {@link net.ladenthin.llama.parameters.ModelParameters#enableEmbedding()})
     */
    public native float[] embed(String prompt);

    /**
     * Tokenizes a prompt using the native tokenizer. {@link #decode(int[])} is the
     * counterpart that turns token ids back into text.
     *
     * @param prompt the prompt to tokenize
     * @return an array of integers each representing a token id
     */
    public native int[] encode(String prompt);

    /**
     * Turns token ids back into text. The bytes produced by the native
     * {@code decodeBytes} call are interpreted as UTF-8.
     *
     * @param tokens an array of tokens
     * @return the token ids decoded to a string
     */
    public String decode(int... tokens) {
        return new String(decodeBytes(tokens), StandardCharsets.UTF_8);
    }

    /**
     * Sets a callback for native llama.cpp log messages.
     * By default, log messages are written as JSON to stdout. Note that in text mode the callback is also
     * invoked with log messages of the GGML backend, while JSON mode can only access request log messages.
     * In JSON mode, GGML messages are still written to stdout.
     * To change only the log format but keep logging to stdout, pass <code>null</code> as the callback.
     * To disable logging, pass an empty callback, i.e., <code>(level, msg) {@literal ->} {}</code>.
     *
     * @param format the log format to use
     * @param callback a method to call for log messages; may be <code>null</code> (see above)
     */
    public static native void setLogger(LogFormat format, BiConsumer<LogLevel, String> callback);

    /**
     * Releases this model by delegating to the native {@code delete()} method.
     * NOTE(review): whether a second call is harmless depends on the native
     * implementation, which is not visible here — confirm before relying on it.
     */
    @Override
    public void close() {
        this.delete();
    }

    /**
     * Deliberately {@code final} and empty. This closes the finalizer-attack vector
     * on partially-constructed instances when one of the load-throwing
     * constructors aborts &mdash; see SpotBugs {@code CT_CONSTRUCTOR_THROW}.
     * Subclassing this class is still permitted (e.g. for test doubles), but
     * no subclass can override this no-op finalizer to capture a reference to
     * a half-built model.
     * <p>
     * {@link Object#finalize()} is deprecated since JDK 9 and marked for
     * removal, but is still present in JDK 21 (the current build JDK). Once
     * finalization is removed from the platform this override can be
     * deleted, since the attack vector disappears together with the mechanism.
     * </p>
     */
    @SuppressWarnings({"deprecation", "removal", "Finalize"})
    @Override
    protected final void finalize() {
        // Intentionally empty: native memory is released through close(), never by the finalizer.
    }

    // Native bridge methods. Do not overload native methods: overloading makes the
    // generated C++ function names nasty.

    // Submits a completion request (the serialized inference parameters) and returns the
    // task id that receiveCompletionJson / cancelCompletion take.
    native int requestCompletion(String params);

    // Returns the next result for the task as a JSON string. In streaming mode the callers
    // in this class invoke it repeatedly until the parsed output reports `stop`.
    native String receiveCompletionJson(int taskId);

    // Cancels a running completion task; used by the cancellable complete(...) overload
    // once its CancellationToken has been triggered.
    native void cancelCompletion(int taskId);

    // Raw-byte form of decode(int...); the caller interprets the bytes as UTF-8.
    native byte[] decodeBytes(int... tokens);

    // Model loading, invoked only from the constructors with ModelParameters.toArray().
    private native void loadModel(String... parameters);

    // Same as loadModel, additionally reporting progress to the given callback.
    private native void loadModelWithProgress(String[] parameters, LoadProgressCallback callback);

    // Native teardown, invoked from close().
    private native void delete();

    // NOTE(review): not called anywhere in the visible part of this file; presumably used
    // by the streaming iterator to free a task slot — confirm against LlamaIterator.
    native void releaseTask(int taskId);

    // Raw-byte form of jsonSchemaToGrammar(String); the caller interprets the bytes as UTF-8.
    private static native byte[] jsonSchemaToGrammarBytes(String schema);

    /**
     * Translates a JSON schema into a grammar string that can be passed to
     * {@link net.ladenthin.llama.parameters.ModelParameters#setGrammar(String)}. The
     * conversion itself happens natively; the returned bytes are interpreted as UTF-8.
     *
     * @param schema the JSON schema as a string
     * @return the converted grammar string
     */
    public static String jsonSchemaToGrammar(String schema) {
        final byte[] grammarBytes = jsonSchemaToGrammarBytes(schema);
        return new String(grammarBytes, StandardCharsets.UTF_8);
    }

    /**
     * Scores the given documents against the query and optionally orders them.
     *
     * @param reRank whether to sort results by score in descending order
     * @param query the query string
     * @param documents the documents to rank
     * @return a list of document/score pairs, sorted if {@code reRank} is {@code true};
     *         otherwise in the order produced by the response parser
     */
    public List<Pair<String, Float>> rerank(boolean reRank, String query, String... documents) {
        final List<Pair<String, Float>> scored = rerankParser.parse(handleRerank(query, documents));
        if (!reRank) {
            return scored;
        }
        // Highest score first; the parsed list is sorted in place.
        scored.sort((left, right) -> Float.compare(right.getValue(), left.getValue()));
        return scored;
    }

    /**
     * Scores the given documents against the query and returns the result as a
     * {@link net.ladenthin.llama.value.LlamaOutput} whose probabilities map holds one
     * document/score entry per distinct document. The output is built from the query,
     * that map, {@code true} and {@link net.ladenthin.llama.value.StopReason#EOS}.
     *
     * @param query the query string
     * @param documents the documents to rank
     * @return a LlamaOutput with document/score pairs in the probabilities map
     */
    public LlamaOutput rerank(String query, String... documents) {
        final List<Pair<String, Float>> scored = rerankParser.parse(handleRerank(query, documents));
        final Map<String, Float> scoreByDocument = new HashMap<>(scored.size());
        // A document that occurs more than once keeps the score of its last occurrence.
        scored.forEach(entry -> scoreByDocument.put(entry.getKey(), entry.getValue()));
        return new LlamaOutput(query, scoreByDocument, true, StopReason.EOS);
    }

    // Native bridge behind both rerank overloads; returns a JSON string that rerankParser
    // turns into document/score pairs.
    native String handleRerank(String query, String... documents);

    /**
     * Renders the chat template for the given inference parameters. The parameters are
     * serialized with {@code toString()} and passed to {@link #applyTemplate(String)}.
     *
     * @param parameters the inference parameters containing message configuration
     * @return the formatted chat template string
     */
    public String applyTemplate(InferenceParameters parameters) {
        final String parametersJson = parameters.toString();
        return applyTemplate(parametersJson);
    }
    /**
     * Native bridge that applies the chat template to a JSON-serialized parameter blob.
     * {@link #applyTemplate(InferenceParameters)} is the typed entry point that produces
     * this JSON from an {@code InferenceParameters} instance.
     *
     * @param parametersJson JSON-serialized inference parameters
     * @return the formatted chat template string
     */
    public native String applyTemplate(String parametersJson);

    /**
     * Performs a blocking OpenAI-compatible chat completion and returns the raw JSON
     * response. The parameters must carry a "messages" array in the standard OpenAI chat
     * format (objects with "role" and "content" fields); the model's chat template is
     * applied automatically. Streaming is switched off before the request is serialized.
     * <p>
     * Example usage:
     * <pre>{@code
     * List<Pair<String, String>> messages = new ArrayList<>();
     * messages.add(new Pair<>("user", "What is the capital of France?"));
     *
     * InferenceParameters params = InferenceParameters.empty()
     *     .withMessages("You are a helpful assistant.", messages)
     *     .withNPredict(128)
     *     .withTemperature(0.7f);
     *
     * String response = model.chatComplete(params);
     * }</pre>
     *
     * @param parameters the inference parameters including messages
     * @return the model's response as a JSON string containing the completion result
     * @throws net.ladenthin.llama.exception.LlamaException if the model was loaded in embedding mode or if inference fails
     */
    public String chatComplete(InferenceParameters parameters) {
        final String requestJson = parameters.withStream(false).toString();
        return handleChatCompletions(requestJson);
    }

    /**
     * Performs an OpenAI-compatible chat completion and returns just the assistant's text.
     * Where {@link #chatComplete(InferenceParameters)} hands back the raw OAI JSON, this
     * method extracts the reply from it, which makes it the chat counterpart of
     * {@link #complete(InferenceParameters)}.
     *
     * @param parameters the inference parameters including messages
     * @return the assistant's reply text (extracted from {@code choices[0].message.content})
     * @throws net.ladenthin.llama.exception.LlamaException if the model was loaded in embedding mode or if inference fails
     */
    public String chatCompleteText(InferenceParameters parameters) {
        final String rawResponse = chatComplete(parameters);
        return chatParser.extractChoiceContent(rawResponse);
    }

    /**
     * Typed chat completion. Builds {@link net.ladenthin.llama.parameters.InferenceParameters}
     * from a {@link net.ladenthin.llama.parameters.ChatRequest} (with optional tools), calls
     * the native chat endpoint through {@link #chatComplete(InferenceParameters)}, and parses
     * the answer into a {@link net.ladenthin.llama.value.ChatResponse} carrying typed
     * {@link net.ladenthin.llama.value.Usage}, {@link net.ladenthin.llama.value.Timings}, and
     * a {@link net.ladenthin.llama.value.ChatChoice} list.
     * <p>
     * Tool choice and parallel-tool-call settings are forwarded only when the request
     * actually declares tools. The request's customizer is applied last.
     *
     * @param request the typed request (messages + optional tools)
     * @return the parsed typed response
     */
    public ChatResponse chat(ChatRequest request) {
        InferenceParameters effective = InferenceParameters.empty();
        effective = effective.withMessagesJson(request.buildMessagesJson());

        final Optional<String> toolsJson = request.buildToolsJson();
        if (toolsJson.isPresent()) {
            // Declaring tools also switches the chat template on.
            effective = effective.withToolsJson(toolsJson.get());
            effective = effective.withUseChatTemplate(true);

            final Optional<String> toolChoice = request.getToolChoice();
            if (toolChoice.isPresent()) {
                effective = effective.withToolChoice(toolChoice.get());
            }

            final Optional<Boolean> parallelToolCalls = request.getParallelToolCalls();
            if (parallelToolCalls.isPresent()) {
                effective = effective.withParallelToolCalls(parallelToolCalls.get());
            }
        }

        // The customizer runs after the tool settings and therefore sees (and may replace) them.
        effective = request.applyCustomizer(effective);
        return chatParser.parseResponse(chatComplete(effective));
    }

    /**
     * Tool-calling agent loop. Repeatedly calls {@link #chat(ChatRequest)}; on each
     * response that includes {@code tool_calls}, invokes the matching {@link net.ladenthin.llama.callback.ToolHandler}
     * for every call, appends the assistant turn and tool-result turns to the request's
     * message list, and loops until either the model responds without tool calls or the
     * round cap from {@link net.ladenthin.llama.parameters.ChatRequest#getMaxToolRounds()} is reached.
     * <p>
     * Handler exceptions are caught and reported back to the model as
     * {@code {"error":"..."}} tool results so the loop can continue. Unknown tool names
     * produce {@code {"error":"unknown tool: <name>"}}.
     * </p>
     * <p>
     * The loop itself lives in {@code ToolCallingAgent}; this method only supplies
     * {@link #chat(ChatRequest)} as the function that performs a single round.
     * </p>
     *
     * @param request  the typed request; must declare tools that the model can call
     * @param handlers map from tool name to handler
     * @return the final {@link net.ladenthin.llama.value.ChatResponse} when the model stops issuing tool calls
     *         (or the last response when the round cap is hit)
     */
    public ChatResponse chatWithTools(ChatRequest request, java.util.Map<String, ToolHandler> handlers) {
        // Delegates entirely to the agent; `this::chat` is the per-round chat function.
        return ToolCallingAgent.run(request, handlers, this::chat);
    }

    /**
     * Starts a streaming OpenAI-compatible chat completion and exposes its outputs token
     * by token. The parameters must carry a "messages" array in the standard OpenAI chat
     * format; the model's chat template is applied automatically.
     * <p>
     * Example usage:
     * <pre>{@code
     * List<Pair<String, String>> messages = new ArrayList<>();
     * messages.add(new Pair<>("user", "Tell me a story."));
     *
     * InferenceParameters params = InferenceParameters.empty()
     *     .withMessages("You are a storyteller.", messages)
     *     .withNPredict(128);
     *
     * for (LlamaOutput output : model.generateChat(params)) {
     *     System.out.print(output.text);
     * }
     * }</pre>
     *
     * @param parameters the inference parameters including messages
     * @return iterable LLM outputs with the chat template applied
     * @throws net.ladenthin.llama.exception.LlamaException if inference fails
     */
    public LlamaIterable generateChat(InferenceParameters parameters) {
        // The trailing `true` is what distinguishes this from generate(...), which uses
        // the two-argument LlamaIterator constructor.
        final LlamaIterator chatIterator = new LlamaIterator(this, parameters, true);
        return new LlamaIterable(chatIterator);
    }

    /**
     * Stream an OpenAI-compatible chat completion as {@code chat.completion.chunk} JSON objects,
     * feeding each chunk's JSON string to {@code chunkSink} as it is produced.
     * <p>
     * Unlike {@link #generateChat(InferenceParameters)} (which yields raw token text), this method
     * routes through the native OpenAI streaming formatter, so each emitted chunk is a ready-to-send
     * OpenAI streaming event — including streamed {@code delta.tool_calls} when the model issues a
     * tool call. The final chunk carries a non-null {@code finish_reason} and, when the request set
     * {@code stream_options.include_usage}, a trailing usage chunk. This is the building block for an
     * OpenAI-compatible HTTP endpoint (Server-Sent Events): forward each chunk verbatim as one
     * {@code data:} line and emit {@code data: [DONE]} after this method returns.
     * <p>
     * The {@code "messages"} array (and any {@code "tools"}/{@code "tool_choice"}) is forwarded
     * verbatim to the native chat-template parser. Streaming is forced on regardless of the
     * {@code stream} flag in {@code parameters}. If {@code chunkSink} (or the native receive call)
     * throws, the in-flight native task is cancelled and that original exception propagates to the
     * caller; a failure of the cancellation itself is attached to it as a suppressed exception
     * instead of replacing it.
     *
     * @param parameters the inference parameters including messages (and optional tools)
     * @param chunkSink receiver for each {@code chat.completion.chunk} JSON string, in order
     * @throws NullPointerException if {@code parameters} or {@code chunkSink} is {@code null};
     *         raised before any native task is requested
     * @throws net.ladenthin.llama.exception.LlamaException if inference fails
     */
    public void streamChatCompletion(InferenceParameters parameters, Consumer<String> chunkSink) {
        // Validate up front: a null sink would otherwise only be noticed after a native task had
        // already been started and would then have to be cancelled.
        java.util.Objects.requireNonNull(parameters, "parameters");
        java.util.Objects.requireNonNull(chunkSink, "chunkSink");

        InferenceParameters streaming = parameters.withStream(true);
        int taskId = requestChatCompletionStream(streaming.toString());
        try {
            boolean stopped = false;
            while (!stopped) {
                String envelope = receiveChatCompletionChunk(taskId);
                stopped = chatStreamParser.feed(envelope, chunkSink);
            }
            // Clean stop: the loop only ends normally once the parser reported the final chunk,
            // so no cancellation is needed on this path.
        } catch (Throwable failure) {
            // Early exit (e.g. chunkSink threw): best-effort cancel so the native task/slot is not
            // leaked. We are not concurrently inside receiveChatCompletionChunk at this point.
            // Throwable is caught only to mirror "finally" semantics; it is always rethrown.
            try {
                cancelCompletion(taskId);
            } catch (Throwable cancelFailure) {
                // Keep the root cause visible; a cleanup failure must not mask it.
                failure.addSuppressed(cancelFailure);
            }
            throw failure;
        }
    }

    /**
     * Run a blocking completion and return the full result as a JSON string.
     * This is the JSON-in/JSON-out equivalent of {@link #complete(InferenceParameters)}.
     * <p>
     * Declared {@code native}: the implementation lives in the native library, not in this class.
     * </p>
     *
     * @param paramsJson request serialized as a JSON string; must contain at least a "prompt" field
     * @return JSON response from the server, as a string
     */
    public native String handleCompletions(String paramsJson);

    /**
     * Run an OpenAI-compatible completion (mirrors the /v1/completions endpoint).
     * The result is returned in OAI format, i.e. with a choices array.
     * <p>
     * Declared {@code native}: the implementation lives in the native library, not in this class.
     * </p>
     *
     * @param paramsJson JSON string with OAI-compatible completion parameters
     * @return JSON response in OAI format, as a string
     */
    public native String handleCompletionsOai(String paramsJson);

    /**
     * Run a text infill completion with an explicit prefix and suffix.
     * The request JSON must contain the "input_prefix" and "input_suffix" fields.
     * <p>
     * Declared {@code native}: the implementation lives in the native library, not in this class.
     * </p>
     *
     * @param paramsJson JSON string with the infill parameters
     * @return JSON response from the server, as a string
     */
    public native String handleInfill(String paramsJson);

    /**
     * Generate embeddings for the given input. The request JSON should contain
     * either an "input" field (OAI-compatible form) or a "content" field.
     * <p>
     * Declared {@code native}: the implementation lives in the native library, not in this class.
     * </p>
     *
     * @param paramsJson JSON string with the embedding request
     * @param oaiCompat {@code true} to format the response in OAI-compatible format
     * @return JSON response with the embedding vectors, as a string
     */
    public native String handleEmbeddings(String paramsJson, boolean oaiCompat);

    /**
     * Tokenize text content, optionally including the token piece strings in the result.
     * <p>
     * Declared {@code native}: the implementation lives in the native library, not in this class.
     * </p>
     *
     * @param content the text to tokenize
     * @param addSpecial whether to add special tokens (BOS/EOS)
     * @param withPieces whether to include token piece strings in the response
     * @return JSON response with the token data, as a string
     */
    public native String handleTokenize(String content, boolean addSpecial, boolean withPieces);

    /**
     * Detokenize a sequence of token IDs back to text.
     * <p>
     * The parameter is varargs, so callers may pass either an {@code int[]} or the IDs as
     * individual arguments. Declared {@code native}: the implementation lives in the native
     * library, not in this class.
     * </p>
     *
     * @param tokens the token IDs to decode
     * @return JSON response with the decoded text, as a string
     */
    public native String handleDetokenize(int... tokens);

    // ------------------------------------------------------------------
    // Server management
    // ------------------------------------------------------------------

    /**
     * Fetches server metrics and slot information, returned as a raw JSON string.
     *
     * @return JSON with slot data, idle/processing counts, and performance metrics
     */
    public String getMetrics() {
        // Action code 0 is the one this metrics query sends to the shared slot-action entry point.
        final int metricsAction = 0;
        // A slot id of 0 and a null filename are passed along — presumably ignored for this
        // action; confirm against the native implementation.
        final int slotIdArgument = 0;
        return handleSlotAction(metricsAction, slotIdArgument, null);
    }

    /**
     * Runs {@link #complete(InferenceParameters)} constrained to the supplied JSON Schema and
     * deserializes the result into an instance of {@code type}. The schema is applied by calling
     * {@link net.ladenthin.llama.parameters.InferenceParameters#withJsonSchema(String)} on
     * {@code parameters}; the object returned by that call is what drives the completion.
     * <p>
     * Callers are responsible for producing a JSON Schema that matches the target type;
     * this project intentionally does not pull in a schema-from-POJO generator. Use the
     * two-argument overload {@link #completeAsJson(Class, InferenceParameters)} when
     * the schema has already been set on {@code parameters}.
     *
     * @param type       the target POJO class for Jackson deserialization
     * @param schema     JSON Schema string applied via {@code withJsonSchema}
     * @param parameters inference parameters the schema is applied to
     * @param <T>        target type
     * @return parsed POJO of type {@code T}
     * @throws net.ladenthin.llama.exception.LlamaException when the response is not valid JSON for the target type
     */
    public <T> T completeAsJson(Class<T> type, String schema, InferenceParameters parameters) {
        // NOTE(review): earlier documentation disagreed on whether the caller's parameters object
        // is itself modified or a new derivation is produced; that depends on withJsonSchema and
        // is not visible here — confirm.
        final InferenceParameters constrained = parameters.withJsonSchema(schema);
        return completeAsJson(type, constrained);
    }

    /**
     * Runs {@link #complete(InferenceParameters)} and deserializes the returned text as JSON into
     * {@code type}. The {@code parameters} object should already carry a JSON Schema set via
     * {@link net.ladenthin.llama.parameters.InferenceParameters#withJsonSchema(String)} or a grammar set via
     * {@link net.ladenthin.llama.parameters.InferenceParameters#withGrammar(String)} — otherwise the model
     * output is unlikely to parse.
     *
     * @param type       the target POJO class for Jackson deserialization
     * @param parameters inference parameters (schema/grammar already set by the caller)
     * @param <T>        target type
     * @return parsed POJO of type {@code T}
     * @throws net.ladenthin.llama.exception.LlamaException when the response is not valid JSON for the target type
     */
    public <T> T completeAsJson(Class<T> type, InferenceParameters parameters) {
        final String completionText = complete(parameters);
        final T parsed;
        try {
            parsed = OBJECT_MAPPER.readValue(completionText, type);
        } catch (java.io.IOException parseFailure) {
            // Wrap the mapper's I/O-typed failure in the library exception, keeping it as the cause.
            final String reason =
                    "Failed to parse completion as " + type.getSimpleName() + ": " + parseFailure.getMessage();
            throw new LlamaException(reason, parseFailure);
        }
        return parsed;
    }

    /**
     * Typed counterpart of {@link #getMetrics()}. The raw JSON is parsed into a
     * {@link net.ladenthin.llama.value.ServerMetrics} view that exposes cumulative
     * {@link net.ladenthin.llama.value.Usage} and {@link net.ladenthin.llama.value.Timings},
     * slot counts, and a passthrough to the underlying JSON.
     *
     * @return parsed {@link net.ladenthin.llama.value.ServerMetrics}
     * @throws net.ladenthin.llama.exception.LlamaException if the native call fails or the response cannot be parsed
     */
    public ServerMetrics getMetricsTyped() {
        try {
            final String metricsJson = getMetrics();
            return new ServerMetrics(OBJECT_MAPPER.readTree(metricsJson));
        } catch (java.io.IOException parseFailure) {
            final String reason = "Failed to parse server metrics JSON: " + parseFailure.getMessage();
            throw new LlamaException(reason, parseFailure);
        }
    }

    /**
     * Returns the model metadata with typed accessors for vocab, context, embedding,
     * parameter count, size, and modality support flags (vision, audio).
     * <p>
     * The returned {@link net.ladenthin.llama.value.ModelMeta} wraps the raw JSON from the native layer.
     * Call {@link net.ladenthin.llama.value.ModelMeta#toString()} to re-serialize it to compact JSON,
     * e.g. for use in {@code assertEquals}.
     * </p>
     *
     * @return {@link net.ladenthin.llama.value.ModelMeta} parsed from the native {@code model_meta()} response
     * @throws net.ladenthin.llama.exception.LlamaException if the native call fails or the response cannot be parsed
     */
    public ModelMeta getModelMeta() {
        try {
            final String metaJson = getModelMetaJson();
            return new ModelMeta(OBJECT_MAPPER.readTree(metaJson));
        } catch (java.io.IOException parseFailure) {
            final String reason = "Failed to parse model meta JSON: " + parseFailure.getMessage();
            throw new LlamaException(reason, parseFailure);
        }
    }

    /**
     * Native source of the raw model-metadata JSON string that {@link #getModelMeta()} parses.
     * Package-private; the typed {@link #getModelMeta()} accessor is the public entry point.
     *
     * @return the model metadata as a JSON string
     */
    native String getModelMetaJson();

    /**
     * Erases the KV cache of one specific slot.
     *
     * @param slotId the ID of the slot to erase
     * @return JSON with the erase result
     */
    public String eraseSlot(int slotId) {
        // Action code 3 is the erase request; no file is involved, so the filename is null.
        final int eraseAction = 3;
        return handleSlotAction(eraseAction, slotId, null);
    }

    /**
     * Saves the KV cache state of a slot to a file.
     *
     * @param slotId the ID of the slot to save
     * @param filepath the path of the file to write the state to
     * @return JSON with the save result
     */
    public String saveSlot(int slotId, String filepath) {
        // Action code 1 is the save request sent to the shared slot-action entry point.
        final int saveAction = 1;
        return handleSlotAction(saveAction, slotId, filepath);
    }

    /**
     * Restores the KV cache state of a slot from a file.
     *
     * @param slotId the ID of the slot to restore
     * @param filepath the path of the file to read the state from
     * @return JSON with the restore result
     */
    public String restoreSlot(int slotId, String filepath) {
        // Action code 2 is the restore request sent to the shared slot-action entry point.
        final int restoreAction = 2;
        return handleSlotAction(restoreAction, slotId, filepath);
    }

    /**
     * Configure runtime inference parameters.
     * Accepts a JSON string in which each of the following keys is optional:
     * <ul>
     *   <li>"slot_prompt_similarity" (float, 0.0-1.0)</li>
     *   <li>"n_threads" (int, &gt; 0)</li>
     *   <li>"n_threads_batch" (int, &gt; 0)</li>
     * </ul>
     * <p>
     * Declared {@code native}: the implementation lives in the native library, not in this class.
     * How out-of-range or unknown keys are handled is not visible from this declaration.
     * </p>
     *
     * @param configJson JSON configuration string
     * @return true if configuration was applied successfully
     */
    public native boolean configureParallelInference(String configJson);

    /**
     * Native entry point shared by the slot/metrics helpers of this class. The action codes used
     * by the callers visible here are: 0 ({@link #getMetrics()}), 1 ({@link #saveSlot(int, String)}),
     * 2 ({@link #restoreSlot(int, String)}) and 3 ({@link #eraseSlot(int)}).
     *
     * @param action   numeric code selecting the operation
     * @param slotId   the slot the operation applies to ({@link #getMetrics()} passes 0)
     * @param filename file path for save/restore; the metrics and erase callers pass {@code null}
     * @return JSON result of the operation, as a string
     */
    native String handleSlotAction(int action, int slotId, @Nullable String filename);

    /**
     * Run an OpenAI-compatible chat completion (mirrors the {@code /v1/chat/completions}
     * endpoint). The request JSON must contain a {@code "messages"} array in the standard
     * OpenAI chat format; the model's chat template is applied automatically. The result is
     * returned in OAI format with a {@code "choices"} array.
     * <p>
     * This is the raw JSON-in/JSON-out form used by
     * {@link #chatComplete(net.ladenthin.llama.parameters.InferenceParameters)}
     * and by the embedded OpenAI-compatible server
     * ({@link net.ladenthin.llama.server.LlamaServer}); it is the chat counterpart of
     * {@link #handleCompletionsOai(String)} and {@link #handleEmbeddings(String, boolean)}.
     * Declared {@code native}: the implementation lives in the native library, not in this class.
     * </p>
     *
     * @param params JSON string with OAI-compatible chat-completion parameters (incl. {@code "messages"})
     * @return JSON response in OAI chat-completion format
     */
    public native String handleChatCompletions(String params);

    // Native, package-private. Takes the chat request as a string and returns an int.
    // NOTE(review): presumably the non-streaming sibling of requestChatCompletionStream, with
    // the int being a task id — no caller is visible in this part of the file; confirm.
    native int requestChatCompletion(String params);

    // Native, package-private. Starts a streaming chat completion from the serialized request
    // string and returns the task id that streamChatCompletion then passes to
    // receiveChatCompletionChunk (and to cancelCompletion on an early exit).
    native int requestChatCompletionStream(String params);

    // Native, package-private. Returns the next chunk envelope string for the given task id;
    // streamChatCompletion feeds each returned envelope to chatStreamParser together with the
    // caller's sink and repeats until the parser reports the stream as stopped.
    native String receiveChatCompletionChunk(int taskId);
}