// Source: java-llama.cpp/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java at c91d6f245a3ec323f5b06b7d8b6a0f29a98811e0 (bernardladenthin/java-llama.cpp)
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
// SPDX-FileCopyrightText: 2023-2025 Konstantin Herud
//
// SPDX-License-Identifier: MIT

package net.ladenthin.llama.parameters;

import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.EqualsAndHashCode;
import net.ladenthin.llama.args.ContinuationMode;
import net.ladenthin.llama.args.MiroStat;
import net.ladenthin.llama.args.ReasoningFormat;
import net.ladenthin.llama.args.Sampler;
import net.ladenthin.llama.value.ChatMessage;
import net.ladenthin.llama.value.Pair;
import org.jspecify.annotations.Nullable;

/**
 * Immutable typed parameters for {@link net.ladenthin.llama.LlamaModel} inference calls
 * ({@link net.ladenthin.llama.LlamaModel#generate(InferenceParameters)},
 * {@link net.ladenthin.llama.LlamaModel#complete(InferenceParameters)}, etc.), populated through a
 * functional {@code withX(...)} API.
 *
 * <h2>Design</h2>
 *
 * <p>All instances are immutable: the inherited {@code parameters} map is
 * {@link java.util.Collections#unmodifiableMap(Map) unmodifiable} and every
 * {@code withX} call routes through the parent's protected helpers to allocate a
 * new {@code InferenceParameters} with one entry inserted or replaced. The
 * original instance is never touched.
 *
 * <h2>Construction patterns</h2>
 *
 * <pre>{@code
 * InferenceParameters params = InferenceParameters.of("two plus two?")
 *         .withNPredict(8)
 *         .withSeed(1)
 *         .withTemperature(0.2f);
 * }</pre>
 *
 * <p>The legacy {@code new InferenceParameters(prompt)} constructor remains
 * available and is exactly equivalent to {@link #of(String)}.
 *
 * <p>{@code equals}/{@code hashCode} are generated by Lombok with {@code callSuper=true}
 * so the parent {@link JsonParameters} parameters map participates in equality.
 * {@code toString} is inherited from {@link JsonParameters} and emits the accumulated
 * parameters as a JSON object string consumed by the native server.
 */
@SuppressWarnings("unused")
@EqualsAndHashCode(callSuper = true)
public final class InferenceParameters extends JsonParameters {

    private static final String PARAM_PROMPT = "prompt";
    private static final String PARAM_INPUT_PREFIX = "input_prefix";
    private static final String PARAM_INPUT_SUFFIX = "input_suffix";
    private static final String PARAM_CACHE_PROMPT = "cache_prompt";
    private static final String PARAM_CACHE_REUSE = "n_cache_reuse";
    private static final String PARAM_SLOT_ID = "id_slot";
    private static final String PARAM_STREAM_OPTIONS = "stream_options";
    private static final String PARAM_RESPONSE_FORMAT = "response_format";
    private static final String PARAM_N_PREDICT = "n_predict";
    private static final String PARAM_TOP_K = "top_k";
    private static final String PARAM_TOP_P = "top_p";
    private static final String PARAM_MIN_P = "min_p";
    private static final String PARAM_TFS_Z = "tfs_z";
    private static final String PARAM_TYPICAL_P = "typical_p";
    private static final String PARAM_TEMPERATURE = "temperature";
    private static final String PARAM_DYNATEMP_RANGE = "dynatemp_range";
    private static final String PARAM_DYNATEMP_EXPONENT = "dynatemp_exponent";
    private static final String PARAM_REPEAT_LAST_N = "repeat_last_n";
    private static final String PARAM_REPEAT_PENALTY = "repeat_penalty";
    private static final String PARAM_FREQUENCY_PENALTY = "frequency_penalty";
    private static final String PARAM_PRESENCE_PENALTY = "presence_penalty";
    private static final String PARAM_MIROSTAT = "mirostat";
    private static final String PARAM_MIROSTAT_TAU = "mirostat_tau";
    private static final String PARAM_MIROSTAT_ETA = "mirostat_eta";
    private static final String PARAM_PENALIZE_NL = "penalize_nl";
    private static final String PARAM_N_KEEP = "n_keep";
    private static final String PARAM_SEED = "seed";
    private static final String PARAM_N_PROBS = "n_probs";
    private static final String PARAM_MIN_KEEP = "min_keep";
    private static final String PARAM_GRAMMAR = "grammar";
    private static final String PARAM_JSON_SCHEMA = "json_schema";
    private static final String PARAM_PENALTY_PROMPT = "penalty_prompt";
    private static final String PARAM_IGNORE_EOS = "ignore_eos";
    private static final String PARAM_LOGIT_BIAS = "logit_bias";
    private static final String PARAM_STOP = "stop";
    private static final String PARAM_SAMPLERS = "samplers";
    private static final String PARAM_STREAM = "stream";
    private static final String PARAM_USE_CHAT_TEMPLATE = "use_chat_template";
    private static final String PARAM_CHAT_TEMPLATE = "chat_template";
    private static final String PARAM_USE_JINJA = "use_jinja";
    private static final String PARAM_CHAT_TEMPLATE_KWARGS = "chat_template_kwargs";
    private static final String PARAM_MESSAGES = "messages";
    private static final String PARAM_TOP_N_SIGMA = "top_n_sigma";
    private static final String PARAM_REASONING_FORMAT = "reasoning_format";
    private static final String PARAM_REASONING_BUDGET_TOKENS = "reasoning_budget_tokens";
    private static final String PARAM_CONTINUE_FINAL_MESSAGE = "continue_final_message";
    private static final String PARAM_TOOLS = "tools";
    private static final String PARAM_TOOL_CHOICE = "tool_choice";
    private static final String PARAM_PARALLEL_TOOL_CALLS = "parallel_tool_calls";

    private static final InferenceParameters EMPTY = new InferenceParameters();

    /** Private no-arg: starts from an empty parameter map. */
    private InferenceParameters() {
        super();
    }

    /** Private all-args: wraps a pre-built unmodifiable map verbatim. */
    private InferenceParameters(Map<String, String> parameters) {
        super(parameters);
    }

    /**
     * Creates inference parameters with the given prompt. Equivalent to
     * {@link #of(String)} and kept for API compatibility.
     *
     * @param prompt the prompt to start generation with
     */
    public InferenceParameters(String prompt) {
        super(singletonPrompt(prompt));
    }

    private static Map<String, String> singletonPrompt(String prompt) {
        // Mirror the JSON-encoding path used by withOptionalJson so toString() output
        // is byte-identical between `new InferenceParameters(p)` and `of(p)`.
        Map<String, String> m = new HashMap<>();
        m.put(PARAM_PROMPT, new net.ladenthin.llama.parameters.ParameterJsonSerializer().toJsonString(prompt));
        return Collections.unmodifiableMap(m);
    }

    /**
     * Returns the canonical empty inference-parameter set (no prompt, no overrides).
     * Use this as the starting point for chained {@code withX} derivations.
     *
     * @return the cached empty instance
     */
    public static InferenceParameters empty() {
        return EMPTY;
    }

    /**
     * Returns inference parameters seeded with the given prompt. Equivalent to
     * {@code empty().withPrompt(prompt)} but produces the same JSON encoding as the
     * legacy public constructor.
     *
     * @param prompt the prompt to start generation with
     * @return a new instance carrying only the prompt entry
     */
    public static InferenceParameters of(String prompt) {
        return new InferenceParameters(prompt);
    }

    @Override
    @SuppressWarnings({"unchecked", "TypeParameterUnusedInFormals"})
    protected <T extends JsonParameters> T withParameters(Map<String, String> newParameters) {
        return (T) new InferenceParameters(newParameters);
    }

    // -----------------------------------------------------------------------
    // Wither setters — one per parameter, each returns a new instance.
    // -----------------------------------------------------------------------

    /**
     * Returns a new request with the prompt replaced (default: empty).
     *
     * @param prompt the prompt to start generation with; {@code null} clears any prior prompt
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withPrompt(@Nullable String prompt) {
        return withOptionalJson(PARAM_PROMPT, prompt);
    }

    /**
     * Returns a new request with the infilling prefix replaced (default: empty).
     *
     * @param inputPrefix the prefix for infilling; {@code null} clears
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withInputPrefix(@Nullable String inputPrefix) {
        return withOptionalJson(PARAM_INPUT_PREFIX, inputPrefix);
    }

    /**
     * Returns a new request with the infilling suffix replaced (default: empty).
     *
     * @param inputSuffix the suffix for infilling; {@code null} clears
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withInputSuffix(@Nullable String inputSuffix) {
        return withOptionalJson(PARAM_INPUT_SUFFIX, inputSuffix);
    }

    /**
     * Returns a new request with the prompt-cache flag replaced.
     *
     * @param cachePrompt whether to cache the prompt
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withCachePrompt(boolean cachePrompt) {
        return withScalar(PARAM_CACHE_PROMPT, cachePrompt);
    }

    /**
     * Returns a new request with the minimum reusable KV-cache chunk size replaced.
     * A value of {@code 0} disables non-prefix chunk reuse. Ordinary common-prefix
     * reuse remains controlled by {@link #withCachePrompt(boolean)}.
     *
     * @param cacheReuse minimum reusable chunk size, or {@code 0} to disable
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withCacheReuse(int cacheReuse) {
        if (cacheReuse < 0) {
            throw new IllegalArgumentException("cacheReuse must be non-negative but was " + cacheReuse);
        }
        return withScalar(PARAM_CACHE_REUSE, cacheReuse);
    }

    /**
     * Returns a new request pinned to a llama.cpp server slot. Pinning is useful
     * for deterministic multi-turn KV reuse and for matching inference with
     * {@code saveSlot}/{@code restoreSlot} operations.
     *
     * @param slotId non-negative slot identifier
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withSlotId(int slotId) {
        if (slotId < 0) {
            throw new IllegalArgumentException("slotId must be non-negative but was " + slotId);
        }
        return withScalar(PARAM_SLOT_ID, slotId);
    }

    /**
     * Returns a new request with the number of tokens to predict replaced
     * (default: -1, -1 = infinity, -2 = until context filled).
     *
     * @param nPredict tokens to predict
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withNPredict(int nPredict) {
        return withScalar(PARAM_N_PREDICT, nPredict);
    }

    /**
     * Returns a new request with the top-k sampling value replaced (default: 40, 0 = disabled).
     *
     * @param topK the top-k value (0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withTopK(int topK) {
        return withScalar(PARAM_TOP_K, topK);
    }

    /**
     * Returns a new request with top-p sampling replaced (default: 0.9, 1.0 = disabled).
     *
     * @param topP the top-p value (1.0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withTopP(float topP) {
        return withScalar(PARAM_TOP_P, topP);
    }

    /**
     * Returns a new request with min-p sampling replaced (default: 0.1, 0.0 = disabled).
     *
     * @param minP the min-p value (0.0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withMinP(float minP) {
        return withScalar(PARAM_MIN_P, minP);
    }

    /**
     * Returns a new request with tail-free sampling z replaced (default: 1.0, 1.0 = disabled).
     *
     * @param tfsZ tail-free sampling parameter z (1.0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withTfsZ(float tfsZ) {
        return withScalar(PARAM_TFS_Z, tfsZ);
    }

    /**
     * Returns a new request with locally-typical sampling p replaced (default: 1.0, 1.0 = disabled).
     *
     * @param typicalP locally typical sampling parameter p (1.0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withTypicalP(float typicalP) {
        return withScalar(PARAM_TYPICAL_P, typicalP);
    }

    /**
     * Returns a new request with the temperature replaced (default: 0.8).
     *
     * @param temperature the sampling temperature
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withTemperature(float temperature) {
        return withScalar(PARAM_TEMPERATURE, temperature);
    }

    /**
     * Returns a new request with the dynamic-temperature range replaced (default: 0.0, 0.0 = disabled).
     *
     * @param dynatempRange the dynamic temperature range (0.0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withDynamicTemperatureRange(float dynatempRange) {
        return withScalar(PARAM_DYNATEMP_RANGE, dynatempRange);
    }

    /**
     * Returns a new request with the dynamic-temperature exponent replaced (default: 1.0).
     *
     * @param dynatempExponent the dynamic temperature exponent
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withDynamicTemperatureExponent(float dynatempExponent) {
        return withScalar(PARAM_DYNATEMP_EXPONENT, dynatempExponent);
    }

    /**
     * Returns a new request with the repetition-penalty window replaced (default: 64, 0 = disabled, -1 = ctx_size).
     *
     * @param repeatLastN window size (0 = disabled, -1 = ctx_size)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withRepeatLastN(int repeatLastN) {
        return withScalar(PARAM_REPEAT_LAST_N, repeatLastN);
    }

    /**
     * Returns a new request with the repetition penalty replaced (default: 1.0, 1.0 = disabled).
     *
     * @param repeatPenalty repeat penalty (1.0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withRepeatPenalty(float repeatPenalty) {
        return withScalar(PARAM_REPEAT_PENALTY, repeatPenalty);
    }

    /**
     * Returns a new request with the frequency penalty replaced (default: 0.0, 0.0 = disabled).
     *
     * @param frequencyPenalty alpha frequency penalty (0.0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withFrequencyPenalty(float frequencyPenalty) {
        return withScalar(PARAM_FREQUENCY_PENALTY, frequencyPenalty);
    }

    /**
     * Returns a new request with the presence penalty replaced (default: 0.0, 0.0 = disabled).
     *
     * @param presencePenalty alpha presence penalty (0.0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withPresencePenalty(float presencePenalty) {
        return withScalar(PARAM_PRESENCE_PENALTY, presencePenalty);
    }

    /**
     * Returns a new request with the MiroStat strategy replaced.
     *
     * @param mirostat the MiroStat sampling strategy
     * @return a new instance; this instance is unchanged
     */
    // .ordinal() is intentional here: the llama.cpp server expects the integer
    // ordinal of the MiroStat enum (0 = OFF, 1 = V1, 2 = V2) on the wire. The
    // declared order of MiroStat.values() matches the upstream contract.
    @SuppressWarnings("EnumOrdinal")
    public InferenceParameters withMiroStat(MiroStat mirostat) {
        return withScalar(PARAM_MIROSTAT, mirostat.ordinal());
    }

    /**
     * Returns a new request with the MiroStat tau replaced (default: 5.0).
     *
     * @param mirostatTau the MiroStat target entropy parameter tau
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withMiroStatTau(float mirostatTau) {
        return withScalar(PARAM_MIROSTAT_TAU, mirostatTau);
    }

    /**
     * Returns a new request with the MiroStat eta replaced (default: 0.1).
     *
     * @param mirostatEta the MiroStat learning rate parameter eta
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withMiroStatEta(float mirostatEta) {
        return withScalar(PARAM_MIROSTAT_ETA, mirostatEta);
    }

    /**
     * Returns a new request with the newline-penalty flag replaced.
     *
     * @param penalizeNl whether to penalize newline tokens
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withPenalizeNl(boolean penalizeNl) {
        return withScalar(PARAM_PENALIZE_NL, penalizeNl);
    }

    /**
     * Returns a new request with the {@code n_keep} value replaced (default: 0, -1 = all).
     *
     * @param nKeep tokens to keep from the initial prompt (-1 = all)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withNKeep(int nKeep) {
        return withScalar(PARAM_N_KEEP, nKeep);
    }

    /**
     * Returns a new request with the RNG seed replaced (default: -1, use random seed for &lt; 0).
     *
     * @param seed the RNG seed
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withSeed(int seed) {
        return withScalar(PARAM_SEED, seed);
    }

    /**
     * Returns a new request with the {@code n_probs} value replaced.
     *
     * @param nProbs number of top-token probabilities to output
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withNProbs(int nProbs) {
        return withScalar(PARAM_N_PROBS, nProbs);
    }

    /**
     * Returns a new request with the {@code min_keep} value replaced (0 = disabled).
     *
     * @param minKeep minimum number of tokens samplers should return (0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withMinKeep(int minKeep) {
        return withScalar(PARAM_MIN_KEEP, minKeep);
    }

    /**
     * Returns a new request with a BNF-like grammar constraint replaced.
     *
     * @param grammar BNF-like grammar string; {@code null} clears
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withGrammar(@Nullable String grammar) {
        return withOptionalJson(PARAM_GRAMMAR, grammar);
    }

    /**
     * Returns a new request with a per-request JSON-schema constraint replaced. The
     * native server converts the schema to a GBNF grammar internally; the schema string
     * is passed verbatim and must be valid JSON Schema.
     *
     * @param schema JSON Schema as a JSON-encoded string
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withJsonSchema(String schema) {
        return withRaw(PARAM_JSON_SCHEMA, schema);
    }

    /**
     * Returns a new request with the OpenAI streaming {@code stream_options} object replaced. Passing
     * {@code {"include_usage":true}} makes the native server emit a trailing {@code usage} chunk after
     * the stream completes (with an empty {@code choices} array), which OpenAI clients — notably the
     * VS&nbsp;Code Copilot custom endpoint — rely on for token accounting.
     *
     * @param streamOptionsJson the {@code stream_options} object as a JSON-encoded string
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withStreamOptions(String streamOptionsJson) {
        return withRaw(PARAM_STREAM_OPTIONS, streamOptionsJson);
    }

    /**
     * Returns a new request with the OpenAI {@code response_format} object replaced. The native server
     * turns {@code {"type":"json_object"}} or {@code {"type":"json_schema","json_schema":{...}}} into a
     * GBNF grammar constraint internally, so the model is forced to emit conforming JSON — the OpenAI
     * "structured outputs" feature that strict agent clients use.
     *
     * @param responseFormatJson the {@code response_format} object as a JSON-encoded string
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withResponseFormat(String responseFormatJson) {
        return withRaw(PARAM_RESPONSE_FORMAT, responseFormatJson);
    }

    /**
     * Returns a new request with the repetition-penalty prompt-portion override replaced.
     *
     * @param penaltyPrompt the string portion of the prompt to penalize; {@code null} clears
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withPenaltyPrompt(@Nullable String penaltyPrompt) {
        return withOptionalJson(PARAM_PENALTY_PROMPT, penaltyPrompt);
    }

    /**
     * Returns a new request with the repetition-penalty prompt-portion override replaced
     * (token-id form). Empty input is a no-op (returns {@code this}).
     *
     * @param tokens token ids of the prompt portion to penalize
     * @return a new instance with the array set, or {@code this} if {@code tokens} is empty
     */
    public InferenceParameters withPenaltyPrompt(int... tokens) {
        if (tokens.length == 0) {
            return this;
        }
        return withRaw(PARAM_PENALTY_PROMPT, serializer.buildIntArray(tokens).toString());
    }

    /**
     * Returns a new request with the EOS-ignore flag replaced.
     *
     * @param ignoreEos whether to ignore the end-of-stream token
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withIgnoreEos(boolean ignoreEos) {
        return withScalar(PARAM_IGNORE_EOS, ignoreEos);
    }

    /**
     * Returns a new request with the logit bias (token-id form) replaced. Empty input is a
     * no-op (returns {@code this}). This entry overrides any prior logit-bias setter.
     *
     * @param logitBias token-id to bias-value
     * @return a new instance with the bias set, or {@code this} if {@code logitBias} is empty
     */
    public InferenceParameters withTokenIdBias(Map<Integer, Float> logitBias) {
        if (logitBias.isEmpty()) {
            return this;
        }
        return withRaw(
                PARAM_LOGIT_BIAS, serializer.buildTokenIdBiasArray(logitBias).toString());
    }

    /**
     * Returns a new request with the disabled token-id set replaced (logit-bias form with
     * negative infinity). Empty input is a no-op (returns {@code this}). Overrides prior
     * logit-bias setters.
     *
     * @param tokenIds token ids to disable
     * @return a new instance with the bias set, or {@code this} if {@code tokenIds} is empty
     */
    public InferenceParameters withDisabledTokenIds(Collection<Integer> tokenIds) {
        if (tokenIds.isEmpty()) {
            return this;
        }
        return withRaw(
                PARAM_LOGIT_BIAS, serializer.buildDisableTokenIdArray(tokenIds).toString());
    }

    /**
     * Returns a new request with the logit bias (token-string form) replaced. Empty input
     * is a no-op (returns {@code this}). Overrides prior logit-bias setters.
     *
     * @param logitBias token string to bias value
     * @return a new instance with the bias set, or {@code this} if {@code logitBias} is empty
     */
    public InferenceParameters withTokenBias(Map<String, Float> logitBias) {
        if (logitBias.isEmpty()) {
            return this;
        }
        return withRaw(
                PARAM_LOGIT_BIAS,
                serializer.buildTokenStringBiasArray(logitBias).toString());
    }

    /**
     * Returns a new request with the disabled token-string set replaced (logit-bias form
     * with negative infinity). Empty input is a no-op (returns {@code this}). Overrides
     * prior logit-bias setters.
     *
     * @param tokens token strings to disable
     * @return a new instance with the bias set, or {@code this} if {@code tokens} is empty
     */
    public InferenceParameters withDisabledTokens(Collection<String> tokens) {
        if (tokens.isEmpty()) {
            return this;
        }
        return withRaw(
                PARAM_LOGIT_BIAS,
                serializer.buildDisableTokenStringArray(tokens).toString());
    }

    /**
     * Returns a new request with the stop-strings array replaced. Empty input is a no-op.
     *
     * @param stopStrings strings whose presence stops generation
     * @return a new instance with the stop-array set, or {@code this} if {@code stopStrings} is empty
     */
    public InferenceParameters withStopStrings(String... stopStrings) {
        if (stopStrings.length == 0) {
            return this;
        }
        return withRaw(PARAM_STOP, serializer.buildStopStrings(stopStrings).toString());
    }

    /**
     * Returns a new request with the sampler chain replaced. Empty input is a no-op.
     *
     * @param samplers samplers to use, in order
     * @return a new instance with the sampler array set, or {@code this} if {@code samplers} is empty
     */
    public InferenceParameters withSamplers(Sampler... samplers) {
        if (samplers.length == 0) {
            return this;
        }
        return withRaw(PARAM_SAMPLERS, serializer.buildSamplers(samplers).toString());
    }

    /**
     * Returns a new request with the chat-template flag replaced.
     *
     * @param useChatTemplate whether to apply a chat template
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withUseChatTemplate(boolean useChatTemplate) {
        return withScalar(PARAM_USE_JINJA, useChatTemplate);
    }

    /**
     * Returns a new request with the chat-template string replaced.
     *
     * @param chatTemplate the Jinja-style chat template to use; {@code null} clears
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withChatTemplate(@Nullable String chatTemplate) {
        return withOptionalJson(PARAM_CHAT_TEMPLATE, chatTemplate);
    }

    /**
     * Returns a new request with custom Jinja template kwargs replaced. Values must be
     * valid JSON.
     *
     * @param kwargs variable names to JSON-serialized values
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withChatTemplateKwargs(Map<String, String> kwargs) {
        return withRaw(
                PARAM_CHAT_TEMPLATE_KWARGS,
                serializer.buildRawValueObject(kwargs).toString());
    }

    /**
     * Returns a new request with chat messages replaced. Allows one optional system
     * message and one-or-more user/assistant message pairs.
     *
     * @param systemMessage optional system message ({@code null} or empty allowed)
     * @param messages user/assistant message pairs (role -&gt; content)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withMessages(@Nullable String systemMessage, List<Pair<String, String>> messages) {
        return withRaw(
                PARAM_MESSAGES,
                serializer.buildMessages(systemMessage, messages).toString());
    }

    /**
     * Returns a new request with chat messages replaced (multimodal-capable variant).
     * Messages with non-null {@link net.ladenthin.llama.value.ChatMessage#getParts()} are serialized as OAI
     * array-form content (text + image_url parts).
     *
     * @param messages ordered messages, including any {@code "system"} prelude
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withMessages(List<ChatMessage> messages) {
        return withRaw(PARAM_MESSAGES, serializer.buildMessages(messages).toString());
    }

    /**
     * Returns a new request with the {@code messages} array set from a pre-built JSON
     * string (e.g. {@link ChatRequest#buildMessagesJson()}).
     *
     * @param messagesJson the JSON array string
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withMessagesJson(String messagesJson) {
        return withRaw(PARAM_MESSAGES, messagesJson);
    }

    /**
     * Returns a new request with the OAI-style {@code tools} array set from a pre-built
     * JSON string (e.g. {@link ChatRequest#buildToolsJson()}).
     *
     * @param toolsJson the JSON array string
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withToolsJson(String toolsJson) {
        return withRaw(PARAM_TOOLS, toolsJson);
    }

    /**
     * Returns a new request with the OAI-style {@code tool_choice} hint replaced.
     *
     * @param toolChoice the hint string ({@code "auto"} / {@code "none"} / {@code "required"}); {@code null} clears
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withToolChoice(@Nullable String toolChoice) {
        return withOptionalJson(PARAM_TOOL_CHOICE, toolChoice);
    }

    /**
     * Returns a new request with the OpenAI-compatible {@code parallel_tool_calls} flag replaced.
     *
     * @param parallelToolCalls whether the model may emit more than one tool call in a turn
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withParallelToolCalls(boolean parallelToolCalls) {
        return withScalar(PARAM_PARALLEL_TOOL_CALLS, parallelToolCalls);
    }

    /**
     * Returns a new request with the top-n-sigma threshold replaced (default: -1.0, disabled).
     *
     * @param topNSigma sigma threshold (-1.0 = disabled)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withTopNSigma(float topNSigma) {
        return withScalar(PARAM_TOP_N_SIGMA, topNSigma);
    }

    /**
     * Returns a new request with the reasoning-format choice replaced.
     *
     * @param reasoningFormat the format used to handle thinking tokens
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withReasoningFormat(ReasoningFormat reasoningFormat) {
        return withRaw(PARAM_REASONING_FORMAT, toJsonString(reasoningFormat.getArgValue()));
    }

    /**
     * Returns a new request with the reasoning-token budget replaced. A value of {@code -1}
     * disables the budget.
     *
     * @param budgetTokens maximum reasoning tokens (-1 = unlimited)
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withReasoningBudgetTokens(int budgetTokens) {
        return withScalar(PARAM_REASONING_BUDGET_TOKENS, budgetTokens);
    }

    /**
     * Returns a new request with the boolean continue-final-message flag replaced.
     *
     * @param continueFinalMessage {@code true} to continue the last assistant message
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withContinueFinalMessage(boolean continueFinalMessage) {
        return withScalar(PARAM_CONTINUE_FINAL_MESSAGE, continueFinalMessage);
    }

    /**
     * Returns a new request with the channel-typed continue-final-message setting replaced.
     *
     * @param mode the channel to continue from
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withContinueFinalMessage(ContinuationMode mode) {
        return withRaw(PARAM_CONTINUE_FINAL_MESSAGE, toJsonString(mode.getValue()));
    }

    /**
     * Package-private: returns a new request with the {@code stream} flag replaced.
     * Used by {@link net.ladenthin.llama.LlamaModel} and {@link net.ladenthin.llama.LlamaIterator} to pin the streaming mode
     * for each request without mutating the caller's instance.
     *
     * @param stream whether to enable streaming
     * @return a new instance; this instance is unchanged
     */
    public InferenceParameters withStream(boolean stream) {
        return withScalar(PARAM_STREAM, stream);
    }
}