// Source: java-llama.cpp/src/main/java/net/ladenthin/llama/LlamaIterator.java at commit 6886225adc11861ffcb5680ec8496382463275a6 (bernardladenthin/java-llama.cpp on GitHub)
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
// SPDX-FileCopyrightText: 2023-2025 Konstantin Herud
//
// SPDX-License-Identifier: MIT

package net.ladenthin.llama;

import net.ladenthin.llama.json.CompletionResponseParser;
import java.util.Iterator;
import java.util.NoSuchElementException;

/**
 * Streaming iterator over the outputs of a single inference task. It is used by
 * {@link LlamaModel#generate(InferenceParameters)} and
 * {@link LlamaModel#generateChat(InferenceParameters)}. In addition to implementing
 * {@link Iterator}, it allows cancelling ongoing inference (see {@link #cancel()}).
 *
 * <p>{@link LlamaIterator} implements {@link AutoCloseable}. When used via {@link LlamaIterable}
 * inside a try-with-resources block, {@link #close()} is called automatically on early exit
 * (e.g. {@code break}), preventing the native task slot from leaking.
 *
 * <p>The iterator is finished once an output flagged as {@code stop} has been returned by
 * {@link #next()} or {@link #cancel()} has completed; from then on {@link #hasNext()} returns
 * {@code false} and neither {@link #cancel()} nor {@link #close()} contacts the model again.
 */
public final class LlamaIterator implements Iterator<LlamaOutput>, AutoCloseable {

    private final LlamaModel model;
    private final int taskId;
    private final CompletionResponseParser completionParser = new CompletionResponseParser();

    /**
     * {@code true} while further outputs may be requested. It only ever moves from
     * {@code true} to {@code false}.
     */
    private boolean hasNext = true;

    /**
     * Starts a plain (non-chat) streaming completion.
     *
     * @param model the model the request is submitted to
     * @param parameters the inference parameters; streaming is switched on for this instance
     */
    LlamaIterator(LlamaModel model, InferenceParameters parameters) {
        this(model, parameters, false);
    }

    /**
     * Starts a streaming completion and remembers the task id handed back by the model.
     *
     * <p>Note that {@code parameters} is modified: {@code setStream(true)} is invoked on it
     * before it is serialized with {@code toString()}.
     *
     * @param model the model the request is submitted to
     * @param parameters the inference parameters; streaming is switched on for this instance
     * @param chat {@code true} to submit via {@code requestChatCompletion}, {@code false} to
     *     submit via {@code requestCompletion}
     */
    LlamaIterator(LlamaModel model, InferenceParameters parameters, boolean chat) {
        this.model = model;
        parameters.setStream(true);
        final String request = parameters.toString();
        taskId = chat
                ? model.requestChatCompletion(request)
                : model.requestCompletion(request);
    }

    /**
     * Reports whether another output may be requested.
     *
     * @return {@code false} once a stop output has been returned or the iterator was cancelled,
     *     {@code true} otherwise
     */
    @Override
    public boolean hasNext() {
        return hasNext;
    }

    /**
     * Receives and parses the next output of the task. When the output is flagged as
     * {@code stop}, the iterator is marked finished and the task is released on the model.
     *
     * @return the parsed output
     * @throws NoSuchElementException if the iterator is already finished or cancelled
     */
    @Override
    public LlamaOutput next() {
        if (!hasNext) {
            throw new NoSuchElementException();
        }
        final String json = model.receiveCompletionJson(taskId);
        final LlamaOutput output = completionParser.parse(json);
        if (output.stop) {
            // Only ever transition to "finished": writing "!output.stop" back could flip the
            // flag to true again after a cancel() that took effect in the meantime.
            hasNext = false;
            model.releaseTask(taskId);
        }
        return output;
    }

    /**
     * Cancel the ongoing generation process.
     *
     * <p>Does nothing if the iterator is already finished (a stop output was returned) or was
     * cancelled before, so no cancel request is issued for a task that is no longer active.
     */
    public void cancel() {
        if (!hasNext) {
            return;
        }
        model.cancelCompletion(taskId);
        hasNext = false;
    }

    /**
     * Cancels any in-progress generation if the iterator has not yet reached a stop token.
     * Safe to call multiple times — subsequent calls are no-ops.
     *
     * <p>Prefer using the enclosing {@link LlamaIterable} in a try-with-resources block rather
     * than calling this directly.
     */
    @Override
    public void close() {
        // cancel() itself is a no-op once the iterator is finished or cancelled.
        cancel();
    }
}