java-llama.cpp/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java at cf635fdfdd39f2ecff9bf9b9dd767d001c8f0ff2 · vaiju1981/java-llama.cpp · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
// SPDX-FileCopyrightText: 2026 Bernard Ladenthin <bernard.ladenthin@gmail.com>
//
// SPDX-License-Identifier: MIT

package net.ladenthin.llama.server;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import java.io.IOException;
import org.jspecify.annotations.Nullable;

/**
 * Pure formatting helpers for the OpenAI HTTP surface: Server-Sent-Events framing, the {@code [DONE]}
 * terminator, heartbeat comments, the {@code GET /v1/models} body, the OpenAI error envelope,
 * {@code text_completion} streaming chunks and the llama.cpp-native {@code GET /props} body.
 *
 * <p>Stateless and free of JNI / model dependencies, so each helper is unit-testable with literals.
 */
final class OpenAiSseFormatter {

    /** Shared Jackson mapper; thread-safe and reused. */
    static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    /** Prefix of an SSE data field line, including the single optional space after the colon. */
    private static final String DATA_PREFIX = "data: ";

    private OpenAiSseFormatter() {}

    /**
     * Frame a chunk's JSON as one SSE {@code data:} event.
     *
     * <p>A payload without line breaks is emitted as a single {@code data:} line. If the payload
     * contains {@code \n}, {@code \r} or {@code \r\n} (for example pretty-printed JSON), every payload
     * line is emitted as its own {@code data:} field. In the SSE wire format a raw line break ends the
     * current field and a blank line ends the event, so writing such a payload verbatim would truncate
     * or split the event; an SSE client rejoins consecutive {@code data:} fields with {@code \n}.
     *
     * @param json the chunk JSON to send
     * @return the SSE event text, terminated by a blank line
     */
    static String sseData(String json) {
        // Hot path: payloads without line breaks need no splitting.
        if (json.indexOf('\n') < 0 && json.indexOf('\r') < 0) {
            return DATA_PREFIX + json + "\n\n";
        }
        return sseDataMultiLine(json);
    }

    /** Emit one {@code data:} field per payload line, then the blank line that ends the event. */
    private static String sseDataMultiLine(String payload) {
        int length = payload.length();
        StringBuilder event = new StringBuilder(length + 32);
        int lineStart = 0;
        for (int i = 0; i < length; i++) {
            char c = payload.charAt(i);
            if (c != '\n' && c != '\r') {
                continue;
            }
            event.append(DATA_PREFIX).append(payload, lineStart, i).append('\n');
            // Treat CRLF as a single line break, matching how SSE parsers split lines.
            if (c == '\r' && i + 1 < length && payload.charAt(i + 1) == '\n') {
                i++;
            }
            lineStart = i + 1;
        }
        event.append(DATA_PREFIX).append(payload, lineStart, length).append("\n\n");
        return event.toString();
    }

    /**
     * The terminating SSE event that marks the end of an OpenAI stream.
     *
     * @return {@code "data: [DONE]\n\n"}
     */
    static String sseDone() {
        return "data: [DONE]\n\n";
    }

    /**
     * An SSE comment line used as a keep-alive heartbeat. OpenAI clients ignore comment lines, but the
     * bytes reset the client's stream-inactivity timer during long prompt prefill.
     *
     * @return {@code ": ping\n\n"}
     */
    static String heartbeat() {
        return ": ping\n\n";
    }

    /**
     * Build an OpenAI error envelope: {@code {"error":{"message":…,"type":…,"code":…}}}.
     *
     * @param message human-readable error message
     * @param type OpenAI error type (e.g. {@code "invalid_request_error"}, {@code "server_error"})
     * @param code optional machine-readable code; {@code null} renders as JSON {@code null}
     * @return the error envelope serialized as JSON
     */
    static String errorJson(String message, String type, @Nullable String code) {
        ObjectNode error = OBJECT_MAPPER.createObjectNode();
        error.put("message", message);
        error.put("type", type);
        // The "code" key is always present; an absent code is written as an explicit JSON null.
        if (code != null) {
            error.put("code", code);
        } else {
            error.putNull("code");
        }
        ObjectNode root = OBJECT_MAPPER.createObjectNode();
        root.set("error", error);
        return root.toString();
    }

    /**
     * Guarantee a streamed chunk's usage object carries {@code usage.prompt_tokens_details.cached_tokens}.
     *
     * <p>When {@code stream_options.include_usage} is set, the OpenAI streaming protocol emits a trailing
     * usage chunk. The VS&nbsp;Code Copilot custom endpoint throws
     * {@code Cannot read properties of undefined (reading 'cached_tokens')} (microsoft/vscode #273482) if
     * {@code usage.prompt_tokens_details.cached_tokens} is missing, and upstream llama.cpp does not always
     * populate it. This fills a default of {@code 0} when absent. Token-delta chunks (which carry no
     * non-null usage object) are returned unchanged and unparsed, so the streaming hot path is untouched.
     *
     * <p>The input is returned verbatim when it has no {@code usage} object, when
     * {@code cached_tokens} is already present, or when it cannot be parsed; only a chunk that actually
     * needs the default is re-serialized.
     *
     * @param chunkJson one {@code chat.completion.chunk} serialized as JSON
     * @return the chunk JSON with {@code cached_tokens} guaranteed present inside any non-null usage object
     */
    static String ensureUsageCachedTokens(String chunkJson) {
        // Fast path: only the trailing usage chunk carries a non-null usage object — skip the rest unparsed.
        if (!chunkJson.contains("\"usage\"") || chunkJson.contains("\"usage\":null")) {
            return chunkJson;
        }
        try {
            JsonNode root = OBJECT_MAPPER.readTree(chunkJson);
            // The substring check above is only a heuristic; confirm "usage" really is a top-level object.
            if (!root.isObject() || !root.path("usage").isObject()) {
                return chunkJson;
            }
            ObjectNode usage = (ObjectNode) root.get("usage");
            JsonNode details = usage.path("prompt_tokens_details");
            if (details.isObject()) {
                if (details.has("cached_tokens")) {
                    return chunkJson; // already correct — emit verbatim
                }
                ((ObjectNode) details).put("cached_tokens", 0);
            } else {
                // Missing, null or non-object details: replace with a fresh object holding the default.
                usage.putObject("prompt_tokens_details").put("cached_tokens", 0);
            }
            return root.toString();
        } catch (IOException e) {
            // Never break a live stream over a formatting nicety.
            return chunkJson;
        }
    }

    /**
     * Build the {@code GET /v1/models} body advertising a single model.
     *
     * @param modelId the model id to advertise
     * @return an OpenAI model-list object serialized as JSON
     */
    static String modelsJson(String modelId) {
        ObjectNode model = OBJECT_MAPPER.createObjectNode();
        model.put("id", modelId);
        model.put("object", "model");
        model.put("owned_by", "llama.cpp");
        ArrayNode data = OBJECT_MAPPER.createArrayNode();
        data.add(model);
        ObjectNode root = OBJECT_MAPPER.createObjectNode();
        root.put("object", "list");
        root.set("data", data);
        return root.toString();
    }

    /**
     * Build one OpenAI {@code text_completion} streaming chunk for {@code POST /v1/completions}.
     *
     * <p>The chunk always carries exactly one choice with {@code index} 0 and {@code logprobs} set to
     * JSON {@code null}.
     *
     * @param id the completion id, stable across the whole stream
     * @param created the creation timestamp in epoch seconds
     * @param model the served model id
     * @param text the incremental token text carried by this chunk
     * @param finishReason the finish reason on the final chunk, or {@code null} for intermediate chunks
     * @return the chunk serialized as JSON
     */
    static String completionChunk(String id, long created, String model, String text, @Nullable String finishReason) {
        ObjectNode choice = OBJECT_MAPPER.createObjectNode();
        choice.put("text", text);
        choice.put("index", 0);
        choice.putNull("logprobs");
        // "finish_reason" is always present; intermediate chunks carry an explicit JSON null.
        if (finishReason == null) {
            choice.putNull("finish_reason");
        } else {
            choice.put("finish_reason", finishReason);
        }
        ArrayNode choices = OBJECT_MAPPER.createArrayNode();
        choices.add(choice);
        ObjectNode root = OBJECT_MAPPER.createObjectNode();
        root.put("id", id);
        root.put("object", "text_completion");
        root.put("created", created);
        root.put("model", model);
        root.set("choices", choices);
        return root.toString();
    }

    /**
     * Build the llama.cpp-native {@code GET /props} body. Autocomplete clients (e.g. llama.vscode) read
     * {@code default_generation_settings.n_ctx} from here to size their context window, and newer clients
     * read the {@code modalities} block to gate image input.
     *
     * <p>{@code total_slots} is fixed at 1, {@code chat_template} is an empty string and
     * {@code modalities.audio} is always {@code false}.
     *
     * @param modelId the served model id
     * @param nCtx the advertised context length
     * @param vision whether image input is supported
     * @return the props object serialized as JSON
     */
    static String propsJson(String modelId, int nCtx, boolean vision) {
        ObjectNode root = OBJECT_MAPPER.createObjectNode();
        ObjectNode defaults = root.putObject("default_generation_settings");
        defaults.put("n_ctx", nCtx);
        defaults.put("model", modelId);
        root.put("total_slots", 1);
        root.put("model_alias", modelId);
        root.put("chat_template", "");
        ObjectNode modalities = root.putObject("modalities");
        modalities.put("vision", vision);
        modalities.put("audio", false);
        return root.toString();
    }
}