bernardladenthin
diff --git a/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java b/src/main/java/net/ladenthin/llama/parameters/InferenceParameters.java
Lines changed: 14 additions & 0 deletions
diff --git a/src/main/java/net/ladenthin/llama/server/LlamaModelBackend.java b/src/main/java/net/ladenthin/llama/server/LlamaModelBackend.java
Lines changed: 7 additions & 0 deletions
diff --git a/src/main/java/net/ladenthin/llama/server/OpenAiBackend.java b/src/main/java/net/ladenthin/llama/server/OpenAiBackend.java
Lines changed: 15 additions & 0 deletions
diff --git a/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java b/src/main/java/net/ladenthin/llama/server/OpenAiCompatServer.java
Lines changed: 77 additions & 7 deletions
diff --git a/src/main/java/net/ladenthin/llama/server/OpenAiRequestMapper.java b/src/main/java/net/ladenthin/llama/server/OpenAiRequestMapper.java
Lines changed: 13 additions & 1 deletion
diff --git a/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java b/src/main/java/net/ladenthin/llama/server/OpenAiServerConfig.java
Lines changed: 32 additions & 0 deletions
diff --git a/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java b/src/main/java/net/ladenthin/llama/server/OpenAiSseFormatter.java
Lines changed: 42 additions & 0 deletions
diff --git a/src/main/java/net/ladenthin/llama/server/package-info.java b/src/main/java/net/ladenthin/llama/server/package-info.java
Lines changed: 6 additions & 0 deletions
@@ -58,6 +58,7 @@ public final class InferenceParameters extends JsonParameters {
     private static final String PARAM_INPUT_PREFIX = "input_prefix";
     private static final String PARAM_INPUT_SUFFIX = "input_suffix";
     private static final String PARAM_CACHE_PROMPT = "cache_prompt";
+    private static final String PARAM_STREAM_OPTIONS = "stream_options";
     private static final String PARAM_N_PREDICT = "n_predict";
     private static final String PARAM_TOP_K = "top_k";
     private static final String PARAM_TOP_P = "top_p";
@@ -438,6 +439,19 @@ public InferenceParameters withJsonSchema(String schema) {
         return withRaw(PARAM_JSON_SCHEMA, schema);
     }
 
+    /**
+     * Returns a new request whose {@code stream_options} entry is the given JSON, handed to
+     * {@code withRaw} without validation by this method. {@code {"include_usage":true}} asks the native
+     * server for a trailing {@code usage} chunk (empty {@code choices} array) after the stream completes,
+     * which OpenAI clients — notably the VS&nbsp;Code Copilot custom endpoint — rely on for token accounting.
+     *
+     * @param streamOptionsJson the {@code stream_options} object as a JSON-encoded string
+     * @return a new instance; this instance is unchanged
+     */
+    public InferenceParameters withStreamOptions(String streamOptionsJson) {
+        return withRaw(PARAM_STREAM_OPTIONS, streamOptionsJson);
+    }
+
     /**
      * Returns a new request with the repetition-penalty prompt-portion override replaced.
      *
 
@@ -80,4 +80,11 @@ public String embeddings(JsonNode request) {
         // oaiCompat=true so the response uses the OpenAI {"object":"list","data":[{embedding}]} shape.
         return model.handleEmbeddings(request.toString(), true);
     }
+
+    @Override
+    public String infill(JsonNode request) {
+        // No field mapping here: the parsed tree is re-serialized and handed over as-is, because the
+        // native /infill handler parses input_prefix/input_suffix/... and applies the FIM tokens itself.
+        return model.handleInfill(request.toString());
+    }
 }
@@ -60,4 +60,19 @@ interface OpenAiBackend {
      * @throws IOException if generation fails in a way the caller should surface as a server error
      */
     String embeddings(JsonNode request) throws IOException;
+
+    /**
+     * Run a (non-streaming) fill-in-the-middle completion ({@code POST /infill}). The request body is
+     * forwarded verbatim to the native llama.cpp infill handler, which applies the model's FIM control
+     * tokens server-side from GGUF metadata — so callers send raw {@code input_prefix} /
+     * {@code input_suffix} (and optional {@code input_extra} / {@code prompt}). This is the endpoint
+     * that drives local ghost-text autocomplete clients (llama.vscode, llama.vim, Twinny, Tabby,
+     * Continue's {@code llama.cpp} provider).
+     *
+     * @param request the parsed llama.cpp {@code /infill} request (typically {@code input_prefix} +
+     *                {@code input_suffix})
+     * @return the infill response serialized as JSON (clients read the {@code "content"} field)
+     * @throws IOException if generation fails in a way the caller should surface as a server error
+     */
+    String infill(JsonNode request) throws IOException;
 }
@@ -7,7 +7,9 @@
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.sun.net.httpserver.Filter;
 import com.sun.net.httpserver.HttpExchange;
+import com.sun.net.httpserver.HttpHandler;
 import com.sun.net.httpserver.HttpServer;
 import java.io.IOException;
 import java.io.InputStream;
@@ -71,6 +73,12 @@ public final class OpenAiCompatServer implements AutoCloseable {
     /** The embeddings route. */
     public static final String PATH_EMBEDDINGS = "/v1/embeddings";
 
+    /**
+     * The fill-in-the-middle (autocomplete) route. Deliberately the llama.cpp-native bare path (no
+     * {@code /v1}) so ghost-text clients such as llama.vscode and Tabby reach it unchanged.
+     */
+    public static final String PATH_INFILL = "/infill";
+
     /** The model-list route. */
     public static final String PATH_MODELS = "/v1/models";
 
@@ -94,6 +102,7 @@ public final class OpenAiCompatServer implements AutoCloseable {
     private final OpenAiServerConfig config;
     private final OpenAiBackend backend;
     private final HttpServer http;
+    private final Filter corsFilter;
     private final ExecutorService requestExecutor;
     private final ScheduledExecutorService heartbeatExecutor;
 
@@ -122,12 +131,21 @@ public OpenAiCompatServer(LlamaModel model, OpenAiServerConfig config) throws IO
         this.requestExecutor = Executors.newCachedThreadPool(namedFactory("jllama-openai-http"));
         this.heartbeatExecutor = Executors.newScheduledThreadPool(1, namedFactory("jllama-openai-hb"));
         this.http = HttpServer.create(new InetSocketAddress(config.getHost(), config.getPort()), 0);
-        http.createContext("/", this::handleNotFound);
-        http.createContext(PATH_HEALTH, this::handleHealth);
-        http.createContext(PATH_MODELS, this::handleModels);
-        http.createContext(PATH_CHAT_COMPLETIONS, this::handleChatCompletions);
-        http.createContext(PATH_COMPLETIONS, this::handleCompletions);
-        http.createContext(PATH_EMBEDDINGS, this::handleEmbeddings);
+        this.corsFilter = buildCorsFilter(config.getCorsAllowOrigin());
+        register("/", this::handleNotFound);
+        register(PATH_HEALTH, this::handleHealth);
+        // Each API route is registered both with and without the /v1 prefix (clients disagree on which
+        // form to use; /infill is canonical bare, aliased as /v1/infill), so both hit the same handler.
+        register(PATH_MODELS, this::handleModels);
+        register("/models", this::handleModels);
+        register(PATH_CHAT_COMPLETIONS, this::handleChatCompletions);
+        register("/chat/completions", this::handleChatCompletions);
+        register(PATH_COMPLETIONS, this::handleCompletions);
+        register("/completions", this::handleCompletions);
+        register(PATH_EMBEDDINGS, this::handleEmbeddings);
+        register("/embeddings", this::handleEmbeddings);
+        register(PATH_INFILL, this::handleInfill);
+        register("/v1/infill", this::handleInfill);
         http.setExecutor(requestExecutor);
     }
 
@@ -159,6 +177,42 @@ public void close() {
         heartbeatExecutor.shutdownNow();
     }
 
+    /**
+     * Register {@code handler} for {@code path} and attach the shared CORS filter to the new context.
+     * Every route in the constructor goes through here (catch-all included), so the CORS/preflight
+     * wiring cannot be forgotten for an individual path.
+     */
+    private void register(String path, HttpHandler handler) {
+        http.createContext(path, handler).getFilters().add(corsFilter);
+    }
+
+    /**
+     * Create the filter shared by all routes. It sets {@code Access-Control-Allow-Origin} on each
+     * response and short-circuits {@code OPTIONS} preflights with a body-less {@code 204} listing the
+     * permitted methods/headers, so webview/browser clients that preflight {@code Authorization} pass.
+     */
+    private static Filter buildCorsFilter(String allowOrigin) {
+        return new Filter() {
+            @Override
+            public String description() {
+                return "CORS preflight + Access-Control-Allow-Origin";
+            }
+
+            @Override
+            public void doFilter(HttpExchange exchange, Chain chain) throws IOException {
+                exchange.getResponseHeaders().set("Access-Control-Allow-Origin", allowOrigin);
+                if (!"OPTIONS".equalsIgnoreCase(exchange.getRequestMethod())) {
+                    chain.doFilter(exchange);
+                    return;
+                }
+                exchange.getResponseHeaders().set("Access-Control-Allow-Methods", "GET, POST, OPTIONS");
+                exchange.getResponseHeaders().set("Access-Control-Allow-Headers", "Content-Type, Authorization");
+                exchange.getResponseHeaders().set("Access-Control-Max-Age", "86400");
+                exchange.sendResponseHeaders(204, -1);
+                exchange.close();
+            }
+        };
+    }
+
     // ----- handlers -----
 
     private void handleChatCompletions(HttpExchange exchange) throws IOException {
@@ -204,6 +258,17 @@ private void handleEmbeddings(HttpExchange exchange) throws IOException {
         }
     }
 
+    private void handleInfill(HttpExchange exchange) throws IOException {
+        try {
+            JsonNode request = requirePostJson(exchange);
+            if (request != null) { // null: nothing to run — presumably requirePostJson already replied; confirm
+                completeNonStreaming(exchange, request, backend::infill);
+            }
+        } finally {
+            exchange.close(); // runs on every path, including when no infill call was made
+        }
+    }
+
     /**
      * Run a non-streaming request through {@code producer} and write its JSON body, translating an
      * {@link IllegalArgumentException} to {@code 400} and any other failure to {@code 500}.
@@ -236,7 +301,12 @@ private void streamChat(HttpExchange exchange, JsonNode request) throws IOExcept
                 config.getHeartbeatMillis(),
                 TimeUnit.MILLISECONDS);
         try {
-            backend.stream(request, chunkJson -> writeStrict(os, writeLock, OpenAiSseFormatter.sseData(chunkJson)));
+            backend.stream(
+                    request,
+                    chunkJson -> writeStrict(
+                            os,
+                            writeLock,
+                            OpenAiSseFormatter.sseData(OpenAiSseFormatter.ensureUsageCachedTokens(chunkJson))));
             writeStrict(os, writeLock, OpenAiSseFormatter.sseDone());
         } catch (IllegalArgumentException e) {
             writeQuietly(
 
@@ -40,7 +40,12 @@ InferenceParameters toInferenceParameters(JsonNode request) {
             throw new IllegalArgumentException("'messages' must be a non-empty array");
         }
 
-        InferenceParameters params = InferenceParameters.empty().withMessagesJson(messages.toString());
+        // cache_prompt=true reuses the slot's KV prefix across turns — the standard llama.cpp-server
+        // default and what IDE clients rely on for acceptable repeated-prefix latency. OpenAI requests
+        // never carry this llama.cpp-specific flag, so defaulting it here is safe.
+        InferenceParameters params = InferenceParameters.empty()
+                .withMessagesJson(messages.toString())
+                .withCachePrompt(true);
 
         JsonNode tools = request.path("tools");
         if (tools.isArray() && tools.size() > 0) {
@@ -86,6 +91,13 @@ InferenceParameters toInferenceParameters(JsonNode request) {
             params = params.withStopStrings(stops);
         }
 
+        // Forward stream_options verbatim (e.g. {"include_usage":true}) so the native server emits the
+        // trailing usage chunk the OpenAI streaming protocol — and the Copilot custom endpoint — expect.
+        JsonNode streamOptions = request.path("stream_options");
+        if (streamOptions.isObject()) {
+            params = params.withStreamOptions(streamOptions.toString());
+        }
+
         return params;
     }
 
 
@@ -32,13 +32,21 @@ public final class OpenAiServerConfig {
     /** Default Server-Sent-Events heartbeat interval, in milliseconds. */
     public static final long DEFAULT_HEARTBEAT_MILLIS = 15_000L;
 
+    /**
+     * Default {@code Access-Control-Allow-Origin} value: {@code "*"}. Browser- and webview-based clients
+     * send a CORS preflight and require this header; {@code "*"} is the pragmatic default for a server
+     * that binds loopback and authenticates with a bearer token (not cookies).
+     */
+    public static final String DEFAULT_CORS_ALLOW_ORIGIN = "*";
+
     private final String host;
     private final int port;
     private final @Nullable String apiKey;
     private final String modelId;
     private final int maxInputTokens;
     private final int maxOutputTokens;
     private final long heartbeatMillis;
+    private final String corsAllowOrigin;
 
     private OpenAiServerConfig(Builder builder) {
         this.host = builder.host;
@@ -48,6 +56,7 @@ private OpenAiServerConfig(Builder builder) {
         this.maxInputTokens = builder.maxInputTokens;
         this.maxOutputTokens = builder.maxOutputTokens;
         this.heartbeatMillis = builder.heartbeatMillis;
+        this.corsAllowOrigin = builder.corsAllowOrigin;
     }
 
     /**
@@ -122,6 +131,15 @@ public long getHeartbeatMillis() {
         return heartbeatMillis;
     }
 
+    /**
+     * The {@code Access-Control-Allow-Origin} value sent on every response and CORS preflight.
+     *
+     * @return the allowed CORS origin ({@link #DEFAULT_CORS_ALLOW_ORIGIN} unless the builder overrode it)
+     */
+    public String getCorsAllowOrigin() {
+        return corsAllowOrigin;
+    }
+
     /**
      * Whether bearer-token authentication is enabled (an API key is configured).
      *
@@ -152,6 +170,8 @@ public String toString() {
                 + maxOutputTokens
                 + ", heartbeatMillis="
                 + heartbeatMillis
+                + ", corsAllowOrigin="
+                + corsAllowOrigin
                 + '}';
     }
 
@@ -165,6 +185,7 @@ public static final class Builder {
         private int maxInputTokens = DEFAULT_MAX_INPUT_TOKENS;
         private int maxOutputTokens = DEFAULT_MAX_OUTPUT_TOKENS;
         private long heartbeatMillis = DEFAULT_HEARTBEAT_MILLIS;
+        private String corsAllowOrigin = DEFAULT_CORS_ALLOW_ORIGIN;
 
         private Builder() {}
 
@@ -245,6 +266,17 @@ public Builder heartbeatMillis(long heartbeatMillis) {
             return this;
         }
 
+        /**
+         * Sets the {@code Access-Control-Allow-Origin} value (CORS); {@code null} is rejected right away.
+         *
+         * @param corsAllowOrigin the allowed origin (e.g. {@code "*"} or a specific scheme/host/port)
+         * @return this builder
+         */
+        public Builder corsAllowOrigin(String corsAllowOrigin) {
+            this.corsAllowOrigin = java.util.Objects.requireNonNull(corsAllowOrigin, "corsAllowOrigin");
+            return this;
+        }
+
         /**
          * Builds the immutable configuration.
          *
 
@@ -4,9 +4,11 @@
 
 package net.ladenthin.llama.server;
 
+import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.node.ArrayNode;
 import com.fasterxml.jackson.databind.node.ObjectNode;
+import java.io.IOException;
 import org.jspecify.annotations.Nullable;
 
 /**
@@ -73,6 +75,46 @@ static String errorJson(String message, String type, @Nullable String code) {
         return root.toString();
     }
 
+    /**
+     * Guarantee a streamed chunk's usage object carries {@code usage.prompt_tokens_details.cached_tokens}.
+     *
+     * <p>When {@code stream_options.include_usage} is set, the OpenAI streaming protocol emits a trailing
+     * usage chunk. The VS&nbsp;Code Copilot custom endpoint throws
+     * {@code Cannot read properties of undefined (reading 'cached_tokens')} (microsoft/vscode #273482) if
+     * {@code usage.prompt_tokens_details.cached_tokens} is missing, and upstream llama.cpp does not always
+     * populate it. This fills a default of {@code 0} when absent. Token-delta chunks (which carry no
+     * non-null usage object) are returned unchanged and unparsed, so the streaming hot path is untouched.
+     *
+     * @param chunkJson one {@code chat.completion.chunk} serialized as JSON
+     * @return the chunk JSON with {@code cached_tokens} guaranteed present inside any non-null usage object
+     */
+    static String ensureUsageCachedTokens(String chunkJson) {
+        // Fast path: only the trailing usage chunk carries a non-null usage object — skip the rest unparsed.
+        if (!chunkJson.contains("\"usage\"") || chunkJson.contains("\"usage\":null")) {
+            return chunkJson;
+        }
+        try {
+            JsonNode root = OBJECT_MAPPER.readTree(chunkJson);
+            if (!root.isObject() || !root.path("usage").isObject()) {
+                return chunkJson;
+            }
+            ObjectNode usage = (ObjectNode) root.get("usage");
+            JsonNode details = usage.path("prompt_tokens_details");
+            if (details.isObject()) {
+                if (details.has("cached_tokens")) {
+                    return chunkJson; // already correct — emit verbatim
+                }
+                ((ObjectNode) details).put("cached_tokens", 0);
+            } else {
+                usage.putObject("prompt_tokens_details").put("cached_tokens", 0);
+            }
+            return root.toString();
+        } catch (IOException e) {
+            // Never break a live stream over a formatting nicety.
+            return chunkJson;
+        }
+    }
+
     /**
      * Build the {@code GET /v1/models} body advertising a single model.
      *
 
@@ -21,10 +21,16 @@
  *       so streamed {@code delta.tool_calls} are preserved for agent-mode tool use.</li>
  *   <li>{@code POST /v1/completions} and {@code POST /v1/embeddings} — non-streaming, forwarding the
  *       request body to the matching {@code LlamaModel.handle*} method.</li>
+ *   <li>{@code POST /infill} — non-streaming fill-in-the-middle for local ghost-text autocomplete
+ *       clients (llama.vscode, Twinny, Tabby); the model's FIM tokens are applied server-side.</li>
  *   <li>{@code GET /v1/models} — advertises the configured model id.</li>
  *   <li>{@code GET /health} — unauthenticated liveness probe.</li>
  * </ul>
  *
+ * <p>Every {@code /v1} route is also reachable without the prefix (and {@code /infill} also as
+ * {@code /v1/infill}); all routes answer CORS preflight ({@code OPTIONS}) requests and stamp
+ * {@code Access-Control-Allow-Origin} on responses so browser/webview clients are not blocked.</p>
+ *
  * <p>The HTTP surface is decoupled from the model behind {@link net.ladenthin.llama.server.OpenAiBackend}
  * (production implementation {@link net.ladenthin.llama.server.LlamaModelBackend}) so routing,
  * authentication, SSE framing and heartbeats are unit-testable with a fake backend — no socket and no
Original file line number	Diff line number	Diff line change
`@@ -80,4 +80,11 @@ public String embeddings(JsonNode request) {`
`80`	`80`	`// oaiCompat=true so the response uses the OpenAI {"object":"list","data":[{embedding}]} shape.`
`81`	`81`	`return model.handleEmbeddings(request.toString(), true);`
`82`	`82`	`}`
	`83`	`+`
	`84`	`+ @Override`
	`85`	`+ public String infill(JsonNode request) {`
	`86`	`+ // The native /infill handler parses the body itself (input_prefix/input_suffix/...) and applies`
	`87`	`+ // the model's FIM tokens from GGUF metadata; forward verbatim.`
	`88`	`+ return model.handleInfill(request.toString());`
	`89`	`+ }`
`83`	`90`	`}`