From f18db3e96846f2a0f3b8d13b33e4778296ddd9ce Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 9 Apr 2026 13:49:59 +0200 Subject: [PATCH 1/4] cli: add option to connect to server via http(s) --- common/arg.cpp | 11 +- common/common.h | 4 + tools/cli/CMakeLists.txt | 4 +- tools/cli/cli-remote.cpp | 467 +++++++++++++++++++++++++++++++++++++++ tools/cli/cli-remote.h | 69 ++++++ tools/cli/cli.cpp | 326 ++++++++++++++++++--------- 6 files changed, 774 insertions(+), 107 deletions(-) create mode 100644 tools/cli/cli-remote.cpp create mode 100644 tools/cli/cli-remote.h diff --git a/common/arg.cpp b/common/arg.cpp index c21598e7687..10119ca12e5 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -600,9 +600,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context common_params_handle_model(params.vocoder.model, params.hf_token, params.offline); } - // model is required (except for server) + // model is required (except for server, or when using --endpoint in CLI) // TODO @ngxson : maybe show a list of available models in CLI in this case - if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) { + if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion && params.endpoint.empty()) { throw std::invalid_argument("error: --model is required\n"); } @@ -1398,6 +1398,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.show_timings = value; } ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS")); + add_opt(common_arg( + {"--endpoint"}, "URL", + string_format("connect to a running llama-server at URL instead of loading a model locally (e.g. http://localhost:8080)"), + [](common_params & params, const std::string & value) { + params.endpoint = value; + } + ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_ENDPOINT")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", diff --git a/common/common.h b/common/common.h index a564b3b8c2b..9bb086315a7 100644 --- a/common/common.h +++ b/common/common.h @@ -555,6 +555,10 @@ struct common_params { bool single_turn = false; // single turn chat conversation + // remote server endpoint for CLI (e.g. 
"http://localhost:8080") + // when set, CLI connects to a running server instead of loading a model + std::string endpoint = ""; // NOLINT + ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt index 7e01abb81b9..5518cd49483 100644 --- a/tools/cli/CMakeLists.txt +++ b/tools/cli/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET llama-cli) -add_executable(${TARGET} cli.cpp) -target_link_libraries(${TARGET} PRIVATE server-context PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT}) +add_executable(${TARGET} cli.cpp cli-remote.cpp) +target_link_libraries(${TARGET} PRIVATE server-context cpp-httplib PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) include_directories(../server) diff --git a/tools/cli/cli-remote.cpp b/tools/cli/cli-remote.cpp new file mode 100644 index 00000000000..1d6b3e57892 --- /dev/null +++ b/tools/cli/cli-remote.cpp @@ -0,0 +1,467 @@ +#include "cli-remote.h" + +#include "common.h" +#include "console.h" +#include "http.h" + +#include +#include +#include +#include +#include +#include +#include + +// shared with cli.cpp +extern std::atomic g_is_interrupted; + +// base64 encoding for multimodal content over HTTP +static std::string base64_encode(const std::string & in) { + static const char base64_chars[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + std::string out; + out.reserve(4 * ((in.size() + 2) / 3)); + int i = 0; + int j = 0; + unsigned char arr3[3]; + unsigned char arr4[4]; + size_t in_len = in.size(); + const unsigned char * bytes = reinterpret_cast(in.data()); + while (in_len--) { + arr3[i++] = *(bytes++); + if (i == 3) { + arr4[0] = (arr3[0] & 0xfc) >> 2; + arr4[1] = ((arr3[0] & 0x03) << 4) + ((arr3[1] & 0xf0) >> 4); + arr4[2] = ((arr3[1] & 0x0f) << 2) + ((arr3[2] & 0xc0) >> 6); + arr4[3] = arr3[2] & 0x3f; + for (i = 0; i < 4; i++) { + out += base64_chars[arr4[i]]; + } + i = 0; + } + } + if (i) { + for (j = i; j < 3; j++) { + arr3[j] = '\0'; + } + arr4[0] = (arr3[0] & 0xfc) >> 2; + arr4[1] = ((arr3[0] & 0x03) << 4) + ((arr3[1] & 0xf0) >> 4); + arr4[2] = ((arr3[1] & 0x0f) << 2) + ((arr3[2] & 0xc0) >> 6); + for (j = 0; j < i + 1; j++) { + out += base64_chars[arr4[j]]; + } + while (i++ < 3) { + out += '='; + } + } + return out; +} + +// +// cli_backend_remote +// + +cli_backend_remote::cli_backend_remote(const std::string & url) : endpoint_url(url) { + // strip trailing slash + while (!endpoint_url.empty() && endpoint_url.back() == '/') { + endpoint_url.pop_back(); + } +} + +bool cli_backend_remote::connect() { + // query /props to get server metadata + try { + auto [cli, parts] = common_http_client(endpoint_url); + cli.set_connection_timeout(5, 0); + cli.set_read_timeout(10, 0); + + auto res = cli.Get("/props"); + if (!res || res->status != 200) { + console::error("Failed to connect to server at %s\n", endpoint_url.c_str()); + if (res) { + console::error("HTTP status: %d\n", res->status); + } else { + console::error("Connection error: %s\n", httplib::to_string(res.error()).c_str()); + } + return false; + } + + auto data = json::parse(res->body); + model_name = json_value(data, "model_alias", std::string("unknown")); + build_info_ = json_value(data, "build_info", std::string("")); + + if (data.contains("modalities")) { + auto mods = data.at("modalities"); + has_vision_ = json_value(mods, "vision", false); + has_audio_ = json_value(mods, "audio", 
false); + } + + return true; + } catch (const std::exception & e) { + console::error("Failed to connect to server at %s: %s\n", endpoint_url.c_str(), e.what()); + return false; + } +} + +std::string cli_backend_remote::get_model_name() const { return model_name; } +bool cli_backend_remote::has_vision() const { return has_vision_; } +bool cli_backend_remote::has_audio() const { return has_audio_; } +std::string cli_backend_remote::get_build_info() const { return build_info_; } + +std::string cli_backend_remote::generate_completion( + const json & messages, + const common_params & params, + bool verbose_prompt, + result_timings & out_timings) { + // build the OAI chat completion request + json request_body = { + {"messages", messages}, + {"stream", true}, + }; + + // sampling parameters + { + auto & s = params.sampling; + if (s.temp != 0.8f) request_body["temperature"] = s.temp; + if (s.top_k != 40) request_body["top_k"] = s.top_k; + if (s.top_p != 0.95f) request_body["top_p"] = s.top_p; + if (s.min_p != 0.05f) request_body["min_p"] = s.min_p; + if (s.penalty_repeat != 1.0f) request_body["repeat_penalty"] = s.penalty_repeat; + if (s.penalty_present != 0.0f) request_body["presence_penalty"] = s.penalty_present; + if (s.penalty_freq != 0.0f) request_body["frequency_penalty"] = s.penalty_freq; + if (s.seed != LLAMA_DEFAULT_SEED) request_body["seed"] = s.seed; + } + + if (params.n_predict >= 0) { + request_body["max_tokens"] = params.n_predict; + } + + if (!params.antiprompt.empty()) { + request_body["stop"] = params.antiprompt; + } + + // reasoning budget + if (params.sampling.reasoning_budget_tokens >= 0) { + request_body["thinking_budget_tokens"] = params.sampling.reasoning_budget_tokens; + } + + // reasoning/thinking control via chat_template_kwargs + if (params.enable_reasoning == 0) { + request_body["chat_template_kwargs"] = json{{"enable_thinking", false}}; + } + + if (verbose_prompt) { + console::set_display(DISPLAY_TYPE_PROMPT); + console::log("POST /v1/chat/completions %s\n\n", request_body.dump().c_str()); + console::set_display(DISPLAY_TYPE_RESET); + } + + // do the HTTP request with SSE streaming + std::string curr_content; + bool is_thinking = false; + + try { + auto [cli, parts] = common_http_client(endpoint_url); + cli.set_connection_timeout(10, 0); + cli.set_read_timeout(600, 0); // 10 min read timeout for long generations + cli.set_write_timeout(10, 0); + + std::string body_str = request_body.dump(); + + auto done = std::make_shared>(false); + auto error_msg = std::make_shared(); + + httplib::Request req; + req.method = "POST"; + req.path = "/v1/chat/completions"; + req.set_header("Content-Type", "application/json"); + req.body = body_str; + + // collect chunks into a thread-safe queue + auto chunks = std::make_shared>(); + auto chunks_mutex = std::make_shared(); + auto chunks_cv = std::make_shared(); + + req.content_receiver = [chunks, chunks_mutex, chunks_cv](const char * data, size_t data_length, size_t, size_t) -> bool { + std::string chunk(data, data_length); + { + std::lock_guard lock(*chunks_mutex); + chunks->push_back(std::move(chunk)); + } + chunks_cv->notify_one(); + return true; + }; + + // send request in a separate thread + std::thread req_thread([&cli, req = std::move(req), done, error_msg]() mutable { + auto result = cli.send(std::move(req)); + if (result.error() != httplib::Error::Success) { + *error_msg = httplib::to_string(result.error()); + } else if (result && result->status != 200) { + try { + auto err_json = json::parse(result->body); + if 
(err_json.contains("error") && err_json["error"].contains("message")) { + *error_msg = err_json["error"]["message"].get(); + } else { + *error_msg = "HTTP " + std::to_string(result->status) + ": " + result->body.substr(0, 500); + } + } catch (...) { + *error_msg = "HTTP " + std::to_string(result->status) + ": " + result->body.substr(0, 500); + } + } + done->store(true); + }); + req_thread.detach(); + + // process SSE stream + console::spinner::start(); + + std::string sse_buffer; + bool first_chunk = true; + bool stream_done = false; + + while (!stream_done) { + // wait for data + std::string new_data; + { + std::unique_lock lock(*chunks_mutex); + if (chunks->empty()) { + chunks_cv->wait_for(lock, std::chrono::milliseconds(100), [&] { + return !chunks->empty() || done->load() || g_is_interrupted.load(); + }); + if (chunks->empty()) { + if (done->load() || g_is_interrupted.load()) { + break; + } + continue; + } + } + for (auto & chunk : *chunks) { + new_data += chunk; + } + chunks->clear(); + } + + if (new_data.empty()) { + continue; + } + + sse_buffer += new_data; + + // process complete SSE lines + // SSE format: "data: {json}\n\n" or "data: [DONE]\n\n" + size_t pos = 0; + while (pos < sse_buffer.size() && !stream_done) { + // find "data: " prefix + size_t data_pos = sse_buffer.find("data: ", pos); + if (data_pos == std::string::npos) { + break; + } + + // find the end of this line + size_t line_end = sse_buffer.find('\n', data_pos); + if (line_end == std::string::npos) { + // incomplete line, wait for more data + break; + } + + std::string data_str = sse_buffer.substr(data_pos + 6, line_end - data_pos - 6); + // trim whitespace + while (!data_str.empty() && (data_str.back() == '\n' || data_str.back() == '\r' || data_str.back() == ' ')) { + data_str.pop_back(); + } + + pos = line_end + 1; + + if (first_chunk) { + console::spinner::stop(); + first_chunk = false; + } + + if (data_str == "[DONE]") { + stream_done = true; + break; + } + + if (data_str.empty()) { + continue; + } + + // parse the JSON chunk + try { + auto chunk_json = json::parse(data_str); + + // check for error + if (chunk_json.contains("error")) { + auto & err = chunk_json["error"]; + if (err.is_object() && err.contains("message")) { + console::error("Error: %s\n", err["message"].get().c_str()); + } else { + console::error("Error: %s\n", err.dump().c_str()); + } + stream_done = true; + break; + } + + if (!chunk_json.contains("choices") || !chunk_json["choices"].is_array() || chunk_json["choices"].empty()) { + continue; + } + + auto & choice = chunk_json["choices"][0]; + + // check finish_reason + if (choice.contains("finish_reason") && !choice["finish_reason"].is_null()) { + // extract timing info if present + if (chunk_json.contains("timings")) { + auto & t = chunk_json["timings"]; + out_timings.prompt_n = json_value(t, "prompt_n", -1); + out_timings.prompt_ms = json_value(t, "prompt_ms", 0.0); + out_timings.prompt_per_second = json_value(t, "prompt_per_second", 0.0); + out_timings.predicted_n = json_value(t, "predicted_n", -1); + out_timings.predicted_ms = json_value(t, "predicted_ms", 0.0); + out_timings.predicted_per_second = json_value(t, "predicted_per_second", 0.0); + } + stream_done = true; + break; + } + + // extract delta + if (!choice.contains("delta")) { + continue; + } + auto & delta = choice["delta"]; + + // extract usage/timings from streaming chunks + if (chunk_json.contains("timings")) { + auto & t = chunk_json["timings"]; + out_timings.prompt_n = json_value(t, "prompt_n", out_timings.prompt_n); + 
out_timings.prompt_ms = json_value(t, "prompt_ms", out_timings.prompt_ms); + out_timings.prompt_per_second = json_value(t, "prompt_per_second", out_timings.prompt_per_second); + out_timings.predicted_n = json_value(t, "predicted_n", out_timings.predicted_n); + out_timings.predicted_ms = json_value(t, "predicted_ms", out_timings.predicted_ms); + out_timings.predicted_per_second = json_value(t, "predicted_per_second", out_timings.predicted_per_second); + } + + // content delta + if (delta.contains("content") && delta["content"].is_string()) { + std::string content_delta = delta["content"].get(); + if (!content_delta.empty()) { + if (is_thinking) { + console::log("\n[End thinking]\n\n"); + console::set_display(DISPLAY_TYPE_RESET); + is_thinking = false; + } + curr_content += content_delta; + console::log("%s", content_delta.c_str()); + console::flush(); + } + } + + // reasoning content delta + if (delta.contains("reasoning_content") && delta["reasoning_content"].is_string()) { + std::string reasoning_delta = delta["reasoning_content"].get(); + if (!reasoning_delta.empty()) { + console::set_display(DISPLAY_TYPE_REASONING); + if (!is_thinking) { + console::log("[Start thinking]\n"); + } + is_thinking = true; + console::log("%s", reasoning_delta.c_str()); + console::flush(); + } + } + } catch (const json::parse_error &) { + // skip malformed JSON + continue; + } + + if (g_is_interrupted.load()) { + stream_done = true; + break; + } + } + + // keep unprocessed part of the buffer + sse_buffer = sse_buffer.substr(pos); + } + + if (first_chunk) { + console::spinner::stop(); + } + + if (!error_msg->empty()) { + console::error("Error: %s\n", error_msg->c_str()); + } + + } catch (const std::exception & e) { + console::spinner::stop(); + console::error("HTTP error: %s\n", e.what()); + } + + g_is_interrupted.store(false); + return curr_content; +} + +std::string cli_backend_remote::load_text_file(const std::string & fname) { + std::ifstream file(fname, std::ios::binary); + if (!file) { + return ""; + } + return std::string((std::istreambuf_iterator(file)), std::istreambuf_iterator()); +} + +json cli_backend_remote::load_media_file(const std::string & fname) { + std::ifstream file(fname, std::ios::binary); + if (!file) { + return json::object(); + } + + // read the file + std::string file_data((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + std::string b64 = base64_encode(file_data); + + // determine media type from extension + std::string media_type = "image/png"; // default + std::string content_type = "image_url"; // OAI content part type + auto ext = std::filesystem::path(fname).extension().string(); + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + + if (ext == ".jpg" || ext == ".jpeg") { + media_type = "image/jpeg"; + } else if (ext == ".png") { + media_type = "image/png"; + } else if (ext == ".gif") { + media_type = "image/gif"; + } else if (ext == ".webp") { + media_type = "image/webp"; + } else if (ext == ".wav") { + media_type = "audio/wav"; + content_type = "input_audio"; + } else if (ext == ".mp3") { + media_type = "audio/mpeg"; + content_type = "input_audio"; + } + + // build the OAI content part + if (content_type == "image_url") { + return json{ + {"type", "image_url"}, + {"image_url", { + {"url", "data:" + media_type + ";base64," + b64} + }} + }; + } else { + // audio + return json{ + {"type", "input_audio"}, + {"input_audio", { + {"data", b64}, + {"format", ext == ".mp3" ? 
"mp3" : "wav"} + }} + }; + } +} + +void cli_backend_remote::terminate() { + // nothing to do for remote — just return +} diff --git a/tools/cli/cli-remote.h b/tools/cli/cli-remote.h new file mode 100644 index 00000000000..f545f5969b7 --- /dev/null +++ b/tools/cli/cli-remote.h @@ -0,0 +1,69 @@ +#pragma once + +#include "common.h" +#include "console.h" + +#include "server-task.h" + +// +// Backend interface — abstracts local (in-process) vs remote (HTTP) server +// + +struct cli_backend { + virtual ~cli_backend() = default; + + // model / server info + virtual std::string get_model_name() const = 0; + virtual bool has_vision() const = 0; + virtual bool has_audio() const = 0; + virtual std::string get_build_info() const = 0; + + // chat completion (streaming), returns assistant content text + virtual std::string generate_completion( + const json & messages, + const common_params & params, + bool verbose_prompt, + result_timings & out_timings) = 0; + + // load a local text file, return its contents (empty string on failure) + virtual std::string load_text_file(const std::string & fname) = 0; + + // load a local media file, return the OAI content part JSON for it + // returns empty JSON object on failure + virtual json load_media_file(const std::string & fname) = 0; + + // cleanup + virtual void terminate() = 0; +}; + +// +// Remote backend — connects to a running llama-server via HTTP(S) +// + +struct cli_backend_remote : cli_backend { + std::string endpoint_url; + std::string model_name; + bool has_vision_ = false; + bool has_audio_ = false; + std::string build_info_; + + cli_backend_remote(const std::string & url); + + bool connect(); + + std::string get_model_name() const override; + bool has_vision() const override; + bool has_audio() const override; + std::string get_build_info() const override; + + std::string generate_completion( + const json & messages, + const common_params & params, + bool verbose_prompt, + result_timings & out_timings) override; + + std::string load_text_file(const std::string & fname) override; + json load_media_file(const std::string & fname) override; + + void terminate() override; +}; diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 369c24216b7..87d9c7fa487 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -9,6 +9,8 @@ #include "server-context.h" #include "server-task.h" +#include "cli-remote.h" + #include #include #include @@ -35,7 +37,7 @@ const char * LLAMA_ASCII_LOGO = R"( ▀▀ ▀▀ )"; -static std::atomic g_is_interrupted = false; +std::atomic g_is_interrupted = false; static bool should_stop() { return g_is_interrupted.load(); } @@ -53,51 +55,80 @@ static void signal_handler(int) { } #endif -struct cli_context { +// +// Local backend — wraps server_context (original behavior) +// + +struct cli_backend_local : cli_backend { server_context ctx_server; - json messages = json::array(); std::vector input_files; + json chat_messages = json::array(); task_params defaults; bool verbose_prompt; + std::thread inference_thread; - // thread for showing "loading" animation - std::atomic loading_show; - - cli_context(const common_params & params) { + cli_backend_local(const common_params & params) { defaults.sampling = params.sampling; defaults.speculative = params.speculative; defaults.n_keep = params.n_keep; defaults.n_predict = params.n_predict; defaults.antiprompt = params.antiprompt; - defaults.stream = true; // make sure we always use streaming mode - defaults.timings_per_token = true; // in order to get timings even when we cancel mid-way - // 
defaults.return_progress = true; // TODO: show progress + defaults.stream = true; + defaults.timings_per_token = true; verbose_prompt = params.verbose_prompt; } - std::string generate_completion(result_timings & out_timings) { + bool load_model(common_params & params) { + return ctx_server.load_model(params); + } + + void start_loop() { + inference_thread = std::thread([this]() { + ctx_server.start_loop(); + }); + } + + std::string get_model_name() const override { + return ctx_server.get_meta().model_name; + } + + bool has_vision() const override { + return ctx_server.get_meta().has_inp_image; + } + + bool has_audio() const override { + return ctx_server.get_meta().has_inp_audio; + } + + std::string get_build_info() const override { + return ctx_server.get_meta().build_info; + } + + std::string generate_completion( + const json & messages, + const common_params & /*params*/, + bool /*verbose_prompt_arg*/, + result_timings & out_timings) override { + chat_messages = messages; server_response_reader rd = ctx_server.get_response_reader(); auto chat_params = format_chat(); { - // TODO: reduce some copies here in the future server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); task.id = rd.get_new_id(); task.index = 0; - task.params = defaults; // copy - task.cli_prompt = chat_params.prompt; // copy - task.cli_files = input_files; // copy + task.params = defaults; + task.cli_prompt = chat_params.prompt; + task.cli_files = input_files; task.cli = true; - // chat template settings task.params.chat_parser_params = common_chat_parser_params(chat_params); task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; if (!chat_params.parser.empty()) { task.params.chat_parser_params.parser.load(chat_params.parser); } - // reasoning budget sampler if (!chat_params.thinking_end_tag.empty()) { const llama_vocab * vocab = llama_model_get_vocab( llama_get_model(ctx_server.get_llama_context())); @@ -124,7 +155,6 @@ struct cli_context { console::set_display(DISPLAY_TYPE_RESET); } - // wait for first result console::spinner::start(); server_task_result_ptr result = rd.next(should_stop); @@ -178,25 +208,31 @@ struct cli_context { result = rd.next(should_stop); } g_is_interrupted.store(false); - // server_response_reader automatically cancels pending tasks upon destruction return curr_content; } - // TODO: support remote files in the future (http, https, etc) - std::string load_input_file(const std::string & fname, bool is_media) { + std::string load_text_file(const std::string & fname) override { std::ifstream file(fname, std::ios::binary); if (!file) { return ""; } - if (is_media) { - raw_buffer buf; - buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - input_files.push_back(std::move(buf)); - return get_media_marker(); - } else { - std::string content((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - return content; + return std::string((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + } + + json load_media_file(const std::string & fname) override { + std::ifstream file(fname, std::ios::binary); + if (!file) { + return json::object(); } + // for local backend, we just load into input_files and return the marker + raw_buffer buf; + buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + + // detect media type for HTTP compatibility (also stored for local use) + input_files.push_back(std::move(buf)); + // local backend doesn't use the JSON content part — it uses input_files directly + // but we return a 
non-empty JSON so the caller knows it succeeded + return json{{"loaded", true}}; } common_chat_params format_chat() { @@ -206,11 +242,11 @@ struct cli_context { auto caps = common_chat_templates_get_caps(chat_params.tmpls.get()); common_chat_templates_inputs inputs; - inputs.messages = common_chat_msgs_parse_oaicompat(messages); - inputs.tools = {}; // TODO + inputs.messages = common_chat_msgs_parse_oaicompat(chat_messages); + inputs.tools = {}; inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; - inputs.json_schema = ""; // TODO - inputs.grammar = ""; // TODO + inputs.json_schema = ""; + inputs.grammar = ""; inputs.use_jinja = chat_params.use_jinja; inputs.parallel_tool_calls = caps["supports_parallel_tool_calls"]; inputs.add_generation_prompt = true; @@ -218,9 +254,23 @@ struct cli_context { inputs.force_pure_content = chat_params.force_pure_content; inputs.enable_thinking = chat_params.enable_thinking ? common_chat_templates_support_enable_thinking(chat_params.tmpls.get()) : false; - // Apply chat template to the list of messages return common_chat_templates_apply(chat_params.tmpls.get(), inputs); } + + void terminate() override { + ctx_server.terminate(); + if (inference_thread.joinable()) { + inference_thread.join(); + } + } + + llama_context * get_llama_context() const { + return ctx_server.get_llama_context(); + } + + server_context_meta get_meta() const { + return ctx_server.get_meta(); + } }; // TODO?: Make this reusable, enums, docs @@ -359,11 +409,7 @@ int main(int argc, char ** argv) { console::error("please use llama-completion instead\n"); } - // struct that contains llama context and inference - cli_context ctx_cli(params); - - llama_backend_init(); - llama_numa_init(params.numa); + const bool is_remote = !params.endpoint.empty(); // TODO: avoid using atexit() here by making `console` a singleton console::init(params.simple_io, params.use_color); @@ -386,33 +432,57 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - console::log("\nLoading model... "); // followed by loading animation - console::spinner::start(); - if (!ctx_cli.ctx_server.load_model(params)) { + // shared state between backend and the interactive loop + json messages = json::array(); + // for local backend: media files pending for the current user message + // for remote backend: OAI content parts for media + std::vector pending_media_parts; + + std::unique_ptr backend; + + if (is_remote) { + auto remote = std::make_unique(params.endpoint); + console::log("\nConnecting to %s ... ", params.endpoint.c_str()); + console::spinner::start(); + if (!remote->connect()) { + console::spinner::stop(); + console::error("\nFailed to connect to server\n"); + return 1; + } console::spinner::stop(); - console::error("\nFailed to load the model\n"); - return 1; - } + console::log("connected\n"); + backend = std::move(remote); + } else { + auto local = std::make_unique(params); - console::spinner::stop(); - console::log("\n"); + llama_backend_init(); + llama_numa_init(params.numa); - std::thread inference_thread([&ctx_cli]() { - ctx_cli.ctx_server.start_loop(); - }); + console::log("\nLoading model... 
"); // followed by loading animation + console::spinner::start(); + if (!local->load_model(params)) { + console::spinner::stop(); + console::error("\nFailed to load the model\n"); + return 1; + } + console::spinner::stop(); + console::log("\n"); + + local->start_loop(); + backend = std::move(local); + } - auto inf = ctx_cli.ctx_server.get_meta(); std::string modalities = "text"; - if (inf.has_inp_image) { + if (backend->has_vision()) { modalities += ", vision"; } - if (inf.has_inp_audio) { + if (backend->has_audio()) { modalities += ", audio"; } auto add_system_prompt = [&]() { if (!params.system_prompt.empty()) { - ctx_cli.messages.push_back({ + messages.push_back({ {"role", "system"}, {"content", params.system_prompt} }); @@ -422,8 +492,11 @@ int main(int argc, char ** argv) { console::log("\n"); console::log("%s\n", LLAMA_ASCII_LOGO); - console::log("build : %s\n", inf.build_info.c_str()); - console::log("model : %s\n", inf.model_name.c_str()); + if (is_remote) { + console::log("server : %s\n", params.endpoint.c_str()); + } + console::log("build : %s\n", backend->get_build_info().c_str()); + console::log("model : %s\n", backend->get_model_name().c_str()); console::log("modalities : %s\n", modalities.c_str()); if (!params.system_prompt.empty()) { console::log("using custom system prompt\n"); @@ -435,33 +508,43 @@ int main(int argc, char ** argv) { console::log(" /clear clear the chat history\n"); console::log(" /read add a text file\n"); console::log(" /glob add text files using globbing pattern\n"); - if (inf.has_inp_image) { + if (backend->has_vision()) { console::log(" /image add an image file\n"); } - if (inf.has_inp_audio) { + if (backend->has_audio()) { console::log(" /audio add an audio file\n"); } console::log("\n"); - // interactive loop std::string cur_msg; + // helper: build a user message JSON with optional media parts + auto build_user_content = [&](const std::string & text, const std::vector & media_parts) -> json { + if (media_parts.empty()) { + return json{{"type", "text"}, {"text", text}}; + } + // OAI multipart content + json content = json::array(); + // add media first, then text + for (const auto & part : media_parts) { + content.push_back(part); + } + if (!text.empty()) { + content.push_back(json{{"type", "text"}, {"text", text}}); + } + return content; + }; + auto add_text_file = [&](const std::string & fname) -> bool { - std::string marker = ctx_cli.load_input_file(fname, false); - if (marker.empty()) { + std::string content = backend->load_text_file(fname); + if (content.empty()) { console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); return false; } - if (inf.fim_sep_token != LLAMA_TOKEN_NULL) { - cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true); - cur_msg += fname; - cur_msg.push_back('\n'); - } else { - cur_msg += "--- File: "; - cur_msg += fname; - cur_msg += " ---\n"; - } - cur_msg += marker; + cur_msg += "--- File: "; + cur_msg += fname; + cur_msg += " ---\n"; + cur_msg += content; console::log("Loaded text from '%s'\n", fname.c_str()); return true; }; @@ -478,15 +561,20 @@ int main(int argc, char ** argv) { buffer += line; } while (another_line); } else { - // process input prompt from args + // process input prompt from args — load any media files for (auto & fname : params.image) { - std::string marker = ctx_cli.load_input_file(fname, true); - if (marker.empty()) { + json media_part = backend->load_media_file(fname); + if (media_part.empty()) { console::error("file does not 
exist or cannot be opened: '%s'\n", fname.c_str()); break; } + if (is_remote) { + pending_media_parts.push_back(media_part); + } else { + // local backend: use the media marker + cur_msg += mtmd_default_marker(); + } console::log("Loaded media from '%s'\n", fname.c_str()); - cur_msg += marker; } buffer = params.prompt; if (buffer.size() > 500) { @@ -505,7 +593,7 @@ int main(int argc, char ** argv) { } // remove trailing newline - if (!buffer.empty() &&buffer.back() == '\n') { + if (!buffer.empty() && buffer.back() == '\n') { buffer.pop_back(); } @@ -520,32 +608,38 @@ int main(int argc, char ** argv) { if (string_starts_with(buffer, "/exit")) { break; } else if (string_starts_with(buffer, "/regen")) { - if (ctx_cli.messages.size() >= 2) { - size_t last_idx = ctx_cli.messages.size() - 1; - ctx_cli.messages.erase(last_idx); + if (messages.size() >= 2) { + size_t last_idx = messages.size() - 1; + messages.erase(last_idx); add_user_msg = false; } else { console::error("No message to regenerate.\n"); continue; } } else if (string_starts_with(buffer, "/clear")) { - ctx_cli.messages.clear(); + messages.clear(); add_system_prompt(); - - ctx_cli.input_files.clear(); + pending_media_parts.clear(); + if (!is_remote) { + auto * local = static_cast(backend.get()); + local->input_files.clear(); + } console::log("Chat history cleared.\n"); continue; } else if ( - (string_starts_with(buffer, "/image ") && inf.has_inp_image) || - (string_starts_with(buffer, "/audio ") && inf.has_inp_audio)) { - // just in case (bad copy-paste for example), we strip all trailing/leading spaces + (string_starts_with(buffer, "/image ") && backend->has_vision()) || + (string_starts_with(buffer, "/audio ") && backend->has_audio())) { std::string fname = string_strip(buffer.substr(7)); - std::string marker = ctx_cli.load_input_file(fname, true); - if (marker.empty()) { + json media_part = backend->load_media_file(fname); + if (media_part.empty()) { console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); continue; } - cur_msg += marker; + if (is_remote) { + pending_media_parts.push_back(media_part); + } else { + cur_msg += mtmd_default_marker(); + } console::log("Loaded media from '%s'\n", fname.c_str()); continue; } else if (string_starts_with(buffer, "/read ")) { @@ -612,25 +706,47 @@ int main(int argc, char ** argv) { // generate response if (add_user_msg) { - ctx_cli.messages.push_back({ - {"role", "user"}, - {"content", cur_msg} - }); + if (is_remote) { + // for remote, build OAI content with media parts + json user_content = build_user_content(cur_msg, pending_media_parts); + if (pending_media_parts.empty()) { + // simple text message + messages.push_back({ + {"role", "user"}, + {"content", cur_msg} + }); + } else { + // multipart message + messages.push_back({ + {"role", "user"}, + {"content", user_content} + }); + } + pending_media_parts.clear(); + } else { + // local backend: media markers are embedded in the text + messages.push_back({ + {"role", "user"}, + {"content", cur_msg} + }); + } cur_msg.clear(); } result_timings timings; - std::string assistant_content = ctx_cli.generate_completion(timings); - ctx_cli.messages.push_back({ + std::string assistant_content = backend->generate_completion(messages, params, params.verbose_prompt, timings); + messages.push_back({ {"role", "assistant"}, {"content", assistant_content} }); console::log("\n"); if (params.show_timings) { - console::set_display(DISPLAY_TYPE_INFO); - console::log("\n"); - console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", 
timings.prompt_per_second, timings.predicted_per_second); - console::set_display(DISPLAY_TYPE_RESET); + if (timings.prompt_n >= 0 && timings.predicted_n >= 0) { + console::set_display(DISPLAY_TYPE_INFO); + console::log("\n"); + console::log("[ Prompt: %.1f t/s | Generation: %.1f t/s ]\n", timings.prompt_per_second, timings.predicted_per_second); + console::set_display(DISPLAY_TYPE_RESET); + } } if (params.single_turn) { @@ -641,12 +757,16 @@ int main(int argc, char ** argv) { console::set_display(DISPLAY_TYPE_RESET); console::log("\nExiting...\n"); - ctx_cli.ctx_server.terminate(); - inference_thread.join(); + backend->terminate(); - // bump the log level to display timings - common_log_set_verbosity_thold(LOG_LEVEL_INFO); - common_memory_breakdown_print(ctx_cli.ctx_server.get_llama_context()); + if (!is_remote) { + // bump the log level to display timings + common_log_set_verbosity_thold(LOG_LEVEL_INFO); + auto * local = static_cast(backend.get()); + common_memory_breakdown_print(local->get_llama_context()); + + llama_backend_free(); + } return 0; } From b8a38086f4abe282607939f9c91b0c645b796e2b Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Thu, 9 Apr 2026 14:43:13 +0200 Subject: [PATCH 2/4] fix Mac failing build --- tools/cli/cli-remote.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/cli/cli-remote.cpp b/tools/cli/cli-remote.cpp index 1d6b3e57892..06a39131bd0 100644 --- a/tools/cli/cli-remote.cpp +++ b/tools/cli/cli-remote.cpp @@ -71,7 +71,8 @@ cli_backend_remote::cli_backend_remote(const std::string & url) : endpoint_url(u bool cli_backend_remote::connect() { // query /props to get server metadata try { - auto [cli, parts] = common_http_client(endpoint_url); + auto http_res = common_http_client(endpoint_url); + httplib::Client & cli = http_res.first; cli.set_connection_timeout(5, 0); cli.set_read_timeout(10, 0); @@ -161,7 +162,8 @@ std::string cli_backend_remote::generate_completion( bool is_thinking = false; try { - auto [cli, parts] = common_http_client(endpoint_url); + auto http_res = common_http_client(endpoint_url); + httplib::Client cli = std::move(http_res.first); cli.set_connection_timeout(10, 0); cli.set_read_timeout(600, 0); // 10 min read timeout for long generations cli.set_write_timeout(10, 0); From b6bcfbe4d74781eeaf8e3d05dd689fb01267778b Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Sat, 11 Apr 2026 21:39:08 +0200 Subject: [PATCH 3/4] Rewrite to http-only. 
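
Summary of the rewrite (derived from the diff below): llama-cli no longer links
server-context and instead always talks to llama-server over HTTP. With
--endpoint it attaches to an already running server; otherwise it looks for the
llama-server binary next to the llama-cli executable, spawns it on 127.0.0.1
using the first free port starting at 8080, and polls /props until the server
is ready. CLI-only arguments (prompt/input/display flags) are filtered out
before the remaining arguments are forwarded to the spawned server.

Illustrative usage (model path and prompt are placeholders):

    # attach to an existing server
    llama-cli --endpoint http://localhost:8080 -p "Hello"

    # no --endpoint: llama-cli spawns llama-server with the remaining args
    llama-cli -m model.gguf -p "Hello"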
--- tools/cli/CMakeLists.txt | 6 +- tools/cli/cli-backend.cpp | 805 +++++++++++++++++++++++++++++++++++ tools/cli/cli-backend.h | 82 ++++ tools/cli/cli-remote.cpp | 469 -------------------- tools/cli/cli-remote.h | 69 --- tools/cli/cli.cpp | 340 ++------------- vendor/sheredom/subprocess.h | 29 +- 7 files changed, 961 insertions(+), 839 deletions(-) create mode 100644 tools/cli/cli-backend.cpp create mode 100644 tools/cli/cli-backend.h delete mode 100644 tools/cli/cli-remote.cpp delete mode 100644 tools/cli/cli-remote.h diff --git a/tools/cli/CMakeLists.txt b/tools/cli/CMakeLists.txt index 5518cd49483..885892ea093 100644 --- a/tools/cli/CMakeLists.txt +++ b/tools/cli/CMakeLists.txt @@ -1,9 +1,11 @@ set(TARGET llama-cli) -add_executable(${TARGET} cli.cpp cli-remote.cpp) -target_link_libraries(${TARGET} PRIVATE server-context cpp-httplib PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT}) +add_executable(${TARGET} cli.cpp cli-backend.cpp) +target_link_libraries(${TARGET} PRIVATE cpp-httplib PUBLIC llama-common ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) include_directories(../server) +include_directories(../mtmd) +include_directories(../../vendor) if(LLAMA_TOOLS_INSTALL) install(TARGETS ${TARGET} RUNTIME) diff --git a/tools/cli/cli-backend.cpp b/tools/cli/cli-backend.cpp new file mode 100644 index 00000000000..3ffd4a7eb92 --- /dev/null +++ b/tools/cli/cli-backend.cpp @@ -0,0 +1,805 @@ +#include "cli-backend.h" + +#include "common.h" +#include "console.h" +#include "http.h" +#include "log.h" +#include "server-common.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#else +#include +#include +#include +#include +#include +#endif + +#if defined(__APPLE__) && defined(__MACH__) +#include +#include +#endif + +// shared with cli.cpp +extern std::atomic g_is_interrupted; + +// base64 encoding for multimodal content over HTTP +static std::string base64_encode(const std::string & in) { + static const char base64_chars[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + std::string out; + out.reserve(4 * ((in.size() + 2) / 3)); + int i = 0; + int j = 0; + unsigned char arr3[3]; + unsigned char arr4[4]; + size_t in_len = in.size(); + const unsigned char * bytes = reinterpret_cast(in.data()); + while (in_len--) { + arr3[i++] = *(bytes++); + if (i == 3) { + arr4[0] = (arr3[0] & 0xfc) >> 2; + arr4[1] = ((arr3[0] & 0x03) << 4) + ((arr3[1] & 0xf0) >> 4); + arr4[2] = ((arr3[1] & 0x0f) << 2) + ((arr3[2] & 0xc0) >> 6); + arr4[3] = arr3[2] & 0x3f; + for (i = 0; i < 4; i++) { + out += base64_chars[arr4[i]]; + } + i = 0; + } + } + if (i) { + for (j = i; j < 3; j++) { + arr3[j] = '\0'; + } + arr4[0] = (arr3[0] & 0xfc) >> 2; + arr4[1] = ((arr3[0] & 0x03) << 4) + ((arr3[1] & 0xf0) >> 4); + arr4[2] = ((arr3[1] & 0x0f) << 2) + ((arr3[2] & 0xc0) >> 6); + for (j = 0; j < i + 1; j++) { + out += base64_chars[arr4[j]]; + } + while (i++ < 3) { + out += '='; + } + } + return out; +} + +// get path to current executable +static std::filesystem::path get_current_exec_path() { +#if defined(_WIN32) + wchar_t buf[32768] = { 0 }; + DWORD len = GetModuleFileNameW(nullptr, buf, _countof(buf)); + if (len == 0 || len >= _countof(buf)) { + throw std::runtime_error("GetModuleFileNameW failed or path too long"); + } + return std::filesystem::path(buf); +#elif defined(__APPLE__) && defined(__MACH__) + char small_path[PATH_MAX]; + uint32_t size = sizeof(small_path); + + if 
(_NSGetExecutablePath(small_path, &size) == 0) { + try { + return std::filesystem::canonical(std::filesystem::path(small_path)); + } catch (...) { + return std::filesystem::path(small_path); + } + } else { + std::vector buf(size); + if (_NSGetExecutablePath(buf.data(), &size) == 0) { + try { + return std::filesystem::canonical(std::filesystem::path(buf.data())); + } catch (...) { + return std::filesystem::path(buf.data()); + } + } + throw std::runtime_error("_NSGetExecutablePath failed after buffer resize"); + } +#else + char path[FILENAME_MAX]; + ssize_t count = readlink("/proc/self/exe", path, FILENAME_MAX); + if (count <= 0) { + throw std::runtime_error("failed to resolve /proc/self/exe"); + } + return std::filesystem::path(std::string(path, count)); +#endif +} + +// get path to llama-server executable +static std::filesystem::path get_server_exec_path() { + std::filesystem::path exec_path = get_current_exec_path(); + std::filesystem::path exec_dir = exec_path.parent_path(); + +#if defined(_WIN32) + return exec_dir / "llama-server.exe"; +#else + return exec_dir / "llama-server"; +#endif +} + +// check if a port is available +static bool is_port_available(int port) { +#ifdef _WIN32 + typedef SOCKET native_socket_t; +#define INVALID_SOCKET_VAL INVALID_SOCKET +#define CLOSE_SOCKET(s) closesocket(s) + WSADATA wsaData; + if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) { + return false; + } +#else + typedef int native_socket_t; +#define INVALID_SOCKET_VAL (-1) +#define CLOSE_SOCKET(s) close(s) +#endif + + native_socket_t sock = socket(AF_INET, SOCK_STREAM, 0); + if (sock == INVALID_SOCKET_VAL) { +#ifdef _WIN32 + WSACleanup(); +#endif + return false; + } + + struct sockaddr_in serv_addr; + std::memset(&serv_addr, 0, sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_addr.s_addr = htonl(INADDR_ANY); + serv_addr.sin_port = htons(port); + + bool available = bind(sock, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) == 0; + + CLOSE_SOCKET(sock); +#ifdef _WIN32 + WSACleanup(); +#endif + + return available; +} + +// helper to convert vector to char ** +static std::vector to_char_ptr_array(const std::vector & vec) { + std::vector result; + result.reserve(vec.size() + 1); + for (const auto & s : vec) { + result.push_back(const_cast(s.c_str())); + } + result.push_back(nullptr); + return result; +} + +// +// cli_backend implementation +// + +cli_backend::cli_backend() = default; + +// List of CLI-specific arguments that should NOT be passed to llama-server +// These are arguments that the CLI consumes directly +static const std::vector CLI_SPECIFIC_ARGS = { + // Connection + "--endpoint", + // Input (prompt related) + "-p", "--prompt", + "-f", "--file", + "-sys", "--system-prompt", + "-sysf", "--system-prompt-file", + "--image", + // CLI behavior/display + "--show-timings", "--no-show-timings", + "--simple-io", + "--color", "--no-color", + "--multiline-input", + "-cnv", "--conversation", "--no-conversation", + "-i", "--interactive", + "--verbose-prompt", +}; + +// Check if an argument is CLI-specific +static bool is_cli_specific_arg(const std::string & arg) { + return std::any_of(CLI_SPECIFIC_ARGS.begin(), CLI_SPECIFIC_ARGS.end(), [&] (auto & x) { + return (arg == x || arg.find(x + "=") == 0); + + }); +} + +bool cli_backend::spawn_local_server(int argc, char ** argv) { + // 1. 
Find available port starting from 8080 + int port = 8080; + const int max_port = 65535; + while (port < max_port && !is_port_available(port)) { + port++; + } + if (port >= max_port) { + console::error("Failed to find an available port\n"); + return false; + } + + // 2. Get path to llama-server executable + std::filesystem::path server_path; + try { + server_path = get_server_exec_path(); + } catch (const std::exception & e) { + console::error("Failed to find llama-server executable: %s\n", e.what()); + return false; + } + + if (!std::filesystem::exists(server_path)) { + console::error("llama-server executable not found at: %s\n", server_path.string().c_str()); + return false; + } + + // 3. Build command line arguments + // Start with the server executable + std::vector args; + args.push_back(server_path.string()); + + // Add --host 127.0.0.1 to bind only to localhost + args.push_back("--host"); + args.push_back("127.0.0.1"); + + // Add the available port + args.push_back("--port"); + args.push_back(std::to_string(port)); + + // Filter and copy arguments from original argv + // Skip argv[0] (program name) + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + // Skip CLI-specific arguments + if (is_cli_specific_arg(arg)) { + // Skip the value for arguments that take one + // Check if this arg doesn't contain '=' and next arg doesn't start with '-' + if (arg.find('=') == std::string::npos && + (arg == "-p" || arg == "--prompt" || + arg == "-f" || arg == "--file" || + arg == "-sys" || arg == "--system-prompt" || + arg == "-sysf" || arg == "--system-prompt-file" || + arg == "--image" || arg == "--endpoint")) { + if (i + 1 < argc && argv[i + 1][0] != '-') { + i++; // Skip the value too + } + } + continue; + } + + args.push_back(arg); + } + + // 4. Spawn subprocess in its own session/process group + // so that Ctrl+C (SIGINT) from the terminal doesn't reach the server. + server_proc = std::make_shared(); + std::vector server_argv = to_char_ptr_array(args); + + int options = subprocess_option_no_window + | subprocess_option_combined_stdout_stderr + | subprocess_option_new_session; + + int spawn_result = subprocess_create_ex(server_argv.data(), options, nullptr, server_proc.get()); + + if (spawn_result != 0) { + console::error("Failed to spawn llama-server subprocess\n"); + server_proc.reset(); + return false; + } + + is_local_server = true; + + // 5. Wait for server to be ready by polling /props endpoint + endpoint_url = "http://127.0.0.1:" + std::to_string(port); + + const int max_retries = 60; // 30 seconds total (500ms per retry) + const int retry_delay_ms = 500; + + for (int i = 0; i < max_retries && !g_is_interrupted.load(); i++) { + std::this_thread::sleep_for(std::chrono::milliseconds(retry_delay_ms)); + + try { + auto http_res = common_http_client(endpoint_url); + httplib::Client & cli = http_res.first; + cli.set_connection_timeout(1, 0); + cli.set_read_timeout(1, 0); + + auto res = cli.Get("/props"); + if (res && (res->status == 200 || res->status == 404)) { + // Server is ready, parse metadata + auto data = json::parse(res->body); + model_name = json_value(data, "model_alias", std::string("unknown")); + build_info_ = json_value(data, "build_info", std::string("")); + + if (data.contains("modalities")) { + auto mods = data.at("modalities"); + has_vision_ = json_value(mods, "vision", false); + has_audio_ = json_value(mods, "audio", false); + } + + return true; + } + } catch (...) 
{ + LOG_DBG("Server not yet ready, still polling..."); + // Server not ready yet, continue polling + } + } + + // Timeout - terminate subprocess + console::error("Timeout waiting for local server to start\n"); + subprocess_terminate(server_proc.get()); + subprocess_destroy(server_proc.get()); + server_proc.reset(); + is_local_server = false; + return false; +} + +bool cli_backend::connect() { + // strip trailing slash + while (!endpoint_url.empty() && endpoint_url.back() == '/') { + endpoint_url.pop_back(); + } + + // query /props to get server metadata + try { + auto http_res = common_http_client(endpoint_url); + httplib::Client & cli = http_res.first; + cli.set_connection_timeout(5, 0); + cli.set_read_timeout(10, 0); + + auto res = cli.Get("/props"); + if (!res || res->status != 200) { + console::error("Failed to connect to server at %s\n", endpoint_url.c_str()); + if (res) { + console::error("HTTP status: %d\n", res->status); + } else { + console::error("Connection error: %s\n", httplib::to_string(res.error()).c_str()); + } + return false; + } + + auto data = json::parse(res->body); + model_name = json_value(data, "model_alias", std::string("unknown")); + build_info_ = json_value(data, "build_info", std::string("")); + + if (data.contains("modalities")) { + auto mods = data.at("modalities"); + has_vision_ = json_value(mods, "vision", false); + has_audio_ = json_value(mods, "audio", false); + } + + return true; + } catch (const std::exception & e) { + console::error("Failed to connect to server at %s: %s\n", endpoint_url.c_str(), e.what()); + return false; + } +} + +std::string cli_backend::get_model_name() const { return model_name; } +bool cli_backend::has_vision() const { return has_vision_; } +bool cli_backend::has_audio() const { return has_audio_; } +std::string cli_backend::get_build_info() const { return build_info_; } + +std::string cli_backend::generate_completion( + const json & messages, + const common_params & params, + bool verbose_prompt, + result_timings & out_timings) { + // build the OAI chat completion request + json request_body = { + {"messages", messages}, + {"stream", true}, + }; + + // sampling parameters + { + const auto & s = params.sampling; + if (s.temp != 0.8f) { request_body["temperature"] = s.temp; } + if (s.top_k != 40) { request_body["top_k"] = s.top_k; } + if (s.top_p != 0.95f) { request_body["top_p"] = s.top_p; } + if (s.min_p != 0.05f) { request_body["min_p"] = s.min_p; } + if (s.penalty_repeat != 1.0f) { request_body["repeat_penalty"] = s.penalty_repeat; } + if (s.penalty_present != 0.0f) { request_body["presence_penalty"] = s.penalty_present; } + if (s.penalty_freq != 0.0f) { request_body["frequency_penalty"] = s.penalty_freq; } + if (s.seed != LLAMA_DEFAULT_SEED) { request_body["seed"] = s.seed; } + } + + if (params.n_predict >= 0) { + request_body["max_tokens"] = params.n_predict; + } + + if (!params.antiprompt.empty()) { + request_body["stop"] = params.antiprompt; + } + + // reasoning budget + if (params.sampling.reasoning_budget_tokens >= 0) { + request_body["thinking_budget_tokens"] = params.sampling.reasoning_budget_tokens; + } + + // reasoning/thinking control via chat_template_kwargs + if (params.enable_reasoning == 0) { + request_body["chat_template_kwargs"] = json{{"enable_thinking", false}}; + } + + if (verbose_prompt) { + console::set_display(DISPLAY_TYPE_PROMPT); + console::log("POST /v1/chat/completions %s\n\n", request_body.dump().c_str()); + console::set_display(DISPLAY_TYPE_RESET); + } + + // do the HTTP request with SSE streaming + 
std::string curr_content; + bool is_thinking = false; + std::thread req_thread; + + try { + // Reset interrupt flag at start of new request + if (g_is_interrupted.load()) { + LOG_DBG("Resetting interrupt flag at start of new request"); + g_is_interrupted.store(false); + this->was_interrupted = true; + // Longer delay after interruption to let server clean up slot + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } + + // After interruption, poll server until ready + if (this->was_interrupted) { + LOG_DBG("Polling server slots after interruption"); + auto test_res = common_http_client(endpoint_url); + httplib::Client & test_cli = test_res.first; + test_cli.set_connection_timeout(2, 0); + int retries = 0; + bool connected = false; + while (retries < 100) { + auto res = test_cli.Get("/slots?fail_on_no_slot=1"); + if (res && (res->status == 200 || res->status == 404)) { + LOG_DBG("Server ready after %d retries (status=%d)", retries, res ? res->status : 0); + connected = true; + break; + } + // If cannot connect at all, server may have crashed + if (!res) { + LOG_DBG("Cannot connect to server (retry %d)", retries); + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + retries++; + } + this->was_interrupted = false; + if (!connected) { + LOG_DBG("Failed to reconnect to server after interruption"); + console::error("Server unavailable after interruption\n"); + return ""; + } + } + LOG_DBG("Connecting to %s", endpoint_url.c_str()); + auto http_res = common_http_client(endpoint_url); + LOG_DBG("Connected successfully"); + httplib::Client cli = std::move(http_res.first); + cli.set_connection_timeout(10, 0); + cli.set_read_timeout(600, 0); // 10 min read timeout for long generations + cli.set_write_timeout(10, 0); + + std::string body_str = request_body.dump(); + + auto done = std::make_shared>(false); + auto error_msg = std::make_shared(); + + httplib::Request req; + req.method = "POST"; + req.path = "/v1/chat/completions"; + req.set_header("Content-Type", "application/json"); + req.body = body_str; + + // collect chunks into a thread-safe queue + auto chunks = std::make_shared>(); + auto chunks_mutex = std::make_shared(); + auto chunks_cv = std::make_shared(); + + req.content_receiver = [chunks, chunks_mutex, chunks_cv](const char * data, size_t data_length, size_t, size_t) -> bool { + std::string chunk(data, data_length); + { + std::lock_guard lock(*chunks_mutex); + chunks->push_back(std::move(chunk)); + } + chunks_cv->notify_one(); + return true; + }; + + // send request in a separate thread + req_thread = std::thread([&cli, req = std::move(req), done, error_msg]() mutable { + LOG_DBG("HTTP request thread started"); + auto result = cli.send(req); + LOG_DBG("HTTP request done, error=%d", (int)result.error()); + // Don't set error if interrupted (Error::Read or Error::Connection expected) + if (result.error() != httplib::Error::Success) { + auto err_str = httplib::to_string(result.error()); + // Only set error if not a connection error (likely from interruption) + if (err_str.find("read") == std::string::npos && + err_str.find("connection") == std::string::npos) { + *error_msg = err_str; + LOG_DBG("HTTP error set: %s", error_msg->c_str()); + } else { + LOG_DBG("HTTP connection error (likely from interrupt): %s", err_str.c_str()); + } + } else if (result && result->status != 200) { + try { + auto err_json = json::parse(result->body); + if (err_json.contains("error") && err_json["error"].contains("message")) { + *error_msg = err_json["error"]["message"].get(); + } else { 
+ *error_msg = "HTTP " + std::to_string(result->status) + ": " + result->body.substr(0, 500); + } + } catch (...) { + *error_msg = "HTTP " + std::to_string(result->status) + ": " + result->body.substr(0, 500); + } + } + done->store(true); + }); + + // process SSE stream + console::spinner::start(); + + std::string sse_buffer; + bool first_chunk = true; + bool stream_done = false; + bool interrupted = false; + + while (!stream_done && !interrupted) { + // Check for interrupt at start of each iteration + if (g_is_interrupted.load()) { + LOG_DBG("Interrupt detected, stopping client"); + interrupted = true; + cli.stop(); + break; + } + + // wait for data + std::string new_data; + { + std::unique_lock lock(*chunks_mutex); + if (chunks->empty()) { + chunks_cv->wait_for(lock, std::chrono::milliseconds(100), [&] { + return !chunks->empty() || done->load() || g_is_interrupted.load(); + }); + if (chunks->empty()) { + if (done->load()) { + break; + } + if (g_is_interrupted.load()) { + interrupted = true; + cli.stop(); + break; + } + continue; + } + } + for (auto & chunk : *chunks) { + new_data += chunk; + } + chunks->clear(); + } + + sse_buffer += new_data; + + // process SSE lines + size_t pos; + while ((pos = sse_buffer.find("\n\n")) != std::string::npos) { + std::string event = sse_buffer.substr(0, pos); + sse_buffer.erase(0, pos + 2); + + // look for data: line + size_t data_pos = event.find("data: "); + if (data_pos == std::string::npos) { continue; } + + std::string data_line = event.substr(data_pos + 6); + + if (data_line == "[DONE]") { + stream_done = true; + break; + } + + try { + auto chunk_json = json::parse(data_line); + if (!chunk_json.contains("choices")) { continue; } + + auto & choices = chunk_json["choices"]; + if (choices.empty()) { continue; } + + // Extract timings if present (usually in the final chunk) + if (chunk_json.contains("timings")) { + auto & t = chunk_json["timings"]; + out_timings.prompt_n = json_value(t, "prompt_n", out_timings.prompt_n); + out_timings.prompt_ms = json_value(t, "prompt_ms", out_timings.prompt_ms); + out_timings.prompt_per_second = json_value(t, "prompt_per_second", out_timings.prompt_per_second); + out_timings.predicted_n = json_value(t, "predicted_n", out_timings.predicted_n); + out_timings.predicted_ms = json_value(t, "predicted_ms", out_timings.predicted_ms); + out_timings.predicted_per_second = json_value(t, "predicted_per_second", out_timings.predicted_per_second); + } + + // Check for completion + if (choices[0].contains("finish_reason") && !choices[0]["finish_reason"].is_null()) { + stream_done = true; + break; + } + + if (!choices[0].contains("delta")) { continue; } + + auto & delta = choices[0]["delta"]; + + if (first_chunk) { + first_chunk = false; + console::spinner::stop(); + } + + // Handle content (only display if not interrupted) + if (delta.contains("content") && !delta["content"].is_null()) { + std::string content_delta = delta["content"].get(); + curr_content += content_delta; + if (!interrupted) { + if (is_thinking) { + console::log("\n[End thinking]\n\n"); + console::set_display(DISPLAY_TYPE_RESET); + is_thinking = false; + } + console::log("%s", content_delta.c_str()); + console::flush(); + } + } + + // Handle reasoning_content (only display if not interrupted) + if (delta.contains("reasoning_content") && !delta["reasoning_content"].is_null()) { + std::string reasoning_delta = delta["reasoning_content"].get(); + if (!interrupted) { + console::set_display(DISPLAY_TYPE_REASONING); + if (!is_thinking) { + console::log("[Start 
thinking]\n"); + } + is_thinking = true; + console::log("%s", reasoning_delta.c_str()); + console::flush(); + } + } + } catch (const json::parse_error & e) { + // Ignore parsing errors for malformed chunks + LOG_DBG("Ignoring malformed chunk due to JSON parse error: %s\n", e.what()); + } + } + } + + console::spinner::stop(); + + // Wait for request thread to finish (if not already done) + if (req_thread.joinable()) { + req_thread.join(); + } + + // Reset interrupt flag if we were interrupted + if (interrupted) { + g_is_interrupted.store(false); + this->was_interrupted = true; + // Don't show error messages when user intentionally interrupted + // Give server a moment to clean up the aborted request + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + return curr_content; + } + + // Check for errors (only if not interrupted) + // When interrupted, we expect connection errors - don't show them + if (!interrupted && !error_msg->empty()) { + LOG_DBG("Error during request: %s", error_msg->c_str()); + console::error("Error: %s\n", error_msg->c_str()); + return curr_content; + } + + LOG_DBG("Request completed successfully, content length: %zu", curr_content.length()); + return curr_content; + } catch (const std::exception & e) { + console::spinner::stop(); + // Make sure to join the thread on exception + if (req_thread.joinable()) { + req_thread.join(); + } + // Don't show error if we were interrupted + if (!g_is_interrupted.load()) { + LOG_DBG("Exception during request: %s", e.what()); + console::error("Error in generate_completion: %s\n", e.what()); + } else { + LOG_DBG("Exception during request but interrupted, suppressing: %s", e.what()); + } + return curr_content; + } +} + +std::string cli_backend::load_text_file(const std::string & fname) { + std::ifstream file(fname, std::ios::binary); + if (!file) { + return ""; + } + return std::string((std::istreambuf_iterator(file)), std::istreambuf_iterator()); +} + +json cli_backend::load_media_file(const std::string & fname) { + std::ifstream file(fname, std::ios::binary); + if (!file) { + return json::object(); + } + + std::string data((std::istreambuf_iterator(file)), std::istreambuf_iterator()); + if (data.empty()) { + return json::object(); + } + + // detect file type from extension + std::string ext; + size_t dot_pos = fname.find_last_of('.'); + if (dot_pos != std::string::npos) { + ext = fname.substr(dot_pos); + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + } + + // base64 encode + std::string b64 = base64_encode(data); + + // Media type info struct + struct media_type_info { + std::string media_type; + std::string content_type; + }; + + // Map of extension to media type info + static const std::unordered_map MEDIA_TYPE_MAP = { + {".png", {"image/png", "image_url"}}, + {".jpg", {"image/jpeg", "image_url"}}, + {".jpeg", {"image/jpeg", "image_url"}}, + {".gif", {"image/gif", "image_url"}}, + {".webp", {"image/webp", "image_url"}}, + {".wav", {"audio/wav", "input_audio"}}, + {".mp3", {"audio/mpeg", "input_audio"}}, + }; + + const auto it = MEDIA_TYPE_MAP.find(ext); + if (it == MEDIA_TYPE_MAP.end()) { + // unknown type, treat as binary + return json::object(); + } + const auto & info = it->second; + + // build the OAI content part + if (info.content_type == "image_url") { + return json{ + {"type", "image_url"}, + {"image_url", { + {"url", "data:" + info.media_type + ";base64," + b64} + }} + }; + } + // audio + return json{ + {"type", "input_audio"}, + {"input_audio", { + {"data", b64}, + {"format", ext == ".mp3" ? 
"mp3" : "wav"} + }} + }; +} + +void cli_backend::terminate() { + // terminate subprocess if running local server + if (is_local_server && server_proc) { + subprocess_terminate(server_proc.get()); + subprocess_destroy(server_proc.get()); + server_proc.reset(); + is_local_server = false; + } +} diff --git a/tools/cli/cli-backend.h b/tools/cli/cli-backend.h new file mode 100644 index 00000000000..779abd20f70 --- /dev/null +++ b/tools/cli/cli-backend.h @@ -0,0 +1,82 @@ +#pragma once + +#include "common.h" +#include "console.h" + +#include + +// forward declaration for subprocess +struct subprocess_s; + +using json = nlohmann::ordered_json; + +// result_timings struct - copied from server-task.h for CLI use +struct result_timings { + int32_t cache_n = -1; + + int32_t prompt_n = -1; + double prompt_ms = 0.0; + double prompt_per_token_ms = 0.0; + double prompt_per_second = 0.0; + + int32_t predicted_n = -1; + double predicted_ms = 0.0; + double predicted_per_token_ms = 0.0; + double predicted_per_second = 0.0; + + // Optional speculative metrics + int32_t predicted_draft_n = -1; + double predicted_draft_ms = 0.0; + double draft_per_token_ms = 0.0; +}; + +// +// Backend interface — connects to llama-server via HTTP(S) +// Can connect to external server or spawn local server subprocess +// + +struct cli_backend { + std::string endpoint_url; + std::string model_name; + bool has_vision_ = false; + bool has_audio_ = false; + std::string build_info_; + + // subprocess management for local server + std::shared_ptr server_proc; + bool is_local_server = false; + bool was_interrupted = false; // track if last request was interrupted + + cli_backend(); + + // connect to a running server at endpoint_url + bool connect(); + + // spawn a local server subprocess + // argc/argv are the original command-line arguments + // CLI-specific arguments are filtered out, rest passed to llama-server + bool spawn_local_server(int argc, char ** argv); + + // model / server info + std::string get_model_name() const; + bool has_vision() const; + bool has_audio() const; + std::string get_build_info() const; + + // chat completion (streaming), returns assistant content text + std::string generate_completion( + const json & messages, + const common_params & params, + bool verbose_prompt, + result_timings & out_timings); + + // load a local text file, return its contents (empty string on failure) + static std::string load_text_file(const std::string & fname); + + // load a local media file, return the OAI content part JSON for it + // returns empty JSON object on failure + static json load_media_file(const std::string & fname); + + // cleanup - terminates subprocess if running local server + void terminate(); +}; diff --git a/tools/cli/cli-remote.cpp b/tools/cli/cli-remote.cpp deleted file mode 100644 index 06a39131bd0..00000000000 --- a/tools/cli/cli-remote.cpp +++ /dev/null @@ -1,469 +0,0 @@ -#include "cli-remote.h" - -#include "common.h" -#include "console.h" -#include "http.h" - -#include -#include -#include -#include -#include -#include -#include - -// shared with cli.cpp -extern std::atomic g_is_interrupted; - -// base64 encoding for multimodal content over HTTP -static std::string base64_encode(const std::string & in) { - static const char base64_chars[] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - std::string out; - out.reserve(4 * ((in.size() + 2) / 3)); - int i = 0; - int j = 0; - unsigned char arr3[3]; - unsigned char arr4[4]; - size_t in_len = in.size(); - const unsigned char * bytes = 
reinterpret_cast(in.data()); - while (in_len--) { - arr3[i++] = *(bytes++); - if (i == 3) { - arr4[0] = (arr3[0] & 0xfc) >> 2; - arr4[1] = ((arr3[0] & 0x03) << 4) + ((arr3[1] & 0xf0) >> 4); - arr4[2] = ((arr3[1] & 0x0f) << 2) + ((arr3[2] & 0xc0) >> 6); - arr4[3] = arr3[2] & 0x3f; - for (i = 0; i < 4; i++) { - out += base64_chars[arr4[i]]; - } - i = 0; - } - } - if (i) { - for (j = i; j < 3; j++) { - arr3[j] = '\0'; - } - arr4[0] = (arr3[0] & 0xfc) >> 2; - arr4[1] = ((arr3[0] & 0x03) << 4) + ((arr3[1] & 0xf0) >> 4); - arr4[2] = ((arr3[1] & 0x0f) << 2) + ((arr3[2] & 0xc0) >> 6); - for (j = 0; j < i + 1; j++) { - out += base64_chars[arr4[j]]; - } - while (i++ < 3) { - out += '='; - } - } - return out; -} - -// -// cli_backend_remote -// - -cli_backend_remote::cli_backend_remote(const std::string & url) : endpoint_url(url) { - // strip trailing slash - while (!endpoint_url.empty() && endpoint_url.back() == '/') { - endpoint_url.pop_back(); - } -} - -bool cli_backend_remote::connect() { - // query /props to get server metadata - try { - auto http_res = common_http_client(endpoint_url); - httplib::Client & cli = http_res.first; - cli.set_connection_timeout(5, 0); - cli.set_read_timeout(10, 0); - - auto res = cli.Get("/props"); - if (!res || res->status != 200) { - console::error("Failed to connect to server at %s\n", endpoint_url.c_str()); - if (res) { - console::error("HTTP status: %d\n", res->status); - } else { - console::error("Connection error: %s\n", httplib::to_string(res.error()).c_str()); - } - return false; - } - - auto data = json::parse(res->body); - model_name = json_value(data, "model_alias", std::string("unknown")); - build_info_ = json_value(data, "build_info", std::string("")); - - if (data.contains("modalities")) { - auto mods = data.at("modalities"); - has_vision_ = json_value(mods, "vision", false); - has_audio_ = json_value(mods, "audio", false); - } - - return true; - } catch (const std::exception & e) { - console::error("Failed to connect to server at %s: %s\n", endpoint_url.c_str(), e.what()); - return false; - } -} - -std::string cli_backend_remote::get_model_name() const { return model_name; } -bool cli_backend_remote::has_vision() const { return has_vision_; } -bool cli_backend_remote::has_audio() const { return has_audio_; } -std::string cli_backend_remote::get_build_info() const { return build_info_; } - -std::string cli_backend_remote::generate_completion( - const json & messages, - const common_params & params, - bool verbose_prompt, - result_timings & out_timings) { - // build the OAI chat completion request - json request_body = { - {"messages", messages}, - {"stream", true}, - }; - - // sampling parameters - { - auto & s = params.sampling; - if (s.temp != 0.8f) request_body["temperature"] = s.temp; - if (s.top_k != 40) request_body["top_k"] = s.top_k; - if (s.top_p != 0.95f) request_body["top_p"] = s.top_p; - if (s.min_p != 0.05f) request_body["min_p"] = s.min_p; - if (s.penalty_repeat != 1.0f) request_body["repeat_penalty"] = s.penalty_repeat; - if (s.penalty_present != 0.0f) request_body["presence_penalty"] = s.penalty_present; - if (s.penalty_freq != 0.0f) request_body["frequency_penalty"] = s.penalty_freq; - if (s.seed != LLAMA_DEFAULT_SEED) request_body["seed"] = s.seed; - } - - if (params.n_predict >= 0) { - request_body["max_tokens"] = params.n_predict; - } - - if (!params.antiprompt.empty()) { - request_body["stop"] = params.antiprompt; - } - - // reasoning budget - if (params.sampling.reasoning_budget_tokens >= 0) { - 
request_body["thinking_budget_tokens"] = params.sampling.reasoning_budget_tokens; - } - - // reasoning/thinking control via chat_template_kwargs - if (params.enable_reasoning == 0) { - request_body["chat_template_kwargs"] = json{{"enable_thinking", false}}; - } - - if (verbose_prompt) { - console::set_display(DISPLAY_TYPE_PROMPT); - console::log("POST /v1/chat/completions %s\n\n", request_body.dump().c_str()); - console::set_display(DISPLAY_TYPE_RESET); - } - - // do the HTTP request with SSE streaming - std::string curr_content; - bool is_thinking = false; - - try { - auto http_res = common_http_client(endpoint_url); - httplib::Client cli = std::move(http_res.first); - cli.set_connection_timeout(10, 0); - cli.set_read_timeout(600, 0); // 10 min read timeout for long generations - cli.set_write_timeout(10, 0); - - std::string body_str = request_body.dump(); - - auto done = std::make_shared>(false); - auto error_msg = std::make_shared(); - - httplib::Request req; - req.method = "POST"; - req.path = "/v1/chat/completions"; - req.set_header("Content-Type", "application/json"); - req.body = body_str; - - // collect chunks into a thread-safe queue - auto chunks = std::make_shared>(); - auto chunks_mutex = std::make_shared(); - auto chunks_cv = std::make_shared(); - - req.content_receiver = [chunks, chunks_mutex, chunks_cv](const char * data, size_t data_length, size_t, size_t) -> bool { - std::string chunk(data, data_length); - { - std::lock_guard lock(*chunks_mutex); - chunks->push_back(std::move(chunk)); - } - chunks_cv->notify_one(); - return true; - }; - - // send request in a separate thread - std::thread req_thread([&cli, req = std::move(req), done, error_msg]() mutable { - auto result = cli.send(std::move(req)); - if (result.error() != httplib::Error::Success) { - *error_msg = httplib::to_string(result.error()); - } else if (result && result->status != 200) { - try { - auto err_json = json::parse(result->body); - if (err_json.contains("error") && err_json["error"].contains("message")) { - *error_msg = err_json["error"]["message"].get(); - } else { - *error_msg = "HTTP " + std::to_string(result->status) + ": " + result->body.substr(0, 500); - } - } catch (...) 
{ - *error_msg = "HTTP " + std::to_string(result->status) + ": " + result->body.substr(0, 500); - } - } - done->store(true); - }); - req_thread.detach(); - - // process SSE stream - console::spinner::start(); - - std::string sse_buffer; - bool first_chunk = true; - bool stream_done = false; - - while (!stream_done) { - // wait for data - std::string new_data; - { - std::unique_lock lock(*chunks_mutex); - if (chunks->empty()) { - chunks_cv->wait_for(lock, std::chrono::milliseconds(100), [&] { - return !chunks->empty() || done->load() || g_is_interrupted.load(); - }); - if (chunks->empty()) { - if (done->load() || g_is_interrupted.load()) { - break; - } - continue; - } - } - for (auto & chunk : *chunks) { - new_data += chunk; - } - chunks->clear(); - } - - if (new_data.empty()) { - continue; - } - - sse_buffer += new_data; - - // process complete SSE lines - // SSE format: "data: {json}\n\n" or "data: [DONE]\n\n" - size_t pos = 0; - while (pos < sse_buffer.size() && !stream_done) { - // find "data: " prefix - size_t data_pos = sse_buffer.find("data: ", pos); - if (data_pos == std::string::npos) { - break; - } - - // find the end of this line - size_t line_end = sse_buffer.find('\n', data_pos); - if (line_end == std::string::npos) { - // incomplete line, wait for more data - break; - } - - std::string data_str = sse_buffer.substr(data_pos + 6, line_end - data_pos - 6); - // trim whitespace - while (!data_str.empty() && (data_str.back() == '\n' || data_str.back() == '\r' || data_str.back() == ' ')) { - data_str.pop_back(); - } - - pos = line_end + 1; - - if (first_chunk) { - console::spinner::stop(); - first_chunk = false; - } - - if (data_str == "[DONE]") { - stream_done = true; - break; - } - - if (data_str.empty()) { - continue; - } - - // parse the JSON chunk - try { - auto chunk_json = json::parse(data_str); - - // check for error - if (chunk_json.contains("error")) { - auto & err = chunk_json["error"]; - if (err.is_object() && err.contains("message")) { - console::error("Error: %s\n", err["message"].get().c_str()); - } else { - console::error("Error: %s\n", err.dump().c_str()); - } - stream_done = true; - break; - } - - if (!chunk_json.contains("choices") || !chunk_json["choices"].is_array() || chunk_json["choices"].empty()) { - continue; - } - - auto & choice = chunk_json["choices"][0]; - - // check finish_reason - if (choice.contains("finish_reason") && !choice["finish_reason"].is_null()) { - // extract timing info if present - if (chunk_json.contains("timings")) { - auto & t = chunk_json["timings"]; - out_timings.prompt_n = json_value(t, "prompt_n", -1); - out_timings.prompt_ms = json_value(t, "prompt_ms", 0.0); - out_timings.prompt_per_second = json_value(t, "prompt_per_second", 0.0); - out_timings.predicted_n = json_value(t, "predicted_n", -1); - out_timings.predicted_ms = json_value(t, "predicted_ms", 0.0); - out_timings.predicted_per_second = json_value(t, "predicted_per_second", 0.0); - } - stream_done = true; - break; - } - - // extract delta - if (!choice.contains("delta")) { - continue; - } - auto & delta = choice["delta"]; - - // extract usage/timings from streaming chunks - if (chunk_json.contains("timings")) { - auto & t = chunk_json["timings"]; - out_timings.prompt_n = json_value(t, "prompt_n", out_timings.prompt_n); - out_timings.prompt_ms = json_value(t, "prompt_ms", out_timings.prompt_ms); - out_timings.prompt_per_second = json_value(t, "prompt_per_second", out_timings.prompt_per_second); - out_timings.predicted_n = json_value(t, "predicted_n", out_timings.predicted_n); 
- out_timings.predicted_ms = json_value(t, "predicted_ms", out_timings.predicted_ms); - out_timings.predicted_per_second = json_value(t, "predicted_per_second", out_timings.predicted_per_second); - } - - // content delta - if (delta.contains("content") && delta["content"].is_string()) { - std::string content_delta = delta["content"].get(); - if (!content_delta.empty()) { - if (is_thinking) { - console::log("\n[End thinking]\n\n"); - console::set_display(DISPLAY_TYPE_RESET); - is_thinking = false; - } - curr_content += content_delta; - console::log("%s", content_delta.c_str()); - console::flush(); - } - } - - // reasoning content delta - if (delta.contains("reasoning_content") && delta["reasoning_content"].is_string()) { - std::string reasoning_delta = delta["reasoning_content"].get(); - if (!reasoning_delta.empty()) { - console::set_display(DISPLAY_TYPE_REASONING); - if (!is_thinking) { - console::log("[Start thinking]\n"); - } - is_thinking = true; - console::log("%s", reasoning_delta.c_str()); - console::flush(); - } - } - } catch (const json::parse_error &) { - // skip malformed JSON - continue; - } - - if (g_is_interrupted.load()) { - stream_done = true; - break; - } - } - - // keep unprocessed part of the buffer - sse_buffer = sse_buffer.substr(pos); - } - - if (first_chunk) { - console::spinner::stop(); - } - - if (!error_msg->empty()) { - console::error("Error: %s\n", error_msg->c_str()); - } - - } catch (const std::exception & e) { - console::spinner::stop(); - console::error("HTTP error: %s\n", e.what()); - } - - g_is_interrupted.store(false); - return curr_content; -} - -std::string cli_backend_remote::load_text_file(const std::string & fname) { - std::ifstream file(fname, std::ios::binary); - if (!file) { - return ""; - } - return std::string((std::istreambuf_iterator(file)), std::istreambuf_iterator()); -} - -json cli_backend_remote::load_media_file(const std::string & fname) { - std::ifstream file(fname, std::ios::binary); - if (!file) { - return json::object(); - } - - // read the file - std::string file_data((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - std::string b64 = base64_encode(file_data); - - // determine media type from extension - std::string media_type = "image/png"; // default - std::string content_type = "image_url"; // OAI content part type - auto ext = std::filesystem::path(fname).extension().string(); - std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); - - if (ext == ".jpg" || ext == ".jpeg") { - media_type = "image/jpeg"; - } else if (ext == ".png") { - media_type = "image/png"; - } else if (ext == ".gif") { - media_type = "image/gif"; - } else if (ext == ".webp") { - media_type = "image/webp"; - } else if (ext == ".wav") { - media_type = "audio/wav"; - content_type = "input_audio"; - } else if (ext == ".mp3") { - media_type = "audio/mpeg"; - content_type = "input_audio"; - } - - // build the OAI content part - if (content_type == "image_url") { - return json{ - {"type", "image_url"}, - {"image_url", { - {"url", "data:" + media_type + ";base64," + b64} - }} - }; - } else { - // audio - return json{ - {"type", "input_audio"}, - {"input_audio", { - {"data", b64}, - {"format", ext == ".mp3" ? 
"mp3" : "wav"} - }} - }; - } -} - -void cli_backend_remote::terminate() { - // nothing to do for remote — just return -} diff --git a/tools/cli/cli-remote.h b/tools/cli/cli-remote.h deleted file mode 100644 index f545f5969b7..00000000000 --- a/tools/cli/cli-remote.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include "common.h" -#include "console.h" - -#include "server-task.h" - -// -// Backend interface — abstracts local (in-process) vs remote (HTTP) server -// - -struct cli_backend { - virtual ~cli_backend() = default; - - // model / server info - virtual std::string get_model_name() const = 0; - virtual bool has_vision() const = 0; - virtual bool has_audio() const = 0; - virtual std::string get_build_info() const = 0; - - // chat completion (streaming), returns assistant content text - virtual std::string generate_completion( - const json & messages, - const common_params & params, - bool verbose_prompt, - result_timings & out_timings) = 0; - - // load a local text file, return its contents (empty string on failure) - virtual std::string load_text_file(const std::string & fname) = 0; - - // load a local media file, return the OAI content part JSON for it - // returns empty JSON object on failure - virtual json load_media_file(const std::string & fname) = 0; - - // cleanup - virtual void terminate() = 0; -}; - -// -// Remote backend — connects to a running llama-server via HTTP(S) -// - -struct cli_backend_remote : cli_backend { - std::string endpoint_url; - std::string model_name; - bool has_vision_ = false; - bool has_audio_ = false; - std::string build_info_; - - cli_backend_remote(const std::string & url); - - bool connect(); - - std::string get_model_name() const override; - bool has_vision() const override; - bool has_audio() const override; - std::string get_build_info() const override; - - std::string generate_completion( - const json & messages, - const common_params & params, - bool verbose_prompt, - result_timings & out_timings) override; - - std::string load_text_file(const std::string & fname) override; - json load_media_file(const std::string & fname) override; - - void terminate() override; -}; diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 87d9c7fa487..e1e68493064 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -2,14 +2,11 @@ #include "common.h" #include "arg.h" #include "console.h" -#include "fit.h" -// #include "log.h" +#include "log.h" -#include "server-common.h" -#include "server-context.h" -#include "server-task.h" +#include -#include "cli-remote.h" +#include "cli-backend.h" #include #include @@ -55,224 +52,6 @@ static void signal_handler(int) { } #endif -// -// Local backend — wraps server_context (original behavior) -// - -struct cli_backend_local : cli_backend { - server_context ctx_server; - std::vector input_files; - json chat_messages = json::array(); - task_params defaults; - bool verbose_prompt; - std::thread inference_thread; - - cli_backend_local(const common_params & params) { - defaults.sampling = params.sampling; - defaults.speculative = params.speculative; - defaults.n_keep = params.n_keep; - defaults.n_predict = params.n_predict; - defaults.antiprompt = params.antiprompt; - - defaults.stream = true; - defaults.timings_per_token = true; - - verbose_prompt = params.verbose_prompt; - } - - bool load_model(common_params & params) { - return ctx_server.load_model(params); - } - - void start_loop() { - inference_thread = std::thread([this]() { - ctx_server.start_loop(); - }); - } - - std::string get_model_name() const override { - 
return ctx_server.get_meta().model_name; - } - - bool has_vision() const override { - return ctx_server.get_meta().has_inp_image; - } - - bool has_audio() const override { - return ctx_server.get_meta().has_inp_audio; - } - - std::string get_build_info() const override { - return ctx_server.get_meta().build_info; - } - - std::string generate_completion( - const json & messages, - const common_params & /*params*/, - bool /*verbose_prompt_arg*/, - result_timings & out_timings) override { - chat_messages = messages; - server_response_reader rd = ctx_server.get_response_reader(); - auto chat_params = format_chat(); - { - server_task task = server_task(SERVER_TASK_TYPE_COMPLETION); - task.id = rd.get_new_id(); - task.index = 0; - task.params = defaults; - task.cli_prompt = chat_params.prompt; - task.cli_files = input_files; - task.cli = true; - - task.params.chat_parser_params = common_chat_parser_params(chat_params); - task.params.chat_parser_params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; - if (!chat_params.parser.empty()) { - task.params.chat_parser_params.parser.load(chat_params.parser); - } - - if (!chat_params.thinking_end_tag.empty()) { - const llama_vocab * vocab = llama_model_get_vocab( - llama_get_model(ctx_server.get_llama_context())); - - task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens; - task.params.sampling.generation_prompt = chat_params.generation_prompt; - - if (!chat_params.thinking_start_tag.empty()) { - task.params.sampling.reasoning_budget_start = - common_tokenize(vocab, chat_params.thinking_start_tag, false, true); - } - task.params.sampling.reasoning_budget_end = - common_tokenize(vocab, chat_params.thinking_end_tag, false, true); - task.params.sampling.reasoning_budget_forced = - common_tokenize(vocab, defaults.sampling.reasoning_budget_message + chat_params.thinking_end_tag, false, true); - } - - rd.post_task({std::move(task)}); - } - - if (verbose_prompt) { - console::set_display(DISPLAY_TYPE_PROMPT); - console::log("%s\n\n", chat_params.prompt.c_str()); - console::set_display(DISPLAY_TYPE_RESET); - } - - console::spinner::start(); - server_task_result_ptr result = rd.next(should_stop); - - console::spinner::stop(); - std::string curr_content; - bool is_thinking = false; - - while (result) { - if (should_stop()) { - break; - } - if (result->is_error()) { - json err_data = result->to_json(); - if (err_data.contains("message")) { - console::error("Error: %s\n", err_data["message"].get().c_str()); - } else { - console::error("Error: %s\n", err_data.dump().c_str()); - } - return curr_content; - } - auto res_partial = dynamic_cast(result.get()); - if (res_partial) { - out_timings = std::move(res_partial->timings); - for (const auto & diff : res_partial->oaicompat_msg_diffs) { - if (!diff.content_delta.empty()) { - if (is_thinking) { - console::log("\n[End thinking]\n\n"); - console::set_display(DISPLAY_TYPE_RESET); - is_thinking = false; - } - curr_content += diff.content_delta; - console::log("%s", diff.content_delta.c_str()); - console::flush(); - } - if (!diff.reasoning_content_delta.empty()) { - console::set_display(DISPLAY_TYPE_REASONING); - if (!is_thinking) { - console::log("[Start thinking]\n"); - } - is_thinking = true; - console::log("%s", diff.reasoning_content_delta.c_str()); - console::flush(); - } - } - } - auto res_final = dynamic_cast(result.get()); - if (res_final) { - out_timings = std::move(res_final->timings); - break; - } - result = rd.next(should_stop); - } - g_is_interrupted.store(false); - return 
curr_content; - } - - std::string load_text_file(const std::string & fname) override { - std::ifstream file(fname, std::ios::binary); - if (!file) { - return ""; - } - return std::string((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - } - - json load_media_file(const std::string & fname) override { - std::ifstream file(fname, std::ios::binary); - if (!file) { - return json::object(); - } - // for local backend, we just load into input_files and return the marker - raw_buffer buf; - buf.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); - - // detect media type for HTTP compatibility (also stored for local use) - input_files.push_back(std::move(buf)); - // local backend doesn't use the JSON content part — it uses input_files directly - // but we return a non-empty JSON so the caller knows it succeeded - return json{{"loaded", true}}; - } - - common_chat_params format_chat() { - auto meta = ctx_server.get_meta(); - auto & chat_params = meta.chat_params; - - auto caps = common_chat_templates_get_caps(chat_params.tmpls.get()); - - common_chat_templates_inputs inputs; - inputs.messages = common_chat_msgs_parse_oaicompat(chat_messages); - inputs.tools = {}; - inputs.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; - inputs.json_schema = ""; - inputs.grammar = ""; - inputs.use_jinja = chat_params.use_jinja; - inputs.parallel_tool_calls = caps["supports_parallel_tool_calls"]; - inputs.add_generation_prompt = true; - inputs.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; - inputs.force_pure_content = chat_params.force_pure_content; - inputs.enable_thinking = chat_params.enable_thinking ? common_chat_templates_support_enable_thinking(chat_params.tmpls.get()) : false; - - return common_chat_templates_apply(chat_params.tmpls.get(), inputs); - } - - void terminate() override { - ctx_server.terminate(); - if (inference_thread.joinable()) { - inference_thread.join(); - } - } - - llama_context * get_llama_context() const { - return ctx_server.get_llama_context(); - } - - server_context_meta get_meta() const { - return ctx_server.get_meta(); - } -}; - // TODO?: Make this reusable, enums, docs static const std::array cmds = { "/audio ", @@ -409,8 +188,6 @@ int main(int argc, char ** argv) { console::error("please use llama-completion instead\n"); } - const bool is_remote = !params.endpoint.empty(); - // TODO: avoid using atexit() here by making `console` a singleton console::init(params.simple_io, params.use_color); atexit([]() { console::cleanup(); }); @@ -434,42 +211,33 @@ int main(int argc, char ** argv) { // shared state between backend and the interactive loop json messages = json::array(); - // for local backend: media files pending for the current user message - // for remote backend: OAI content parts for media std::vector pending_media_parts; + bool was_generating = false; - std::unique_ptr backend; + std::unique_ptr backend = std::make_unique(); - if (is_remote) { - auto remote = std::make_unique(params.endpoint); - console::log("\nConnecting to %s ... ", params.endpoint.c_str()); + // Connect to server - either external or spawn local + if (!params.endpoint.empty()) { + backend->endpoint_url = params.endpoint; + console::log("\nConnecting to %s ... 
", backend->endpoint_url.c_str()); console::spinner::start(); - if (!remote->connect()) { + if (!backend->connect()) { console::spinner::stop(); console::error("\nFailed to connect to server\n"); return 1; } console::spinner::stop(); console::log("connected\n"); - backend = std::move(remote); } else { - auto local = std::make_unique(params); - - llama_backend_init(); - llama_numa_init(params.numa); - - console::log("\nLoading model... "); // followed by loading animation + console::log("\nStarting local server... "); console::spinner::start(); - if (!local->load_model(params)) { + if (!backend->spawn_local_server(argc, argv)) { console::spinner::stop(); - console::error("\nFailed to load the model\n"); + console::error("\nFailed to start local server\n"); return 1; } console::spinner::stop(); - console::log("\n"); - - local->start_loop(); - backend = std::move(local); + console::log("started on %s\n", backend->endpoint_url.c_str()); } std::string modalities = "text"; @@ -492,9 +260,7 @@ int main(int argc, char ** argv) { console::log("\n"); console::log("%s\n", LLAMA_ASCII_LOGO); - if (is_remote) { - console::log("server : %s\n", params.endpoint.c_str()); - } + console::log("server : %s\n", backend->endpoint_url.c_str()); console::log("build : %s\n", backend->get_build_info().c_str()); console::log("model : %s\n", backend->get_model_name().c_str()); console::log("modalities : %s\n", modalities.c_str()); @@ -568,12 +334,7 @@ int main(int argc, char ** argv) { console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); break; } - if (is_remote) { - pending_media_parts.push_back(media_part); - } else { - // local backend: use the media marker - cur_msg += mtmd_default_marker(); - } + pending_media_parts.push_back(media_part); console::log("Loaded media from '%s'\n", fname.c_str()); } buffer = params.prompt; @@ -587,8 +348,15 @@ int main(int argc, char ** argv) { console::set_display(DISPLAY_TYPE_RESET); console::log("\n"); + // Check for interrupt if (should_stop()) { g_is_interrupted.store(false); + if (was_generating) { + // Interrupt during generation: cancel it, continue chatting + was_generating = false; + continue; + } + // Interrupt at prompt: exit break; } @@ -620,10 +388,6 @@ int main(int argc, char ** argv) { messages.clear(); add_system_prompt(); pending_media_parts.clear(); - if (!is_remote) { - auto * local = static_cast(backend.get()); - local->input_files.clear(); - } console::log("Chat history cleared.\n"); continue; } else if ( @@ -635,11 +399,7 @@ int main(int argc, char ** argv) { console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str()); continue; } - if (is_remote) { - pending_media_parts.push_back(media_part); - } else { - cur_msg += mtmd_default_marker(); - } + pending_media_parts.push_back(media_part); console::log("Loaded media from '%s'\n", fname.c_str()); continue; } else if (string_starts_with(buffer, "/read ")) { @@ -706,38 +466,35 @@ int main(int argc, char ** argv) { // generate response if (add_user_msg) { - if (is_remote) { - // for remote, build OAI content with media parts - json user_content = build_user_content(cur_msg, pending_media_parts); - if (pending_media_parts.empty()) { - // simple text message - messages.push_back({ - {"role", "user"}, - {"content", cur_msg} - }); - } else { - // multipart message - messages.push_back({ - {"role", "user"}, - {"content", user_content} - }); - } - pending_media_parts.clear(); - } else { - // local backend: media markers are embedded in the text + // always use multipart 
content for consistency + json user_content = build_user_content(cur_msg, pending_media_parts); + if (pending_media_parts.empty()) { + // simple text message messages.push_back({ {"role", "user"}, {"content", cur_msg} }); + } else { + // multipart message + messages.push_back({ + {"role", "user"}, + {"content", user_content} + }); } + pending_media_parts.clear(); cur_msg.clear(); } result_timings timings; + was_generating = true; std::string assistant_content = backend->generate_completion(messages, params, params.verbose_prompt, timings); - messages.push_back({ - {"role", "assistant"}, - {"content", assistant_content} - }); + was_generating = false; + // Only add assistant message if we got content (not interrupted/error) + if (!assistant_content.empty() || !g_is_interrupted.load()) { + messages.push_back({ + {"role", "assistant"}, + {"content", assistant_content} + }); + } console::log("\n"); if (params.show_timings) { @@ -759,14 +516,5 @@ int main(int argc, char ** argv) { console::log("\nExiting...\n"); backend->terminate(); - if (!is_remote) { - // bump the log level to display timings - common_log_set_verbosity_thold(LOG_LEVEL_INFO); - auto * local = static_cast(backend.get()); - common_memory_breakdown_print(local->get_llama_context()); - - llama_backend_free(); - } - return 0; } diff --git a/vendor/sheredom/subprocess.h b/vendor/sheredom/subprocess.h index 3e40bae046a..50be09b9ae5 100644 --- a/vendor/sheredom/subprocess.h +++ b/vendor/sheredom/subprocess.h @@ -89,7 +89,12 @@ enum subprocess_option_e { // Search for program names in the PATH variable. Always enabled on Windows. // Note: this will **not** search for paths in any provided custom environment // and instead uses the PATH of the spawning process. - subprocess_option_search_user_path = 0x10 + subprocess_option_search_user_path = 0x10, + + // Spawn the child in a new session/process group so that terminal signals + // (e.g. SIGINT from Ctrl+C) are not forwarded to the child process. + // On Unix this uses setsid(); on Windows it uses CREATE_NEW_PROCESS_GROUP. 
+ subprocess_option_new_session = 0x20 }; #if defined(__cplusplus) @@ -529,6 +534,10 @@ int subprocess_create_ex(const char *const commandLine[], int options, flags |= createNoWindow; } + if (subprocess_option_new_session == (options & subprocess_option_new_session)) { + flags |= CREATE_NEW_PROCESS_GROUP; + } + if (subprocess_option_inherit_environment != (options & subprocess_option_inherit_environment)) { if (SUBPROCESS_NULL == environment) { @@ -873,22 +882,36 @@ int subprocess_create_ex(const char *const commandLine[], int options, #pragma clang diagnostic ignored "-Wcast-qual" #pragma clang diagnostic ignored "-Wold-style-cast" #endif + // Prepare spawn attributes if new session is requested + posix_spawnattr_t *attrp = SUBPROCESS_NULL; + posix_spawnattr_t attr; + if (subprocess_option_new_session == (options & subprocess_option_new_session)) { + posix_spawnattr_init(&attr); + short pflags = POSIX_SPAWN_SETSID; + posix_spawnattr_setflags(&attr, pflags); + attrp = &attr; + } + if (subprocess_option_search_user_path == (options & subprocess_option_search_user_path)) { - if (0 != posix_spawnp(&child, commandLine[0], &actions, SUBPROCESS_NULL, + if (0 != posix_spawnp(&child, commandLine[0], &actions, attrp, SUBPROCESS_CONST_CAST(char *const *, commandLine), used_environment)) { + if (attrp) posix_spawnattr_destroy(attrp); posix_spawn_file_actions_destroy(&actions); return -1; } } else { - if (0 != posix_spawn(&child, commandLine[0], &actions, SUBPROCESS_NULL, + if (0 != posix_spawn(&child, commandLine[0], &actions, attrp, SUBPROCESS_CONST_CAST(char *const *, commandLine), used_environment)) { + if (attrp) posix_spawnattr_destroy(attrp); posix_spawn_file_actions_destroy(&actions); return -1; } } + + if (attrp) posix_spawnattr_destroy(attrp); #ifdef __clang__ #pragma clang diagnostic pop #endif From edc5863eaea34db5f67d938e659f1311394334dd Mon Sep 17 00:00:00 2001 From: Piotr Wilkin Date: Fri, 1 May 2026 20:23:29 +0200 Subject: [PATCH 4/4] fix compile errors --- vendor/sheredom/subprocess.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vendor/sheredom/subprocess.h b/vendor/sheredom/subprocess.h index 50be09b9ae5..b95a9e70c80 100644 --- a/vendor/sheredom/subprocess.h +++ b/vendor/sheredom/subprocess.h @@ -504,6 +504,7 @@ int subprocess_create_ex(const char *const commandLine[], int options, const unsigned long startFUseStdHandles = 0x00000100; const unsigned long handleFlagInherit = 0x00000001; const unsigned long createNoWindow = 0x08000000; + const unsigned long createNewProcessGroup = 0x00000200; struct subprocess_subprocess_information_s processInfo; struct subprocess_security_attributes_s saAttr = {sizeof(saAttr), SUBPROCESS_NULL, 1}; @@ -535,7 +536,7 @@ int subprocess_create_ex(const char *const commandLine[], int options, } if (subprocess_option_new_session == (options & subprocess_option_new_session)) { - flags |= CREATE_NEW_PROCESS_GROUP; + flags |= createNewProcessGroup; } if (subprocess_option_inherit_environment !=