slop: add preloading of models

meh · meh · commit 4e6cd75d9757 · 2026-05-06T19:38:56.000+02:00
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -3201,6 +3201,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.models_autoload = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
+    add_opt(common_arg(
+        {"--models-cache"},
+        "for server, cache these model GGUF files in page cache on startup. "
+        "If no argument is given, cache all models.",
+        [](common_params & params) {
+            // Handler takes no value: an empty models_cache string stands for "cache all models"
+            params.models_cache = "";
+            // NOTE(review): env var is read by hand although set_env() is also applied below; this looks like the only way a model list can be supplied - confirm
+            const char * env_val = std::getenv("LLAMA_ARG_MODELS_CACHE");
+            if (env_val != nullptr) {
+                params.models_cache = env_val;
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_CACHE"));
     add_opt(common_arg(
         {"--kv-cache-mode"}, "MODE",
         string_format("KV cache strategy for multi-model support: \"pool\" (pre-allocated per-model, default) or \"realloc\" (reallocate on swap)"),
@@ -4241,6 +4255,12 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
         [](common_params &, int) { /* unused */ }
     ).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
 
+    args.push_back(common_arg(
+        {"cache-on-startup"}, "NAME",
+        "cache this model's GGUF file in page cache on startup (for fast swapping)",
+        [](common_params &, const std::string &) { /* unused */ }
+    ).set_env(COMMON_ARG_PRESET_CACHE_ON_STARTUP).set_preset_only());
+
     // args.push_back(common_arg(
     //     {"pin"},
     //     "in server router mode, do not unload this model if models_max is exceeded",
diff --git a/common/arg.h b/common/arg.h
@@ -9,8 +9,9 @@
 #include <cstring>
 
 // pseudo-env variable to identify preset-only arguments
-#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
-#define COMMON_ARG_PRESET_STOP_TIMEOUT    "__PRESET_STOP_TIMEOUT"
+#define COMMON_ARG_PRESET_LOAD_ON_STARTUP  "__PRESET_LOAD_ON_STARTUP"
+#define COMMON_ARG_PRESET_STOP_TIMEOUT     "__PRESET_STOP_TIMEOUT"
+#define COMMON_ARG_PRESET_CACHE_ON_STARTUP "__PRESET_CACHE_ON_STARTUP"
 
 //
 // CLI argument parsing
diff --git a/common/common.h b/common/common.h
@@ -293,6 +293,7 @@ struct common_params_model {
     std::string hf_file     = ""; // HF file                                                // NOLINT
     std::string docker_repo = ""; // Docker repo                                            // NOLINT
     std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
+    bool cache              = false; // cache GGUF file in page cache on startup             // NOLINT
 };
 
 struct common_ngram_mod;
@@ -632,6 +633,7 @@ struct common_params {
     int models_max = 4;             // maximum number of models to load simultaneously
     bool models_autoload = true;    // automatically load models when requested via the router server
     std::string kv_cache_mode = "pool"; // KV cache strategy for multi-model: "pool" or "realloc"
+    std::string models_cache = "";  // cache GGUF files in page cache on startup (comma-separated names; empty = all)
 
     bool log_json = false;
 
diff --git a/tools/server/README.md b/tools/server/README.md
@@ -213,6 +213,7 @@ For the full list of features, please refer to [server's changelog](https://gith
 | `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_PRESET) |
 | `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
 | `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)<br/>(env: LLAMA_ARG_MODELS_AUTOLOAD) |
+| `--models-cache [LIST]` | cache GGUF files in page cache for fast model swapping (non-router mode). No argument: cache all models. Comma-separated list: cache only specified models.<br/>(env: LLAMA_ARG_MODELS_CACHE) |
 | `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
 | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
 | `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
@@ -1599,6 +1600,7 @@ The precedence rule for preset options is as follows:
 
 We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
 - `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
+- `cache-on-startup` (boolean): Controls whether the model's GGUF file is cached in page cache on startup
 - `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10)
 
 ### Routing requests
@@ -1732,6 +1734,33 @@ Response:
 }
 ```
 
+### POST `/models/cache`: Cache a model's GGUF file
+
+Cache a model's GGUF file in the OS page cache (RAM) for fast model swapping. The file is read into the page cache using `mmap` with a `POSIX_MADV_WILLNEED` hint (Linux/macOS) or `PrefetchVirtualMemory` (Windows); the model itself is not loaded, so only the file's pages occupy memory.
+
+Payload:
+
+```json
+{
+  "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
+}
+```
+
+Response:
+
+```json
+{
+  "success": true
+}
+```
+
+**Notes:**
+- The `cached` field in the `/models` response indicates whether a model's file has been cached in page cache.
+- Use `--models-cache` CLI flag or `cache-on-startup` preset option to cache models automatically on startup.
+- `--models-cache` (no argument): caches all registered models.
+- `--models-cache modelA,modelB`: caches only the specified models.
+- Page cache warming uses `mmap` with a `POSIX_MADV_WILLNEED` hint (Linux/macOS) or `PrefetchVirtualMemory` (Windows); it only reads the file and does not load the model for inference.
+
 ## API errors
 
 `llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
@@ -13,6 +13,13 @@
 #include <sstream>
 #include <fstream>
 
+#if defined(_WIN32)
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+
 json format_error_response(const std::string & message, const enum error_type type) {
     std::string type_str;
     int code = 500;
@@ -1584,3 +1591,113 @@ server_tokens format_prompt_rerank(
 
     return result;
 }
+
+//
+// Model cache: fill the OS page cache for a GGUF file so model swapping is fast
+//
+
+// Warm the OS page cache with the contents of the file at `path`.
+//
+// The file is mapped read-only and the kernel is asked to bring its pages in:
+// MAP_POPULATE on Linux, posix_madvise(POSIX_MADV_WILLNEED) where that constant is
+// defined, PrefetchVirtualMemory on Windows. The mapping is released before returning.
+//
+// Returns true only when the file was opened, sized and mapped (and, on Windows,
+// prefetched). Every failure is logged and reported as false so that callers do
+// not mark a model as cached when nothing was read.
+static bool cache_model_file_impl(const std::string & path) {
+    FILE * file = ggml_fopen(path.c_str(), "rb");
+    if (!file) {
+        SRV_WRN("failed to open GGUF file '%s' for caching: %s\n", path.c_str(), strerror(errno));
+        return false;
+    }
+
+    // Get file size. ftell() returns a long, which is 32 bits wide on Windows and
+    // cannot represent GGUF files above 2 GiB, so use the 64-bit variants there.
+#if defined(_WIN32)
+    const long long end_pos = _fseeki64(file, 0, SEEK_END) == 0 ? (long long) _ftelli64(file) : -1;
+#else
+    const long long end_pos = fseek(file, 0, SEEK_END) == 0 ? (long long) ftell(file) : -1;
+#endif
+    if (end_pos < 0) {
+        SRV_WRN("failed to determine size of GGUF file '%s': %s\n", path.c_str(), strerror(errno));
+        fclose(file);
+        return false;
+    }
+
+    const size_t file_size = (size_t) end_pos;
+    if (file_size == 0) {
+        SRV_WRN("GGUF file '%s' is empty\n", path.c_str());
+        fclose(file);
+        return false;
+    }
+
+    SRV_INF("caching GGUF file '%s' (%zu MiB)\n", path.c_str(), file_size / (1024 * 1024));
+
+    bool warmed = false;
+
+#if defined(_WIN32)
+    {
+        HANDLE hFile = (HANDLE)_get_osfhandle(_fileno(file));
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+        if (hMapping) {
+            void * addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+            if (addr) {
+                // PrefetchVirtualMemory is available from Windows 8 / Server 2012 on; resolve it at runtime
+#if _WIN32_WINNT >= 0x602
+                BOOL (WINAPI *pPrefetchVirtualMemory)(HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG) = nullptr;
+                HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
+                if (hKernel32) {
+                    pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *)GetProcAddress(hKernel32, "PrefetchVirtualMemory");
+                }
+                if (pPrefetchVirtualMemory) {
+                    WIN32_MEMORY_RANGE_ENTRY range;
+                    range.VirtualAddress = addr;
+                    // prefetch the whole mapping; a fixed byte cap would leave the tail of large files cold
+                    range.NumberOfBytes = (SIZE_T)file_size;
+                    warmed = pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0) != 0;
+                }
+#endif
+                UnmapViewOfFile(addr);
+            }
+            CloseHandle(hMapping);
+        }
+    }
+#else
+    {
+        // Map once; on Linux MAP_POPULATE makes mmap() itself read the pages in
+        int map_flags = MAP_SHARED;
+#ifdef __linux__
+        map_flags |= MAP_POPULATE;
+#endif
+        void * addr = mmap(NULL, file_size, PROT_READ, map_flags, fileno(file), 0);
+        if (addr == MAP_FAILED) {
+            SRV_WRN("failed to mmap GGUF file '%s' for caching: %s\n", path.c_str(), strerror(errno));
+        } else {
+            warmed = true;
+#ifdef POSIX_MADV_WILLNEED
+            // POSIX_MADV_WILLNEED is the constant of posix_madvise(), which returns the error
+            // number directly; the call is only a hint, so a failure is logged and not fatal
+            const int adv_err = posix_madvise(addr, file_size, POSIX_MADV_WILLNEED);
+            if (adv_err != 0) {
+                SRV_WRN("posix_madvise(POSIX_MADV_WILLNEED) failed for '%s': %s\n", path.c_str(), strerror(adv_err));
+            }
+#endif
+            munmap(addr, file_size);
+        }
+    }
+#endif
+
+    fclose(file);
+    if (warmed) {
+        SRV_INF("caching GGUF file '%s' done\n", path.c_str());
+    } else {
+        SRV_WRN("caching GGUF file '%s' failed\n", path.c_str());
+    }
+    return warmed;
+}
+
+// Public entry point (declared in server-common.h); forwards to the file-local implementation.
+bool cache_model_file(const std::string & path) {
+    return cache_model_file_impl(path);
+}
diff --git a/tools/server/server-common.h b/tools/server/server-common.h
@@ -346,6 +346,12 @@ std::string format_oai_resp_sse(const json & data);
 // format Anthropic-style SSE with event types
 std::string format_anthropic_sse(const json & data);
 
+//
+// model cache (page cache fill for fast model swapping)
+//
+
+bool cache_model_file(const std::string & path);
+
 bool is_valid_utf8(const std::string & str);
 
 //
diff --git a/tools/server/server-model-manager.cpp b/tools/server/server-model-manager.cpp
@@ -270,6 +270,68 @@ void server_model_manager::wait_until_loading_finished(const std::string& name)
     });
 }
 
+// Warm the OS page cache for the GGUF file of the model called `name`.
+// Unknown names, models without a path and already-cached models are skipped with a log line.
+void server_model_manager::cache(const std::string& name) {
+    std::string canonical = resolve_model_name(name);
+    if (canonical.empty()) {
+        SRV_WRN("model '%s' not found, skipping cache\n", name.c_str());
+        return;
+    }
+
+    // Copy the path while holding mutex_, then drop the lock: reading the whole file
+    // can take a long time and must not stall every other user of the model table.
+    std::string model_path;
+    {
+        std::lock_guard<std::mutex> lk(mutex_);
+        // find() instead of operator[] so an unknown key is not inserted as a side effect
+        auto it = mapping_.find(canonical);
+        if (it == mapping_.end() || it->second.model_path.empty()) {
+            SRV_WRN("model '%s' has no model path, skipping cache\n", canonical.c_str());
+            return;
+        }
+        if (it->second.cached) {
+            SRV_INF("model '%s' already cached\n", canonical.c_str());
+            return;
+        }
+        model_path = it->second.model_path;
+    }
+
+    SRV_INF("caching model '%s' (path: %s)\n", canonical.c_str(), model_path.c_str());
+    const bool ok = cache_model_file(model_path);
+    if (!ok) {
+        SRV_WRN("failed to cache model '%s'\n", canonical.c_str());
+        return;
+    }
+
+    {
+        // re-acquire the lock only to publish the result; look the entry up again in case the table changed meanwhile
+        std::lock_guard<std::mutex> lk(mutex_);
+        auto it = mapping_.find(canonical);
+        if (it != mapping_.end()) {
+            it->second.cached = true;
+        }
+    }
+    SRV_INF("model '%s' cached successfully\n", canonical.c_str());
+}
+
+// Cache every model that has a path and is not cached yet.
+// Names are collected under the lock first because cache() takes mutex_ itself.
+void server_model_manager::cache_all() {
+    std::vector<std::string> names;
+    {
+        std::lock_guard<std::mutex> lk(mutex_);
+        for (const auto& [name, info] : mapping_) {
+            if (!name.empty() && !info.model_path.empty() && !info.cached) {
+                names.push_back(name);
+            }
+        }
+    }
+    for (const auto& name : names) {
+        cache(name);
+    }
+}
+
 void server_model_manager::unload_lru(server_context& ctx) {
     std::string lru = find_lru_model();
     if (!lru.empty()) {
diff --git a/tools/server/server-model-manager.h b/tools/server/server-model-manager.h
@@ -33,6 +33,7 @@ struct server_model_info {
     server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
     int64_t last_used = 0;      // for LRU eviction (milliseconds since epoch)
     int exit_code = 0;          // exit code if failed
+    bool cached = false;        // GGUF file is cached in page cache for fast swapping
 
     bool is_ready() const {
         return status == SERVER_MODEL_STATUS_LOADED;
@@ -92,6 +93,12 @@ class server_model_manager {
     // Wait until a model finishes loading (thread-safe)
     void wait_until_loading_finished(const std::string& name);
 
+    // Cache a model's GGUF file in page cache (for fast swapping)
+    void cache(const std::string& name);
+
+    // Cache all models' GGUF files in page cache
+    void cache_all();
+
 private:
     // Find the LRU model name (must be called with mutex_ held)
     // Returns empty string if no model to evict
diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp
@@ -949,6 +949,85 @@ void server_models::unload_all() {
     }
 }
 
+// Warm the OS page cache for the GGUF file of the model registered as `name`,
+// where `name` may be a key of the mapping or one of a model's aliases.
+void server_models::cache(const std::string & name) {
+    std::string key;        // mapping key of the resolved entry
+    std::string model_name; // meta.name, used in log lines
+    std::string model_path;
+
+    // Resolve the entry and copy what is needed under the lock, then release it:
+    // the file read below can take long and must not block other users of `mutex`.
+    {
+        std::lock_guard<std::mutex> lk(mutex);
+        auto it = mapping.find(name);
+        if (it == mapping.end()) {
+            // Also check aliases
+            for (auto cand = mapping.begin(); cand != mapping.end(); ++cand) {
+                if (cand->second.meta.aliases.count(name)) {
+                    it = cand;
+                    break;
+                }
+            }
+        }
+        if (it == mapping.end()) {
+            SRV_WRN("model '%s' not found, skipping cache\n", name.c_str());
+            return;
+        }
+
+        auto & meta = it->second.meta;
+        if (meta.cached) {
+            SRV_INF("model '%s' already cached\n", meta.name.c_str());
+            return;
+        }
+
+        meta.preset.get_option("LLAMA_ARG_MODEL", model_path);
+        if (model_path.empty()) {
+            meta.preset.get_option("-m", model_path);
+        }
+        if (model_path.empty()) {
+            SRV_WRN("model '%s' has no model path, skipping cache\n", meta.name.c_str());
+            return;
+        }
+
+        key        = it->first;
+        model_name = meta.name;
+    }
+
+    SRV_INF("caching model '%s' (path: %s)\n", model_name.c_str(), model_path.c_str());
+    if (!cache_model_file(model_path)) {
+        SRV_WRN("failed to cache model '%s'\n", model_name.c_str());
+        return;
+    }
+
+    {
+        // publish the result; look the entry up again in case the mapping changed meanwhile
+        std::lock_guard<std::mutex> lk(mutex);
+        auto it = mapping.find(key);
+        if (it != mapping.end()) {
+            it->second.meta.cached = true;
+        }
+    }
+    SRV_INF("model '%s' cached successfully\n", model_name.c_str());
+}
+
+// Cache every registered model that is not cached yet.
+// Names are collected under the lock first because cache() locks `mutex` itself.
+void server_models::cache_all() {
+    std::vector<std::string> names;
+    {
+        std::lock_guard<std::mutex> lk(mutex);
+        for (const auto & [name, inst] : mapping) {
+            if (!name.empty() && !inst.meta.cached) {
+                names.push_back(name);
+            }
+        }
+    }
+    for (const auto & name : names) {
+        cache(name);
+    }
+}
+
 void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
     std::unique_lock<std::mutex> lk(mutex);
     auto it = mapping.find(name);
@@ -1206,6 +1265,7 @@ void server_models_routes::init_routes() {
             json status {
                 {"value",  server_model_status_to_string(meta.status)},
                 {"args",   meta.args},
+                {"cached", meta.cached},
             };
             if (!meta.preset.name.empty()) {
                 common_preset preset_copy = meta.preset;
@@ -1238,6 +1298,26 @@ void server_models_routes::init_routes() {
         return res;
     };
 
+    this->post_router_models_cache = [this](const server_http_req & req) {
+        auto res = std::make_unique<server_http_res>();
+        json body = json::parse(req.body);
+        std::string name = json_value(body, "model", std::string());
+
+        if (name.empty()) {
+            res_err(res, format_error_response("model name is required", ERROR_TYPE_INVALID_REQUEST));
+            return res;
+        }
+
+        if (!models.has_model(name)) {
+            res_err(res, format_error_response("model not found", ERROR_TYPE_NOT_FOUND));
+            return res;
+        }
+
+        models.cache(name);
+        res_ok(res, {{"success", true}});
+        return res;
+    };
+
     this->post_router_models_unload = [this](const server_http_req & req) {
         auto res = std::make_unique<server_http_res>();
         json body = json::parse(req.body);
diff --git a/tools/server/server-models.h b/tools/server/server-models.h
diff --git a/tools/server/server.cpp b/tools/server/server.cpp