Skip to content

Commit 4e6cd75

Browse files
committed
slop: add preloading of models
1 parent 44775c2 commit 4e6cd75

11 files changed

Lines changed: 385 additions & 2 deletions

common/arg.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3201,6 +3201,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
32013201
params.models_autoload = value;
32023202
}
32033203
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
3204+
// Registers the server-only `--models-cache` flag (env: LLAMA_ARG_MODELS_CACHE).
// The handler takes no value: it resets params.models_cache to "" and then, if the
// LLAMA_ARG_MODELS_CACHE environment variable is set, copies that value in instead.
// NOTE(review): "" is also the default of common_params::models_cache, so "flag given
// without a list" and "flag not given" look identical in this field - confirm how the
// consumer distinguishes them.
// NOTE(review): the README documents `--models-cache modelA,modelB`, but this handler
// does not accept a value; presumably a list can only arrive via the env var - TODO confirm.
add_opt(common_arg(
3205+
{"--models-cache"},
3206+
"for server, cache these model GGUF files in page cache on startup. "
3207+
"If no argument is given, cache all models.",
3208+
[](common_params & params) {
3209+
// No argument: cache all models (empty string)
3210+
params.models_cache = "";
3211+
// Also check env var in case it was set
3212+
const char * env_val = std::getenv("LLAMA_ARG_MODELS_CACHE");
3213+
if (env_val != nullptr) {
3214+
params.models_cache = env_val;
3215+
}
3216+
}
3217+
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_CACHE"));
32043218
add_opt(common_arg(
32053219
{"--kv-cache-mode"}, "MODE",
32063220
string_format("KV cache strategy for multi-model support: \"pool\" (pre-allocated per-model, default) or \"realloc\" (reallocate on swap)"),
@@ -4241,6 +4255,12 @@ void common_params_add_preset_options(std::vector<common_arg> & args) {
42414255
[](common_params &, int) { /* unused */ }
42424256
).set_env(COMMON_ARG_PRESET_STOP_TIMEOUT).set_preset_only());
42434257

4258+
// Preset-only option `cache-on-startup`, keyed by the pseudo-env name
// COMMON_ARG_PRESET_CACHE_ON_STARTUP. The handler deliberately does nothing here.
// NOTE(review): declared with a "NAME" value hint and a string handler, while the
// README lists this option as boolean - confirm which type is intended.
args.push_back(common_arg(
4259+
{"cache-on-startup"}, "NAME",
4260+
"cache this model's GGUF file in page cache on startup (for fast swapping)",
4261+
[](common_params &, const std::string &) { /* unused */ }
4262+
).set_env(COMMON_ARG_PRESET_CACHE_ON_STARTUP).set_preset_only());
4263+
42444264
// args.push_back(common_arg(
42454265
// {"pin"},
42464266
// "in server router mode, do not unload this model if models_max is exceeded",

common/arg.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@
99
#include <cstring>
1010

1111
// pseudo-env variable to identify preset-only arguments
12-
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
13-
#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
12+
#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
13+
#define COMMON_ARG_PRESET_STOP_TIMEOUT "__PRESET_STOP_TIMEOUT"
14+
#define COMMON_ARG_PRESET_CACHE_ON_STARTUP "__PRESET_CACHE_ON_STARTUP"
1415

1516
//
1617
// CLI argument parsing

common/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,7 @@ struct common_params_model {
293293
std::string hf_file = ""; // HF file // NOLINT
294294
std::string docker_repo = ""; // Docker repo // NOLINT
295295
std::string name = ""; // in format <user>/<model>[:<tag>] (tag is optional) // NOLINT
296+
bool cache = false; // cache GGUF file in page cache on startup // NOLINT
296297
};
297298

298299
struct common_ngram_mod;
@@ -632,6 +633,7 @@ struct common_params {
632633
int models_max = 4; // maximum number of models to load simultaneously
633634
bool models_autoload = true; // automatically load models when requested via the router server
634635
std::string kv_cache_mode = "pool"; // KV cache strategy for multi-model: "pool" or "realloc"
636+
std::string models_cache = ""; // cache GGUF files in page cache on startup (comma-separated names; empty = all)
635637

636638
bool log_json = false;
637639

tools/server/README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ For the full list of features, please refer to [server's changelog](https://gith
213213
| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_PRESET) |
214214
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
215215
| `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)<br/>(env: LLAMA_ARG_MODELS_AUTOLOAD) |
216+
| `--models-cache [LIST]` | cache GGUF files in page cache for fast model swapping (non-router mode). No argument: cache all models. Comma-separated list: cache only specified models.<br/>(env: LLAMA_ARG_MODELS_CACHE) |
216217
| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
217218
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
218219
| `-rea, --reasoning [on\|off\|auto]` | Use reasoning/thinking in the chat ('on', 'off', or 'auto', default: 'auto' (detect from template))<br/>(env: LLAMA_ARG_REASONING) |
@@ -1599,6 +1600,7 @@ The precedence rule for preset options is as follows:
15991600

16001601
We also offer additional options that are exclusive to presets (these aren't treated as command-line arguments):
16011602
- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts
1603+
- `cache-on-startup` (boolean): Controls whether the model's GGUF file is cached in page cache on startup
16021604
- `stop-timeout` (int, seconds): After requested unload, wait for this many seconds before forcing termination (default: 10)
16031605

16041606
### Routing requests
@@ -1732,6 +1734,33 @@ Response:
17321734
}
17331735
```
17341736

1737+
### POST `/models/cache`: Cache a model's GGUF file
1738+
1739+
Cache a model's GGUF file in the OS page cache (RAM) for fast model swapping. The file is read into the page cache using `mmap` + `madvise(POSIX_MADV_WILLNEED)` (Linux/macOS) or `PrefetchVirtualMemory` (Windows); the model itself is not loaded, only the file's contents are brought into the OS page cache.
1740+
1741+
Payload:
1742+
1743+
```json
1744+
{
1745+
"model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
1746+
}
1747+
```
1748+
1749+
Response:
1750+
1751+
```json
1752+
{
1753+
"success": true
1754+
}
1755+
```
1756+
1757+
**Notes:**
1758+
- The `cached` field in the `/models` response indicates whether a model's file has been cached in page cache.
1759+
- Use `--models-cache` CLI flag or `cache-on-startup` preset option to cache models automatically on startup.
1760+
- `--models-cache` (no argument): caches all registered models.
1761+
- `--models-cache modelA,modelB`: caches only the specified models.
1762+
- Page cache warming uses `mmap` + `madvise(POSIX_MADV_WILLNEED)` (Linux/macOS) or `PrefetchVirtualMemory` (Windows) — the model itself is not loaded; only the OS page cache is populated.
1763+
17351764
## API errors
17361765

17371766
`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi

tools/server/server-common.cpp

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@
1313
#include <sstream>
1414
#include <fstream>
1515

16+
#if defined(_WIN32)
17+
#include <windows.h>
18+
#else
19+
#include <sys/mman.h>
20+
#include <unistd.h>
21+
#endif
22+
1623
json format_error_response(const std::string & message, const enum error_type type) {
1724
std::string type_str;
1825
int code = 500;
@@ -1584,3 +1591,84 @@ server_tokens format_prompt_rerank(
15841591

15851592
return result;
15861593
}
1594+
1595+
//
1596+
// Model cache: fill the OS page cache for a GGUF file so model swapping is fast
1597+
//
1598+
1599+
static bool cache_model_file_impl(const std::string & path) {
1600+
FILE * file = ggml_fopen(path.c_str(), "rb");
1601+
if (!file) {
1602+
SRV_WRN("failed to open GGUF file '%s' for caching: %s\n", path.c_str(), strerror(errno));
1603+
return false;
1604+
}
1605+
1606+
// Get file size
1607+
fseek(file, 0, SEEK_END);
1608+
size_t file_size = (size_t)ftell(file);
1609+
fseek(file, 0, SEEK_SET);
1610+
1611+
if (file_size == 0) {
1612+
SRV_WRN("GGUF file '%s' is empty\n", path.c_str());
1613+
fclose(file);
1614+
return false;
1615+
}
1616+
1617+
SRV_INF("caching GGUF file '%s' (%zu MiB)\n", path.c_str(), file_size / (1024 * 1024));
1618+
1619+
#if defined(_WIN32)
1620+
{
1621+
HANDLE hFile = (HANDLE)_get_osfhandle(_fileno(file));
1622+
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
1623+
if (hMapping) {
1624+
void * addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
1625+
if (addr) {
1626+
// Use PrefetchVirtualMemory on Windows (Vista+)
1627+
#if _WIN32_WINNT >= 0x602
1628+
BOOL (WINAPI *pPrefetchVirtualMemory)(HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG) = nullptr;
1629+
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
1630+
if (hKernel32) {
1631+
pPrefetchVirtualMemory = (decltype(pPrefetchVirtualMemory))(void *)GetProcAddress(hKernel32, "PrefetchVirtualMemory");
1632+
}
1633+
if (pPrefetchVirtualMemory) {
1634+
WIN32_MEMORY_RANGE_ENTRY range;
1635+
range.VirtualAddress = addr;
1636+
range.NumberOfBytes = (SIZE_T)std::min(file_size, (size_t)4ULL * 1024 * 1024 * 1024); // cap at 4 GiB
1637+
pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0);
1638+
}
1639+
#endif
1640+
UnmapViewOfFile(addr);
1641+
}
1642+
CloseHandle(hMapping);
1643+
}
1644+
}
1645+
#else
1646+
{
1647+
int fd = fileno(file);
1648+
// mmap the file
1649+
void * addr = mmap(NULL, file_size, PROT_READ, MAP_SHARED, fd, 0);
1650+
if (addr != MAP_FAILED) {
1651+
#ifdef __linux__
1652+
// On Linux, use MAP_POPULATE to eagerly read pages
1653+
munmap(addr, file_size);
1654+
addr = mmap(NULL, file_size, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0);
1655+
#endif
1656+
if (addr != MAP_FAILED) {
1657+
// madvise(POSIX_MADV_WILLNEED) to tell the kernel we want these pages
1658+
#ifdef POSIX_MADV_WILLNEED
1659+
madvise(addr, file_size, POSIX_MADV_WILLNEED);
1660+
#endif
1661+
munmap(addr, file_size);
1662+
}
1663+
}
1664+
}
1665+
#endif
1666+
1667+
fclose(file);
1668+
SRV_INF("caching GGUF file '%s' done\n", path.c_str());
1669+
return true;
1670+
}
1671+
1672+
bool cache_model_file(const std::string & path) {
1673+
return cache_model_file_impl(path);
1674+
}

tools/server/server-common.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,12 @@ std::string format_oai_resp_sse(const json & data);
346346
// format Anthropic-style SSE with event types
347347
std::string format_anthropic_sse(const json & data);
348348

349+
//
350+
// model cache (page cache fill for fast model swapping)
351+
//
352+
353+
bool cache_model_file(const std::string & path);
354+
349355
bool is_valid_utf8(const std::string & str);
350356

351357
//

tools/server/server-model-manager.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,48 @@ void server_model_manager::wait_until_loading_finished(const std::string& name)
270270
});
271271
}
272272

273+
void server_model_manager::cache(const std::string& name) {
274+
std::string canonical = resolve_model_name(name);
275+
if (canonical.empty()) {
276+
SRV_WRN("model '%s' not found, skipping cache\n", name.c_str());
277+
return;
278+
}
279+
280+
std::lock_guard<std::mutex> lk(mutex_);
281+
auto& info = mapping_[canonical];
282+
if (info.model_path.empty()) {
283+
SRV_WRN("model '%s' has no model path, skipping cache\n", canonical.c_str());
284+
return;
285+
}
286+
if (info.cached) {
287+
SRV_INF("model '%s' already cached\n", canonical.c_str());
288+
return;
289+
}
290+
291+
SRV_INF("caching model '%s' (path: %s)\n", canonical.c_str(), info.model_path.c_str());
292+
info.cached = cache_model_file(info.model_path);
293+
if (info.cached) {
294+
SRV_INF("model '%s' cached successfully\n", canonical.c_str());
295+
} else {
296+
SRV_WRN("failed to cache model '%s'\n", canonical.c_str());
297+
}
298+
}
299+
300+
void server_model_manager::cache_all() {
301+
std::vector<std::string> names;
302+
{
303+
std::lock_guard<std::mutex> lk(mutex_);
304+
for (const auto& [name, info] : mapping_) {
305+
if (!name.empty() && !info.model_path.empty() && !info.cached) {
306+
names.push_back(name);
307+
}
308+
}
309+
}
310+
for (const auto& name : names) {
311+
cache(name);
312+
}
313+
}
314+
273315
void server_model_manager::unload_lru(server_context& ctx) {
274316
std::string lru = find_lru_model();
275317
if (!lru.empty()) {

tools/server/server-model-manager.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ struct server_model_info {
3333
server_model_status status = SERVER_MODEL_STATUS_UNLOADED;
3434
int64_t last_used = 0; // for LRU eviction (milliseconds since epoch)
3535
int exit_code = 0; // exit code if failed
36+
bool cached = false; // GGUF file is cached in page cache for fast swapping
3637

3738
bool is_ready() const {
3839
return status == SERVER_MODEL_STATUS_LOADED;
@@ -92,6 +93,12 @@ class server_model_manager {
9293
// Wait until a model finishes loading (thread-safe)
9394
void wait_until_loading_finished(const std::string& name);
9495

96+
// Cache a model's GGUF file in page cache (for fast swapping)
97+
void cache(const std::string& name);
98+
99+
// Cache all models' GGUF files in page cache
100+
void cache_all();
101+
95102
private:
96103
// Find the LRU model name (must be called with mutex_ held)
97104
// Returns empty string if no model to evict

tools/server/server-models.cpp

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,65 @@ void server_models::unload_all() {
949949
}
950950
}
951951

952+
void server_models::cache(const std::string & name) {
953+
std::lock_guard<std::mutex> lk(mutex);
954+
auto it = mapping.find(name);
955+
if (it == mapping.end()) {
956+
// Also check aliases
957+
bool found = false;
958+
for (const auto & [key, inst] : mapping) {
959+
if (inst.meta.aliases.count(name)) {
960+
it = mapping.find(key);
961+
found = true;
962+
break;
963+
}
964+
}
965+
if (!found) {
966+
SRV_WRN("model '%s' not found, skipping cache\n", name.c_str());
967+
return;
968+
}
969+
}
970+
971+
auto & meta = it->second.meta;
972+
if (meta.cached) {
973+
SRV_INF("model '%s' already cached\n", meta.name.c_str());
974+
return;
975+
}
976+
977+
std::string model_path;
978+
meta.preset.get_option("LLAMA_ARG_MODEL", model_path);
979+
if (model_path.empty()) {
980+
meta.preset.get_option("-m", model_path);
981+
}
982+
if (model_path.empty()) {
983+
SRV_WRN("model '%s' has no model path, skipping cache\n", meta.name.c_str());
984+
return;
985+
}
986+
987+
SRV_INF("caching model '%s' (path: %s)\n", meta.name.c_str(), model_path.c_str());
988+
meta.cached = cache_model_file(model_path);
989+
if (meta.cached) {
990+
SRV_INF("model '%s' cached successfully\n", meta.name.c_str());
991+
} else {
992+
SRV_WRN("failed to cache model '%s'\n", meta.name.c_str());
993+
}
994+
}
995+
996+
void server_models::cache_all() {
997+
std::vector<std::string> names;
998+
{
999+
std::lock_guard<std::mutex> lk(mutex);
1000+
for (const auto & [name, inst] : mapping) {
1001+
if (!name.empty() && !inst.meta.cached) {
1002+
names.push_back(name);
1003+
}
1004+
}
1005+
}
1006+
for (const auto & name : names) {
1007+
cache(name);
1008+
}
1009+
}
1010+
9521011
void server_models::update_status(const std::string & name, server_model_status status, int exit_code) {
9531012
std::unique_lock<std::mutex> lk(mutex);
9541013
auto it = mapping.find(name);
@@ -1206,6 +1265,7 @@ void server_models_routes::init_routes() {
12061265
json status {
12071266
{"value", server_model_status_to_string(meta.status)},
12081267
{"args", meta.args},
1268+
{"cached", meta.cached},
12091269
};
12101270
if (!meta.preset.name.empty()) {
12111271
common_preset preset_copy = meta.preset;
@@ -1238,6 +1298,26 @@ void server_models_routes::init_routes() {
12381298
return res;
12391299
};
12401300

1301+
// Route handler: reads the "model" field from the JSON request body and asks
// the model registry to cache that model's GGUF file.
//   - empty or missing "model" -> invalid-request error
//   - unknown model            -> not-found error
//   - otherwise                -> {"success": true}
// NOTE(review): the success response is sent whenever the model exists; the
// outcome of models.cache() is not reflected in it.
this->post_router_models_cache = [this](const server_http_req & req) {
    auto response = std::make_unique<server_http_res>();

    const json        payload    = json::parse(req.body);
    const std::string model_name = json_value(payload, "model", std::string());

    if (model_name.empty()) {
        res_err(response, format_error_response("model name is required", ERROR_TYPE_INVALID_REQUEST));
    } else if (!models.has_model(model_name)) {
        res_err(response, format_error_response("model not found", ERROR_TYPE_NOT_FOUND));
    } else {
        models.cache(model_name);
        res_ok(response, {{"success", true}});
    }

    return response;
};
1320+
12411321
this->post_router_models_unload = [this](const server_http_req & req) {
12421322
auto res = std::make_unique<server_http_res>();
12431323
json body = json::parse(req.body);

0 commit comments

Comments
 (0)