diff --git a/common/arg.cpp b/common/arg.cpp
index 841ca3ce2ec2..f3f6ed9a6ddb 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -29,6 +29,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -2494,6 +2495,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
+    add_opt(common_arg(
+        {"--mmproj-swap-layers"}, "N",
+        "number of LLM layers to evict to host RAM when mmproj is active;\n"
+        "0 = disabled (default), -1 = auto-detect based on free VRAM;\n"
+        "requires CUDA backend and a loaded --mmproj model",
+        [](common_params & params, int value) {
+            params.n_mmproj_swap = value;
+        }
+    ).set_env("LLAMA_ARG_MMPROJ_SWAP_LAYERS"));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row,tensor}",
         "how to split the model across multiple GPUs, one of:\n"
diff --git a/common/common.h b/common/common.h
index 94147d5d8cf1..c1f942dbb6ba 100644
--- a/common/common.h
+++ b/common/common.h
@@ -590,6 +590,9 @@ struct common_params {
     int image_max_tokens = -1;
     int mtmd_batch_max_tokens = 1024;
 
+    // mmproj swap pool (see common/llama_mmproj_pool.h)
+    int n_mmproj_swap = 0; // --mmproj-swap-layers: LLM layers evicted per vision step (0 = disabled, -1 = auto)
+
     // finetune
     struct lr_opt lr;
     enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
diff --git a/common/llama_mmproj_pool.cpp b/common/llama_mmproj_pool.cpp
new file mode 100644
index 000000000000..a0695b08c9f4
--- /dev/null
+++ b/common/llama_mmproj_pool.cpp
@@ -0,0 +1,379 @@
+#include "llama_mmproj_pool.h"
+#include "llama-impl.h"
+#include "../src/llama-model.h"
+
+#include <algorithm>
+#include <chrono>
+#include <string>
+#include <system_error>
+#include <thread>
+#include <utility>
+#include <vector>
+
+// Alignment, in bytes, of every mmproj tensor inside the host staging area and
+// inside the VRAM of the evicted LLM tensors it is packed into.
+static constexpr size_t MMPROJ_POOL_ALIGN = 256;
+
+using tensor_map_t = std::vector<std::pair<std::string, ggml_tensor *>>;
+
+// Monotonic clock reading in milliseconds; only used for swap statistics.
+static double now_ms() {
+    using namespace std::chrono;
+    return duration<double, std::milli>(steady_clock::now().time_since_epoch()).count();
+}
+
+// Round `value` up to the next multiple of `align`.
+static size_t align_up(size_t value, size_t align) {
+    return (value + align - 1) / align * align;
+}
+
+// Bytes needed to lay out `tensors` back to back IN THE GIVEN ORDER, with the
+// start of every tensor rounded up to `align`. The padding depends on the
+// order, so this must be evaluated on the sequence that is actually written.
+static size_t calc_aligned_size(const std::vector<ggml_tensor *> & tensors, size_t align = MMPROJ_POOL_ALIGN) {
+    size_t total = 0;
+    for (const ggml_tensor * t : tensors) {
+        total = align_up(total, align) + ggml_nbytes(t);
+    }
+    return total;
+}
+
+// Append every tensor of layer `il` (name prefix "blk.<il>.") that is backed by
+// a non-host buffer to `out`.
+static void append_layer_device_tensors(const tensor_map_t & tensor_map, int il, std::vector<ggml_tensor *> & out) {
+    const std::string prefix = "blk." + std::to_string(il) + ".";
+    for (const auto & [name, t] : tensor_map) {
+        if (!t || !t->buffer || name.rfind(prefix, 0) != 0) {
+            continue;
+        }
+        if (ggml_backend_buft_is_host(ggml_backend_buffer_get_type(t->buffer))) {
+            continue;
+        }
+        out.push_back(t);
+    }
+}
+
+// Non-host tensors of the last `n_swap_layers` layers, in ascending layer order.
+static std::vector<ggml_tensor *> collect_evicted_tensors(struct llama_model * model, int n_swap_layers) {
+    if (!model || n_swap_layers <= 0) return {};
+    const int    n_layer    = llama_model_n_layer(model);
+    const int    first      = std::max(0, n_layer - n_swap_layers);
+    const auto & tensor_map = llama_internal_get_tensor_map(model);
+
+    std::vector<ggml_tensor *> result;
+    for (int il = first; il < n_layer; ++il) {
+        append_layer_device_tensors(tensor_map, il, result);
+    }
+    return result;
+}
+
+// Auto mode: smallest number of trailing layers whose non-host tensors add up
+// to at least `target_bytes` (every layer if even that is not enough). Sizes
+// are accumulated one layer at a time rather than re-collecting the whole
+// suffix on each step.
+static int auto_select_swap_layers(struct llama_model * model, size_t target_bytes) {
+    const int    n_layer    = llama_model_n_layer(model);
+    const auto & tensor_map = llama_internal_get_tensor_map(model);
+
+    size_t accumulated = 0;
+    int    n_selected  = 0;
+    std::vector<ggml_tensor *> layer_tensors;
+    for (int il = n_layer - 1; il >= 0; --il) {
+        layer_tensors.clear();
+        append_layer_device_tensors(tensor_map, il, layer_tensors);
+        for (const ggml_tensor * t : layer_tensors) {
+            accumulated += ggml_nbytes(t);
+        }
+        ++n_selected;
+        if (accumulated >= target_bytes) {
+            break;
+        }
+    }
+    return n_selected;
+}
+
+// Build the swap pool.
+//   n_swap_layers          > 0: evict that many trailing layers; < 0: pick the
+//                          count automatically; 0: disabled
+//   mmproj_tensors         vision tensors to stage on host and pack into VRAM
+//   dynamic_overhead_bytes extra bytes added to the auto-mode target
+// Returns nullptr when the pool is disabled or cannot be built; in that case
+// the mmproj tensors keep their original data/buffer pointers.
+struct llama_mmproj_pool * llama_mmproj_pool_init(
+        struct llama_model * model,
+        int n_swap_layers,
+        std::vector<ggml_tensor *> & mmproj_tensors,
+        size_t dynamic_overhead_bytes) {
+    if (!model || mmproj_tensors.empty()) return nullptr;
+
+    const size_t align = MMPROJ_POOL_ALIGN;
+
+    // Pack largest-first. The host staging size is computed on this sorted
+    // sequence: sizing it from the caller's order can under-allocate, because
+    // the alignment padding differs between the two orders.
+    std::vector<ggml_tensor *> sorted_mmproj = mmproj_tensors;
+    std::sort(sorted_mmproj.begin(), sorted_mmproj.end(), [](const ggml_tensor * a, const ggml_tensor * b) {
+        return ggml_nbytes(a) > ggml_nbytes(b);
+    });
+    const size_t mmproj_host_size = calc_aligned_size(sorted_mmproj, align);
+
+    // 1. Auto mode (< 0): target = vision weights + overhead, plus a 5% margin.
+    if (n_swap_layers < 0) {
+        const size_t needed               = mmproj_host_size + dynamic_overhead_bytes;
+        const size_t target_eviction_size = needed + needed / 20;
+
+        n_swap_layers = auto_select_swap_layers(model, target_eviction_size);
+        LLAMA_LOG_INFO("%s: auto mode: need %.0f MB (Weights) + %.0f MB (Overhead) for mmproj; will evict %d layers (target eviction %.0f MB)\n",
+            __func__, mmproj_host_size / 1e6, dynamic_overhead_bytes / 1e6, n_swap_layers, target_eviction_size / 1e6);
+    }
+
+    if (n_swap_layers <= 0) return nullptr;
+
+    auto * pool = new llama_mmproj_pool();
+    pool->evicted_tensors = collect_evicted_tensors(model, n_swap_layers);
+
+    // Releases a pool that never took over the vision tensors. Unlike
+    // llama_mmproj_pool_free() it leaves vision_t->data/buffer alone, since
+    // they still reference the caller's original allocation at this point.
+    const auto discard = [](llama_mmproj_pool * p) {
+        if (p->host_buf) ggml_backend_buffer_free(p->host_buf);
+        delete p;
+    };
+
+    if (pool->evicted_tensors.empty()) {
+        discard(pool);
+        return nullptr;
+    }
+
+    // Use the first non-CPU device to look up a host buffer type.
+    ggml_backend_dev_t dev = nullptr;
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t d = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(d) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+            dev = d;
+            break;
+        }
+    }
+
+    // Host layout: [ evicted LLM tensors, tightly packed | mmproj staging area ]
+    size_t evicted_total_bytes = 0;
+    pool->evicted_offsets.reserve(pool->evicted_tensors.size());
+    for (const ggml_tensor * t : pool->evicted_tensors) {
+        pool->evicted_offsets.push_back(evicted_total_bytes);
+        evicted_total_bytes += ggml_nbytes(t);
+    }
+
+    pool->host_buf_size = evicted_total_bytes + mmproj_host_size;
+    ggml_backend_buffer_type_t host_buft = dev ? ggml_backend_dev_host_buffer_type(dev) : nullptr;
+    if (!host_buft) host_buft = ggml_backend_cpu_buffer_type();
+
+    pool->host_buf = ggml_backend_buft_alloc_buffer(host_buft, pool->host_buf_size);
+    if (!pool->host_buf) {
+        discard(pool);
+        return nullptr;
+    }
+    pool->host_ptr = ggml_backend_buffer_get_base(pool->host_buf);
+    char * host_mm = (char *) pool->host_ptr + evicted_total_bytes;
+
+    // 2. First-fit packing of the vision tensors into the evicted LLM tensors,
+    //    trying the largest LLM tensors first.
+    struct Block {
+        ggml_tensor * t;
+        size_t        used;
+        size_t        cap;
+    };
+    std::vector<Block> blocks;
+    blocks.reserve(pool->evicted_tensors.size());
+    for (ggml_tensor * t : pool->evicted_tensors) {
+        blocks.push_back({t, 0, ggml_nbytes(t)});
+    }
+    std::sort(blocks.begin(), blocks.end(), [](const Block & a, const Block & b) { return a.cap > b.cap; });
+
+    bool   packing_failed      = false;
+    size_t current_host_offset = 0;
+
+    for (ggml_tensor * vt : sorted_mmproj) {
+        const size_t vsize = ggml_nbytes(vt);
+
+        current_host_offset = align_up(current_host_offset, align);
+        char * host_data = host_mm + current_host_offset;
+
+        if (vt->data) {
+            ggml_backend_tensor_get(vt, host_data, 0, vsize); // back up the vision weights to host
+        }
+        current_host_offset += vsize;
+
+        bool placed = false;
+        for (auto & b : blocks) {
+            const size_t offset = align_up(b.used, align);
+            if (offset + vsize <= b.cap) {
+                b.used = offset + vsize;
+                char * gpu_data = (char *) b.t->data + offset;
+                pool->mappings.push_back({vt, gpu_data, b.t->buffer, host_data, vsize});
+                placed = true;
+                break;
+            }
+        }
+        if (!placed) {
+            packing_failed = true;
+            break;
+        }
+    }
+
+    if (packing_failed) {
+        LLAMA_LOG_ERROR("%s: Fragmentation prevents packing mmproj tensors. Increase --mmproj-swap-layers.\n", __func__);
+        // Nothing has been redirected yet, so the vision tensors must keep
+        // their pointers: the caller goes on using the encoder's own buffer.
+        discard(pool);
+        return nullptr;
+    }
+
+    // Redirect the vision tensors to their host copy until the first swap_in.
+    for (const auto & m : pool->mappings) {
+        m.vision_t->data   = m.host_data;
+        m.vision_t->buffer = pool->host_buf;
+    }
+
+    pool->state = llama_pool_state::LLM_RESIDENT;
+    LLAMA_LOG_INFO("%s: pool ready | %zu evicted (%.0f MB) | packed %zu mmproj (%.0f MB) | host_buft: %s\n",
+        __func__, pool->evicted_tensors.size(), evicted_total_bytes / 1e6,
+        pool->mappings.size(), mmproj_host_size / 1e6, ggml_backend_buft_name(host_buft));
+    return pool;
+}
+
+// Index of the evicted LLM tensor whose address range contains `gpu_data`, or -1.
+static int find_evicted_idx(const void * gpu_data, const std::vector<ggml_tensor *> & ev_tensors) {
+    const char * p = (const char *) gpu_data;
+    for (size_t i = 0; i < ev_tensors.size(); ++i) {
+        const char * base = (const char *) ev_tensors[i]->data;
+        if (p >= base && p < base + ggml_nbytes(ev_tensors[i])) {
+            return (int) i;
+        }
+    }
+    return -1;
+}
+
+// Move the evicted LLM tensors to host and the vision tensors into their VRAM.
+// Returns true when the vision tensors are resident on return.
+bool llama_mmproj_pool_swap_in(struct llama_mmproj_pool * pool, struct llama_context * ctx) {
+    if (!pool) return false;
+    std::lock_guard<std::mutex> guard(pool->mutex);
+    if (pool->state == llama_pool_state::MMPROJ_RESIDENT) return true;
+    // Any other state means the VRAM contents are not known to be the LLM weights.
+    if (pool->state != llama_pool_state::LLM_RESIDENT) return false;
+
+    // Group the vision tensors by the evicted LLM tensor they overwrite. This
+    // runs before anything is modified, so an inconsistent mapping is rejected
+    // while the LLM weights are still in place.
+    std::vector<std::vector<llama_mmproj_pool::tensor_mapping>> grouped_mappings(pool->evicted_tensors.size());
+    for (const auto & m : pool->mappings) {
+        const int idx = find_evicted_idx(m.gpu_data, pool->evicted_tensors);
+        if (idx >= 0) {
+            grouped_mappings[idx].push_back(m);
+        } else if (m.size != 0) {
+            LLAMA_LOG_ERROR("%s: mmproj tensor is not mapped into any evicted LLM tensor\n", __func__);
+            return false;
+        }
+    }
+
+    if (ctx) llama_synchronize(ctx);
+    const double t0 = now_ms();
+    pool->state = llama_pool_state::SWAPPING_OUT;
+
+    char * host_llm = (char *) pool->host_ptr;
+
+    // 3. Pipeline: while LLM tensor i+1 is read back (device-to-host) on this
+    //    thread, a worker uploads the vision tensors that live inside tensor i
+    //    (host-to-device). Tensor i is only overwritten after its own read-back
+    //    has returned.
+    std::thread prev_load_thread;
+
+    for (size_t i = 0; i < pool->evicted_tensors.size(); ++i) {
+        // Step A: save the LLM tensor to host (blocking).
+        ggml_backend_tensor_get(
+            pool->evicted_tensors[i],
+            host_llm + pool->evicted_offsets[i],
+            0,
+            ggml_nbytes(pool->evicted_tensors[i]));
+
+        // At most one upload is in flight at a time.
+        if (prev_load_thread.joinable()) {
+            prev_load_thread.join();
+        }
+
+        // Step B: tensor i is saved, its VRAM may now be overwritten.
+        const auto upload = [i, &grouped_mappings]() {
+            for (const auto & m : grouped_mappings[i]) {
+                m.vision_t->data   = m.gpu_data;
+                m.vision_t->buffer = m.gpu_buffer;
+                ggml_backend_tensor_set(m.vision_t, m.host_data, 0, m.size); // push to VRAM
+            }
+        };
+        try {
+            prev_load_thread = std::thread(upload);
+        } catch (const std::system_error &) {
+            upload(); // no thread available: upload inline rather than abort mid-swap
+        }
+    }
+
+    // Wait for the last upload.
+    if (prev_load_thread.joinable()) {
+        prev_load_thread.join();
+    }
+
+    pool->state = llama_pool_state::MMPROJ_RESIDENT;
+
+    if (ctx) llama_synchronize(ctx);
+    pool->total_swap_ms += (now_ms() - t0);
+    ++pool->n_swaps;
+    return true;
+}
+
+// Point the vision tensors back at their host copy and restore the LLM weights.
+// No-op unless the pool is MMPROJ_RESIDENT.
+void llama_mmproj_pool_swap_back(struct llama_mmproj_pool * pool, struct llama_context * ctx) {
+    if (!pool) return;
+    std::lock_guard<std::mutex> guard(pool->mutex);
+    if (pool->state != llama_pool_state::MMPROJ_RESIDENT) return;
+
+    if (ctx) llama_synchronize(ctx);
+    pool->state = llama_pool_state::SWAPPING_IN;
+
+    // Vision -> host: the host copy was never modified, so only the pointers change.
+    for (const auto & m : pool->mappings) {
+        m.vision_t->data   = m.host_data;
+        m.vision_t->buffer = pool->host_buf;
+    }
+
+    // LLM -> GPU: copy the saved weights back over the vision data.
+    char * host_llm = (char *) pool->host_ptr;
+    for (size_t i = 0; i < pool->evicted_tensors.size(); ++i) {
+        ggml_backend_tensor_set(pool->evicted_tensors[i], host_llm + pool->evicted_offsets[i], 0, ggml_nbytes(pool->evicted_tensors[i]));
+    }
+
+    if (ctx) llama_synchronize(ctx);
+    pool->state = llama_pool_state::LLM_RESIDENT;
+}
+
+// Destroy a pool returned by llama_mmproj_pool_init(). The vision tensors are
+// detached (data/buffer set to nullptr) because the host buffer they may point
+// into is released here.
+void llama_mmproj_pool_free(struct llama_mmproj_pool * pool) {
+    if (!pool) return;
+    for (const auto & m : pool->mappings) {
+        if (m.vision_t) {
+            m.vision_t->data   = nullptr;
+            m.vision_t->buffer = nullptr;
+        }
+    }
+    if (pool->host_buf) ggml_backend_buffer_free(pool->host_buf);
+    delete pool;
+}
+
+// Log the number of swap_in calls and their average duration.
+void llama_mmproj_pool_log_stats(const struct llama_mmproj_pool * pool) {
+    if (!pool) return;
+    LLAMA_LOG_INFO("mmproj pool stats: n_swaps=%lld, avg_swap_ms=%.1f\n",
+        (long long) pool->n_swaps,
+        pool->n_swaps > 0 ? pool->total_swap_ms / pool->n_swaps : 0.0);
+}
diff --git a/common/llama_mmproj_pool.h b/common/llama_mmproj_pool.h
new file mode 100644
index 000000000000..4022c046c56f
--- /dev/null
+++ b/common/llama_mmproj_pool.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#if defined(_WIN32)
+#    ifndef WIN32_LEAN_AND_MEAN
+#        define WIN32_LEAN_AND_MEAN
+#    endif
+#    ifndef NOMINMAX
+#        define NOMINMAX
+#    endif
+#endif
+
+#include "ggml-backend.h"
+#include "llama.h"
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+#include <vector>
+
+// Which set of weights currently occupies the shared VRAM region.
+enum class llama_pool_state : uint8_t {
+    LLM_RESIDENT    = 0, // LLM weights in VRAM, vision tensors point at the host copy
+    SWAPPING_OUT    = 1, // swap_in in progress
+    MMPROJ_RESIDENT = 2, // vision tensors in VRAM, LLM weights saved on host
+    SWAPPING_IN     = 3, // swap_back in progress
+    CORRUPTED       = 4, // swap_in refuses to run in this state
+    DISABLED        = 5, // initial state until init completes
+};
+
+struct llama_mmproj_pool {
+    // host staging buffer: [ evicted LLM tensors | mmproj tensors ]
+    ggml_backend_buffer_t host_buf      = nullptr;
+    void *                host_ptr      = nullptr;
+    size_t                host_buf_size = 0;
+
+    std::vector<ggml_tensor *> evicted_tensors; // LLM tensors whose VRAM is borrowed
+    std::vector<size_t>        evicted_offsets; // offset of each evicted tensor in host_buf
+
+    struct tensor_mapping {
+        ggml_tensor *         vision_t;   // the mmproj tensor
+        void *                gpu_data;   // its address inside an evicted LLM tensor
+        ggml_backend_buffer_t gpu_buffer; // buffer of that LLM tensor
+        void *                host_data;  // its copy in host_buf
+        size_t                size;       // ggml_nbytes(vision_t)
+    };
+    std::vector<tensor_mapping> mappings;
+
+    std::atomic<llama_pool_state> state { llama_pool_state::DISABLED };
+    std::mutex                    mutex; // held for the duration of swap_in / swap_back
+
+    int64_t n_swaps       = 0;   // completed swap_in calls
+    double  total_swap_ms = 0.0; // accumulated swap_in time
+};
+
+// See common/llama_mmproj_pool.cpp for the contract of each function.
+struct llama_mmproj_pool * llama_mmproj_pool_init(
+        struct llama_model * model,
+        int n_swap_layers,
+        std::vector<ggml_tensor *> & mmproj_tensors,
+        size_t dynamic_overhead_bytes);
+
+bool llama_mmproj_pool_swap_in(struct llama_mmproj_pool * pool, struct llama_context * ctx);
+void llama_mmproj_pool_swap_back(struct llama_mmproj_pool * pool, struct llama_context * ctx);
+void llama_mmproj_pool_free(struct llama_mmproj_pool * pool);
+void llama_mmproj_pool_log_stats(const struct llama_mmproj_pool * pool);
diff --git a/src/llama-impl.h b/src/llama-impl.h
index 7923c3f7ed55..2b671cbe4b97 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "ggml.h" // for ggml_log_level
+#include "llama.h" // for LLAMA_API, ggml_log_level
 
 #include
 #include
@@ -21,7 +21,7 @@
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-void llama_log_internal (ggml_log_level level, const char * format, ...);
+LLAMA_API void llama_log_internal (ggml_log_level level, const char * format, ...);
 void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
 #define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
diff --git a/src/llama-model.h b/src/llama-model.h
index 77d8d3b6258a..5dbd30872716 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -733,4 +733,4 @@ const char * llm_type_name(llm_type type);
 
 // For internal test use
 // TODO: remove
-const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
+LLAMA_API const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index d2226b3be1d7..dd13047c35cd 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -4134,7 +4134,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32
                 // 144 for 768 tile views
                 const int num_image_tokens = num_patches / 16;
                 const int seq_len = num_image_tokens * 2;
-                std::vector qwen2_mask(static_cast(seq_len) * seq_len, 0.0f);
+                std::vector qwen2_mask(static_cast(seq_len) * seq_len, 0.0f);
 
                 // attention mask layout
                 // +--------------+---------------+
@@ -4660,6 +4660,18 @@ std::map clip_get_mem_usage(const struct clip_ctx *
     return result;
 }
 
+std::vector<ggml_tensor *> clip_get_all_tensors(const struct clip_ctx * ctx) {
+    std::vector<ggml_tensor *> result;
+    if (!ctx || !ctx->ctx_data.get()) return result;
+
+    ggml_tensor * t = ggml_get_first_tensor(ctx->ctx_data.get());
+    while (t) {
+        result.push_back(t);
+        t = ggml_get_next_tensor(ctx->ctx_data.get(), t);
+    }
+    return result;
+}
+
 //
 // API for debugging
 //
@@ -4667,3 +4679,9 @@ std::map clip_get_mem_usage(const struct clip_ctx *
 void clip_set_debug_output_embeddings(clip_ctx * ctx, bool enable) {
     ctx->debug_output_embeddings = enable;
 }
+
+void clip_free_buffer(clip_ctx * ctx) {
+    if (ctx) {
+        ctx->buf.reset(); // Safely frees the underlying memory and unbinds the smart pointer
+    }
+}
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 967093a812d6..6a9f259432e0 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -97,8 +97,13 @@ int clip_model_n_temporal_merge(const struct clip_ctx * ctx);
 // TODO @ngxson :
 std::map clip_get_mem_usage(const struct clip_ctx * ctx);
 
+// return all tensors of the vision encoder model.
+// used by the mmproj swap pool to relocate them between GPU and host.
+std::vector<ggml_tensor *> clip_get_all_tensors(const struct clip_ctx * ctx);
+
 struct clip_cap {
     bool has_vision;
     bool has_audio;
 };
+MTMD_API void clip_free_buffer(struct clip_ctx * ctx);
 struct clip_cap clip_get_cap(const char * fname);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 724538b5857a..14fe90c23d91 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -2128,6 +2128,13 @@ void mtmd_debug_preprocess_audio(mtmd_context * ctx, const std::vector &
     }
 }
 
+// return all vision encoder tensors of the multimodal model.
+// used by the mmproj swap pool to relocate them between GPU and host.
+std::vector<ggml_tensor *> mtmd_get_vision_tensors(mtmd_context * mctx) {
+    if (!mctx || !mctx->ctx_v) return {};
+    return clip_get_all_tensors(mctx->ctx_v);
+}
+
 static void stub_log_callback(enum ggml_log_level, const char *, void *) {
     // do nothing
 }
@@ -2163,3 +2170,8 @@ std::map mtmd_get_memory_usage(const char * mmproj_f
         return {};
     }
 }
+void mtmd_free_vision_buffer(mtmd_context * mctx) {
+    if (mctx && mctx->ctx_v) {
+        clip_free_buffer(mctx->ctx_v);
+    }
+}
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 25d51ef58d41..c7cdaabeb054 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -341,7 +341,12 @@ MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
 MTMD_API std::map mtmd_get_memory_usage(
     const char * mmproj_fname,
     struct mtmd_context_params ctx_params);
+
+// return all vision encoder tensors of the multimodal model.
+// used by the mmproj swap pool to relocate them between GPU and host.
+MTMD_API std::vector mtmd_get_vision_tensors(mtmd_context * mctx); #endif +MTMD_API void mtmd_free_vision_buffer(mtmd_context * mctx); // // C++ wrappers diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 5c33a418f549..c4d3ebd8a3db 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -1,3 +1,15 @@ +// Must be defined before any include that may pull in windows.h (e.g. through +// ggml-backend.h / CUDA or clip-impl.h), otherwise winsock.h pollutes the +// namespace and cascades syntax errors later in the file. +#if defined(_WIN32) +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif +# ifndef NOMINMAX +# define NOMINMAX +# endif +#endif + #include "server-context.h" #include "server-chat.h" #include "server-common.h" @@ -16,6 +28,7 @@ #include "speculative.h" #include "mtmd.h" #include "mtmd-helper.h" +#include "llama_mmproj_pool.h" #include #include @@ -25,18 +38,18 @@ #include #include #include - +#include +#include // fix problem with std::min and std::max #if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include +#undef min +#undef max #endif using json = nlohmann::ordered_json; +using json = nlohmann::ordered_json; + constexpr int HTTP_POLLING_SECONDS = 1; static uint32_t server_n_outputs_max(const common_params & params) { @@ -169,6 +182,9 @@ struct server_slot { mtmd_context * mctx = nullptr; mtmd::batch_ptr mbatch = nullptr; + // mmproj swap pool: evicts LLM layers to host RAM during vision encode. 
+ struct llama_mmproj_pool * mmproj_pool = nullptr; + // speculative decoding common_speculative * spec; @@ -773,7 +789,29 @@ struct server_slot { // TODO @ngxson : move this log line to debug when it become more stable SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added); + // ── mmproj swap in/out RAII guard ── + struct MmprojSwapGuard { + llama_mmproj_pool * pool; + llama_context * ctx; + bool swapped; + MmprojSwapGuard(llama_mmproj_pool * p, llama_context * c) : pool(p), ctx(c), swapped(false) {} + bool swap_in() { + if (pool) swapped = llama_mmproj_pool_swap_in(pool, ctx); + return !pool || swapped; + } + ~MmprojSwapGuard() { + if (pool && swapped) llama_mmproj_pool_swap_back(pool, ctx); + } + } guard(mmproj_pool, ctx_tgt); + + if (!guard.swap_in()) { + SLT_ERR(*this, "%s", "mmproj swap_in failed; insufficient VRAM\n"); + return -1; + } + res = mtmd_batch_encode(mbatch.get()); + // guard destruction handles swap_back unconditionally, even if mtmd_batch_encode throws exception + if (res != 0) { SLT_ERR(*this, "failed to encode mtmd batch for chunk idx = %zu, res = %d\n", idx, res); return -1; @@ -861,6 +899,10 @@ struct server_context_impl { llama_model * model_tgt = nullptr; mtmd_context * mctx = nullptr; + + // mmproj swap pool: evicts LLM layers to host RAM when vision encoder runs. + struct llama_mmproj_pool * mmproj_pool = nullptr; + const llama_vocab * vocab = nullptr; server_queue queue_tasks; @@ -941,13 +983,18 @@ struct server_context_impl { ctx_dft.reset(); model_dft.reset(); + // pool must be freed before its owners: it holds pointers into + // the LLM model tensors and mmproj tensors. 
+ llama_mmproj_pool_free(mmproj_pool); + mmproj_pool = nullptr; + + mtmd_free(mctx); + mctx = nullptr; + llama_init.reset(); ctx_tgt = nullptr; model_tgt = nullptr; - - mtmd_free(mctx); - mctx = nullptr; } void handle_sleeping_state(bool new_state) { @@ -1051,10 +1098,11 @@ struct server_context_impl { mparams.progress_callback_user_data = &load_progress_mmproj; } - // optionally get the memory usage of mmproj - if (has_mmproj && params_base.fit_params) { + std::map mmproj_mem; + // get the memory usage of mmproj for fit_params OR auto swap calculation + if (has_mmproj && (params_base.fit_params || params_base.n_mmproj_swap < 0)) { int64_t t_start = ggml_time_us(); - auto mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams); + mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams); int64_t t_elapsed = ggml_time_us() - t_start; if (!mmproj_mem.empty()) { size_t total = 0; @@ -1062,15 +1110,18 @@ struct server_context_impl { total += size; } SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB (took %.2f ms)\n", total / (1024.0 * 1024.0), t_elapsed / 1000.0); - GGML_ASSERT(!params_base.fit_params_target.empty()); - for (auto & [dev, size] : mmproj_mem) { - for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - if (ggml_backend_dev_get(i) == dev) { - if (i < params_base.fit_params_target.size()) { - SRV_DBG("[mtmd] adding %.2f MiB to fit_params_target for device %s\n", size / (1024.0 * 1024.0), ggml_backend_dev_name(dev)); - params_base.fit_params_target[i] += size; + + if (params_base.fit_params) { + GGML_ASSERT(!params_base.fit_params_target.empty()); + for (auto & [dev, size] : mmproj_mem) { + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + if (ggml_backend_dev_get(i) == dev) { + if (i < params_base.fit_params_target.size()) { + SRV_DBG("[mtmd] adding %.2f MiB to fit_params_target for device %s\n", size / (1024.0 * 1024.0), ggml_backend_dev_name(dev)); + params_base.fit_params_target[i] += size; + } + break; } - 
break; } } } @@ -1277,6 +1328,61 @@ struct server_context_impl { params_base.n_cache_reuse = 0; SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled"); } + + // Resolve parameter conflicts: forcefully disable Swap if vision GPU acceleration is disabled + if (params_base.n_mmproj_swap != 0 && !params_base.mmproj_use_gpu) { + SRV_WRN("%s\n", "Conflict detected: --mmproj-swap-layers is ignored because --no-mmproj-offload is active."); + params_base.n_mmproj_swap = 0; // Forcefully disable Swap + } + + // ── mmproj swap pool: evict LLM layers to host RAM when vision runs ── + if (params_base.n_mmproj_swap != 0) { + if (mctx) { + auto mmproj_tensors = mtmd_get_vision_tensors(mctx); + + size_t mtmd_total_mem = 0; + if (!mmproj_mem.empty()) { + for (auto & [dev, size] : mmproj_mem) { + mtmd_total_mem += size; + } + } + + size_t mtmd_weight_mem = 0; + for (auto * t : mmproj_tensors) { + mtmd_weight_mem += ggml_nbytes(t); + } + + size_t mtmd_compute_overhead = 0; + if (mtmd_total_mem > mtmd_weight_mem) { + mtmd_compute_overhead = mtmd_total_mem - mtmd_weight_mem; + } + + // Note: llama.cpp pre-allocates the KV cache pool in the context, so image tokens' KV doesn't need to be added to the dynamic overhead + // Thus, dynamic overhead only accounts for the Compute Buffer required by the vision forward pass + size_t dynamic_overhead_bytes = mtmd_compute_overhead; + + // As a safeguard, if the calculated overhead is 0 (profiling failed), provide a fallback safety margin + if (dynamic_overhead_bytes == 0) { + dynamic_overhead_bytes = 300 * 1024 * 1024; + SRV_WRN("mmproj compute overhead detection failed or zero, using fallback %zu MB\n", dynamic_overhead_bytes / 1024 / 1024); + } else { + SRV_INF("mmproj dynamic overhead evaluated: Compute Buffer=%zu MB\n", mtmd_compute_overhead / 1024 / 1024); + } + + mmproj_pool = llama_mmproj_pool_init( + model_tgt, + params_base.n_mmproj_swap, + mmproj_tensors, + dynamic_overhead_bytes + ); + if (mmproj_pool) { + 
mtmd_free_vision_buffer(mctx); // Completely free the original VRAM of the vision model!` + SRV_INF("%s", "mmproj swap pool initialized, vision VRAM freed.\n"); + } else { + SRV_WRN("%s\n", "mmproj swap pool not created; vision will run without LLM layer eviction\n"); + } + } + } } if (!llama_memory_can_shift(llama_get_memory(ctx_tgt))) { @@ -1359,6 +1465,7 @@ struct server_context_impl { slot.n_ctx = n_ctx_slot; slot.mctx = mctx; + slot.mmproj_pool = mmproj_pool; slot.prompt.tokens.has_mtmd = mctx != nullptr; SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);