diff --git a/common/arg.cpp b/common/arg.cpp
index 841ca3ce2ec2..f3f6ed9a6ddb 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -29,6 +29,7 @@
 #include <cstdarg>
 #include <fstream>
 #include <list>
+#include <numeric>
 #include <regex>
 #include <set>
 #include <string>
@@ -2494,6 +2495,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
+    add_opt(common_arg(
+        {"--mmproj-swap-layers"}, "N",
+        "number of LLM layers to evict to host RAM when mmproj is active;\n"
+        "0 = disabled (default), -1 = auto-detect based on free VRAM;\n"
+        "requires CUDA backend and a loaded --mmproj model",
+        [](common_params & params, int value) {
+            params.n_mmproj_swap = value;
+        }
+    ).set_env("LLAMA_ARG_MMPROJ_SWAP_LAYERS"));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row,tensor}",
         "how to split the model across multiple GPUs, one of:\n"
diff --git a/common/common.h b/common/common.h
index 94147d5d8cf1..c1f942dbb6ba 100644
--- a/common/common.h
+++ b/common/common.h
@@ -590,6 +590,9 @@ struct common_params {
     int image_max_tokens = -1;
     int mtmd_batch_max_tokens = 1024;
 
+    // mmproj swap pool (see common/llama_mmproj_pool.h)
+    int n_mmproj_swap = 0; // --mmproj-swap-layers: LLM layers evicted per vision step
+
     // finetune
     struct lr_opt lr;
     enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
diff --git a/common/llama_mmproj_pool.cpp b/common/llama_mmproj_pool.cpp
new file mode 100644
index 000000000000..a0695b08c9f4
--- /dev/null
+++ b/common/llama_mmproj_pool.cpp
@@ -0,0 +1,315 @@
+#include "llama_mmproj_pool.h"
+#include "llama-impl.h"
+#include "../src/llama-model.h"
+#include <algorithm>
+#include <chrono>
+#include <thread>
+
+static double now_ms() {
+    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
+    return std::chrono::duration<double, std::milli>(since_epoch).count(); // fractional milliseconds
+}
+
+static size_t calc_aligned_size(const std::vector<ggml_tensor *> & tensors, size_t align = 256) {
+    size_t end_offset = 0; // packed size so far; every tensor starts on an `align` boundary
+    for (size_t i = 0; i < tensors.size(); ++i) {
+        const size_t aligned_start = (end_offset + align - 1) / align * align;
+        end_offset = aligned_start + ggml_nbytes(tensors[i]);
+    }
+    return end_offset;
+}
+
+// Gather the non-host tensors named `blk.<il>.*` of the last n_swap_layers layers, in ascending layer order.
+static std::vector<ggml_tensor *> collect_evicted_tensors(struct llama_model * model, int n_swap_layers) {
+    std::vector<ggml_tensor *> picked;
+    if (model == nullptr || n_swap_layers <= 0) return picked;
+
+    const int layer_count = llama_model_n_layer(model);
+    const auto & named_tensors = llama_internal_get_tensor_map(model);
+    for (int layer = std::max(0, layer_count - n_swap_layers); layer < layer_count; ++layer) {
+        const std::string layer_prefix = "blk." + std::to_string(layer) + ".";
+        for (const auto & entry : named_tensors) {
+            ggml_tensor * tensor = entry.second;
+            // keep only tensors of this layer that have a buffer and do not live in a host buffer type
+            if (!tensor || !tensor->buffer) continue;
+            if (entry.first.compare(0, layer_prefix.size(), layer_prefix) != 0) continue;
+            if (ggml_backend_buft_is_host(ggml_backend_buffer_get_type(tensor->buffer))) continue;
+            picked.push_back(tensor);
+        }
+    }
+    return picked;
+}
+
+// Build the swap pool that lets the mmproj (vision) weights time-share device memory with trailing LLM layers.
+//   model                  - LLM whose last `blk.N.*` non-host tensors become the eviction slots
+//   n_swap_layers          - layers to evict; < 0 picks the smallest tail covering weights + overhead (+5%)
+//   mmproj_tensors         - vision tensors; on success their data/buffer point at the pool's host copy
+//   dynamic_overhead_bytes - extra bytes added to the auto-mode eviction target
+// Returns nullptr, leaving mmproj_tensors untouched, when the pool is disabled or cannot be built.
+struct llama_mmproj_pool * llama_mmproj_pool_init(
+    struct llama_model         * model,
+    int                          n_swap_layers,
+    std::vector<ggml_tensor *> & mmproj_tensors,
+    size_t                       dynamic_overhead_bytes) {
+    if (!model || mmproj_tensors.empty()) return nullptr;
+
+    const size_t align = 256;
+    // Pack largest-first. The host staging size must be computed on this same order: total padding
+    // depends on which tensor comes last, so sizing from the caller's order could under-allocate.
+    std::vector<ggml_tensor *> sorted_mmproj = mmproj_tensors;
+    std::sort(sorted_mmproj.begin(), sorted_mmproj.end(), [](ggml_tensor * a, ggml_tensor * b) {
+        return ggml_nbytes(a) > ggml_nbytes(b);
+    });
+    const size_t mmproj_host_size = calc_aligned_size(sorted_mmproj, align);
+
+    // 1. Auto mode (< 0): grow the evicted tail one layer at a time until it covers the target.
+    if (n_swap_layers < 0) {
+        const int n_layer = llama_model_n_layer(model);
+        size_t accumulated_size = 0;
+        int calculated_layers = 0;
+        // Target eviction size = vision weights + compute buffer, plus a 5% margin for fragmentation.
+        const size_t target_eviction_size = static_cast<size_t>((mmproj_host_size + dynamic_overhead_bytes) * 1.05);
+
+        for (int il = n_layer - 1; il >= 0; --il) {
+            calculated_layers++;
+            auto evicted_tensors_tmp = collect_evicted_tensors(model, calculated_layers);
+            accumulated_size = 0;
+            for (auto * t : evicted_tensors_tmp) {
+                accumulated_size += ggml_nbytes(t);
+            }
+            if (accumulated_size >= target_eviction_size) {
+                break;
+            }
+        }
+        n_swap_layers = calculated_layers;
+        LLAMA_LOG_INFO("%s: auto mode: need %.0f MB (Weights) + %.0f MB (Overhead) for mmproj; will evict %d layers (target eviction %.0f MB)\n",
+                    __func__, mmproj_host_size / 1e6, dynamic_overhead_bytes / 1e6, n_swap_layers, target_eviction_size / 1e6);
+    }
+
+    if (n_swap_layers <= 0) return nullptr;
+
+    auto * pool = new llama_mmproj_pool();
+    pool->evicted_tensors = collect_evicted_tensors(model, n_swap_layers);
+    if (pool->evicted_tensors.empty()) {
+        delete pool;
+        return nullptr;
+    }
+
+    // Pick the first non-CPU device so the staging buffer can use that device's host buffer type.
+    ggml_backend_dev_t dev = nullptr;
+    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+        ggml_backend_dev_t d = ggml_backend_dev_get(i);
+        if (ggml_backend_dev_type(d) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+            dev = d;
+            break;
+        }
+    }
+
+    // Host layout: [evicted LLM tensors, tightly packed][mmproj tensors, each aligned to `align`].
+    size_t evicted_total_bytes = 0;
+    for (auto * t : pool->evicted_tensors) {
+        pool->evicted_offsets.push_back(evicted_total_bytes);
+        evicted_total_bytes += ggml_nbytes(t);
+    }
+
+    pool->host_buf_size = evicted_total_bytes + mmproj_host_size;
+    ggml_backend_buffer_type_t host_buft = dev ? ggml_backend_dev_host_buffer_type(dev) : nullptr;
+    if (!host_buft) host_buft = ggml_backend_cpu_buffer_type();
+
+    pool->host_buf = ggml_backend_buft_alloc_buffer(host_buft, pool->host_buf_size);
+    if (!pool->host_buf) {
+        delete pool;
+        return nullptr;
+    }
+    pool->host_ptr = ggml_backend_buffer_get_base(pool->host_buf);
+    char * host_mm = (char *)pool->host_ptr + evicted_total_bytes;
+
+    // 2. First-fit-decreasing bin packing: each evicted LLM tensor is a bin that holds vision tensors.
+    struct Block {
+        ggml_tensor * t;
+        size_t used;
+        size_t cap;
+    };
+    std::vector<Block> blocks;
+    for (auto * t : pool->evicted_tensors) {
+        blocks.push_back({t, 0, ggml_nbytes(t)});
+    }
+    std::sort(blocks.begin(), blocks.end(), [](const Block & a, const Block & b) { return a.cap > b.cap; });
+
+    bool packing_failed = false;
+    size_t current_host_offset = 0;
+
+    for (ggml_tensor * vt : sorted_mmproj) {
+        size_t vsize = ggml_nbytes(vt);
+        current_host_offset = (current_host_offset + align - 1) / align * align;
+        char * host_data = host_mm + current_host_offset;
+        if (vt->data) {
+            ggml_backend_tensor_get(vt, host_data, 0, vsize); // Backup vision model to host
+        }
+        current_host_offset += vsize;
+        bool placed = false;
+        for (auto & b : blocks) {
+            size_t offset = (b.used + align - 1) / align * align;
+            if (offset + vsize <= b.cap) {
+                b.used = offset + vsize;
+                char * gpu_data = (char *)b.t->data + offset;
+                pool->mappings.push_back({vt, gpu_data, b.t->buffer, host_data, vsize});
+                placed = true;
+                break;
+            }
+        }
+        if (!placed) {
+            packing_failed = true;
+            break;
+        }
+    }
+
+    if (packing_failed) {
+        LLAMA_LOG_ERROR("%s: Fragmentation prevents packing mmproj tensors. Increase --mmproj-swap-layers.\n", __func__);
+        // The vision tensors still reference their original storage; drop the mappings first so
+        // llama_mmproj_pool_free() does not null out their data/buffer pointers.
+        pool->mappings.clear();
+        llama_mmproj_pool_free(pool);
+        return nullptr;
+    }
+
+    // Redirect pointers, ready for execution
+    for (const auto & m : pool->mappings) {
+        m.vision_t->data   = m.host_data;
+        m.vision_t->buffer = pool->host_buf;
+    }
+
+    pool->state = llama_pool_state::LLM_RESIDENT;
+    LLAMA_LOG_INFO("%s: pool ready | %zu evicted (%.0f MB) | packed %zu mmproj (%.0f MB) | host_buft: %s\n",
+                   __func__, pool->evicted_tensors.size(), evicted_total_bytes / 1e6,
+                   pool->mappings.size(), mmproj_host_size / 1e6, ggml_backend_buft_name(host_buft));
+    return pool;
+}
+
+
+
+
+// Helper: map a packed device address back to the index of the evicted LLM tensor whose byte range [data, data + nbytes) holds it
+static int find_evicted_idx(void * gpu_data, const std::vector<ggml_tensor*> & ev_tensors) {
+    const char * addr = (const char *)gpu_data;
+    int idx = 0;
+    for (const ggml_tensor * ev : ev_tensors) {
+        const char * lo = (const char *)ev->data;
+        const char * hi = lo + ggml_nbytes(ev);
+        if (lo <= addr && addr < hi) return idx;
+        ++idx;
+    }
+    return -1; // address is not inside any evicted tensor
+}
+
+// Make the vision weights device-resident. For each evicted LLM tensor, in order: copy it to the host
+// staging buffer, then upload the vision tensors mapped into its memory from a background thread.
+// Returns true when the pool is (or already was) MMPROJ_RESIDENT; false for a null, DISABLED or CORRUPTED pool.
+bool llama_mmproj_pool_swap_in(struct llama_mmproj_pool * pool, struct llama_context * ctx) {
+    if (!pool) return false;
+    std::lock_guard<std::mutex> guard(pool->mutex);
+    if (pool->state == llama_pool_state::MMPROJ_RESIDENT) return true;
+    if (pool->state == llama_pool_state::DISABLED || pool->state == llama_pool_state::CORRUPTED) return false;
+
+    if (ctx) llama_synchronize(ctx);
+    double t0 = now_ms();
+    pool->state = llama_pool_state::SWAPPING_OUT;
+
+    char * host_llm = (char *)pool->host_ptr;
+
+    // 3. Use pipelining strategy to achieve PCIe full-duplex parallelism, completely preventing VRAM read/write pollution
+    // First group vision tensors by the evicted LLM tensor they occupy (mappings that match no evicted tensor are skipped)
+    std::vector<std::vector<llama_mmproj_pool::tensor_mapping>> grouped_mappings(pool->evicted_tensors.size());
+    for (const auto & m : pool->mappings) {
+        int idx = find_evicted_idx(m.gpu_data, pool->evicted_tensors);
+        if (idx >= 0) {
+            grouped_mappings[idx].push_back(m);
+        }
+    }
+
+    std::thread prev_load_thread;
+
+    for (size_t i = 0; i < pool->evicted_tensors.size(); ++i) {
+        // Step A: Read the LLM weights of the current tensor back to host (Device-to-Host)
+        // This copy runs on the calling thread, before the upload thread for the same tensor is started
+        ggml_backend_tensor_get(
+            pool->evicted_tensors[i],
+            host_llm + pool->evicted_offsets[i],
+            0,
+            ggml_nbytes(pool->evicted_tensors[i])
+        );
+
+        // Wait for the previous block's asynchronous write (H2D) to complete, preventing thread backlog
+        if (prev_load_thread.joinable()) {
+            prev_load_thread.join();
+        }
+
+        // Step B: Since the current tensor (i-th) has been safely moved to host, its VRAM space can now be safely overwritten
+        // Launch a background thread to write the corresponding vision tensors to that VRAM (Host-to-Device)
+        // Key advantage: when the loop next executes D2H for tensor i+1, it can run in full-duplex parallel with this H2D!
+        prev_load_thread = std::thread([pool, i, &grouped_mappings]() {
+            for (const auto & m : grouped_mappings[i]) {
+                m.vision_t->data   = m.gpu_data;
+                m.vision_t->buffer = m.gpu_buffer;
+                ggml_backend_tensor_set(m.vision_t, m.host_data, 0, m.size); // Push to VRAM
+            }
+        });
+    }
+
+    // After the loop, join the last upload thread; joining here also keeps the by-reference capture of grouped_mappings valid
+    if (prev_load_thread.joinable()) {
+        prev_load_thread.join();
+    }
+
+    pool->state = llama_pool_state::MMPROJ_RESIDENT;
+
+    if (ctx) llama_synchronize(ctx);
+    pool->total_swap_ms += (now_ms() - t0);
+    ++pool->n_swaps;
+    return true;
+}
+
+
+
+// Undo llama_mmproj_pool_swap_in(): re-point the vision tensors at their host copies, then re-upload the evicted LLM tensors.
+void llama_mmproj_pool_swap_back(struct llama_mmproj_pool * pool, struct llama_context * ctx) {
+    if (!pool) return;
+    std::lock_guard<std::mutex> guard(pool->mutex);
+    if (pool->state != llama_pool_state::MMPROJ_RESIDENT) return; // nothing to undo unless swap_in completed
+    if (ctx) llama_synchronize(ctx);
+    pool->state = llama_pool_state::SWAPPING_IN;
+
+    // Vision -> Host: the host copies written at init are only read by swap_in, so re-pointing is enough
+    for (const auto & mapping : pool->mappings) {
+        mapping.vision_t->data   = mapping.host_data;
+        mapping.vision_t->buffer = pool->host_buf;
+    }
+
+    // LLM -> GPU: overwrite the borrowed device memory with the weights saved by swap_in
+    const char * saved_llm = (const char *)pool->host_ptr;
+    size_t slot = 0;
+    for (ggml_tensor * ev : pool->evicted_tensors) {
+        ggml_backend_tensor_set(ev, saved_llm + pool->evicted_offsets[slot++], 0, ggml_nbytes(ev));
+    }
+    if (ctx) llama_synchronize(ctx);
+    pool->state = llama_pool_state::LLM_RESIDENT;
+}
+
+// Release the pool: detach every mapped vision tensor, free the host buffer, then delete the pool itself.
+void llama_mmproj_pool_free(struct llama_mmproj_pool * pool) {
+    if (!pool) return;
+    for (const auto & mapping : pool->mappings) {
+        if (!mapping.vision_t) continue;
+        mapping.vision_t->data   = nullptr;
+        mapping.vision_t->buffer = nullptr;
+    }
+    if (pool->host_buf) ggml_backend_buffer_free(pool->host_buf);
+    delete pool;
+}
+
+void llama_mmproj_pool_log_stats(const struct llama_mmproj_pool * pool) {
+    if (!pool) return;
+    const long long n_swaps = (long long)pool->n_swaps;
+    const double avg_swap_ms = pool->n_swaps > 0 ? pool->total_swap_ms / pool->n_swaps : 0.0; // no division when nothing was swapped
+    LLAMA_LOG_INFO("mmproj pool stats: n_swaps=%lld, avg_swap_ms=%.1f\n", n_swaps, avg_swap_ms);
+}
diff --git a/common/llama_mmproj_pool.h b/common/llama_mmproj_pool.h
new file mode 100644
index 000000000000..4022c046c56f
--- /dev/null
+++ b/common/llama_mmproj_pool.h
@@ -0,0 +1,61 @@
+#pragma once
+
+#if defined(_WIN32)
+#  ifndef WIN32_LEAN_AND_MEAN
+#    define WIN32_LEAN_AND_MEAN
+#  endif
+#  ifndef NOMINMAX
+#    define NOMINMAX
+#  endif
+#endif
+
+#include "ggml-backend.h"
+#include "llama.h"
+#include <atomic>
+#include <mutex>
+#include <vector>
+
+enum class llama_pool_state : uint8_t {
+    LLM_RESIDENT    = 0, // set when pool init succeeds and when swap_back finishes
+    SWAPPING_OUT    = 1, // set while swap_in is running
+    MMPROJ_RESIDENT = 2, // set when swap_in finishes; the only state in which swap_back does work
+    SWAPPING_IN     = 3, // set while swap_back is running
+    CORRUPTED       = 4, // makes swap_in return false
+    DISABLED        = 5, // initial state of a new pool; makes swap_in return false
+};
+
+struct llama_mmproj_pool {
+    ggml_backend_buffer_t host_buf      = nullptr; // host staging buffer; released by llama_mmproj_pool_free()
+    void *                host_ptr      = nullptr; // base address of host_buf
+    size_t                host_buf_size = 0;       // evicted LLM bytes + aligned mmproj bytes
+
+    std::vector<ggml_tensor *> evicted_tensors; // LLM tensors whose device memory is lent to the vision weights
+    std::vector<size_t>        evicted_offsets; // evicted_offsets[i]: byte offset of evicted_tensors[i] in host_buf
+
+    struct tensor_mapping {
+        ggml_tensor          * vision_t;   // vision tensor being relocated
+        void                 * gpu_data;   // its slot inside an evicted LLM tensor's memory
+        ggml_backend_buffer_t  gpu_buffer; // buffer of that evicted LLM tensor
+        void                 * host_data;  // its copy inside host_buf
+        size_t                 size;       // byte size of vision_t (ggml_nbytes)
+    };
+    std::vector<tensor_mapping> mappings; // one entry per packed vision tensor
+
+    std::atomic<llama_pool_state> state { llama_pool_state::DISABLED };
+    std::mutex                    mutex; // held for the whole of swap_in / swap_back
+
+    int64_t n_swaps       = 0;   // number of completed swap_in calls
+    double  total_swap_ms = 0.0; // accumulated swap_in duration, in milliseconds
+};
+
+
+struct llama_mmproj_pool * llama_mmproj_pool_init(
+        struct llama_model         * model,
+        int                          n_swap_layers,
+        std::vector<ggml_tensor *> & mmproj_tensors,
+        size_t                       dynamic_overhead_bytes);
+
+bool llama_mmproj_pool_swap_in(struct llama_mmproj_pool * pool, struct llama_context * ctx);
+void llama_mmproj_pool_swap_back(struct llama_mmproj_pool * pool, struct llama_context * ctx);
+void llama_mmproj_pool_free(struct llama_mmproj_pool * pool);
+void llama_mmproj_pool_log_stats(const struct llama_mmproj_pool * pool);
diff --git a/src/llama-impl.h b/src/llama-impl.h
index 7923c3f7ed55..2b671cbe4b97 100644
--- a/src/llama-impl.h
+++ b/src/llama-impl.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "ggml.h" // for ggml_log_level
+#include "llama.h" // for LLAMA_API, ggml_log_level
 
 #include <string>
 #include <type_traits>
@@ -21,7 +21,7 @@
 //
 
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
-void llama_log_internal        (ggml_log_level level, const char * format, ...);
+LLAMA_API void llama_log_internal        (ggml_log_level level, const char * format, ...);
 void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 
 #define LLAMA_LOG(...)       llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
diff --git a/src/llama-model.h b/src/llama-model.h
index 77d8d3b6258a..5dbd30872716 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -733,4 +733,4 @@ const char * llm_type_name(llm_type type);
 
 // For internal test use
 // TODO: remove
-const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
+LLAMA_API const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index d2226b3be1d7..dd13047c35cd 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -4134,7 +4134,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32
                     //   144 for 768 tile views
                     const int   num_image_tokens = num_patches / 16;
                     const int   seq_len          = num_image_tokens * 2;
-                    std::vector qwen2_mask(static_cast<size_t>(seq_len) * seq_len, 0.0f);
+                    std::vector<float> qwen2_mask(static_cast<size_t>(seq_len) * seq_len, 0.0f);
 
                     // attention mask layout
                     //  +--------------+---------------+
@@ -4660,6 +4660,18 @@ std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx *
     return result;
 }
 
+std::vector<ggml_tensor *> clip_get_all_tensors(const struct clip_ctx * ctx) {
+    std::vector<ggml_tensor *> tensors;
+    // iterate every tensor registered in ctx->ctx_data; a null ctx or context yields an empty list
+    auto * data_ctx = ctx ? ctx->ctx_data.get() : nullptr;
+    for (ggml_tensor * cur = data_ctx ? ggml_get_first_tensor(data_ctx) : nullptr;
+         cur != nullptr;
+         cur = ggml_get_next_tensor(data_ctx, cur)) {
+        tensors.push_back(cur);
+    }
+    return tensors;
+}
+
 //
 // API for debugging
 //
@@ -4667,3 +4679,9 @@ std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx *
 void clip_set_debug_output_embeddings(clip_ctx * ctx, bool enable) {
     ctx->debug_output_embeddings = enable;
 }
+
+void clip_free_buffer(clip_ctx * ctx) {
+    if (!ctx) {
+        return;
+    }
+    ctx->buf.reset(); // releases whatever ctx->buf currently holds and leaves it empty
+}
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 967093a812d6..6a9f259432e0 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -97,8 +97,13 @@ int clip_model_n_temporal_merge(const struct clip_ctx * ctx); // TODO @ngxson :
 
 std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
 
+// return all tensors of the vision encoder model.
+// used by the mmproj swap pool to relocate them between GPU and host.
+std::vector<ggml_tensor *> clip_get_all_tensors(const struct clip_ctx * ctx);
+
 struct clip_cap {
     bool has_vision;
     bool has_audio;
 };
+MTMD_API void clip_free_buffer(struct clip_ctx * ctx);
 struct clip_cap clip_get_cap(const char * fname);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 724538b5857a..14fe90c23d91 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -2128,6 +2128,13 @@ void mtmd_debug_preprocess_audio(mtmd_context * ctx, const std::vector<float> &
     }
 }
 
+// list every tensor of the vision encoder (ctx_v) of the multimodal context; empty when there is none.
+// the mmproj swap pool uses this list to relocate the weights between GPU and host.
+std::vector<ggml_tensor *> mtmd_get_vision_tensors(mtmd_context * mctx) {
+    const bool has_vision = mctx != nullptr && mctx->ctx_v != nullptr;
+    return has_vision ? clip_get_all_tensors(mctx->ctx_v) : std::vector<ggml_tensor *>{};
+}
+
 static void stub_log_callback(enum ggml_log_level, const char *, void *) {
     // do nothing
 }
@@ -2163,3 +2170,8 @@ std::map<ggml_backend_dev_t, size_t> mtmd_get_memory_usage(const char * mmproj_f
         return {};
     }
 }
+void mtmd_free_vision_buffer(mtmd_context * mctx) {
+    if (!mctx || !mctx->ctx_v) {
+        return; // no vision encoder, nothing to release
+    }
+    clip_free_buffer(mctx->ctx_v);
+}
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 25d51ef58d41..c7cdaabeb054 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -341,7 +341,12 @@ MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
 MTMD_API std::map<ggml_backend_dev_t, size_t> mtmd_get_memory_usage(
     const char * mmproj_fname,
     struct mtmd_context_params ctx_params);
+
+// return all vision encoder tensors of the multimodal model.
+// used by the mmproj swap pool to relocate them between GPU and host.
+MTMD_API std::vector<ggml_tensor *> mtmd_get_vision_tensors(mtmd_context * mctx);
 #endif
+MTMD_API void mtmd_free_vision_buffer(mtmd_context * mctx);
 
 //
 // C++ wrappers
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 5c33a418f549..c4d3ebd8a3db 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1,3 +1,15 @@
+// Must be defined before any include that may pull in windows.h (e.g. through
+// ggml-backend.h / CUDA or clip-impl.h), otherwise winsock.h pollutes the
+// namespace and cascades syntax errors later in the file.
+#if defined(_WIN32)
+#  ifndef WIN32_LEAN_AND_MEAN
+#    define WIN32_LEAN_AND_MEAN
+#  endif
+#  ifndef NOMINMAX
+#    define NOMINMAX
+#  endif
+#endif
+
 #include "server-context.h"
 #include "server-chat.h"
 #include "server-common.h"
@@ -16,6 +28,7 @@
 #include "speculative.h"
 #include "mtmd.h"
 #include "mtmd-helper.h"
+#include "llama_mmproj_pool.h"
 
 #include <algorithm>
 #include <cstddef>
@@ -25,18 +38,18 @@
 #include <filesystem>
 #include <utility>
 #include <fstream>
-
+#include <iomanip>
+#include <ctime>
 // fix problem with std::min and std::max
 #if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#   define NOMINMAX
-#endif
-#include <windows.h>
+#undef min
+#undef max
 #endif
 
 using json = nlohmann::ordered_json;
 
+using json = nlohmann::ordered_json;
+
 constexpr int HTTP_POLLING_SECONDS = 1;
 
 static uint32_t server_n_outputs_max(const common_params & params) {
@@ -169,6 +182,9 @@ struct server_slot {
     mtmd_context * mctx = nullptr;
     mtmd::batch_ptr mbatch = nullptr;
 
+    // mmproj swap pool: evicts LLM layers to host RAM during vision encode.
+    struct llama_mmproj_pool * mmproj_pool = nullptr;
+
     // speculative decoding
     common_speculative * spec;
 
@@ -773,7 +789,29 @@ struct server_slot {
         // TODO @ngxson : move this log line to debug when it become more stable
         SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added);
 
+        // ── mmproj swap in/out RAII guard ──
+        struct MmprojSwapGuard {
+            llama_mmproj_pool * pool; // may be null: swap_in() then succeeds without doing anything
+            llama_context * ctx;      // forwarded to the pool swap functions
+            bool swapped;             // result of the last llama_mmproj_pool_swap_in() call
+            MmprojSwapGuard(llama_mmproj_pool * p, llama_context * c) : pool(p), ctx(c), swapped(false) {}
+            bool swap_in() { // true when there is no pool, or when the pool swap succeeded
+                if (pool) swapped = llama_mmproj_pool_swap_in(pool, ctx);
+                return !pool || swapped;
+            }
+            ~MmprojSwapGuard() { // swaps back only if swap_in() succeeded on this guard
+                if (pool && swapped) llama_mmproj_pool_swap_back(pool, ctx);
+            }
+        } guard(mmproj_pool, ctx_tgt);
+
+        if (!guard.swap_in()) {
+            SLT_ERR(*this, "%s", "mmproj swap_in failed; insufficient VRAM\n");
+            return -1;
+        }
+
         res = mtmd_batch_encode(mbatch.get());
+        // the guard's destructor runs swap_back whenever swap_in succeeded, even if mtmd_batch_encode throws
+
         if (res != 0) {
             SLT_ERR(*this, "failed to encode mtmd batch for chunk idx = %zu, res = %d\n", idx, res);
             return -1;
@@ -861,6 +899,10 @@ struct server_context_impl {
     llama_model * model_tgt = nullptr;
 
     mtmd_context * mctx = nullptr;
+
+    // mmproj swap pool: evicts LLM layers to host RAM when vision encoder runs.
+    struct llama_mmproj_pool * mmproj_pool = nullptr;
+
     const llama_vocab * vocab = nullptr;
 
     server_queue    queue_tasks;
@@ -941,13 +983,18 @@ struct server_context_impl {
         ctx_dft.reset();
         model_dft.reset();
 
+        // pool must be freed before its owners: it holds pointers into
+        // the LLM model tensors and mmproj tensors.
+        llama_mmproj_pool_free(mmproj_pool);
+        mmproj_pool = nullptr;
+
+        mtmd_free(mctx);
+        mctx = nullptr;
+
         llama_init.reset();
 
         ctx_tgt = nullptr;
         model_tgt = nullptr;
-
-        mtmd_free(mctx);
-        mctx = nullptr;
     }
 
     void handle_sleeping_state(bool new_state) {
@@ -1051,10 +1098,11 @@ struct server_context_impl {
             mparams.progress_callback_user_data = &load_progress_mmproj;
         }
 
-        // optionally get the memory usage of mmproj
-        if (has_mmproj && params_base.fit_params) {
+        std::map<ggml_backend_dev_t, size_t> mmproj_mem;
+        // get the memory usage of mmproj for fit_params OR auto swap calculation
+        if (has_mmproj && (params_base.fit_params || params_base.n_mmproj_swap < 0)) {
             int64_t t_start = ggml_time_us();
-            auto mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams);
+            mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams);
             int64_t t_elapsed = ggml_time_us() - t_start;
             if (!mmproj_mem.empty()) {
                 size_t total = 0;
@@ -1062,15 +1110,18 @@ struct server_context_impl {
                     total += size;
                 }
                 SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB (took %.2f ms)\n", total / (1024.0 * 1024.0), t_elapsed / 1000.0);
-                GGML_ASSERT(!params_base.fit_params_target.empty());
-                for (auto & [dev, size] : mmproj_mem) {
-                    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
-                        if (ggml_backend_dev_get(i) == dev) {
-                            if (i < params_base.fit_params_target.size()) {
-                                SRV_DBG("[mtmd] adding %.2f MiB to fit_params_target for device %s\n", size / (1024.0 * 1024.0), ggml_backend_dev_name(dev));
-                                params_base.fit_params_target[i] += size;
+                
+                if (params_base.fit_params) {
+                    GGML_ASSERT(!params_base.fit_params_target.empty());
+                    for (auto & [dev, size] : mmproj_mem) {
+                        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+                            if (ggml_backend_dev_get(i) == dev) {
+                                if (i < params_base.fit_params_target.size()) {
+                                    SRV_DBG("[mtmd] adding %.2f MiB to fit_params_target for device %s\n", size / (1024.0 * 1024.0), ggml_backend_dev_name(dev));
+                                    params_base.fit_params_target[i] += size;
+                                }
+                                break;
                             }
-                            break;
                         }
                     }
                 }
@@ -1277,6 +1328,61 @@ struct server_context_impl {
                 params_base.n_cache_reuse = 0;
                 SRV_WRN("%s\n", "cache_reuse is not supported by multimodal, it will be disabled");
             }
+
+            // Resolve parameter conflicts: forcefully disable Swap if vision GPU acceleration is disabled
+            if (params_base.n_mmproj_swap != 0 && !params_base.mmproj_use_gpu) {
+                SRV_WRN("%s\n", "Conflict detected: --mmproj-swap-layers is ignored because --no-mmproj-offload is active.");
+                params_base.n_mmproj_swap = 0; // Forcefully disable Swap
+            }
+
+            // ── mmproj swap pool: evict LLM layers to host RAM when vision runs ──
+            if (params_base.n_mmproj_swap != 0) {
+                if (mctx) {
+                    auto mmproj_tensors = mtmd_get_vision_tensors(mctx);
+                    
+                    size_t mtmd_total_mem = 0;
+                    if (!mmproj_mem.empty()) {
+                        for (auto & [dev, size] : mmproj_mem) {
+                            mtmd_total_mem += size;
+                        }
+                    }
+
+                    size_t mtmd_weight_mem = 0;
+                    for (auto * t : mmproj_tensors) {
+                        mtmd_weight_mem += ggml_nbytes(t);
+                    }
+
+                    size_t mtmd_compute_overhead = 0;
+                    if (mtmd_total_mem > mtmd_weight_mem) {
+                        mtmd_compute_overhead = mtmd_total_mem - mtmd_weight_mem;
+                    }
+
+                    // Note: llama.cpp pre-allocates the KV cache pool in the context, so image tokens' KV doesn't need to be added to the dynamic overhead
+                    //  Thus, dynamic overhead only accounts for the Compute Buffer required by the vision forward pass
+                    size_t dynamic_overhead_bytes = mtmd_compute_overhead;
+
+                    // As a safeguard, if the calculated overhead is 0 (profiling failed), provide a fallback safety margin
+                    if (dynamic_overhead_bytes == 0) {
+                        dynamic_overhead_bytes = 300 * 1024 * 1024;
+                        SRV_WRN("mmproj compute overhead detection failed or zero, using fallback %zu MB\n", dynamic_overhead_bytes / 1024 / 1024);
+                    } else {
+                        SRV_INF("mmproj dynamic overhead evaluated: Compute Buffer=%zu MB\n", mtmd_compute_overhead / 1024 / 1024);
+                    }
+
+                    mmproj_pool = llama_mmproj_pool_init(
+                        model_tgt,
+                        params_base.n_mmproj_swap,
+                        mmproj_tensors,
+                        dynamic_overhead_bytes
+                    );
+                    if (mmproj_pool) {
+                        mtmd_free_vision_buffer(mctx); // Completely free the original VRAM of the vision model!`
+                        SRV_INF("%s", "mmproj swap pool initialized, vision VRAM freed.\n");
+                    } else {
+                        SRV_WRN("%s\n", "mmproj swap pool not created; vision will run without LLM layer eviction\n");
+                    }
+                }
+            }
         }
 
         if (!llama_memory_can_shift(llama_get_memory(ctx_tgt))) {
@@ -1359,6 +1465,7 @@ struct server_context_impl {
             slot.n_ctx   = n_ctx_slot;
 
             slot.mctx                   = mctx;
+            slot.mmproj_pool           = mmproj_pool;
             slot.prompt.tokens.has_mtmd = mctx != nullptr;
 
             SLT_INF(slot, "new slot, n_ctx = %d\n", slot.n_ctx);