Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <cstdarg>
#include <fstream>
#include <list>
#include <numeric>
#include <regex>
#include <set>
#include <string>
Expand Down Expand Up @@ -2494,6 +2495,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
}
).set_env("LLAMA_ARG_N_GPU_LAYERS"));
add_opt(common_arg(
{"--mmproj-swap-layers"}, "N",
"number of LLM layers to evict to host RAM when mmproj is active;\n"
"0 = disabled (default), -1 = auto-detect based on free VRAM;\n"
"requires CUDA backend and a loaded --mmproj model",
[](common_params & params, int value) {
params.n_mmproj_swap = value;
}
).set_env("LLAMA_ARG_MMPROJ_SWAP_LAYERS"));
add_opt(common_arg(
{"-sm", "--split-mode"}, "{none,layer,row,tensor}",
"how to split the model across multiple GPUs, one of:\n"
Expand Down
3 changes: 3 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,9 @@ struct common_params {
int image_max_tokens = -1;
int mtmd_batch_max_tokens = 1024;

// mmproj swap pool (see common/llama_mmproj_pool.h)
int n_mmproj_swap = 0; // --mmproj-swap-layers: LLM layers evicted per vision step

// finetune
struct lr_opt lr;
enum ggml_opt_optimizer_type optimizer = GGML_OPT_OPTIMIZER_TYPE_ADAMW;
Expand Down
315 changes: 315 additions & 0 deletions common/llama_mmproj_pool.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,315 @@
#include "llama_mmproj_pool.h"
#include "llama-impl.h"
#include "../src/llama-model.h"
#include <algorithm>
#include <chrono>
#include <thread>

// Returns the current std::chrono::steady_clock reading, expressed as fractional
// milliseconds since that clock's epoch. Only differences between two readings are
// meaningful; the value is used to time swap operations.
static double now_ms() {
    const auto since_epoch = std::chrono::steady_clock::now().time_since_epoch();
    const std::chrono::duration<double, std::milli> as_ms(since_epoch);
    return as_ms.count();
}

// Computes how many bytes are needed to store `tensors` back to back, in the order given,
// when every tensor has to begin on a multiple of `align` bytes.
//
// The running end position is rounded up to the next multiple of `align` before each tensor
// is appended; no padding is added after the last one. Because the padding depends on where
// each tensor ends, the result depends on the order of `tensors`.
//
// tensors: tensors to lay out (each is measured with ggml_nbytes).
// align:   start alignment in bytes; must be non-zero.
// returns: end offset of the last tensor, i.e. the required size in bytes (0 for no tensors).
static size_t calc_aligned_size(const std::vector<ggml_tensor *> & tensors, size_t align = 256) {
    size_t end_offset = 0;
    for (size_t i = 0; i < tensors.size(); ++i) {
        const size_t start_offset = ((end_offset + align - 1) / align) * align;
        end_offset = start_offset + ggml_nbytes(tensors[i]);
    }
    return end_offset;
}

static std::vector<ggml_tensor *> collect_evicted_tensors(struct llama_model * model, int n_swap_layers) {
if (!model || n_swap_layers <= 0) return {};
const int n_layer = llama_model_n_layer(model);
const int first = std::max(0, n_layer - n_swap_layers);
std::vector<ggml_tensor *> result;
const auto & tensor_map = llama_internal_get_tensor_map(model);

for (int il = first; il < n_layer; ++il) {
const std::string prefix = "blk." + std::to_string(il) + ".";
for (auto & [name, t] : tensor_map) {
if (t && name.rfind(prefix, 0) == 0) {
if (!t->buffer) continue;
ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(t->buffer);
if (ggml_backend_buft_is_host(buft)) continue;
result.push_back(t);
}
}
}
return result;
}

// Creates the swap pool that lets the mmproj (vision) tensors time-share device memory with
// the tensors of the last LLM layers.
//
// Steps:
//   1. Resolve the number of layers to evict (a negative n_swap_layers means "auto": take
//      trailing layers until their combined size reaches 1.05 x (staged mmproj size +
//      dynamic_overhead_bytes), or all layers if that is never reached).
//   2. Allocate one host buffer: [ staging area for evicted LLM tensors | mmproj backup ].
//   3. Copy every mmproj tensor into the host buffer and reserve a slot for it inside one of
//      the evicted LLM tensors (first-fit, largest first).
//   4. Point the mmproj tensors at their host copies; the pool starts in LLM_RESIDENT state.
//
// model:                  model whose trailing layers are evicted; nullptr returns nullptr.
// n_swap_layers:          layers to evict; 0 disables the pool, negative selects auto mode.
// mmproj_tensors:         vision tensors to manage; their data/buffer fields are rewritten
//                         on success and left untouched on failure.
// dynamic_overhead_bytes: extra bytes added to the auto-mode eviction target.
// returns: a pool to be released with llama_mmproj_pool_free, or nullptr when the pool is
//          disabled, nothing can be evicted, the host allocation fails, or packing fails.
//
// NOTE(review): the device used for the (pinned) host buffer type is simply the first
// non-CPU device, not necessarily the one that owns the evicted tensors - confirm this is
// intended for multi-GPU setups.
struct llama_mmproj_pool * llama_mmproj_pool_init(
        struct llama_model * model,
        int n_swap_layers,
        std::vector<ggml_tensor *> & mmproj_tensors,
        size_t dynamic_overhead_bytes) {

    // Without a model there is nothing to evict; bail out before the auto path queries it.
    if (!model || mmproj_tensors.empty()) return nullptr;

    const size_t align = 256;

    // The host backup below is written largest-first, so the region must be sized for that
    // exact order: the aligned layout size depends on tensor order (e.g. 256 B then 300 B
    // needs 556 B, but 300 B then 256 B needs 768 B), and sizing it from the caller's order
    // could make the backup run past the end of the host buffer.
    std::vector<ggml_tensor *> sorted_mmproj = mmproj_tensors;
    std::sort(sorted_mmproj.begin(), sorted_mmproj.end(), [](ggml_tensor * a, ggml_tensor * b) {
        return ggml_nbytes(a) > ggml_nbytes(b);
    });
    const size_t mmproj_host_size = calc_aligned_size(sorted_mmproj, align);

    // 1. Auto mode (negative value): derive the layer count from the mmproj footprint.
    if (n_swap_layers < 0) {
        const int n_layer = llama_model_n_layer(model);

        // Target eviction size = vision weights + compute buffer overhead,
        // plus a 5% safety margin for VRAM fragmentation.
        const size_t target_eviction_size =
            static_cast<size_t>((mmproj_host_size + dynamic_overhead_bytes) * 1.05);

        int calculated_layers = 0;
        while (calculated_layers < n_layer) {
            ++calculated_layers;

            size_t accumulated_size = 0;
            for (ggml_tensor * t : collect_evicted_tensors(model, calculated_layers)) {
                accumulated_size += ggml_nbytes(t);
            }
            if (accumulated_size >= target_eviction_size) {
                break;
            }
        }
        n_swap_layers = calculated_layers;
        LLAMA_LOG_INFO("%s: auto mode: need %.0f MB (Weights) + %.0f MB (Overhead) for mmproj; will evict %d layers (target eviction %.0f MB)\n",
            __func__, mmproj_host_size / 1e6, dynamic_overhead_bytes / 1e6, n_swap_layers, target_eviction_size / 1e6);
    }

    if (n_swap_layers <= 0) return nullptr;

    // Ownership is handed to the caller as a raw pointer (released by llama_mmproj_pool_free).
    auto * pool = new llama_mmproj_pool();
    pool->evicted_tensors = collect_evicted_tensors(model, n_swap_layers);

    if (pool->evicted_tensors.empty()) {
        delete pool;
        return nullptr;
    }

    // Pick the first non-CPU device so its host buffer type (pinned memory) can be used.
    ggml_backend_dev_t dev = nullptr;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t d = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(d) != GGML_BACKEND_DEVICE_TYPE_CPU) {
            dev = d;
            break;
        }
    }

    // 2. Host buffer layout. Evicted LLM tensors are staged tightly packed at the front.
    size_t evicted_total_bytes = 0;
    for (auto * t : pool->evicted_tensors) {
        pool->evicted_offsets.push_back(evicted_total_bytes);
        evicted_total_bytes += ggml_nbytes(t);
    }

    // The mmproj backup follows, starting on an `align` boundary so that the per-tensor
    // alignment applied below also holds relative to the start of the host buffer (the
    // mmproj tensors point into this region while the LLM is resident).
    const size_t mmproj_region_offset = (evicted_total_bytes + align - 1) / align * align;

    pool->host_buf_size = mmproj_region_offset + mmproj_host_size;
    ggml_backend_buffer_type_t host_buft = dev ? ggml_backend_dev_host_buffer_type(dev) : nullptr;
    if (!host_buft) host_buft = ggml_backend_cpu_buffer_type();

    pool->host_buf = ggml_backend_buft_alloc_buffer(host_buft, pool->host_buf_size);
    if (!pool->host_buf) {
        delete pool;
        return nullptr;
    }
    pool->host_ptr = ggml_backend_buffer_get_base(pool->host_buf);
    char * host_mm = (char *)pool->host_ptr + mmproj_region_offset;

    // 3. Bin-packing: every evicted LLM tensor is a bin whose capacity is its byte size;
    //    vision tensors are placed first-fit, largest first, at aligned offsets in a bin.
    struct Block {
        ggml_tensor * t;    // evicted LLM tensor providing the device memory
        size_t        used; // bytes of this tensor already handed out
        size_t        cap;  // total bytes of this tensor
    };
    std::vector<Block> blocks;
    blocks.reserve(pool->evicted_tensors.size());
    for (auto * t : pool->evicted_tensors) {
        blocks.push_back({t, 0, ggml_nbytes(t)});
    }
    std::sort(blocks.begin(), blocks.end(), [](const Block & a, const Block & b) { return a.cap > b.cap; });

    bool packing_failed = false;
    size_t current_host_offset = 0;

    for (ggml_tensor * vt : sorted_mmproj) {
        const size_t vsize = ggml_nbytes(vt);

        current_host_offset = (current_host_offset + align - 1) / align * align;
        char * host_data = host_mm + current_host_offset;

        if (vt->data) {
            ggml_backend_tensor_get(vt, host_data, 0, vsize); // back up the vision weights to host
        }
        current_host_offset += vsize;

        bool placed = false;
        for (auto & b : blocks) {
            const size_t offset = (b.used + align - 1) / align * align;
            if (offset + vsize <= b.cap) {
                b.used = offset + vsize;
                char * gpu_data = (char *)b.t->data + offset;
                pool->mappings.push_back({vt, gpu_data, b.t->buffer, host_data, vsize});
                placed = true;
                break;
            }
        }
        if (!placed) {
            packing_failed = true;
            break;
        }
    }

    if (packing_failed) {
        LLAMA_LOG_ERROR("%s: Fragmentation prevents packing mmproj tensors. Increase --mmproj-swap-layers.\n", __func__);
        // The vision tensors have not been redirected yet and still reference their original
        // storage. llama_mmproj_pool_free() nulls the data/buffer of every mapped tensor, so
        // drop the mappings first to leave the caller's tensors intact.
        pool->mappings.clear();
        llama_mmproj_pool_free(pool);
        return nullptr;
    }

    // 4. Redirect the vision tensors to their host copies; the pool is ready for use.
    for (const auto & m : pool->mappings) {
        m.vision_t->data = m.host_data;
        m.vision_t->buffer = pool->host_buf;
    }

    pool->state = llama_pool_state::LLM_RESIDENT;
    LLAMA_LOG_INFO("%s: pool ready | %zu evicted (%.0f MB) | packed %zu mmproj (%.0f MB) | host_buft: %s\n",
        __func__, pool->evicted_tensors.size(), evicted_total_bytes / 1e6,
        pool->mappings.size(), mmproj_host_size / 1e6, ggml_backend_buft_name(host_buft));
    return pool;
}




// Helper: Given the physical address of allocated gpu_data, deduce which evicted tensor (LLM layer) it maps to
static int find_evicted_idx(void * gpu_data, const std::vector<ggml_tensor*> & ev_tensors) {
for (size_t i = 0; i < ev_tensors.size(); ++i) {
char * base = (char *)ev_tensors[i]->data;
size_t size = ggml_nbytes(ev_tensors[i]);
// If the vision data falls within this evicted LLM tensor's address range
if ((char *)gpu_data >= base && (char *)gpu_data < base + size) {
return (int)i;
}
}
return -1;
}

bool llama_mmproj_pool_swap_in(struct llama_mmproj_pool * pool, struct llama_context * ctx) {
if (!pool) return false;
std::lock_guard<std::mutex> guard(pool->mutex);
if (pool->state == llama_pool_state::MMPROJ_RESIDENT) return true;
if (pool->state == llama_pool_state::DISABLED || pool->state == llama_pool_state::CORRUPTED) return false;

if (ctx) llama_synchronize(ctx);
double t0 = now_ms();
pool->state = llama_pool_state::SWAPPING_OUT;

char * host_llm = (char *)pool->host_ptr;

// 3. Use pipelining strategy to achieve PCIe full-duplex parallelism, completely preventing VRAM read/write pollution
// First group vision tensors by the evicted LLM tensor they occupy
std::vector<std::vector<llama_mmproj_pool::tensor_mapping>> grouped_mappings(pool->evicted_tensors.size());
for (const auto & m : pool->mappings) {
int idx = find_evicted_idx(m.gpu_data, pool->evicted_tensors);
if (idx >= 0) {
grouped_mappings[idx].push_back(m);
}
}

std::thread prev_load_thread;

for (size_t i = 0; i < pool->evicted_tensors.size(); ++i) {
// Step A: Read the LLM weights of the current layer back to host (Device-to-Host)
// This DMA copy is blocking in the main thread
ggml_backend_tensor_get(
pool->evicted_tensors[i],
host_llm + pool->evicted_offsets[i],
0,
ggml_nbytes(pool->evicted_tensors[i])
);

// Wait for the previous block's asynchronous write (H2D) to complete, preventing thread backlog
if (prev_load_thread.joinable()) {
prev_load_thread.join();
}

// Step B: Since the current layer (i-th) has been safely moved to host, its VRAM space can now be safely overwritten
// Launch a background thread to write the corresponding vision tensors to that VRAM (Host-to-Device)
// Key advantage: when the loop next executes D2H for layer i+1, it can run in full-duplex parallel with this H2D!
prev_load_thread = std::thread([pool, i, &grouped_mappings]() {
for (const auto & m : grouped_mappings[i]) {
m.vision_t->data = m.gpu_data;
m.vision_t->buffer = m.gpu_buffer;
ggml_backend_tensor_set(m.vision_t, m.host_data, 0, m.size); // Push to VRAM
}
});
}

// After the loop, ensure the final background write task has completed
if (prev_load_thread.joinable()) {
prev_load_thread.join();
}

pool->state = llama_pool_state::MMPROJ_RESIDENT;

if (ctx) llama_synchronize(ctx);
pool->total_swap_ms += (now_ms() - t0);
++pool->n_swaps;
return true;
}






// Restores the evicted LLM tensors to device memory after a vision step.
//
// Does nothing unless the pool is in MMPROJ_RESIDENT state. The vision tensors are not copied
// back: they are simply re-pointed at the host copies kept in the pool's host buffer. The LLM
// tensors are then re-uploaded from the host staging area.
//
// pool: pool created by llama_mmproj_pool_init; nullptr is ignored.
// ctx:  optional context; when non-null it is synchronized before and after the restore.
//
// Holds pool->mutex for the whole operation.
void llama_mmproj_pool_swap_back(struct llama_mmproj_pool * pool, struct llama_context * ctx) {
    if (pool == nullptr) {
        return;
    }

    std::lock_guard<std::mutex> lock(pool->mutex);
    if (pool->state != llama_pool_state::MMPROJ_RESIDENT) {
        return;
    }

    if (ctx != nullptr) {
        llama_synchronize(ctx);
    }
    pool->state = llama_pool_state::SWAPPING_IN;

    // Vision -> host: only the pointers change, the host copy is still intact.
    for (const auto & mapping : pool->mappings) {
        auto * vision = mapping.vision_t;
        vision->data   = mapping.host_data;
        vision->buffer = pool->host_buf;
    }

    // LLM -> device: write every staged tensor back to its original location.
    char * const staged_llm = (char *)pool->host_ptr;
    const size_t n_evicted = pool->evicted_tensors.size();
    for (size_t idx = 0; idx < n_evicted; ++idx) {
        auto * llm_tensor = pool->evicted_tensors[idx];
        ggml_backend_tensor_set(llm_tensor, staged_llm + pool->evicted_offsets[idx], 0, ggml_nbytes(llm_tensor));
    }

    if (ctx != nullptr) {
        llama_synchronize(ctx);
    }
    pool->state = llama_pool_state::LLM_RESIDENT;
}

// Destroys a pool created by llama_mmproj_pool_init.
//
// Every vision tensor recorded in the pool's mappings has its data and buffer fields cleared,
// since they reference memory managed by the pool. The host buffer is then released and the
// pool object deleted. Passing nullptr is a no-op.
//
// NOTE(review): the evicted LLM tensors are not restored here; presumably callers swap back
// before freeing - confirm against the call sites.
void llama_mmproj_pool_free(struct llama_mmproj_pool * pool) {
    if (pool == nullptr) {
        return;
    }

    for (const auto & mapping : pool->mappings) {
        auto * vision = mapping.vision_t;
        if (!vision) {
            continue;
        }
        vision->data   = nullptr;
        vision->buffer = nullptr;
    }

    if (pool->host_buf) {
        ggml_backend_buffer_free(pool->host_buf);
    }
    delete pool;
}

// Logs the pool's swap statistics at INFO level: the number of recorded swaps and the mean
// recorded swap time in milliseconds (0.0 when no swap has been recorded).
// Passing nullptr is a no-op.
void llama_mmproj_pool_log_stats(const struct llama_mmproj_pool * pool) {
    if (pool == nullptr) {
        return;
    }

    // Guard the division so a pool that never swapped reports an average of zero.
    const double avg_swap_ms = pool->n_swaps > 0 ? pool->total_swap_ms / pool->n_swaps : 0.0;

    LLAMA_LOG_INFO("mmproj pool stats: n_swaps=%lld, avg_swap_ms=%.1f\n",
        (long long)pool->n_swaps,
        avg_swap_ms);
}
Loading