
Commit fed737b

JohannesGaessler and ArberSephirotheca authored and committed
llama: fit ctx size for CPU only (ggml-org#21568)
1 parent 5055e3a commit fed737b

1 file changed: src/llama.cpp (99 additions, 58 deletions)
@@ -91,20 +91,24 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         throw std::runtime_error("failed to create llama_context from model");
     }
 
-    std::vector<llama_device_memory_data> ret(model->devices.size());
+    const size_t nd = model->n_devices();
+    std::vector<llama_device_memory_data> ret(nd + 1);
 
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
 
     for (const auto & [buft, mb] : memory_breakdown) {
         if (ggml_backend_buft_is_host(buft)) {
+            ret.back().mb.model   += mb.model;
+            ret.back().mb.context += mb.context;
+            ret.back().mb.compute += mb.compute;
             continue;
         }
 
         ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
         if (!dev) {
             continue;
         }
-        for (size_t i = 0; i < ret.size(); i++) {
+        for (size_t i = 0; i < nd; i++) {
             if (model->devices[i].dev == dev) {
                 ret[i].mb.model   += mb.model;
                 ret[i].mb.context += mb.context;
@@ -113,7 +117,19 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
             }
         }
     }
-    for (size_t i = 0; i < ret.size(); i++) {
+
+    {
+        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
+        size_t free;
+        size_t total;
+        ggml_backend_dev_memory(cpu_dev, &free, &total);
+        ret.back().free  = free;
+        ret.back().total = total;
+    }
+    for (size_t i = 0; i < nd; i++) {
         size_t free;
         size_t total;
         ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
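For reference, the host-memory probe added in this hunk uses the public ggml backend device API directly. A minimal standalone sketch of the same query, assuming a standard build in which the CPU backend is registered, with error handling reduced to a message:

#include <cstddef>
#include <cstdio>

#include "ggml-backend.h"

int main() {
    // look up the CPU backend device, as the new block in the diff does
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        fprintf(stderr, "no CPU backend found\n");
        return 1;
    }
    size_t free  = 0;
    size_t total = 0;
    ggml_backend_dev_memory(cpu_dev, &free, &total); // free/total host memory in bytes
    printf("host memory: %zu MiB free of %zu MiB total\n",
           free / (1024 * 1024), total / (1024 * 1024));
    return 0;
}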
@@ -122,11 +138,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         // have any to report. in this case, we will use the host memory as a fallback
         // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
         if (free == 0 && total == 0) {
-            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            if (cpu_dev == nullptr) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
-            }
-            ggml_backend_dev_memory(cpu_dev, &free, &total);
+            free  = ret.back().free;
+            total = ret.back().total;
         }
         ret[i].free  = free;
         ret[i].total = total;
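Taken together, these hunks change the layout of ret: slots 0..nd-1 hold per-device numbers and one trailing slot aggregates everything host-side, which the zero-report fallback then reuses. A simplified sketch of that indexing convention; the hypothetical mem_entry stands in for llama_device_memory_data and the literal sizes are made up:

#include <cstddef>
#include <vector>

struct mem_entry { // hypothetical stand-in for llama_device_memory_data
    size_t free  = 0;
    size_t total = 0;
};

int main() {
    const size_t nd = 2;                    // number of dedicated devices
    std::vector<mem_entry> ret(nd + 1);     // one extra slot at the back for host memory

    ret.back() = {8ull << 30, 16ull << 30}; // host bucket, always filled

    for (size_t i = 0; i < nd; i++) {       // note: iterate to nd, not ret.size()
        size_t free = 0, total = 0;         // in llama.cpp this comes from ggml_backend_dev_memory()
        if (free == 0 && total == 0) {      // device reported nothing usable:
            free  = ret.back().free;        // fall back to the host numbers
            total = ret.back().total;
        }
        ret[i] = {free, total};
    }
    return 0;
}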
@@ -180,15 +193,15 @@ static void llama_params_fit_impl(
     LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
     const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     const size_t nd = devs.size(); // number of devices
-    if (nd == 0) {
-        LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
-        return;
-    }
 
     std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
     margins.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        margins.push_back(margins_s[id]);
+    if (nd == 0) {
+        margins.push_back(margins_s[0]);
+    } else {
+        for (size_t id = 0; id < nd; id++) {
+            margins.push_back(margins_s[id]);
+        }
     }
 
     std::vector<std::string> dev_names;
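The same nd == 0 special case recurs throughout the function: with no dedicated devices there is exactly one margin, the host margin at margins_s[0]. A condensed sketch of the pattern as a hypothetical helper, with margins_s modeled as a vector that carries at least one entry in the CPU-only case, as the hunk implies:

#include <cstddef>
#include <cstdint>
#include <vector>

// build the working margin list from the user-supplied margins
std::vector<int64_t> build_margins(const std::vector<int64_t> & margins_s, size_t nd) {
    std::vector<int64_t> margins;
    margins.reserve(nd == 0 ? 1 : nd);
    if (nd == 0) {
        margins.push_back(margins_s[0]); // single host-memory margin
    } else {
        for (size_t id = 0; id < nd; id++) {
            margins.push_back(margins_s[id]); // one margin per device
        }
    }
    return margins;
}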
@@ -215,58 +228,75 @@ static void llama_params_fit_impl(
     std::vector<int64_t> projected_free_per_device;
     projected_free_per_device.reserve(nd);
 
-    if (nd > 1) {
-        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
-    }
-    for (size_t id = 0; id < nd; id++) {
-        const llama_device_memory_data & dmd = dmds_full[id];
-
-        const int64_t projected_used = dmd.mb.total();
-        const int64_t projected_free = dmd.free - projected_used;
-        projected_free_per_device.push_back(projected_free);
-
-        sum_free            += dmd.free;
-        sum_projected_used  += projected_used;
-        sum_projected_free  += projected_free;
-        sum_projected_model += dmd.mb.model;
-
-        if (nd > 1) {
-            LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
-        }
-    }
-    assert(sum_free >= 0 && sum_projected_used >= 0);
-    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-        __func__, sum_projected_used/MiB, sum_free/MiB);
-    if (nd == 1) {
-        if (projected_free_per_device[0] >= margins[0]) {
-            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+    if (nd == 0) {
+        sum_projected_used = dmds_full.back().mb.total();
+        sum_free           = dmds_full.back().total;
+        sum_projected_free = sum_free - sum_projected_used;
+        LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
+            __func__, sum_projected_used/MiB, sum_free/MiB);
+        if (sum_projected_free >= margins[0]) {
+            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
+                __func__, sum_projected_free/MiB, margins[0]/MiB);
             return;
         }
     } else {
-        bool changes_needed = false;
+        if (nd > 1) {
+            LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+        }
         for (size_t id = 0; id < nd; id++) {
-            if (projected_free_per_device[id] < margins[id]) {
-                changes_needed = true;
-                break;
+            const llama_device_memory_data & dmd = dmds_full[id];
+
+            const int64_t projected_used = dmd.mb.total();
+            const int64_t projected_free = dmd.free - projected_used;
+            projected_free_per_device.push_back(projected_free);
+
+            sum_free            += dmd.free;
+            sum_projected_used  += projected_used;
+            sum_projected_free  += projected_free;
+            sum_projected_model += dmd.mb.model;
+
+            if (nd > 1) {
+                LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
            }
        }
-        if (!changes_needed) {
-            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
-            return;
+        assert(sum_free >= 0 && sum_projected_used >= 0);
+        LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+            __func__, sum_projected_used/MiB, sum_free/MiB);
+        if (nd == 1) {
+            if (projected_free_per_device[0] >= margins[0]) {
+                LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+                return;
+            }
+        } else {
+            bool changes_needed = false;
+            for (size_t id = 0; id < nd; id++) {
+                if (projected_free_per_device[id] < margins[id]) {
+                    changes_needed = true;
+                    break;
+                }
+            }
+            if (!changes_needed) {
+                LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+                return;
+            }
         }
     }
 
     // step 2: try reducing memory use by reducing the context size
 
     {
         int64_t global_surplus = sum_projected_free;
-        for (size_t id = 0; id < nd; id++) {
-            global_surplus -= margins[id];
+        if (nd == 0) {
+            global_surplus -= margins[0];
+        } else {
+            for (size_t id = 0; id < nd; id++) {
+                global_surplus -= margins[id];
+            }
         }
         if (global_surplus < 0) {
-            if (nd == 1) {
+            if (nd <= 1) {
                 LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
                     __func__, margins[0]/MiB, -global_surplus/MiB);
             } else {
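Step 2's gate is a plain budget check: projected free memory minus every margin must stay non-negative, otherwise the deficit has to be recovered by shrinking the context. A worked sketch with illustrative numbers (the values are invented, not from the commit):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t MiB = 1024 * 1024;

    // hypothetical CPU-only run (nd == 0): one host margin, one projection
    const size_t nd = 0;
    const std::vector<int64_t> margins = {2048 * MiB}; // want 2 GiB of host memory left free
    const int64_t sum_projected_free   = 1024 * MiB;   // projected to leave only 1 GiB free

    int64_t global_surplus = sum_projected_free;
    if (nd == 0) {
        global_surplus -= margins[0];
    } else {
        for (size_t id = 0; id < nd; id++) {
            global_surplus -= margins[id];
        }
    }
    if (global_surplus < 0) {
        // 1024 - 2048 = -1024 MiB: need to free 1024 MiB, e.g. by reducing n_ctx
        printf("need to reduce memory use by %lld MiB\n", (long long) (-global_surplus / MiB));
    }
    return 0;
}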
@@ -277,8 +307,12 @@ static void llama_params_fit_impl(
         if (cparams->n_ctx == 0) {
             if (hp_nct > n_ctx_min) {
                 int64_t sum_used_target = sum_free;
-                for (size_t id = 0; id < nd; id++) {
-                    sum_used_target -= margins[id];
+                if (nd == 0) {
+                    sum_used_target -= margins[0];
+                } else {
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_used_target -= margins[id];
+                    }
                 }
                 if (nd > 1) {
                     // for multiple devices we need to be more conservative in terms of how much context we think can fit:
@@ -293,8 +327,12 @@ static void llama_params_fit_impl(
                 int64_t sum_projected_used_min_ctx = 0;
                 cparams->n_ctx = n_ctx_min;
                 const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-                for (const auto & dmd : dmds_min_ctx) {
-                    sum_projected_used_min_ctx += dmd.mb.total();
+                if (nd == 0) {
+                    sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
+                } else {
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
+                    }
                 }
                 if (sum_used_target > sum_projected_used_min_ctx) {
                     // linear interpolation between minimum and maximum context size:
@@ -306,7 +344,7 @@ static void llama_params_fit_impl(
                 const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
                 LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                     __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                if (nd == 1) {
+                if (nd <= 1) {
                     LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                     return;
                 }
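The context reduction itself relies on a linear memory model: usage is measured at n_ctx_min and at the full hp_nct, and the largest fitting n_ctx is interpolated between them, which is where bytes_per_ctx in the hunk above comes from. A hedged sketch of that arithmetic; variable names mirror the diff, but the exact formula in llama.cpp may differ:

#include <cstdint>

// pick the largest n_ctx whose projected memory use fits sum_used_target,
// assuming memory use grows linearly with context size between the two samples
uint32_t fit_n_ctx(uint32_t n_ctx_min, uint32_t hp_nct,
                   int64_t  used_min_ctx,      // projected use at n_ctx_min
                   int64_t  used_full_ctx,     // projected use at hp_nct
                   int64_t  sum_used_target) { // free memory minus margins
    const int64_t bytes_per_ctx = (used_full_ctx - used_min_ctx) / (int64_t) (hp_nct - n_ctx_min);
    if (bytes_per_ctx <= 0 || sum_used_target <= used_min_ctx) {
        return n_ctx_min; // no headroom beyond the minimum context
    }
    const int64_t extra_ctx = (sum_used_target - used_min_ctx) / bytes_per_ctx;
    if (extra_ctx >= (int64_t) (hp_nct - n_ctx_min)) {
        return hp_nct; // even the full context fits
    }
    return n_ctx_min + (uint32_t) extra_ctx;
}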
@@ -329,6 +367,9 @@ static void llama_params_fit_impl(
             }
         }
     }
+    if (nd == 0) {
+        throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
+    }
 
     if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
         throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
@@ -476,8 +517,8 @@ static void llama_params_fit_impl(
 
         std::vector<int64_t> ret;
         ret.reserve(nd);
-        for (const llama_device_memory_data & dmd : dmd_nl) {
-            ret.push_back(dmd.mb.total());
+        for (size_t id = 0; id < nd; id++) {
+            ret.push_back(dmd_nl[id].mb.total());
         }
         return ret;
     };
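The final hunk follows the same rule established earlier: now that the memory-data vector carries a trailing host slot, loops that want per-device sums must stop at nd rather than ranging over the whole container. A minimal sketch of why, as a hypothetical helper reduced to plain integers:

#include <cstddef>
#include <cstdint>
#include <vector>

// entries 0..nd-1 are devices, entries.back() is the host slot;
// a range-for over the whole vector would wrongly include the host entry
int64_t sum_device_use(const std::vector<int64_t> & entries, size_t nd) {
    int64_t sum = 0;
    for (size_t id = 0; id < nd; id++) { // deliberately skips entries.back()
        sum += entries[id];
    }
    return sum;
}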
