@@ -91,20 +91,24 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         throw std::runtime_error("failed to create llama_context from model");
     }

-    std::vector<llama_device_memory_data> ret(model->devices.size());
+    const size_t nd = model->n_devices();
+    std::vector<llama_device_memory_data> ret(nd + 1);

     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();

     for (const auto & [buft, mb] : memory_breakdown) {
         if (ggml_backend_buft_is_host(buft)) {
+            ret.back().mb.model   += mb.model;
+            ret.back().mb.context += mb.context;
+            ret.back().mb.compute += mb.compute;
             continue;
         }

         ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
         if (!dev) {
             continue;
         }
-        for (size_t i = 0; i < ret.size(); i++) {
+        for (size_t i = 0; i < nd; i++) {
             if (model->devices[i].dev == dev) {
                 ret[i].mb.model   += mb.model;
                 ret[i].mb.context += mb.context;
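The hunk above sizes the result vector with one extra trailing slot that aggregates all host (CPU) buffers. As a minimal sketch of the idea, with simplified stand-in types (`usage_sketch`, `slot_sketch`, and `accumulate` are illustrative names, not llama.cpp's): each buffer's usage is routed either to the slot of its owning device or, for host buffers, to `ret.back()`.

```cpp
// Minimal sketch, assuming simplified stand-ins for the llama.cpp types:
// nd per-device slots plus one trailing slot for host memory, as in ret(nd + 1).
#include <cstddef>
#include <cstdint>
#include <vector>

struct usage_sketch { int64_t model = 0, context = 0, compute = 0; };
struct slot_sketch  { usage_sketch mb; int64_t free = 0, total = 0; };

// Route a buffer's usage either to its device slot or to the trailing host slot.
void accumulate(std::vector<slot_sketch> & ret, size_t dev_idx, bool is_host, const usage_sketch & u) {
    slot_sketch & s = is_host ? ret.back() : ret[dev_idx];
    s.mb.model   += u.model;
    s.mb.context += u.context;
    s.mb.compute += u.compute;
}
```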
@@ -113,7 +117,19 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
             }
         }
     }
-    for (size_t i = 0; i < ret.size(); i++) {
+
+    {
+        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
+        size_t free;
+        size_t total;
+        ggml_backend_dev_memory(cpu_dev, &free, &total);
+        ret.back().free  = free;
+        ret.back().total = total;
+    }
+    for (size_t i = 0; i < nd; i++) {
         size_t free;
         size_t total;
         ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
@@ -122,11 +138,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         // have any to report. in this case, we will use the host memory as a fallback
         // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
         if (free == 0 && total == 0) {
-            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            if (cpu_dev == nullptr) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
-            }
-            ggml_backend_dev_memory(cpu_dev, &free, &total);
+            free  = ret.back().free;
+            total = ret.back().total;
         }
         ret[i].free  = free;
         ret[i].total = total;
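The net effect of the two hunks above: host memory is now queried exactly once through the CPU backend device, stored in `ret.back()`, and reused both as the host entry itself and as the fallback for devices that report `0/0`. A hedged sketch of the query, using only the public ggml-backend calls the diff already relies on (error handling reduced to a bool here):

```cpp
// Sketch: query host memory once via the CPU backend device; callers can reuse
// the result for devices whose backends report 0 free / 0 total.
#include "ggml-backend.h"

static bool get_host_memory(size_t * free, size_t * total) {
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        return false; // no CPU backend registered
    }
    ggml_backend_dev_memory(cpu_dev, free, total);
    return true;
}
```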
@@ -180,15 +193,15 @@ static void llama_params_fit_impl(
     LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
     const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     const size_t nd = devs.size(); // number of devices
-    if (nd == 0) {
-        LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
-        return;
-    }

     std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
     margins.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        margins.push_back(margins_s[id]);
+    if (nd == 0) {
+        margins.push_back(margins_s[0]);
+    } else {
+        for (size_t id = 0; id < nd; id++) {
+            margins.push_back(margins_s[id]);
+        }
     }

     std::vector<std::string> dev_names;
@@ -215,58 +228,75 @@ static void llama_params_fit_impl(
     std::vector<int64_t> projected_free_per_device;
     projected_free_per_device.reserve(nd);

-    if (nd > 1) {
-        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
-    }
-    for (size_t id = 0; id < nd; id++) {
-        const llama_device_memory_data & dmd = dmds_full[id];
-
-        const int64_t projected_used = dmd.mb.total();
-        const int64_t projected_free = dmd.free - projected_used;
-        projected_free_per_device.push_back(projected_free);
-
-        sum_free            += dmd.free;
-        sum_projected_used  += projected_used;
-        sum_projected_free  += projected_free;
-        sum_projected_model += dmd.mb.model;
-
-        if (nd > 1) {
-            LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
-        }
-    }
-    assert(sum_free >= 0 && sum_projected_used >= 0);
-    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-        __func__, sum_projected_used/MiB, sum_free/MiB);
-    if (nd == 1) {
-        if (projected_free_per_device[0] >= margins[0]) {
-            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+    if (nd == 0) {
+        sum_projected_used = dmds_full.back().mb.total();
+        sum_free           = dmds_full.back().total;
+        sum_projected_free = sum_free - sum_projected_used;
+        LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
+            __func__, sum_projected_used/MiB, sum_free/MiB);
+        if (sum_projected_free >= margins[0]) {
+            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
+                __func__, sum_projected_free/MiB, margins[0]/MiB);
             return;
         }
     } else {
-        bool changes_needed = false;
+        if (nd > 1) {
+            LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+        }
         for (size_t id = 0; id < nd; id++) {
-            if (projected_free_per_device[id] < margins[id]) {
-                changes_needed = true;
-                break;
+            const llama_device_memory_data & dmd = dmds_full[id];
+
+            const int64_t projected_used = dmd.mb.total();
+            const int64_t projected_free = dmd.free - projected_used;
+            projected_free_per_device.push_back(projected_free);
+
+            sum_free            += dmd.free;
+            sum_projected_used  += projected_used;
+            sum_projected_free  += projected_free;
+            sum_projected_model += dmd.mb.model;
+
+            if (nd > 1) {
+                LLAMA_LOG_INFO("%s:   - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
             }
         }
-        if (!changes_needed) {
-            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
-            return;
+        assert(sum_free >= 0 && sum_projected_used >= 0);
+        LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+            __func__, sum_projected_used/MiB, sum_free/MiB);
+        if (nd == 1) {
+            if (projected_free_per_device[0] >= margins[0]) {
+                LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+                return;
+            }
+        } else {
+            bool changes_needed = false;
+            for (size_t id = 0; id < nd; id++) {
+                if (projected_free_per_device[id] < margins[id]) {
+                    changes_needed = true;
+                    break;
+                }
+            }
+            if (!changes_needed) {
+                LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+                return;
+            }
         }
     }

     // step 2: try reducing memory use by reducing the context size

     {
         int64_t global_surplus = sum_projected_free;
-        for (size_t id = 0; id < nd; id++) {
-            global_surplus -= margins[id];
+        if (nd == 0) {
+            global_surplus -= margins[0];
+        } else {
+            for (size_t id = 0; id < nd; id++) {
+                global_surplus -= margins[id];
+            }
         }
         if (global_surplus < 0) {
-            if (nd == 1) {
+            if (nd <= 1) {
                 LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
                     __func__, margins[0]/MiB, -global_surplus/MiB);
             } else {
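The decision made here generalizes cleanly: whether the memory pools are GPUs or just host RAM, the model fits without changes iff the summed projected free memory still covers every margin, with `margins[0]` doubling as the host margin when `nd == 0`. A small sketch of that quantity (illustrative helper, not the PR's actual code):

```cpp
// Sketch: the model fits as-is iff this surplus is non-negative. With nd == 0
// there is a single pool (host memory) and a single margin, margins[0].
#include <cstddef>
#include <cstdint>
#include <vector>

int64_t global_surplus(const std::vector<int64_t> & projected_free,
                       const std::vector<int64_t> & margins) {
    int64_t surplus = 0;
    for (size_t i = 0; i < projected_free.size(); i++) {
        surplus += projected_free[i] - margins[i];
    }
    return surplus; // < 0 means memory use must be reduced
}
```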
@@ -277,8 +307,12 @@ static void llama_params_fit_impl(
         if (cparams->n_ctx == 0) {
             if (hp_nct > n_ctx_min) {
                 int64_t sum_used_target = sum_free;
-                for (size_t id = 0; id < nd; id++) {
-                    sum_used_target -= margins[id];
+                if (nd == 0) {
+                    sum_used_target -= margins[0];
+                } else {
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_used_target -= margins[id];
+                    }
                 }
                 if (nd > 1) {
                     // for multiple devices we need to be more conservative in terms of how much context we think can fit:
@@ -293,8 +327,12 @@ static void llama_params_fit_impl(
                 int64_t sum_projected_used_min_ctx = 0;
                 cparams->n_ctx = n_ctx_min;
                 const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-                for (const auto & dmd : dmds_min_ctx) {
-                    sum_projected_used_min_ctx += dmd.mb.total();
+                if (nd == 0) {
+                    sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
+                } else {
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
+                    }
                 }
                 if (sum_used_target > sum_projected_used_min_ctx) {
                     // linear interpolation between minimum and maximum context size:
@@ -306,7 +344,7 @@ static void llama_params_fit_impl(
                     const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
                     LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                         __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                    if (nd == 1) {
+                    if (nd <= 1) {
                         LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                         return;
                     }
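For reference, the "linear interpolation between minimum and maximum context size" mentioned in the surrounding code estimates a per-token memory cost from two measurements and solves for the largest context that fits the budget. A hedged sketch of that arithmetic (`fit_n_ctx` and its parameters are illustrative names, not the PR's helpers):

```cpp
// Sketch: estimate bytes per context token from projected use at n_ctx_min and
// at the full hp_nct, then pick the largest n_ctx whose projected use stays
// within the budget. Assumes hp_nct > n_ctx_min (checked by the caller above).
#include <algorithm>
#include <cstdint>

uint32_t fit_n_ctx(uint32_t n_ctx_min, uint32_t hp_nct,
                   int64_t used_min_ctx,   // projected use at n_ctx_min
                   int64_t used_full_ctx,  // projected use at hp_nct
                   int64_t used_target) {  // memory budget
    const int64_t bytes_per_ctx = (used_full_ctx - used_min_ctx) / (int64_t)(hp_nct - n_ctx_min);
    if (bytes_per_ctx <= 0) {
        return hp_nct; // context size is not what drives memory use
    }
    const int64_t n_ctx = (int64_t)n_ctx_min + (used_target - used_min_ctx) / bytes_per_ctx;
    return (uint32_t)std::clamp<int64_t>(n_ctx, n_ctx_min, hp_nct);
}
```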
@@ -329,6 +367,9 @@ static void llama_params_fit_impl(
             }
         }
     }
+    if (nd == 0) {
+        throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
+    }

     if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
         throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
@@ -476,8 +517,8 @@ static void llama_params_fit_impl(

         std::vector<int64_t> ret;
         ret.reserve(nd);
-        for (const llama_device_memory_data & dmd : dmd_nl) {
-            ret.push_back(dmd.mb.total());
+        for (size_t id = 0; id < nd; id++) {
+            ret.push_back(dmd_nl[id].mb.total());
         }
         return ret;
     };
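Taken together, the `nd == 0` additions give host-only setups the same step-1/step-2 treatment as a single GPU: project use against total host memory, shrink the context if that suffices, and otherwise abort before the layer-distribution logic (which requires devices, hence the loop above iterating over `nd` rather than all of `dmd_nl`) is reached. A condensed, purely illustrative sketch of that control flow, not the real function:

```cpp
// Illustrative control-flow sketch of the host-only (nd == 0) path.
#include <cstdint>
#include <stdexcept>

void fit_host_only(int64_t host_total, int64_t projected_used, int64_t margin,
                   bool context_reduction_sufficed) {
    if (host_total - projected_used >= margin) {
        return; // step 1: fits as-is, no changes needed
    }
    if (context_reduction_sufficed) {
        return; // step 2: a smaller n_ctx brought projected use under the target
    }
    // no devices to offload fewer layers to, so there is nothing left to try
    throw std::runtime_error("was unable to fit model into system memory by reducing context, abort");
}
```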