
Commit fed737b

JohannesGaessler and ArberSephirotheca authored and committed
llama: fit ctx size for CPU only (ggml-org#21568)
1 parent 5055e3a commit fed737b

1 file changed: src/llama.cpp (99 additions, 58 deletions)
@@ -91,20 +91,24 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         throw std::runtime_error("failed to create llama_context from model");
     }
 
-    std::vector<llama_device_memory_data> ret(model->devices.size());
+    const size_t nd = model->n_devices();
+    std::vector<llama_device_memory_data> ret(nd + 1);
 
     std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
 
     for (const auto & [buft, mb] : memory_breakdown) {
         if (ggml_backend_buft_is_host(buft)) {
+            ret.back().mb.model   += mb.model;
+            ret.back().mb.context += mb.context;
+            ret.back().mb.compute += mb.compute;
             continue;
         }
 
         ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
         if (!dev) {
             continue;
         }
-        for (size_t i = 0; i < ret.size(); i++) {
+        for (size_t i = 0; i < nd; i++) {
             if (model->devices[i].dev == dev) {
                 ret[i].mb.model   += mb.model;
                 ret[i].mb.context += mb.context;
@@ -113,7 +117,19 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
             }
         }
     }
-    for (size_t i = 0; i < ret.size(); i++) {
+
+    {
+        ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
+        size_t free;
+        size_t total;
+        ggml_backend_dev_memory(cpu_dev, &free, &total);
+        ret.back().free  = free;
+        ret.back().total = total;
+    }
+    for (size_t i = 0; i < nd; i++) {
         size_t free;
         size_t total;
         ggml_backend_dev_memory(model->devices[i].dev, &free, &total);
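For reference, the host-memory probe added in this hunk uses the public ggml backend device API directly. A minimal standalone sketch of the same query, assuming a standard build in which the CPU backend is registered, with error handling reduced to a message:

#include <cstddef>
#include <cstdio>

#include "ggml-backend.h"

int main() {
    // look up the CPU backend device, as the new block in the diff does
    ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (cpu_dev == nullptr) {
        fprintf(stderr, "no CPU backend found\n");
        return 1;
    }
    size_t free  = 0;
    size_t total = 0;
    ggml_backend_dev_memory(cpu_dev, &free, &total); // free/total host memory in bytes
    printf("host memory: %zu MiB free of %zu MiB total\n",
           free / (1024 * 1024), total / (1024 * 1024));
    return 0;
}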
@@ -122,11 +138,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
         // have any to report. in this case, we will use the host memory as a fallback
         // fixes: https://github.com/ggml-org/llama.cpp/issues/18577
         if (free == 0 && total == 0) {
-            ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-            if (cpu_dev == nullptr) {
-                throw std::runtime_error(format("%s: no CPU backend found", __func__));
-            }
-            ggml_backend_dev_memory(cpu_dev, &free, &total);
+            free  = ret.back().free;
+            total = ret.back().total;
         }
         ret[i].free  = free;
         ret[i].total = total;
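Taken together, these hunks change the layout of ret: slots 0..nd-1 hold per-device numbers and one trailing slot aggregates everything host-side, which the zero-report fallback then reuses. A simplified sketch of that indexing convention; the hypothetical mem_entry stands in for llama_device_memory_data and the literal sizes are made up:

#include <cstddef>
#include <vector>

struct mem_entry { // hypothetical stand-in for llama_device_memory_data
    size_t free  = 0;
    size_t total = 0;
};

int main() {
    const size_t nd = 2;                    // number of dedicated devices
    std::vector<mem_entry> ret(nd + 1);     // one extra slot at the back for host memory

    ret.back() = {8ull << 30, 16ull << 30}; // host bucket, always filled

    for (size_t i = 0; i < nd; i++) {       // note: iterate to nd, not ret.size()
        size_t free = 0, total = 0;         // in llama.cpp this comes from ggml_backend_dev_memory()
        if (free == 0 && total == 0) {      // device reported nothing usable:
            free  = ret.back().free;        // fall back to the host numbers
            total = ret.back().total;
        }
        ret[i] = {free, total};
    }
    return 0;
}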
@@ -180,15 +193,15 @@ static void llama_params_fit_impl(
     LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
     const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     const size_t nd = devs.size(); // number of devices
-    if (nd == 0) {
-        LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__);
-        return;
-    }
 
     std::vector<int64_t> margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits
     margins.reserve(nd);
-    for (size_t id = 0; id < nd; id++) {
-        margins.push_back(margins_s[id]);
+    if (nd == 0) {
+        margins.push_back(margins_s[0]);
+    } else {
+        for (size_t id = 0; id < nd; id++) {
+            margins.push_back(margins_s[id]);
+        }
     }
 
     std::vector<std::string> dev_names;
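The same nd == 0 special case recurs throughout the function: with no dedicated devices there is exactly one margin, the host margin at margins_s[0]. A condensed sketch of the pattern as a hypothetical helper, with margins_s modeled as a vector that carries at least one entry in the CPU-only case, as the hunk implies:

#include <cstddef>
#include <cstdint>
#include <vector>

// build the working margin list from the user-supplied margins
std::vector<int64_t> build_margins(const std::vector<int64_t> & margins_s, size_t nd) {
    std::vector<int64_t> margins;
    margins.reserve(nd == 0 ? 1 : nd);
    if (nd == 0) {
        margins.push_back(margins_s[0]); // single host-memory margin
    } else {
        for (size_t id = 0; id < nd; id++) {
            margins.push_back(margins_s[id]); // one margin per device
        }
    }
    return margins;
}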
@@ -215,58 +228,75 @@ static void llama_params_fit_impl(
     std::vector<int64_t> projected_free_per_device;
     projected_free_per_device.reserve(nd);
 
-    if (nd > 1) {
-        LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
-    }
-    for (size_t id = 0; id < nd; id++) {
-        const llama_device_memory_data & dmd = dmds_full[id];
-
-        const int64_t projected_used = dmd.mb.total();
-        const int64_t projected_free = dmd.free - projected_used;
-        projected_free_per_device.push_back(projected_free);
-
-        sum_free            += dmd.free;
-        sum_projected_used  += projected_used;
-        sum_projected_free  += projected_free;
-        sum_projected_model += dmd.mb.model;
-
-        if (nd > 1) {
-            LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
-                __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
-        }
-    }
-    assert(sum_free >= 0 && sum_projected_used >= 0);
-    LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
-        __func__, sum_projected_used/MiB, sum_free/MiB);
-    if (nd == 1) {
-        if (projected_free_per_device[0] >= margins[0]) {
-            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
-                __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+    if (nd == 0) {
+        sum_projected_used = dmds_full.back().mb.total();
+        sum_free           = dmds_full.back().total;
+        sum_projected_free = sum_free - sum_projected_used;
+        LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of host memory vs. %" PRId64 " MiB of total host memory\n",
+            __func__, sum_projected_used/MiB, sum_free/MiB);
+        if (sum_projected_free >= margins[0]) {
+            LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of system memory, no changes needed\n",
+                __func__, sum_projected_free/MiB, margins[0]/MiB);
             return;
         }
     } else {
-        bool changes_needed = false;
+        if (nd > 1) {
+            LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__);
+        }
         for (size_t id = 0; id < nd; id++) {
-            if (projected_free_per_device[id] < margins[id]) {
-                changes_needed = true;
-                break;
+            const llama_device_memory_data & dmd = dmds_full[id];
+
+            const int64_t projected_used = dmd.mb.total();
+            const int64_t projected_free = dmd.free - projected_used;
+            projected_free_per_device.push_back(projected_free);
+
+            sum_free            += dmd.free;
+            sum_projected_used  += projected_used;
+            sum_projected_free  += projected_free;
+            sum_projected_model += dmd.mb.model;
+
+            if (nd > 1) {
+                LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " free vs. target of %6" PRId64 "\n",
+                    __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, projected_free/MiB, margins[id]/MiB);
            }
        }
-        if (!changes_needed) {
-            LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
-            return;
+        assert(sum_free >= 0 && sum_projected_used >= 0);
+        LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
+            __func__, sum_projected_used/MiB, sum_free/MiB);
+        if (nd == 1) {
+            if (projected_free_per_device[0] >= margins[0]) {
+                LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
+                    __func__, projected_free_per_device[0]/MiB, margins[0]/MiB);
+                return;
+            }
+        } else {
+            bool changes_needed = false;
+            for (size_t id = 0; id < nd; id++) {
+                if (projected_free_per_device[id] < margins[id]) {
+                    changes_needed = true;
+                    break;
+                }
+            }
+            if (!changes_needed) {
+                LLAMA_LOG_INFO("%s: targets for free memory can be met on all devices, no changes needed\n", __func__);
+                return;
+            }
         }
     }
 
     // step 2: try reducing memory use by reducing the context size
 
     {
         int64_t global_surplus = sum_projected_free;
-        for (size_t id = 0; id < nd; id++) {
-            global_surplus -= margins[id];
+        if (nd == 0) {
+            global_surplus -= margins[0];
+        } else {
+            for (size_t id = 0; id < nd; id++) {
+                global_surplus -= margins[id];
+            }
         }
         if (global_surplus < 0) {
-            if (nd == 1) {
+            if (nd <= 1) {
                 LLAMA_LOG_INFO("%s: cannot meet free memory target of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n",
                     __func__, margins[0]/MiB, -global_surplus/MiB);
             } else {
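Step 2's gate is a plain budget check: projected free memory minus every margin must stay non-negative, otherwise the deficit has to be recovered by shrinking the context. A worked sketch with illustrative numbers (the values are invented, not from the commit):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int64_t MiB = 1024 * 1024;

    // hypothetical CPU-only run (nd == 0): one host margin, one projection
    const size_t nd = 0;
    const std::vector<int64_t> margins = {2048 * MiB}; // want 2 GiB of host memory left free
    const int64_t sum_projected_free   = 1024 * MiB;   // projected to leave only 1 GiB free

    int64_t global_surplus = sum_projected_free;
    if (nd == 0) {
        global_surplus -= margins[0];
    } else {
        for (size_t id = 0; id < nd; id++) {
            global_surplus -= margins[id];
        }
    }
    if (global_surplus < 0) {
        // 1024 - 2048 = -1024 MiB: need to free 1024 MiB, e.g. by reducing n_ctx
        printf("need to reduce memory use by %lld MiB\n", (long long) (-global_surplus / MiB));
    }
    return 0;
}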
@@ -277,8 +307,12 @@ static void llama_params_fit_impl(
         if (cparams->n_ctx == 0) {
             if (hp_nct > n_ctx_min) {
                 int64_t sum_used_target = sum_free;
-                for (size_t id = 0; id < nd; id++) {
-                    sum_used_target -= margins[id];
+                if (nd == 0) {
+                    sum_used_target -= margins[0];
+                } else {
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_used_target -= margins[id];
+                    }
                 }
                 if (nd > 1) {
                     // for multiple devices we need to be more conservative in terms of how much context we think can fit:
@@ -293,8 +327,12 @@ static void llama_params_fit_impl(
                 int64_t sum_projected_used_min_ctx = 0;
                 cparams->n_ctx = n_ctx_min;
                 const dmds_t dmds_min_ctx = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
-                for (const auto & dmd : dmds_min_ctx) {
-                    sum_projected_used_min_ctx += dmd.mb.total();
+                if (nd == 0) {
+                    sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total();
+                } else {
+                    for (size_t id = 0; id < nd; id++) {
+                        sum_projected_used_min_ctx += dmds_min_ctx[id].mb.total();
+                    }
                 }
                 if (sum_used_target > sum_projected_used_min_ctx) {
                     // linear interpolation between minimum and maximum context size:
@@ -306,7 +344,7 @@ static void llama_params_fit_impl(
                 const int64_t memory_reduction = (hp_nct - cparams->n_ctx) * bytes_per_ctx;
                 LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n",
                     __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
-                if (nd == 1) {
+                if (nd <= 1) {
                     LLAMA_LOG_INFO("%s: entire model can be fit by reducing context\n", __func__);
                     return;
                 }
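The context reduction itself relies on a linear memory model: usage is measured at n_ctx_min and at the full hp_nct, and the largest fitting n_ctx is interpolated between them, which is where bytes_per_ctx in the hunk above comes from. A hedged sketch of that arithmetic; variable names mirror the diff, but the exact formula in llama.cpp may differ:

#include <cstdint>

// pick the largest n_ctx whose projected memory use fits sum_used_target,
// assuming memory use grows linearly with context size between the two samples
uint32_t fit_n_ctx(uint32_t n_ctx_min, uint32_t hp_nct,
                   int64_t  used_min_ctx,      // projected use at n_ctx_min
                   int64_t  used_full_ctx,     // projected use at hp_nct
                   int64_t  sum_used_target) { // free memory minus margins
    const int64_t bytes_per_ctx = (used_full_ctx - used_min_ctx) / (int64_t) (hp_nct - n_ctx_min);
    if (bytes_per_ctx <= 0 || sum_used_target <= used_min_ctx) {
        return n_ctx_min; // no headroom beyond the minimum context
    }
    const int64_t extra_ctx = (sum_used_target - used_min_ctx) / bytes_per_ctx;
    if (extra_ctx >= (int64_t) (hp_nct - n_ctx_min)) {
        return hp_nct; // even the full context fits
    }
    return n_ctx_min + (uint32_t) extra_ctx;
}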
@@ -329,6 +367,9 @@ static void llama_params_fit_impl(
             }
         }
     }
+    if (nd == 0) {
+        throw llama_params_fit_exception("was unable to fit model into system memory by reducing context, abort");
+    }
 
     if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) {
         throw llama_params_fit_exception("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort");
@@ -476,8 +517,8 @@ static void llama_params_fit_impl(
 
         std::vector<int64_t> ret;
         ret.reserve(nd);
-        for (const llama_device_memory_data & dmd : dmd_nl) {
-            ret.push_back(dmd.mb.total());
+        for (size_t id = 0; id < nd; id++) {
+            ret.push_back(dmd_nl[id].mb.total());
         }
         return ret;
     };
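The final hunk follows the same rule established earlier: now that the memory-data vector carries a trailing host slot, loops that want per-device sums must stop at nd rather than ranging over the whole container. A minimal sketch of why, as a hypothetical helper reduced to plain integers:

#include <cstddef>
#include <cstdint>
#include <vector>

// entries 0..nd-1 are devices, entries.back() is the host slot;
// a range-for over the whole vector would wrongly include the host entry
int64_t sum_device_use(const std::vector<int64_t> & entries, size_t nd) {
    int64_t sum = 0;
    for (size_t id = 0; id < nd; id++) { // deliberately skips entries.back()
        sum += entries[id];
    }
    return sum;
}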
