Skip to content

Commit e2b129e

Browse files
authored
mtmd: fit_params now take into account mmproj (ggml-org#21489)
* mtmd: fit_params now take into account mmproj
* rename alloc_compute_meta to reserve_compute_meta
* remove unused functions
* add ggml_backend_dev_t support
* add debug log
1 parent 7e50ef7 commit e2b129e

5 files changed

Lines changed: 183 additions & 80 deletions

File tree

tools/mtmd/clip.cpp

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,14 @@ struct clip_ctx {
162162

163163
bool debug_output_embeddings = false;
164164

165+
// for measuring memory usage
166+
bool no_alloc = false;
167+
std::map<ggml_backend_dev_t, size_t> mem_usage;
168+
std::map<ggml_backend_dev_t, size_t> mem_compute;
169+
165170
clip_ctx(clip_context_params & ctx_params) {
166171
flash_attn_type = ctx_params.flash_attn_type;
172+
no_alloc = ctx_params.no_alloc;
167173
backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
168174
if (!backend_cpu) {
169175
throw std::runtime_error("failed to initialize CPU backend");
@@ -1688,6 +1694,8 @@ struct clip_model_loader {
16881694
ggml_set_name(data_tensor, cur->name);
16891695
loaded_tensor_names.insert(name);
16901696
cur = data_tensor;
1697+
// add to weight memory counter
1698+
ctx_clip.mem_usage[ggml_backend_get_device(ctx_clip.backend)] += ggml_nbytes(cur);
16911699
}
16921700
return cur;
16931701
};
@@ -2602,7 +2610,7 @@ struct clip_model_loader {
26022610
}
26032611

26042612
// load data
2605-
{
2613+
if (!ctx_clip.no_alloc) {
26062614
std::vector<uint8_t> read_buf;
26072615

26082616
// alloc memory and offload data
@@ -2676,7 +2684,7 @@ struct clip_model_loader {
26762684
if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
26772685
// try to enable flash attention to see if it's supported
26782686
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
2679-
info = alloc_compute_meta(ctx_clip, batch);
2687+
info = reserve_compute_meta(ctx_clip, batch);
26802688
if (!info.fattn && info.fattn_op) {
26812689
auto op = info.fattn_op;
26822690
LOG_WRN("%s: *****************************************************************\n", __func__);
@@ -2695,10 +2703,10 @@ struct clip_model_loader {
26952703
LOG_WRN("%s: please report this on github as an issue\n", __func__);
26962704
LOG_WRN("%s: *****************************************************************\n", __func__);
26972705
ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
2698-
alloc_compute_meta(ctx_clip, batch);
2706+
reserve_compute_meta(ctx_clip, batch);
26992707
}
27002708
} else {
2701-
info = alloc_compute_meta(ctx_clip, batch);
2709+
info = reserve_compute_meta(ctx_clip, batch);
27022710
if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
27032711
LOG_WRN("%s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n", __func__);
27042712
}
@@ -2737,12 +2745,14 @@ struct clip_model_loader {
27372745
}
27382746
}
27392747

2740-
static support_info_graph alloc_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
2748+
// only initialize backend buffers, but do not allocate them yet
2749+
static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
27412750
ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
27422751

27432752
ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch);
27442753
ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
27452754

2755+
ctx_clip.mem_compute.clear();
27462756
for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
27472757
ggml_backend_t backend = ctx_clip.backend_ptrs[i];
27482758
ggml_backend_buffer_type_t buft = ctx_clip.backend_buft[i];
@@ -2752,6 +2762,7 @@ struct clip_model_loader {
27522762
ggml_backend_buft_name(buft),
27532763
size / 1024.0 / 1024.0);
27542764
}
2765+
ctx_clip.mem_compute[ggml_backend_get_device(backend)] += size;
27552766
}
27562767

27572768
const int n_splits = ggml_backend_sched_get_n_splits(ctx_clip.sched.get());
@@ -4266,22 +4277,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
42664277
}
42674278
}
42684279

4269-
int clip_is_minicpmv(const struct clip_ctx * ctx) {
4270-
// TODO: remove this function
4271-
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
4272-
return ctx->model.hparams.minicpmv_version;
4273-
}
4274-
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV4_6) {
4275-
return 46;
4276-
}
4277-
return 0;
4278-
}
4279-
4280-
bool clip_is_glm(const struct clip_ctx * ctx) {
4281-
// TODO: remove this function
4282-
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
4283-
}
4284-
42854280
bool clip_is_llava(const struct clip_ctx * ctx) {
42864281
return ctx->model.hparams.has_llava_projector;
42874282
}
@@ -4330,6 +4325,14 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
43304325
return &ctx->model.hparams;
43314326
}
43324327

4328+
// Report the memory accounted for by this clip context, keyed by backend device.
// The result starts as a copy of ctx->mem_usage (the weight memory counter filled
// while tensors are loaded) and then has each ctx->mem_compute entry (recorded by
// reserve_compute_meta) added to the entry for the same device.
// The map is returned by value; the counters stored in ctx are not modified.
std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx) {
4329+
std::map<ggml_backend_dev_t, size_t> result = ctx->mem_usage;
4330+
for (auto & [dev, size] : ctx->mem_compute) {
4331+
// operator[] creates a zero-initialized entry when a device appears only in mem_compute
result[dev] += size;
4332+
}
4333+
return result;
4334+
}
4335+
43334336
//
43344337
// API for debugging
43354338
//

tools/mtmd/clip.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
#include <stddef.h>
77
#include <stdint.h>
88

9+
#include <map>
10+
911
// !!! Internal header, to be used by mtmd only !!!
1012

1113
#define MTMD_INTERNAL_HEADER
@@ -40,6 +42,7 @@ struct clip_context_params {
4042
bool warmup;
4143
ggml_backend_sched_eval_callback cb_eval;
4244
void * cb_eval_user_data;
45+
bool no_alloc;
4346
};
4447

4548
struct clip_init_result {
@@ -102,8 +105,6 @@ struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
102105
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
103106
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
104107

105-
int clip_is_minicpmv(const struct clip_ctx * ctx);
106-
bool clip_is_glm(const struct clip_ctx * ctx);
107108
bool clip_is_llava(const struct clip_ctx * ctx);
108109
// note for contributor: this clip_is_(model) pattern is deprecated
109110
// do NOT add new functions like this
@@ -116,6 +117,8 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
116117
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
117118
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
118119

120+
std::map<ggml_backend_dev_t, size_t> clip_get_mem_usage(const struct clip_ctx * ctx);
121+
119122
struct clip_cap {
120123
bool has_vision;
121124
bool has_audio;

0 commit comments

Comments
 (0)