@@ -162,8 +162,14 @@ struct clip_ctx {
162162
163163 bool debug_output_embeddings = false ;
164164
165+ // for measuring memory usage
166+ bool no_alloc = false ;
167+ std::map<ggml_backend_dev_t , size_t > mem_usage;
168+ std::map<ggml_backend_dev_t , size_t > mem_compute;
169+
165170 clip_ctx (clip_context_params & ctx_params) {
166171 flash_attn_type = ctx_params.flash_attn_type ;
172+ no_alloc = ctx_params.no_alloc ;
167173 backend_cpu = ggml_backend_init_by_type (GGML_BACKEND_DEVICE_TYPE_CPU, nullptr );
168174 if (!backend_cpu) {
169175 throw std::runtime_error (" failed to initialize CPU backend" );
@@ -1688,6 +1694,8 @@ struct clip_model_loader {
16881694 ggml_set_name (data_tensor, cur->name );
16891695 loaded_tensor_names.insert (name);
16901696 cur = data_tensor;
1697+ // add to weight memory counter
1698+ ctx_clip.mem_usage [ggml_backend_get_device (ctx_clip.backend )] += ggml_nbytes (cur);
16911699 }
16921700 return cur;
16931701 };
@@ -2602,7 +2610,7 @@ struct clip_model_loader {
26022610 }
26032611
26042612 // load data
2605- {
2613+ if (!ctx_clip. no_alloc ) {
26062614 std::vector<uint8_t > read_buf;
26072615
26082616 // alloc memory and offload data
@@ -2676,7 +2684,7 @@ struct clip_model_loader {
26762684 if (ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_AUTO) {
26772685 // try to enable flash attention to see if it's supported
26782686 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_ENABLED;
2679- info = alloc_compute_meta (ctx_clip, batch);
2687+ info = reserve_compute_meta (ctx_clip, batch);
26802688 if (!info.fattn && info.fattn_op ) {
26812689 auto op = info.fattn_op ;
26822690 LOG_WRN (" %s: *****************************************************************\n " , __func__);
@@ -2695,10 +2703,10 @@ struct clip_model_loader {
26952703 LOG_WRN (" %s: please report this on github as an issue\n " , __func__);
26962704 LOG_WRN (" %s: *****************************************************************\n " , __func__);
26972705 ctx_clip.flash_attn_type = CLIP_FLASH_ATTN_TYPE_DISABLED;
2698- alloc_compute_meta (ctx_clip, batch);
2706+ reserve_compute_meta (ctx_clip, batch);
26992707 }
27002708 } else {
2701- info = alloc_compute_meta (ctx_clip, batch);
2709+ info = reserve_compute_meta (ctx_clip, batch);
27022710 if (!info.fattn && ctx_clip.flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
27032711 LOG_WRN (" %s: flash attention is not supported by the current backend; falling back to CPU (performance will be degraded)\n " , __func__);
27042712 }
@@ -2737,12 +2745,14 @@ struct clip_model_loader {
27372745 }
27382746 }
27392747
2740- static support_info_graph alloc_compute_meta (clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
2748+ // only initialize backend buffers, but do not allocate them yet
2749+ static support_info_graph reserve_compute_meta (clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
27412750 ctx_clip.buf_compute_meta .resize (ctx_clip.max_nodes * ggml_tensor_overhead () + ggml_graph_overhead ());
27422751
27432752 ggml_cgraph * gf = clip_image_build_graph (&ctx_clip, batch);
27442753 ggml_backend_sched_reserve (ctx_clip.sched .get (), gf);
27452754
2755+ ctx_clip.mem_compute .clear ();
27462756 for (size_t i = 0 ; i < ctx_clip.backend_ptrs .size (); ++i) {
27472757 ggml_backend_t backend = ctx_clip.backend_ptrs [i];
27482758 ggml_backend_buffer_type_t buft = ctx_clip.backend_buft [i];
@@ -2752,6 +2762,7 @@ struct clip_model_loader {
27522762 ggml_backend_buft_name (buft),
27532763 size / 1024.0 / 1024.0 );
27542764 }
2765+ ctx_clip.mem_compute [ggml_backend_get_device (backend)] += size;
27552766 }
27562767
27572768 const int n_splits = ggml_backend_sched_get_n_splits (ctx_clip.sched .get ());
@@ -4266,22 +4277,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
42664277 }
42674278}
42684279
4269- int clip_is_minicpmv (const struct clip_ctx * ctx) {
4270- // TODO: remove this function
4271- if (ctx->proj_type () == PROJECTOR_TYPE_MINICPMV) {
4272- return ctx->model .hparams .minicpmv_version ;
4273- }
4274- if (ctx->proj_type () == PROJECTOR_TYPE_MINICPMV4_6) {
4275- return 46 ;
4276- }
4277- return 0 ;
4278- }
4279-
4280- bool clip_is_glm (const struct clip_ctx * ctx) {
4281- // TODO: remove this function
4282- return ctx->proj_type () == PROJECTOR_TYPE_GLM_EDGE;
4283- }
4284-
42854280bool clip_is_llava (const struct clip_ctx * ctx) {
42864281 return ctx->model .hparams .has_llava_projector ;
42874282}
@@ -4330,6 +4325,14 @@ const clip_hparams * clip_get_hparams(const struct clip_ctx * ctx) {
43304325 return &ctx->model .hparams ;
43314326}
43324327
4328+ std::map<ggml_backend_dev_t , size_t > clip_get_mem_usage (const struct clip_ctx * ctx) {
4329+ std::map<ggml_backend_dev_t , size_t > result = ctx->mem_usage ;
4330+ for (auto & [dev, size] : ctx->mem_compute ) {
4331+ result[dev] += size;
4332+ }
4333+ return result;
4334+ }
4335+
43334336//
43344337// API for debugging
43354338//
0 commit comments