@@ -171,6 +171,8 @@ struct clip_ctx {
171171 std::map<ggml_backend_dev_t , size_t > mem_usage;
172172 std::map<ggml_backend_dev_t , size_t > mem_compute;
173173
174+ bool support_batch = false ;
175+
174176 clip_ctx (clip_context_params & ctx_params) {
175177 flash_attn_type = ctx_params.flash_attn_type ;
176178 no_alloc = ctx_params.no_alloc ;
@@ -314,7 +316,7 @@ ggml_tensor * clip_graph::build_vit(
314316 std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos,
315317 const build_vit_opts & opts
316318 ) {
317- // batch dim: inp is [n_embd, n_pos] (B==1) or [n_embd, n_pos, B] (multi-tile encode)
319+ // batch dim: inp is [n_embd, n_pos, B]
318320 const int64_t B = inp->ne [2 ];
319321
320322 if (learned_pos_embd) {
@@ -862,7 +864,7 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale
862864 return cur;
863865}
864866
865- static ggml_cgraph * clip_image_build_graph (clip_ctx * ctx, const clip_image_f32_batch & imgs) {
867+ static std::unique_ptr<clip_graph> clip_get_graph_builder (clip_ctx * ctx, const clip_image_f32_batch & imgs) {
866868 const clip_image_f32 & img = *imgs.entries [0 ];
867869 std::unique_ptr<clip_graph> builder;
868870
@@ -1025,7 +1027,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
10251027 // TODO [QWEN_VIDEO]: improve this in the future
10261028 builder->n_batch = imgs.entries .size ();
10271029
1028- return builder-> build () ;
1030+ return builder;
10291031}
10301032
10311033//
@@ -2819,7 +2821,7 @@ struct clip_model_loader {
28192821 std::vector<support_info_op> ops;
28202822 };
28212823
2822- static void warmup (clip_ctx & ctx_clip) {
2824+ static clip_image_f32_batch get_dummy_batch (clip_ctx & ctx_clip) {
28232825 // create a fake batch
28242826 const auto & hparams = ctx_clip.model .hparams ;
28252827 clip_image_f32_batch batch;
@@ -2833,6 +2835,20 @@ struct clip_model_loader {
28332835 LOG_INF (" %s: warmup with audio size = %d\n " , __func__, hparams.warmup_audio_size );
28342836 }
28352837 batch.entries .push_back (std::move (img));
2838+ return batch;
2839+ }
2840+
2841+ static void init_ctx (clip_ctx & ctx_clip) { // context setup run after tensor loading: sizes the graph metadata buffer and records whether batching is supported
2842+ ctx_clip.buf_compute_meta .resize (ctx_clip.max_nodes * ggml_tensor_overhead () + ggml_graph_overhead ()); // room for max_nodes tensor overheads plus one graph overhead; NOTE(review): presumably must be sized before the graph builder below is created - confirm
2843+
2844+ // check batching support: create a graph builder for a dummy batch and cache its support_batch() answer on the context
2845+ auto batch = get_dummy_batch (ctx_clip);
2846+ auto builder = clip_get_graph_builder (&ctx_clip, batch);
2847+ ctx_clip.support_batch = builder->support_batch (); // only the builder is queried here; build() is not called in this function
2848+ }
2849+
2850+ static void warmup (clip_ctx & ctx_clip) { // convenience overload: warms up the context using the batch produced by get_dummy_batch
2851+ auto batch = get_dummy_batch (ctx_clip); // same dummy-batch helper that init_ctx uses for its batching probe
28362852 warmup (ctx_clip, batch);
28372853 }
28382854
@@ -2905,9 +2921,7 @@ struct clip_model_loader {
29052921
29062922 // only initialize backend buffers, but do not allocate them yet
29072923 static support_info_graph reserve_compute_meta (clip_ctx & ctx_clip, const clip_image_f32_batch & batch) {
2908- ctx_clip.buf_compute_meta .resize (ctx_clip.max_nodes * ggml_tensor_overhead () + ggml_graph_overhead ());
2909-
2910- ggml_cgraph * gf = clip_image_build_graph (&ctx_clip, batch);
2924+ ggml_cgraph * gf = clip_get_graph_builder (&ctx_clip, batch)->build ();
29112925 ggml_backend_sched_reserve (ctx_clip.sched .get (), gf);
29122926
29132927 ctx_clip.mem_compute .clear ();
@@ -3070,6 +3084,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
30703084 ctx_vision = new clip_ctx (ctx_params);
30713085 loader.load_hparams (ctx_vision->model , CLIP_MODALITY_VISION );
30723086 loader.load_tensors (*ctx_vision);
3087+ loader.init_ctx (*ctx_vision);
30733088 if (ctx_params.warmup ) {
30743089 loader.warmup (*ctx_vision);
30753090 }
@@ -3083,6 +3098,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params
30833098 ctx_audio = new clip_ctx (ctx_params);
30843099 loader.load_hparams (ctx_audio->model , CLIP_MODALITY_AUDIO );
30853100 loader.load_tensors (*ctx_audio);
3101+ loader.init_ctx (*ctx_audio);
30863102 if (ctx_params.warmup ) {
30873103 loader.warmup (*ctx_audio);
30883104 }
@@ -3484,25 +3500,22 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
34843500 return n_patches;
34853501}
34863502
3487- bool clip_image_encode (struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec ) {
3503+ bool clip_image_encode (struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, std::vector< float > & out_vec ) { // single-image entry point: wraps a copy of *img in a one-entry batch and forwards to clip_image_batch_encode
34883504 clip_image_f32_batch imgs;
34893505 clip_image_f32_ptr img_copy (clip_image_f32_init ());
34903506 *img_copy = *img;
34913507 imgs.entries .push_back (std::move (img_copy));
34923508
3493- return clip_image_batch_encode (ctx, n_threads, &imgs, vec );
3509+ return clip_image_batch_encode (ctx, n_threads, &imgs, out_vec ); // out_vec is passed through untouched; the return value is whatever the batch encoder reports
34943510}
34953511
3496- bool clip_image_batch_encode (clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec ) {
3512+ bool clip_image_batch_encode (clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, std::vector< float > & out_batch_embd ) {
34973513 const clip_image_f32_batch & imgs = *imgs_c_ptr;
34983514 int n_batch_cur = imgs.entries .size ();
34993515
3500- // maximum supported batch size, usually == 2 for qwen-vl-based models
3501- int n_batch_max = clip_model_n_batch_max (ctx);
3502-
3503- // TODO @ngxson : implement batch size > 1 as a loop
3504- // we don't need true batching support because the cgraph will gonna be big anyway
3505- if (n_batch_cur > n_batch_max) {
3516+ // [QWEN_VIDEO] for video models, the batch dimension is used as temporal dimension for merged frames
3517+ if (!ctx->support_batch && n_batch_cur > clip_model_n_temporal_merge (ctx)) {
3518+ LOG_ERR (" %s: batch size %d exceeds maximum supported batch/temporal-merge size %d\n " , __func__, n_batch_cur, clip_model_n_temporal_merge (ctx));
35063519 return false ;
35073520 }
35083521
@@ -3513,7 +3526,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
35133526
35143527 // build the inference graph
35153528 ggml_backend_sched_reset (ctx->sched .get ());
3516- ggml_cgraph * gf = clip_image_build_graph (ctx, imgs);
3529+ ggml_cgraph * gf = clip_get_graph_builder (ctx, imgs)-> build ( );
35173530 ggml_backend_sched_alloc_graph (ctx->sched .get (), gf);
35183531
35193532 // set inputs
@@ -3582,6 +3595,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
35823595 const int n = nx * ny;
35833596
35843597 for (int b = 0 ; b < n_batch_cur; b++) {
3598+ LOG_DBG (" %s: copying image %d/%d to input buffer (nx=%d, ny=%d)\n " , __func__, b+1 , n_batch_cur, nx, ny);
35853599 const auto & buf = imgs.entries [b]->get_ro_buf ();
35863600 float * batch_entry = inp_raw.data () + b * (3 *n);
35873601 for (int y = 0 ; y < ny; y++) {
@@ -4416,24 +4430,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
44164430 // the last node is the embedding tensor
44174431 ggml_tensor * embeddings = ggml_graph_node (gf, -1 );
44184432
4419- // sanity check (only support batch size of 1 for now )
4433+ // sanity check (assuming that all images in batch have the same number of tokens, so we only check the first one )
44204434 const int n_tokens_out = embeddings->ne [1 ];
44214435 const int expected_n_tokens_out = clip_n_output_tokens (ctx, imgs.entries [0 ].get ());
44224436 if (n_tokens_out != expected_n_tokens_out) {
44234437 LOG_ERR (" %s: expected output %d tokens, got %d\n " , __func__, expected_n_tokens_out, n_tokens_out);
44244438 GGML_ABORT (" Invalid number of output tokens" );
44254439 }
44264440
4427- // copy the embeddings to the location passed by the user
4428- if (vec != nullptr ) {
4429- ggml_backend_tensor_get (embeddings, vec, 0 , ggml_nbytes (embeddings));
4441+ LOG_DBG (" %s: output embedding shape [%d, %d, %d]\n " , __func__,
4442+ (int )embeddings->ne [0 ], (int )embeddings->ne [1 ], (int )embeddings->ne [2 ]);
4443+
4444+ // copy output to user buffer if provided
4445+ // if output is empty, skip the copy
4446+ if (!out_batch_embd.empty ()) {
4447+ if (out_batch_embd.size () != (size_t )ggml_nelements (embeddings)) {
4448+ LOG_ERR (" %s: output buffer has %zu elements but expected %zu\n " , __func__, out_batch_embd.size (), (size_t )ggml_nelements (embeddings));
4449+ GGML_ABORT (" Output buffer size mismatch" );
4450+ }
4451+ ggml_backend_tensor_get (embeddings, out_batch_embd.data (), 0 , ggml_nbytes (embeddings));
4452+ } else {
4453+ LOG_WRN (" %s: output buffer is empty, skipping copy\n " , __func__);
44304454 }
44314455
44324456 // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
44334457 if (ctx->debug_output_embeddings ) {
44344458 const int64_t n_embd = embeddings->ne [0 ];
44354459 const int64_t n_tokens = embeddings->ne [1 ];
4436- std::vector<float > emb_data (n_embd * n_tokens );
4460+ std::vector<float > emb_data (ggml_nelements (embeddings) );
44374461 ggml_backend_tensor_get (embeddings, emb_data.data (), 0 , ggml_nbytes (embeddings));
44384462
44394463 LOG_INF (" \n === MTMD_DEBUG_EMBEDDINGS ===\n " );
@@ -4570,7 +4594,14 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
45704594 return ctx->model .modality == CLIP_MODALITY_AUDIO ;
45714595}
45724596
4573- int clip_model_n_batch_max (const struct clip_ctx * ctx) {
4597+ bool clip_support_batch (const struct clip_ctx * ctx) { // read-only accessor for the context's support_batch flag (false by default, assigned in init_ctx from the graph builder)
4598+ return ctx->support_batch ;
4599+ }
4600+
4601+ // TODO @ngxson : this is no longer correct with mtmd_batch API
4602+ // this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support)
4603+ // this logic should be refactored in near future to distinctly handle "merge frames" and "batching"
4604+ int clip_model_n_temporal_merge (const struct clip_ctx * ctx) {
45744605 switch (ctx->proj_type ()) {
45754606 case PROJECTOR_TYPE_QWEN2VL :
45764607 case PROJECTOR_TYPE_QWEN25VL :
0 commit comments