@@ -131,6 +131,7 @@ struct mtmd_context {
     int n_threads;
     std::string media_marker;
     const int n_embd_text;
+    llama_rope_type decoder_rope;
 
     // these are not token, but strings used to mark the beginning and end of image/audio embeddings
     std::string img_beg;
@@ -167,7 +168,8 @@ struct mtmd_context {
         print_timings (ctx_params.print_timings),
         n_threads (ctx_params.n_threads),
         media_marker (ctx_params.media_marker),
-        n_embd_text (llama_model_n_embd_inp(text_model))
+        n_embd_text (llama_model_n_embd_inp(text_model)),
+        decoder_rope (llama_model_rope_type(text_model))
     {
         if (ctx_params.image_marker != nullptr) {
             throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -1029,20 +1031,8 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx, const mtmd_input_chunk * chu
 }
 
 bool mtmd_decode_use_mrope(mtmd_context * ctx) {
-    if (ctx->ctx_v == nullptr && ctx->proj_type_a() == PROJECTOR_TYPE_QWEN3A) {
-        // qwen3-asr
-        return true;
-    }
-    switch (ctx->proj_type_v()) {
-        case PROJECTOR_TYPE_QWEN2VL:
-        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_PADDLEOCR:
-            return true;
-        default:
-            return false;
-    }
+    return ctx->decoder_rope == LLAMA_ROPE_TYPE_MROPE
+        || ctx->decoder_rope == LLAMA_ROPE_TYPE_IMROPE;
 }
 
 bool mtmd_support_vision(mtmd_context * ctx) {
0 commit comments