CrazyForks · pull · May 15, 2026 · May 15, 2026 · May 15, 2026 · May 15, 2026
diff --git a/conversion/qwen3vl.py b/conversion/qwen3vl.py
@@ -183,6 +183,9 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
         if name.startswith("model.visual."):
             name = name.replace("model.visual.", "visual.", 1)
 
+        if name.startswith("thinker.audio_tower."):
+            name = name.replace("thinker.audio_tower.", "audio_tower.", 1)
+
         if "visual." not in name and "audio_tower." not in name:
             return None
 

diff --git a/scripts/webui-download.cmake b/scripts/webui-download.cmake
@@ -184,8 +184,8 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED)
             foreach(asset ${ASSETS})
                 set(download_path "${PUBLIC_DIR}/${asset}")
                 file(SHA256 "${download_path}" asset_hash)
-                string(TOUPPER "${asset_hash}" EXPECTED_HASH_UPPER)
-                string(REGEX MATCH "${EXPECTED_HASH_UPPER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}")
+                string(TOLOWER "${asset_hash}" EXPECTED_HASH_LOWER)
+                string(REGEX MATCH "${EXPECTED_HASH_LOWER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}")
                 if(NOT CHECKSUM_LINE)
                     message(WARNING "WebUI: checksum verification failed for ${asset}")
                     message(WARNING "  downloaded file may not match expected checksum, but will be used")

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -8337,6 +8337,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {1536, 1}, {1, 1}));
         }
     }
+
+    // BF16 is absent from base_types: add the 3 standard non-contig permutations explicitly
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16,  1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16,  1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16,  1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16,  8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16,  8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16,  8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
+
     for (ggml_type type_a : other_types) {
         for (ggml_type type_b : {GGML_TYPE_F32}) {
             if (ggml_blck_size(type_a) != 256) {

diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h
@@ -11,6 +11,10 @@
 
 #define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)
 
+struct build_vit_opts {
+    ggml_tensor * attn_mask = nullptr;
+};
+
 struct clip_graph {
     const clip_model & model;
     const clip_hparams & hparams;
@@ -63,7 +67,8 @@ struct clip_graph {
                 norm_type norm_t,
                 ffn_op_type ffn_t,
                 ggml_tensor * learned_pos_embd,
-                std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
+                std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos,
+                const build_vit_opts & opts = {});
 
     // build the input after conv2d (inp_raw --> patches)
     // returns tensor with shape [n_embd, n_patches]

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
@@ -300,7 +300,8 @@ ggml_tensor * clip_graph::build_vit(
             norm_type norm_t,
             ffn_op_type ffn_t,
             ggml_tensor * learned_pos_embd,
-            std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
+            std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos,
+            const build_vit_opts & opts
         ) {
     if (learned_pos_embd) {
         inp = ggml_add(ctx0, inp, learned_pos_embd);
@@ -427,7 +428,7 @@ ggml_tensor * clip_graph::build_vit(
             }
 
             cur = build_attn(layer.o_w, layer.o_b,
-                Qcur, Kcur, Vcur, nullptr, kq_scale, il);
+                Qcur, Kcur, Vcur, opts.attn_mask, kq_scale, il);
             cb(cur, "attn_out", il);
         }
 
@@ -663,6 +664,9 @@ ggml_tensor * clip_graph::build_attn(
 
         k = ggml_cast(ctx0, k, GGML_TYPE_F16);
         v = ggml_cast(ctx0, v, GGML_TYPE_F16);
+        if (kq_mask) {
+            kq_mask = ggml_cast(ctx0, kq_mask, GGML_TYPE_F16);
+        }
 
         cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f);
         ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
@@ -3244,12 +3248,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             } break;
         case PROJECTOR_TYPE_QWEN3A:
             {
-                // 3x stride-2 conv2d: each step is floor((n-1)/2)+1
-                int n = img->nx;
-                n = (n - 1) / 2 + 1;
-                n = (n - 1) / 2 + 1;
-                n = (n - 1) / 2 + 1;
-                n_patches = n;
+                // chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk
+                const int chunk_size       = 100;
+                const int tokens_per_chunk = 13;
+                n_patches = (img->nx / chunk_size) * tokens_per_chunk;
             } break;
         case PROJECTOR_TYPE_GLMA:
             {
@@ -4292,21 +4294,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
     return ctx->model.modality == CLIP_MODALITY_AUDIO;
 }
 
-bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
-    switch (ctx->proj_type()) {
-        case PROJECTOR_TYPE_ULTRAVOX:
-        case PROJECTOR_TYPE_QWEN2A:
-        case PROJECTOR_TYPE_QWEN3A:
-        case PROJECTOR_TYPE_GLMA:
-        case PROJECTOR_TYPE_VOXTRAL:
-        case PROJECTOR_TYPE_MERALION:
-        case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-            return true;
-        default:
-            return false;
-    }
-}
-
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);

diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
@@ -115,7 +115,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
 
 bool clip_has_vision_encoder(const struct clip_ctx * ctx);
 bool clip_has_audio_encoder(const struct clip_ctx * ctx);
-bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
 
 struct clip_cap {
     bool has_vision;

diff --git a/tools/mtmd/models/qwen3a.cpp b/tools/mtmd/models/qwen3a.cpp
@@ -1,68 +1,88 @@
 #include "models.h"
 
 ggml_cgraph * clip_graph_qwen3a::build() {
+    // Ref implementation: https://github.com/QwenLM/Qwen3-ASR/blob/main/qwen_asr/core/transformers_backend/modeling_qwen3_asr.py
+
+    // inp_raw: [n_frames, n_mel, 1]  (nx=n_frames, ny=n_mel)
     ggml_tensor * inp = build_inp_raw(1);
 
-    // conv2d block
-    // TODO: do we need to split by chunks of n_window each like on transformers impl?
-    {
-        inp = ggml_conv_2d(ctx0, model.conv2d_1_w, inp, 2, 2, 1, 1, 1, 1);
-        inp = ggml_add(ctx0, inp, model.conv2d_1_b);
-        inp = ggml_gelu_erf(ctx0, inp);
+    const int64_t n_frames   = inp->ne[0]; // total frames, padded to multiple of chunk_size
+    const int64_t n_mel      = inp->ne[1]; // 128
+    const int64_t chunk_size = 100;        // n_window * 2 (n_window=50 from model config)
+    const int64_t n_chunks   = n_frames / chunk_size;
 
-        inp = ggml_conv_2d(ctx0, model.conv2d_2_w, inp, 2, 2, 1, 1, 1, 1);
-        inp = ggml_add(ctx0, inp, model.conv2d_2_b);
-        inp = ggml_gelu_erf(ctx0, inp);
+    GGML_ASSERT(n_frames % chunk_size == 0); // preprocessor should already pad the input
+    GGML_ASSERT(inp->type == GGML_TYPE_F32);
 
-        inp = ggml_conv_2d(ctx0, model.conv2d_3_w, inp, 2, 2, 1, 1, 1, 1);
-        inp = ggml_add(ctx0, inp, model.conv2d_3_b);
-        inp = ggml_gelu_erf(ctx0, inp);
+    // View mel spectrogram as batched 100-frame chunks: [chunk_size, n_mel, 1, n_chunks]
+    inp = ggml_view_4d(ctx0, inp,
+        chunk_size, n_mel, 1, n_chunks,
+        n_frames   * (int64_t)sizeof(float), // nb[1]: stride over mel bins
+        chunk_size * (int64_t)sizeof(float), // nb[2]: stride for C=1 (unused)
+        chunk_size * (int64_t)sizeof(float), // nb[3]: stride over chunks
+        0);
+    inp = ggml_cont(ctx0, inp);
+    cb(inp, "inp_chunks", -1);
 
-        // inp [n_pos, n_mels/8, channels, 1] (W, H, C, N)
-        cb(inp, "after_conv_blocks", -1);
+    // 3 x conv2d + gelu
+    {
+        // conv output [OW, OH, C_out, n_chunks]
+        auto conv_block = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
+            x = ggml_conv_2d(ctx0, w, x, 2, 2, 1, 1, 1, 1);
+            if (b) {
+                x = ggml_add(ctx0, x, ggml_reshape_4d(ctx0, b, 1, 1, x->ne[2], 1));
+            }
+            return ggml_gelu_erf(ctx0, x);
+        };
 
-        const int64_t n_pos_after_conv = inp->ne[0];
-        const int64_t n_mel_after_conv = inp->ne[1]; // 128/8 = 16
+        inp = conv_block(inp, model.conv2d_1_w, model.conv2d_1_b);
+        inp = conv_block(inp, model.conv2d_2_w, model.conv2d_2_b);
+        inp = conv_block(inp, model.conv2d_3_w, model.conv2d_3_b);
+        // inp: [OW=13, OH=16, OC=480, n_chunks]
+        cb(inp, "after_conv_blocks", -1);
+    }
 
-        inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 3, 1));
-        inp = ggml_reshape_2d(ctx0, inp, n_pos_after_conv, n_mel_after_conv * inp->ne[3]); // [n_pos, 7680]
-        inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); // [7680, n_pos]
+    // permute [OW=13, OH=16, OC=480, n_chunks] -> [OH=16, OC=480, OW=13, n_chunks]
+    // reshape to [OH*OC=7680, OW*n_chunks]
+    // feature index h+16*c = c*16+f (matches python code)
+    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 2, 0, 1, 3));
+    inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2] * inp->ne[3]);
 
-        // project to n_embd
-        inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
-        if (model.conv_out_b) {
-            inp = ggml_add(ctx0, inp, model.conv_out_b);
-        }
-        cb(inp, "after_conv_out", -1);
+    // Project to d_model: [d_model, 13*n_chunks]
+    inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
+    if (model.conv_out_b) {
+        inp = ggml_add(ctx0, inp, model.conv_out_b);
     }
+    cb(inp, "after_conv_out", -1);
 
-    auto n_pos = inp->ne[1];
+    const int64_t n_pos = inp->ne[1]; // 13 * n_chunks
 
-    ggml_tensor * pos_embd_selected = ggml_view_2d(
-        ctx0, model.position_embeddings,
-        model.position_embeddings->ne[0], n_pos,
-        model.position_embeddings->nb[1], 0
-    );
-    ggml_tensor * cur = build_vit(
-                            inp, n_pos,
-                            NORM_TYPE_NORMAL,
-                            hparams.ffn_op,
-                            pos_embd_selected,
-                            nullptr);
+    // Per-chunk positional embeddings: repeat pos[0:13] for each chunk
+    // (position indices reset 0..12 per chunk, not sequential across chunks)
+    {
+        const int64_t tokens_per_chunk = n_pos / n_chunks; // 13
+        ggml_tensor * pos_tmp = ggml_view_2d(ctx0, model.position_embeddings,
+            model.position_embeddings->ne[0], tokens_per_chunk,
+            model.position_embeddings->nb[1], 0);
+        ggml_tensor * tgt = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,
+            model.position_embeddings->ne[0], n_pos);
+        inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, pos_tmp, tgt));
+    }
 
+    ggml_tensor * cur = build_vit(inp, n_pos,
+        NORM_TYPE_NORMAL, hparams.ffn_op,
+        nullptr,  // pos embd already added above
+        nullptr);
     cb(cur, "after_transformer", -1);
 
-    // projector
+    // MLP projector
     cur = build_ffn(cur,
         model.mm_1_w, model.mm_1_b,
         nullptr, nullptr,
         model.mm_2_w, model.mm_2_b,
-        FFN_GELU_ERF,
-        -1);
-
+        FFN_GELU_ERF, -1);
     cb(cur, "projected", -1);
 
     ggml_build_forward_expand(gf, cur);
-
     return gf;
 }
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
@@ -609,6 +609,110 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
     return true;
 }
 
+//
+// mtmd_audio_preprocessor_qwen3a
+//
+// Matches the Python WhisperFeatureExtractor called with truncation=False:
+//   - reflection padding of n_fft/2 samples at each end (center=True)
+//   - Whisper-style log10 + (max-8)/4 normalization applied to full audio
+//   - output split into windows of at most 800 mel frames (n_window_infer), each
+//     padded to a multiple of 100 frames (n_window * 2) for the cgraph batch view
+//
+
+void mtmd_audio_preprocessor_qwen3a::initialize() {
+    cache.fill_sin_cos_table(hparams.audio_n_fft);
+    cache.fill_hann_window(hparams.audio_window_len, true);
+    cache.fill_mel_filterbank_matrix(hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate);
+}
+
+bool mtmd_audio_preprocessor_qwen3a::preprocess(const float *                 samples,
+                                                 size_t                        n_samples,
+                                                 std::vector<mtmd_audio_mel> & output) {
+    if (n_samples == 0) {
+        return false;
+    }
+
+    GGML_ASSERT(!cache.sin_vals.empty());
+    GGML_ASSERT(!cache.cos_vals.empty());
+    GGML_ASSERT(!cache.filters.data.empty());
+
+    // Reflection-pad n_fft/2 samples at each end, matching WhisperFeatureExtractor center=True
+    const int pad = hparams.audio_n_fft / 2; // = 200
+
+    std::vector<float> padded(n_samples + 2 * pad, 0.0f);
+    // Reflect start: padded[0..pad-1] = samples[pad..1] (reversed)
+    for (int i = 0; i < pad; i++) {
+        int src = pad - i; // samples[pad], samples[pad-1], ..., samples[1]
+        padded[i] = (src < (int)n_samples) ? samples[src] : 0.0f;
+    }
+    std::copy(samples, samples + n_samples, padded.begin() + pad);
+    // Reflect end: padded[n+pad..n+2*pad-1] = samples[n-2..n-pad-1] (reversed)
+    for (int i = 0; i < pad; i++) {
+        int src = (int)n_samples - 2 - i; // samples[n-2], samples[n-3], ...
+        padded[n_samples + pad + i] = (src >= 0) ? samples[src] : 0.0f;
+    }
+
+    filter_params params;
+    params.n_mel            = hparams.n_mel_bins;
+    params.n_fft_bins       = 1 + (hparams.audio_n_fft / 2);
+    params.hann_window_size = hparams.audio_window_len;
+    params.hop_length       = hparams.audio_hop_len;
+    params.sample_rate      = hparams.audio_sample_rate;
+    params.no_padding       = true; // reflection padding already applied above
+    params.use_natural_log  = false; // log10
+
+    mtmd_audio_mel mel_full;
+    bool ok = log_mel_spectrogram(padded.data(), (int)padded.size(), 4, params, cache, mel_full);
+    if (!ok) {
+        return false;
+    }
+
+    // Whisper-style normalization: floor values at (max - 8), then map v -> (v + 4) / 4
+    {
+        double mmax = -1e20;
+        for (float v : mel_full.data) {
+            if (v > mmax) mmax = v;
+        }
+        mmax -= 8.0;
+        for (float & v : mel_full.data) {
+            v = (std::max((double)v, mmax) + 4.0) / 4.0;
+        }
+    }
+
+    // The effective frame count: center-padded STFT gives ~n_samples/hop_length frames.
+    // We take min(mel_full.n_len, n_samples/hop + 1) to avoid including excess frames.
+    const int n_eff = std::min(mel_full.n_len,
+                               (int)(n_samples / hparams.audio_hop_len) + 1);
+
+    // Split into inference windows matching n_window_infer=800 from model config.
+    // Each window is padded to the next multiple of chunk_size for the cgraph.
+    // The mtmd caller loops over output entries, so long audio is handled automatically.
+    const int chunk_size  = 100; // conv sub-chunk size (n_window * 2, n_window=50)
+    const int window_size = 800; // mel frames per forward pass (n_window_infer=800)
+
+    for (int off = 0; off < n_eff; off += window_size) {
+        const int win_eff    = std::min(window_size, n_eff - off);
+        const int n_chunks   = (win_eff + chunk_size - 1) / chunk_size;
+        const int n_padded   = n_chunks * chunk_size;
+
+        mtmd_audio_mel out;
+        out.n_mel     = mel_full.n_mel;
+        out.n_len     = n_padded;
+        out.n_len_org = win_eff;
+        out.data.assign(out.n_mel * out.n_len, 0.0f);
+        for (int m = 0; m < out.n_mel; m++) {
+            const int copy_len = std::min(win_eff, mel_full.n_len - off);
+            if (copy_len > 0) {
+                std::copy(mel_full.data.begin() + (size_t)m * mel_full.n_len + off,
+                          mel_full.data.begin() + (size_t)m * mel_full.n_len + off + copy_len,
+                          out.data.begin()      + (size_t)m * out.n_len);
+            }
+        }
+        output.push_back(std::move(out));
+    }
+    return true;
+}
+
 //
 // mtmd_audio_preprocessor_conformer
 //

diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h
@@ -96,6 +96,15 @@ struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor {
     mtmd_audio_cache cache;
 };
 
+struct mtmd_audio_preprocessor_qwen3a : mtmd_audio_preprocessor {
+    mtmd_audio_preprocessor_qwen3a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
+    void initialize() override;
+    bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;
+
+  private:
+    mtmd_audio_cache cache;
+};
+
 //
 // streaming ISTFT - converts spectrogram frames back to audio one frame at a time
 //