Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions conversion/qwen3vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,9 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca
if name.startswith("model.visual."):
name = name.replace("model.visual.", "visual.", 1)

if name.startswith("thinker.audio_tower."):
name = name.replace("thinker.audio_tower.", "audio_tower.", 1)

if "visual." not in name and "audio_tower." not in name:
return None

Expand Down
4 changes: 2 additions & 2 deletions scripts/webui-download.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -184,8 +184,8 @@ if(NOT PROVISION_SUCCESS AND HF_ENABLED)
foreach(asset ${ASSETS})
set(download_path "${PUBLIC_DIR}/${asset}")
file(SHA256 "${download_path}" asset_hash)
string(TOUPPER "${asset_hash}" EXPECTED_HASH_UPPER)
string(REGEX MATCH "${EXPECTED_HASH_UPPER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}")
string(TOLOWER "${asset_hash}" EXPECTED_HASH_LOWER)
string(REGEX MATCH "${EXPECTED_HASH_LOWER}[ \\t]+${asset}" CHECKSUM_LINE "${CHECKSUMS_CONTENT}")
if(NOT CHECKSUM_LINE)
message(WARNING "WebUI: checksum verification failed for ${asset}")
message(WARNING " downloaded file may not match expected checksum, but will be used")
Expand Down
12 changes: 12 additions & 0 deletions tests/test-backend-ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8337,6 +8337,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, 256, {1536, 1}, {1, 1}));
}
}

// BF16 is absent from base_types: add the 3 standard non-contig permutations explicitly
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 1, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 1, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 1, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 8, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 8, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 8, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 2, 1, 3}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_BF16, GGML_TYPE_F32, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));

for (ggml_type type_a : other_types) {
for (ggml_type type_b : {GGML_TYPE_F32}) {
if (ggml_blck_size(type_a) != 256) {
Expand Down
7 changes: 6 additions & 1 deletion tools/mtmd/clip-graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@

#define DEFAULT_INTERPOLATION_MODE (GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)

// Optional settings for clip_graph::build_vit.
// The parameter defaults to an empty value, so existing callers need not pass one.
struct build_vit_opts {
    // Mask forwarded to build_attn; nullptr (the default) means no mask is passed.
    ggml_tensor * attn_mask{nullptr};
};

struct clip_graph {
const clip_model & model;
const clip_hparams & hparams;
Expand Down Expand Up @@ -63,7 +67,8 @@ struct clip_graph {
norm_type norm_t,
ffn_op_type ffn_t,
ggml_tensor * learned_pos_embd,
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos);
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos,
const build_vit_opts & opts = {});

// build the input after conv2d (inp_raw --> patches)
// returns tensor with shape [n_embd, n_patches]
Expand Down
33 changes: 10 additions & 23 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,8 @@ ggml_tensor * clip_graph::build_vit(
norm_type norm_t,
ffn_op_type ffn_t,
ggml_tensor * learned_pos_embd,
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos
std::function<ggml_tensor *(ggml_tensor *, const clip_layer &)> add_pos,
const build_vit_opts & opts
) {
if (learned_pos_embd) {
inp = ggml_add(ctx0, inp, learned_pos_embd);
Expand Down Expand Up @@ -427,7 +428,7 @@ ggml_tensor * clip_graph::build_vit(
}

cur = build_attn(layer.o_w, layer.o_b,
Qcur, Kcur, Vcur, nullptr, kq_scale, il);
Qcur, Kcur, Vcur, opts.attn_mask, kq_scale, il);
cb(cur, "attn_out", il);
}

Expand Down Expand Up @@ -663,6 +664,9 @@ ggml_tensor * clip_graph::build_attn(

k = ggml_cast(ctx0, k, GGML_TYPE_F16);
v = ggml_cast(ctx0, v, GGML_TYPE_F16);
if (kq_mask) {
kq_mask = ggml_cast(ctx0, kq_mask, GGML_TYPE_F16);
}

cur = ggml_flash_attn_ext(ctx0, q, k, v, kq_mask, kq_scale, 0.0f, 0.0f);
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
Expand Down Expand Up @@ -3244,12 +3248,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
} break;
case PROJECTOR_TYPE_QWEN3A:
{
// 3x stride-2 conv2d: each step is floor((n-1)/2)+1
int n = img->nx;
n = (n - 1) / 2 + 1;
n = (n - 1) / 2 + 1;
n = (n - 1) / 2 + 1;
n_patches = n;
// chunk_size=100 frames --> 3x stride-2 conv2d --> 13 tokens per chunk
const int chunk_size = 100;
const int tokens_per_chunk = 13;
n_patches = (img->nx / chunk_size) * tokens_per_chunk;
} break;
case PROJECTOR_TYPE_GLMA:
{
Expand Down Expand Up @@ -4292,21 +4294,6 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_AUDIO;
}

// Returns true when the context's projector type is one of the audio
// projector types enumerated below, and false for every other type.
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
    const auto proj = ctx->proj_type();

    return proj == PROJECTOR_TYPE_ULTRAVOX
        || proj == PROJECTOR_TYPE_QWEN2A
        || proj == PROJECTOR_TYPE_QWEN3A
        || proj == PROJECTOR_TYPE_GLMA
        || proj == PROJECTOR_TYPE_VOXTRAL
        || proj == PROJECTOR_TYPE_MERALION
        || proj == PROJECTOR_TYPE_MUSIC_FLAMINGO;
}

bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);
Expand Down
1 change: 0 additions & 1 deletion tools/mtmd/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel

bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);

struct clip_cap {
bool has_vision;
Expand Down
104 changes: 62 additions & 42 deletions tools/mtmd/models/qwen3a.cpp
Original file line number Diff line number Diff line change
@@ -1,68 +1,88 @@
#include "models.h"

ggml_cgraph * clip_graph_qwen3a::build() {
// Ref implementation: https://github.com/QwenLM/Qwen3-ASR/blob/main/qwen_asr/core/transformers_backend/modeling_qwen3_asr.py

// inp_raw: [n_frames, n_mel, 1] (nx=n_frames, ny=n_mel)
ggml_tensor * inp = build_inp_raw(1);

// conv2d block
// TODO: do we need to split by chunks of n_window each like on transformers impl?
{
inp = ggml_conv_2d(ctx0, model.conv2d_1_w, inp, 2, 2, 1, 1, 1, 1);
inp = ggml_add(ctx0, inp, model.conv2d_1_b);
inp = ggml_gelu_erf(ctx0, inp);
const int64_t n_frames = inp->ne[0]; // total frames, padded to multiple of chunk_size
const int64_t n_mel = inp->ne[1]; // 128
const int64_t chunk_size = 100; // n_window * 2 (n_window=50 from model config)
const int64_t n_chunks = n_frames / chunk_size;

inp = ggml_conv_2d(ctx0, model.conv2d_2_w, inp, 2, 2, 1, 1, 1, 1);
inp = ggml_add(ctx0, inp, model.conv2d_2_b);
inp = ggml_gelu_erf(ctx0, inp);
GGML_ASSERT(n_frames % chunk_size == 0); // preprocessor should already pad the input
GGML_ASSERT(inp->type == GGML_TYPE_F32);

inp = ggml_conv_2d(ctx0, model.conv2d_3_w, inp, 2, 2, 1, 1, 1, 1);
inp = ggml_add(ctx0, inp, model.conv2d_3_b);
inp = ggml_gelu_erf(ctx0, inp);
// View mel spectrogram as batched 100-frame chunks: [chunk_size, n_mel, 1, n_chunks]
inp = ggml_view_4d(ctx0, inp,
chunk_size, n_mel, 1, n_chunks,
n_frames * (int64_t)sizeof(float), // nb[1]: stride over mel bins
chunk_size * (int64_t)sizeof(float), // nb[2]: stride for C=1 (unused)
chunk_size * (int64_t)sizeof(float), // nb[3]: stride over chunks
0);
inp = ggml_cont(ctx0, inp);
cb(inp, "inp_chunks", -1);

// inp [n_pos, n_mels/8, channels, 1] (W, H, C, N)
cb(inp, "after_conv_blocks", -1);
// 3 x conv2d + gelu
{
// conv output [OW, OH, C_out, n_chunks]
auto conv_block = [&](ggml_tensor * x, ggml_tensor * w, ggml_tensor * b) {
x = ggml_conv_2d(ctx0, w, x, 2, 2, 1, 1, 1, 1);
if (b) {
x = ggml_add(ctx0, x, ggml_reshape_4d(ctx0, b, 1, 1, x->ne[2], 1));
}
return ggml_gelu_erf(ctx0, x);
};

const int64_t n_pos_after_conv = inp->ne[0];
const int64_t n_mel_after_conv = inp->ne[1]; // 128/8 = 16
inp = conv_block(inp, model.conv2d_1_w, model.conv2d_1_b);
inp = conv_block(inp, model.conv2d_2_w, model.conv2d_2_b);
inp = conv_block(inp, model.conv2d_3_w, model.conv2d_3_b);
// inp: [OW=13, OH=16, OC=480, n_chunks]
cb(inp, "after_conv_blocks", -1);
}

inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 3, 1));
inp = ggml_reshape_2d(ctx0, inp, n_pos_after_conv, n_mel_after_conv * inp->ne[3]); // [n_pos, 7680]
inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); // [7680, n_pos]
// permute [OW=13, OH=16, OC=480, n_chunks] -> [OH=16, OC=480, OW=13, n_chunks]
// reshape to [OH*OC=7680, OW*n_chunks]
// feature index h+16*c = c*16+f (matches python code)
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 2, 0, 1, 3));
inp = ggml_reshape_2d(ctx0, inp, inp->ne[0] * inp->ne[1], inp->ne[2] * inp->ne[3]);

// project to n_embd
inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
if (model.conv_out_b) {
inp = ggml_add(ctx0, inp, model.conv_out_b);
}
cb(inp, "after_conv_out", -1);
// Project to d_model: [d_model, 13*n_chunks]
inp = ggml_mul_mat(ctx0, model.conv_out_w, inp);
if (model.conv_out_b) {
inp = ggml_add(ctx0, inp, model.conv_out_b);
}
cb(inp, "after_conv_out", -1);

auto n_pos = inp->ne[1];
const int64_t n_pos = inp->ne[1]; // 25 * n_chunks

ggml_tensor * pos_embd_selected = ggml_view_2d(
ctx0, model.position_embeddings,
model.position_embeddings->ne[0], n_pos,
model.position_embeddings->nb[1], 0
);
ggml_tensor * cur = build_vit(
inp, n_pos,
NORM_TYPE_NORMAL,
hparams.ffn_op,
pos_embd_selected,
nullptr);
// Per-chunk positional embeddings: repeat pos[0:13] for each chunk
// (position indices reset 0..12 per chunk, not sequential across chunks)
{
const int64_t tokens_per_chunk = n_pos / n_chunks; // 13
ggml_tensor * pos_tmp = ggml_view_2d(ctx0, model.position_embeddings,
model.position_embeddings->ne[0], tokens_per_chunk,
model.position_embeddings->nb[1], 0);
ggml_tensor * tgt = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,
model.position_embeddings->ne[0], n_pos);
inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, pos_tmp, tgt));
}

ggml_tensor * cur = build_vit(inp, n_pos,
NORM_TYPE_NORMAL, hparams.ffn_op,
nullptr, // pos embd already added above
nullptr);
cb(cur, "after_transformer", -1);

// projector
// MLP projector
cur = build_ffn(cur,
model.mm_1_w, model.mm_1_b,
nullptr, nullptr,
model.mm_2_w, model.mm_2_b,
FFN_GELU_ERF,
-1);

FFN_GELU_ERF, -1);
cb(cur, "projected", -1);

ggml_build_forward_expand(gf, cur);

return gf;
}
104 changes: 104 additions & 0 deletions tools/mtmd/mtmd-audio.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,110 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s
return true;
}

//
// mtmd_audio_preprocessor_qwen3a
//
// Matches the Python WhisperFeatureExtractor called with truncation=False:
// - reflection padding of n_fft/2 samples at each end (center=True)
// - Whisper-style log10 + (max-8)/4 normalization applied to full audio
// - output split into windows of at most 800 mel frames (n_window_infer),
// each zero-padded to a multiple of 100 frames (n_window * 2) for the cgraph batch view
//

// Fills the cache tables that preprocess() later asserts on and passes to
// log_mel_spectrogram: the sin/cos table, the Hann window and the mel filterbank.
void mtmd_audio_preprocessor_qwen3a::initialize() {
    const auto & hp = hparams;

    // sized by the FFT length
    cache.fill_sin_cos_table(hp.audio_n_fft);

    // sized by the analysis window length
    cache.fill_hann_window(hp.audio_window_len, true);

    // mel bins x FFT size at the model's sample rate
    cache.fill_mel_filterbank_matrix(hp.n_mel_bins, hp.audio_n_fft, hp.audio_sample_rate);
}

// Converts raw audio samples into log-mel windows for the Qwen3 audio graph.
//
// samples, n_samples: input audio; an empty input (n_samples == 0) is rejected.
// output: one mtmd_audio_mel is appended per window of up to 800 mel frames.
//         Each entry is zero-padded to a multiple of 100 frames (n_len) and
//         records its unpadded frame count in n_len_org.
// Returns false for empty input or when log_mel_spectrogram fails, true otherwise.
//
// initialize() must have been called first: the cache tables are asserted non-empty.
bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * samples,
                                                size_t n_samples,
                                                std::vector<mtmd_audio_mel> & output) {
    if (n_samples == 0) {
        return false;
    }

    GGML_ASSERT(!cache.sin_vals.empty());
    GGML_ASSERT(!cache.cos_vals.empty());
    GGML_ASSERT(!cache.filters.data.empty());

    // Reflection-pad n_fft/2 samples at each end, matching WhisperFeatureExtractor center=True
    const int pad = hparams.audio_n_fft / 2;

    std::vector<float> padded(n_samples + 2 * pad, 0.0f);

    // Leading reflection: padded[0..pad-1] = samples[pad], samples[pad-1], ..., samples[1].
    // Indices beyond the input (very short audio) stay at zero.
    for (int i = 0; i < pad; i++) {
        const int src = pad - i;
        padded[i] = (src < static_cast<int>(n_samples)) ? samples[src] : 0.0f;
    }

    std::copy(samples, samples + n_samples, padded.begin() + pad);

    // Trailing reflection: samples[n-2], samples[n-3], ... ; negative indices stay at zero.
    for (int i = 0; i < pad; i++) {
        const int src = static_cast<int>(n_samples) - 2 - i;
        padded[n_samples + pad + i] = (src >= 0) ? samples[src] : 0.0f;
    }

    filter_params params;
    params.n_mel            = hparams.n_mel_bins;
    params.n_fft_bins       = 1 + (hparams.audio_n_fft / 2);
    params.hann_window_size = hparams.audio_window_len;
    params.hop_length       = hparams.audio_hop_len;
    params.sample_rate      = hparams.audio_sample_rate;
    params.no_padding       = true;  // reflection padding already applied above
    params.use_natural_log  = false; // log10

    mtmd_audio_mel mel_full;
    // NOTE(review): the literal 4 is kept from the original; presumably a thread count - confirm.
    if (!log_mel_spectrogram(padded.data(), static_cast<int>(padded.size()), 4, params, cache, mel_full)) {
        return false;
    }

    // Whisper-style normalization: clamp every value to at least (max - 8), then map x -> (x + 4) / 4.
    {
        double floor_val = -1e20;
        for (float v : mel_full.data) {
            if (v > floor_val) {
                floor_val = v;
            }
        }
        floor_val -= 8.0;
        for (float & v : mel_full.data) {
            v = (std::max(static_cast<double>(v), floor_val) + 4.0) / 4.0;
        }
    }

    // Effective frame count: never more than the spectrogram holds, and at most
    // n_samples / hop_length + 1, so excess frames are not emitted.
    const int n_eff = std::min(mel_full.n_len,
                               static_cast<int>(n_samples / hparams.audio_hop_len) + 1);

    // Split into inference windows matching n_window_infer=800 from model config.
    // Each window is padded up to the next multiple of chunk_size for the cgraph.
    const int chunk_size  = 100; // conv sub-chunk size (n_window * 2, n_window=50)
    const int window_size = 800; // mel frames per forward pass (n_window_infer=800)

    for (int off = 0; off < n_eff; off += window_size) {
        const int win_eff  = std::min(window_size, n_eff - off);
        const int n_chunks = (win_eff + chunk_size - 1) / chunk_size;

        // Frames to copy per mel row. Identical for every row, so it is computed
        // once per window rather than once per row.
        const int copy_len = std::min(win_eff, mel_full.n_len - off);

        mtmd_audio_mel out;
        out.n_mel     = mel_full.n_mel;
        out.n_len     = n_chunks * chunk_size;
        out.n_len_org = win_eff;
        out.data.assign(out.n_mel * out.n_len, 0.0f); // zero fill provides the tail padding

        if (copy_len > 0) {
            for (int m = 0; m < out.n_mel; m++) {
                // Rows are stored contiguously: row m starts at m * n_len in each buffer.
                const auto src_first = mel_full.data.begin() + static_cast<size_t>(m) * mel_full.n_len + off;
                std::copy(src_first,
                          src_first + copy_len,
                          out.data.begin() + static_cast<size_t>(m) * out.n_len);
            }
        }
        output.push_back(std::move(out));
    }
    return true;
}

//
// mtmd_audio_preprocessor_conformer
//
Expand Down
9 changes: 9 additions & 0 deletions tools/mtmd/mtmd-audio.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,15 @@ struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor {
mtmd_audio_cache cache;
};

// Audio preprocessor for the "qwen3a" audio model.
// Derives from mtmd_audio_preprocessor and overrides both of its hooks.
struct mtmd_audio_preprocessor_qwen3a : mtmd_audio_preprocessor {
// Forwards the clip context to the base preprocessor.
mtmd_audio_preprocessor_qwen3a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {}
// Fills the cache tables; preprocess() asserts they are non-empty.
void initialize() override;
// Converts audio samples to mel data, appending one entry per window to output.
// Returns false on empty input or when the mel computation fails.
bool preprocess(const float * samples, size_t n_samples, std::vector<mtmd_audio_mel> & output) override;

private:
// Tables filled by initialize() and read by preprocess().
mtmd_audio_cache cache;
};

//
// streaming ISTFT - converts spectrogram frames back to audio one frame at a time
//
Expand Down
Loading
Loading