Skip to content

Commit 073bb2c

Browse files
authored
mtmd : add MERaLiON-2 multimodal audio support (ggml-org#21756)
* mtmd : add MERaLiON-2 multimodal audio support

Adds support for A*STAR's MERaLiON-2 audio-language model (3B and 10B)
to the multimodal framework.

Architecture:
- Whisper large-v2 encoder for audio feature extraction
- Gated MLP adaptor: ln_speech -> frame stack (x15) -> Linear+SiLU -> GLU -> out_proj
- Gemma2 3B / 27B decoder

The mmproj GGUF is generated via convert_hf_to_gguf.py --mmproj on the
full MERaLiON-2 model directory (architecture: MERaLiON2ForConditionalGeneration).
The decoder is converted separately as a standard Gemma2 model after
stripping the text_decoder. weight prefix.

New projector type: PROJECTOR_TYPE_MERALION

Supports tasks: speech transcription (EN/ZH/MS/TA), translation, spoken QA.

Model:
https://huggingface.co/MERaLiON/MERaLiON-2-3B
https://huggingface.co/MERaLiON/MERaLiON-2-10B

* simplify comments in meralion adaptor

* meralion: use format_tensor_name, ascii arrows in comments
1 parent af1127d commit 073bb2c

File tree

8 files changed

+103
-2
lines changed

8 files changed

+103
-2
lines changed

convert_hf_to_gguf.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11279,6 +11279,48 @@ def set_gguf_parameters(self):
1127911279
self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
1128011280

1128111281

11282+
@ModelBase.register("MERaLiON2ForConditionalGeneration")
class MERaLiONWhisperEncoderModel(WhisperEncoderModel):
    # MERaLiON-2 mmproj: Whisper audio encoder + gated-MLP adaptor.
    # Audio-only model — no vision tower is exported.
    has_vision_encoder = False
    has_audio_encoder = True

    def get_audio_config(self) -> dict[str, Any] | None:
        # The Whisper encoder hyperparameters live under "speech_config"
        # in the top-level HF config of the conditional-generation model.
        return self.global_config.get("speech_config")

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.MERALION)
        # Frame-stacking factor used by the adaptor; falls back to the
        # model's documented default of 15 when the key is absent.
        self.gguf_writer.add_audio_stack_factor(self.global_config.get("speech_mlp_scale_factor", 15))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Decoder weights are converted separately as a plain Gemma2 model,
        # so they are dropped from the mmproj file entirely.
        if name.startswith("text_decoder."):
            return

        # Whisper encoder tensors: rename the prefix so the base class's
        # standard audio-tower mapping applies, then delegate to it.
        if name.startswith("speech_encoder."):
            yield from super().modify_tensors(data_torch, name.replace("speech_encoder.", "audio_tower."), bid)
            return

        suffix = "." + name.rsplit(".", 1)[-1]  # ".weight" or ".bias"

        # LayerNorm applied to encoder output before the adaptor.
        if name.startswith("ln_speech."):
            yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MM_NORM_PRE, suffix=suffix), data_torch)
            return

        # Gated-MLP adaptor: route each HF projection to its indexed
        # A_MMPROJ slot (0 = frame compression, 1 = gate, 2 = pool, 3 = out).
        if name.startswith("speech_audio_adapter."):
            slot_markers = (
                (".mlp_adapter.0.", 0),
                (".gate_proj.", 1),
                (".pool_proj.", 2),
                (".out_proj.", 3),
            )
            for marker, idx in slot_markers:
                if marker in name:
                    yield (self.format_tensor_name(gguf.MODEL_TENSOR.A_MMPROJ, idx, suffix=suffix), data_torch)
                    break
            return

        # Anything else falls through to the base-class handling.
        yield from super().modify_tensors(data_torch, name, bid)
1128211324
@ModelBase.register("VoxtralForConditionalGeneration")
1128311325
class VoxtralWhisperEncoderModel(WhisperEncoderModel):
1128411326
has_vision_encoder = False # no vision encoder

gguf-py/gguf/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4115,6 +4115,7 @@ class VisionProjectorType:
41154115
GLMA = "glma" # audio
41164116
QWEN25O = "qwen2.5o" # omni
41174117
VOXTRAL = "voxtral"
4118+
MERALION = "meralion" # audio: Whisper + gated MLP adaptor
41184119
LFM2 = "lfm2"
41194120
KIMIVL = "kimivl"
41204121
PADDLEOCR = "paddleocr"

gguf-py/gguf/tensor_mapping.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2041,7 +2041,7 @@ class TensorNameMap:
20412041
# this prefix is added in the conversion code in modify_tensors()
20422042

20432043
MODEL_TENSOR.A_MMPROJ: (
2044-
"audio.multi_modal_projector.linear_{bid}", # ultravox
2044+
"audio.multi_modal_projector.linear_{bid}", # ultravox, meralion
20452045
"audio_adapter.model.{bid}" # lfm2
20462046
),
20472047

tools/mtmd/clip-impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ enum projector_type {
259259
PROJECTOR_TYPE_GLMA,
260260
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
261261
PROJECTOR_TYPE_VOXTRAL,
262+
PROJECTOR_TYPE_MERALION,
262263
PROJECTOR_TYPE_MUSIC_FLAMINGO,
263264
PROJECTOR_TYPE_LFM2,
264265
PROJECTOR_TYPE_KIMIVL,
@@ -302,6 +303,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
302303
{ PROJECTOR_TYPE_GLMA, "glma"},
303304
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
304305
{ PROJECTOR_TYPE_VOXTRAL, "voxtral"},
306+
{ PROJECTOR_TYPE_MERALION, "meralion"},
305307
{ PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"},
306308
{ PROJECTOR_TYPE_LFM2, "lfm2"},
307309
{ PROJECTOR_TYPE_KIMIVL, "kimivl"},

tools/mtmd/clip-model.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,8 @@ struct clip_model {
467467

468468
bool audio_has_stack_frames() const {
469469
return proj_type == PROJECTOR_TYPE_ULTRAVOX
470-
|| proj_type == PROJECTOR_TYPE_VOXTRAL;
470+
|| proj_type == PROJECTOR_TYPE_VOXTRAL
471+
|| proj_type == PROJECTOR_TYPE_MERALION;
471472
}
472473
};
473474

tools/mtmd/clip.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -890,6 +890,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
890890
case PROJECTOR_TYPE_VOXTRAL:
891891
case PROJECTOR_TYPE_QWEN2A:
892892
case PROJECTOR_TYPE_GLMA:
893+
case PROJECTOR_TYPE_MERALION:
893894
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
894895
{
895896
builder = std::make_unique<clip_graph_whisper_enc>(ctx, img);
@@ -1399,10 +1400,12 @@ struct clip_model_loader {
13991400
case PROJECTOR_TYPE_QWEN2A:
14001401
case PROJECTOR_TYPE_GLMA:
14011402
case PROJECTOR_TYPE_VOXTRAL:
1403+
case PROJECTOR_TYPE_MERALION:
14021404
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
14031405
{
14041406
bool require_stack = model.proj_type == PROJECTOR_TYPE_ULTRAVOX ||
14051407
model.proj_type == PROJECTOR_TYPE_VOXTRAL ||
1408+
model.proj_type == PROJECTOR_TYPE_MERALION ||
14061409
model.proj_type == PROJECTOR_TYPE_GLMA;
14071410
get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor, require_stack);
14081411
hparams.ffn_op = FFN_GELU_ERF;
@@ -2017,6 +2020,30 @@ struct clip_model_loader {
20172020
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
20182021
model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
20192022
} break;
2023+
case PROJECTOR_TYPE_MERALION:
2024+
{
2025+
// Whisper encoder conv layers
2026+
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
2027+
model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
2028+
model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
2029+
model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
2030+
// MERaLiON adaptor: 4 linear layers + ln_pre
2031+
// linear_0 = frame compression (19200->6400) + SiLU
2032+
// linear_1 = gate_proj (6400->6400) for GLU
2033+
// linear_2 = pool_proj (6400->6400) for GLU
2034+
// linear_3 = out_proj (6400->3584)
2035+
model.mm_0_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "weight"));
2036+
model.mm_0_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 0, "bias"));
2037+
model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
2038+
model.mm_1_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "bias"));
2039+
model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
2040+
model.mm_2_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "bias"));
2041+
model.mm_3_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "weight"));
2042+
model.mm_3_b = get_tensor(string_format(TN_MM_AUDIO_MLP, 3, "bias"));
2043+
// ln_speech (LayerNorm before adaptor)
2044+
model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
2045+
model.mm_norm_pre_b = get_tensor(string_format(TN_MM_NORM_PRE, "bias"));
2046+
} break;
20202047
case PROJECTOR_TYPE_QWEN2A:
20212048
{
20222049
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
@@ -2809,6 +2836,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
28092836
case PROJECTOR_TYPE_VOXTRAL:
28102837
case PROJECTOR_TYPE_ULTRAVOX:
28112838
case PROJECTOR_TYPE_QWEN2A:
2839+
case PROJECTOR_TYPE_MERALION:
28122840
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
28132841
{
28142842
n_patches = img->nx;
@@ -3298,6 +3326,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
32983326
case PROJECTOR_TYPE_ULTRAVOX:
32993327
case PROJECTOR_TYPE_LFM2:
33003328
case PROJECTOR_TYPE_VOXTRAL:
3329+
case PROJECTOR_TYPE_MERALION:
33013330
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
33023331
case PROJECTOR_TYPE_JANUS_PRO:
33033332
case PROJECTOR_TYPE_PHI4:
@@ -3463,6 +3492,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
34633492
case PROJECTOR_TYPE_VOXTRAL:
34643493
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
34653494
return ctx->model.mm_2_w->ne[1];
3495+
case PROJECTOR_TYPE_MERALION:
3496+
return ctx->model.mm_3_w->ne[1]; // out_proj output dim
34663497
case PROJECTOR_TYPE_INTERNVL:
34673498
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
34683499
return ctx->model.mm_3_w->ne[1];
@@ -3523,6 +3554,7 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
35233554
case PROJECTOR_TYPE_QWEN2A:
35243555
case PROJECTOR_TYPE_GLMA:
35253556
case PROJECTOR_TYPE_VOXTRAL:
3557+
case PROJECTOR_TYPE_MERALION:
35263558
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
35273559
return true;
35283560
default:

tools/mtmd/models/whisper-enc.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,28 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
9595
FFN_GELU_ERF,
9696
-1);
9797

98+
} else if (proj_type == PROJECTOR_TYPE_MERALION) {
99+
// stack (above) -> ln -> linear0+silu -> GLU -> out
100+
cur = ggml_norm(ctx0, cur, hparams.eps);
101+
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
102+
cur = ggml_add(ctx0, cur, model.mm_norm_pre_b);
103+
104+
cur = ggml_mul_mat(ctx0, model.mm_0_w, cur);
105+
cur = ggml_add(ctx0, cur, model.mm_0_b);
106+
cur = ggml_silu(ctx0, cur);
107+
108+
ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_1_w, cur);
109+
gate = ggml_add(ctx0, gate, model.mm_1_b);
110+
gate = ggml_silu(ctx0, gate);
111+
112+
ggml_tensor * pool = ggml_mul_mat(ctx0, model.mm_2_w, cur);
113+
pool = ggml_add(ctx0, pool, model.mm_2_b);
114+
115+
cur = ggml_mul(ctx0, gate, pool);
116+
117+
cur = ggml_mul_mat(ctx0, model.mm_3_w, cur);
118+
cur = ggml_add(ctx0, cur, model.mm_3_b);
119+
98120
} else if (proj_type == PROJECTOR_TYPE_GLMA) {
99121
cur = ggml_norm(ctx0, cur, hparams.eps);
100122
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);

tools/mtmd/mtmd.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,7 @@ struct mtmd_context {
476476
} break;
477477
case PROJECTOR_TYPE_ULTRAVOX:
478478
case PROJECTOR_TYPE_GLMA:
479+
case PROJECTOR_TYPE_MERALION:
479480
{
480481
audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
481482
} break;

0 commit comments

Comments
 (0)