
Commit 2852c7c

stephencoxclaude committed
gemma4: fix audio encoder and LM precision issues
Audio encoder fixes:

- Fix swapped conv norm weight mapping in tensor_mapping.py (A_ENC_CONV_NORM and A_ENC_NORM_CONV had their gemma4 entries inverted, causing the conv pre-norm and internal norm weights to be swapped in GGUF. This produced 0.67 encoder cosine vs PyTorch; now 0.9999)
- Fix causal mask off-by-one: add (gq - gk) < max_past to match PyTorch's dist < left_window_size (was attending to 13 past tokens instead of 12)
- Use -1e9 instead of -INFINITY for masked positions to match PyTorch's attention_invalid_logits_value and avoid NaN in padded attention weights

LM fixes:

- Disable attention logit softcapping for Gemma4 (unlike Gemma2, Gemma4's text model does not use attn softcapping; was incorrectly hardcoded)
- Use BF16-rounded embedding scale constants to match PyTorch's native BF16 training precision (ref: PR ggml-org#21451). Fixes long-context coherence on CPU/Vulkan backends.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2ac0112 commit 2852c7c

4 files changed

Lines changed: 15 additions & 11 deletions


gguf-py/gguf/tensor_mapping.py

Lines changed: 4 additions & 4 deletions
@@ -2045,22 +2045,22 @@ class TensorNameMap:

        MODEL_TENSOR.A_ENC_CONV_NORM: (
            "conformer.layers.{bid}.conv.batch_norm",         # lfm2
-           "conformer.layers.{bid}.lconv1d.pre_layer_norm",  # gemma3n
+           "conformer.layers.{bid}.lconv1d.conv_norm",       # gemma4
        ),

        MODEL_TENSOR.A_ENC_CONV_PW1: (
            "conformer.layers.{bid}.conv.pointwise_conv1",    # lfm2
-           "conformer.layers.{bid}.lconv1d.linear_start",    # gemma3n
+           "conformer.layers.{bid}.lconv1d.linear_start",    # gemma4
        ),

        MODEL_TENSOR.A_ENC_CONV_PW2: (
            "conformer.layers.{bid}.conv.pointwise_conv2",    # lfm2
-           "conformer.layers.{bid}.lconv1d.linear_end",      # gemma3n
+           "conformer.layers.{bid}.lconv1d.linear_end",      # gemma4
        ),

        MODEL_TENSOR.A_ENC_NORM_CONV: (
            "conformer.layers.{bid}.norm_conv",               # lfm2
-           "conformer.layers.{bid}.lconv1d.conv_norm",       # gemma3n
+           "conformer.layers.{bid}.lconv1d.pre_layer_norm",  # gemma4
        ),

        MODEL_TENSOR.A_PER_DIM_K_SCALE: (
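The 0.67 → 0.9999 figure quoted in the commit message is a cosine similarity between the converted encoder's output and a PyTorch reference. As a rough illustration only (the toy vectors stand in for the two activation dumps; producing those dumps is not part of this commit), such a check could look like this:

#include <cmath>
#include <cstdio>
#include <vector>

// Cosine similarity between two flattened activation tensors, e.g. the
// llama.cpp encoder output vs. a PyTorch reference dump.
static double cosine_similarity(const std::vector<float> & a, const std::vector<float> & b) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    const size_t n = a.size() < b.size() ? a.size() : b.size();
    for (size_t i = 0; i < n; i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    return dot / (std::sqrt(na) * std::sqrt(nb) + 1e-12);
}

int main() {
    // toy vectors standing in for the two activation dumps
    std::vector<float> ggml_out  = { 0.10f, -0.52f, 0.33f, 0.71f };
    std::vector<float> torch_ref = { 0.11f, -0.50f, 0.35f, 0.70f };
    printf("cosine: %.4f\n", cosine_similarity(ggml_out, torch_ref));
    return 0;
}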

src/llama-model.cpp

Lines changed: 4 additions & 2 deletions
@@ -1186,14 +1186,16 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                uint32_t swa_period = 2;
                ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
                hparams.set_swa_pattern(swa_period);
-               hparams.attn_soft_cap = true;
                hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
                hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;

                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-               ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+               // Gemma4 does NOT use attention logit softcapping (unlike Gemma2)
+               hparams.f_attn_logit_softcapping = 0.0f;
+               ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
+               hparams.attn_soft_cap = (hparams.f_attn_logit_softcapping > 0.0f);
                ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false);

                switch (hparams.n_layer) {
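For context, attention logit softcapping (the Gemma2 behaviour that was previously hardcoded on) squashes each pre-softmax score through a scaled tanh. A scalar sketch of the operation follows; it is not the actual graph-building code, just an illustration of why forcing f_attn_logit_softcapping to 0.0f makes the softcap a no-op unless the GGUF metadata explicitly provides a value:

#include <cmath>
#include <cstdio>

// Scalar form of attention logit softcapping as used by Gemma2-style models:
// the raw score is squashed into (-softcap, +softcap). With a softcap of 0
// (the Gemma4 default after this change) the score passes through unchanged.
static float softcap_logit(float score, float softcap) {
    if (softcap <= 0.0f) {
        return score;                       // softcapping disabled
    }
    return softcap * tanhf(score / softcap);
}

int main() {
    printf("%.4f\n", softcap_logit(80.0f, 50.0f));  // ~50 * tanh(1.6) ≈ 46.1
    printf("%.4f\n", softcap_logit(80.0f,  0.0f));  // unchanged: 80
    return 0;
}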

src/models/gemma4-iswa.cpp

Lines changed: 5 additions & 3 deletions
@@ -10,7 +10,8 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll
     inpL = build_inp_embd(model.tok_embd);

     // important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
-    inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
+    // use BF16-rounded scale to match PyTorch's native BF16 training precision (ref: PR #21451)
+    inpL = ggml_scale(ctx0, inpL, ubatch.token ? ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf(n_embd))) : 1.0f);
     cb(inpL, "inp_scaled", -1);

     // inp_pos - contains the positions
@@ -139,8 +140,9 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll
         cb(cur_moe, "ffn_norm_2", il);

         // custom MoE logits calculation (router operates on attn_out, not cur)
+        // use BF16-rounded scale to match PyTorch's native BF16 training precision (ref: PR #21451)
         ggml_tensor * tmp = ggml_rms_norm(ctx0, attn_out, hparams.f_norm_rms_eps);
-        tmp = ggml_scale(ctx0, tmp, 1.0f / sqrtf((float) n_embd));
+        tmp = ggml_scale(ctx0, tmp, 1.0f / ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf((float) n_embd))));
         tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_gate_inp_s);
         ggml_tensor * logits = build_lora_mm(model.layers[il].ffn_gate_inp, tmp); // [n_expert, n_tokens]
         cb(logits, "ffn_moe_logits", il);
@@ -266,7 +268,7 @@ ggml_tensor * llm_build_gemma4_iswa::get_per_layer_inputs() {
         res->t_inp_tokens = inp->tokens;
         inp_per_layer = ggml_get_rows(ctx0, model.tok_embd_per_layer, inp->tokens);
         inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, n_tokens);
-        inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_per_layer));
+        inp_per_layer = ggml_scale(ctx0, inp_per_layer, ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf((float) n_embd_per_layer))));
         cb(inp_per_layer, "inp_per_layer_selected", -1);
         res->add_input(std::move(inp));
     } else {
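The ggml_bf16_to_fp32(ggml_fp32_to_bf16(...)) round trip above simply snaps the FP32 scale constant to the nearest BF16 value. A self-contained sketch of the same rounding (round-to-nearest-even on the upper 16 bits, NaN handling omitted), with a hypothetical hidden size purely to show the size of the precision difference:

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Round an FP32 value to the nearest BF16 value and back. This mirrors what
// the ggml_bf16_to_fp32(ggml_fp32_to_bf16(...)) round trip does to the scale.
static float bf16_round(float x) {
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    bits += 0x7FFF + ((bits >> 16) & 1);   // round to nearest, ties to even
    bits &= 0xFFFF0000u;                   // keep only the BF16 portion
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}

int main() {
    const float n_embd = 2560.0f;          // hypothetical hidden size, for illustration only
    const float fp32_scale = sqrtf(n_embd);
    printf("fp32 scale: %.6f\n", fp32_scale);              // 50.596443
    printf("bf16 scale: %.6f\n", bf16_round(fp32_scale));  // 50.500000
    return 0;
}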

tools/mtmd/clip.cpp

Lines changed: 2 additions & 2 deletions
@@ -3347,13 +3347,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima

     // Blocked causal attention mask: [context_size, chunk_size, num_blocks]
     {
-        std::vector<float> mask(context_size * chunk_size * num_blocks, -INFINITY);
+        std::vector<float> mask(context_size * chunk_size * num_blocks, -1e9f);
         for (int b = 0; b < num_blocks; b++) {
             for (int q = 0; q < chunk_size; q++) {
                 int gq = b * chunk_size + q;
                 for (int k = 0; k < context_size; k++) {
                     int gk = b * chunk_size - max_past + k;
-                    if (gq < n_pos && gk >= 0 && gk < n_pos && gk <= gq) {
+                    if (gq < n_pos && gk >= 0 && gk < n_pos && gk <= gq && (gq - gk) < max_past) {
                         mask[k + q * context_size + b * context_size * chunk_size] = 0.0f;
                     }
                 }
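To see the off-by-one the extra (gq - gk) < max_past condition removes, consider the first query of a block: the key window starts max_past positions before the block, so the old condition admitted max_past + 1 keys (the query itself plus max_past past positions), while the new one caps it at max_past. A standalone sketch with small, hypothetical values (chunk_size, max_past and n_pos here are illustrative, not the real encoder configuration):

#include <cstdio>

// Count how many keys the first query of a block may attend to under the old
// and new mask conditions from clip.cpp.
int main() {
    const int chunk_size   = 8;
    const int max_past     = 4;                    // hypothetical left window
    const int context_size = chunk_size + max_past;
    const int n_pos        = 64;
    const int b            = 2;                    // a block away from the start
    const int q            = 0;                    // first query of the block
    const int gq           = b * chunk_size + q;

    int old_keys = 0, new_keys = 0;
    for (int k = 0; k < context_size; k++) {
        const int gk = b * chunk_size - max_past + k;
        const bool base = gq < n_pos && gk >= 0 && gk < n_pos && gk <= gq;
        if (base)                         old_keys++;  // old: max_past + 1 keys
        if (base && (gq - gk) < max_past) new_keys++;  // new: max_past keys
    }
    printf("old mask: %d keys, new mask: %d keys\n", old_keys, new_keys);
    return 0;
}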
