[model] fix: use hf_config to check whether model is dense (#4414)

pavelgein · yaoyu-33 · web-flow · commit 394a7cfefee0 · 2026-06-24T21:27:36.000-07:00
Signed-off-by: Pavel Gein <pavel.gein@gmail.com>
Signed-off-by: yaoyu-33 <yaoyu.094@gmail.com>
Co-authored-by: yaoyu-33 <yaoyu.094@gmail.com>
diff --git a/src/megatron/bridge/models/gemma/gemma4_bridge.py b/src/megatron/bridge/models/gemma/gemma4_bridge.py
@@ -127,6 +127,17 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> "Gemma4ModelProv
         self._is_dense = False
         return self._build_moe_provider(hf_config)
 
+    def _text_config(self) -> Any | None:
+        """Return the text config used to dispatch dense vs MoE behavior."""
+        return getattr(self, "hf_config", None)
+
+    def _is_dense_config(self) -> bool:
+        """Return whether the current HF config describes a dense Gemma 4 model."""
+        if getattr(self, "_is_dense", False):
+            return True
+        text_config = self._text_config()
+        return text_config is not None and not getattr(text_config, "enable_moe_block", False)
+
     def _build_dense_provider(self, hf_config) -> Gemma4DenseProvider:
         """Build a Gemma4DenseProvider from HF config."""
         rope_params = getattr(hf_config, "rope_parameters", {}) or {}
@@ -269,13 +280,24 @@ def maybe_modify_loaded_hf_weight(
 
             if k_name not in hf_state_dict and v_name not in hf_state_dict:
                 q_weight = hf_state_dict[q_name]
-                num_q_heads = getattr(self, "_dense_num_attention_heads", 8)
-                kv_head_dim = q_weight.shape[0] // num_q_heads
-                num_kv_heads = getattr(
-                    self,
-                    "_dense_num_global_query_groups",
-                    getattr(self, "_dense_num_query_groups", 2),
+                text_config = self._text_config()
+                num_q_heads = getattr(
+                    text_config, "num_attention_heads", getattr(self, "_dense_num_attention_heads", 8)
                 )
+                kv_head_dim = q_weight.shape[0] // num_q_heads
+                num_kv_heads = getattr(text_config, "num_key_value_heads", getattr(self, "_dense_num_query_groups", 2))
+                layer_match = re.search(r"layers\.(\d+)\.", q_name)
+                layer_types = getattr(text_config, "layer_types", None)
+                if layer_match and layer_types:
+                    layer_idx = int(layer_match.group(1))
+                    if layer_idx < len(layer_types) and layer_types[layer_idx] == "full_attention":
+                        num_kv_heads = getattr(
+                            text_config,
+                            "num_global_key_value_heads",
+                            getattr(self, "_dense_num_global_query_groups", num_kv_heads),
+                        )
+                elif hasattr(self, "_dense_num_global_query_groups"):
+                    num_kv_heads = self._dense_num_global_query_groups
                 kv_shape = (num_kv_heads * kv_head_dim, q_weight.shape[1])
                 k_zero = torch.zeros(kv_shape, dtype=q_weight.dtype, device=q_weight.device)
                 return {"q": q_weight, "k": k_zero, "v": torch.zeros_like(k_zero)}
@@ -340,7 +362,7 @@ def _fuse_shared_expert_prenorm(
         return hf_weights
 
     def mapping_registry(self) -> MegatronMappingRegistry:
-        if getattr(self, "_is_dense", False):
+        if self._is_dense_config():
             return self._dense_mapping_registry()
         return self._moe_mapping_registry()
 
diff --git a/src/megatron/bridge/models/gemma_vl/gemma4_vl_bridge.py b/src/megatron/bridge/models/gemma_vl/gemma4_vl_bridge.py
@@ -184,12 +184,6 @@ def _text_config(self):
         hf_config = getattr(self, "hf_config", None)
         return getattr(hf_config, "text_config", None)
 
-    def _is_dense_e4b_config(self) -> bool:
-        if getattr(self, "_is_dense", False):
-            return True
-        text_config = self._text_config()
-        return text_config is not None and not getattr(text_config, "enable_moe_block", True)
-
     def _hf_layer_prefix(self) -> str:
         """VLM text weights live under ``model.language_model.*``."""
         return "model.language_model."
@@ -238,35 +232,9 @@ def _fuse_shared_expert_prenorm(
             hf_weights[role] = fused.to(weight.dtype)
         return hf_weights
 
-    def maybe_modify_loaded_hf_weight(
-        self, hf_param: str | dict[str, str], hf_state_dict: Mapping[str, torch.Tensor]
-    ) -> torch.Tensor:
-        """Handle special weight loading for Gemma 4 VLM."""
-        if self._is_dense_e4b_config() and isinstance(hf_param, dict) and "v" in hf_param:
-            k_name = hf_param["k"]
-            v_name = hf_param["v"]
-            q_name = hf_param["q"]
-            if k_name not in hf_state_dict and v_name not in hf_state_dict:
-                q_weight = hf_state_dict[q_name]
-                text_config = self._text_config()
-                num_q_heads = getattr(text_config, "num_attention_heads", 8)
-                num_kv_heads = getattr(text_config, "num_key_value_heads", 2)
-                layer_match = re.search(r"layers\.(\d+)\.", q_name)
-                layer_types = getattr(text_config, "layer_types", None)
-                if layer_match and layer_types:
-                    layer_idx = int(layer_match.group(1))
-                    if layer_idx < len(layer_types) and layer_types[layer_idx] == "full_attention":
-                        num_kv_heads = getattr(text_config, "num_global_key_value_heads", num_kv_heads)
-                kv_head_dim = q_weight.shape[0] // num_q_heads
-                kv_shape = (num_kv_heads * kv_head_dim, q_weight.shape[1])
-                k_zero = torch.zeros(kv_shape, dtype=q_weight.dtype, device=q_weight.device)
-                return {"q": q_weight, "k": k_zero, "v": torch.zeros_like(k_zero)}
-
-        return super().maybe_modify_loaded_hf_weight(hf_param, hf_state_dict)
-
     def mapping_registry(self) -> MegatronMappingRegistry:
         """Dispatch to Dense or MoE VLM mappings."""
-        if self._is_dense_e4b_config():
+        if self._is_dense_config():
             if self._conversion_mode() == "text":
                 return self._dense_mapping_registry(megatron_prefix="")
             return self._dense_vl_mapping_registry()
diff --git a/tests/unit_tests/models/gemma/test_gemma4_bridge.py b/tests/unit_tests/models/gemma/test_gemma4_bridge.py
@@ -308,6 +308,25 @@ def test_kv_synthesis_uses_dense_provider_head_metadata(self, bridge, mock_pretr
         assert result["k"].shape == (4, 8)
         assert result["v"].shape == (4, 8)
 
+    def test_kv_synthesis_uses_hf_config_without_provider_bridge(self, bridge, mock_hf_config_dense):
+        bridge.hf_config = mock_hf_config_dense
+        mock_hf_config_dense.num_attention_heads = 6
+        mock_hf_config_dense.num_key_value_heads = 5
+        mock_hf_config_dense.num_global_key_value_heads = 3
+        mock_hf_config_dense.layer_types = ["full_attention"]
+        q_weight = torch.randn(24, 8)
+        sd = {"model.layers.0.self_attn.q_proj.weight": q_weight}
+        hf_param = {
+            "q": "model.layers.0.self_attn.q_proj.weight",
+            "k": "model.layers.0.self_attn.k_proj.weight",
+            "v": "model.layers.0.self_attn.v_proj.weight",
+        }
+
+        result = bridge.maybe_modify_loaded_hf_weight(hf_param, sd)
+
+        assert result["k"].shape == (12, 8)
+        assert result["v"].shape == (12, 8)
+
     def test_kv_passthrough_when_v_present(self, bridge):
         sd = self._make_sd()
         sd["model.layers.0.self_attn.v_proj.weight"] = torch.randn(4, 8)
@@ -486,6 +505,30 @@ def test_has_post_moe_layernorm(self, bridge):
         names = self._collect_names(bridge.mapping_registry())
         assert any("post_moe_layernorm" in n for n in names)
 
+    def test_selects_dense_registry_from_hf_config_without_provider_bridge(self, bridge, mock_hf_config_dense):
+        bridge.hf_config = mock_hf_config_dense
+
+        names = self._collect_names(bridge.mapping_registry())
+
+        assert "per_layer_embedding.weight" in names
+        assert "decoder.layers.*.mlp.router.weight" not in names
+
+    def test_selects_dense_registry_when_enable_moe_block_missing(self, bridge):
+        bridge.hf_config = Mock(spec=[])
+
+        names = self._collect_names(bridge.mapping_registry())
+
+        assert "per_layer_embedding.weight" in names
+        assert "decoder.layers.*.mlp.router.weight" not in names
+
+    def test_selects_moe_registry_from_hf_config_without_provider_bridge(self, bridge, mock_hf_config_moe):
+        bridge.hf_config = mock_hf_config_moe
+
+        names = self._collect_names(bridge.mapping_registry())
+
+        assert "decoder.layers.*.mlp.router.weight" in names
+        assert "per_layer_embedding.weight" not in names
+
     def test_has_layer_scalar_mapping(self, bridge):
         names = self._collect_names(bridge.mapping_registry())
         assert any("layer_scalar" in n for n in names)