Update AFMoE unit tests: use bf16 inputs, add parity and round-trip tests

akoumpa · akoumpa · commit 897a38b098cf · 2026-04-02T11:18:16.000-07:00
Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
diff --git a/tests/unit_tests/models/afmoe/test_afmoe_layers.py b/tests/unit_tests/models/afmoe/test_afmoe_layers.py
@@ -80,21 +80,58 @@ def test_has_qk_norm(self, tiny_config, backend_config):
         assert hasattr(attn, "k_norm")
 
     def test_forward_shape(self, tiny_config, backend_config, device):
-        attn = AfmoeAttention(tiny_config, layer_idx=0, backend=backend_config).to(device).to(torch.float32)
+        attn = AfmoeAttention(tiny_config, layer_idx=0, backend=backend_config).to(device)
 
         batch, seq_len = 2, 8
-        x = torch.randn(batch, seq_len, tiny_config.hidden_size, device=device)
+        x = torch.randn(batch, seq_len, tiny_config.hidden_size, device=device, dtype=torch.bfloat16)
         freqs_cis = torch.randn(batch, seq_len, tiny_config.head_dim, device=device)
 
         out = attn(x, freqs_cis=freqs_cis)
         assert out.shape == (batch, seq_len, tiny_config.hidden_size)
 
     def test_global_attention_forward_shape(self, tiny_config, backend_config, device):
-        attn = AfmoeAttention(tiny_config, layer_idx=1, backend=backend_config).to(device).to(torch.float32)
+        attn = AfmoeAttention(tiny_config, layer_idx=1, backend=backend_config).to(device)
 
         batch, seq_len = 2, 8
-        x = torch.randn(batch, seq_len, tiny_config.hidden_size, device=device)
+        x = torch.randn(batch, seq_len, tiny_config.hidden_size, device=device, dtype=torch.bfloat16)
         freqs_cis = torch.randn(batch, seq_len, tiny_config.head_dim, device=device)
 
         out = attn(x, freqs_cis=freqs_cis)
         assert out.shape == (batch, seq_len, tiny_config.hidden_size)
+
+
+class TestAfmoeAttentionParity:
+    def test_rope_conditional_local_vs_global(self, tiny_config, backend_config, device):
+        """Local attention (with RoPE) and global attention (without) must diverge given shared weights."""
+        torch.manual_seed(42)
+        local_attn = AfmoeAttention(tiny_config, layer_idx=0, backend=backend_config).to(device)
+        global_attn = AfmoeAttention(tiny_config, layer_idx=1, backend=backend_config).to(device)
+        global_attn.load_state_dict(local_attn.state_dict())
+
+        batch, seq_len = 2, 8
+        x = torch.randn(batch, seq_len, tiny_config.hidden_size, device=device, dtype=torch.bfloat16)
+        freqs_cis = torch.randn(batch, seq_len, tiny_config.head_dim, device=device)
+
+        with torch.no_grad():
+            local_out = local_attn(x, freqs_cis=freqs_cis)
+            global_out = global_attn(x, freqs_cis=freqs_cis)
+
+        max_diff = (local_out - global_out).abs().max().item()
+        assert max_diff > 0.01, f"RoPE should cause divergence, but max_diff={max_diff}"
+
+    def test_qk_norm_reduces_head_variance(self, tiny_config, backend_config, device):
+        """Per-head QK RMSNorm should equalize magnitudes across heads."""
+        attn = AfmoeAttention(tiny_config, layer_idx=0, backend=backend_config).to(device)
+
+        batch, seq_len = 1, 4
+        q = torch.randn(
+            batch, seq_len, tiny_config.num_attention_heads, tiny_config.head_dim, device=device, dtype=torch.bfloat16
+        )
+        q[:, :, 0, :] *= 10.0  # Make first head 10x larger
+
+        with torch.no_grad():
+            q_normed = attn.q_norm(q)
+
+        pre_var = q.norm(dim=-1).var(dim=-1).mean().item()
+        post_var = q_normed.norm(dim=-1).var(dim=-1).mean().item()
+        assert post_var < pre_var, "QK norm should reduce variance across heads"
diff --git a/tests/unit_tests/models/afmoe/test_afmoe_model.py b/tests/unit_tests/models/afmoe/test_afmoe_model.py
@@ -20,7 +20,8 @@
 from nemo_automodel.components.models.afmoe.config import AfmoeConfig
 from nemo_automodel.components.models.afmoe.model import AfmoeForCausalLM, AfmoeModel, Block, _build_moe_config
 from nemo_automodel.components.models.common import BackendConfig
-from nemo_automodel.components.moe.layers import MLP, MoE
+from nemo_automodel.components.moe.config import MoEConfig
+from nemo_automodel.components.moe.layers import MLP, Gate, MoE
 
 pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 
@@ -195,3 +196,85 @@ def test_fields_mapped_correctly(self, tiny_config):
         assert moe_cfg.route_scale == tiny_config.route_scale
         assert moe_cfg.norm_topk_prob is True
         assert moe_cfg.force_e_score_correction_bias is True
+
+
+class TestDualNormParity:
+    def test_manual_trace_matches_forward(self, tiny_config, backend_config, device):
+        """Manual 4-norm residual trace must be bit-identical to Block.forward()."""
+        torch.manual_seed(42)
+        moe_config = _build_moe_config(tiny_config)
+        block = Block(layer_idx=0, config=tiny_config, moe_config=moe_config, backend=backend_config).to(device)
+        block.eval()
+
+        batch, seq_len = 1, 4
+        x = torch.randn(batch, seq_len, tiny_config.hidden_size, device=device, dtype=torch.bfloat16)
+        freqs_cis = torch.randn(batch, seq_len, tiny_config.head_dim, device=device)
+
+        with torch.no_grad():
+            # Manual trace: attention sublayer
+            residual = x
+            h = block.input_layernorm(x)
+            h = block.self_attn(h, freqs_cis=freqs_cis)
+            h = block.post_attention_layernorm(h)
+            after_attn = residual + h
+
+            # Manual trace: MLP sublayer
+            residual = after_attn
+            h = block.pre_mlp_layernorm(after_attn)
+            h = block._mlp(h, padding_mask=None)
+            h = block.post_mlp_layernorm(h)
+            expected = residual + h
+
+            # Block forward
+            actual = block(x, freqs_cis=freqs_cis)
+
+        torch.testing.assert_close(actual, expected, rtol=0, atol=0)
+
+
+class TestMoeRoutingParity:
+    def test_sigmoid_norm_scale(self, device):
+        """Manual sigmoid -> topk -> normalize -> scale must match Gate.forward()."""
+        torch.manual_seed(42)
+
+        moe_config = MoEConfig(
+            dim=64,
+            inter_dim=128,
+            moe_inter_dim=32,
+            n_routed_experts=4,
+            n_shared_experts=1,
+            n_activated_experts=2,
+            n_expert_groups=1,
+            n_limited_groups=1,
+            train_gate=False,
+            gate_bias_update_factor=0.0,
+            score_func="sigmoid",
+            route_scale=2.0,
+            aux_loss_coeff=0.0,
+            norm_topk_prob=True,
+            force_e_score_correction_bias=True,
+            dtype=torch.bfloat16,
+        )
+
+        gate = Gate(moe_config).to(device)
+        torch.manual_seed(123)
+        gate.weight.data = torch.randn(4, 64, device=device, dtype=torch.bfloat16)
+
+        x = torch.randn(8, 64, device=device, dtype=torch.bfloat16)  # 8 tokens
+        token_mask = torch.ones(8, dtype=torch.bool, device=device)
+
+        with torch.no_grad():
+            weights, indices, aux_loss = gate(x, token_mask, cp_mesh=None)
+
+        # Manual reference: sigmoid -> bias -> topk -> gather original -> normalize -> scale
+        with torch.no_grad():
+            scores = torch.sigmoid(x @ gate.weight.data.T)  # [8, 4]
+            original_scores = scores.clone()
+            biased = scores + gate.e_score_correction_bias  # zeros, no-op
+            manual_idx = torch.topk(biased, 2, dim=-1)[1]
+            manual_w = original_scores.gather(1, manual_idx)
+            manual_w = manual_w / (manual_w.sum(dim=-1, keepdim=True) + 1e-20)
+            manual_w = manual_w * 2.0
+
+        assert torch.equal(indices, manual_idx), "Expert indices mismatch"
+        torch.testing.assert_close(weights, manual_w, rtol=1e-3, atol=1e-3)
+        assert aux_loss is None
diff --git a/tests/unit_tests/models/afmoe/test_afmoe_state_dict_adapter.py b/tests/unit_tests/models/afmoe/test_afmoe_state_dict_adapter.py
@@ -82,30 +82,30 @@ def adapter(config, moe_config, backend):
     return AfmoeStateDictAdapter(config, moe_config, backend, dtype=torch.bfloat16)
 
 
-def _make_hf_expert_state_dict(n_layers=2, n_experts=4, hidden=64, moe_inter=32, num_dense=1):
+def _make_hf_expert_state_dict(n_layers=2, n_experts=4, hidden=64, moe_inter=32, num_dense=1, dtype=torch.bfloat16):
     """Create a minimal HF-format state dict with router, experts, and expert_bias."""
     sd = {}
     for layer_idx in range(n_layers):
         prefix = f"model.layers.{layer_idx}"
         if layer_idx >= num_dense:
             # Router gate
-            sd[f"{prefix}.mlp.router.gate.weight"] = torch.randn(n_experts, hidden)
+            sd[f"{prefix}.mlp.router.gate.weight"] = torch.randn(n_experts, hidden, dtype=dtype)
             # Expert bias
             sd[f"{prefix}.mlp.expert_bias"] = torch.zeros(n_experts)
             # Per-expert weights
             for e in range(n_experts):
-                sd[f"{prefix}.mlp.experts.{e}.gate_proj.weight"] = torch.randn(moe_inter, hidden)
-                sd[f"{prefix}.mlp.experts.{e}.up_proj.weight"] = torch.randn(moe_inter, hidden)
-                sd[f"{prefix}.mlp.experts.{e}.down_proj.weight"] = torch.randn(hidden, moe_inter)
+                sd[f"{prefix}.mlp.experts.{e}.gate_proj.weight"] = torch.randn(moe_inter, hidden, dtype=dtype)
+                sd[f"{prefix}.mlp.experts.{e}.up_proj.weight"] = torch.randn(moe_inter, hidden, dtype=dtype)
+                sd[f"{prefix}.mlp.experts.{e}.down_proj.weight"] = torch.randn(hidden, moe_inter, dtype=dtype)
             # Shared expert
-            sd[f"{prefix}.mlp.shared_experts.gate_proj.weight"] = torch.randn(moe_inter, hidden)
-            sd[f"{prefix}.mlp.shared_experts.up_proj.weight"] = torch.randn(moe_inter, hidden)
-            sd[f"{prefix}.mlp.shared_experts.down_proj.weight"] = torch.randn(hidden, moe_inter)
+            sd[f"{prefix}.mlp.shared_experts.gate_proj.weight"] = torch.randn(moe_inter, hidden, dtype=dtype)
+            sd[f"{prefix}.mlp.shared_experts.up_proj.weight"] = torch.randn(moe_inter, hidden, dtype=dtype)
+            sd[f"{prefix}.mlp.shared_experts.down_proj.weight"] = torch.randn(hidden, moe_inter, dtype=dtype)
         else:
             # Dense MLP
-            sd[f"{prefix}.mlp.gate_proj.weight"] = torch.randn(128, hidden)
-            sd[f"{prefix}.mlp.up_proj.weight"] = torch.randn(128, hidden)
-            sd[f"{prefix}.mlp.down_proj.weight"] = torch.randn(hidden, 128)
+            sd[f"{prefix}.mlp.gate_proj.weight"] = torch.randn(128, hidden, dtype=dtype)
+            sd[f"{prefix}.mlp.up_proj.weight"] = torch.randn(128, hidden, dtype=dtype)
+            sd[f"{prefix}.mlp.down_proj.weight"] = torch.randn(hidden, 128, dtype=dtype)
     return sd
 
 
@@ -187,3 +187,20 @@ def test_to_hf_splits_experts(self, adapter):
             assert f"model.layers.1.mlp.experts.{e}.gate_proj.weight" in hf_sd
             assert f"model.layers.1.mlp.experts.{e}.up_proj.weight" in hf_sd
             assert f"model.layers.1.mlp.experts.{e}.down_proj.weight" in hf_sd
+
+    def test_roundtrip_preserves_all_values(self, adapter):
+        """HF -> NeMo -> HF round-trip must preserve exact tensor values."""
+        torch.manual_seed(42)
+        hf_sd = _make_hf_expert_state_dict()
+        originals = {k: v.clone() for k, v in hf_sd.items()}
+
+        nemo_sd = adapter.from_hf(hf_sd)
+        roundtrip_sd = adapter.to_hf(nemo_sd)
+
+        assert set(roundtrip_sd.keys()) == set(originals.keys()), (
+            f"Missing: {set(originals.keys()) - set(roundtrip_sd.keys())}, "
+            f"Extra: {set(roundtrip_sd.keys()) - set(originals.keys())}"
+        )
+        for key in originals:
+            max_diff = (originals[key].float() - roundtrip_sd[key].float()).abs().max().item()
+            assert max_diff == 0.0, f"Round-trip mismatch for {key}: max_diff={max_diff}"