@@ -1390,6 +1390,95 @@ def test_projection_initialization(self):
     self.assertTrue(hasattr(mla_layer, "kv_norm"), "MLA should have 'kv_norm' projection.")
     self.assertTrue(hasattr(mla_layer, "out"), "MLA should have 'out' projection.")
 
+  def test_fused_mla_lora_proj_output_equivalence(self):
+    """Tests that fused_mla_lora_proj=True produces outputs numerically equivalent to fused_mla_lora_proj=False."""
+    extra_args = get_decoupled_parallelism_overrides()
+
+    # Initialize the unfused model.
+    unfused_args = {**self.config_arguments, "fused_mla_lora_proj": False, **extra_args}
+    cfg_unfused = pyconfig.initialize([sys.argv[0], get_test_config_path()], **unfused_args)
+    devices_array = maxtext_utils.create_device_mesh(cfg_unfused)
+    mesh = Mesh(devices_array, cfg_unfused.mesh_axes)
+    dummy_q = jnp.ones((cfg_unfused.global_batch_size_to_train_on, cfg_unfused.max_target_length, cfg_unfused.base_emb_dim))
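+    # Dummy activations of shape (batch, sequence, embed); used only to derive the projection input shapes below.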
+    mla_unfused = MLA(
+        config=cfg_unfused,
+        num_query_heads=cfg_unfused.num_query_heads,
+        num_kv_heads=cfg_unfused.num_kv_heads,
+        head_dim=cfg_unfused.head_dim,
+        inputs_q_shape=dummy_q.shape,
+        inputs_kv_shape=dummy_q.shape,
+        max_target_length=cfg_unfused.max_target_length,
+        max_prefill_predict_length=cfg_unfused.max_prefill_predict_length,
+        mesh=mesh,
+        attention_kernel="dot_product",
+        dtype=cfg_unfused.dtype,
+        dropout_rate=cfg_unfused.dropout_rate,
+        attention_type=cfg_unfused.attention_type,
+        q_lora_rank=cfg_unfused.q_lora_rank,
+        kv_lora_rank=cfg_unfused.kv_lora_rank,
+        qk_nope_head_dim=cfg_unfused.qk_nope_head_dim,
+        qk_rope_head_dim=cfg_unfused.qk_rope_head_dim,
+        v_head_dim=cfg_unfused.v_head_dim,
+        model_mode=MODEL_MODE_TRAIN,
+        rngs=nnx.Rngs(params=0, dropout=jax.random.PRNGKey(42)),
+    )
+
+    # Initialize the fused model.
+    fused_args = {**self.config_arguments, "fused_mla_lora_proj": True, **extra_args}
+    cfg_fused = pyconfig.initialize([sys.argv[0], get_test_config_path()], **fused_args)
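+    # The mesh built above is reused: both configs carry the same parallelism overrides, so the device mesh is identical.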
1429+ mla_fused = MLA (
1430+ config = cfg_fused ,
1431+ num_query_heads = cfg_fused .num_query_heads ,
1432+ num_kv_heads = cfg_fused .num_kv_heads ,
1433+ head_dim = cfg_fused .head_dim ,
1434+ inputs_q_shape = dummy_q .shape ,
1435+ inputs_kv_shape = dummy_q .shape ,
1436+ max_target_length = cfg_fused .max_target_length ,
1437+ max_prefill_predict_length = cfg_fused .max_prefill_predict_length ,
1438+ mesh = mesh ,
1439+ attention_kernel = "dot_product" ,
1440+ dtype = cfg_fused .dtype ,
1441+ dropout_rate = cfg_fused .dropout_rate ,
1442+ attention_type = cfg_fused .attention_type ,
1443+ q_lora_rank = cfg_fused .q_lora_rank ,
1444+ kv_lora_rank = cfg_fused .kv_lora_rank ,
1445+ qk_nope_head_dim = cfg_fused .qk_nope_head_dim ,
1446+ qk_rope_head_dim = cfg_fused .qk_rope_head_dim ,
1447+ v_head_dim = cfg_fused .v_head_dim ,
1448+ model_mode = MODEL_MODE_TRAIN ,
1449+ rngs = nnx .Rngs (params = 0 , dropout = jax .random .PRNGKey (42 )),
1450+ )
1451+
1452+ # Make both models mathematically equivalent:
1453+ # fused wq_kv_a = concat(unfused wq_a, unfused wkv_a) along the output axis.
1454+ mla_fused .wq_kv_a .kernel .value = jnp .concatenate (
1455+ [mla_unfused .wq_a .kernel .value , mla_unfused .wkv_a .kernel .value ], axis = - 1
1456+ )
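+    # The remaining projections are unaffected by the fusion, so their weights transfer verbatim.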
+    mla_fused.wq_b.kernel.value = mla_unfused.wq_b.kernel.value
+    mla_fused.q_norm.scale.value = mla_unfused.q_norm.scale.value
+    mla_fused.wkv_b.kernel.value = mla_unfused.wkv_b.kernel.value
+    mla_fused.kv_norm.scale.value = mla_unfused.kv_norm.scale.value
+    mla_fused.out.kernel.value = mla_unfused.out.kernel.value
+
+    # Run both models on the same inputs and verify the outputs match within tolerance.
+    lnx, decoder_segment_ids, decoder_positions = self.get_data(cfg_unfused, cfg_unfused.dtype)
+    common_kwargs = dict(
+        decoder_segment_ids=decoder_segment_ids,
+        inputs_positions=decoder_positions,
+        deterministic=True,
+        model_mode=MODEL_MODE_TRAIN,
+    )
+    output_unfused, _ = mla_unfused(lnx, lnx, **common_kwargs)
+    output_fused, _ = mla_fused(lnx, lnx, **common_kwargs)
+
+    self.assertTrue(
+        jnp.allclose(output_unfused, output_fused, rtol=1e-05, atol=1e-05, equal_nan=False),
+        "fused_mla_lora_proj=True and fused_mla_lora_proj=False produced different outputs.",
+    )
+
   @parameterized.named_parameters(
       {
           "testcase_name": "cp_no_load_balance",