Causal transformer (#15730)

nithinraok · web-flow · commit 2ea3e0f05b24 · 2026-05-26T10:02:09.000-04:00
* Add full causal support

Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>

* fix black/isort

Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>

---------

Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>
diff --git a/nemo/collections/asr/modules/transformer_encoder.py b/nemo/collections/asr/modules/transformer_encoder.py
@@ -16,7 +16,7 @@
 
 import torch
 import torch.nn as nn
-from torch.nn.attention.flex_attention import create_block_mask, flex_attention
+from torch.nn.attention.flex_attention import and_masks, create_block_mask, flex_attention
 
 flex_attention_compiled = torch.compile(flex_attention, dynamic=True)
 
@@ -33,8 +33,8 @@ class TransformerEncoderConfig:
     ff_expansion: float = 4.0
     pre_block_norm: bool = True
     subsampling_factor: int = 4
-    # Attention mode — currently only "full" is supported.
-    # Future: "causal", "lookahead", "local", "sliding_window"
+    # Attention mode: "full" (bidirectional) or "causal" (each token only attends to itself and earlier tokens).
+    # Future: "lookahead", "local", "sliding_window".
     attn_mode: str = "full"
 
 
@@ -47,6 +47,18 @@ def pad_mask(b, h, q_idx, kv_idx):
     return pad_mask
 
 
+def _make_causal_mod():
+    """Causal mask — each query attends only to its own and earlier kv positions (diagonal included)."""
+
+    def causal(b, h, q_idx, kv_idx):
+        return q_idx >= kv_idx
+
+    return causal
+
+
+_SUPPORTED_ATTN_MODES = ("full", "causal")
+
+
 class FeatureStacking(nn.Module):
     """Stacks consecutive input frames and projects to model dimension.
 
@@ -174,7 +186,8 @@ class TransformerEncoder(nn.Module):
             such as Whisper or GPT-2 — required when loading pretrained weights from those
             checkpoints.
         subsampling_factor: Frame stacking factor for the pre-encoder.
-        attn_mode: Attention pattern — currently only "full" (bidirectional) is supported.
+        attn_mode: Attention pattern — "full" (bidirectional, default) or "causal" (each token
+            only attends to itself and earlier tokens).
     """
 
     def __init__(
@@ -194,8 +207,11 @@ def __init__(
         super().__init__()
         if d_model % n_heads != 0:
             raise ValueError(f"d_model ({d_model}) must be divisible by n_heads ({n_heads}).")
-        if attn_mode != "full":
-            raise ValueError(f"attn_mode='{attn_mode}' is not yet supported. Currently only 'full' is available.")
+        if attn_mode not in _SUPPORTED_ATTN_MODES:
+            raise ValueError(
+                f"attn_mode='{attn_mode}' is not yet supported. " f"Supported modes: {_SUPPORTED_ATTN_MODES}."
+            )
+        self.attn_mode = attn_mode
 
         cfg = TransformerEncoderConfig(
             feat_in=feat_in,
@@ -231,7 +247,11 @@ def forward(self, audio_signal, length):
         x = self.embed_norm(x)
 
         B, T, _ = x.shape
-        block_mask = create_block_mask(_make_padding_mod(length), B=B, H=1, Q_LEN=T, KV_LEN=T, device=x.device)
+        if self.attn_mode == "causal":
+            mask_mod = and_masks(_make_causal_mod(), _make_padding_mod(length))
+        else:
+            mask_mod = _make_padding_mod(length)
+        block_mask = create_block_mask(mask_mod, B=B, H=1, Q_LEN=T, KV_LEN=T, device=x.device)
 
         for layer in self.layers:
             x = layer(x, block_mask=block_mask)
diff --git a/tests/collections/asr/test_transformer_encoder.py b/tests/collections/asr/test_transformer_encoder.py
@@ -124,7 +124,44 @@ def test_model_creation_without_qk_norm(self):
     @pytest.mark.unit
     def test_invalid_attn_mode(self):
         with pytest.raises(ValueError, match="not yet supported"):
-            TransformerEncoder(feat_in=80, d_model=64, n_heads=4, n_layers=2, attn_mode="causal")
+            TransformerEncoder(feat_in=80, d_model=64, n_heads=4, n_layers=2, attn_mode="sliding_window")
+
+    @pytest.mark.unit
+    def test_causal_forward_cpu(self):
+        model = TransformerEncoder(feat_in=80, d_model=64, n_heads=4, n_layers=2, drop_rate=0.0, attn_mode="causal")
+        model.eval()
+
+        x = torch.randn(2, 80, 400)
+        lengths = torch.tensor([400, 300])
+
+        with torch.no_grad():
+            out, out_lengths = model(x, lengths)
+
+        assert out.shape == (2, 64, 100)
+        assert out_lengths.tolist() == [100, 75]
+        assert not torch.isnan(out).any()
+
+    @pytest.mark.unit
+    def test_causal_future_does_not_affect_past(self):
+        """Output at position t must be invariant to changes at positions > t."""
+        model = TransformerEncoder(feat_in=80, d_model=64, n_heads=4, n_layers=2, drop_rate=0.0, attn_mode="causal")
+        model.eval()
+
+        B, C, T = 1, 80, 400
+        x_a = torch.randn(B, C, T)
+        x_b = x_a.clone()
+        # Perturb only the second half of frames.
+        x_b[:, :, T // 2 :] = torch.randn(B, C, T - T // 2)
+        lengths = torch.tensor([T])
+
+        with torch.no_grad():
+            out_a, _ = model(x_a, lengths)
+            out_b, _ = model(x_b, lengths)
+
+        # Output frames covering only past + present should be identical.
+        # First half of *output* frames corresponds to first half of input frames after subsampling.
+        safe_t = (T // 2) // model.pre_encode.subsampling_factor
+        assert torch.allclose(out_a[:, :, :safe_t], out_b[:, :, :safe_t], atol=1e-5)
 
     @pytest.mark.unit
     def test_forward_cpu(self):