Allow passing existing causal attention masks

janEbert · janEbert · commit 13becf1b3a01 · 2022-12-13T12:12:34.000+01:00
Since we create them in the T5 data loader, why not use them?
diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py
@@ -214,8 +214,7 @@ def forward_torch_softmax(self, input, mask):
         if self.scale is not None:
             input = input * self.scale
 
-        if self.attn_mask_type == AttnMaskType.causal:
-            assert mask is None
+        if self.attn_mask_type == AttnMaskType.causal and mask is None:
             assert input.shape[2] == input.shape[3]
             mask = self.get_causal_mask(input.shape[2])