Skip to content

Commit 393915f

Browse files
committed
softmax forward 2 passes
1 parent 9d5afc4 commit 393915f

File tree

1 file changed

+5
-11
lines changed

1 file changed

+5
-11
lines changed

src/liger_kernel/ops/backends/_ascend/ops/multi_token_attention.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -64,17 +64,11 @@ def _fused_mask_softmax_fwd_kernel(
6464
# Second pass: normalize and store
6565
for block_start in range(0, valid_len, BLOCK_SIZE):
6666
col_idx = block_start + tl.arange(0, BLOCK_SIZE)
67-
col_mask = col_idx < valid_len
68-
vals = tl.load(row_ptr + col_idx, mask=col_mask, other=float("-inf"))
69-
exp_vals = tl.exp(vals - max_val)
70-
probs = exp_vals / d_sum
71-
tl.store(out_row_ptr + col_idx, probs, mask=col_mask)
72-
73-
# Store zeros for masked positions
74-
for block_start in range(valid_len, L, BLOCK_SIZE):
75-
col_idx = block_start + tl.arange(0, BLOCK_SIZE)
76-
col_mask = col_idx < L
77-
tl.store(out_row_ptr + col_idx, 0.0, mask=col_mask)
67+
mask = col_idx < L
68+
causal = col_idx <= row_idx
69+
vals = tl.load(row_ptr + col_idx, mask=mask & causal, other=float("-inf"))
70+
probs = tl.exp(vals - max_val) / d_sum
71+
tl.store(out_row_ptr + col_idx, probs, mask=mask)
7872

7973

8074
@triton.jit

0 commit comments

Comments (0)