Use row-safe denominator for masked attention

keshavvinayak01 · keshavvinayak01 · commit 936478a25f63 · 2026-05-13T17:12:02.000+05:30
Signed-off-by: Keshav Vinayak Jha <keshavvinayakjha@gmail.com>
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/IR/AggregatedOpInterfaceImpl.cpp b/compiler/src/iree/compiler/Dialect/LinalgExt/IR/AggregatedOpInterfaceImpl.cpp
@@ -476,8 +476,14 @@ FailureOr<SmallVector<Value>> AttentionOp::decomposeOperation(OpBuilder &b) {
 
   Value fullyMaskedRows;
   if (mask != nullptr) {
-    fullyMaskedRows =
-        createFullyMaskedRowsFromScores(b, loc, sMap, maxMap, rowRedSize, s);
+    Type maskElementType = getElementTypeOrSelf(mask.value().getType());
+    if (isa<IntegerType>(maskElementType)) {
+      fullyMaskedRows = createFullyMaskedRowsFromMask(
+          b, loc, *getMaskMap(), maxMap, rowRedSize, mask.value());
+    } else {
+      fullyMaskedRows =
+          createFullyMaskedRowsFromScores(b, loc, sMap, maxMap, rowRedSize, s);
+    }
   }
 
   // max = rowMax(S)
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/IR/test/decompose_aggregate_op.mlir b/compiler/src/iree/compiler/Dialect/LinalgExt/IR/test/decompose_aggregate_op.mlir
@@ -185,9 +185,9 @@ func.func @attention_f16_masked(%query: tensor<192x1024x64xf16>,
 // CHECK: linalg.generic
 // CHECK:   arith.addf
 // CHECK:   linalg.yield
-// masked_rows = rowAll(isneginf(S))
+// masked_rows = rowAll(!mask)
 // CHECK: linalg.generic
-// CHECK:   arith.cmpf oeq
+// CHECK:   arith.xori
 // CHECK:   arith.andi
 // CHECK:   linalg.yield
 // max = rowMax(S)
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/TileAttention.cpp b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/TileAttention.cpp
@@ -30,6 +30,30 @@ struct ConvertAttentionToOnlineAttentionPass final
   void runOnOperation() override;
 };
 
+/// Builds a `linalg.generic` that clamps `sum` from below at 1.0, yielding
+/// `max(sum, 1.0)` element by element. The result is written into a fresh
+/// `tensor.empty` of shape `rowSizes` with the element type of `sum`.
+/// `rowMap` (with unused dims compressed away) indexes both the input and
+/// the output, and every loop dimension is parallel.
+///
+/// convertToOnlineAttention uses this on the masked path, where a
+/// fully-masked row can have `sum == 0`, so that the reciprocal taken during
+/// finalization divides by 1 instead of 0.
+static Value createSafeDenominator(OpBuilder &builder, Location loc,
+                                   AffineMap rowMap,
+                                   ArrayRef<OpFoldResult> rowSizes, Value sum) {
+  // Input and output share the same row map; compress it once for both so
+  // the generic op only iterates over dims the map actually uses.
+  SmallVector<AffineMap> compressedMaps =
+      compressUnusedDims(SmallVector<AffineMap>{rowMap, rowMap});
+  AffineMap inputMap = compressedMaps[0];
+  AffineMap outputMap = compressedMaps[1];
+
+  // Destination tensor for the clamped values.
+  Value output = tensor::EmptyOp::create(builder, loc, rowSizes,
+                                         getElementTypeOrSelf(sum.getType()));
+  SmallVector<utils::IteratorType> iteratorTypes(inputMap.getNumDims(),
+                                                 utils::IteratorType::parallel);
+  auto genericOp = linalg::GenericOp::create(
+      builder, loc, output.getType(), sum, output,
+      SmallVector<AffineMap>{inputMap, outputMap}, iteratorTypes,
+      [&](OpBuilder &b, Location loc, ValueRange args) {
+        // Constant 1.0 in the element type of the incoming sum.
+        Value one = arith::ConstantOp::create(
+            b, loc, b.getFloatAttr(args[0].getType(), 1.0));
+        // max(sum, 1.0): sums that are already >= 1 pass through unchanged.
+        Value denominator = arith::MaximumFOp::create(b, loc, args[0], one);
+        linalg::YieldOp::create(b, loc, denominator);
+      });
+  return genericOp.getResult(0);
+}
+
 } // namespace
 
 void convertToOnlineAttention(IREE::LinalgExt::AttentionOp attnOp,
@@ -124,39 +148,37 @@ void convertToOnlineAttention(IREE::LinalgExt::AttentionOp attnOp,
   Value x = onlineAttn.getResult(0);
   Value sum = onlineAttn.getResult(2);
   bool hasMask = static_cast<bool>(mask);
+  Value denominator = sum;
+  if (hasMask) {
+    denominator = createSafeDenominator(rewriter, loc, sumMap, rowRedSize, sum);
+    ops.push_back(denominator.getDefiningOp());
+  }
 
   // Finalize online attention. With a mask, fully-masked rows can have
-  // `sum == 0` and `x == 0`; guard that case in the finalization loop to
-  // produce 0 instead of NaN. Keep this fused with finalization because the
-  // online-attention lowering path expects a single finalization consumer.
+  // `sum == 0` and `x == 0`. Rows with at least one finite score have
+  // `sum >= 1`, so clamp the row denominator to 1 before the finalization loop.
+  // This preserves the normal rows and produces 0 for fully-masked rows without
+  // adding a per-output guard.
 
   // Compress the indexing maps.
-  SmallVector<AffineMap> compressedMaps =
-      compressUnusedDims(SmallVector<AffineMap>{sumMap, accMap, accMap});
+  SmallVector<Value> finalizeInputs = {denominator, x};
+  SmallVector<AffineMap> finalizeMaps = {sumMap, accMap, accMap};
+
+  SmallVector<AffineMap> compressedMaps = compressUnusedDims(finalizeMaps);
 
   SmallVector<utils::IteratorType> iteratorTypes(compressedMaps[0].getNumDims(),
                                                  utils::IteratorType::parallel);
 
   auto genericOp = linalg::GenericOp::create(
-      rewriter, loc, attnOp.getOutput().getType(), ValueRange{sum, x},
+      rewriter, loc, attnOp.getOutput().getType(), finalizeInputs,
       attnOp.getOutput(), compressedMaps, iteratorTypes,
       [&](OpBuilder &b, Location loc, ValueRange args) {
-        Value result;
-        if (hasMask) {
-          result = arith::DivFOp::create(b, loc, args[1], args[0]);
-          Value zero =
-              arith::ConstantOp::create(b, loc, b.getFloatAttr(f32Type, 0.0));
-          Value isZero = arith::CmpFOp::create(
-              b, loc, arith::CmpFPredicate::OEQ, args[0], zero);
-          result = arith::SelectOp::create(b, loc, isZero, zero, result);
-        } else {
-          Value one = arith::ConstantOp::create(
-              b, loc, b.getFloatAttr(args[0].getType(), 1.0));
-          Value reciprocal = arith::DivFOp::create(b, loc, one, args[0]);
-          // Both sum and x are in fp32, as created earlier, so we only need to
-          // cast after the mul.
-          result = arith::MulFOp::create(b, loc, reciprocal, args[1]);
-        }
+        Value one = arith::ConstantOp::create(
+            b, loc, b.getFloatAttr(args[0].getType(), 1.0));
+        Value reciprocal = arith::DivFOp::create(b, loc, one, args[0]);
+        // Both sum and x are in fp32, as created earlier, so we only need to
+        // cast after the mul.
+        Value result = arith::MulFOp::create(b, loc, reciprocal, args[1]);
         // Cast result to the required type by attention output.
         result = convertScalarToDtype(b, loc, result, args[2].getType(),
                                       /*isUnsignedCast=*/false);
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/convert_to_online_attention.mlir b/compiler/src/iree/compiler/Dialect/LinalgExt/Transforms/test/convert_to_online_attention.mlir
@@ -66,12 +66,17 @@ func.func @masked_attention(%q: tensor<2x10x4096x128xf16>, %k: tensor<2x10x4096x
 
 // CHECK-LABEL: func.func @masked_attention
 // CHECK-SAME: %[[MASK:.+]]: tensor<2x10x4096x4096xi1>
-// Masked: keep the fully-masked row guard fused with finalization.
+// Masked: clamp the row denominator once and keep finalization in the same
+// reciprocal-multiply shape as the unmasked path.
 // CHECK: %[[OUT:.+]]:3 = iree_linalg_ext.online_attention
+// CHECK: %[[DENOM:.+]] = linalg.generic
+// CHECK-SAME: ins(%[[OUT]]#2
+// CHECK: arith.maximumf
+// CHECK: linalg.yield
 // CHECK: linalg.generic
-// CHECK-SAME: ins(%[[OUT]]#2, %[[OUT]]#0
+// CHECK-SAME: ins(%[[DENOM]], %[[OUT]]#0
 // CHECK: arith.divf
-// CHECK: arith.cmpf oeq
-// CHECK: arith.select
+// CHECK: arith.mulf
+// CHECK-NOT: arith.select
 // CHECK: arith.truncf
 // CHECK: linalg.yield
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Utils/Utils.cpp b/compiler/src/iree/compiler/Dialect/LinalgExt/Utils/Utils.cpp
@@ -139,6 +139,22 @@ Value createFullyMaskedRowsFromScores(OpBuilder &builder, Location loc,
       });
 }
 
+/// Builds the fully-masked-row predicate directly from an integer `mask`.
+/// Delegates to createFullyMaskedRows, supplying a per-element callback that
+/// returns the logical negation of the mask value. How the per-element
+/// results are combined into a per-row value is decided by
+/// createFullyMaskedRows (not visible here; presumably an AND-reduction over
+/// the row).
+Value createFullyMaskedRowsFromMask(OpBuilder &builder, Location loc,
+                                    AffineMap maskMap, AffineMap rowMap,
+                                    ArrayRef<OpFoldResult> rowSizes,
+                                    Value mask) {
+  return createFullyMaskedRows(
+      builder, loc, maskMap, rowMap, rowSizes, mask,
+      [&](OpBuilder &b, Location loc, Value maskValue) {
+        // Masks stored in a wider integer type are narrowed to i1 first.
+        // NOTE(review): trunci keeps only the low bit, so this assumes mask
+        // values are strictly 0 or 1 -- confirm against mask producers.
+        if (maskValue.getType().getIntOrFloatBitWidth() != 1) {
+          maskValue = arith::TruncIOp::create(b, loc, b.getI1Type(), maskValue);
+        }
+        // XOR with `true` is a boolean NOT: the result is true exactly where
+        // the mask element is false.
+        Value trueValue =
+            arith::ConstantOp::create(b, loc, b.getBoolAttr(/*value=*/true));
+        return arith::XOrIOp::create(b, loc, maskValue, trueValue);
+      });
+}
+
 Value zeroFullyMaskedRows(OpBuilder &builder, Location loc, AffineMap valueMap,
                           AffineMap rowMap, Value value,
                           Value fullyMaskedRows) {
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/Utils/Utils.h b/compiler/src/iree/compiler/Dialect/LinalgExt/Utils/Utils.h
@@ -50,6 +50,13 @@ Value createFullyMaskedRowsFromScores(OpBuilder &builder, Location loc,
                                       ArrayRef<OpFoldResult> rowSizes,
                                       Value scores);
 
+/// Compute a row predicate for safe masked-softmax finalization by checking
+/// whether every integer mask element in a softmax row is false. Mask
+/// elements wider than one bit are truncated to i1 before being tested.
+Value createFullyMaskedRowsFromMask(OpBuilder &builder, Location loc,
+                                    AffineMap maskMap, AffineMap rowMap,
+                                    ArrayRef<OpFoldResult> rowSizes,
+                                    Value mask);
+
 /// Zero every element in rows whose row predicate is true.
 Value zeroFullyMaskedRows(OpBuilder &builder, Location loc, AffineMap valueMap,
                           AffineMap rowMap, Value value, Value fullyMaskedRows);