[SDY][PadForDivisibility] Add DotGeneralOpPattern to enforce kZero padding

Varcho · copybara-github · commit 02dad3d4b2a7 · 2026-05-21T10:16:10.000-07:00
- Implement DotGeneralOpPattern to enforce kZero padding on operands of stablehlo.dot_general operations.
- Exclude DotGeneralOp from GenericOpPattern to avoid ambiguity.
- Add MLIR test cases verifying correct padding propagation for dot_general.

Also adds a general utility:
- Create ensurePadding helper to insert select operations when cached padding differs from required padding.

PiperOrigin-RevId: 919119485
diff --git a/shardy/dialect/sdy/transforms/export/pad_for_divisibility.cc b/shardy/dialect/sdy/transforms/export/pad_for_divisibility.cc
@@ -111,6 +111,12 @@ class PaddedTypeConverter : public TypeConverter {
 // Known padding value kinds for generated padding values.
 enum class PaddingValueKind { kZero, kOne };
 
+// Returns true for ops that have a dedicated conversion pattern in this file
+// (currently SliceOp and DotGeneralOp); GenericOpPattern rejects these.
+bool hasCustomPadHandling(Operation* op) {
+  return isa<stablehlo::SliceOp, stablehlo::DotGeneralOp>(op);
+}
+
 class PaddingCache {
  public:
   // Registers the padding kind for a value. The conversion pattern
@@ -186,6 +192,75 @@ Value createPaddedValue(RankedTensorType paddedType, Value value,
   return padOp;
 }
 
+// Returns 'inputVal' unchanged when it is not padded (its type equals
+// 'origType') or when 'cache' already records 'requiredKind' for it.
+// Otherwise builds an iota/compare mask of the unpadded region and uses a
+// select to overwrite the padding with 'requiredKind', returning the select.
+//
+// By default every padded dimension is rewritten and the result is registered
+// in 'cache'. If 'dimsToEnforce' is given, only those dimensions are rewritten
+// and the result is not cached (other dimensions may keep their old padding).
+Value ensurePadding(
+    Value inputVal, RankedTensorType origType, PaddingValueKind requiredKind,
+    OpBuilder& b, Location loc, PaddingCache& cache,
+    std::optional<ArrayRef<int64_t>> dimsToEnforce = std::nullopt) {
+  // Return early if no padding is applied or the cached padding already
+  // matches 'requiredKind'.
+  auto paddedType = cast<RankedTensorType>(inputVal.getType());
+  if (origType == paddedType) {
+    return inputVal;
+  }
+  std::optional<PaddingValueKind> currentKind = cache.getPadding(inputVal);
+  if (currentKind && *currentKind == requiredKind) {
+    return inputVal;
+  }
+
+  // Build a mask that is `true` inside the original (unpadded) region: AND of
+  // (index < original size) over each enforced padded dimension. NOTE(review):
+  // indices and limits are i32, so this assumes dim sizes fit in 32 bits.
+  Value validDataMask;
+  for (auto [dim, origSize] : llvm::enumerate(origType.getShape())) {
+    if (origSize == paddedType.getDimSize(dim) ||
+        (dimsToEnforce && !llvm::is_contained(*dimsToEnforce, dim))) {
+      continue;
+    }
+    auto iotaType =
+        RankedTensorType::get(paddedType.getShape(), b.getI32Type());
+    Value iota = stablehlo::IotaOp::create(b, loc, iotaType, dim);
+    Value limit = stablehlo::ConstantOp::create(
+        b, loc,
+        DenseElementsAttr::get(RankedTensorType::get({}, b.getI32Type()),
+                               b.getI32IntegerAttr(origSize)));
+    Value broadcastLimit = stablehlo::BroadcastInDimOp::create(
+        b, loc, iotaType, limit, b.getDenseI64ArrayAttr({}));
+    Value mask = stablehlo::CompareOp::create(
+        b, loc, iota, broadcastLimit, stablehlo::ComparisonDirection::LT);
+    validDataMask = validDataMask
+                        ? stablehlo::AndOp::create(b, loc, validDataMask, mask)
+                        : mask;
+  }
+  // No enforced dimension is actually padded, so there is nothing to rewrite.
+  if (!validDataMask) {
+    return inputVal;
+  }
+
+  // Create a scalar constant holding the new padding value and broadcast it
+  // to the same (padded) shape as 'inputVal'.
+  Value newPaddingScalar =
+      createConstant(b, loc, paddedType.getElementType(), requiredKind);
+  Value newPaddingValue = stablehlo::BroadcastInDimOp::create(
+      b, loc, paddedType, newPaddingScalar, b.getDenseI64ArrayAttr({}));
+
+  // Keep the original data from 'inputVal' (where mask is true), and replace
+  // the masked-out padded region with 'newPaddingValue' (where mask is false).
+  Value select = stablehlo::SelectOp::create(b, loc, validDataMask, inputVal,
+                                             newPaddingValue);
+  if (!dimsToEnforce) {  // Only a full rewrite guarantees 'requiredKind'.
+    cache.setPadding(select, requiredKind);
+  }
+  return select;
+}
+
 // Converts op to its local version by replacing its operands with the already
 // converted operands.
 LogicalResult padGenericOp(Operation* op, ValueRange operands,
@@ -263,7 +338,7 @@ class GenericOpPattern : public ConversionPattern {
     Dialect* dialect = op->getDialect();
     if ((dialect && dialect->getNamespace() != "stablehlo" &&
          !isa<sdy::ReturnOp>(op)) ||
-        isa<stablehlo::SliceOp>(op)) {
+        hasCustomPadHandling(op)) {
       return failure();
     }
     return padGenericOp(op, operands, rewriter,
@@ -427,15 +502,17 @@ class AllSliceOpPattern : public OpConversionPattern<sdy::AllSliceOp> {
       return padGenericOp(op, adaptor.getOperands(), rewriter, converter);
     }
 
-    Value padOp = createPaddedValue(cast<RankedTensorType>(paddedInputType),
-                                    input, PaddingValueKind::kZero, symbolTable,
-                                    rewriter, cache);
+    PaddingValueKind paddingKind = PaddingValueKind::kZero;
+    Value padOp =
+        createPaddedValue(cast<RankedTensorType>(paddedInputType), input,
+                          paddingKind, symbolTable, rewriter, cache);
     OperationState state(op->getLoc(), op->getName());
     state.addOperands({padOp});
     state.addTypes(
         {getPaddedType(op.getResult().getType(), outSharding, symbolTable)});
     state.addAttributes(op->getAttrs());
     Operation* newOp = rewriter.create(state);
+    cache.setPadding(newOp->getResult(0), paddingKind);
 
     rewriter.replaceOp(op, newOp->getResults());
     return success();
@@ -489,8 +566,51 @@ class StablehloSliceOpPattern : public OpConversionPattern<stablehlo::SliceOp> {
   }
 };
 
+class StablehloDotGeneralOpPattern
+    : public OpConversionPattern<stablehlo::DotGeneralOp> {
+ public:
+  StablehloDotGeneralOpPattern(TypeConverter& converter, MLIRContext* ctx,
+                               PaddingCache& cache)
+      : OpConversionPattern(converter, ctx), cache(cache) {}
+  // Forces zero padding on contracting dims, then converts via padGenericOp.
+  LogicalResult matchAndRewrite(
+      stablehlo::DotGeneralOp op, OpAdaptor adaptor,
+      ConversionPatternRewriter& rewriter) const override {
+    auto* converter =
+        static_cast<const PaddedTypeConverter*>(getTypeConverter());
+
+    Location loc = op.getLoc();
+    stablehlo::DotDimensionNumbersAttr dimNums = op.getDotDimensionNumbers();
+    // Zero the padding along LHS contracting dims so it cannot affect the sum.
+    Value lhs = adaptor.getOperands()[0];
+    auto lhsOrigType = dyn_cast<RankedTensorType>(op->getOperand(0).getType());
+    if (!lhsOrigType) {
+      return failure();
+    }
+    Value paddedLhs =
+        ensurePadding(lhs, lhsOrigType, PaddingValueKind::kZero, rewriter, loc,
+                      cache, dimNums.getLhsContractingDimensions());
+    // Likewise for the RHS contracting dims.
+    Value rhs = adaptor.getOperands()[1];
+    auto rhsOrigType = dyn_cast<RankedTensorType>(op->getOperand(1).getType());
+    if (!rhsOrigType) {
+      return failure();
+    }
+    Value paddedRhs =
+        ensurePadding(rhs, rhsOrigType, PaddingValueKind::kZero, rewriter, loc,
+                      cache, dimNums.getRhsContractingDimensions());
+
+    return padGenericOp(op, {paddedLhs, paddedRhs}, rewriter, converter);
+  }
+
+ private:
+  PaddingCache& cache;  // Not owned; supplied by the pass at registration.
+};
+
 struct PadForDivisibilityPass
     : public impl::PadForDivisibilityPassBase<PadForDivisibilityPass> {
+  using PadForDivisibilityPassBase::PadForDivisibilityPassBase;
+
  protected:
   void runOnOperation() final {
     // FuncOpPattern enforces that function inputs and outputs are always fully
@@ -508,7 +628,8 @@ struct PadForDivisibilityPass
                  AllGatherOpPattern>(typeConverter, &getContext());
     // Sharing the padding cache reference across pattern instances is safe from
     // data races because pattern application within a function is sequential.
-    patterns.add<AllSliceOpPattern>(typeConverter, &getContext(), paddingCache);
+    patterns.add<AllSliceOpPattern, StablehloDotGeneralOpPattern>(
+        typeConverter, &getContext(), paddingCache);
     ConversionTarget target(getContext());
 
     auto isLegalType = [&](Type type, TensorShardingAttr sharding) {
diff --git a/shardy/dialect/sdy/transforms/export/test/pad_for_divisibility/dot_general.mlir b/shardy/dialect/sdy/transforms/export/test/pad_for_divisibility/dot_general.mlir
@@ -0,0 +1,100 @@
+// RUN: sdy_opt %s -sdy-pad-for-divisibility | FileCheck %s
+
+sdy.mesh @mesh_4_2 = <["x"=4, "y"=2]>
+
+// CHECK-LABEL: func @padded_contracting_dims_reuse
+func.func @padded_contracting_dims_reuse(%arg0: tensor<4x7xf32>, %arg1: tensor<7x5xf32>) -> tensor<4x5xf32> {
+  // LHS all_slice: the contracting dimension (7 -> 8) is padded with zero.
+  // CHECK: %[[CST0:.*]] = stablehlo.constant dense<0.000000e+00> : tensor<f32>
+  // CHECK: %[[PAD0:.*]] = stablehlo.pad %arg0, %[[CST0]], low = [0, 0], high = [0, 1], interior = [0, 0] : (tensor<4x7xf32>, tensor<f32>) -> tensor<4x8xf32>
+  // CHECK: %[[SLICE0:.*]] = sdy.all_slice [{}, {"y"}] %[[PAD0]] out_sharding=<@mesh_4_2, [{}, {"y"}]> : tensor<4x8xf32>
+
+  // RHS all_slice: zero-padded on both the contracting (7 -> 8) and non-contracting (5 -> 8) dimensions.
+  // CHECK: %[[CST1:.*]] = stablehlo.constant dense<0.000000e+00> : tensor<f32>
+  // CHECK: %[[PAD1:.*]] = stablehlo.pad %arg1, %[[CST1]], low = [0, 0], high = [1, 3], interior = [0, 0] : (tensor<7x5xf32>, tensor<f32>) -> tensor<8x8xf32>
+  // CHECK: %[[SLICE1:.*]] = sdy.all_slice [{"y"}, {"x"}] %[[PAD1]] out_sharding=<@mesh_4_2, [{"y"}, {"x"}]> : tensor<8x8xf32>
+
+  // dot_general consumes the all_slice results directly (their zero padding is reused); its result is padded on the non-contracting dimension.
+  // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[SLICE0]], %[[SLICE1]], contracting_dims = [1] x [0] {sdy.sharding = #sdy.sharding_per_value<[<@mesh_4_2, [{}, {"x"}]>]>} : (tensor<4x8xf32>, tensor<8x8xf32>) -> tensor<4x8xf32>
+
+  // Trim the padded result back to the original shape.
+  // CHECK: %[[TRIM:.*]] = stablehlo.slice %[[DOT]] [0:4, 0:5] : (tensor<4x8xf32>) -> tensor<4x5xf32>
+  // CHECK: return %[[TRIM]] : tensor<4x5xf32>
+
+  %0 = sdy.all_slice [{}, {"y"}] %arg0 out_sharding=<@mesh_4_2, [{}, {"y"}]> : tensor<4x7xf32>
+  %1 = sdy.all_slice [{"y"}, {"x"}] %arg1 out_sharding=<@mesh_4_2, [{"y"}, {"x"}]> : tensor<7x5xf32>
+  %2 = stablehlo.dot_general %0, %1, contracting_dims = [1] x [0] {sdy.sharding = #sdy.sharding_per_value<[<@mesh_4_2, [{}, {"x"}]>]>} : (tensor<4x7xf32>, tensor<7x5xf32>) -> tensor<4x5xf32>
+  %3 = stablehlo.slice %2 [0:4, 0:5] : (tensor<4x5xf32>) -> tensor<4x5xf32>
+  return %3 : tensor<4x5xf32>
+}
+
+// CHECK-LABEL: func @padded_contracting_dims_not_reuse
+func.func @padded_contracting_dims_not_reuse(%arg0: tensor<4x7xf32>, %arg1: tensor<7x5xf32>) -> tensor<4x5xf32> {
+  // Prepare padded LHS and RHS with unknown padding (via abs).
+  // CHECK: %[[PAD0:.*]] = stablehlo.pad %arg0, {{.*}}
+  // CHECK: %[[LHS_SLICE:.*]] = sdy.all_slice [{}, {"y"}] %[[PAD0]]
+  // CHECK: %[[LHS_ABS:.*]] = stablehlo.abs %[[LHS_SLICE]] {{.*}}
+  // CHECK: %[[PAD1:.*]] = stablehlo.pad %arg1, {{.*}}
+  // CHECK: %[[RHS_SLICE:.*]] = sdy.all_slice [{"y"}, {"x"}] %[[PAD1]]
+  // CHECK: %[[RHS_ABS:.*]] = stablehlo.abs %[[RHS_SLICE]] {{.*}}
+
+  // Enforce zero-padding on LHS contracting dim (dim 1).
+  // CHECK: %[[LHS_IOTA:.*]] = stablehlo.iota{{.*}}dim = 1
+  // CHECK: %[[LHS_LIMIT:.*]] = stablehlo.constant dense<7> : tensor<i32>
+  // CHECK: %[[LHS_LIMIT_BCAST:.*]] = stablehlo.broadcast_in_dim %[[LHS_LIMIT]], dims = []
+  // CHECK: %[[LHS_MASK:.*]] = stablehlo.compare{{.*}}LT, %[[LHS_IOTA]], %[[LHS_LIMIT_BCAST]]
+  // CHECK: %[[LHS_CST:.*]] = stablehlo.constant dense<0.000000e+00> : tensor<f32>
+  // CHECK: %[[LHS_BCAST:.*]] = stablehlo.broadcast_in_dim %[[LHS_CST]], dims = []
+  // CHECK: %[[LHS_SELECT:.*]] = stablehlo.select %[[LHS_MASK]], %[[LHS_ABS]], %[[LHS_BCAST]]
+
+  // Enforce zero-padding on RHS contracting dim (dim 0) only; the padded non-contracting dim 1 must not be masked.
+  // CHECK: %[[RHS_IOTA:.*]] = stablehlo.iota{{.*}}dim = 0
+  // CHECK: %[[RHS_LIMIT:.*]] = stablehlo.constant dense<7> : tensor<i32>
+  // CHECK: %[[RHS_LIMIT_BCAST:.*]] = stablehlo.broadcast_in_dim %[[RHS_LIMIT]], dims = []
+  // CHECK: %[[RHS_MASK:.*]] = stablehlo.compare{{.*}}LT, %[[RHS_IOTA]], %[[RHS_LIMIT_BCAST]]
+  // CHECK-NOT: stablehlo.iota{{.*}}dim = 1
+  // CHECK: %[[RHS_CST:.*]] = stablehlo.constant dense<0.000000e+00> : tensor<f32>
+  // CHECK: %[[RHS_BCAST:.*]] = stablehlo.broadcast_in_dim %[[RHS_CST]], dims = []
+  // CHECK: %[[RHS_SELECT:.*]] = stablehlo.select %[[RHS_MASK]], %[[RHS_ABS]], %[[RHS_BCAST]]
+
+  // Perform dot_general and trim result.
+  // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[LHS_SELECT]], %[[RHS_SELECT]], contracting_dims = [1] x [0] {sdy.sharding = #sdy.sharding_per_value<[<@mesh_4_2, [{}, {"x"}]>]>} : (tensor<4x8xf32>, tensor<8x8xf32>) -> tensor<4x8xf32>
+  // CHECK: %[[TRIM:.*]] = stablehlo.slice %[[DOT]] [0:4, 0:5]
+  // CHECK: return %[[TRIM]]
+
+  %0 = sdy.all_slice [{}, {"y"}] %arg0 out_sharding=<@mesh_4_2, [{}, {"y"}]> : tensor<4x7xf32>
+  %1 = stablehlo.abs %0 {sdy.sharding = #sdy.sharding_per_value<[<@mesh_4_2, [{}, {"y"}]>]>} : tensor<4x7xf32>
+  %2 = sdy.all_slice [{"y"}, {"x"}] %arg1 out_sharding=<@mesh_4_2, [{"y"}, {"x"}]> : tensor<7x5xf32>
+  %3 = stablehlo.abs %2 {sdy.sharding = #sdy.sharding_per_value<[<@mesh_4_2, [{"y"}, {"x"}]>]>} : tensor<7x5xf32>
+  %4 = stablehlo.dot_general %1, %3, contracting_dims = [1] x [0] {sdy.sharding = #sdy.sharding_per_value<[<@mesh_4_2, [{}, {"x"}]>]>} : (tensor<4x7xf32>, tensor<7x5xf32>) -> tensor<4x5xf32>
+  %5 = stablehlo.slice %4 [0:4, 0:5] : (tensor<4x5xf32>) -> tensor<4x5xf32>
+  return %5 : tensor<4x5xf32>
+}
+
+// CHECK-LABEL: func @padded_non_contracting_dims_any
+func.func @padded_non_contracting_dims_any(%arg0: tensor<3x8xf32>, %arg1: tensor<8x5xf32>) -> tensor<3x5xf32> {
+  // Prepare padded LHS with unknown padding.
+  // CHECK: %[[PAD0:.*]] = stablehlo.pad %arg0, {{.*}}
+  // CHECK: %[[LHS_SLICE:.*]] = sdy.all_slice [{"y"}, {}] %[[PAD0]]
+  // CHECK: %[[LHS_ABS:.*]] = stablehlo.abs %[[LHS_SLICE]] {{.*}}
+
+  // Prepare padded RHS.
+  // CHECK: %[[PAD1:.*]] = stablehlo.pad %arg1, {{.*}}
+  // CHECK: %[[RHS_SLICE:.*]] = sdy.all_slice [{}, {"x"}] %[[PAD1]]
+
+  // Verify no compare/select is inserted before the dot_general: only non-contracting dims are padded.
+  // CHECK-NOT: stablehlo.select
+  // CHECK-NOT: stablehlo.compare
+
+  // Perform dot_general and trim result.
+  // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[LHS_ABS]], %[[RHS_SLICE]], contracting_dims = [1] x [0] {{.*}}
+  // CHECK: %[[TRIM:.*]] = stablehlo.slice %[[DOT]] [0:3, 0:5]
+  // CHECK: return %[[TRIM]]
+
+  %0 = sdy.all_slice [{"y"}, {}] %arg0 out_sharding=<@mesh_4_2, [{"y"}, {}]> : tensor<3x8xf32>
+  %1 = stablehlo.abs %0 {sdy.sharding = #sdy.sharding_per_value<[<@mesh_4_2, [{"y"}, {}]>]>} : tensor<3x8xf32>
+  %2 = sdy.all_slice [{}, {"x"}] %arg1 out_sharding=<@mesh_4_2, [{}, {"x"}]> : tensor<8x5xf32>
+  %3 = stablehlo.dot_general %1, %2, contracting_dims = [1] x [0] {sdy.sharding = #sdy.sharding_per_value<[<@mesh_4_2, [{"y"}, {"x"}]>]>} : (tensor<3x8xf32>, tensor<8x5xf32>) -> tensor<3x5xf32>
+  %4 = stablehlo.slice %3 [0:3, 0:5] : (tensor<3x5xf32>) -> tensor<3x5xf32>
+  return %4 : tensor<3x5xf32>
+}