[Codegen] Vectorize linalg_ext.scan to vector.scan (#24187)

sommerlukas · web-flow · commit d7d4d0d14835 · 2026-04-23T09:02:56.000+02:00
During `GenericVectorization`, vectorize `linalg_ext.scan` to `vector.scan`. If masking is required, it is introduced directly as masked `transfer_read/write` and an `arith.select` selecting between the actual data for unmasked elements and the identity for the combining operation in the scan (e.g., zero for add) for masked elements. `linalg_ext.scan` expresses the combiner as a region, whereas `vector.scan` uses a fixed set of combiners as an enum attribute. Therefore, we try to match the content of the region of the `linalg_ext.scan` operation against the set of supported combiners. This is part of #24186. Assisted-by: Claude Code and Codex --------- Signed-off-by: Lukas Sommer <lukas.sommer@amd.com>
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization_masked_configured.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization_masked_configured.mlir
@@ -173,3 +173,32 @@ func.func @configured_zero_vector_size_falls_back_to_inference(
 }
 // CHECK-LABEL: func.func @configured_zero_vector_size_falls_back_to_inference(
 // CHECK:         arith.addf {{.*}} : vector<4x1xf32>
+
+// -----
+
+#scan_masked_config = #iree_cpu.lowering_config<vector_common_parallel = [8, 16]>
+
+func.func @vectorize_scan_masked_configured( // Fully dynamic 2-D f32 scan; the op carries #scan_masked_config.
+    %input: tensor<?x?xf32>,
+    %output: tensor<?x?xf32>,
+    %accum: tensor<?xf32>) -> (tensor<?x?xf32>, tensor<?xf32>) {
+  %0:2 = iree_linalg_ext.scan {lowering_config = #scan_masked_config}
+      dimension(1) inclusive(true) // Inclusive scan along dimension 1.
+      ins(%input : tensor<?x?xf32>)
+      outs(%output, %accum : tensor<?x?xf32>, tensor<?xf32>) {
+    ^bb0(%arg0: f32, %arg1: f32):
+      %sum = arith.addf %arg0, %arg1 : f32 // Combiner region: a single floating-point add.
+      iree_linalg_ext.yield %sum : f32
+  } -> tensor<?x?xf32>, tensor<?xf32>
+  return %0#0, %0#1 : tensor<?x?xf32>, tensor<?xf32>
+}
+// CHECK-LABEL: func.func @vectorize_scan_masked_configured(
+// CHECK:         vector.create_mask {{.*}} : vector<8x16xi1>
+// CHECK:         vector.transfer_read {{.*}} : tensor<?x?xf32>, vector<8x16xf32>
+// CHECK:         arith.select {{.*}} : vector<8x16xi1>, vector<8x16xf32>
+// CHECK:         vector.create_mask {{.*}} : vector<8xi1>
+// CHECK:         vector.transfer_read {{.*}} : tensor<?xf32>, vector<8xf32>
+// CHECK:         arith.select {{.*}} : vector<8xi1>, vector<8xf32>
+// CHECK:         vector.scan <add>, {{.*}} {inclusive = true, reduction_dim = 1 : i64}
+// CHECK:         vector.transfer_write {{.*}} : vector<8x16xf32>, tensor<?x?xf32>
+// CHECK:         vector.transfer_write {{.*}} : vector<8xf32>, tensor<?xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization_unmasked.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization_unmasked.mlir
@@ -56,3 +56,86 @@ func.func @single_static_pack_infer_vector_size(%arg0: tensor<101x201xi8>, %arg1
 // TODO: Support non-masking path.
 // CHECK-LABEL: func.func @single_static_pack_infer_vector_size
 // CHECK:         linalg.pack
+
+// -----
+
+// CHECK-LABEL: func.func @vectorize_scan_add_inclusive
+func.func @vectorize_scan_add_inclusive( // Static 1-D f32 scan with a rank-0 accumulator.
+    %input: tensor<8xf32>,
+    %output: tensor<8xf32>,
+    %accum: tensor<f32>) -> (tensor<8xf32>, tensor<f32>) {
+  %0:2 = iree_linalg_ext.scan
+      dimension(0) inclusive(true) // Inclusive scan along the only dimension.
+      ins(%input : tensor<8xf32>)
+      outs(%output, %accum : tensor<8xf32>, tensor<f32>) {
+    ^bb0(%arg0: f32, %arg1: f32):
+      %sum = arith.addf %arg0, %arg1 : f32 // Combiner region: a single floating-point add.
+      iree_linalg_ext.yield %sum : f32
+  } -> tensor<8xf32>, tensor<f32>
+  return %0#0, %0#1 : tensor<8xf32>, tensor<f32>
+}
+// CHECK: %[[READ:.+]] = vector.transfer_read
+// CHECK: %[[INIT:.+]] = vector.transfer_read
+// CHECK: %[[DEST:.+]], %{{.+}} = vector.scan <add>, %[[READ]], %[[INIT]]
+// CHECK-SAME: inclusive = true
+// CHECK: vector.transfer_write %[[DEST]]
+// CHECK: vector.transfer_write
+
+// -----
+
+// CHECK-LABEL: func.func @vectorize_scan_mul_exclusive
+func.func @vectorize_scan_mul_exclusive( // Static 1-D i32 scan with a rank-0 accumulator.
+    %input: tensor<16xi32>,
+    %output: tensor<16xi32>,
+    %accum: tensor<i32>) -> (tensor<16xi32>, tensor<i32>) {
+  %0:2 = iree_linalg_ext.scan
+      dimension(0) inclusive(false) // Exclusive variant of the scan.
+      ins(%input : tensor<16xi32>)
+      outs(%output, %accum : tensor<16xi32>, tensor<i32>) {
+    ^bb0(%arg0: i32, %arg1: i32):
+      %prod = arith.muli %arg0, %arg1 : i32 // Combiner region: a single integer multiply.
+      iree_linalg_ext.yield %prod : i32
+  } -> tensor<16xi32>, tensor<i32>
+  return %0#0, %0#1 : tensor<16xi32>, tensor<i32>
+}
+// CHECK: vector.scan <mul>
+// CHECK-SAME: inclusive = false
+
+// -----
+
+// CHECK-LABEL: func.func @vectorize_scan_2d
+func.func @vectorize_scan_2d( // Static 4x8 f32 scan with a 4-element accumulator.
+    %input: tensor<4x8xf32>,
+    %output: tensor<4x8xf32>,
+    %accum: tensor<4xf32>) -> (tensor<4x8xf32>, tensor<4xf32>) {
+  %0:2 = iree_linalg_ext.scan
+      dimension(1) inclusive(true) // Scans along dimension 1; the accumulator keeps dimension 0.
+      ins(%input : tensor<4x8xf32>)
+      outs(%output, %accum : tensor<4x8xf32>, tensor<4xf32>) {
+    ^bb0(%arg0: f32, %arg1: f32):
+      %sum = arith.addf %arg0, %arg1 : f32 // Combiner region: a single floating-point add.
+      iree_linalg_ext.yield %sum : f32
+  } -> tensor<4x8xf32>, tensor<4xf32>
+  return %0#0, %0#1 : tensor<4x8xf32>, tensor<4xf32>
+}
+// CHECK: vector.scan <add>
+// CHECK-SAME: reduction_dim = 1
+
+// -----
+
+// CHECK-LABEL: func.func @vectorize_scan_maxsi
+func.func @vectorize_scan_maxsi( // Static 1-D i32 scan with a rank-0 accumulator.
+    %input: tensor<8xi32>,
+    %output: tensor<8xi32>,
+    %accum: tensor<i32>) -> (tensor<8xi32>, tensor<i32>) {
+  %0:2 = iree_linalg_ext.scan
+      dimension(0) inclusive(true) // Inclusive scan along the only dimension.
+      ins(%input : tensor<8xi32>)
+      outs(%output, %accum : tensor<8xi32>, tensor<i32>) {
+    ^bb0(%arg0: i32, %arg1: i32):
+      %max = arith.maxsi %arg0, %arg1 : i32 // Combiner region: a single signed integer max.
+      iree_linalg_ext.yield %max : i32
+  } -> tensor<8xi32>, tensor<i32>
+  return %0#0, %0#1 : tensor<8xi32>, tensor<i32>
+}
+// CHECK: vector.scan <maxsi>
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel
@@ -236,6 +236,7 @@ iree_compiler_cc_library(
         ":VectorizableOpInterfaceGen",
         "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
         "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR:IREEVectorExtDialect",
+        "//compiler/src/iree/compiler/Codegen/Utils",
         "//compiler/src/iree/compiler/Dialect/LinalgExt/IR",
         "//compiler/src/iree/compiler/Dialect/LinalgExt/Utils",
         "//compiler/src/iree/compiler/Utils",
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
@@ -188,6 +188,7 @@ iree_cc_library(
     MLIRVectorUtils
     iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
     iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect
+    iree::compiler::Codegen::Utils
     iree::compiler::Dialect::LinalgExt::IR
     iree::compiler::Dialect::LinalgExt::Utils
     iree::compiler::Utils
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp
@@ -10,6 +10,7 @@
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h"
+#include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/Im2colUtils.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
@@ -50,6 +51,63 @@ static bool getBoolOption(DictionaryAttr options, StringRef name,
   return defaultValue;
 }
 
+/// Maps the combiner region of a scan onto a `vector::CombiningKind`.
+///
+/// The region matches only when it is a single block with two arguments that
+/// holds exactly two operations: a two-operand, single-result op applied to
+/// (arg0, arg1) in that order, followed by an op whose only operand is that
+/// result. Any other shape, and any binary op not listed in the switch below,
+/// yields std::nullopt.
+static std::optional<vector::CombiningKind> matchScanCombiner(Region &region) {
+  using Kind = vector::CombiningKind;
+
+  if (!region.hasOneBlock()) {
+    return std::nullopt;
+  }
+
+  // The body must take exactly (lhs, rhs) and contain combiner + terminator.
+  Block &body = region.front();
+  if (body.getNumArguments() != 2 || body.getOperations().size() != 2) {
+    return std::nullopt;
+  }
+
+  Operation &combiner = body.front();
+  Operation &terminator = body.back();
+
+  // The combiner has to be a binary op producing a single value.
+  const bool isBinarySingleResult =
+      combiner.getNumOperands() == 2 && combiner.getNumResults() == 1;
+  if (!isBinarySingleResult) {
+    return std::nullopt;
+  }
+
+  // The last op must forward exactly the combiner's result.
+  const bool forwardsCombinerResult =
+      terminator.getNumOperands() == 1 &&
+      terminator.getOperand(0) == combiner.getResult(0);
+  if (!forwardsCombinerResult) {
+    return std::nullopt;
+  }
+
+  // Operands must be the block arguments, in declaration order.
+  const bool usesArgsInOrder =
+      combiner.getOperand(0) == body.getArgument(0) &&
+      combiner.getOperand(1) == body.getArgument(1);
+  if (!usesArgsInOrder) {
+    return std::nullopt;
+  }
+
+  return llvm::TypeSwitch<Operation *, std::optional<Kind>>(&combiner)
+      .Case<arith::AddIOp, arith::AddFOp>([](auto) { return Kind::ADD; })
+      .Case<arith::MulIOp, arith::MulFOp>([](auto) { return Kind::MUL; })
+      .Case<arith::AndIOp>([](auto) { return Kind::AND; })
+      .Case<arith::OrIOp>([](auto) { return Kind::OR; })
+      .Case<arith::XOrIOp>([](auto) { return Kind::XOR; })
+      .Case<arith::MaxSIOp>([](auto) { return Kind::MAXSI; })
+      .Case<arith::MaxUIOp>([](auto) { return Kind::MAXUI; })
+      .Case<arith::MinSIOp>([](auto) { return Kind::MINSI; })
+      .Case<arith::MinUIOp>([](auto) { return Kind::MINUI; })
+      .Case<arith::MaximumFOp>([](auto) { return Kind::MAXIMUMF; })
+      .Case<arith::MinimumFOp>([](auto) { return Kind::MINIMUMF; })
+      .Case<arith::MaxNumFOp>([](auto) { return Kind::MAXNUMF; })
+      .Case<arith::MinNumFOp>([](auto) { return Kind::MINNUMF; })
+      .Default([](Operation *) { return std::nullopt; });
+}
+
 struct GatherOpVectorizationModel
     : VectorizableOpInterface::ExternalModel<GatherOpVectorizationModel,
                                              IREE::LinalgExt::GatherOp> {
@@ -1342,6 +1400,138 @@ struct Im2colOpVectorizationModel
     return SmallVector<Value>{result};
   }
 };
+
+/// External vectorization model for `IREE::LinalgExt::ScanOp`: rewrites the op
+/// into vector reads of the input and accumulator, a `vector.scan`, and vector
+/// writes of both results.
+struct ScanOpVectorizationModel
+    : VectorizableOpInterface::ExternalModel<ScanOpVectorizationModel,
+                                             IREE::LinalgExt::ScanOp> {
+
+  /// Returns true when `op` can be rewritten by `vectorize`.
+  ///
+  /// Requirements: the combiner region maps to a `vector::CombiningKind`, no
+  /// dimension is scalable, and either `vectorSizes` is empty and the input
+  /// shape is static, or `vectorSizes` has exactly one entry per input
+  /// dimension. `options` is not consulted.
+  bool isVectorizable(Operation *op, ArrayRef<int64_t> vectorSizes,
+                      ArrayRef<bool> scalableDims,
+                      DictionaryAttr options) const {
+    auto scanOp = cast<IREE::LinalgExt::ScanOp>(op);
+
+    // Must be able to match region to CombiningKind.
+    if (!matchScanCombiner(scanOp.getRegion())) {
+      return false;
+    }
+
+    // Scalable vectors not yet supported.
+    if (llvm::any_of(scalableDims, [](bool b) { return b; })) {
+      return false;
+    }
+
+    auto inputTy = cast<ShapedType>(scanOp.getInput().getType());
+
+    // Without vector sizes, require static shapes.
+    if (vectorSizes.empty()) {
+      return inputTy.hasStaticShape();
+    }
+
+    // Explicit sizes are used verbatim as the input vector shape, so they must
+    // cover every input dimension; otherwise `vectorize` would drop the scan
+    // dimension from a shape of the wrong rank.
+    return static_cast<int64_t>(vectorSizes.size()) == inputTy.getRank();
+  }
+
+  /// Replaces the scan with vector operations and returns the two written
+  /// tensors as {output, accumulator}.
+  ///
+  /// The vector shape is `vectorSizes` when given, otherwise the static input
+  /// shape; the accumulator shape is that shape without the scan dimension.
+  /// When a tensor is dynamic or differs from its vector shape, lanes outside
+  /// the tensor bounds are replaced (via `arith.select` on a `create_mask`)
+  /// with the identity value of the combiner before scanning. Returns
+  /// failure() if the combiner is unsupported or `vectorSizes` does not have
+  /// one entry per input dimension; no IR is created in that case.
+  FailureOr<SmallVector<Value>> vectorize(Operation *op, RewriterBase &rewriter,
+                                          ArrayRef<int64_t> vectorSizes,
+                                          ArrayRef<bool> scalableDims,
+                                          DictionaryAttr options) const {
+    auto scanOp = cast<IREE::LinalgExt::ScanOp>(op);
+    Location loc = scanOp.getLoc();
+    RewriterBase::InsertionGuard g(rewriter);
+    rewriter.setInsertionPoint(scanOp);
+
+    // Match combiner to CombiningKind.
+    auto kind = matchScanCombiner(scanOp.getRegion());
+    if (!kind) {
+      return failure();
+    }
+
+    // Determine vector shapes.
+    auto inputTy = cast<ShapedType>(scanOp.getInput().getType());
+    auto accumTy = cast<ShapedType>(scanOp.getAccumulator().getType());
+    Type elemType = inputTy.getElementType();
+    int64_t inputRank = inputTy.getRank();
+    int64_t scanDim = scanOp.getDimension();
+
+    // Reject rank-mismatched sizes before touching the IR: the erase below
+    // indexes the shape by `scanDim` and the reads expect matching ranks.
+    if (!vectorSizes.empty() &&
+        static_cast<int64_t>(vectorSizes.size()) != inputRank) {
+      return failure();
+    }
+
+    SmallVector<int64_t> inputVecShape =
+        vectorSizes.empty() ? llvm::to_vector(inputTy.getShape())
+                            : llvm::to_vector(vectorSizes);
+
+    // Accumulator shape = input shape with scan dimension dropped.
+    SmallVector<int64_t> accumVecShape = inputVecShape;
+    accumVecShape.erase(accumVecShape.begin() + scanDim);
+
+    auto inputVecTy = VectorType::get(inputVecShape, elemType);
+    auto accumVecTy = VectorType::get(accumVecShape, elemType);
+
+    // Determine if masking is needed (dynamic shapes or vector > tensor).
+    bool needsInputMasking = !inputTy.hasStaticShape() ||
+                             !llvm::equal(inputTy.getShape(), inputVecShape);
+    bool needsAccumMasking = !accumTy.hasStaticShape() ||
+                             !llvm::equal(accumTy.getShape(), accumVecShape);
+
+    // All reads and writes start at the origin of their tensor.
+    Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
+    SmallVector<Value> inputIndices(inputRank, zero);
+    SmallVector<Value> accumIndices(accumTy.getRank(), zero);
+
+    // Read input tensor to vector. Padding lanes are poison; when masking is
+    // needed they are overwritten by the select below.
+    Value padding = ub::PoisonOp::create(rewriter, loc, elemType);
+    Value inputVec = vector::createReadOrMaskedRead(
+        rewriter, loc, scanOp.getInput(), inputVecShape, padding,
+        /*useInBoundsInsteadOfMasking=*/!needsInputMasking);
+    if (needsInputMasking) {
+      SmallVector<OpFoldResult> inputDims =
+          tensor::getMixedSizes(rewriter, loc, scanOp.getInput());
+      auto inputMaskTy = VectorType::get(inputVecShape, rewriter.getI1Type());
+      Value inputMask = vector::CreateMaskOp::create(
+          rewriter, loc, inputMaskTy,
+          getValueOrCreateConstantIndexOp(rewriter, loc, inputDims));
+
+      // Replace masked-off lanes with identity value.
+      Value identity =
+          getCombiningIdentityValue(loc, rewriter, *kind, inputVecTy);
+      inputVec =
+          arith::SelectOp::create(rewriter, loc, inputMask, inputVec, identity);
+    }
+
+    // Read accumulator (initial value) to vector.
+    Value accumVec = vector::createReadOrMaskedRead(
+        rewriter, loc, scanOp.getAccumulator(), accumVecShape, padding,
+        /*useInBoundsInsteadOfMasking=*/!needsAccumMasking);
+    if (needsAccumMasking) {
+      SmallVector<OpFoldResult> accumDims =
+          tensor::getMixedSizes(rewriter, loc, scanOp.getAccumulator());
+      auto accumMaskTy = VectorType::get(accumVecShape, rewriter.getI1Type());
+      Value accumMask = vector::CreateMaskOp::create(
+          rewriter, loc, accumMaskTy,
+          getValueOrCreateConstantIndexOp(rewriter, loc, accumDims));
+
+      // Same identity substitution as for the input, on the accumulator shape.
+      Value identity =
+          getCombiningIdentityValue(loc, rewriter, *kind, accumVecTy);
+      accumVec =
+          arith::SelectOp::create(rewriter, loc, accumMask, accumVec, identity);
+    }
+
+    // Create vector.scan.
+    auto vectorScanOp =
+        vector::ScanOp::create(rewriter, loc, *kind, inputVec, accumVec,
+                               scanDim, scanOp.getInclusive());
+
+    // Write results back to tensors.
+    Value output = vector::createWriteOrMaskedWrite(
+                       rewriter, loc, vectorScanOp.getDest(),
+                       scanOp.getOutput(), inputIndices,
+                       /*useInBoundsInsteadOfMasking=*/!needsInputMasking)
+                       ->getResult(0);
+
+    Value accum = vector::createWriteOrMaskedWrite(
+                      rewriter, loc, vectorScanOp.getAccumulatedValue(),
+                      scanOp.getAccumulator(), accumIndices,
+                      /*useInBoundsInsteadOfMasking=*/!needsAccumMasking)
+                      ->getResult(0);
+
+    return SmallVector<Value>{output, accum};
+  }
+};
+
 } // namespace
 
 void registerVectorizableOpInterfaceExternalModels(DialectRegistry &registry) {
@@ -1355,6 +1545,7 @@ void registerVectorizableOpInterfaceExternalModels(DialectRegistry &registry) {
         *ctx);
     IREE::LinalgExt::Im2colOp::attachInterface<Im2colOpVectorizationModel>(
         *ctx);
+    IREE::LinalgExt::ScanOp::attachInterface<ScanOpVectorizationModel>(*ctx);
   });
   registry.addExtension(+[](MLIRContext *ctx,
                             IREE::VectorExt::IREEVectorExtDialect *dialect) {