Skip to content

Commit b4130b0

Browse files
committed
[Fix] Honor stride in universal copy atom lowering
CopyOpUniversalCopyType::emitAtomCallSSA / emitAtomCall used to emit a single contiguous llvm.load / llvm.store / llvm.memcpy against the memory-side pointer of fly.copy_atom_call, ignoring the memref's layout stride. When the memref's coalesced leaf has stride != 1, this silently produced wrong IR (adjacent lanes were read/written instead of stride-spaced elements).

Fix by consulting the coalesced leaf of the memref layout:

* count <= 1 or stride == 1: keep the existing fast path (single vector load/store or memcpy).
* otherwise: emit element-wise gather/scatter — per-element GEP by `i * stride` (in elements), swizzle the resulting pointer, then load/insertelement (load side) or extractelement/store (store side).
1 parent bc574f4 commit b4130b0

2 files changed

Lines changed: 188 additions & 18 deletions

File tree

lib/Dialect/Fly/IR/FlyUniversalOps.cpp

Lines changed: 162 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,87 @@
99
#include "mlir/IR/DialectImplementation.h"
1010

1111
#include "flydsl/Dialect/Fly/IR/FlyDialect.h"
12+
#include "flydsl/Dialect/Fly/Utils/LayoutUtils.h"
1213
#include "flydsl/Dialect/Fly/Utils/PointerUtils.h"
1314
#include "flydsl/Dialect/Fly/Utils/ThrValLayoutMacro.h.inc"
1415

1516
namespace mlir::fly {
1617

18+
namespace {
19+
20+
/// Returns the element-count and element-stride of a memref, assuming its
21+
/// layout coalesces into a single static leaf. This represents how a single
22+
/// atom-call moves `count` elements starting from the base pointer with the
23+
/// given `elementStride` (in elements).
24+
FailureOr<std::pair<int64_t, int64_t>>
25+
getCoalescedLeafCountAndStride(fly::MemRefType memTy) {
26+
auto layoutAttr = dyn_cast<LayoutAttr>(memTy.getLayout());
27+
if (!layoutAttr)
28+
return failure();
29+
LayoutBuilder<LayoutAttr> lb(memTy.getContext());
30+
auto coal = layoutCoalesce(lb, layoutAttr);
31+
if (!coal.isLeaf())
32+
return failure();
33+
auto shapeInt = coal.getShape().getLeafAsInt();
34+
auto strideInt = coal.getStride().getLeafAsInt();
35+
if (!shapeInt.isStatic() || !strideInt.isStatic())
36+
return failure();
37+
return std::make_pair<int64_t, int64_t>(shapeInt.getValue(), strideInt.getValue());
38+
}
39+
40+
/// Emits a sequence of element-wise loads from a strided memory pointer and
41+
/// packs the values into a `vector<count x elemTy>`. The i-th element is read
42+
/// from `base + i * elementStride` (in elements).
43+
Value emitStridedLoadAsVector(OpBuilder &b, Location loc, VectorType vecTy,
44+
TypedValue<LLVM::LLVMPointerType> base,
45+
int64_t count, int64_t elementStride,
46+
fly::MemRefType memTy) {
47+
Type llvmElemTy = projectToLLVMCompatibleElemTy(memTy.getElemTy());
48+
Type vecElemTy = vecTy.getElementType();
49+
auto ptrTy = base.getType();
50+
Value vec = LLVM::UndefOp::create(b, loc, vecTy);
51+
for (int64_t i = 0; i < count; ++i) {
52+
Value gep = LLVM::GEPOp::create(b, loc, ptrTy, llvmElemTy, base,
53+
ArrayRef<LLVM::GEPArg>{int32_t(i * elementStride)});
54+
Value swz = applySwizzleOnPtr(b, loc,
55+
cast<TypedValue<LLVM::LLVMPointerType>>(gep),
56+
memTy.getSwizzle());
57+
Value elem = LLVM::LoadOp::create(b, loc, llvmElemTy, swz);
58+
if (llvmElemTy != vecElemTy)
59+
elem = LLVM::BitcastOp::create(b, loc, vecElemTy, elem);
60+
Value idx = arith::ConstantIntOp::create(b, loc, i, /*width=*/32);
61+
vec = LLVM::InsertElementOp::create(b, loc, vec, elem, idx);
62+
}
63+
return vec;
64+
}
65+
66+
/// Emits a sequence of element-wise stores that scatter a `vector<count x E>`
67+
/// into a strided memory pointer. The i-th element is written to
68+
/// `base + i * elementStride` (in elements).
69+
void emitStridedStoreFromVector(OpBuilder &b, Location loc, Value vec,
70+
TypedValue<LLVM::LLVMPointerType> base,
71+
int64_t count, int64_t elementStride,
72+
fly::MemRefType memTy) {
73+
auto vecTy = cast<VectorType>(vec.getType());
74+
Type llvmElemTy = projectToLLVMCompatibleElemTy(memTy.getElemTy());
75+
Type vecElemTy = vecTy.getElementType();
76+
auto ptrTy = base.getType();
77+
for (int64_t i = 0; i < count; ++i) {
78+
Value idx = arith::ConstantIntOp::create(b, loc, i, /*width=*/32);
79+
Value elem = LLVM::ExtractElementOp::create(b, loc, vec, idx);
80+
if (llvmElemTy != vecElemTy)
81+
elem = LLVM::BitcastOp::create(b, loc, llvmElemTy, elem);
82+
Value gep = LLVM::GEPOp::create(b, loc, ptrTy, llvmElemTy, base,
83+
ArrayRef<LLVM::GEPArg>{int32_t(i * elementStride)});
84+
Value swz = applySwizzleOnPtr(b, loc,
85+
cast<TypedValue<LLVM::LLVMPointerType>>(gep),
86+
memTy.getSwizzle());
87+
LLVM::StoreOp::create(b, loc, elem, swz);
88+
}
89+
}
90+
91+
} // namespace
92+
1793
/// Universal copies are unconditionally reported as static (constant true).
bool CopyOpUniversalCopyType::isStatic() const { return true; }
1894

1995
Value CopyOpUniversalCopyType::rebuildStaticValue(OpBuilder &builder, Location loc,
@@ -129,23 +205,55 @@ FailureOr<Value> CopyOpUniversalCopyType::emitAtomCallSSA(OpBuilder &builder, Lo
129205
Value dst) const {
130206
Value result;
131207
if (isa<fly::MemRefType>(srcTyArg)) {
132-
// src is memory
208+
// src is memory: honor the memref's coalesced stride so a non-unit stride
209+
// layout lowers to a per-element scatter/gather rather than a single
210+
// contiguous vector load that silently ignores the stride.
133211
auto srcMemTy = cast<fly::MemRefType>(srcTyArg);
212+
auto srcBase = cast<TypedValue<LLVM::LLVMPointerType>>(src);
213+
auto countAndStride = getCoalescedLeafCountAndStride(srcMemTy);
214+
if (failed(countAndStride))
215+
return failure();
216+
auto [count, stride] = *countAndStride;
217+
134218
Type loadTy = resultTy ? resultTy : builder.getIntegerType(getBitSize());
135-
Value srcPtr = applySwizzleOnPtr(builder, loc, cast<TypedValue<LLVM::LLVMPointerType>>(src),
136-
srcMemTy.getSwizzle());
137-
result = LLVM::LoadOp::create(builder, loc, loadTy, srcPtr);
219+
220+
if (count <= 1 || stride == 1) {
221+
Value srcPtr = applySwizzleOnPtr(builder, loc, srcBase, srcMemTy.getSwizzle());
222+
result = LLVM::LoadOp::create(builder, loc, loadTy, srcPtr);
223+
} else {
224+
Type llvmElemTy = projectToLLVMCompatibleElemTy(srcMemTy.getElemTy());
225+
auto vecTy = VectorType::get({count}, llvmElemTy);
226+
Value vec =
227+
emitStridedLoadAsVector(builder, loc, vecTy, srcBase, count, stride, srcMemTy);
228+
if (vec.getType() != loadTy)
229+
vec = LLVM::BitcastOp::create(builder, loc, loadTy, vec);
230+
result = vec;
231+
}
138232
} else {
139233
// src is register
140234
result = src;
141235
}
142236

143237
if (!resultTy) {
144-
// dst is memory
238+
// dst is memory: symmetric treatment for strided stores.
145239
auto dstMemTy = cast<fly::MemRefType>(dstTyArg);
146-
Value dstPtr = applySwizzleOnPtr(builder, loc, cast<TypedValue<LLVM::LLVMPointerType>>(dst),
147-
dstMemTy.getSwizzle());
148-
LLVM::StoreOp::create(builder, loc, result, dstPtr);
240+
auto dstBase = cast<TypedValue<LLVM::LLVMPointerType>>(dst);
241+
auto countAndStride = getCoalescedLeafCountAndStride(dstMemTy);
242+
if (failed(countAndStride))
243+
return failure();
244+
auto [count, stride] = *countAndStride;
245+
246+
if (count <= 1 || stride == 1) {
247+
Value dstPtr = applySwizzleOnPtr(builder, loc, dstBase, dstMemTy.getSwizzle());
248+
LLVM::StoreOp::create(builder, loc, result, dstPtr);
249+
} else {
250+
Type llvmElemTy = projectToLLVMCompatibleElemTy(dstMemTy.getElemTy());
251+
auto vecTy = VectorType::get({count}, llvmElemTy);
252+
Value vec = result;
253+
if (vec.getType() != vecTy)
254+
vec = LLVM::BitcastOp::create(builder, loc, vecTy, vec);
255+
emitStridedStoreFromVector(builder, loc, vec, dstBase, count, stride, dstMemTy);
256+
}
149257
}
150258
return result;
151259
}
@@ -188,14 +296,53 @@ LogicalResult CopyOpUniversalCopyType::emitAtomCall(OpBuilder &builder, Location
188296
if (!isa<LLVM::LLVMPointerType>(src.getType()) || !isa<LLVM::LLVMPointerType>(dst.getType()))
189297
return failure();
190298

191-
int32_t copyBytes = getBitSize() / 8;
192-
Value srcPtr = applySwizzleOnPtr(builder, loc, cast<TypedValue<LLVM::LLVMPointerType>>(src),
193-
srcMemTy.getSwizzle());
194-
Value dstPtr = applySwizzleOnPtr(builder, loc, cast<TypedValue<LLVM::LLVMPointerType>>(dst),
195-
dstMemTy.getSwizzle());
196-
Value len = arith::ConstantIntOp::create(builder, loc, copyBytes, /*width=*/32);
197-
LLVM::MemcpyOp::create(builder, loc, dstPtr, srcPtr, len, /*isVolatile=*/false);
299+
auto srcBase = cast<TypedValue<LLVM::LLVMPointerType>>(src);
300+
auto dstBase = cast<TypedValue<LLVM::LLVMPointerType>>(dst);
301+
302+
auto srcCs = getCoalescedLeafCountAndStride(srcMemTy);
303+
auto dstCs = getCoalescedLeafCountAndStride(dstMemTy);
304+
if (failed(srcCs) || failed(dstCs))
305+
return failure();
306+
auto [srcCount, srcStride] = *srcCs;
307+
auto [dstCount, dstStride] = *dstCs;
308+
if (srcCount != dstCount)
309+
return failure();
310+
311+
bool srcContig = srcCount <= 1 || srcStride == 1;
312+
bool dstContig = dstCount <= 1 || dstStride == 1;
313+
314+
if (srcContig && dstContig) {
315+
// Fast path: both sides are contiguous, lower to a single memcpy.
316+
int32_t copyBytes = getBitSize() / 8;
317+
Value srcPtr = applySwizzleOnPtr(builder, loc, srcBase, srcMemTy.getSwizzle());
318+
Value dstPtr = applySwizzleOnPtr(builder, loc, dstBase, dstMemTy.getSwizzle());
319+
Value len = arith::ConstantIntOp::create(builder, loc, copyBytes, /*width=*/32);
320+
LLVM::MemcpyOp::create(builder, loc, dstPtr, srcPtr, len, /*isVolatile=*/false);
321+
return success();
322+
}
198323

324+
// At least one side is strided: emit element-wise gather/scatter so each
325+
// side honors its own stride.
326+
Type llvmSrcElemTy = projectToLLVMCompatibleElemTy(srcMemTy.getElemTy());
327+
Type llvmDstElemTy = projectToLLVMCompatibleElemTy(dstMemTy.getElemTy());
328+
auto srcPtrTy = srcBase.getType();
329+
auto dstPtrTy = dstBase.getType();
330+
for (int64_t i = 0; i < srcCount; ++i) {
331+
Value srcGep = LLVM::GEPOp::create(builder, loc, srcPtrTy, llvmSrcElemTy, srcBase,
332+
ArrayRef<LLVM::GEPArg>{int32_t(i * srcStride)});
333+
Value srcSwz = applySwizzleOnPtr(builder, loc,
334+
cast<TypedValue<LLVM::LLVMPointerType>>(srcGep),
335+
srcMemTy.getSwizzle());
336+
Value v = LLVM::LoadOp::create(builder, loc, llvmSrcElemTy, srcSwz);
337+
if (llvmSrcElemTy != llvmDstElemTy)
338+
v = LLVM::BitcastOp::create(builder, loc, llvmDstElemTy, v);
339+
Value dstGep = LLVM::GEPOp::create(builder, loc, dstPtrTy, llvmDstElemTy, dstBase,
340+
ArrayRef<LLVM::GEPArg>{int32_t(i * dstStride)});
341+
Value dstSwz = applySwizzleOnPtr(builder, loc,
342+
cast<TypedValue<LLVM::LLVMPointerType>>(dstGep),
343+
dstMemTy.getSwizzle());
344+
LLVM::StoreOp::create(builder, loc, v, dstSwz);
345+
}
199346
return success();
200347
}
201348

tests/mlir/Transforms/convert_fly_to_rocdl_universal_copy_strided.mlir

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,20 @@ gpu.module @bug_strided_universal_copy {
77
// CHECK-LABEL: gpu.func @load_strided_global_into_register(
88
// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr<1>
99
// CHECK: %[[REG:.*]] = llvm.alloca %{{.*}} x f16 : (i64) -> !llvm.ptr<5>
10-
// CHECK: %[[V:.*]] = llvm.load %[[ARG0]] : !llvm.ptr<1> -> vector<4xf16>
11-
// CHECK-NEXT: llvm.store %[[V]], %[[REG]] : vector<4xf16>, !llvm.ptr<5>
10+
// CHECK: %[[U:.*]] = llvm.mlir.undef : vector<4xf16>
11+
// CHECK: %[[P0:.*]] = llvm.getelementptr %[[ARG0]][0] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f16
12+
// CHECK: %[[E0:.*]] = llvm.load %[[P0]] : !llvm.ptr<1> -> f16
13+
// CHECK: %[[V0:.*]] = llvm.insertelement %[[E0]], %[[U]]{{.*}} : vector<4xf16>
14+
// CHECK: %[[P1:.*]] = llvm.getelementptr %[[ARG0]][8] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f16
15+
// CHECK: %[[E1:.*]] = llvm.load %[[P1]] : !llvm.ptr<1> -> f16
16+
// CHECK: %[[V1:.*]] = llvm.insertelement %[[E1]], %[[V0]]{{.*}} : vector<4xf16>
17+
// CHECK: %[[P2:.*]] = llvm.getelementptr %[[ARG0]][16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f16
18+
// CHECK: %[[E2:.*]] = llvm.load %[[P2]] : !llvm.ptr<1> -> f16
19+
// CHECK: %[[V2:.*]] = llvm.insertelement %[[E2]], %[[V1]]{{.*}} : vector<4xf16>
20+
// CHECK: %[[P3:.*]] = llvm.getelementptr %[[ARG0]][24] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f16
21+
// CHECK: %[[E3:.*]] = llvm.load %[[P3]] : !llvm.ptr<1> -> f16
22+
// CHECK: %[[V3:.*]] = llvm.insertelement %[[E3]], %[[V2]]{{.*}} : vector<4xf16>
23+
// CHECK: llvm.store %[[V3]], %[[REG]] : vector<4xf16>, !llvm.ptr<5>
1224
gpu.func @load_strided_global_into_register(%src: !fly.ptr<f16, global>) kernel {
1325
%shape4 = fly.make_int_tuple() : () -> !fly.int_tuple<4>
1426
%stride1 = fly.make_int_tuple() : () -> !fly.int_tuple<1>
@@ -41,7 +53,18 @@ gpu.module @bug_strided_universal_copy {
4153
// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr<1>
4254
// CHECK: %[[REG:.*]] = llvm.alloca %{{.*}} x f16 : (i64) -> !llvm.ptr<5>
4355
// CHECK: %[[V:.*]] = llvm.load %[[REG]] : !llvm.ptr<5> -> vector<4xf16>
44-
// CHECK-NEXT: llvm.store %[[V]], %[[ARG0]] : vector<4xf16>, !llvm.ptr<1>
56+
// CHECK: %[[E0:.*]] = llvm.extractelement %[[V]]{{.*}} : vector<4xf16>
57+
// CHECK: %[[P0:.*]] = llvm.getelementptr %[[ARG0]][0] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f16
58+
// CHECK: llvm.store %[[E0]], %[[P0]] : f16, !llvm.ptr<1>
59+
// CHECK: %[[E1:.*]] = llvm.extractelement %[[V]]{{.*}} : vector<4xf16>
60+
// CHECK: %[[P1:.*]] = llvm.getelementptr %[[ARG0]][8] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f16
61+
// CHECK: llvm.store %[[E1]], %[[P1]] : f16, !llvm.ptr<1>
62+
// CHECK: %[[E2:.*]] = llvm.extractelement %[[V]]{{.*}} : vector<4xf16>
63+
// CHECK: %[[P2:.*]] = llvm.getelementptr %[[ARG0]][16] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f16
64+
// CHECK: llvm.store %[[E2]], %[[P2]] : f16, !llvm.ptr<1>
65+
// CHECK: %[[E3:.*]] = llvm.extractelement %[[V]]{{.*}} : vector<4xf16>
66+
// CHECK: %[[P3:.*]] = llvm.getelementptr %[[ARG0]][24] : (!llvm.ptr<1>) -> !llvm.ptr<1>, f16
67+
// CHECK: llvm.store %[[E3]], %[[P3]] : f16, !llvm.ptr<1>
4568
gpu.func @store_register_into_strided_global(%dst: !fly.ptr<f16, global>) kernel {
4669
%shape4 = fly.make_int_tuple() : () -> !fly.int_tuple<4>
4770
%stride1 = fly.make_int_tuple() : () -> !fly.int_tuple<1>

0 commit comments

Comments
 (0)