Skip to content

Commit bc574f4

Browse files
committed
[Test] Add reproducer for strided universal_copy in convert-fly-to-rocdl
Freezes the current (buggy) lowering: a non-unit-stride !fly.memref on one side of fly.copy_atom_call is lowered to a single contiguous llvm.load / llvm.store against the memory-side pointer, silently ignoring the stride. The next commit will fix emitAtomCallSSA and update these CHECKs.
1 parent 3f7b6b5 commit bc574f4

1 file changed

Lines changed: 72 additions & 0 deletions

File tree

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// Copyright (c) 2025 FlyDSL Project Contributors
3+
// RUN: %fly-opt %s --fly-convert-atom-call-to-ssa-form --convert-fly-to-rocdl | FileCheck %s
4+
5+
gpu.module @bug_strided_universal_copy {
6+
7+
// Reproducer, load direction: copies from a global view with layout 4:8
// (non-unit stride) into a register view with layout 4:1.
// The directives below pin the lowering as it is today: one vector<4xf16>
// llvm.load taken directly from the function argument pointer, immediately
// followed by a store into the register alloca. The stride of the source
// layout does not show up anywhere in the expected output. Per the
// accompanying commit description this is the known-buggy behavior, and these
// expectations are meant to be updated once the lowering is fixed.
// CHECK-LABEL: gpu.func @load_strided_global_into_register(
8+
// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr<1>
9+
// CHECK: %[[REG:.*]] = llvm.alloca %{{.*}} x f16 : (i64) -> !llvm.ptr<5>
10+
// CHECK: %[[V:.*]] = llvm.load %[[ARG0]] : !llvm.ptr<1> -> vector<4xf16>
11+
// CHECK-NEXT: llvm.store %[[V]], %[[REG]] : vector<4xf16>, !llvm.ptr<5>
12+
gpu.func @load_strided_global_into_register(%src: !fly.ptr<f16, global>) kernel {
13+
// Static int tuples: one shape (4) and two candidate strides (1 and 8).
%shape4 = fly.make_int_tuple() : () -> !fly.int_tuple<4>
14+
%stride1 = fly.make_int_tuple() : () -> !fly.int_tuple<1>
15+
%stride8 = fly.make_int_tuple() : () -> !fly.int_tuple<8>
16+
17+
// Memory-side layout 4:8 -- this is the non-unit-stride side of the copy.
%src_layout = fly.make_layout(%shape4, %stride8)
18+
: (!fly.int_tuple<4>, !fly.int_tuple<8>) -> !fly.layout<4:8>
19+
// Register-side layout 4:1 (unit stride).
%reg_layout = fly.make_layout(%shape4, %stride1)
20+
: (!fly.int_tuple<4>, !fly.int_tuple<1>) -> !fly.layout<4:1>
21+
22+
// Wrap the incoming global pointer in the strided layout.
%src_view = fly.make_view(%src, %src_layout)
23+
: (!fly.ptr<f16, global>, !fly.layout<4:8>) -> !fly.memref<f16, global, 4:8>
24+
25+
// Copy atom of type universal_copy<64> with valBits = 16.
// NOTE(review): 64 is presumably the copy width in bits (4 x f16), which
// would explain the vector<4xf16> in the expected output -- confirm.
%copy = fly.make_copy_atom {valBits = 16 : i32}
26+
: !fly.copy_atom<!fly.universal_copy<64>, 16>
27+
28+
// Register-space buffer of f16 with allocaSize = 4; presumably the origin of
// the llvm.alloca in !llvm.ptr<5> matched by the directives above.
%reg_ptr = fly.make_ptr() {dictAttrs = {allocaSize = 4 : i64}}
29+
: () -> !fly.ptr<f16, register>
30+
%reg_view = fly.make_view(%reg_ptr, %reg_layout)
31+
: (!fly.ptr<f16, register>, !fly.layout<4:1>) -> !fly.memref<f16, register, 4:1>
32+
33+
// Operation under test: strided global view first, register view second.
// The expected output loads from the argument and stores to the alloca.
fly.copy_atom_call(%copy, %src_view, %reg_view)
34+
: (!fly.copy_atom<!fly.universal_copy<64>, 16>,
35+
!fly.memref<f16, global, 4:8>,
36+
!fly.memref<f16, register, 4:1>) -> ()
37+
gpu.return
38+
}
39+
40+
// Reproducer, store direction: copies from a register view with layout 4:1
// into a global view with layout 4:8 (non-unit stride).
// The directives below pin the lowering as it is today: one vector<4xf16>
// llvm.load from the register alloca, immediately followed by a single
// llvm.store to the function argument pointer. The stride of the destination
// layout does not show up anywhere in the expected output. Per the
// accompanying commit description this is the known-buggy behavior, and these
// expectations are meant to be updated once the lowering is fixed.
// CHECK-LABEL: gpu.func @store_register_into_strided_global(
41+
// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr<1>
42+
// CHECK: %[[REG:.*]] = llvm.alloca %{{.*}} x f16 : (i64) -> !llvm.ptr<5>
43+
// CHECK: %[[V:.*]] = llvm.load %[[REG]] : !llvm.ptr<5> -> vector<4xf16>
44+
// CHECK-NEXT: llvm.store %[[V]], %[[ARG0]] : vector<4xf16>, !llvm.ptr<1>
45+
gpu.func @store_register_into_strided_global(%dst: !fly.ptr<f16, global>) kernel {
46+
// Static int tuples: one shape (4) and two candidate strides (1 and 8).
%shape4 = fly.make_int_tuple() : () -> !fly.int_tuple<4>
47+
%stride1 = fly.make_int_tuple() : () -> !fly.int_tuple<1>
48+
%stride8 = fly.make_int_tuple() : () -> !fly.int_tuple<8>
49+
50+
// Memory-side layout 4:8 -- this is the non-unit-stride side of the copy.
%dst_layout = fly.make_layout(%shape4, %stride8)
51+
: (!fly.int_tuple<4>, !fly.int_tuple<8>) -> !fly.layout<4:8>
52+
// Register-side layout 4:1 (unit stride).
%reg_layout = fly.make_layout(%shape4, %stride1)
53+
: (!fly.int_tuple<4>, !fly.int_tuple<1>) -> !fly.layout<4:1>
54+
55+
// Wrap the incoming global pointer in the strided layout.
%dst_view = fly.make_view(%dst, %dst_layout)
56+
: (!fly.ptr<f16, global>, !fly.layout<4:8>) -> !fly.memref<f16, global, 4:8>
57+
58+
// Copy atom of type universal_copy<64> with valBits = 16.
// NOTE(review): 64 is presumably the copy width in bits (4 x f16), which
// would explain the vector<4xf16> in the expected output -- confirm.
%copy = fly.make_copy_atom {valBits = 16 : i32}
59+
: !fly.copy_atom<!fly.universal_copy<64>, 16>
60+
61+
// Register-space buffer of f16 with allocaSize = 4; presumably the origin of
// the llvm.alloca in !llvm.ptr<5> matched by the directives above.
%reg_ptr = fly.make_ptr() {dictAttrs = {allocaSize = 4 : i64}}
62+
: () -> !fly.ptr<f16, register>
63+
%reg_view = fly.make_view(%reg_ptr, %reg_layout)
64+
: (!fly.ptr<f16, register>, !fly.layout<4:1>) -> !fly.memref<f16, register, 4:1>
65+
66+
// Operation under test: register view first, strided global view second.
// The expected output loads from the alloca and stores to the argument.
fly.copy_atom_call(%copy, %reg_view, %dst_view)
67+
: (!fly.copy_atom<!fly.universal_copy<64>, 16>,
68+
!fly.memref<f16, register, 4:1>,
69+
!fly.memref<f16, global, 4:8>) -> ()
70+
gpu.return
71+
}
72+
}

0 commit comments

Comments
 (0)