|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// Copyright (c) 2025 FlyDSL Project Contributors |
| 3 | +// RUN: %fly-opt %s --fly-convert-atom-call-to-ssa-form --convert-fly-to-rocdl | FileCheck %s |
| 4 | + |
| 5 | +gpu.module @bug_strided_universal_copy { |
| 6 | + |
| | +// Test case: copy from a global-memory view into a register view. |
| | +// The kernel wraps %src in a view with layout 4:8, wraps a register pointer |
| | +// (created with allocaSize = 4) in a view with layout 4:1, and issues one |
| | +// fly.copy_atom_call using a universal_copy<64> atom with valBits = 16. |
| | +// The expectations below require an f16 alloca in address space 5, then one |
| | +// vector<4xf16> load from the kernel argument immediately followed by one |
| | +// vector<4xf16> store into that alloca. |
| | +// NOTE(review): the expected load reads vector<4xf16> directly from the base |
| | +// pointer although the source layout is 4:8. If the second number is an |
| | +// element stride, that reads adjacent elements rather than strided ones - |
| | +// confirm whether this pins the intended lowering or the bug named by the module. |
| 7 | +// CHECK-LABEL: gpu.func @load_strided_global_into_register( |
| 8 | +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr<1> |
| 9 | +// CHECK: %[[REG:.*]] = llvm.alloca %{{.*}} x f16 : (i64) -> !llvm.ptr<5> |
| 10 | +// CHECK: %[[V:.*]] = llvm.load %[[ARG0]] : !llvm.ptr<1> -> vector<4xf16> |
| 11 | +// CHECK-NEXT: llvm.store %[[V]], %[[REG]] : vector<4xf16>, !llvm.ptr<5> |
| 12 | + gpu.func @load_strided_global_into_register(%src: !fly.ptr<f16, global>) kernel { |
| | + // Int tuples whose values (4, 1, 8) are carried in their result types. |
| 13 | + %shape4 = fly.make_int_tuple() : () -> !fly.int_tuple<4> |
| 14 | + %stride1 = fly.make_int_tuple() : () -> !fly.int_tuple<1> |
| 15 | + %stride8 = fly.make_int_tuple() : () -> !fly.int_tuple<8> |
| 16 | + |
| | + // Source layout is 4:8, register layout is 4:1. |
| 17 | + %src_layout = fly.make_layout(%shape4, %stride8) |
| 18 | + : (!fly.int_tuple<4>, !fly.int_tuple<8>) -> !fly.layout<4:8> |
| 19 | + %reg_layout = fly.make_layout(%shape4, %stride1) |
| 20 | + : (!fly.int_tuple<4>, !fly.int_tuple<1>) -> !fly.layout<4:1> |
| 21 | + |
| 22 | + %src_view = fly.make_view(%src, %src_layout) |
| 23 | + : (!fly.ptr<f16, global>, !fly.layout<4:8>) -> !fly.memref<f16, global, 4:8> |
| 24 | + |
| 25 | + %copy = fly.make_copy_atom {valBits = 16 : i32} |
| 26 | + : !fly.copy_atom<!fly.universal_copy<64>, 16> |
| 27 | + |
| 28 | + %reg_ptr = fly.make_ptr() {dictAttrs = {allocaSize = 4 : i64}} |
| 29 | + : () -> !fly.ptr<f16, register> |
| 30 | + %reg_view = fly.make_view(%reg_ptr, %reg_layout) |
| 31 | + : (!fly.ptr<f16, register>, !fly.layout<4:1>) -> !fly.memref<f16, register, 4:1> |
| 32 | + |
| | + // Operand order is (atom, global view, register view); the expectations |
| | + // above load from the argument pointer and store into the alloca. |
| 33 | + fly.copy_atom_call(%copy, %src_view, %reg_view) |
| 34 | + : (!fly.copy_atom<!fly.universal_copy<64>, 16>, |
| 35 | + !fly.memref<f16, global, 4:8>, |
| 36 | + !fly.memref<f16, register, 4:1>) -> () |
| 37 | + gpu.return |
| 38 | + } |
| 39 | + |
| | +// Test case: copy from a register view into a global-memory view. |
| | +// The kernel wraps %dst in a view with layout 4:8, wraps a register pointer |
| | +// (created with allocaSize = 4) in a view with layout 4:1, and issues one |
| | +// fly.copy_atom_call using a universal_copy<64> atom with valBits = 16. |
| | +// The expectations below require an f16 alloca in address space 5, then one |
| | +// vector<4xf16> load from that alloca immediately followed by one |
| | +// vector<4xf16> store to the kernel argument. |
| | +// NOTE(review): the expected store writes vector<4xf16> directly to the base |
| | +// pointer although the destination layout is 4:8. If the second number is an |
| | +// element stride, that writes adjacent elements rather than strided ones - |
| | +// confirm whether this pins the intended lowering or the bug named by the module. |
| 40 | +// CHECK-LABEL: gpu.func @store_register_into_strided_global( |
| 41 | +// CHECK-SAME: %[[ARG0:.*]]: !llvm.ptr<1> |
| 42 | +// CHECK: %[[REG:.*]] = llvm.alloca %{{.*}} x f16 : (i64) -> !llvm.ptr<5> |
| 43 | +// CHECK: %[[V:.*]] = llvm.load %[[REG]] : !llvm.ptr<5> -> vector<4xf16> |
| 44 | +// CHECK-NEXT: llvm.store %[[V]], %[[ARG0]] : vector<4xf16>, !llvm.ptr<1> |
| 45 | + gpu.func @store_register_into_strided_global(%dst: !fly.ptr<f16, global>) kernel { |
| | + // Int tuples whose values (4, 1, 8) are carried in their result types. |
| 46 | + %shape4 = fly.make_int_tuple() : () -> !fly.int_tuple<4> |
| 47 | + %stride1 = fly.make_int_tuple() : () -> !fly.int_tuple<1> |
| 48 | + %stride8 = fly.make_int_tuple() : () -> !fly.int_tuple<8> |
| 49 | + |
| | + // Destination layout is 4:8, register layout is 4:1. |
| 50 | + %dst_layout = fly.make_layout(%shape4, %stride8) |
| 51 | + : (!fly.int_tuple<4>, !fly.int_tuple<8>) -> !fly.layout<4:8> |
| 52 | + %reg_layout = fly.make_layout(%shape4, %stride1) |
| 53 | + : (!fly.int_tuple<4>, !fly.int_tuple<1>) -> !fly.layout<4:1> |
| 54 | + |
| 55 | + %dst_view = fly.make_view(%dst, %dst_layout) |
| 56 | + : (!fly.ptr<f16, global>, !fly.layout<4:8>) -> !fly.memref<f16, global, 4:8> |
| 57 | + |
| 58 | + %copy = fly.make_copy_atom {valBits = 16 : i32} |
| 59 | + : !fly.copy_atom<!fly.universal_copy<64>, 16> |
| 60 | + |
| 61 | + %reg_ptr = fly.make_ptr() {dictAttrs = {allocaSize = 4 : i64}} |
| 62 | + : () -> !fly.ptr<f16, register> |
| 63 | + %reg_view = fly.make_view(%reg_ptr, %reg_layout) |
| 64 | + : (!fly.ptr<f16, register>, !fly.layout<4:1>) -> !fly.memref<f16, register, 4:1> |
| 65 | + |
| | + // Operand order is (atom, register view, global view); the expectations |
| | + // above load from the alloca and store to the argument pointer. |
| 66 | + fly.copy_atom_call(%copy, %reg_view, %dst_view) |
| 67 | + : (!fly.copy_atom<!fly.universal_copy<64>, 16>, |
| 68 | + !fly.memref<f16, register, 4:1>, |
| 69 | + !fly.memref<f16, global, 4:8>) -> () |
| 70 | + gpu.return |
| 71 | + } |
| 72 | +} |
0 commit comments