|
| 1 | +// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2 |
| 2 | +// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2 |
| 3 | +// RUN: dpct --format-range=none -out-root %T/ldmatrix %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only |
| 4 | +// RUN: FileCheck %s --match-full-lines --input-file %T/ldmatrix/ldmatrix.dp.cpp |
| 5 | +// RUN: %if build_lit %{icpx -c -fsycl %T/ldmatrix/ldmatrix.dp.cpp -o %T/ldmatrix/ldmatrix.dp.o %} |
| 6 | + |
| 7 | +// clang-format off |
| 8 | +#include <cuda_runtime.h> |
| 9 | + |
| 10 | +/* |
| 11 | +ldmatrix.sync.aligned.shape.num{.trans}{.ss}.type r, [p]; |
| 12 | +
|
| 13 | +Below are the currently supported configurations: |
| 14 | +.shape = {.m8n8}; |
| 15 | +.num = {.x1, .x2, .x4}; |
| 16 | +.ss = {.shared{::cta}}; |
| 17 | +.type = {.b16}; |
| 18 | +*/ |
| 19 | + |
| 20 | +__device__ void load_matrix_x1(void *sh_r_addr, int *r) { /* Migration test input: non-transposed .x1 ldmatrix whose single output operand is r[0]. */ |
| 21 | + // CHECK: auto addr = sh_r_addr; |
| 22 | + uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr)); |
| 23 | + /* The directive above expects the address conversion to migrate to a plain copy of sh_r_addr; the directive below expects the inline PTX to become one helper call taking the address, &r[0] and item_ct1. */ |
| 24 | + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], item_ct1); |
| 25 | + asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" |
| 26 | + : "=r"(r[0]) |
| 27 | + : "r"(addr)); |
| 28 | +} |
| 29 | + |
| 30 | +__device__ void load_matrix_x2(void *sh_r_addr, int *r) { /* Migration test input: non-transposed .x2 ldmatrix whose two output operands are r[0] and r[1]. */ |
| 31 | + // CHECK: auto addr = sh_r_addr; |
| 32 | + uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr)); |
| 33 | + /* The directive above expects the address conversion to migrate to a plain copy of sh_r_addr; the directive below expects one helper call that receives a pointer per output operand (&r[0], &r[1]) followed by item_ct1. */ |
| 34 | + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], &r[1], item_ct1); |
| 35 | + asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0, %1}, [%2];\n" |
| 36 | + : "=r"(r[0]), "=r"(r[1]) |
| 37 | + : "r"(addr)); |
| 38 | +} |
| 39 | + |
| 40 | +__device__ void load_matrix_x4(void *sh_r_addr, int *r) { /* Migration test input: non-transposed .x4 ldmatrix whose four output operands are r[0] through r[3]. */ |
| 41 | + // CHECK: auto addr = sh_r_addr; |
| 42 | + uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr)); |
| 43 | + /* The directive above expects the address conversion to migrate to a plain copy of sh_r_addr; the directive below expects one helper call that receives a pointer per output operand (&r[0] .. &r[3]) followed by item_ct1. */ |
| 44 | + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], &r[1], &r[2], &r[3], item_ct1); |
| 45 | + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n" |
| 46 | + : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]) |
| 47 | + : "r"(addr)); |
| 48 | +} |
| 49 | + |
| 50 | +__device__ void load_matrix_x1_trans(void *sh_r_addr, int *r) { /* Migration test input: .x1 ldmatrix with the .trans qualifier; the single output operand is r[0]. */ |
| 51 | + // CHECK: auto addr = sh_r_addr; |
| 52 | + uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr)); |
| 53 | + /* The directive above expects the address conversion to migrate to a plain copy of sh_r_addr; the directive below expects the same helper call as load_matrix_x1 plus a trailing `true` argument after item_ct1. */ |
| 54 | + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], item_ct1, true); |
| 55 | + asm volatile("ldmatrix.sync.aligned.m8n8.x1.trans.shared.b16 {%0}, [%1];\n" |
| 56 | + : "=r"(r[0]) |
| 57 | + : "r"(addr)); |
| 58 | +} |
| 59 | + |
| 60 | +__device__ void load_matrix_x2_trans(void *sh_r_addr, int *r) { /* Migration test input: .x2 ldmatrix with the .trans qualifier; the two output operands are r[0] and r[1]. */ |
| 61 | + // CHECK: auto addr = sh_r_addr; |
| 62 | + uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr)); |
| 63 | + /* The directive above expects the address conversion to migrate to a plain copy of sh_r_addr; the directive below expects the same helper call as load_matrix_x2 plus a trailing `true` argument after item_ct1. */ |
| 64 | + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], &r[1], item_ct1, true); |
| 65 | + asm volatile("ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {%0, %1}, [%2];\n" |
| 66 | + : "=r"(r[0]), "=r"(r[1]) |
| 67 | + : "r"(addr)); |
| 68 | +} |
| 69 | + |
| 70 | +__device__ void load_matrix_x4_trans(void *sh_r_addr, int *r) { /* Migration test input: .x4 ldmatrix with the .trans qualifier; the four output operands are r[0] through r[3]. */ |
| 71 | + // CHECK: auto addr = sh_r_addr; |
| 72 | + uint32_t addr = static_cast<uint32_t>(__cvta_generic_to_shared(sh_r_addr)); |
| 73 | + /* The directive above expects the address conversion to migrate to a plain copy of sh_r_addr; the directive below expects the same helper call as load_matrix_x4 plus a trailing `true` argument after item_ct1. */ |
| 74 | + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], &r[1], &r[2], &r[3], item_ct1, true); |
| 75 | + asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0, %1, %2, %3}, [%4];\n" |
| 76 | + : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]) |
| 77 | + : "r"(addr)); |
| 78 | +} |
| 79 | + |
| 80 | +// clang-format on |
0 commit comments