Skip to content

Commit e6666b4

Browse files
Added QAM support for 4 nvcuda wmma APIs
1 parent ff03b4a commit e6666b4

7 files changed

Lines changed: 116 additions & 0 deletions

File tree

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Option: --use-experimental-features=matrix
2+
#include <mma.h>
3+
4+
__global__ void test() {
5+
// Start
6+
nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
7+
nvcuda::wmma::fill_fragment(acc_frag /* type fragment */,
8+
1.0f /* type value */);
9+
// End
10+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// Option: --use-experimental-features=matrix
2+
#include <mma.h>
3+
4+
__global__ void test(half *a, int row, int col, int lda) {
5+
// Start
6+
nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half,
7+
nvcuda::wmma::row_major>
8+
a_frag;
9+
nvcuda::wmma::load_matrix_sync(a_frag /* type fragment */,
10+
a + col + row * lda, lda);
11+
// End
12+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
// Option: --use-experimental-features=matrix
2+
#include <mma.h>
3+
4+
__global__ void test() {
5+
// Start
6+
nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half,
7+
nvcuda::wmma::row_major>
8+
a_frag;
9+
nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, 16, 16, 16, half,
10+
nvcuda::wmma::col_major>
11+
b_frag;
12+
nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
13+
nvcuda::wmma::mma_sync(acc_frag /* type fragment */,
14+
a_frag /* type fragment */, b_frag /* type fragment */,
15+
acc_frag /* type fragment */);
16+
// End
17+
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Option: --use-experimental-features=matrix
2+
#include <mma.h>
3+
4+
__global__ void test(float *c, int row, int col, int ldc) {
5+
// Start
6+
nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
7+
nvcuda::wmma::store_matrix_sync(
8+
c + col + row * ldc, acc_frag /* type fragment */, ldc,
9+
nvcuda::wmma::mem_col_major /* type memory order */);
10+
nvcuda::wmma::store_matrix_sync(
11+
c + row + col * ldc, acc_frag /* type fragment */, ldc,
12+
nvcuda::wmma::mem_row_major /* type memory order */);
13+
// End
14+
}

clang/lib/DPCT/DPCT.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1070,6 +1070,8 @@ int runDPCT(int argc, const char **argv) {
10701070
Experimentals.addValue(ExperimentalFeatures::Exp_LevelZero);
10711071
else if (Option.ends_with("non-uniform-groups"))
10721072
Experimentals.addValue(ExperimentalFeatures::Exp_NonUniformGroups);
1073+
else if (Option.ends_with("matrix"))
1074+
Experimentals.addValue(ExperimentalFeatures::Exp_Matrix);
10731075
} else if (Option == "--no-dry-pattern") {
10741076
NoDRYPattern.setValue(true);
10751077
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0
2+
// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0
3+
4+
// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::fill_fragment | FileCheck %s -check-prefix=NVCUDA_WMMA_FILL_FRAGMENT
5+
// NVCUDA_WMMA_FILL_FRAGMENT: CUDA API:
6+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
7+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fill_fragment(acc_frag /* type fragment */,
8+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: 1.0f /* type value */);
9+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: Is migrated to (with the option --use-experimental-features=matrix):
10+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::accumulator, 16, 16, 16, float> acc_frag;
11+
// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_fill(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), 1.0f);
12+
13+
// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::load_matrix_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_LOAD_MATRIX_SYNC
14+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC: CUDA API:
15+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half,
16+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::row_major>
17+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a_frag;
18+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::load_matrix_sync(a_frag /* type fragment */,
19+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a + col + row * lda, lda);
20+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix):
21+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::a, 16, 16, 16, sycl::half, dpct::experimental::matrix::row_major>
22+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a_frag;
23+
// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_load(sycl::ext::oneapi::this_work_item::get_sub_group(), a_frag.get(), sycl::address_space_cast<sycl::access::address_space::generic_space, sycl::access::decorated::no, const sycl::half>(a + col + row * lda), lda);
24+
25+
// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::store_matrix_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_STORE_MATRIX_SYNC
26+
// NVCUDA_WMMA_STORE_MATRIX_SYNC: CUDA API:
27+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
28+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::store_matrix_sync(
29+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + col + row * ldc, acc_frag /* type fragment */, ldc,
30+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_col_major /* type memory order */);
31+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::store_matrix_sync(
32+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + row + col * ldc, acc_frag /* type fragment */, ldc,
33+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_row_major /* type memory order */);
34+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix):
35+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::accumulator, 16, 16, 16, float> acc_frag;
36+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast<sycl::access::address_space::generic_space, sycl::access::decorated::no, float>(c + col + row * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::col_major);
37+
// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast<sycl::access::address_space::generic_space, sycl::access::decorated::no, float>(c + row + col * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::row_major);
38+
39+
// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::mma_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_MMA_SYNC
40+
// NVCUDA_WMMA_MMA_SYNC: CUDA API:
41+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::matrix_a, 16, 16, 16, half,
42+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::row_major>
43+
// NVCUDA_WMMA_MMA_SYNC-NEXT: a_frag;
44+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::matrix_b, 16, 16, 16, half,
45+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::col_major>
46+
// NVCUDA_WMMA_MMA_SYNC-NEXT: b_frag;
47+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::fragment<nvcuda::wmma::accumulator, 16, 16, 16, float> acc_frag;
48+
// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::mma_sync(acc_frag /* type fragment */,
49+
// NVCUDA_WMMA_MMA_SYNC-NEXT: a_frag /* type fragment */, b_frag /* type fragment */,
50+
// NVCUDA_WMMA_MMA_SYNC-NEXT: acc_frag /* type fragment */);
51+
// NVCUDA_WMMA_MMA_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix):
52+
// NVCUDA_WMMA_MMA_SYNC-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::a, 16, 16, 16, sycl::half, dpct::experimental::matrix::row_major>
53+
// NVCUDA_WMMA_MMA_SYNC-NEXT: a_frag;
54+
// NVCUDA_WMMA_MMA_SYNC-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::b, 16, 16, 16, sycl::half, dpct::experimental::matrix::col_major>
55+
// NVCUDA_WMMA_MMA_SYNC-NEXT: b_frag;
56+
// NVCUDA_WMMA_MMA_SYNC-NEXT: dpct::experimental::matrix::joint_matrix<dpct::experimental::matrix::accumulator, 16, 16, 16, float> acc_frag;
57+
// NVCUDA_WMMA_MMA_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_mad(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), a_frag.get(), b_frag.get(), acc_frag.get());

clang/test/dpct/query_api_mapping/test_all.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2329,6 +2329,10 @@
23292329
// CHECK-NEXT: normcdfinv
23302330
// CHECK-NEXT: normcdfinvf
23312331
// CHECK-NEXT: normf
2332+
// CHECK-NEXT: nvcuda::wmma::fill_fragment
2333+
// CHECK-NEXT: nvcuda::wmma::load_matrix_sync
2334+
// CHECK-NEXT: nvcuda::wmma::mma_sync
2335+
// CHECK-NEXT: nvcuda::wmma::store_matrix_sync
23322336
// CHECK-NEXT: nvshmem_align
23332337
// CHECK-NEXT: nvshmem_calloc
23342338
// CHECK-NEXT: nvshmem_finalize

0 commit comments

Comments
 (0)