From 722ed60633dcc37d562f04d0a2917addd83ed288 Mon Sep 17 00:00:00 2001 From: Teja Alaghari Date: Thu, 3 Jul 2025 16:43:04 +0800 Subject: [PATCH 1/3] Added QAM support for 4 nvcuda wmma APIs --- .../Runtime/nvcuda$$wmma$$fill_fragment.cu | 10 ++++ .../Runtime/nvcuda$$wmma$$load_matrix_sync.cu | 12 ++++ .../DPCT/Runtime/nvcuda$$wmma$$mma_sync.cu | 17 ++++++ .../nvcuda$$wmma$$store_matrix_sync.cu | 14 +++++ clang/lib/DPCT/DPCT.cpp | 2 + .../query_api_mapping/Runtime/test_wmma.cu | 57 +++++++++++++++++++ clang/test/dpct/query_api_mapping/test_all.cu | 4 ++ 7 files changed, 116 insertions(+) create mode 100644 clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu create mode 100644 clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu create mode 100644 clang/examples/DPCT/Runtime/nvcuda$$wmma$$mma_sync.cu create mode 100644 clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu create mode 100644 clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu new file mode 100644 index 000000000000..09855183656e --- /dev/null +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu @@ -0,0 +1,10 @@ +// Option: --use-experimental-features=matrix +#include + +__global__ void test() { + // Start + nvcuda::wmma::fragment acc_frag; + nvcuda::wmma::fill_fragment(acc_frag /* type fragment */, + 1.0f /* type value */); + // End +} diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu new file mode 100644 index 000000000000..1402890e34cc --- /dev/null +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu @@ -0,0 +1,12 @@ +// Option: --use-experimental-features=matrix +#include + +__global__ void test(half *a, int row, int col, int lda) { + // Start + nvcuda::wmma::fragment + a_frag; + nvcuda::wmma::load_matrix_sync(a_frag /* type fragment */, + a + col + row * lda, lda); + // End +} diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$mma_sync.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$mma_sync.cu new file mode 100644 index 000000000000..cc7e004b775f --- /dev/null +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$mma_sync.cu @@ -0,0 +1,17 @@ +// Option: --use-experimental-features=matrix +#include + +__global__ void test() { + // Start + nvcuda::wmma::fragment + a_frag; + nvcuda::wmma::fragment + b_frag; + nvcuda::wmma::fragment acc_frag; + nvcuda::wmma::mma_sync(acc_frag /* type fragment */, + a_frag /* type fragment */, b_frag /* type fragment */, + acc_frag /* type fragment */); + // End +} diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu new file mode 100644 index 000000000000..0c87f04db574 --- /dev/null +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu @@ -0,0 +1,14 @@ +// Option: --use-experimental-features=matrix +#include + +__global__ void test(float *c, int row, int col, int ldc) { + // Start + nvcuda::wmma::fragment acc_frag; + nvcuda::wmma::store_matrix_sync( + c + col + row * ldc, acc_frag /* type fragment */, ldc, + nvcuda::wmma::mem_col_major /* type memory order */); + nvcuda::wmma::store_matrix_sync( + c + row + col * ldc, acc_frag /* type fragment */, ldc, + nvcuda::wmma::mem_row_major /* type memory order */); + // End +} diff --git a/clang/lib/DPCT/DPCT.cpp b/clang/lib/DPCT/DPCT.cpp index efad8a5917d6..153c0d4a31d9 100644 --- a/clang/lib/DPCT/DPCT.cpp +++ b/clang/lib/DPCT/DPCT.cpp @@ -1063,6 +1063,8 @@ int runDPCT(int argc, const char **argv) { Experimentals.addValue(ExperimentalFeatures::Exp_LevelZero); else if (Option.ends_with("non-uniform-groups")) Experimentals.addValue(ExperimentalFeatures::Exp_NonUniformGroups); + else if (Option.ends_with("matrix")) + Experimentals.addValue(ExperimentalFeatures::Exp_Matrix); } else if (Option == "--no-dry-pattern") { NoDRYPattern.setValue(true); } else if (Option == "--enable-profiling") { diff --git a/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu b/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu new file mode 100644 index 000000000000..0cb34a63e37d --- /dev/null +++ b/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu @@ -0,0 +1,57 @@ +// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0 +// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0 + +// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::fill_fragment | FileCheck %s -check-prefix=NVCUDA_WMMA_FILL_FRAGMENT +// NVCUDA_WMMA_FILL_FRAGMENT: CUDA API: +// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fragment acc_frag; +// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fill_fragment(acc_frag /* type fragment */, +// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: 1.0f /* type value */); +// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: Is migrated to (with the option --use-experimental-features=matrix): +// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: dpct::experimental::matrix::joint_matrix acc_frag; +// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_fill(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), 1.0f); + +// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::load_matrix_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_LOAD_MATRIX_SYNC +// NVCUDA_WMMA_LOAD_MATRIX_SYNC: CUDA API: +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::fragment +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a_frag; +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::load_matrix_sync(a_frag /* type fragment */, +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a + col + row * lda, lda); +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix): +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: dpct::experimental::matrix::joint_matrix +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a_frag; +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_load(sycl::ext::oneapi::this_work_item::get_sub_group(), a_frag.get(), sycl::address_space_cast(a + col + row * lda), lda); + +// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::store_matrix_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_STORE_MATRIX_SYNC +// NVCUDA_WMMA_STORE_MATRIX_SYNC: CUDA API: +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::fragment acc_frag; +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::store_matrix_sync( +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + col + row * ldc, acc_frag /* type fragment */, ldc, +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_col_major /* type memory order */); +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::store_matrix_sync( +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + row + col * ldc, acc_frag /* type fragment */, ldc, +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_row_major /* type memory order */); +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix): +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: dpct::experimental::matrix::joint_matrix acc_frag; +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast(c + col + row * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::col_major); +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast(c + row + col * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::row_major); + +// RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::mma_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_MMA_SYNC +// NVCUDA_WMMA_MMA_SYNC: CUDA API: +// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::fragment +// NVCUDA_WMMA_MMA_SYNC-NEXT: a_frag; +// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::fragment +// NVCUDA_WMMA_MMA_SYNC-NEXT: b_frag; +// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::fragment acc_frag; +// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::mma_sync(acc_frag /* type fragment */, +// NVCUDA_WMMA_MMA_SYNC-NEXT: a_frag /* type fragment */, b_frag /* type fragment */, +// NVCUDA_WMMA_MMA_SYNC-NEXT: acc_frag /* type fragment */); +// NVCUDA_WMMA_MMA_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix): +// NVCUDA_WMMA_MMA_SYNC-NEXT: dpct::experimental::matrix::joint_matrix +// NVCUDA_WMMA_MMA_SYNC-NEXT: a_frag; +// NVCUDA_WMMA_MMA_SYNC-NEXT: dpct::experimental::matrix::joint_matrix +// NVCUDA_WMMA_MMA_SYNC-NEXT: b_frag; +// NVCUDA_WMMA_MMA_SYNC-NEXT: dpct::experimental::matrix::joint_matrix acc_frag; +// NVCUDA_WMMA_MMA_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_mad(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), a_frag.get(), b_frag.get(), acc_frag.get()); diff --git a/clang/test/dpct/query_api_mapping/test_all.cu b/clang/test/dpct/query_api_mapping/test_all.cu index d1af2a07f629..e52eff4a8f5e 100644 --- a/clang/test/dpct/query_api_mapping/test_all.cu +++ b/clang/test/dpct/query_api_mapping/test_all.cu @@ -2339,6 +2339,10 @@ // CHECK-NEXT: normcdfinv // CHECK-NEXT: normcdfinvf // CHECK-NEXT: normf +// CHECK-NEXT: nvcuda::wmma::fill_fragment +// CHECK-NEXT: nvcuda::wmma::load_matrix_sync +// CHECK-NEXT: nvcuda::wmma::mma_sync +// CHECK-NEXT: nvcuda::wmma::store_matrix_sync // CHECK-NEXT: nvshmem_align // CHECK-NEXT: nvshmem_calloc // CHECK-NEXT: nvshmem_finalize From d6834387c1b0c6f99410690165231083ef40f999 Mon Sep 17 00:00:00 2001 From: Teja Alaghari Date: Sat, 5 Jul 2025 14:22:04 +0800 Subject: [PATCH 2/3] Changed type comment --- .../Runtime/nvcuda$$wmma$$fill_fragment.cu | 5 ++--- .../Runtime/nvcuda$$wmma$$load_matrix_sync.cu | 4 ++-- .../DPCT/Runtime/nvcuda$$wmma$$mma_sync.cu | 4 +--- .../nvcuda$$wmma$$store_matrix_sync.cu | 8 +++---- .../query_api_mapping/Runtime/test_wmma.cu | 21 ++++++++----------- 5 files changed, 18 insertions(+), 24 deletions(-) diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu index 09855183656e..4d68a7c2706b 100644 --- a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu @@ -1,10 +1,9 @@ // Option: --use-experimental-features=matrix #include -__global__ void test() { +__global__ void test(float val) { // Start nvcuda::wmma::fragment acc_frag; - nvcuda::wmma::fill_fragment(acc_frag /* type fragment */, - 1.0f /* type value */); + nvcuda::wmma::fill_fragment(acc_frag, val /*float*/); // End } diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu index 1402890e34cc..0198c4b205b4 100644 --- a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu @@ -6,7 +6,7 @@ __global__ void test(half *a, int row, int col, int lda) { nvcuda::wmma::fragment a_frag; - nvcuda::wmma::load_matrix_sync(a_frag /* type fragment */, - a + col + row * lda, lda); + nvcuda::wmma::load_matrix_sync(a_frag, a + col + row * lda /*void **/, + lda /*int*/); // End } diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$mma_sync.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$mma_sync.cu index cc7e004b775f..bf651e4d63b0 100644 --- a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$mma_sync.cu +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$mma_sync.cu @@ -10,8 +10,6 @@ __global__ void test() { nvcuda::wmma::col_major> b_frag; nvcuda::wmma::fragment acc_frag; - nvcuda::wmma::mma_sync(acc_frag /* type fragment */, - a_frag /* type fragment */, b_frag /* type fragment */, - acc_frag /* type fragment */); + nvcuda::wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); // End } diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu index 0c87f04db574..294dd8b86f31 100644 --- a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu @@ -5,10 +5,10 @@ __global__ void test(float *c, int row, int col, int ldc) { // Start nvcuda::wmma::fragment acc_frag; nvcuda::wmma::store_matrix_sync( - c + col + row * ldc, acc_frag /* type fragment */, ldc, - nvcuda::wmma::mem_col_major /* type memory order */); + c + col + row * ldc /*void **/, acc_frag, ldc /*int*/, + nvcuda::wmma::mem_col_major /*memory order*/); nvcuda::wmma::store_matrix_sync( - c + row + col * ldc, acc_frag /* type fragment */, ldc, - nvcuda::wmma::mem_row_major /* type memory order */); + c + row + col * ldc /*void **/, acc_frag, ldc /*int*/, + nvcuda::wmma::mem_row_major /*memory order*/); // End } diff --git a/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu b/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu index 0cb34a63e37d..55464437e54f 100644 --- a/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu +++ b/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu @@ -4,19 +4,18 @@ // RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::fill_fragment | FileCheck %s -check-prefix=NVCUDA_WMMA_FILL_FRAGMENT // NVCUDA_WMMA_FILL_FRAGMENT: CUDA API: // NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fragment acc_frag; -// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fill_fragment(acc_frag /* type fragment */, -// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: 1.0f /* type value */); +// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fill_fragment(acc_frag, val /*float*/); // NVCUDA_WMMA_FILL_FRAGMENT-NEXT: Is migrated to (with the option --use-experimental-features=matrix): // NVCUDA_WMMA_FILL_FRAGMENT-NEXT: dpct::experimental::matrix::joint_matrix acc_frag; -// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_fill(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), 1.0f); +// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_fill(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), val); // RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::load_matrix_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_LOAD_MATRIX_SYNC // NVCUDA_WMMA_LOAD_MATRIX_SYNC: CUDA API: // NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::fragment // NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a_frag; -// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::load_matrix_sync(a_frag /* type fragment */, -// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a + col + row * lda, lda); +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::load_matrix_sync(a_frag, a + col + row * lda /*void **/, +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: lda /*int*/); // NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix): // NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: dpct::experimental::matrix::joint_matrix // NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a_frag; @@ -26,11 +25,11 @@ // NVCUDA_WMMA_STORE_MATRIX_SYNC: CUDA API: // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::fragment acc_frag; // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::store_matrix_sync( -// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + col + row * ldc, acc_frag /* type fragment */, ldc, -// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_col_major /* type memory order */); +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + col + row * ldc /*void **/, acc_frag, ldc /*int*/, +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_col_major /*memory order*/); // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::store_matrix_sync( -// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + row + col * ldc, acc_frag /* type fragment */, ldc, -// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_row_major /* type memory order */); +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + row + col * ldc /*void **/, acc_frag, ldc /*int*/, +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_row_major /*memory order*/); // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix): // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: dpct::experimental::matrix::joint_matrix acc_frag; // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast(c + col + row * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::col_major); @@ -45,9 +44,7 @@ // NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::col_major> // NVCUDA_WMMA_MMA_SYNC-NEXT: b_frag; // NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::fragment acc_frag; -// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::mma_sync(acc_frag /* type fragment */, -// NVCUDA_WMMA_MMA_SYNC-NEXT: a_frag /* type fragment */, b_frag /* type fragment */, -// NVCUDA_WMMA_MMA_SYNC-NEXT: acc_frag /* type fragment */); +// NVCUDA_WMMA_MMA_SYNC-NEXT: nvcuda::wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag); // NVCUDA_WMMA_MMA_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix): // NVCUDA_WMMA_MMA_SYNC-NEXT: dpct::experimental::matrix::joint_matrix // NVCUDA_WMMA_MMA_SYNC-NEXT: a_frag; From 7d794e1d61828124c90086656f5480f885d8868f Mon Sep 17 00:00:00 2001 From: Teja Alaghari Date: Sat, 5 Jul 2025 14:44:49 +0800 Subject: [PATCH 3/3] Updated type info of args --- .../Runtime/nvcuda$$wmma$$fill_fragment.cu | 4 ++-- .../Runtime/nvcuda$$wmma$$load_matrix_sync.cu | 7 ++++--- .../nvcuda$$wmma$$store_matrix_sync.cu | 11 +++++----- .../query_api_mapping/Runtime/test_wmma.cu | 20 +++++++++---------- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu index 4d68a7c2706b..f2b732db91e7 100644 --- a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$fill_fragment.cu @@ -1,9 +1,9 @@ // Option: --use-experimental-features=matrix #include -__global__ void test(float val) { +template __global__ void test(T val) { // Start nvcuda::wmma::fragment acc_frag; - nvcuda::wmma::fill_fragment(acc_frag, val /*float*/); + nvcuda::wmma::fill_fragment(acc_frag, val /*const T&*/); // End } diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu index 0198c4b205b4..a93d2bf1cbe7 100644 --- a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$load_matrix_sync.cu @@ -1,12 +1,13 @@ // Option: --use-experimental-features=matrix #include -__global__ void test(half *a, int row, int col, int lda) { +template +__global__ void test(const T *a, int row, int col, unsigned lda) { // Start nvcuda::wmma::fragment a_frag; - nvcuda::wmma::load_matrix_sync(a_frag, a + col + row * lda /*void **/, - lda /*int*/); + nvcuda::wmma::load_matrix_sync(a_frag, a + col + row * lda /*const T **/, + lda /*unsigned*/); // End } diff --git a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu index 294dd8b86f31..06fff11ead9c 100644 --- a/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu +++ b/clang/examples/DPCT/Runtime/nvcuda$$wmma$$store_matrix_sync.cu @@ -1,14 +1,15 @@ // Option: --use-experimental-features=matrix #include -__global__ void test(float *c, int row, int col, int ldc) { +template +__global__ void test(const T *c, int row, int col, unsigned ldc) { // Start nvcuda::wmma::fragment acc_frag; nvcuda::wmma::store_matrix_sync( - c + col + row * ldc /*void **/, acc_frag, ldc /*int*/, - nvcuda::wmma::mem_col_major /*memory order*/); + c + col + row * ldc /*const T **/, acc_frag, ldc /*unsigned*/, + nvcuda::wmma::mem_col_major /*nvcuda::wmma::layout_t*/); nvcuda::wmma::store_matrix_sync( - c + row + col * ldc /*void **/, acc_frag, ldc /*int*/, - nvcuda::wmma::mem_row_major /*memory order*/); + c + row + col * ldc /*const T **/, acc_frag, ldc /*unsigned*/, + nvcuda::wmma::mem_row_major /*nvcuda::wmma::layout_t*/); // End } diff --git a/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu b/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu index 55464437e54f..91e988ef8e29 100644 --- a/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu +++ b/clang/test/dpct/query_api_mapping/Runtime/test_wmma.cu @@ -4,7 +4,7 @@ // RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::fill_fragment | FileCheck %s -check-prefix=NVCUDA_WMMA_FILL_FRAGMENT // NVCUDA_WMMA_FILL_FRAGMENT: CUDA API: // NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fragment acc_frag; -// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fill_fragment(acc_frag, val /*float*/); +// NVCUDA_WMMA_FILL_FRAGMENT-NEXT: nvcuda::wmma::fill_fragment(acc_frag, val /*const T&*/); // NVCUDA_WMMA_FILL_FRAGMENT-NEXT: Is migrated to (with the option --use-experimental-features=matrix): // NVCUDA_WMMA_FILL_FRAGMENT-NEXT: dpct::experimental::matrix::joint_matrix acc_frag; // NVCUDA_WMMA_FILL_FRAGMENT-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_fill(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), val); @@ -14,26 +14,26 @@ // NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::fragment // NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a_frag; -// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::load_matrix_sync(a_frag, a + col + row * lda /*void **/, -// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: lda /*int*/); +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: nvcuda::wmma::load_matrix_sync(a_frag, a + col + row * lda /*const T **/, +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: lda /*unsigned*/); // NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix): // NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: dpct::experimental::matrix::joint_matrix // NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: a_frag; -// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_load(sycl::ext::oneapi::this_work_item::get_sub_group(), a_frag.get(), sycl::address_space_cast(a + col + row * lda), lda); +// NVCUDA_WMMA_LOAD_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_load(sycl::ext::oneapi::this_work_item::get_sub_group(), a_frag.get(), sycl::address_space_cast::type>(a + col + row * lda), lda); // RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::store_matrix_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_STORE_MATRIX_SYNC // NVCUDA_WMMA_STORE_MATRIX_SYNC: CUDA API: // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::fragment acc_frag; // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::store_matrix_sync( -// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + col + row * ldc /*void **/, acc_frag, ldc /*int*/, -// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_col_major /*memory order*/); +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + col + row * ldc /*const T **/, acc_frag, ldc /*unsigned*/, +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_col_major /*nvcuda::wmma::layout_t*/); // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::store_matrix_sync( -// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + row + col * ldc /*void **/, acc_frag, ldc /*int*/, -// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_row_major /*memory order*/); +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: c + row + col * ldc /*const T **/, acc_frag, ldc /*unsigned*/, +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: nvcuda::wmma::mem_row_major /*nvcuda::wmma::layout_t*/); // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: Is migrated to (with the option --use-experimental-features=matrix): // NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: dpct::experimental::matrix::joint_matrix acc_frag; -// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast(c + col + row * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::col_major); -// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast(c + row + col * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::row_major); +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast::type>(c + col + row * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::col_major); +// NVCUDA_WMMA_STORE_MATRIX_SYNC-NEXT: sycl::ext::oneapi::experimental::matrix::joint_matrix_store(sycl::ext::oneapi::this_work_item::get_sub_group(), acc_frag.get(), sycl::address_space_cast::type>(c + row + col * ldc), ldc, sycl::ext::oneapi::experimental::matrix::layout::row_major); // RUN: dpct --cuda-include-path="%cuda-path/include" --query-api-mapping=nvcuda::wmma::mma_sync | FileCheck %s -check-prefix=NVCUDA_WMMA_MMA_SYNC // NVCUDA_WMMA_MMA_SYNC: CUDA API: