Changed the interface to accept void *

TejaX-Alaghari · TejaX-Alaghari · commit 127db931d652 · 2025-05-07T17:24:03.000+08:00
diff --git a/clang/lib/DPCT/RulesAsm/AsmMigration.cpp b/clang/lib/DPCT/RulesAsm/AsmMigration.cpp
@@ -1514,13 +1514,12 @@ class SYCLGen : public SYCLGenBase {
     OS() << MapNames::getDpctNamespace() << "experimental::matrix::mma";
     OS() << "<";
     OS() << M << ", " << N << ", " << K << ", ";
-    OS() << ABType;
+    OS() << ABType << ", " << InMatrixType[0] << ", " << InMatrixType[2];
     OS() << ">(";
 
-    OS() << "DMatrix_ct1";
+    OS() << "reinterpret_cast<void **>(DMatrix_ct1)";
     for (int i = 0; i < 3; i++)
-      OS() << ", reinterpret_cast<" << InMatrixType[i] << " *>(&"
-           << InMatrixName[i] << "Matrix_ct1)";
+      OS() << ", &" << InMatrixName[i] << "Matrix_ct1";
     OS() << ")";
     endstmt();
     OS() << "}";
diff --git a/clang/runtime/dpct-rt/include/dpct/math.hpp b/clang/runtime/dpct-rt/include/dpct/math.hpp
@@ -2227,25 +2227,33 @@ void ldmatrix(uintptr_t addr, T *m1, T *m2, T *m3, T *m4, bool trans = false) {
 /// \tparam [in] MulType The type used to multiply A and B matrix elements as
 /// \tparam [in] ABType The type of the input matrix (A & B) elements
 /// \tparam [in] CDType The type of the output matrix (C & D) elements
-/// \param [out] d The elements of the output D matrix to store the result to
-/// \param [in] a The elements of the input A matrix to be multiplied with B
+/// \param [out] d_mat The elements of the output D matrix to store the result
+/// of A * B + C
+/// \param [in] a_mat The elements of the input A matrix to be multiplied with B
 /// matrix elements
-/// \param [in] b The elements of the input B matrix to be multiplied with A
+/// \param [in] b_mat The elements of the input B matrix to be multiplied with A
 /// matrix elements
-/// \param [in] c The elements of the input C matrix to be added with the result
-/// of A * B
+/// \param [in] c_mat The elements of the input C matrix to be added with the
+/// result of A * B
 template <int M, int N, int K, typename MulType, typename ABType,
           typename CDType>
-void mma(CDType **d, ABType *a, ABType *b, CDType *c) {
+void mma(void **d_mat, void *a_mat, void *b_mat, void *c_mat) {
+  auto d = reinterpret_cast<CDType **>(d_mat);
+  auto a = reinterpret_cast<ABType *>(a_mat);
+  auto b = reinterpret_cast<ABType *>(b_mat);
+  auto c = reinterpret_cast<CDType *>(c_mat);
+
   auto sg = sycl::ext::oneapi::this_work_item::get_sub_group();
   int lane = sg.get_local_linear_id();
 
+  static_assert(M == 16 && N == 8 && K == 16,
+                "Only m16n8k16 shape is supported!");
+
   short ROW_LOAD_OFFSET = 4 * (lane >> 2);
   short COL_LOAD_OFFSET = 8 * (lane % 4);
 
   if constexpr (M == 16 && N == 8 && K == 16) {
     if constexpr (std::is_floating_point_v<CDType>) {
-      // f32.f16.f16.f32
       for (int i = 0; i < 4; i++) {
         ABType recv_a[4], recv_b[4];
 
@@ -2278,7 +2286,6 @@ void mma(CDType **d, ABType *a, ABType *b, CDType *c) {
       *d[2] = c[2];
       *d[3] = c[3];
     } else if constexpr (std::is_integral_v<MulType>) {
-      // s32.s8.s8.s32
       for (int i = 0; i < 4; i++) {
         ABType recv_a[2], recv_b[2];
 
diff --git a/clang/test/dpct/asm/mma.cu b/clang/test/dpct/asm/mma.cu
@@ -27,7 +27,7 @@ __global__ void mma_kernel_m16n8k16(int *a, int *b, int *c, float *fc, int *d) {
   // CHECK-NEXT:   sycl::vec<int32_t, 4> AMatrix_ct1(a[0], a[1], a[2], a[3]);
   // CHECK-NEXT:   sycl::vec<int32_t, 2> BMatrix_ct1(b[0], b[1]);
   // CHECK-NEXT:   sycl::vec<float, 4> CMatrix_ct1(fc[0], fc[1], fc[2], fc[3]);
-  // CHECK-NEXT:   dpct::experimental::matrix::mma<16, 8, 16, sycl::half>(DMatrix_ct1, reinterpret_cast<int32_t *>(&AMatrix_ct1), reinterpret_cast<int32_t *>(&BMatrix_ct1), reinterpret_cast<float *>(&CMatrix_ct1));
+  // CHECK-NEXT:   dpct::experimental::matrix::mma<16, 8, 16, sycl::half, int32_t, float>(reinterpret_cast<void **>(DMatrix_ct1), &AMatrix_ct1, &BMatrix_ct1, &CMatrix_ct1);
   // CHECK-NEXT: }
   asm("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
         " { %0, %1, %2, %3 }, "
@@ -43,7 +43,7 @@ __global__ void mma_kernel_m16n8k16(int *a, int *b, int *c, float *fc, int *d) {
   // CHECK-NEXT:   sycl::vec<int32_t, 2> AMatrix_ct1(a[0], a[1]);
   // CHECK-NEXT:   sycl::vec<int32_t, 1> BMatrix_ct1(b[0]);
   // CHECK-NEXT:   sycl::vec<int32_t, 4> CMatrix_ct1(c[0], c[1], c[2], c[3]);
-  // CHECK-NEXT:   dpct::experimental::matrix::mma<16, 8, 16, int8_t>(DMatrix_ct1, reinterpret_cast<int32_t *>(&AMatrix_ct1), reinterpret_cast<int32_t *>(&BMatrix_ct1), reinterpret_cast<int32_t *>(&CMatrix_ct1));
+  // CHECK-NEXT:   dpct::experimental::matrix::mma<16, 8, 16, int8_t, int32_t, int32_t>(reinterpret_cast<void **>(DMatrix_ct1), &AMatrix_ct1, &BMatrix_ct1, &CMatrix_ct1);
   // CHECK-NEXT: }
   asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 "
       " { %0, %1, %2, %3 }, "