Added new type logic for A & B matrix elements

TejaX-Alaghari · TejaX-Alaghari · commit 6c61f6d047c9 · 2025-05-07T16:29:20.000+08:00
diff --git a/clang/lib/DPCT/RulesAsm/AsmMigration.cpp b/clang/lib/DPCT/RulesAsm/AsmMigration.cpp
@@ -1390,7 +1390,7 @@ class SYCLGen : public SYCLGenBase {
     if (Inst->getAttr(3) != InstAttr::row || Inst->getAttr(4) != InstAttr::col)
       return SYCLGenError();
 
-    // Only f16 type is supported for A and B matrix data
+    // Data types of D, A, B & C matrices respectively in the PTX instruction
     const auto *DType = dyn_cast<InlineAsmBuiltinType>(Inst->getType(0));
     const auto *AType = dyn_cast<InlineAsmBuiltinType>(Inst->getType(1));
     const auto *BType = dyn_cast<InlineAsmBuiltinType>(Inst->getType(2));
@@ -1418,15 +1418,18 @@ class SYCLGen : public SYCLGenBase {
     // Sizes of A & B matrices
     std::string M, N, K;
 
-    // Data type used to multiply A & B matrices
-    std::string MulType;
+    // Data types of A, B & C matrices respectively in the PTX arguments
+    std::string InMatrixType[3];
+
     if (Inst->hasAttr(InstAttr::m16n8k16)) {
       M = "16";
       N = "8";
       K = "16";
+
       // Only f16/s8 types are supported for A and B matrices of m16n8k16
       if (AType->getKind() == InlineAsmBuiltinType::f16) {
-        MulType = "sycl::half";
+        InMatrixType[0] = "int32_t"; // A type is .f16x2
+        InMatrixType[1] = "int32_t"; // B type is .f16x2
 
         // If A matrix type is f16, then C&D matrix types can only be f32
         if (CType->getKind() == InlineAsmBuiltinType::f32) {
@@ -1437,7 +1440,8 @@ class SYCLGen : public SYCLGenBase {
         } else
           return SYCLGenError();
       } else if (AType->getKind() == InlineAsmBuiltinType::s8) {
-        MulType = "int8_t";
+        InMatrixType[0] = "int32_t"; // A type is .s8x4
+        InMatrixType[1] = "int32_t"; // B type is .s8x4
 
         // If A matrix type is s8, then C&D matrix types can only be s32
         if (CType->getKind() == InlineAsmBuiltinType::s32) {
@@ -1452,6 +1456,8 @@ class SYCLGen : public SYCLGenBase {
     } else
       return SYCLGenError();
 
+    InMatrixType[2] = CDType;
+
     // Check the register sizes for vector elements of A, B, C & D matrices
     for (unsigned InputOp = 0; InputOp < Inst->getNumInputOperands();
          InputOp++) {
@@ -1465,13 +1471,9 @@ class SYCLGen : public SYCLGenBase {
     if (DMatVE->getNumElements() != NumVecElements[3])
       return SYCLGenError();
 
-    OS() << MapNames::getDpctNamespace() << "experimental::matrix::mma";
-    OS() << "<";
-    OS() << M << ", " << N << ", " << K << ", ";
-    OS() << MulType;
-    OS() << ">(";
-
-    // Add D matrix address values to store the MAD result
+    // Declare and init an array for storing the addresses of D matrix elements
+    OS() << "{\n";
+    OS() << CDType << " *DMatrix_ct1[" << DMatVE->getNumElements() << "] = { ";
     for (unsigned Inst = 0; Inst != DMatVE->getNumElements(); ++Inst) {
       if (isa<InlineAsmDiscardExpr>(DMatVE->getElement(Inst)))
         continue;
@@ -1481,25 +1483,44 @@ class SYCLGen : public SYCLGenBase {
       if ((Inst + 1) != DMatVE->getNumElements())
         OS() << ", ";
     }
+    OS() << " }";
+    endstmt();
 
-    // Add A, B & C matrix values to compute MAD
+    // Declare and init vectors for storing the values of A, B & C matrix elements
+    std::string InMatrixName[3] = {"A", "B", "C"};
     for (unsigned InputOp = 0; InputOp < Inst->getNumInputOperands();
          InputOp++) {
       if (auto VE =
               dyn_cast<InlineAsmVectorExpr>(Inst->getInputOperand(InputOp))) {
+        OS() << "sycl::vec<" << InMatrixType[InputOp] << ", " << VE->getNumElements() << "> " << InMatrixName[InputOp] << "Matrix_ct1(";
         for (unsigned Inst = 0; Inst != VE->getNumElements(); ++Inst) {
           if (isa<InlineAsmDiscardExpr>(VE->getElement(Inst)))
             continue;
-          OS() << ", ";
           if (emitStmt(VE->getElement(Inst)))
             return SYCLGenError();
+          if ((Inst + 1) != VE->getNumElements())
+            OS() << ", ";
         }
+        OS() << ")";
+        endstmt();
       } else {
         return SYCLGenError();
       }
     }
 
-    OS() << ");";
+    OS() << MapNames::getDpctNamespace() << "experimental::matrix::mma";
+    OS() << "<";
+    OS() << M << ", " << N << ", " << K << ", ";
+    OS() << ABType;
+    OS() << ">(";
+
+    OS() << "DMatrix_ct1";
+    for (int i = 0; i < 3; i++)
+      OS() << ", reinterpret_cast<" << InMatrixType[i] << " *>(&" << InMatrixName[i] << "Matrix_ct1)";
+    OS() << ")";
+    endstmt();
+    OS() << "}";
+    endstmt();
 
     const auto *KernelDecl = getImmediateOuterFuncDecl(GAS);
     if (KernelDecl) {
diff --git a/clang/runtime/dpct-rt/include/dpct/math.hpp b/clang/runtime/dpct-rt/include/dpct/math.hpp
@@ -2218,6 +2218,22 @@ void ldmatrix(uintptr_t addr, T *m1, T *m2, T *m3, T *m4, bool trans = false) {
   ldmatrix(addr, m4, trans, 3);
 }
 
+/// Multiplies 2 matrices (A & B), adds the C matrix to the product and stores
+/// the result in the D matrix (D = A * B + C). Requires the sub-group size of
+/// the kernel calling this function to be 32.
+/// \tparam [in] M The rows of A, C & D matrices
+/// \tparam [in] N The columns of B, C & D matrices
+/// \tparam [in] K The columns & rows of A & B matrices respectively
+/// \tparam [in] MulType The type used to multiply A and B matrix elements
+/// \tparam [in] ABType The type of the input matrix (A & B) elements
+/// \tparam [in] CDType The type of the output matrix (C & D) elements
+/// \param [out] d The addresses of the output D matrix elements to store to
+/// \param [in] a The elements of the input A matrix to be multiplied with B
+/// matrix elements
+/// \param [in] b The elements of the input B matrix to be multiplied with A
+/// matrix elements
+/// \param [in] c The elements of the input C matrix to be added with the result
+/// of A * B
 template <int M, int N, int K, typename MulType, typename ABType,
           typename CDType, typename Op = sycl::bit_and<>>
 void mma(CDType **d, ABType *a, ABType *b, CDType *c, Op op = Op{}) {
@@ -2228,12 +2244,8 @@ void mma(CDType **d, ABType *a, ABType *b, CDType *c, Op op = Op{}) {
   short COL_LOAD_OFFSET = 8 * (lane % 4);
 
   if (M == 16 && N == 8 && K == 16) {
-    if constexpr (std::is_same_v<CDType, sycl::half>) {
+    if constexpr (std::is_floating_point_v<CDType>) {
       // f32.f16.f16.f32
-      auto c_h = reinterpret_cast<MulType *>(c);
-
-      float c_f[4] = {c_h[0], c_h[1], c_h[2], c_h[3]};
-
       for (int i = 0; i < 4; i++) {
         ABType recv_a[4], recv_b[4];
 
@@ -2245,50 +2257,45 @@ void mma(CDType **d, ABType *a, ABType *b, CDType *c, Op op = Op{}) {
         recv_b[0] = dpct::select_from_sub_group(sg, b[0], COL_LOAD_OFFSET + i);
         recv_b[1] = dpct::select_from_sub_group(sg, b[1], COL_LOAD_OFFSET + i);
         recv_b[2] =
-            dpct::select_from_sub_group(sg, b[0], COL_LOAD_OFFSET + i + 4);
+            dpct::select_from_sub_group(sg, b[0], COL_LOAD_OFFSET + 4 + i);
         recv_b[3] =
-            dpct::select_from_sub_group(sg, b[1], COL_LOAD_OFFSET + i + 4);
+            dpct::select_from_sub_group(sg, b[1], COL_LOAD_OFFSET + 4 + i);
 
         auto ra = reinterpret_cast<MulType *>(recv_a);
         auto rb = reinterpret_cast<MulType *>(recv_b);
 
         for (int j = 0; j < 4; j++) {
-          c_f[0] += static_cast<float>(ra[j]) * static_cast<float>(rb[j]);
-          c_f[1] += static_cast<float>(ra[j]) * static_cast<float>(rb[j + 4]);
-          c_f[2] += static_cast<float>(ra[j + 4]) * static_cast<float>(rb[j]);
-          c_f[3] +=
-              static_cast<float>(ra[j + 4]) * static_cast<float>(rb[j + 4]);
+          c[0] += static_cast<CDType>(ra[j]) * static_cast<CDType>(rb[j]);
+          c[1] += static_cast<CDType>(ra[j]) * static_cast<CDType>(rb[j + 4]);
+          c[2] += static_cast<CDType>(ra[j + 4]) * static_cast<CDType>(rb[j]);
+          c[3] +=
+              static_cast<CDType>(ra[j + 4]) * static_cast<CDType>(rb[j + 4]);
         }
       }
 
-      c_h[0] = c_f[0];
-      c_h[1] = c_f[1];
-      c_h[2] = c_f[2];
-      c_h[3] = c_f[3];
-
       *d[0] = c[0];
       *d[1] = c[1];
+      *d[2] = c[2];
+      *d[3] = c[3];
     } else if constexpr (std::is_integral_v<MulType>) {
       // s32.s8.s8.s32
-      ABType recv_a[4 * 2], recv_b[4 * 2];
-
       for (int i = 0; i < 4; i++) {
-        recv_a[i] = dpct::select_from_sub_group(sg, a[0], ROW_LOAD_OFFSET + i);
-        recv_a[i + 4] =
-            dpct::select_from_sub_group(sg, a[1], ROW_LOAD_OFFSET + i);
+        ABType recv_a[2], recv_b[2];
 
-        recv_b[i] = dpct::select_from_sub_group(sg, b[0], COL_LOAD_OFFSET + i);
-        recv_b[i + 4] =
+        recv_a[0] = dpct::select_from_sub_group(sg, a[0], ROW_LOAD_OFFSET + i);
+        recv_a[1] = dpct::select_from_sub_group(sg, a[1], ROW_LOAD_OFFSET + i);
+        recv_b[0] = dpct::select_from_sub_group(sg, b[0], COL_LOAD_OFFSET + i);
+        recv_b[1] =
             dpct::select_from_sub_group(sg, b[0], COL_LOAD_OFFSET + i + 4);
-      }
 
-      MulType *a = reinterpret_cast<MulType *>(recv_a);
-      MulType *b = reinterpret_cast<MulType *>(recv_b);
-      for (int i = 0; i < 16; i++) {
-        c[0] += a[i] * b[i];
-        c[1] += a[i] * b[i + 16];
-        c[2] += a[i + 16] * b[i];
-        c[3] += a[i + 16] * b[i + 16];
+        auto ra = reinterpret_cast<MulType *>(recv_a);
+        auto rb = reinterpret_cast<MulType *>(recv_b);
+        for (int i = 0; i < 4; i++) {
+          c[0] += ra[i] * rb[i];
+          c[1] += ra[i] * rb[i + 4];
+          c[2] += ra[i + 4] * rb[i];
+          c[3] += ra[i + 4] * rb[i + 4];
+        }
       }
 
       *d[0] = c[0];
diff --git a/clang/test/dpct/asm/mma.cu b/clang/test/dpct/asm/mma.cu
@@ -14,15 +14,21 @@ As per PTX ASM 8.1, below is the status of supported configurations
 ---------     ---------   ----------   -----------   -------------
 | Shape |     |   A   |   |    B   |   |  C / D  |   | Supported |
 ---------     ---------   ----------   -----------   -------------
-m16n8k16     .f16/.bf16   .f16/.bf16    .f16/.f32        Partial (.f16.f16.f16.f16 / .f32.f16.f16.f32)
-              .s8/.u8      .s8/.u8        .s32           Yes
+m16n8k16        .f16         .f16       .f16/.f32        Yes
+                .s8          .s8          .s32           Yes
 
 A Layout: row
 B Layout: col
 */
 
 __global__ void mma_kernel_m16n8k16(int *a, int *b, int *c, float *fc, int *d) {
-  // CHECK: dpct::experimental::matrix::mma<16, 8, 16, sycl::half>(&fc[0], &fc[1], &fc[2], &fc[3], a[0], a[1], a[2], a[3], b[0], b[1], fc[0], fc[1], fc[2], fc[3]);
+  // CHECK: {
+  // CHECK-NEXT:   float *DMatrix_ct1[4] = { &fc[0], &fc[1], &fc[2], &fc[3] };
+  // CHECK-NEXT:   sycl::vec<int32_t, 4> AMatrix_ct1(a[0], a[1], a[2], a[3]);
+  // CHECK-NEXT:   sycl::vec<int32_t, 2> BMatrix_ct1(b[0], b[1]);
+  // CHECK-NEXT:   sycl::vec<float, 4> CMatrix_ct1(fc[0], fc[1], fc[2], fc[3]);
+  // CHECK-NEXT:   dpct::experimental::matrix::mma<16, 8, 16, sycl::half>(DMatrix_ct1, reinterpret_cast<int32_t *>(&AMatrix_ct1), reinterpret_cast<int32_t *>(&BMatrix_ct1), reinterpret_cast<float *>(&CMatrix_ct1));
+  // CHECK-NEXT: }
   asm("mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 "
         " { %0, %1, %2, %3 }, "
         " { %4, %5, %6, %7 }, "
@@ -32,7 +38,13 @@ __global__ void mma_kernel_m16n8k16(int *a, int *b, int *c, float *fc, int *d) {
         : "r"(a[0]), "r"(a[1]), "r"(a[2]), "r"(a[3]),
           "r"(b[0]), "r"(b[1]));
 
-  // CHECK: dpct::experimental::matrix::mma<16, 8, 16, int8_t>(&d[0], &d[1], &d[2], &d[3], a[0], a[1], b[0], c[0], c[1], c[2], c[3]);
+  // CHECK: {
+  // CHECK-NEXT:   int32_t *DMatrix_ct1[4] = { &d[0], &d[1], &d[2], &d[3] };
+  // CHECK-NEXT:   sycl::vec<int32_t, 2> AMatrix_ct1(a[0], a[1]);
+  // CHECK-NEXT:   sycl::vec<int32_t, 1> BMatrix_ct1(b[0]);
+  // CHECK-NEXT:   sycl::vec<int32_t, 4> CMatrix_ct1(c[0], c[1], c[2], c[3]);
+  // CHECK-NEXT:   dpct::experimental::matrix::mma<16, 8, 16, int8_t>(DMatrix_ct1, reinterpret_cast<int32_t *>(&AMatrix_ct1), reinterpret_cast<int32_t *>(&BMatrix_ct1), reinterpret_cast<int32_t *>(&CMatrix_ct1));
+  // CHECK-NEXT: }
   asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 "
       " { %0, %1, %2, %3 }, "
       " { %4, %5 }, "