diff --git a/clang/lib/DPCT/RulesAsm/AsmMigration.cpp b/clang/lib/DPCT/RulesAsm/AsmMigration.cpp index fc9cb1be635e..2f940f84d7f9 100644 --- a/clang/lib/DPCT/RulesAsm/AsmMigration.cpp +++ b/clang/lib/DPCT/RulesAsm/AsmMigration.cpp @@ -556,10 +556,15 @@ bool SYCLGenBase::emitVectorType(const InlineAsmVectorType *T) { return SYCLGenError(); OS() << ", "; switch (T->getKind()) { + case InlineAsmVectorType::x1: + OS() << 1; + break; case InlineAsmVectorType::v2: + case InlineAsmVectorType::x2: OS() << 2; break; case InlineAsmVectorType::v4: + case InlineAsmVectorType::x4: OS() << 4; break; case InlineAsmVectorType::v8: @@ -589,9 +594,9 @@ bool SYCLGenBase::emitVariableDeclaration(const InlineAsmVarDecl *D) { bool SYCLGenBase::emitAddressExpr(const InlineAsmAddressExpr *Dst) { // Address expression only support ld/st/red & atom instructions. - if (!CurrInst || - !CurrInst->is(asmtok::op_st, asmtok::op_ld, asmtok::op_atom, - asmtok::op_prefetch, asmtok::op_red, asmtok::op_cp)) { + if (!CurrInst || !CurrInst->is(asmtok::op_st, asmtok::op_ld, asmtok::op_atom, + asmtok::op_prefetch, asmtok::op_red, + asmtok::op_cp, asmtok::op_ldmatrix)) { return SYCLGenError(); } std::string Type; @@ -624,6 +629,8 @@ bool SYCLGenBase::emitAddressExpr(const InlineAsmAddressExpr *Dst) { if (CurrInst->is(asmtok::op_prefetch, asmtok::op_red) || CanSuppressCast(Dst->getSymbol())) OS() << llvm::formatv("{0}", Reg); + else if (CurrInst->is(asmtok::op_ldmatrix)) + OS() << llvm::formatv("(uintptr_t){0}", Reg); else OS() << llvm::formatv("(({0} *)(uintptr_t){1})", Type, Reg); break; @@ -1305,6 +1312,64 @@ class SYCLGen : public SYCLGenBase { return SYCLGenSuccess(); } + bool handle_ldmatrix(const InlineAsmInstruction *Inst) override { + if (Inst->getNumInputOperands() != 1) + return SYCLGenError(); + + const auto *Type = dyn_cast(Inst->getType(0)); + + if (!Type || Type->getKind() != InlineAsmBuiltinType::b16) + return SYCLGenError(); + + const InlineAsmVectorExpr *VE; + if (VE = dyn_cast(Inst->getOutputOperand())) { + auto numOutputOperands = VE->getNumElements(); + if (Inst->hasAttr(InstAttr::x1)) { + if (numOutputOperands != 1) + return SYCLGenError(); + } else if (Inst->hasAttr(InstAttr::x2)) { + if (numOutputOperands != 2) + return SYCLGenError(); + } else if (Inst->hasAttr(InstAttr::x4)) { + if (numOutputOperands != 4) + return SYCLGenError(); + } + } else { + return SYCLGenError(); + } + + llvm::SaveAndRestore Store(CurrInst); + CurrInst = Inst; + const auto *Src = + dyn_cast_or_null(Inst->getInputOperand(0)); + if (!Src) + return false; + + OS() << MapNames::getDpctNamespace() << "experimental::matrix::ldmatrix("; + if (emitStmt(Src)) { + return SYCLGenError(); + } + for (unsigned Inst = 0; Inst != VE->getNumElements(); ++Inst) { + if (isa(VE->getElement(Inst))) + continue; + OS() << ", &"; + if (emitStmt(VE->getElement(Inst))) + return SYCLGenError(); + } + if (Inst->hasAttr(InstAttr::trans)) + OS() << ", true"; + OS() << ");"; + const auto *KernelDecl = getImmediateOuterFuncDecl(GAS); + if (KernelDecl) { + auto FuncInfo = DeviceFunctionDecl::LinkRedecls(KernelDecl); + if (FuncInfo) + FuncInfo->addSubGroupSizeRequest(32, GAS->getBeginLoc(), + DpctGlobalInfo::getSubGroup(GAS)); + } + + return SYCLGenSuccess(); + } + bool handle_prefetch(const InlineAsmInstruction *Inst) override { if (!DpctGlobalInfo::useExtPrefetch() || Inst->getNumInputOperands() != 1) return SYCLGenError(); @@ -2881,6 +2946,7 @@ class SYCLGen : public SYCLGenBase { bool handle_ld(const InlineAsmInstruction *Inst) override { if (Inst->getNumInputOperands() != 1) return SYCLGenError(); + llvm::SaveAndRestore Store(CurrInst); CurrInst = Inst; const auto *Src = diff --git a/clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h b/clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h index 1922185e50df..42cce5902f97 100644 --- a/clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h +++ b/clang/lib/DPCT/RulesAsm/Parser/AsmNodes.h @@ -116,7 +116,7 @@ class InlineAsmBuiltinType : public InlineAsmType { // This class is used for device asm vector types. class InlineAsmVectorType : public InlineAsmType { public: - enum VecKind { v2, v4, v8 }; + enum VecKind { v2, v4, v8, x1, x2, x4 }; private: VecKind Kind; diff --git a/clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp b/clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp index 8c8b7e9ff022..60e49a9c0c56 100644 --- a/clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp +++ b/clang/lib/DPCT/RulesAsm/Parser/AsmParser.cpp @@ -327,7 +327,7 @@ InlineAsmStmtResult InlineAsmParser::ParseInstruction() { if (!Tok.getIdentifier() || !Tok.getIdentifier()->isInstruction()) return AsmStmtError(); - InlineAsmIdentifierInfo *Opcode = Tok.getIdentifier(); + Opcode = Tok.getIdentifier(); ConsumeToken(); SmallVector Attrs; @@ -736,20 +736,38 @@ InlineAsmExprResult InlineAsmParser::ActOnParenExpr(InlineAsmExpr *SubExpr) { InlineAsmExprResult InlineAsmParser::ActOnVectorExpr(ArrayRef Vec) { - // Vector size must be 2, 4, or 8. + // Vector size for ldmatrix are 1, 2, 4 + // size(x) = 2 * sizeof(v). InlineAsmVectorType::VecKind Kind; - switch (Vec.size()) { - case 2: - Kind = InlineAsmVectorType::v2; - break; - case 4: - Kind = InlineAsmVectorType::v4; - break; - case 8: - Kind = InlineAsmVectorType::v8; - break; - default: - return AsmExprError(); + if (Opcode->getTokenID() == asmtok::op_ldmatrix) { + switch (Vec.size()) { + case 1: + Kind = InlineAsmVectorType::x1; + break; + case 2: + Kind = InlineAsmVectorType::x2; + break; + case 4: + Kind = InlineAsmVectorType::x4; + break; + default: + return AsmExprError(); + } + } else { + // Vector size must be 2, 4, or 8. + switch (Vec.size()) { + case 2: + Kind = InlineAsmVectorType::v2; + break; + case 4: + Kind = InlineAsmVectorType::v4; + break; + case 8: + Kind = InlineAsmVectorType::v8; + break; + default: + return AsmExprError(); + } } InlineAsmBuiltinType *ElementType = nullptr; diff --git a/clang/lib/DPCT/RulesAsm/Parser/AsmParser.h b/clang/lib/DPCT/RulesAsm/Parser/AsmParser.h index ca3196110015..8b9a3f5f01ba 100644 --- a/clang/lib/DPCT/RulesAsm/Parser/AsmParser.h +++ b/clang/lib/DPCT/RulesAsm/Parser/AsmParser.h @@ -247,6 +247,8 @@ class InlineAsmParser { }; public: + InlineAsmIdentifierInfo *Opcode; + InlineAsmParser(InlineAsmContext &Ctx, SourceMgr &Mgr) : Lexer(*Mgr.getMemoryBuffer(Mgr.getMainFileID())), Context(Ctx), SrcMgr(Mgr), CurScope(nullptr) { diff --git a/clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def b/clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def index 563d5595ec65..ea401fb0777c 100644 --- a/clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def +++ b/clang/lib/DPCT/RulesAsm/Parser/AsmTokenKinds.def @@ -274,6 +274,14 @@ MODIFIER(v2, ".v2") MODIFIER(v4, ".v4") MODIFIER(v8, ".v8") +// Matrix modifiers +MODIFIER(x1, ".x1") +MODIFIER(x2, ".x2") +MODIFIER(x4, ".x4") + +// Matrix shape +MODIFIER(m8n8, ".m8n8") + STATE_SPACE(reg, ".reg") STATE_SPACE(sreg, ".sreg") STATE_SPACE(const, ".const") @@ -420,6 +428,8 @@ MODIFIER(ecr, ".ecr") MODIFIER(rc16, ".rc16") MODIFIER(cs, ".cs") MODIFIER(to, ".to") +MODIFIER(aligned, ".aligned") +MODIFIER(trans, ".trans") #undef LINKAGE #undef TARGET diff --git a/clang/lib/DPCT/SrcAPI/APINames_ASM.inc b/clang/lib/DPCT/SrcAPI/APINames_ASM.inc index dbd07f2090c9..15c512ea27a2 100644 --- a/clang/lib/DPCT/SrcAPI/APINames_ASM.inc +++ b/clang/lib/DPCT/SrcAPI/APINames_ASM.inc @@ -75,7 +75,7 @@ ENTRY("griddepcontrol", "griddepcontrol", false, NO_FLAG, P1, "Comment") ENTRY("isspacep", "isspacep", false, NO_FLAG, P1, "Comment") ENTRY("istypep", "istypep", false, NO_FLAG, P1, "Comment") ENTRY("ld", "ld", true, NO_FLAG, P1, "Partial") -ENTRY("ldmatrix", "ldmatrix", false, NO_FLAG, P1, "Comment") +ENTRY("ldmatrix", "ldmatrix", true, NO_FLAG, P1, "Successful") ENTRY("ldu", "ldu", false, NO_FLAG, P1, "Comment") ENTRY("lg2", "lg2", true, NO_FLAG, P1, "Successful") ENTRY("lop3", "lop3", true, NO_FLAG, P1, "Successful") diff --git a/clang/runtime/dpct-rt/include/dpct/math.hpp b/clang/runtime/dpct-rt/include/dpct/math.hpp index f23ee2d8e83a..69004f702181 100644 --- a/clang/runtime/dpct-rt/include/dpct/math.hpp +++ b/clang/runtime/dpct-rt/include/dpct/math.hpp @@ -9,8 +9,8 @@ #ifndef __DPCT_MATH_HPP__ #define __DPCT_MATH_HPP__ -#include #include +#include #include #include @@ -2055,6 +2055,167 @@ class joint_matrix { matrix_accessor x; const size_t num_elements; }; + +/// Collectively loads 1 8x8 b16 (128 bytes) matrix from private memory to local +/// memory per sub-group. Requires the sub-group size of kernel calling this +/// function to be 32. +/// 'mat' specifies the matrix index to be loaded. The first '(mat + 1) * 8' +/// work items of sub-group contain the starting address of their respective +/// matrix row in 'addr'. +/// After distributing addresses to other work items, each of the 32 work items +/// load 32-bits (2 packed 16-bit data) into 'm' for a total of 128 bytes. +/// 'trans' specifies to perform a transposed/non-transposed load by each work +/// item like below +/// Row Major: Each row of the matrix is loaded by a group of 4 work items(wi) +/// row-0: wi0 wi0 wi1 wi1 ... wi3 wi3 +/// row-1: wi4 wi4 wi5 wi5 ... wi7 wi7 +/// ... +/// row-6: wi24 wi24 wi25 wi25 ... wi27 wi27 +/// row-7: wi28 wi28 wi29 wi29 ... wi31 wi31 +/// Col Major: Each col of the matrix is loaded by a group of 4 work items(wi) +/// row-0: wi0 wi4 wi8 ... wi28 +/// row-1: wi0 wi4 wi8 ... wi28 +/// ... +/// row-6: wi3 wi7 wi11 ... wi31 +/// row-7: wi3 wi7 wi11 ... wi31 +/// \tparam [in] T Type of result variable (currently only supports 16-bit type) +/// \param [in] addr The starting address of corresponding matrix row for a work +/// item in local memory +/// \param [in] m The private memory to store the matrix. It points to 2 b16 +/// type elements. +/// \param [in] trans Indicates whether the matrix to be loaded transposed +/// \param [in] mat The matrix index to be loaded +template +void ldmatrix(uintptr_t addr, T *m, bool trans = false, unsigned mat = 0) { + auto sg = sycl::ext::oneapi::this_work_item::get_sub_group(); + int lane = sg.get_local_linear_id(); + + int lane_group8_row = lane / 8; + int lane_group8_col = lane % 8; + + if (!trans) { + // calculate the source lane + int src_lane = 2 * lane_group8_row; + if (lane_group8_col >= 4) + src_lane += 1; + + // Broadcast the address from the source lane + auto recv_addr_uintp = + dpct::select_from_sub_group(sg, addr, mat * 8 + src_lane); + + // Cast the received address from uintptr_t to the type of 'm' + auto recv_addr = reinterpret_cast(recv_addr_uintp); + + // Non-transposed load + *m = recv_addr[lane_group8_col % 4]; + } else { + // calculate the source lane + int src_lane = (lane % 4) * 2; + + // Broadcast the address from the source lane + auto recv_addr_uintp_1 = + dpct::select_from_sub_group(sg, addr, mat * 8 + src_lane); + auto recv_addr_uintp_2 = + dpct::select_from_sub_group(sg, addr, mat * 8 + src_lane + 1); + + // Cast the received address from uintptr_t to 'half *' + auto recv_addr_1 = reinterpret_cast(recv_addr_uintp_1); + auto recv_addr_2 = reinterpret_cast(recv_addr_uintp_2); + + // Transposed load + int index = lane / 4; + sycl::half val0 = recv_addr_1[index]; + sycl::half val1 = recv_addr_2[index]; + + // Combine the two 16-bits into one 32-bit value + sycl::half2 val = sycl::half2(val0, val1); + *m = *reinterpret_cast(&val); + } +} + +/// Collectively loads 2 8x8 b16 (256 bytes) matrix from private memory to local +/// memory per sub-group. Requires the sub-group size of kernel calling this +/// function to be 32. +/// The first 16 work items of sub-group contain the starting address of their +/// respective matrix row in 'addr'. +/// After distributing addresses to other work items, each of the 32 work items +/// load 64-bits (32-bits per matrix) into 'm1' & 'm2' for a total of 256 bytes. +/// 'trans' specifies to perform a transposed/non-transposed load by each work +/// item like below +/// Row Major: Each row of the matrices is loaded by a group of 4 work items(wi) +/// row-0: wi0 wi0 wi1 wi1 ... wi3 wi3 +/// row-1: wi4 wi4 wi5 wi5 ... wi7 wi7 +/// ... +/// row-6: wi24 wi24 wi25 wi25 ... wi27 wi27 +/// row-7: wi28 wi28 wi29 wi29 ... wi31 wi31 +/// Col Major: Each col of the matrices is loaded by a group of 4 work items(wi) +/// row-0: wi0 wi4 wi8 ... wi28 +/// row-1: wi0 wi4 wi8 ... wi28 +/// ... +/// row-6: wi3 wi7 wi11 ... wi31 +/// row-7: wi3 wi7 wi11 ... wi31 +/// \tparam [in] T Type of result variable (currently only supports 16-bit type) +/// \param [in] addr The starting address of corresponding matrix row for a work +/// item in local memory +/// \param [in] m1 The private memory to store the data of 1st matrix. It points +/// to 2 b16 type elements. +/// \param [in] m2 The private memory to store the data of 2nd matrix. It points +/// to 2 b16 type elements. +/// \param [in] trans Indicates whether the matrix to be loaded transposed +template +void ldmatrix(uintptr_t addr, T *m1, T *m2, bool trans = false) { + // Load 1st matrix + ldmatrix(addr, m1, trans, 0); + // Load 2nd matrix + ldmatrix(addr, m2, trans, 1); +} + +/// Collectively loads 4 8x8 b16 (512 bytes) matrix from private memory to local +/// memory per sub-group. Requires the sub-group size of kernel calling this +/// function to be 32. +/// Each work item of sub-group contains the starting address of their +/// respective matrix row in 'addr'. +/// After distributing addresses to other work items, each of the 32 work items +/// load 128-bits (32-bits per matrix) into 'm1', 'm2', 'm3' & 'm4' for a total +/// of 512 bytes. +/// 'trans' specifies to perform a transposed/non-transposed load by each work +/// item like below +/// Row Major: Each row of the matrices is loaded by a group of 4 work items(wi) +/// row-0: wi0 wi0 wi1 wi1 ... wi3 wi3 +/// row-1: wi4 wi4 wi5 wi5 ... wi7 wi7 +/// ... +/// row-6: wi24 wi24 wi25 wi25 ... wi27 wi27 +/// row-7: wi28 wi28 wi29 wi29 ... wi31 wi31 +/// Col Major: Each col of the matrices is loaded by a group of 4 work items(wi) +/// row-0: wi0 wi4 wi8 ... wi28 +/// row-1: wi0 wi4 wi8 ... wi28 +/// ... +/// row-6: wi3 wi7 wi11 ... wi31 +/// row-7: wi3 wi7 wi11 ... wi31 +/// \tparam [in] T Type of result variable (currently only supports 16-bit type) +/// \param [in] addr The starting address of corresponding matrix row for a work +/// item in local memory +/// \param [in] m1 The private memory to store the data of 1st matrix. It points +/// to 2 b16 type elements. +/// \param [in] m2 The private memory to store the data of 2nd matrix. It points +/// to 2 b16 type elements. +/// \param [in] m3 The private memory to store the data of 3rd matrix. It points +/// to 2 b16 type elements. +/// \param [in] m4 The private memory to store the data of 4th matrix. It points +/// to 2 b16 type elements. +/// \param [in] trans Indicates whether the matrix to be loaded transposed +template +void ldmatrix(uintptr_t addr, T *m1, T *m2, T *m3, T *m4, bool trans = false) { + // Load 1st matrix + ldmatrix(addr, m1, trans, 0); + // Load 2nd matrix + ldmatrix(addr, m2, trans, 1); + // Load 3rd matrix + ldmatrix(addr, m3, trans, 2); + // Load 4th matrix + ldmatrix(addr, m4, trans, 3); +} + } // namespace matrix } // namespace experimental diff --git a/clang/test/dpct/asm/ldmatrix.cu b/clang/test/dpct/asm/ldmatrix.cu new file mode 100644 index 000000000000..00a04f3ef805 --- /dev/null +++ b/clang/test/dpct/asm/ldmatrix.cu @@ -0,0 +1,119 @@ +// UNSUPPORTED: cuda-8.0, cuda-9.0, cuda-9.1, cuda-9.2, cuda-10.0, cuda-10.1, cuda-10.2 +// UNSUPPORTED: v8.0, v9.0, v9.1, v9.2, v10.0, v10.1, v10.2 +// RUN: dpct --format-range=none -out-root %T/ldmatrix %s --cuda-include-path="%cuda-path/include" -- -std=c++14 -x cuda --cuda-host-only +// RUN: FileCheck %s --match-full-lines --input-file %T/ldmatrix/ldmatrix.dp.cpp +// RUN: %if build_lit %{icpx -c -DNO_BUILD_TEST -fsycl %T/ldmatrix/ldmatrix.dp.cpp -o %T/ldmatrix/ldmatrix.dp.o %} + +// clang-format off +#include +#include + +/* +ldmatrix.sync.aligned.shape.num{.trans}{.ss}.type r, [p]; + +Below are the currenly supported configurations: +.shape = {.m8n8}; +.num = {.x1, .x2, .x4}; +.ss = {.shared{::cta}}; +.type = {.b16}; +*/ + +__device__ void load_matrix_x1(void *sh_r_addr, int *r) { + // CHECK: auto addr = sh_r_addr; + uint32_t addr = static_cast(__cvta_generic_to_shared(sh_r_addr)); + + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0]); + asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0}, [%1];\n" + : "=r"(r[0]) + : "r"(addr)); +} + +__device__ void load_matrix_x2(void *sh_r_addr, int *r) { + // CHECK: auto addr = sh_r_addr; + uint32_t addr = static_cast(__cvta_generic_to_shared(sh_r_addr)); + + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], &r[1]); + asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0, %1}, [%2];\n" + : "=r"(r[0]), "=r"(r[1]) + : "r"(addr)); +} + +__device__ void load_matrix_x4(void *sh_r_addr, int *r) { + // CHECK: auto addr = sh_r_addr; + uint32_t addr = static_cast(__cvta_generic_to_shared(sh_r_addr)); + + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], &r[1], &r[2], &r[3]); + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]) + : "r"(addr)); +} + +__device__ void load_matrix_x1_trans(void *sh_r_addr, int *r) { + // CHECK: auto addr = sh_r_addr; + uint32_t addr = static_cast(__cvta_generic_to_shared(sh_r_addr)); + + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], true); + asm volatile("ldmatrix.sync.aligned.m8n8.x1.trans.shared.b16 {%0}, [%1];\n" + : "=r"(r[0]) + : "r"(addr)); +} + +__device__ void load_matrix_x2_trans(void *sh_r_addr, int *r) { + // CHECK: auto addr = sh_r_addr; + uint32_t addr = static_cast(__cvta_generic_to_shared(sh_r_addr)); + + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], &r[1], true); + asm volatile("ldmatrix.sync.aligned.m8n8.x2.trans.shared.b16 {%0, %1}, [%2];\n" + : "=r"(r[0]), "=r"(r[1]) + : "r"(addr)); +} + +__device__ void load_matrix_x4_trans(void *sh_r_addr, int *r) { + // CHECK: auto addr = sh_r_addr; + uint32_t addr = static_cast(__cvta_generic_to_shared(sh_r_addr)); + + // CHECK: dpct::experimental::matrix::ldmatrix((uintptr_t)addr, &r[0], &r[1], &r[2], &r[3], true); + asm volatile("ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 {%0, %1, %2, %3}, [%4];\n" + : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]), "=r"(r[3]) + : "r"(addr)); +} + +__global__ void load_kernel() { + __shared__ half s_data[1024]; + int r[4]; + + load_matrix_x1(s_data, r); + load_matrix_x2(s_data, r); + load_matrix_x4(s_data, r); + load_matrix_x1_trans(s_data, r); + load_matrix_x2_trans(s_data, r); + load_matrix_x4_trans(s_data, r); +} + +int main () { + // CHECK: [=](sycl::nd_item<3> item_ct1) {{\[\[}}sycl::reqd_sub_group_size(32){{\]\]}} { + load_kernel<<<1, 32>>>(); + + return 0; +} + +#ifndef NO_BUILD_TEST +__device__ void test_xn(uint32_t addr, int *r) { + // CHECK: DPCT1053:{{.*}}: Migration of device assembly code is not supported. + asm volatile("ldmatrix.sync.aligned.m8n8.x1.shared.b16 {%0, %1}, [%2];\n" + : "=r"(r[0]), "=r"(r[1]) + : "r"(addr)); + + // CHECK: DPCT1053:{{.*}}: Migration of device assembly code is not supported. + asm volatile("ldmatrix.sync.aligned.m8n8.x2.shared.b16 {%0}, [%0];\n" + : + : "r"(addr)); + + // CHECK: DPCT1053:{{.*}}: Migration of device assembly code is not supported. + asm volatile("ldmatrix.sync.aligned.m8n8.x4.shared.b16 {%0, %1, %2}, [%3];\n" + : "=r"(r[0]), "=r"(r[1]), "=r"(r[2]) + : "r"(addr)); +} +#endif // NO_BUILD_TEST + +// clang-format on diff --git a/docs/dev_guide/api-mapping-status/ASM_API_migration_status.csv b/docs/dev_guide/api-mapping-status/ASM_API_migration_status.csv index 0cd876f76810..2f45259b90e7 100644 --- a/docs/dev_guide/api-mapping-status/ASM_API_migration_status.csv +++ b/docs/dev_guide/api-mapping-status/ASM_API_migration_status.csv @@ -41,7 +41,7 @@ griddepcontrol,NO, isspacep,NO, istypep,NO, ld,YES, Partial -ldmatrix,NO, +ldmatrix,YES,Partial ldu,NO, lg2,YES, lop3,YES,