Skip to content

Commit 5b5a04a

Browse files
[CUDA] Add sm_121/Blackwell to known target
Add an initial NVIDIA GB10 / sm_121 CUDA target description. The CUDA execution limits are based on local cudaDeviceProp results from an sm_121 device. Existing NVIDIA MMA ops are reused as a conservative baseline until Blackwell-specific MMA intrinsics are modeled. Signed-off-by: Charlie-Tsai1123 <charlie1123tsai@gmail.com>
1 parent 2098bbe commit 5b5a04a

1 file changed

Lines changed: 31 additions & 0 deletions

File tree

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/KnownTargets.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -913,6 +913,32 @@ StringRef normalizeARMGPUTarget(StringRef target) {
913913
// cooperative matrix layouts are opaque. We need to create NVIDIA specific WMMA
914914
// intrinsics if we need to have explicit layout analysis and register mapping.
915915

916+
// Reports initial NVIDIA Blackwell 12.1 target capabilities for GPU target
917+
// selection. CUDA execution limits are based on sm_121 GB10 device properties.
//
// Returns a pointer to a function-local static WgpDetails table. The MMA list
// below contains only pre-existing NV_MMA_SYNC / NV_WMMA intrinsics; they are
// reused as a conservative baseline until Blackwell-specific MMA intrinsics
// are modeled.
918+
const WgpDetails *getBlackwellWgpDetails() {
919+
// MMA intrinsics advertised for this target (mma.sync 16x8x16 and WMMA
// 16x16x16 variants, per the enumerator names).
static const MMAIntrinsic mmaOps[] = {
920+
MMAIntrinsic::NV_MMA_SYNC_F32_16x8x16_F16,
921+
MMAIntrinsic::NV_MMA_SYNC_F16_16x8x16_F16,
922+
MMAIntrinsic::NV_MMA_SYNC_F32_16x8x16_BF16,
923+
MMAIntrinsic::NV_WMMA_F32_16x16x16_F16,
924+
MMAIntrinsic::NV_WMMA_F16_16x16x16_F16,
925+
};
926+
// Positional brace initializer: values are matched to WgpDetails fields by
// order, so consult the WgpDetails declaration (not shown here) before adding,
// removing, or reordering entries.
static const WgpDetails blackwellWgp = {allComputeBits,
927+
allStorageBits,
928+
allSubgroupOps,
929+
allDotProductOps,
930+
// Count and pointer for the mmaOps table defined above.
std::size(mmaOps),
931+
mmaOps,
932+
// NOTE(review): a second count/pointer pair left empty (0 / nullptr) --
// presumably another intrinsic list that is not populated for this target;
// confirm against WgpDetails.
0,
933+
nullptr,
934+
// NOTE(review): the remaining values look like the CUDA execution limits
// taken from sm_121 device properties -- presumably subgroup size choices,
// max workgroup dimensions, max threads per workgroup, max shared memory
// (99 * 1024), and max grid dimensions; confirm each against WgpDetails.
{32, 32},
935+
{1024, 1024, 64},
936+
1024,
937+
99 * 1024,
938+
{0x7fffffff, 0xffff, 0xffff}};
939+
// The table has static storage duration, so the returned pointer remains valid
// after this function returns.
return &blackwellWgp;
940+
}
941+
916942
// Reports Ampere-class NVIDIA tensor core capabilities for GPU target
917943
// selection.
918944
const WgpDetails *getAmpereWgpDetails() {
@@ -1000,6 +1026,7 @@ const WgpDetails *getPascalWgpDetails() {
10001026

10011027
// Maps NVIDIA target aliases to the GPU capability model used by codegen.
10021028
std::optional<TargetDetails> getNVIDIAGPUTargetDetails(StringRef target) {
1029+
const WgpDetails *blackwellWgp = getBlackwellWgpDetails();
10031030
const WgpDetails *ampereWgp = getAmpereWgpDetails();
10041031
const WgpDetails *turingWgp = getTuringWgpDetails();
10051032
const WgpDetails *voltaWgp = getVoltaWgpDetails();
@@ -1038,6 +1065,10 @@ std::optional<TargetDetails> getNVIDIAGPUTargetDetails(StringRef target) {
10381065
.Case("rtx3070ti", TargetDetails{ampereWgp, &rtx3070tiChip})
10391066
// https://www.techpowerup.com/gpu-specs/geforce-rtx-3070.c3674
10401067
.Case("rtx3070", TargetDetails{ampereWgp, &rtx3070Chip})
1068+
// Initial support for sm_121 / GB10. Other Blackwell compute
1069+
// capabilities, including sm_120, are intentionally left for follow-up
1070+
// validation.
1071+
.Case("sm_121", TargetDetails{blackwellWgp, nullptr})
10411072
.Cases({"ada", "sm_89"}, TargetDetails{ampereWgp, nullptr})
10421073
.Cases({"ampere", "sm_80", "sm_86", "sm_87"},
10431074
TargetDetails{ampereWgp, nullptr})

0 commit comments

Comments
 (0)