NVIDIA
diff --git a/‎cpp/include/tensorrt_llm/common/quantization.h‎
Lines changed: 10 additions & 0 deletions b/‎cpp/include/tensorrt_llm/common/quantization.h‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp‎
Lines changed: 14 additions & 4 deletions b/‎cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp‎
Lines changed: 14 additions & 4 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h‎
Lines changed: 15 additions & 4 deletions b/‎cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h‎
Lines changed: 15 additions & 4 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu‎
Lines changed: 1 addition & 0 deletions b/‎cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu‎
Lines changed: 1 addition & 0 deletions b/‎cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu‎
Lines changed: 1 addition & 0 deletions b/‎cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h‎
Lines changed: 107 additions & 2 deletions b/‎cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h‎
Lines changed: 107 additions & 2 deletions
@@ -134,6 +134,11 @@ class QuantMode
         return QuantMode(BaseType(1u) << 16);
     }
 
+    // Quantization-mode flag for MXFP8: a mask with only bit 17 set.
+    static constexpr QuantMode mxfp8() noexcept
+    {
+        constexpr BaseType kMxfp8Bit = BaseType(1u) << 17;
+        return QuantMode(kMxfp8Bit);
+    }
+
     constexpr BaseType value() const noexcept
     {
         return mValue;
@@ -224,6 +229,11 @@ class QuantMode
         return isSet(w4a16Mxfp4());
     }
 
+    // Reports whether the flag returned by mxfp8() is set in this mode, as decided by isSet().
+    constexpr bool hasMxfp8() const noexcept
+    {
+        QuantMode const mxfp8Flag = mxfp8();
+        return isSet(mxfp8Flag);
+    }
+
     constexpr bool hasKvCacheQuant() const noexcept
     {
         return hasInt8KvCache() || hasFp8KvCache() || hasFp4KvCache();
 
@@ -449,10 +449,20 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm100(
     CutlassGemmConfig::CandidateConfigTypeParam const config, int sm)
 {
 #ifdef FAST_BUILD
-    // Fast build disables all configs except this one for SM100
-    return {CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x128x128B, MainloopScheduleType::AUTO,
-        EpilogueScheduleType::TMA, ClusterShape::ClusterShape_1x1x1, ClusterShape::Undefined, ClusterShape::Undefined,
-        sm}};
+    // Fast build limits the candidate set to a single CTA tile shape but
+    // keeps both 1SM (cluster 1x1x1) and 2SM (cluster 2x1x1) variants so
+    // the autotuner can profile both. Block-scaled paths (MXFP8xMXFP8,
+    // NVFP4) accept both; the 2SM variant is required as a candidate so
+    // FAST_BUILD doesn't accidentally exclude all 2SM kernels (needed for
+    // MMA M=256 configurations of the Mxf8f6f4 tensor-op).
+    return {
+        CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x128x128B, MainloopScheduleType::AUTO,
+            EpilogueScheduleType::TMA, ClusterShape::ClusterShape_1x1x1, ClusterShape::Undefined,
+            ClusterShape::Undefined, sm},
+        CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x128x128B, MainloopScheduleType::AUTO,
+            EpilogueScheduleType::TMA, ClusterShape::ClusterShape_2x1x1, ClusterShape::Undefined,
+            ClusterShape::Undefined, sm},
+    };
 #else
     if (config & CutlassGemmConfig::GROUPED_GEMM)
     {
 
@@ -32,11 +32,22 @@ template <class ArchTag, class TileShape, class ClusterShape, bool DYNAMIC_CGA,
 struct should_filter_tma_warp_specialized_gemm_problem_shape
 {
 #ifdef FAST_BUILD
-    using SupportedCtaShape = cute::Shape<cute::_128, cute::_128, decltype(cute::get<2>(TileShape{}))>;
-    using SupportedCgaShape = cute::Shape<cute::_1, cute::_1, cute::_1>;
+    // The launcher passes its MMA tile shape here, which is CTA_M *
+    // (Is2SM ? 2 : 1). So a CTA_M=128 with cluster_M=2 (2SM mode) lands as
+    // MmaTileShape M==256. We accept both M=128 (1SM) and M=256 (2SM) under
+    // FAST_BUILD so MXFP8xMXFP8 grouped MoE (which requires the Mxf8f6f4
+    // tensor-op's MMA M==256) is reachable. The 1SM variant is also kept
+    // for per-tensor FP8 / BF16 paths.
+    using SupportedCtaShape1Sm = cute::Shape<cute::_128, cute::_128, decltype(cute::get<2>(TileShape{}))>;
+    using SupportedCtaShape2Sm = cute::Shape<cute::_256, cute::_128, decltype(cute::get<2>(TileShape{}))>;
+    using SupportedCgaShape1Sm = cute::Shape<cute::_1, cute::_1, cute::_1>;
+    using SupportedCgaShape2Sm = cute::Shape<cute::_2, cute::_1, cute::_1>;
 
-    constexpr static bool value = !cute::is_same_v<SupportedCtaShape, TileShape>
-        || !cute::is_same_v<SupportedCgaShape, ClusterShape> || DYNAMIC_CGA;
+    constexpr static bool cta_ok
+        = cute::is_same_v<SupportedCtaShape1Sm, TileShape> || cute::is_same_v<SupportedCtaShape2Sm, TileShape>;
+    constexpr static bool cga_ok
+        = cute::is_same_v<SupportedCgaShape1Sm, ClusterShape> || cute::is_same_v<SupportedCgaShape2Sm, ClusterShape>;
+    constexpr static bool value = !cta_ok || !cga_ok || DYNAMIC_CGA;
 #else
     constexpr static bool value = false;
 #endif
 
@@ -80,6 +80,7 @@ INSTANTIATE_FP4_GEMM_KERNEL_LAUNCHER_SM120(__nv_bfloat16, 256, 128, 128, 1, 1, 1
 
 template class CutlassFp4GemmRunner<__nv_bfloat16, FP4GemmType::W4A4_NVFP4_NVFP4>;
 template class CutlassFp4GemmRunner<__nv_bfloat16, FP4GemmType::W4A8_MXFP4_MXFP8>;
+template class CutlassFp4GemmRunner<__nv_bfloat16, FP4GemmType::W8A8_MXFP8_MXFP8>;
 
 #endif
 
 
@@ -79,6 +79,7 @@ INSTANTIATE_FP4_GEMM_KERNEL_LAUNCHER_SM120(half, 256, 128, 128, 1, 1, 1)
 
 template class CutlassFp4GemmRunner<half, FP4GemmType::W4A4_NVFP4_NVFP4>;
 template class CutlassFp4GemmRunner<half, FP4GemmType::W4A8_MXFP4_MXFP8>;
+template class CutlassFp4GemmRunner<half, FP4GemmType::W8A8_MXFP8_MXFP8>;
 
 } // namespace cutlass_kernels
 } // namespace kernels
 
@@ -79,6 +79,7 @@ INSTANTIATE_FP4_GEMM_KERNEL_LAUNCHER_SM120(float, 256, 128, 128, 1, 1, 1)
 
 template class CutlassFp4GemmRunner<float, FP4GemmType::W4A4_NVFP4_NVFP4>;
 template class CutlassFp4GemmRunner<float, FP4GemmType::W4A8_MXFP4_MXFP8>;
+template class CutlassFp4GemmRunner<float, FP4GemmType::W8A8_MXFP8_MXFP8>;
 
 } // namespace cutlass_kernels
 } // namespace kernels
 
@@ -37,6 +37,7 @@
 
 #include "../include/fp4_gemm.h"
 #include "mxfp8_mxfp4_gemm_template_sm100.h"
+#include "mxfp8_mxfp8_gemm_template_sm100.h"
 #include "nvfp4_nvfp4_gemm_template_sm100.h"
 #include "nvfp4_nvfp4_gemm_template_sm120.h"
 
@@ -323,6 +324,94 @@ size_t dispatchMXFP8xMXFP4GemmCTAShapeSm100(T* D, void const* A, void const* B,
     }
 }
 
+// Resolves the runtime cluster shape of an SM100 MXFP8xMXFP8 GEMM to compile-time cluster extents
+// and forwards the call to genericMXFP8xMXFP8GemmKernelLauncher.
+//
+// The CTA tile (CTA_M_, CTA_N_, CTA_K_) is fixed by the caller. Every cluster shape handled here is
+// launched with the `__2SM` tag and a cluster depth of 1; all runtime arguments are passed through
+// unchanged.
+//
+// Returns the value produced by the selected launcher.
+// Throws std::runtime_error when `gemmConfig.cluster_shape` has no matching case.
+template <typename T, typename CTA_M_, typename CTA_N_, typename CTA_K_>
+size_t dispatchMXFP8xMXFP8GemmClusterShapeSm100(T* D, void const* A, void const* B, void const* input_sf,
+    void const* weight_sf, float const* global_sf, int m, int n, int k, int batch_count,
+    tkc::CutlassGemmConfig gemmConfig, char* workspace, const size_t workspaceBytes, cudaStream_t stream,
+    int* occupancy = nullptr)
+{
+    TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+
+    // The cluster extents are passed as tag values so the launcher call is spelled out only once;
+    // only their types are used.
+    auto const launch2Sm = [&](auto clusterM, auto clusterN) -> size_t
+    {
+        using ClusterM = decltype(clusterM);
+        using ClusterN = decltype(clusterN);
+        return genericMXFP8xMXFP8GemmKernelLauncher<T, CTA_M_, CTA_N_, CTA_K_, ClusterM, ClusterN, cute::Int<1>,
+            __2SM>(D, A, B, input_sf, weight_sf, global_sf, m, n, k, batch_count, gemmConfig, workspace,
+            workspaceBytes, stream, occupancy);
+    };
+
+    switch (gemmConfig.cluster_shape)
+    {
+    case tkc::ClusterShape::ClusterShape_2x1x1: return launch2Sm(cute::Int<2>{}, cute::Int<1>{});
+    case tkc::ClusterShape::ClusterShape_2x2x1: return launch2Sm(cute::Int<2>{}, cute::Int<2>{});
+    case tkc::ClusterShape::ClusterShape_4x2x1: return launch2Sm(cute::Int<4>{}, cute::Int<2>{});
+    case tkc::ClusterShape::ClusterShape_2x4x1: return launch2Sm(cute::Int<2>{}, cute::Int<4>{});
+    case tkc::ClusterShape::ClusterShape_4x4x1: return launch2Sm(cute::Int<4>{}, cute::Int<4>{});
+    default:
+        throw std::runtime_error(
+            "[TensorRT LLM Error][MXFP8][dispatch_gemm_cluster_shape] Config is invalid for MXFP8xMXFP8 GEMM.");
+    }
+}
+
+// Resolves the runtime SM100 tile configuration of an MXFP8xMXFP8 GEMM to compile-time CTA extents
+// and hands the call to dispatchMXFP8xMXFP8GemmClusterShapeSm100.
+//
+// Every accepted tile uses CTA M = 128; the N and K extents come from the matching case. All runtime
+// arguments are passed through unchanged.
+//
+// Returns the value produced by the cluster-shape dispatcher.
+// Throws std::runtime_error when the tile config is Undefined, is still ChooseWithHeuristic, or has
+// no matching case.
+template <typename T>
+size_t dispatchMXFP8xMXFP8GemmCTAShapeSm100(T* D, void const* A, void const* B, void const* input_sf,
+    void const* weight_sf, float const* global_sf, int m, int n, int k, int batch_count,
+    tkc::CutlassGemmConfig gemmConfig, char* workspace, const size_t workspaceBytes, cudaStream_t stream,
+    int* occupancy = nullptr)
+{
+    TLLM_LOG_DEBUG(__PRETTY_FUNCTION__);
+
+    // N and K are passed as tag values (only their types are used); M is 128 for every tile.
+    auto const dispatchTile = [&](auto ctaN, auto ctaK) -> size_t
+    {
+        using CtaN = decltype(ctaN);
+        using CtaK = decltype(ctaK);
+        return dispatchMXFP8xMXFP8GemmClusterShapeSm100<T, cute::Int<128>, CtaN, CtaK>(D, A, B, input_sf,
+            weight_sf, global_sf, m, n, k, batch_count, gemmConfig, workspace, workspaceBytes, stream, occupancy);
+    };
+
+    switch (gemmConfig.tile_config_sm100)
+    {
+    case tkc::CutlassTileConfigSM100::CtaShape128x64x128B: return dispatchTile(cute::Int<64>{}, cute::Int<128>{});
+    case tkc::CutlassTileConfigSM100::CtaShape128x256x128B: return dispatchTile(cute::Int<256>{}, cute::Int<128>{});
+    case tkc::CutlassTileConfigSM100::CtaShape128x128x256B: return dispatchTile(cute::Int<128>{}, cute::Int<256>{});
+    case tkc::CutlassTileConfigSM100::CtaShape128x256x256B: return dispatchTile(cute::Int<256>{}, cute::Int<256>{});
+    case tkc::CutlassTileConfigSM100::Undefined:
+        throw std::runtime_error("[TensorRT LLM Error][MXFP8][dispatch_gemm_cta_shape] Gemm config undefined.");
+    case tkc::CutlassTileConfigSM100::ChooseWithHeuristic:
+        throw std::runtime_error(
+            "[TensorRT LLM Error][MXFP8][dispatch_gemm_cta_shape] Gemm config should have already been set by "
+            "heuristic.");
+    default:
+        throw std::runtime_error(
+            "[TensorRT LLM Error][MXFP8][dispatch_gemm_cta_shape] Config is invalid for MXFP8xMXFP8 GEMM.");
+    }
+}
+
 template <typename T, FP4GemmType fp4GemmType>
 CutlassFp4GemmRunner<T, fp4GemmType>::CutlassFp4GemmRunner()
 {
@@ -358,6 +447,19 @@ size_t CutlassFp4GemmRunner<T, fp4GemmType>::dispatchToArch(T* D, void const* A,
                 "[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS FP4 GEMM");
         }
     }
+    else if constexpr (fp4GemmType == FP4GemmType::W8A8_MXFP8_MXFP8)
+    {
+        if (mSm == 100 || mSm == 103)
+        {
+            return dispatchMXFP8xMXFP8GemmCTAShapeSm100<T>(D, A, B, input_sf, weight_sf, global_sf, m, n, k,
+                batch_count, gemmConfig, workspace, workspaceBytes, stream, occupancy);
+        }
+        else
+        {
+            throw std::runtime_error(
+                "[TensorRT LLM Error][CutlassFp4GemmRunner][GEMM Dispatch] Arch unsupported for CUTLASS MXFP8 GEMM");
+        }
+    }
     else if constexpr (fp4GemmType == FP4GemmType::W4A4_NVFP4_NVFP4)
     {
         if (mSm == 103)
@@ -437,9 +539,12 @@ std::vector<tkc::CutlassGemmConfig> CutlassFp4GemmRunner<T, fp4GemmType>::getCon
         {
             for (auto const& cluster_config : clusterShapes)
             {
-                if constexpr (fp4GemmType == FP4GemmType::W4A8_MXFP4_MXFP8)
+                if constexpr (fp4GemmType == FP4GemmType::W4A8_MXFP4_MXFP8
+                    || fp4GemmType == FP4GemmType::W8A8_MXFP8_MXFP8)
                 {
-                    // Skip for high smem usage.
+                    // Skip for high smem usage (MXFP8xMXFP8 has even higher
+                    // smem pressure than MXFP8xMXFP4 because the B operand is
+                    // 2x wider, so the same skips apply).
                     if (cluster_config == tkc::ClusterShape::ClusterShape_1x1x1
                         || cluster_config == tkc::ClusterShape::ClusterShape_1x2x1
                         || cluster_config == tkc::ClusterShape::ClusterShape_1x4x1)
Original file line number	Diff line number	Diff line change
`@@ -134,6 +134,11 @@ class QuantMode`
`134`	`134`	`return QuantMode(BaseType(1u) << 16);`
`135`	`135`	`}`
`136`	`136`
	`137`	`+ static constexpr QuantMode mxfp8() noexcept`
	`138`	`+ {`
	`139`	`+ return QuantMode(BaseType(1u) << 17);`
	`140`	`+ }`
	`141`	`+`
`137`	`142`	`constexpr BaseType value() const noexcept`
`138`	`143`	`{`
`139`	`144`	`return mValue;`
`@@ -224,6 +229,11 @@ class QuantMode`
`224`	`229`	`return isSet(w4a16Mxfp4());`
`225`	`230`	`}`
`226`	`231`
	`232`	`+ constexpr bool hasMxfp8() const noexcept`
	`233`	`+ {`
	`234`	`+ return isSet(mxfp8());`
	`235`	`+ }`
	`236`	`+`
`227`	`237`	`constexpr bool hasKvCacheQuant() const noexcept`
`228`	`238`	`{`
`229`	`239`	`return hasInt8KvCache() \|\| hasFp8KvCache() \|\| hasFp4KvCache();`