Improve XDL to WMMA porting for grouped conv fwd (#3456)

bartekxk · web-flow · commit cbc83359649b · 2025-12-19T15:58:51.000-07:00
Refactors how the number of XDL (matrix multiply-accumulate) instructions per wave is calculated and used in the grouped convolution forward implementations, in particular to better support WMMA (Wave Matrix Multiply-Accumulate) instructions and 16x16 tiles.
The changes use MXdlPerWave instead of NXdlPerWave to increase the number of waves along the M dimension.
diff --git a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_bf16_scaleadd_relu.cpp b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_2d_bf16_scaleadd_relu.cpp
@@ -33,7 +33,7 @@ TEST(FwdConvInstances,
     constexpr auto FwdConvAlgorithm =
         ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle{}
             .with_thread_block(FwdThreadBlock_64_64x32x32)
-            .with_gemm_config(FwdGemmParams_Xdl_2x2_per_wave)
+            .with_gemm_config(FwdGemmParams_Xdl_2x1_per_wave)
             .with_transfer(FwdTransfer_4x16x1)
             .with_specializations(ConvFwdSpecialization::DEFAULT, GemmSpecialization::MNKPadding)
             .with_prefetch_config(1, 1, PipelineScheduler::DEFAULT);
diff --git a/experimental/builder/test/conv/ck/test_ckb_conv_fwd_3d_fp16.cpp b/experimental/builder/test/conv/ck/test_ckb_conv_fwd_3d_fp16.cpp
@@ -28,7 +28,7 @@ TEST(FwdConvInstances,
     constexpr auto FwdConvAlgorithm =
         ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3{}
             .with_thread_block(FwdThreadBlock_256_128x128x32)
-            .with_gemm_config(FwdGemmParams_Xdl_4x4_per_wave)
+            .with_gemm_config(FwdGemmParams_Xdl_2x1_per_wave)
             .with_transfer(FwdTransfer_4x64x1)
             .with_specializations(ConvFwdSpecialization::FILTER_1X1_PAD0,
                                   GemmSpecialization::MNKPadding)
diff --git a/experimental/builder/test/test_conv_description.cpp b/experimental/builder/test/test_conv_description.cpp
@@ -111,8 +111,8 @@ struct DefaultAlgorithm
                                              .bk1            = 8,
                                              .m_per_xdl      = 16,
                                              .n_per_xdl      = 16,
-                                             .m_xdl_per_wave = 4,
-                                             .n_xdl_per_wave = 4};
+                                             .m_xdl_per_wave = 8,
+                                             .n_xdl_per_wave = 8};
 
     ckb::test::TransferABC transfer{
         .a =
@@ -188,7 +188,7 @@ TEST(ConvDescriptionTest, DefaultInstanceHasDetailedDescription)
                     "   ├─ Pipeline scheduler: INTRAWAVE\n"
                     "   ├─ Warp Gemm parameters: \n"
                     "   │  ├─ subtile size: 16×16\n"
-                    "   │  └─ Number of warp gemm iterations: 4×4\n"
+                    "   │  └─ Number of warp gemm iterations: 8×8\n"
                     "   └─ Memory access:\n"
                     "      ├─ A Tile transfer: \n"
                     "      │  ├─ Tile dimensions: 4×256×8×\n"
diff --git a/experimental/builder/test/utils/ckb_conv_test_configs.hpp b/experimental/builder/test/utils/ckb_conv_test_configs.hpp
@@ -68,7 +68,7 @@ constexpr TransferABC FwdTransfer_4x64x1{
                 {.m_block = 1, .m_wave_per_xdl = 32, .n_block = 1, .n_wave_per_xdl = 8},
             .epilogue = {.m_xdl_per_wave_per_shuffle = 1,
                          .n_per_wave_per_shuffle     = 1,
-                         .scalar_per_vector          = 8},
+                         .scalar_per_vector          = 4},
         },
 };
 
diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -60,7 +60,7 @@ template <index_t BlockSize_,
           index_t NPerXDL_,
           index_t MXdlPerWave_,
           bool IsWave64>
-static constexpr auto GetNXdlPerWave2()
+static constexpr auto GetXdlPerWave2()
 {
     constexpr index_t Waves  = IsWave64 ? BlockSize_ / 64 : BlockSize_ / 32;
     constexpr index_t MWaves = MPerBlock_ / (MXdlPerWave_ * MPerXDL_);
@@ -84,17 +84,33 @@ static constexpr auto GetNXdlPerWave2()
     }
 }
 
-#define GET_NXDL_PER_WAVE_IMPL              \
-    template <bool IsWave64>                \
-    static constexpr auto GetNXdlPerWave()  \
-    {                                       \
-        return GetNXdlPerWave2<BlockSize,   \
-                               MPerBlock,   \
-                               NPerBlock,   \
-                               MPerXDL,     \
-                               NPerXDL,     \
-                               MXdlPerWave, \
-                               IsWave64>(); \
+#define GET_NXDL_PER_WAVE_IMPL             \
+    template <bool IsWave64>               \
+    static constexpr auto GetNXdlPerWave() \
+    {                                      \
+        return GetXdlPerWave2<BlockSize,   \
+                              MPerBlock,   \
+                              NPerBlock,   \
+                              MPerXDL,     \
+                              NPerXDL,     \
+                              MXdlPerWave, \
+                              IsWave64>(); \
+    }
+
+#define GET_MXDL_PER_WAVE_IMPL                          \
+    template <bool IsWave64,                            \
+              index_t MPerXDLAligned     = MPerXDL,     \
+              index_t NPerXDLAligned     = NPerXDL,     \
+              index_t NXdlPerWaveAligned = NXdlPerWave> \
+    static constexpr auto GetMXdlPerWave()              \
+    {                                                   \
+        return GetXdlPerWave2<BlockSize,                \
+                              NPerBlock,                \
+                              MPerBlock,                \
+                              NPerXDLAligned,           \
+                              MPerXDLAligned,           \
+                              NXdlPerWaveAligned,       \
+                              IsWave64>();              \
     }
 
 template <index_t BlockSize_,
@@ -114,14 +130,14 @@ static constexpr auto GetWarpTileConfig()
 
     constexpr auto NXdlPerWave =
         IsWave64
-            ? GetNXdlPerWave2<BlockSize_,
-                              MPerBlock_,
-                              NPerBlock_,
-                              MPerXDL_,
-                              NPerXDL_,
-                              MXdlPerWave_,
-                              true>()
-            : GetNXdlPerWave2<BlockSize_, MPerBlock_, NPerBlock_, 16, 16, MXdlPerWave32, false>();
+            ? GetXdlPerWave2<BlockSize_,
+                             MPerBlock_,
+                             NPerBlock_,
+                             MPerXDL_,
+                             NPerXDL_,
+                             MXdlPerWave_,
+                             true>()
+            : GetXdlPerWave2<BlockSize_, MPerBlock_, NPerBlock_, 16, 16, MXdlPerWave32, false>();
 
     if constexpr(IsWave64 == false && NXdlPerWave != 0)
     {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_gemm_xdl_cshuffle.hpp
@@ -190,9 +190,9 @@ struct DeviceBatchedGemmGemm_Xdl_CShuffle : public DeviceBatchedGemmGemm<ALayout
     using DeviceOp = DeviceBatchedGemmGemm_Xdl_CShuffle;
 
     static constexpr auto MXdlPerWave64 =
-        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, true>();
+        GetXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, true>();
     static constexpr auto MXdlPerWave32 =
-        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, false>();
+        GetXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, false>();
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_gemm_multiple_d_xdl_cshuffle.hpp
@@ -235,20 +235,20 @@ struct DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle
 {
     using DeviceOp = DeviceBatchedGemmMultipleDGemmMultipleD_Xdl_CShuffle;
 
-    static constexpr auto Gemm0MXdlPerWave64 = GetNXdlPerWave2<BlockSize,
-                                                               Gemm0NPerBlock,
-                                                               Gemm0MPerBlock,
-                                                               Gemm0NPerXdl,
-                                                               Gemm0MPerXdl,
-                                                               Gemm0NXdlPerWave,
-                                                               true>();
-    static constexpr auto Gemm0MXdlPerWave32 = GetNXdlPerWave2<BlockSize,
-                                                               Gemm0NPerBlock,
-                                                               Gemm0MPerBlock,
-                                                               Gemm0NPerXdl,
-                                                               Gemm0MPerXdl,
-                                                               Gemm0NXdlPerWave,
-                                                               false>();
+    static constexpr auto Gemm0MXdlPerWave64 = GetXdlPerWave2<BlockSize,
+                                                              Gemm0NPerBlock,
+                                                              Gemm0MPerBlock,
+                                                              Gemm0NPerXdl,
+                                                              Gemm0MPerXdl,
+                                                              Gemm0NXdlPerWave,
+                                                              true>();
+    static constexpr auto Gemm0MXdlPerWave32 = GetXdlPerWave2<BlockSize,
+                                                              Gemm0NPerBlock,
+                                                              Gemm0MPerBlock,
+                                                              Gemm0NPerXdl,
+                                                              Gemm0MPerXdl,
+                                                              Gemm0NXdlPerWave,
+                                                              false>();
 
     static constexpr index_t NumD0Tensor = D0sDataType::Size();
     static constexpr index_t NumD1Tensor = D1sDataType::Size();
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
@@ -223,9 +223,9 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle
                                                  MaskingSpec>
 {
     static constexpr auto MXdlPerWave64 =
-        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, true>();
+        GetXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, true>();
     static constexpr auto MXdlPerWave32 =
-        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, false>();
+        GetXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, false>();
 
     static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0 && NumDimO > 0,
                   "Number of dimension must be greater than 0");
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -211,9 +211,9 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
 
     using DeviceOp = DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle;
     static constexpr auto MXdlPerWave64 =
-        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, true>();
+        GetXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, true>();
     static constexpr auto MXdlPerWave32 =
-        GetNXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, false>();
+        GetXdlPerWave2<BlockSize, NPerBlock, MPerBlock, NPerXDL, MPerXDL, NXdlPerWave, false>();
 
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_softmax_gemm_permute_xdl_cshuffle.hpp