[CK_TILE] Minor splitk bugfix for gemms and conv (#3387)

jakpiase · bartekxk · web-flow · commit c0797c167143 · 2025-12-24T00:10:13.000+01:00
* fix for splitk if splitk < grid

* add different splitk implementation

* minor bugfix for streamk gemm

* Add test

---------

Co-authored-by: Bartlomiej Kocot <barkocot@amd.com>
diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
@@ -323,22 +323,38 @@ struct UniversalGemmKernel
 
     struct SplitKBatchOffset
     {
-        __device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z)
+        // This structure distributes work evenly among the splitk workgroups.
+        // It's based on the principle that if there is enough work to fill all workgroups,
+        // then we can distribute the (K / K1) parts among k_batch workgroups in such a way
+        // that each workgroup will be doing ceil((K / K1) / splitk) or ceil((K / K1) / splitk) - 1
+        // parts, leaving the potential tail for the last workgroup (index splitk - 1).
+        __device__ SplitKBatchOffset(const KernelArgs& kargs, const index_t k_id = blockIdx.z)
         {
-            constexpr auto K1   = GemmPipeline::BlockGemmShape::WarpTile::at(number<2>{});
-            const index_t K_t   = amd_wave_read_first_lane(kargs.k_batch * K1);
-            const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1);
+            constexpr auto K1     = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});
+            const index_t num_all = amd_wave_read_first_lane(
+                kargs.K / K1); // num of all loops not including potential tail
+            index_t num_full = amd_wave_read_first_lane(num_all % kargs.k_batch);
+            num_full         = num_full == 0 ? kargs.k_batch : num_full;
+
+            const index_t num_full_iters =
+                amd_wave_read_first_lane(std::max(integer_divide_ceil(num_all, kargs.k_batch), 1));
+            const index_t full_k_read    = num_full_iters * K1;
+            const index_t partial_k_read = (num_full_iters - 1) * K1;
 
             static_for<0, NumATensor, 1>{}([&](auto index) {
                 using AiLayout = remove_cvref_t<std::tuple_element_t<index.value, AsLayout>>;
                 if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, AiLayout>)
                 {
-                    as_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);
+                    as_k_split_offset[index] =
+                        amd_wave_read_first_lane(std::min(k_id, num_full) * full_k_read +
+                                                 std::max(k_id - num_full, 0) * partial_k_read);
                 }
                 else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, AiLayout>)
                 {
                     as_k_split_offset[index] =
-                        amd_wave_read_first_lane(k_id * KRead * kargs.stride_As[index]);
+                        amd_wave_read_first_lane((std::min(k_id, num_full) * full_k_read +
+                                                  std::max(k_id - num_full, 0) * partial_k_read) *
+                                                 kargs.stride_As[index]);
                 }
             });
 
@@ -347,21 +363,30 @@ struct UniversalGemmKernel
                 if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BiLayout>)
                 {
                     bs_k_split_offset[index] =
-                        amd_wave_read_first_lane(k_id * KRead * kargs.stride_Bs[index]);
+                        amd_wave_read_first_lane((std::min(k_id, num_full) * full_k_read +
+                                                  std::max(k_id - num_full, 0) * partial_k_read) *
+                                                 kargs.stride_Bs[index]);
                 }
                 else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BiLayout>)
                 {
-                    bs_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);
+                    bs_k_split_offset[index] =
+                        amd_wave_read_first_lane(std::min(k_id, num_full) * full_k_read +
+                                                 std::max(k_id - num_full, 0) * partial_k_read);
                 }
             });
 
-            if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))
+            if(k_id == kargs.k_batch - 1)
+            {
+                splitted_k = kargs.K - std::min(k_id, num_full) * full_k_read -
+                             std::max(k_id - num_full, 0) * partial_k_read;
+            }
+            else if(k_id < num_full)
             {
-                splitted_k = amd_wave_read_first_lane(KRead);
+                splitted_k = full_k_read;
             }
             else
             {
-                splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1));
+                splitted_k = partial_k_read;
             }
         }
 
@@ -385,6 +410,15 @@ struct UniversalGemmKernel
             }
         }
 
+        if(kargs.K < GemmPipeline::BlockGemmShape::WarpTile::at(number<2>{}) * kargs.k_batch)
+        {
+            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+            {
+                CK_TILE_ERROR("KBatch is too large, part of GPU wouldn't be utilized!");
+            }
+            return false;
+        }
+
         const auto vectorSizeA = is_wave32() ? GemmPipeline::template GetVectorSizeA<true>()
                                              : GemmPipeline::template GetVectorSizeA<false>();
         bool AsTesnorIsValid   = {true};
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -568,6 +568,15 @@ struct GroupedConvolutionBackwardWeightKernel
             }
         }
 
+        if(kargs.GemmK < TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}) * kargs.k_batch)
+        {
+            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
+            {
+                CK_TILE_ERROR("KBatch is too large, part of GPU wouldn't be utilized!");
+            }
+            return false;
+        }
+
         const index_t ConvK = kargs.wei_g_k_c_xs_lengths[number<1>{}];
         const index_t ConvC = kargs.wei_g_k_c_xs_lengths[number<2>{}];
 
diff --git a/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight.cpp b/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight.cpp
@@ -173,6 +173,11 @@ static GroupedConvBwdWeightHostArgs create_2d_host_args(index_t k_batch)
     return create_2d_host_args(2, 2, 8, 8, 3, 3, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, k_batch);
 }
 
+static GroupedConvBwdWeightHostArgs create_large_2d_host_args(index_t k_batch)
+{
+    return create_2d_host_args(2, 2, 8, 8, 3, 3, 70, 70, 1, 1, 1, 1, 1, 1, 1, 1, k_batch);
+}
+
 class GroupedConvBwdWeightIsSupportedArgumentTest : public ::testing::Test
 {
 };
@@ -227,6 +232,25 @@ TEST_F(GroupedConvBwdWeightIsSupportedArgumentTest, AtomicAddRequiresKBatchGreat
     EXPECT_TRUE(Kernel::IsSupportedArgument(kargs_2));
 }
 
+TEST_F(GroupedConvBwdWeightIsSupportedArgumentTest, K0KBatchLimitation)
+{
+    using Kernel = typename BuildKernel<half_t,
+                                        TestConvConfig,
+                                        tensor_layout::convolution::NHWGC,
+                                        tensor_layout::convolution::GKYXC,
+                                        tensor_layout::convolution::NHWGK>::type;
+
+    // k_batch = 6 should pass
+    auto host_args_kbatch_6 = create_2d_host_args(6);
+    auto kargs_6 = typename Kernel::GroupedConvBwdWeightKernelArgsSpecialized(host_args_kbatch_6);
+    EXPECT_TRUE(Kernel::IsSupportedArgument(kargs_6));
+
+    // k_batch = 7 should fail (presumably k_batch is too large for GemmK)
+    auto host_args_kbatch_7 = create_2d_host_args(7);
+    auto kargs_7 = typename Kernel::GroupedConvBwdWeightKernelArgsSpecialized(host_args_kbatch_7);
+    EXPECT_FALSE(Kernel::IsSupportedArgument(kargs_7));
+}
+
 TEST_F(GroupedConvBwdWeightIsSupportedArgumentTest, NonFloatDoubleOutputLimitsKBatch)
 {
     using Kernel = typename BuildKernel<half_t,
@@ -236,13 +260,13 @@ TEST_F(GroupedConvBwdWeightIsSupportedArgumentTest, NonFloatDoubleOutputLimitsKB
                                         tensor_layout::convolution::NHWGK>::type;
 
     // k_batch = 128 should pass
-    auto host_args_kbatch_128 = create_2d_host_args(128);
+    auto host_args_kbatch_128 = create_large_2d_host_args(128);
     auto kargs_128 =
         typename Kernel::GroupedConvBwdWeightKernelArgsSpecialized(host_args_kbatch_128);
     EXPECT_TRUE(Kernel::IsSupportedArgument(kargs_128));
 
     // k_batch = 129 should fail for half_t output
-    auto host_args_kbatch_129 = create_2d_host_args(129);
+    auto host_args_kbatch_129 = create_large_2d_host_args(129);
     auto kargs_129 =
         typename Kernel::GroupedConvBwdWeightKernelArgsSpecialized(host_args_kbatch_129);
     EXPECT_FALSE(Kernel::IsSupportedArgument(kargs_129));

Original file line number	Diff line number	Diff line change
`@@ -323,22 +323,38 @@ struct UniversalGemmKernel`
`323`	`323`
`324`	`324`	`struct SplitKBatchOffset`
`325`	`325`	`{`
`326`		`- __device__ SplitKBatchOffset(const KernelArgs& kargs, const std::size_t k_id = blockIdx.z)`
	`326`	`+ // This structure distributes work evenly among the splitk workgroups.`
	`327`	`+ // It's based on the principle that if there is enough work to fill all workgroups,`
	`328`	`+ // then we can distribute the (K / K1) parts among k_batch workgroups in such a way`
	`329`	`+ // that each workgroup will be doing ceil((K / K1) / splitk) or ceil((K / K1) / splitk) - 1`
	`330`	`+ // parts, leaving the potential tail for the last workgroup (index splitk - 1).`
	`331`	`+ __device__ SplitKBatchOffset(const KernelArgs& kargs, const index_t k_id = blockIdx.z)`
`327`	`332`	`{`
`328`		`- constexpr auto K1 = GemmPipeline::BlockGemmShape::WarpTile::at(number<2>{});`
`329`		`- const index_t K_t = amd_wave_read_first_lane(kargs.k_batch * K1);`
`330`		`- const index_t KRead = amd_wave_read_first_lane((kargs.K + K_t - 1) / K_t * K1);`
	`333`	`+ constexpr auto K1 = TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{});`
	`334`	`+ const index_t num_all = amd_wave_read_first_lane(`
	`335`	`+ kargs.K / K1); // num of all loops not including potential tail`
	`336`	`+ index_t num_full = amd_wave_read_first_lane(num_all % kargs.k_batch);`
	`337`	`+ num_full = num_full == 0 ? kargs.k_batch : num_full;`
	`338`	`+`
	`339`	`+ const index_t num_full_iters =`
	`340`	`+ amd_wave_read_first_lane(std::max(integer_divide_ceil(num_all, kargs.k_batch), 1));`
	`341`	`+ const index_t full_k_read = num_full_iters * K1;`
	`342`	`+ const index_t partial_k_read = (num_full_iters - 1) * K1;`
`331`	`343`
`332`	`344`	`static_for<0, NumATensor, 1>{}([&](auto index) {`
`333`	`345`	`using AiLayout = remove_cvref_t<std::tuple_element_t<index.value, AsLayout>>;`
`334`	`346`	`if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, AiLayout>)`
`335`	`347`	`{`
`336`		`- as_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);`
	`348`	`+ as_k_split_offset[index] =`
	`349`	`+ amd_wave_read_first_lane(std::min(k_id, num_full) * full_k_read +`
	`350`	`+ std::max(k_id - num_full, 0) * partial_k_read);`
`337`	`351`	`}`
`338`	`352`	`else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, AiLayout>)`
`339`	`353`	`{`
`340`	`354`	`as_k_split_offset[index] =`
`341`		`- amd_wave_read_first_lane(k_id * KRead * kargs.stride_As[index]);`
	`355`	`+ amd_wave_read_first_lane((std::min(k_id, num_full) * full_k_read +`
	`356`	`+ std::max(k_id - num_full, 0) * partial_k_read) *`
	`357`	`+ kargs.stride_As[index]);`
`342`	`358`	`}`
`343`	`359`	`});`
`344`	`360`
`@@ -347,21 +363,30 @@ struct UniversalGemmKernel`
`347`	`363`	`if constexpr(std::is_same_v<tensor_layout::gemm::RowMajor, BiLayout>)`
`348`	`364`	`{`
`349`	`365`	`bs_k_split_offset[index] =`
`350`		`- amd_wave_read_first_lane(k_id * KRead * kargs.stride_Bs[index]);`
	`366`	`+ amd_wave_read_first_lane((std::min(k_id, num_full) * full_k_read +`
	`367`	`+ std::max(k_id - num_full, 0) * partial_k_read) *`
	`368`	`+ kargs.stride_Bs[index]);`
`351`	`369`	`}`
`352`	`370`	`else if constexpr(std::is_same_v<tensor_layout::gemm::ColumnMajor, BiLayout>)`
`353`	`371`	`{`
`354`		`- bs_k_split_offset[index] = amd_wave_read_first_lane(k_id * KRead);`
	`372`	`+ bs_k_split_offset[index] =`
	`373`	`+ amd_wave_read_first_lane(std::min(k_id, num_full) * full_k_read +`
	`374`	`+ std::max(k_id - num_full, 0) * partial_k_read);`
`355`	`375`	`}`
`356`	`376`	`});`
`357`	`377`
`358`		`- if(k_id < static_cast<uint32_t>(kargs.k_batch - 1))`
	`378`	`+ if(k_id == kargs.k_batch - 1)`
	`379`	`+ {`
	`380`	`+ splitted_k = kargs.K - std::min(k_id, num_full) * full_k_read -`
	`381`	`+ std::max(k_id - num_full, 0) * partial_k_read;`
	`382`	`+ }`
	`383`	`+ else if(k_id < num_full)`
`359`	`384`	`{`
`360`		`- splitted_k = amd_wave_read_first_lane(KRead);`
	`385`	`+ splitted_k = full_k_read;`
`361`	`386`	`}`
`362`	`387`	`else`
`363`	`388`	`{`
`364`		`- splitted_k = amd_wave_read_first_lane(kargs.K - KRead * (kargs.k_batch - 1));`
	`389`	`+ splitted_k = partial_k_read;`
`365`	`390`	`}`
`366`	`391`	`}`
`367`	`392`
`@@ -385,6 +410,15 @@ struct UniversalGemmKernel`
`385`	`410`	`}`
`386`	`411`	`}`
`387`	`412`
	`413`	`+ if(kargs.K < GemmPipeline::BlockGemmShape::WarpTile::at(number<2>{}) * kargs.k_batch)`
	`414`	`+ {`
	`415`	`+ if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))`
	`416`	`+ {`
	`417`	`+ CK_TILE_ERROR("KBatch is too large, part of GPU wouldn't be utilized!");`
	`418`	`+ }`
	`419`	`+ return false;`
	`420`	`+ }`
	`421`	`+`
`388`	`422`	`const auto vectorSizeA = is_wave32() ? GemmPipeline::template GetVectorSizeA<true>()`
`389`	`423`	`: GemmPipeline::template GetVectorSizeA<false>();`
`390`	`424`	`bool AsTesnorIsValid = {true};`
Original file line number	Diff line number	Diff line change
`@@ -568,6 +568,15 @@ struct GroupedConvolutionBackwardWeightKernel`
`568`	`568`	`}`
`569`	`569`	`}`
`570`	`570`
	`571`	`+ if(kargs.GemmK < TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}) * kargs.k_batch)`
	`572`	`+ {`
	`573`	`+ if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))`
	`574`	`+ {`
	`575`	`+ CK_TILE_ERROR("KBatch is too large, part of GPU wouldn't be utilized!");`
	`576`	`+ }`
	`577`	`+ return false;`
	`578`	`+ }`
	`579`	`+`
`571`	`580`	`const index_t ConvK = kargs.wei_g_k_c_xs_lengths[number<1>{}];`
`572`	`581`	`const index_t ConvC = kargs.wei_g_k_c_xs_lengths[number<2>{}];`
`573`	`582`