[rocm-libraries] ROCm/rocm-libraries#5555 (commit 1d2c4c8)

bartekxk · assistant-librarian[bot] · commit f79926009b6b · 2026-03-21T22:56:19.000Z
[CK][CK Tile] Fix kbatch check in grouped conv and gemm kernels (#5555)

## Motivation

Fix kbatch check in grouped conv and gemm kernels, allow tails for kbatch.

## Technical Details

Round up K / Kperxdl and divide it by Kbatch to allow tail for K.

## Test Plan

test_grouped_convnd_bwd_weight_tile

## Test Result

passed locally

## Submission Checklist

- [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
@@ -418,7 +418,8 @@ struct UniversalGemmKernel
             }
         }
 
-        if(kargs.K < GemmPipeline::BlockGemmShape::WarpTile::at(number<2>{}) * kargs.k_batch)
+        if(integer_divide_ceil(kargs.K, GemmPipeline::BlockGemmShape::WarpTile::at(number<2>{})) <
+           kargs.k_batch)
         {
             if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
             {
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp
@@ -574,7 +574,9 @@ struct GroupedConvolutionBackwardWeightKernel
             }
         }
 
-        if(kargs.GemmK < TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}) * kargs.k_batch)
+        if(integer_divide_ceil(kargs.GemmK,
+                               TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{})) <
+           kargs.k_batch)
         {
             LogInfo("KBatch is too large, part of GPU wouldn't be utilized! GemmK: ",
                     kargs.GemmK,
diff --git a/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp
@@ -178,11 +178,11 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args<SIGNATURE>& args,
                     });
 
                 const bool valid = report.get_errors().empty();
+                best_avg_time    = std::min(best_avg_time, avg_time);
+                best_op_name     = best_avg_time < avg_time ? best_op_name : op_name;
+                best_split_k     = best_avg_time < avg_time ? best_split_k : k_batch;
                 if(valid)
                 {
-                    best_avg_time = std::min(best_avg_time, avg_time);
-                    best_op_name  = best_avg_time < avg_time ? best_op_name : op_name;
-                    best_split_k  = best_avg_time < avg_time ? best_split_k : k_batch;
                     std::cout << "[Valid] Perf: " << std::setw(10) << avg_time << " ms," << " "
                               << op_name << ", SplitK " << k_batch << std::endl;
                 }
diff --git a/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight.cpp b/test/ck_tile/grouped_conv/test_ck_tile_grouped_conv_bwd_weight.cpp
@@ -219,12 +219,12 @@ TEST_F(GroupedConvBwdWeightIsSupportedArgumentTest, K0KBatchLimitation)
                                         tensor_layout::convolution::NHWGK>::type;
 
     // k_batch = 128 should pass
-    auto host_args_kbatch_6 = create_2d_host_args(6);
+    auto host_args_kbatch_6 = create_2d_host_args(7);
     auto kargs_6 = typename Kernel::GroupedConvBwdWeightKernelArgsSpecialized(host_args_kbatch_6);
     EXPECT_TRUE(Kernel::IsSupportedArgument(kargs_6));
 
     // k_batch = 129 should fail for half_t output
-    auto host_args_kbatch_7 = create_2d_host_args(7);
+    auto host_args_kbatch_7 = create_2d_host_args(8);
     auto kargs_7 = typename Kernel::GroupedConvBwdWeightKernelArgsSpecialized(host_args_kbatch_7);
     EXPECT_FALSE(Kernel::IsSupportedArgument(kargs_7));
 }

Original file line number	Diff line number	Diff line change
`@@ -418,7 +418,8 @@ struct UniversalGemmKernel`
`418`	`418`	`}`
`419`	`419`	`}`
`420`	`420`
`421`		`- if(kargs.K < GemmPipeline::BlockGemmShape::WarpTile::at(number<2>{}) * kargs.k_batch)`
	`421`	`+ if(integer_divide_ceil(kargs.K, GemmPipeline::BlockGemmShape::WarpTile::at(number<2>{})) <`
	`422`	`+ kargs.k_batch)`
`422`	`423`	`{`
`423`	`424`	`if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))`
`424`	`425`	`{`
Original file line number	Diff line number	Diff line change
`@@ -574,7 +574,9 @@ struct GroupedConvolutionBackwardWeightKernel`
`574`	`574`	`}`
`575`	`575`	`}`
`576`	`576`
`577`		`- if(kargs.GemmK < TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}) * kargs.k_batch)`
	`577`	`+ if(integer_divide_ceil(kargs.GemmK,`
	`578`	`+ TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{})) <`
	`579`	`+ kargs.k_batch)`
`578`	`580`	`{`
`579`	`581`	`LogInfo("KBatch is too large, part of GPU wouldn't be utilized! GemmK: ",`
`580`	`582`	`kargs.GemmK,`
Original file line number	Diff line number	Diff line change
`@@ -178,11 +178,11 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args<SIGNATURE>& args,`
`178`	`178`	`});`
`179`	`179`
`180`	`180`	`const bool valid = report.get_errors().empty();`
	`181`	`+ best_avg_time = std::min(best_avg_time, avg_time);`
	`182`	`+ best_op_name = best_avg_time < avg_time ? best_op_name : op_name;`
	`183`	`+ best_split_k = best_avg_time < avg_time ? best_split_k : k_batch;`
`181`	`184`	`if(valid)`
`182`	`185`	`{`
`183`		`- best_avg_time = std::min(best_avg_time, avg_time);`
`184`		`- best_op_name = best_avg_time < avg_time ? best_op_name : op_name;`
`185`		`- best_split_k = best_avg_time < avg_time ? best_split_k : k_batch;`
`186`	`186`	`std::cout << "[Valid] Perf: " << std::setw(10) << avg_time << " ms," << " "`
`187`	`187`	`<< op_name << ", SplitK " << k_batch << std::endl;`
`188`	`188`	`}`