ROCm
diff --git a/‎Jenkinsfile‎
Lines changed: 6 additions & 2 deletions b/‎Jenkinsfile‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎example/ck_tile/40_streamk_gemm/run_gemm_example.inc‎
Lines changed: 1 addition & 1 deletion b/‎example/ck_tile/40_streamk_gemm/run_gemm_example.inc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp‎
Lines changed: 2 additions & 2 deletions b/‎example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp‎
Lines changed: 2 additions & 11 deletions b/‎include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp‎
Lines changed: 2 additions & 11 deletions
diff --git a/‎test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp‎
Lines changed: 1 addition & 2 deletions b/‎test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎tile_engine/include/utility/validation.hpp‎
Lines changed: 50 additions & 0 deletions b/‎tile_engine/include/utility/validation.hpp‎
Lines changed: 50 additions & 0 deletions
diff --git a/‎tile_engine/ops/CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion b/‎tile_engine/ops/CMakeLists.txt‎
Lines changed: 2 additions & 1 deletion
@@ -1615,11 +1615,13 @@ pipeline {
                                             -D GPU_TARGETS="gfx90a" \
                                             -D GEMM_DATATYPE="fp8;fp16" \
                                             -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
+                                            -D GEMM_STREAMK_DATATYPE="fp8;fp16" \
+                                            -D GEMM_STREAMK_LAYOUT="rcr" \
                                             -D GEMM_MULTI_D_DATATYPE="fp16" \
                                             -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
                                             -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
                                             -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
-                                           ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
+                                           ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \
                                            python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
                                            python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
                                            python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
@@ -1644,11 +1646,13 @@ pipeline {
                                             -D GPU_TARGETS="gfx942" \
                                             -D GEMM_DATATYPE="fp8;fp16" \
                                             -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \
+                                            -D GEMM_STREAMK_DATATYPE="fp8;fp16" \
+                                            -D GEMM_STREAMK_LAYOUT="rcr" \
                                             -D GEMM_MULTI_D_DATATYPE="fp16" \
                                             -D GEMM_MULTI_D_LAYOUT="rcrr;rrrr;crrr;ccrr" \
                                             -D GEMM_PRESHUFFLE_DATATYPE="fp16;fp8;bf16;bf8" \
                                             -D GEMM_PRESHUFFLE_LAYOUT="rcr" .. && \
-                                           ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all && \
+                                           ninja -j64 benchmark_gemm_all benchmark_gemm_preshuffle_all benchmark_gemm_multi_d_all benchmark_gemm_streamk_all && \
                                            python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
                                            python3 ../tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json && \
                                            python3 ../tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py . --problem-sizes "1024,1024,1024" --warmup 5 --repeat 5 --verbose --json results.json """
 
@@ -86,7 +86,7 @@ invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
 
     std::tuple<float, ck_tile::index_t> ave_time_and_batch;
 
-    if(args.reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic)
+    if(reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic)
     {
         ave_time_and_batch = gemm<GemmConfig,
                                   ADataType,
 
@@ -105,13 +105,13 @@ std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
         }
 
         auto reset_data_buffers = [&]() {
-            if(ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)
+            if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)
             {
                 // Clear the output C tensor results after each repetition of the kernel
                 hipGetErrorString(hipMemsetAsync(
                     args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
             }
-            else if(ReductionStrategy == ck_tile::StreamKReductionStrategy::Reduction)
+            else if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Reduction)
             {
                 // Reset sk flags to zero before each repetition of the kernel
                 workspace_data.SetZero();
 
@@ -28,8 +28,7 @@ struct StreamKHostArgs : public ck_tile::UniversalGemmHostArgs<>
                                           index_t K_,
                                           index_t stride_A_,
                                           index_t stride_B_,
-                                          index_t stride_C_,
-                                          StreamKReductionStrategy reduction_strategy_)
+                                          index_t stride_C_)
         : UniversalGemmHostArgs<>({a_ptr_},
                                   {b_ptr_},
                                   {/*ds_ptr*/},
@@ -41,12 +40,9 @@ struct StreamKHostArgs : public ck_tile::UniversalGemmHostArgs<>
                                   {stride_A_},
                                   {stride_B_},
                                   {/*stride_Ds_*/},
-                                  stride_C_),
-          reduction_strategy{reduction_strategy_}
+                                  stride_C_)
     {
     }
-
-    ck_tile::StreamKReductionStrategy reduction_strategy;
 };
 
 /**
@@ -133,18 +129,13 @@ struct StreamKKernel
                                       host_args.stride_Ds,
                                       host_args.stride_E,
                                       host_args.k_batch},
-              reduction_strategy{host_args.reduction_strategy},
               // The workspace pointer is set to nullptr because we must first
               // instantiate the TilePartitioner to get the necessary size
               workspace_ptr{nullptr},
               tile_partitioner{TilePartitioner{host_args.M, host_args.N, host_args.K, grid}}
 
         {
         }
-        /**
-         * @brief The strategy used by work groups to compute final results in C tensor.
-         */
-        StreamKReductionStrategy reduction_strategy;
         /**
          * @brief  A pointer to a buffer in device memory for accumulating partial via reduction
          * strategy.
 
@@ -250,8 +250,7 @@ class TestCkTileStreamK : public ::testing::Test
                                       K,
                                       stride_A,
                                       stride_B,
-                                      stride_C,
-                                      reduction_strategy};
+                                      stride_C};
 
         ck_tile::index_t num_accumulations_per_tile =
             invoke_streamk<ck_tile::StreamKReductionStrategy::Atomic>(
 
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c), Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType> // builds the (rtol, atol) tuple returned at the end of this function
+auto calculate_rtol_atol(const ck_tile::index_t K, // divided by kbatch (rounded up) below
+                         const ck_tile::index_t kbatch, // used as a divisor below; presumably must be > 0 -- TODO confirm
+                         const float max_accumulated_value) // magnitude passed to get_absolute_threshold
+{
+    using ComputeType =
+        std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>; // the smaller (by sizeof) of the A/B types; BDataType when the sizes are equal
+    // Calculate thresholds for a single slice of ceil(K / kbatch) elements
+    const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(
+        ck_tile::integer_divide_ceil(K, kbatch));
+    const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
+        max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); // magnitude is scaled down by kbatch for the per-slice estimate
+    // Calculate error due to split_k accumulation (kbatch terms, all in CDataType)
+    const auto rtol_split_k =
+        ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
+    const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
+        max_accumulated_value, kbatch); // here the unscaled magnitude is used
+    // Use higher threshold of the two estimates, chosen independently for rtol and atol
+    return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); // element 0 = rtol, element 1 = atol
+}
+
+/// @brief Function to compare the results of the device and host computations; prints the thresholds and the verdict to std::cout and returns the check_err result
+bool compare(std::string instanceName, // label printed in the threshold report below
+             ck_tile::index_t K, // forwarded unchanged to calculate_rtol_atol
+             ck_tile::index_t kbatch, // forwarded unchanged to calculate_rtol_atol
+             ck_tile::HostTensor<CDataType>& c_m_n_dev_result, // first tensor handed to check_err
+             ck_tile::HostTensor<CDataType>& c_m_n_host_result) // second tensor handed to check_err; also the source of the maximum value
+{ // NOTE(review): ADataType/BDataType/AccDataType/CDataType are not declared in this block; presumably the including file defines them first -- confirm
+    const float max_accumulated_value =
+        *std::max_element(c_m_n_host_result.mData.begin(), c_m_n_host_result.mData.end()); // NOTE(review): the iterator is dereferenced without an emptiness check -- confirm callers never pass an empty tensor
+    const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
+        K, kbatch, max_accumulated_value); // tuple: element 0 is rtol, element 1 is atol
+    bool pass = ck_tile::check_err(c_m_n_dev_result,
+                                   c_m_n_host_result,
+                                   "Error: Incorrect results!",
+                                   rtol_atol.at(ck_tile::number<0>{}), // rtol
+                                   rtol_atol.at(ck_tile::number<1>{})); // atol
+
+    std::cout << "For " << instanceName << " Relative error threshold is " // report the thresholds that were applied
+              << rtol_atol.at(ck_tile::number<0>{}) << " Absolute error threshold is "
+              << rtol_atol.at(ck_tile::number<1>{}) << std::endl;
+    std::cout << "The verification result is:" << (pass ? "correct" : "fail") << std::endl; // human-readable verdict
+
+    return pass; // true when check_err accepted the device result
+}
@@ -1,3 +1,4 @@
 add_subdirectory(gemm)
 add_subdirectory(gemm_multi_d)
-add_subdirectory(gemm_preshuffle)
+add_subdirectory(gemm_preshuffle)
+add_subdirectory(gemm_streamk)
Original file line number	Diff line number	Diff line change
`@@ -86,7 +86,7 @@ invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,`
`86`	`86`
`87`	`87`	`std::tuple<float, ck_tile::index_t> ave_time_and_batch;`
`88`	`88`
`89`		`- if(args.reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic)`
	`89`	`+ if(reduction_strategy == ck_tile::StreamKReductionStrategy::Atomic)`
`90`	`90`	`{`
`91`	`91`	`ave_time_and_batch = gemm<GemmConfig,`
`92`	`92`	`ADataType,`
Original file line number	Diff line number	Diff line change
`@@ -105,13 +105,13 @@ std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,`
`105`	`105`	`}`
`106`	`106`
`107`	`107`	`auto reset_data_buffers = [&]() {`
`108`		`- if(ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)`
	`108`	`+ if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)`
`109`	`109`	`{`
`110`	`110`	`// Clear the output C tensor results after each repetition of the kernel`
`111`	`111`	`hipGetErrorString(hipMemsetAsync(`
`112`	`112`	`args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));`
`113`	`113`	`}`
`114`		`- else if(ReductionStrategy == ck_tile::StreamKReductionStrategy::Reduction)`
	`114`	`+ else if constexpr(ReductionStrategy == ck_tile::StreamKReductionStrategy::Reduction)`
`115`	`115`	`{`
`116`	`116`	`// Reset sk flags to zero before each repetition of the kernel`
`117`	`117`	`workspace_data.SetZero();`