[CK_TILE] Stream-K bridge: derive strides from layout (no rcr hardcoding)

ozturkosu · ozturkosu · commit 3985cfdc00eb · 2026-06-27T20:36:57.000-04:00
Make the Stream-K bridge layout-generic instead of rcr-hardcoded, so all 4 A/B/C
layouts (rcr/rrr/ccr/crr) work end to end:
- streamk_gemm_ctypes_lib.cpp: derive stride_A/B/C at compile time from the
  kernel's ALayout/BLayout/CLayout (RowMajor RxC -> ld=C, ColumnMajor -> ld=R)
  instead of the hardcoded K/K/N.
- generated_tile_backend_streamk.hpp (registry path): same layout-derived strides.
- GpuGemmRunner: read dtype AND layout off the kernel name; arrange each operand
  per layout (RowMajor=C-contiguous, ColumnMajor=F-contiguous); bf16 encode is
  now memory-order-preserving so column-major operands stay column-major.
- run_one_streamk_gemm_kernel.py: dtype/layout-aware A/B + reference (was fp16-only).
- streamk_gemm_full_benchmark.py: SUPPORTED_LAYOUTS now rcr/rrr/ccr/crr,
  SUPPORTED_DTYPES fp16+bf16 (fp8/bf8/int8 still need runner codecs).
diff --git a/projects/composablekernel/dispatcher/bindings/ctypes/streamk_gemm_ctypes_lib.cpp b/projects/composablekernel/dispatcher/bindings/ctypes/streamk_gemm_ctypes_lib.cpp
@@ -36,6 +36,7 @@
 #include <cstring>
 #include <exception>
 #include <string>
+#include <type_traits>
 
 // Kernel header included via -include compiler flag (with CK_TILE_SINGLE_KERNEL_INCLUDE).
 // Defines: ADataType, BDataType, CDataType, AccDataType, SelectedKernel, KERNEL_NAME
@@ -100,11 +101,13 @@ int dispatcher_init() { return dispatcher_initialize(); }
  *
  * hipMalloc A/B/C, copy A and B host->device, memset C (the Atomic reduction
  * strategy accumulates into C, so it must start zeroed), build a
- * ck_tile::StreamKHostArgs with rcr default strides (stride_A=K, stride_B=K,
- * stride_C=N) and launch. The launch allocates the reduction workspace
- * internally and resets C between timed iterations. C is then copied back.
+ * ck_tile::StreamKHostArgs whose strides are derived from the kernel's actual
+ * ALayout/BLayout/CLayout (no layout hardcoding) and launch. The launch
+ * allocates the reduction workspace internally and resets C between timed
+ * iterations. C is then copied back.
  *
- * Layout contract (rcr): A row-major MxK, B col-major KxN, C row-major MxN.
+ * The host buffers must be laid out to match each operand's layout (the Python
+ * runner arranges A/B/C as RowMajor=C-contiguous, ColumnMajor=F-contiguous).
  *
  * Returns: 0 on success, -1 on HIP error / generic throw, -2 if the kernel
  * reports the arguments are unsupported.
@@ -166,17 +169,28 @@ int dispatcher_run_gemm(
         return -1;
     }
 
-    // rcr default strides: A row-major (stride=K), B col-major (stride=K),
-    // C row-major (stride=N). k_batch is fixed to 1 inside StreamKHostArgs.
+    // Strides are DERIVED from the kernel's actual layouts (ALayout/BLayout/CLayout
+    // come from the force-included generated header) -- nothing layout-specific is
+    // hardcoded, so every layout (rcr/rrr/ccr/crr/...) works. A RowMajor R x C
+    // matrix has leading dim C; a ColumnMajor one has leading dim R.
+    //   A is M x K, B is K x N, C is M x N.
+    using RowMajor             = ck_tile::tensor_layout::gemm::RowMajor;
+    const ck_tile::index_t lda = static_cast<ck_tile::index_t>(
+        std::is_same_v<ALayout, RowMajor> ? K : M);
+    const ck_tile::index_t ldb = static_cast<ck_tile::index_t>(
+        std::is_same_v<BLayout, RowMajor> ? N : K);
+    const ck_tile::index_t ldc = static_cast<ck_tile::index_t>(
+        std::is_same_v<CLayout, RowMajor> ? N : M);
+    // k_batch is fixed to 1 inside StreamKHostArgs.
     ck_tile::StreamKHostArgs args(static_cast<const void*>(A_dev),
                                   static_cast<const void*>(B_dev),
                                   static_cast<void*>(C_dev),
                                   static_cast<ck_tile::index_t>(M),
                                   static_cast<ck_tile::index_t>(N),
                                   static_cast<ck_tile::index_t>(K),
-                                  /*stride_A=*/static_cast<ck_tile::index_t>(K),
-                                  /*stride_B=*/static_cast<ck_tile::index_t>(K),
-                                  /*stride_C=*/static_cast<ck_tile::index_t>(N));
+                                  /*stride_A=*/lda,
+                                  /*stride_B=*/ldb,
+                                  /*stride_C=*/ldc);
 
     // Benchmark parameters. warmup/repeat default to old Tile Engine's values
     // (warmup=50, repeat=100); a generous warmup keeps the GPU clock ramped, and
diff --git a/projects/composablekernel/dispatcher/include/ck_tile/dispatcher/backends/generated_tile_backend_streamk.hpp b/projects/composablekernel/dispatcher/include/ck_tile/dispatcher/backends/generated_tile_backend_streamk.hpp
@@ -10,6 +10,7 @@
 #include "ck_tile/ops/gemm/kernel/streamk_gemm/streamk_gemm_kernel.hpp"
 #include <hip/hip_runtime.h>
 #include <string>
+#include <type_traits>
 
 namespace ck_tile {
 namespace dispatcher {
@@ -155,26 +156,35 @@ class GeneratedStreamKKernelInstance : public KernelInstance
     }
 
     private:
-    /// Build StreamKHostArgs for `problem`. rcr strides: row-major A (K),
-    /// column-major B (K), row-major C (N). k_batch is owned by the Stream-K tile
-    /// partitioner, not passed here. Pointers default to null for sizing-only use
-    /// (GetWorkSpaceSize). StreamKHostArgs uses ck_tile::index_t (int32); cast
-    /// from Problem's int64.
+    /// Build StreamKHostArgs for `problem`. Strides are DERIVED from the kernel's
+    /// actual layouts (the force-included single kernel exposes the global
+    /// ALayout/BLayout/CLayout), not hardcoded to rcr. k_batch is owned by the
+    /// Stream-K tile partitioner, not passed here. Pointers default to null for
+    /// sizing-only use (GetWorkSpaceSize). StreamKHostArgs uses ck_tile::index_t
+    /// (int32); cast from Problem's int64.
     ck_tile::StreamKHostArgs make_args(const Problem& problem,
                                        const void* a_ptr = nullptr,
                                        const void* b_ptr = nullptr,
                                        void* c_ptr       = nullptr) const
     {
-        using idx = ck_tile::index_t;
+        using idx      = ck_tile::index_t;
+        using RowMajor = ck_tile::tensor_layout::gemm::RowMajor;
+        // A is MxK, B is KxN, C is MxN; RowMajor RxC has leading dim C, else R.
+        const idx lda = static_cast<idx>(
+            std::is_same_v<::ALayout, RowMajor> ? problem.K : problem.M);
+        const idx ldb = static_cast<idx>(
+            std::is_same_v<::BLayout, RowMajor> ? problem.N : problem.K);
+        const idx ldc = static_cast<idx>(
+            std::is_same_v<::CLayout, RowMajor> ? problem.N : problem.M);
         return ck_tile::StreamKHostArgs{a_ptr,
                                         b_ptr,
                                         c_ptr,
                                         static_cast<idx>(problem.M),
                                         static_cast<idx>(problem.N),
                                         static_cast<idx>(problem.K),
-                                        static_cast<idx>(problem.K),
-                                        static_cast<idx>(problem.K),
-                                        static_cast<idx>(problem.N)};
+                                        lda,
+                                        ldb,
+                                        ldc};
     }
 
     KernelKey key_;
diff --git a/projects/composablekernel/dispatcher/python/gemm_utils.py b/projects/composablekernel/dispatcher/python/gemm_utils.py
@@ -405,44 +405,57 @@ def __init__(self, lib_path: Path):
             raise RuntimeError(f"Failed to initialize dispatcher .so: {lib_path}")
         names = self.lib.kernel_names
         self._kernel_name = names[0] if names else "unknown"
-        # Input dtype is encoded in the kernel name: gemm_<dtype>_<layout>_...
+        # dtype and layout are encoded in the kernel name: gemm_<dtype>_<layout>_...
+        # layout is the 3-char A/B/C major code (e.g. 'rcr'). Nothing layout- or
+        # dtype-specific is hardcoded -- both are read off the compiled kernel.
         parts = self._kernel_name.split("_")
         self._dtype = parts[1] if len(parts) > 1 else "fp16"
+        lay = parts[2] if len(parts) > 2 and len(parts[2]) == 3 else "rcr"
+        self._layout = lay if set(lay) <= {"r", "c"} else "rcr"
 
     @property
     def kernel_name(self) -> str:
         return self._kernel_name
 
     @staticmethod
     def _bf16_encode(x: np.ndarray) -> np.ndarray:
-        """float -> bfloat16 bits (uint16), round-to-nearest-even. ENCODE need only
-        be nearest-representable; DECODE must be bit-exact to device bf16_t so the
-        numpy reference multiplies the same values the GPU does."""
-        u = np.ascontiguousarray(x, dtype=np.float32).view(np.uint32)
+        """float -> bfloat16 bits (uint16), round-to-nearest-even, PRESERVING the
+        input's memory order (C or F) so column-major operands stay column-major.
+        ENCODE need only be nearest-representable; DECODE must be bit-exact to
+        device bf16_t so the numpy reference multiplies what the GPU does."""
+        f = np.asarray(x, dtype=np.float32)
+        if not (f.flags["C_CONTIGUOUS"] or f.flags["F_CONTIGUOUS"]):
+            f = np.ascontiguousarray(f)
+        u = f.view(np.uint32)
         rounded = (u + 0x7FFF + ((u >> 16) & 1)) >> 16
         return rounded.astype(np.uint16)
 
     @staticmethod
     def _bf16_decode(u16: np.ndarray) -> np.ndarray:
         return (u16.astype(np.uint32) << 16).view(np.float32)
 
+    def _to_buf(self, X: np.ndarray, major: str) -> np.ndarray:
+        """Lay out an operand in the order its layout implies: RowMajor ->
+        C-contiguous, ColumnMajor -> F-contiguous. The .so reads a flat buffer
+        with the matching stride, so the raw byte order is what matters."""
+        arr = np.ascontiguousarray(X) if major == "r" else np.asfortranarray(X)
+        if self._dtype == "bf16":
+            return self._bf16_encode(arr)
+        return arr.astype(np.float16, order="K")
+
     def run(
         self, A: np.ndarray, B: np.ndarray, problem: GemmProblem
     ) -> GemmResult:
         M, N, K = problem.M, problem.N, problem.K
 
-        # A is row-major MxK; B is supplied KxN and stored column-major (the
-        # 'c' in rcr), matching how the kernel expects its operands. bf16 is passed
-        # as raw uint16 bits (the ctypes ABI is void* + sizeof, so 2-byte bf16 and
-        # fp16 share the path; only the bit pattern differs).
-        if self._dtype == "bf16":
-            A_h = self._bf16_encode(A)
-            B_h = self._bf16_encode(np.ascontiguousarray(B.T))
-            C_h = np.zeros((M, N), dtype=np.uint16)
-        else:
-            A_h = np.ascontiguousarray(A, dtype=np.float16)
-            B_h = np.ascontiguousarray(B.T, dtype=np.float16)
-            C_h = np.zeros((M, N), dtype=np.float16)
+        # Arrange A (MxK), B (KxN), C (MxN) per the kernel's actual layout. bf16 is
+        # passed as raw uint16 bits (the ctypes ABI is void*+sizeof, so 2-byte bf16
+        # and fp16 share the path; only the bit pattern differs).
+        la, lb, lc = self._layout[0], self._layout[1], self._layout[2]
+        A_h = self._to_buf(A, la)
+        B_h = self._to_buf(B, lb)
+        cdt = np.uint16 if self._dtype == "bf16" else np.float16
+        C_h = np.zeros((M, N), dtype=cdt, order=("C" if lc == "r" else "F"))
 
         status, time_ms = self.lib.run(A_h, B_h, C_h, M, N, K)
 
diff --git a/projects/composablekernel/tile_engine/ops/gemm/run_one_streamk_gemm_kernel.py b/projects/composablekernel/tile_engine/ops/gemm/run_one_streamk_gemm_kernel.py
@@ -55,10 +55,11 @@ def _run_one(idx, so_path, prob_dict, kernel_name, verify=False, verify_tol=2e-2
         problem = GemmProblem.from_dict(prob_dict)
 
         np.random.seed(42)
-        A = (np.random.randn(problem.M, problem.K) * 0.1).astype(np.float16)
-        B = (np.random.randn(problem.K, problem.N) * 0.1).astype(np.float16)
+        A = (np.random.randn(problem.M, problem.K) * 0.1).astype(np.float32)
+        B = (np.random.randn(problem.K, problem.N) * 0.1).astype(np.float32)
 
-        # CRITICAL: load the library ONLY inside this subprocess.
+        # CRITICAL: load the library ONLY inside this subprocess. The runner reads
+        # dtype + layout off the kernel name and arranges/encodes A/B accordingly.
         runner = GpuGemmRunner(lib_path=so_path)
         result = runner.run(A, B, problem)
 
@@ -77,7 +78,16 @@ def _run_one(idx, so_path, prob_dict, kernel_name, verify=False, verify_tol=2e-2
                 "kernel": kernel_name,
             }
             if verify:
-                ref = A.astype(np.float32) @ B.astype(np.float32)
+                # Reference uses the SAME quantized inputs the device sees, per the
+                # kernel's dtype (bf16 round-to-nearest-even vs fp16), so the metric
+                # isolates compute error from input quantization.
+                if getattr(runner, "_dtype", "fp16") == "bf16":
+                    Aq = GpuGemmRunner._bf16_decode(GpuGemmRunner._bf16_encode(A))
+                    Bq = GpuGemmRunner._bf16_decode(GpuGemmRunner._bf16_encode(B))
+                else:
+                    Aq = A.astype(np.float16).astype(np.float32)
+                    Bq = B.astype(np.float16).astype(np.float32)
+                ref = Aq @ Bq
                 got = result.output.astype(np.float32)
                 denom = float(np.max(np.abs(ref))) or 1.0
                 max_rel = float(np.max(np.abs(got - ref)) / denom)
diff --git a/projects/composablekernel/tile_engine/ops/gemm/streamk_gemm_full_benchmark.py b/projects/composablekernel/tile_engine/ops/gemm/streamk_gemm_full_benchmark.py
@@ -60,11 +60,13 @@
     {"M": 512, "N": 512, "K": 8192},
 ]
 
-# Bridge surface for Stream-K: fp16/rcr only, matching the dispatcher host path
-# in streamk_gemm_ctypes_lib.cpp and the fp16 worker in
-# run_one_streamk_gemm_kernel.py.
-SUPPORTED_DTYPES = ("fp16",)
-SUPPORTED_LAYOUTS = ("rcr",)
+# Bridge surface for Stream-K. The dispatcher host path
+# (streamk_gemm_ctypes_lib.cpp) derives strides from the kernel's layouts and the
+# worker (run_one_streamk_gemm_kernel.py) reads dtype/layout off the kernel name,
+# so all 4 A/B/C layouts are supported; dtypes cover fp16 + bf16 (the codecs the
+# bridge runner implements). fp8/bf8/int8 await runner codecs.
+SUPPORTED_DTYPES = ("fp16", "bf16")
+SUPPORTED_LAYOUTS = ("rcr", "rrr", "ccr", "crr")
 
 
 def detect_devices():