From e6aeff4dd042c65afe476ac6479265ebe92828a7 Mon Sep 17 00:00:00 2001 From: vensen Date: Sat, 2 May 2026 15:21:33 +0000 Subject: [PATCH] [CuTe][SM70] Add comment explaining why int() cast is required for blockIdx coords --- include/cutlass/gemm/kernel/sm70_gemm.hpp | 2 +- include/cutlass/gemm/kernel/sm70_gemm_array.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cutlass/gemm/kernel/sm70_gemm.hpp b/include/cutlass/gemm/kernel/sm70_gemm.hpp index faa9b1cd7d..d752ba4139 100644 --- a/include/cutlass/gemm/kernel/sm70_gemm.hpp +++ b/include/cutlass/gemm/kernel/sm70_gemm.hpp @@ -210,7 +210,7 @@ static_assert(is_valid_tile_scheduler, "SM70 kernel does not support specializin int thread_idx = int(threadIdx.x); auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) auto [m_coord, n_coord, l_coord] = static_cast<uint3>(blockIdx); - auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord)); // (m,n,k,l) + auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord)); // (m,n,k,l) NOTE: int() cast needed; unsigned coords cause underflow in residue computation for small problem shapes. 
// Represent the full tensors Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l) diff --git a/include/cutlass/gemm/kernel/sm70_gemm_array.hpp b/include/cutlass/gemm/kernel/sm70_gemm_array.hpp index 409ecda15b..e296e09baf 100644 --- a/include/cutlass/gemm/kernel/sm70_gemm_array.hpp +++ b/include/cutlass/gemm/kernel/sm70_gemm_array.hpp @@ -217,7 +217,7 @@ static_assert(is_valid_tile_scheduler, "SM70 kernel does not support specializin int thread_idx = int(threadIdx.x); auto blk_shape = TileShape{}; // (BLK_M,BLK_N,BLK_K) auto [m_coord, n_coord, l_coord] = static_cast<uint3>(blockIdx); - auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord)); // (m,n,k,l) + auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord)); // (m,n,k,l) NOTE: int() cast needed; unsigned coords cause underflow in residue computation for small problem shapes. // Represent the full tensors Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A[l_coord]), make_shape(M,K,1), params.mainloop.dA); //(m,k,l)