From e6aeff4dd042c65afe476ac6479265ebe92828a7 Mon Sep 17 00:00:00 2001
From: vensen <vensenmu@gmail.com>
Date: Sat, 2 May 2026 15:21:33 +0000
Subject: [PATCH] [CuTe][SM70] Add comment explaining why int() cast is
 required for blockIdx coords

---
 include/cutlass/gemm/kernel/sm70_gemm.hpp       | 2 +-
 include/cutlass/gemm/kernel/sm70_gemm_array.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/cutlass/gemm/kernel/sm70_gemm.hpp b/include/cutlass/gemm/kernel/sm70_gemm.hpp
index faa9b1cd7d..d752ba4139 100644
--- a/include/cutlass/gemm/kernel/sm70_gemm.hpp
+++ b/include/cutlass/gemm/kernel/sm70_gemm.hpp
@@ -210,7 +210,7 @@ static_assert(is_valid_tile_scheduler, "SM70 kernel does not support specializin
     int thread_idx = int(threadIdx.x);
     auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
     auto [m_coord, n_coord, l_coord] = static_cast<uint3>(blockIdx);
-    auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord));                         // (m,n,k,l)
+    auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord));  // (m,n,k,l) NOTE: the int() casts are required; with unsigned block coords the residue computation underflows for small problem shapes.
 
     // Represent the full tensors
     Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l)
diff --git a/include/cutlass/gemm/kernel/sm70_gemm_array.hpp b/include/cutlass/gemm/kernel/sm70_gemm_array.hpp
index 409ecda15b..e296e09baf 100644
--- a/include/cutlass/gemm/kernel/sm70_gemm_array.hpp
+++ b/include/cutlass/gemm/kernel/sm70_gemm_array.hpp
@@ -217,7 +217,7 @@ static_assert(is_valid_tile_scheduler, "SM70 kernel does not support specializin
     int thread_idx = int(threadIdx.x);
     auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
     auto [m_coord, n_coord, l_coord] = static_cast<uint3>(blockIdx);
-    auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord));                         // (m,n,k,l)
+    auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord));  // (m,n,k,l) NOTE: the int() casts are required; with unsigned block coords the residue computation underflows for small problem shapes.
 
     // Represent the full tensors
     Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A[l_coord]), make_shape(M,K,1), params.mainloop.dA); //(m,k,l)