[CuTe][SM70] Add comment explaining why int() cast is required for blockIdx coords

Flink-ddd · Flink-ddd · commit f3e2f058a738 · 2026-05-02T15:21:33.000Z
diff --git a/include/cutlass/gemm/kernel/sm70_gemm.hpp b/include/cutlass/gemm/kernel/sm70_gemm.hpp
@@ -210,7 +210,7 @@ static_assert(is_valid_tile_scheduler, "SM70 kernel does not support specializin
     int thread_idx = int(threadIdx.x);
     auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
     auto [m_coord, n_coord, l_coord] = static_cast<uint3>(blockIdx);
-    auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord));                         // (m,n,k,l)
+    auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord));  // (m,n,k,l) NOTE: int() cast is required. blockIdx returns uint3, and passing unsigned coords to make_coord can cause arithmetic underflow when computing tile residues for predication on small problem shapes (e.g. shape < TileShape).
 
     // Represent the full tensors
     Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A), make_shape(M,K,L), params.mainloop.dA); //(m,k,l)
diff --git a/include/cutlass/gemm/kernel/sm70_gemm_array.hpp b/include/cutlass/gemm/kernel/sm70_gemm_array.hpp
@@ -217,7 +217,7 @@ static_assert(is_valid_tile_scheduler, "SM70 kernel does not support specializin
     int thread_idx = int(threadIdx.x);
     auto blk_shape = TileShape{};                                                                // (BLK_M,BLK_N,BLK_K)
     auto [m_coord, n_coord, l_coord] = static_cast<uint3>(blockIdx);
-    auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord));                         // (m,n,k,l)
+    auto blk_coord_mnkl = make_coord(int(m_coord), int(n_coord), _, int(l_coord));  // (m,n,k,l) NOTE: int() cast is required. blockIdx returns uint3, and passing unsigned coords to make_coord can cause arithmetic underflow when computing tile residues for predication on small problem shapes (e.g. shape < TileShape).
 
     // Represent the full tensors
     Tensor mA_mkl = make_tensor(make_gmem_ptr(params.mainloop.ptr_A[l_coord]), make_shape(M,K,1), params.mainloop.dA); //(m,k,l)