Fix the race in the dbias computation in the MXFP8 quantization and grouped quantization kernels (NVIDIA#2921)

ptrendx · web-flow · commit 5d947a037757 · 2026-04-23T20:44:35.000-07:00
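Fix the race in the dbias computation: when COLWISE_SCALING is false, dshmem is reused as the partial dbias buffer, so call ptx::cp_async_bulk_wait_group_read<0>() followed by __syncthreads() before that reuse.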

Signed-off-by: Przemek Tredak <ptredak@nvidia.com>
diff --git a/transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh b/transformer_engine/common/cast/mxfp8/group_quantize_mxfp8.cuh
@@ -713,6 +713,8 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK) group_quantize_mxfp8_kernel
         if constexpr (COLWISE_SCALING) {
           thread_partial_dbias = partial_dbias_colwise;
         } else {
+          ptx::cp_async_bulk_wait_group_read<0>();
+          __syncthreads();
           float *partial_dbias_rowwise = reinterpret_cast<float *>(dshmem);
 
           constexpr size_t DBIAS_BUFF_WIDTH = THREADS_X * (SCALE_DIM_X + 1);
diff --git a/transformer_engine/common/cast/mxfp8/quantize_mxfp8.cuh b/transformer_engine/common/cast/mxfp8/quantize_mxfp8.cuh
@@ -498,6 +498,8 @@ __global__ void __launch_bounds__(THREADS_PER_CHUNK)
     if constexpr (COLWISE_SCALING) {
       thread_partial_dbias = partial_dbias_colwise;
     } else {
+      ptx::cp_async_bulk_wait_group_read<0>();
+      __syncthreads();
       // Reusing dshmem (in_sh) as dbias buffer [HEIGHT x WIDTH]
       // HEIGHT = THREADS_Y
       // WIDTH = THREADS_X * (SCALE_DIM_X + 1)