[Common] PDL for Blockwise Quantization (NVIDIA#2066)

yaox12 · web-flow · commit ebca61532000 · 2025-08-13T09:54:11.000+08:00
* enable PDL for blockwise qunatization kernels Signed-off-by: Xin Yao <xiny@nvidia.com> * add comment Signed-off-by: Xin Yao <xiny@nvidia.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Signed-off-by: Xin Yao <xiny@nvidia.com>
diff --git a/transformer_engine/common/transpose/quantize_transpose_square_blockwise.cu b/transformer_engine/common/transpose/quantize_transpose_square_blockwise.cu
@@ -14,6 +14,7 @@
 
 #include "common/common.h"
 #include "common/recipe/recipe_common.cuh"
+#include "common/util/cuda_runtime.h"
 #include "common/util/ptx.cuh"
 #include "common/utils.cuh"
 
@@ -167,6 +168,12 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK)
     }
   }
 
+// Trigger the next kernel here so that it's load from global memory can overlap with this kernel's
+// store to global memory.
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+  cudaTriggerProgrammaticLaunchCompletion();
+#endif
+
   // Step 3: Store cast output, Step 4: do transpose within thread tile
   OVecCast tmp_output_c;
 
@@ -390,6 +397,12 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK) block_scaled_cast_transpose
     }
   }
 
+// Trigger the next kernel here so that it's load from global memory can overlap with this kernel's
+// store to global memory.
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+  cudaTriggerProgrammaticLaunchCompletion();
+#endif
+
   // Step 3: Store cast output, Step 4: do transpose within thread tile
   // Edge case: in the non-full tile case, there are three subcases
   // for full thread tile, it's the same thing here
@@ -511,6 +524,15 @@ void quantize_transpose_square_blockwise(const SimpleTensor& input, SimpleTensor
 
   const size_t num_blocks_x = DIVUP(row_length, BLOCK_TILE_DIM);
   const size_t num_blocks_y = DIVUP(num_rows, BLOCK_TILE_DIM);
+  dim3 grid(num_blocks_x, num_blocks_y, 1);
+  cudaLaunchAttribute attribute[1];
+  attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attribute[0].val.programmaticStreamSerializationAllowed = 1;
+  cudaLaunchConfig_t cfg = {grid, THREADS_PER_BLOCK, 0, stream, NULL, 0};
+  if (transformer_engine::cuda::sm_arch(transformer_engine::cuda::current_device()) >= 90) {
+    cfg.attrs = attribute;
+    cfg.numAttrs = 1;
+  }
 
   TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(
       input.dtype, InputType,
@@ -521,7 +543,6 @@ void quantize_transpose_square_blockwise(const SimpleTensor& input, SimpleTensor
           TRANSFORMER_ENGINE_SWITCH_CONDITION(
               return_transpose, kReturnTranspose,
 
-              dim3 grid(num_blocks_x, num_blocks_y, 1);
               const bool full_tile =
                   row_length % BLOCK_TILE_DIM == 0 && num_rows % BLOCK_TILE_DIM == 0;
 
@@ -531,26 +552,28 @@ void quantize_transpose_square_blockwise(const SimpleTensor& input, SimpleTensor
                   tensor_map_output_trans =
                       get_tensor_map<OutputType>(output_t, num_rows, row_length);
                 }
-                block_scaled_cast_transpose_kernel<kReturnTranspose, float, InputType, OutputType>
-                    <<<grid, THREADS_PER_BLOCK, 0, stream>>>(
-                        reinterpret_cast<const InputType*>(input.dptr),
-                        reinterpret_cast<OutputType*>(output.dptr),
-                        reinterpret_cast<OutputType*>(output_t.dptr),
-                        reinterpret_cast<float*>(scale_inv.dptr),
-                        reinterpret_cast<float*>(scale_inv_t.dptr), row_length, num_rows,
-                        scale_stride_x, scale_stride_y, scale_t_stride_x, scale_t_stride_y, epsilon,
-                        tensor_map_output_trans, pow_2_scale);
+                cudaLaunchKernelEx(&cfg,
+                                   block_scaled_cast_transpose_kernel<kReturnTranspose, float,
+                                                                      InputType, OutputType>,
+                                   reinterpret_cast<const InputType*>(input.dptr),
+                                   reinterpret_cast<OutputType*>(output.dptr),
+                                   reinterpret_cast<OutputType*>(output_t.dptr),
+                                   reinterpret_cast<float*>(scale_inv.dptr),
+                                   reinterpret_cast<float*>(scale_inv_t.dptr), row_length, num_rows,
+                                   scale_stride_x, scale_stride_y, scale_t_stride_x,
+                                   scale_t_stride_y, epsilon, tensor_map_output_trans, pow_2_scale);
               } else {
-                block_scaled_cast_transpose_kernel_notaligned<kReturnTranspose, float, InputType,
-                                                              OutputType>
-                    <<<grid, THREADS_PER_BLOCK, 0, stream>>>(
-                        reinterpret_cast<const InputType*>(input.dptr),
-                        reinterpret_cast<OutputType*>(output.dptr),
-                        reinterpret_cast<OutputType*>(output_t.dptr),
-                        reinterpret_cast<float*>(scale_inv.dptr),
-                        reinterpret_cast<float*>(scale_inv_t.dptr), row_length, num_rows,
-                        scale_stride_x, scale_stride_y, scale_t_stride_x, scale_t_stride_y, epsilon,
-                        pow_2_scale);
+                cudaLaunchKernelEx(
+                    &cfg,
+                    block_scaled_cast_transpose_kernel_notaligned<kReturnTranspose, float,
+                                                                  InputType, OutputType>,
+                    reinterpret_cast<const InputType*>(input.dptr),
+                    reinterpret_cast<OutputType*>(output.dptr),
+                    reinterpret_cast<OutputType*>(output_t.dptr),
+                    reinterpret_cast<float*>(scale_inv.dptr),
+                    reinterpret_cast<float*>(scale_inv_t.dptr), row_length, num_rows,
+                    scale_stride_x, scale_stride_y, scale_t_stride_x, scale_t_stride_y, epsilon,
+                    pow_2_scale);
               }  // full-tile
               )  // return_transpose
           )      // OutputType
diff --git a/transformer_engine/common/transpose/quantize_transpose_vector_blockwise.cu b/transformer_engine/common/transpose/quantize_transpose_vector_blockwise.cu
@@ -17,6 +17,7 @@
 #include "common/common.h"
 #include "common/recipe/recipe_common.cuh"
 #include "common/transpose/cast_transpose.h"
+#include "common/util/cuda_runtime.h"
 #include "common/utils.cuh"
 
 namespace transformer_engine {
@@ -234,6 +235,14 @@ __global__ void __launch_bounds__(kThreadsPerBlock) block_scaled_1d_cast_transpo
 
   __syncthreads();
 
+// If not return columnwise, we trigger the next kernel here so that it's load from global memory
+// can overlap with this kernel's return rowwise.
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+  if (!return_columnwise_gemm_ready && !return_columnwise_compact) {
+    cudaTriggerProgrammaticLaunchCompletion();
+  }
+#endif
+
   // Step 2: Cast and store to output_c
   if (return_rowwise) {
     constexpr int r_stride =
@@ -325,6 +334,14 @@ __global__ void __launch_bounds__(kThreadsPerBlock) block_scaled_1d_cast_transpo
     }
   }
 
+// If return columnwise, we trigger the next kernel here so that it's load from global memory
+// can overlap with this kernel's return columnwise.
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+  if (return_columnwise_gemm_ready || return_columnwise_compact) {
+    cudaTriggerProgrammaticLaunchCompletion();
+  }
+#endif
+
   // Step 3 (return_columnwise_gemm_ready): Transpose, cast and store to output_t
   if (return_columnwise_gemm_ready) {
     constexpr int c_stride =
@@ -584,38 +601,49 @@ void quantize_transpose_vector_blockwise(const SimpleTensor& input, SimpleTensor
 
   const size_t num_blocks_x = DIVUP(row_length, (size_t)kTileDim);
   const size_t num_blocks_y = DIVUP(num_rows, (size_t)kTileDim);
+  dim3 grid(num_blocks_x, num_blocks_y, 1);
+  cudaLaunchAttribute attribute[1];
+  attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attribute[0].val.programmaticStreamSerializationAllowed = 1;
 
   TRANSFORMER_ENGINE_TYPE_SWITCH_INPUT(
       input.dtype, InputType,
 
       TRANSFORMER_ENGINE_TYPE_SWITCH_FP8ONLY(
           output.dtype, OutputType,
 
-          dim3 grid(num_blocks_x, num_blocks_y, 1);
-
           const bool full_tile = row_length % kTileDim == 0 && num_rows % kTileDim == 0;
 
           TRANSFORMER_ENGINE_SWITCH_CONDITION(
               full_tile, kAligned,
 
               size_t smem_bytes = kSMemSize * sizeof(InputType);
+
+              cudaLaunchConfig_t cfg = {grid, kThreadsPerBlock, smem_bytes, stream, NULL, 0};
+              if (transformer_engine::cuda::sm_arch(transformer_engine::cuda::current_device()) >=
+                  90) {
+                cfg.attrs = attribute;
+                cfg.numAttrs = 1;
+              }
               // shared memory must be requested up
               if (smem_bytes >= 48 * 1024) {
                 cudaError_t err = cudaFuncSetAttribute(
                     &block_scaled_1d_cast_transpose_kernel<kAligned, float, InputType, OutputType>,
                     cudaFuncAttributeMaxDynamicSharedMemorySize, smem_bytes);
                 NVTE_CHECK(err == cudaSuccess, "Failed to set dynamic shared memory size.");
-              } block_scaled_1d_cast_transpose_kernel<kAligned, float, InputType, OutputType>
-              <<<grid, kThreadsPerBlock, smem_bytes, stream>>>(
-                  reinterpret_cast<const InputType*>(input.dptr),
-                  reinterpret_cast<OutputType*>(output.dptr),
-                  reinterpret_cast<OutputType*>(output_t.dptr),
-                  reinterpret_cast<float*>(scale_inv.dptr),
-                  reinterpret_cast<float*>(scale_inv_t.dptr), row_length, num_rows, scale_stride_x,
-                  scale_stride_y, scale_t_stride_x, scale_t_stride_y, epsilon, rowwise_option,
-                  columnwise_option, pow2_scale);)  // kAligned
-          )                                         // OutputType
-      )                                             // InputType
+              } cudaLaunchKernelEx(&cfg,
+                                   block_scaled_1d_cast_transpose_kernel<kAligned, float, InputType,
+                                                                         OutputType>,
+                                   reinterpret_cast<const InputType*>(input.dptr),
+                                   reinterpret_cast<OutputType*>(output.dptr),
+                                   reinterpret_cast<OutputType*>(output_t.dptr),
+                                   reinterpret_cast<float*>(scale_inv.dptr),
+                                   reinterpret_cast<float*>(scale_inv_t.dptr), row_length, num_rows,
+                                   scale_stride_x, scale_stride_y, scale_t_stride_x,
+                                   scale_t_stride_y, epsilon, rowwise_option, columnwise_option,
+                                   pow2_scale);)  // kAligned
+          )                                       // OutputType
+      )                                           // InputType
   NVTE_CHECK_CUDA(cudaGetLastError());
 }