diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh index 89b96b288b4..af3279b77ad 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cuh +++ b/cpp/tensorrt_llm/kernels/quantization.cuh @@ -897,6 +897,14 @@ quantize_with_block_size( } } } + // PDL completion is reported when every CTA has either exited or called + // cudaTriggerProgrammaticLaunchCompletion() at least once (per CUDA Programming Guide). Without a + // CTA-wide barrier, an early-finishing warp can trigger completion while + // other warps in the same CTA are still writing sf_out / out, allowing the + // downstream NVF4 GEMM consumer to read partial data once + // wait_on_dependent_grids returns. Drain the CTA's stores before the trigger. + __syncthreads(); + __threadfence(); cudaTriggerProgrammaticLaunchCompletion(); #endif }