NVIDIA · tianyuxbear · May 26, 2026 · Jun 11, 2026
diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh
@@ -897,6 +897,14 @@ quantize_with_block_size(
             }
         }
     }
+    // PDL completion is reported once every CTA has either exited or called
+    // cudaTriggerProgrammaticLaunchCompletion() at least once (per the CUDA
+    // Programming Guide). Without a CTA-wide barrier, an early-finishing warp
+    // can trigger it while other warps in the same CTA are still writing
+    // sf_out / out, letting the downstream NVF4 GEMM consumer read partial data
+    // once wait_on_dependent_grids returns. Drain the CTA's stores first.
+    __syncthreads();
+    __threadfence();
     cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }