NVIDIA
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheTransferManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheTransferManager.h
Lines changed: 12 additions & 2 deletions
diff --git a/cpp/kernels/fmha_v2/src/fmha/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/kernel_traits.h
Lines changed: 29 additions & 15 deletions
diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec_sm120/README.md b/cpp/kernels/fmha_v2/src/fmha/warpspec_sm120/README.md
Lines changed: 124 additions & 0 deletions
@@ -10,7 +10,7 @@ TensorRT LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-13.1.1-green)](https://developer.nvidia.com/cuda-downloads)
 [![torch](https://img.shields.io/badge/torch-2.10.0-green)](https://pytorch.org)
-[![version](https://img.shields.io/badge/release-1.3.0rc18-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.3.0rc19-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
 
 [Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://nvidia.github.io/TensorRT-LLM/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
 
@@ -24,6 +24,11 @@ namespace kvc = tensorrt_llm::executor::kv_cache;
 
 #pragma once
 
+namespace tensorrt_llm::testing
+{
+class KVCacheTransferManagerTestAccess;
+} // namespace tensorrt_llm::testing
+
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
@@ -76,10 +81,15 @@ class KVCacheTransferManager
     [[nodiscard]] KvCacheTransferStats getAndResetTransferStats();
 
 private:
+    friend class ::tensorrt_llm::testing::KVCacheTransferManagerTestAccess;
+
     //! \brief Get pointer to pool specified by cache block.
     static tr::ITensor::SharedPtr computeBlockPointer(
         BlockPtr const& block, std::vector<KVCacheBlockPool> const& pools, size_t poolIdx);
 
+    //! \brief Get pool-qualified index for pending transfer tracking.
+    [[nodiscard]] static kernels::KVCacheIndex::UnderlyingType getPendingTransferIndex(BlockPtr const& block);
+
     /*!
      * \brief The key method that copies the src block to the dst block.
      *
@@ -107,8 +117,8 @@ class KVCacheTransferManager
     runtime::BufferManager mOnboardManager;
     runtime::BufferManager mOffloadManager;
 
-    // Track reads and writes for blocks. Note that it is the memory pool index that
-    // identifies the raw memory blocks involved in I/O, not the block Id.
+    // Track reads and writes for blocks. Note that it is the pool-qualified memory pool index
+    // that identifies the raw memory blocks involved in I/O, not the block Id.
     std::unordered_map<kernels::KVCacheIndex::UnderlyingType, tr::CudaEvent> mPendingReads;
     std::unordered_map<kernels::KVCacheIndex::UnderlyingType, tr::CudaEvent> mPendingWrites;
     // Reference to parent loopback agent
 
@@ -145,7 +145,9 @@ template <
     // Do we use half epilogue for the 2nd GEMM (hmma_fp32)
     bool BMM2_FP16_EPILOGUE = true,
     // non-positive means disabled
-    int SAGE_BLOCK_SIZE_Q_ = 0, int SAGE_BLOCK_SIZE_K_ = 0, int SAGE_BLOCK_SIZE_V_ = 0>
+    int SAGE_BLOCK_SIZE_Q_ = 0, int SAGE_BLOCK_SIZE_K_ = 0, int SAGE_BLOCK_SIZE_V_ = 0,
+    // Enable skip softmax attention feature.
+    bool ENABLE_SKIP_SOFTMAX_ = false>
 struct Kernel_traits_
 {
 
@@ -197,6 +199,9 @@ struct Kernel_traits_
         SAGE_BLOCK_SIZE_V = SAGE_BLOCK_SIZE_V_
     };
 
+    // Are we enabling skip softmax attention feature?
+    static constexpr bool ENABLE_SKIP_SOFTMAX = ENABLE_SKIP_SOFTMAX_;
+
     // TODO: expose these tiling params to the interface
     enum
     {
@@ -1005,10 +1010,13 @@ template <
     // The output type.
     typename OutputType = typename Traits::A_type,
     // The sage attention block size for Q, K and V
-    int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0>
+    int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0,
+    // Enable skip softmax attention feature.
+    bool ENABLE_SKIP_SOFTMAX = false>
 using Kernel_traits_v2 = Kernel_traits_<Traits, fmha::v2::Gmem_tile_qkv, fmha::v2::Gmem_tile_qkv,
     fmha::v2::Gmem_tile_qkv, Gmem_tile_o_dispatcher<Traits, OutputType>::Gmem_tile_o, S, D, DV, STEP, WARPS_M, WARPS_N,
-    CTAS_PER_HEAD, FLAGS, 2, MASK_VERSION, BMM2_FP16_EPILOGUE, SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K, SAGE_BLOCK_SIZE_V>;
+    CTAS_PER_HEAD, FLAGS, 2, MASK_VERSION, BMM2_FP16_EPILOGUE, SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K, SAGE_BLOCK_SIZE_V,
+    ENABLE_SKIP_SOFTMAX>;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -1038,11 +1046,13 @@ template <
     // The output type.
     typename OutputType = typename Traits::A_type,
     // The sage attention block size for Q, K and V
-    int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0>
-using Kernel_traits_v2_q_k_v
-    = Kernel_traits_<Traits, fmha::v2::Gmem_tile_q_k_v, fmha::v2::Gmem_tile_q_k_v, fmha::v2::Gmem_tile_q_k_v,
-        Gmem_tile_o_dispatcher<Traits, OutputType>::Gmem_tile_o, S, D, DV, STEP, WARPS_M, WARPS_N, CTAS_PER_HEAD, FLAGS,
-        2, MASK_VERSION, BMM2_FP16_EPILOGUE, SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K, SAGE_BLOCK_SIZE_V>;
+    int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0,
+    // Enable skip softmax attention feature.
+    bool ENABLE_SKIP_SOFTMAX = false>
+using Kernel_traits_v2_q_k_v = Kernel_traits_<Traits, fmha::v2::Gmem_tile_q_k_v, fmha::v2::Gmem_tile_q_k_v,
+    fmha::v2::Gmem_tile_q_k_v, Gmem_tile_o_dispatcher<Traits, OutputType>::Gmem_tile_o, S, D, DV, STEP, WARPS_M,
+    WARPS_N, CTAS_PER_HEAD, FLAGS, 2, MASK_VERSION, BMM2_FP16_EPILOGUE, SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K,
+    SAGE_BLOCK_SIZE_V, ENABLE_SKIP_SOFTMAX>;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -1072,11 +1082,13 @@ template <
     // The output type.
     typename OutputType = typename Traits::A_type,
     // The sage attention block size for Q, K and V
-    int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0>
-using Kernel_traits_v2_paged_kv_cache
-    = Kernel_traits_<Traits, fmha::v2::Gmem_tile_q_k_v, fmha::v2::Gmem_tile_paged_kv, fmha::v2::Gmem_tile_paged_kv,
-        Gmem_tile_o_dispatcher<Traits, OutputType>::Gmem_tile_o, S, D, DV, STEP, WARPS_M, WARPS_N, CTAS_PER_HEAD, FLAGS,
-        2, MASK_VERSION, BMM2_FP16_EPILOGUE, SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K, SAGE_BLOCK_SIZE_V>;
+    int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0,
+    // Enable skip softmax attention feature.
+    bool ENABLE_SKIP_SOFTMAX = false>
+using Kernel_traits_v2_paged_kv_cache = Kernel_traits_<Traits, fmha::v2::Gmem_tile_q_k_v, fmha::v2::Gmem_tile_paged_kv,
+    fmha::v2::Gmem_tile_paged_kv, Gmem_tile_o_dispatcher<Traits, OutputType>::Gmem_tile_o, S, D, DV, STEP, WARPS_M,
+    WARPS_N, CTAS_PER_HEAD, FLAGS, 2, MASK_VERSION, BMM2_FP16_EPILOGUE, SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K,
+    SAGE_BLOCK_SIZE_V, ENABLE_SKIP_SOFTMAX>;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -1106,11 +1118,13 @@ template <
     // The output type.
     typename OutputType = typename Traits::A_type,
     // The sage attention block size for Q, K and V
-    int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0>
+    int SAGE_BLOCK_SIZE_Q = 0, int SAGE_BLOCK_SIZE_K = 0, int SAGE_BLOCK_SIZE_V = 0,
+    // Enable skip softmax attention feature.
+    bool ENABLE_SKIP_SOFTMAX = false>
 using Kernel_traits_v2_contiguous_kv_cache = Kernel_traits_<Traits, fmha::v2::Gmem_tile_q_k_v,
     fmha::v2::Gmem_tile_contiguous_kv, fmha::v2::Gmem_tile_contiguous_kv,
     Gmem_tile_o_dispatcher<Traits, OutputType>::Gmem_tile_o, S, D, 0, STEP, WARPS_M, WARPS_N, CTAS_PER_HEAD, FLAGS, 2,
-    MASK_VERSION, BMM2_FP16_EPILOGUE, SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K, SAGE_BLOCK_SIZE_V>;
+    MASK_VERSION, BMM2_FP16_EPILOGUE, SAGE_BLOCK_SIZE_Q, SAGE_BLOCK_SIZE_K, SAGE_BLOCK_SIZE_V, ENABLE_SKIP_SOFTMAX>;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 
@@ -0,0 +1,124 @@
+# skip_softmax — TMA-load + sync-MMA warp-specialized FMHA for sm_120 / sm_121
+
+> This is the sm_120 / sm_121 warp-specialized context FMHA that carries the
+> per-warp **skip-softmax** optimization (hence the name). Only half of the
+> Hopper warp-specialization recipe ports to consumer Blackwell: TMA-driven
+> async loads survive, but async MMA does not (sm_120 / sm_121 have no
+> `wgmma.async` equivalent), so the compute warps stay on `mma.sync` while a
+> dedicated producer warp drives the loads with TMA.
+
+This directory implements a warp-specialized context FMHA for the sm_120
+family (sm_120 / sm_121). It targets BF16, causal mask, `head_dim ==
+head_dim_v` in `{128, 256}`, and the PACKED_QKV layout. The kernel carries the
+per-warp skip-softmax optimization into the warp-specialized design.
+
+## Files
+
+| File | Role |
+|------|------|
+| `kernel_traits.h` | `Kernel_traits_skip_softmax_sm120`: wraps `fmha::Kernel_traits_v2` for the LDGSTS-friendly `Smem_tile_*` types, then layers on the producer/consumer warp roles, the granular smem buffers, the circular-buffer barriers, and the V re-tile (see below). |
+| `dma_sync_mma.h` | Producer (`DMA::run`). Issues `cp.async.bulk.tensor.3d.shared::cta.global.tile` for Q / K / V into the granular buffers. `DMA::Host::init_params` builds the three `CUtensorMap` descriptors with the driver-API `cuTensorMapEncodeTiled`. |
+| `compute_sync_mma.h` | Consumer (`Compute::run`). The kv-loop body — BMM1 (`fmha::gemm`) + softmax + causal mask + per-warp skip-softmax vote + BMM2 + epilogue — reading the granular `Smem_tile_q/k/v` per ring slot. |
+
+The translation unit and the in-engine dispatch bridges live in
+`cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/skip_softmax_sm120/fused_multihead_flash_attention_ws_sm120.cu`,
+and the entry kernel in
+`cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_ws_sm120.h`.
+
+## How the runner reaches this kernel
+
+This is the **default** sm_120 / sm_121 context FMHA — there is no opt-in flag.
+`FusedMultiHeadAttentionXMMAKernelV2::run` dispatches a prefill to the
+`run_skip_softmax_*` bridges when its config matches (sm_120 / sm_121, BF16
+in/out, causal, `head_dim == head_dim_v` in `{128, 256}`, PACKED_QKV) **and** it
+uses no feature the kernel does not implement (alibi, logit softcapping, sage
+attention, sliding-window / custom mask, returning softmax stats, interleaved).
+Every other config falls through to the cubin/launcher path.
+
+The per-tile skip-softmax optimization is selected by
+`Launch_params::enableSkipSoftmax` (set when a skip-softmax threshold `> 0` is
+configured): the bridges instantiate the `ENABLE_SKIP_SOFTMAX = true` kernel
+variant when skipping is requested, and the `false` variant — a plain
+full-softmax prefill with no skip-check overhead — otherwise.
+
+The translation unit is compiled only into the `_context_attention_kernels_120`
+CMake target (sm_120 family). The all-architecture dispatch TU references the
+bridge symbols under `TLLM_ENABLE_SKIP_SOFTMAX_SM120`, which CMake defines only when
+sm_120 is built, so builds that exclude sm_120 neither reference nor link the
+(then-absent) symbols.
+
+## Design rationale
+
+### Why TMA loads, not "just split the warps"
+
+In the non-warp-specialized tiled kernel, the Q / K / V loads are *multi-thread*
+LDGSTS operations: each of the 128 threads issues several `LDGSTS` instructions
+to cover `(tile rows × D bytes)`. There is no way to "have warp 0 do the load"
+without rewriting the gmem/smem tile load helpers — the partition is baked into
+them. TMA fixes exactly this: a single descriptor + a single
+`cp.async.bulk.tensor` from one thread issues an entire tile load, and the
+consumers wait on an `mbarrier`. So the producer warp uses TMA, not LDGSTS.
+
+### TMA descriptor format
+
+Blackwell's TMA engine requires the driver-API `cuTensorMapEncodeTiled`
+(128-byte `CUtensorMap`) descriptor — the same form the shipping
+trtllmGenKernels FMHA uses. The fmha_v2 hand-rolled 64-byte `fmha::cudaTmaDesc`
+(Hopper-era bit layout) is rejected and faults at `UTMALDG`. The descriptors
+are built host-side in `DMA::Host::init_params` and passed to the kernel as
+`__grid_constant__` params.
+
+### Why the LDGSTS smem tiles can be filled by TMA
+
+The make-or-break question for reusing the existing consumer `Smem_tile_*` is
+whether their LDGSTS XOR swizzle equals a TMA hardware swizzle mode. It does:
+the Q and K granular tiles use `BYTES_PER_ROW = 128`, `BYTES_PER_STS = 16`,
+`ROWS_PER_XOR_PATTERN = 8`, i.e. a physical 16-byte chunk index of
+`(col / 8) ^ (row % 8)` — byte-identical to the TMA 128B hardware swizzle. So a
+chunked 128B-swizzle TMA load fills `Smem_tile_q/k` directly and the consumer's
+`ldmatrix` reads correct data.
+
+### V is re-tiled to 64-wide DV chunks
+
+The natural `Smem_tile_v` packs the full `DV` (256) into the lead dim, giving
+512-byte smem rows that no TMA swizzle mode can reproduce (`cuTensorMapEncodeTiled`
+caps the leading box dim at the 128-byte swizzle width; a 512-byte leading dim
+only encodes with `SWIZZLE_NONE`, which is plain row-major and does not match
+the consumer's XOR-swizzled read). Instead, V is tiled into `BMM2_DV_CHUNK = 64`
+wide groups so the V smem tile has `LEAD_DIM = 64` → 128-byte rows — the same
+proven layout as K — and the existing `N == 64` `ldsmt` read path applies
+unchanged. The producer streams `DV / 64` dv-chunks per kv-tile; the consumer
+BMM2 contracts per dv-chunk into the corresponding `acc_o` sub-range.
+
+### `setmaxnreg` is unavailable here
+
+`setmaxnreg.{dec,inc}` is a Hopper / datacenter-Blackwell instruction
+(sm_90 / sm_100 / sm_103); ptxas hard-errors on sm_120 / sm_121. The producer/consumer
+register-budget split therefore does not exist on this hardware and is guarded
+off (no-op on sm_120 / sm_121).
+
+## What the port wins, and what it does not
+
+Wins on sm_120 / sm_121:
+
+- **Fewer load instructions** — one `cp.async.bulk.tensor` per tile replaces
+  the many per-thread `LDGSTS` of the tiled kernel.
+- **Per-buffer-slot waits** (`mbarrier`) instead of CTA-wide `__syncthreads()`
+  between load and compute: a consumer warp unblocks as soon as its tile lands.
+
+Does not win:
+
+- **MMA / softmax overlap** — there is no `wgmma.async` on sm_120 / sm_121, so a
+  consumer warp's `mma.sync` blocks its issuing thread until the result registers
+  commit. The Hopper warpspec kernel hides BMM1/BMM2 MMA latency behind
+  softmax/`frag_p` work; that is not achievable with sync MMA alone.
+- **Register-budget split** — `setmaxnreg` is unavailable (see above).
+
+## Relationship to a CuTe-DSL kernel
+
+CUTLASS 4.x has Blackwell sm_120 FMHA examples implementing the TMA-load +
+sync-MMA pattern in CuTe DSL. A longer-term direction is to route the sm_120 /
+sm_121 dispatch into a CuTe-DSL kernel. This fmha_v2 implementation shows how
+the existing fmha_v2 infrastructure maps onto that design. It is also
+self-contained: the dispatch is gated, and the directory plus the entry-kernel
+header are isolated (no other code includes them).