NVIDIA
diff --git a/‎.claude/skills/trtllm-moe-develop/SKILL.md‎
Lines changed: 20 additions & 0 deletions b/‎.claude/skills/trtllm-moe-develop/SKILL.md‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 38 additions & 0 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h‎
Lines changed: 18 additions & 5 deletions b/‎cpp/include/tensorrt_llm/batch_manager/capacityScheduler.h‎
Lines changed: 18 additions & 5 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/kvCacheTransferManager.h‎
Lines changed: 12 additions & 2 deletions b/‎cpp/include/tensorrt_llm/batch_manager/kvCacheTransferManager.h‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎cpp/include/tensorrt_llm/batch_manager/llmRequest.h‎
Lines changed: 33 additions & 7 deletions b/‎cpp/include/tensorrt_llm/batch_manager/llmRequest.h‎
Lines changed: 33 additions & 7 deletions
diff --git a/‎cpp/include/tensorrt_llm/common/optionalRef.h‎
Lines changed: 7 additions & 0 deletions b/‎cpp/include/tensorrt_llm/common/optionalRef.h‎
Lines changed: 7 additions & 0 deletions
@@ -268,6 +268,26 @@ Checklist:
 - Existing legacy `forward` methods can be read for compatibility context, but
   they are not the default pattern for new backend work.
 
+### Imported Kernel ABI Checklist
+
+When importing or wrapping an upstream kernel, derive the TRT-LLM adapter
+contract from the lowest-level kernel consumer. Comments, docs, design notes,
+and parameter names are useful hints, but they are not proof of the runtime ABI.
+
+- Derive weight shape and layout from the kernel entrypoint, `make_layout`, TMA,
+  MMA/GEMM transforms, and stride usage. Record required tensor shape, stride,
+  physical storage layout, and boundary view layout.
+- Derive alpha and scale semantics from kernel consumption points. Trace where
+  alpha, norm constants, block scales, activation scales, and weight scales are
+  loaded and multiplied before deciding how upper layers compute or pack them.
+  Treat weight bytes, block scales/SF, and global alpha/norm constants as
+  separate contracts.
+- Design the upper-layer adapter from the kernel ABI upward. Map each kernel
+  input/output to an adapter responsibility: storage tensor, view/transposition,
+  dtype reinterpretation, padding, scale packing, workspace ownership,
+  synchronization, and output reduction. Validate parity with upstream
+  invocation dumps, not just final output.
+
 ### Quantization And Weights
 
 Role:
 
@@ -302,6 +302,25 @@ common-files: &common_files |
         tensorrt_llm/_torch/cute_dsl_kernels/blackwell/custom_pipeline.py |
         tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py |
         tensorrt_llm/_torch/cute_dsl_kernels/blackwell/utils.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/__init__.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/blocked_scale.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/contract.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/custom_ext.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/dynamic_mainloop.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/epilogue_refactor.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/fc1_fc2_fuse_sched.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/grid_sync.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/iket_compat.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/kernel_fc12.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/megamoe_constants.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/megamoe_kernel.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/moe_persistent_scheduler.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/moe_utils.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/ptx_helpers.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/sf_swizzle.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/sym_buffer.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/token_comm.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/topk_reduce.py |
         tensorrt_llm/_torch/cute_dsl_utils.py |
         tensorrt_llm/_torch/debug/__init__.py |
         tensorrt_llm/_torch/debug/debug_hook.py |
@@ -1658,6 +1677,25 @@ legacy-files: &legacy_files |
         tensorrt_llm/_torch/cute_dsl_kernels/blackwell/custom_pipeline.py |
         tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_blockscaled_gemm_persistent.py |
         tensorrt_llm/_torch/cute_dsl_kernels/blackwell/utils.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/__init__.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/blocked_scale.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/contract.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/custom_ext.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/dynamic_mainloop.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/epilogue_refactor.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/fc1_fc2_fuse_sched.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/grid_sync.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/iket_compat.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/kernel_fc12.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/megamoe_constants.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/megamoe_kernel.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/moe_persistent_scheduler.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/moe_utils.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/ptx_helpers.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/sf_swizzle.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/sym_buffer.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/token_comm.py |
+        tensorrt_llm/_torch/cute_dsl_kernels/mega_moe_nvfp4/topk_reduce.py |
         tensorrt_llm/_torch/cute_dsl_utils.py |
         tensorrt_llm/_torch/debug/__init__.py |
         tensorrt_llm/_torch/debug/debug_hook.py |
 
@@ -87,7 +87,11 @@ class MaxRequestsScheduler : public BaseCapacityScheduler
 
 /// @brief   Schedule requests using the MAX_UTILIZATION policy
 /// @details Try reserving resources to advance requests by one step,
-///          may pause previously started requests.
+///          may pause previously started requests.  When a
+///          ``crossKvCacheManager`` is supplied, requests in the
+///          ``ENCODER_INIT`` state may be admitted for encoder compute
+///          without consuming self- or cross-KV blocks; the later
+///          ``CONTEXT_INIT`` decoder admission owns cross-pool budgeting.
 class MaxUtilizationScheduler : public BaseCapacityScheduler
 {
 public:
@@ -96,8 +100,9 @@ class MaxUtilizationScheduler : public BaseCapacityScheduler
         LlmRequestState noScheduleAfterState = LlmRequestState::kGENERATION_COMPLETE);
 
     [[nodiscard]] std::tuple<RequestVector, RequestVector> operator()(
-        kv_cache_manager::BaseKVCacheManager& kvCacheManager, OptionalRef<BasePeftCacheManager const> peftCacheManager,
-        RequestList const& activeRequests) const;
+        kv_cache_manager::BaseKVCacheManager& kvCacheManager,
+        OptionalRef<kv_cache_manager::BaseKVCacheManager> crossKvCacheManager,
+        OptionalRef<BasePeftCacheManager const> peftCacheManager, RequestList const& activeRequests) const;
 
 private:
     SizeType32 mMaxNumRequests;
@@ -106,6 +111,10 @@ class MaxUtilizationScheduler : public BaseCapacityScheduler
 };
 
 /// @brief Schedule requests using the GUARANTEED_NO_EVICT policy
+/// @details When a ``crossKvCacheManager`` is supplied, requests in the
+///          ``ENCODER_INIT`` state may be admitted for encoder compute
+///          without consuming self- or cross-KV blocks.  The later
+///          ``CONTEXT_INIT`` decoder admission owns cross-pool budgeting.
 class GuaranteedNoEvictScheduler : public BaseCapacityScheduler
 {
 public:
@@ -158,7 +167,11 @@ class CapacityScheduler : public Algorithm
      *
      * @param kvCacheManager Required in MaxUtilizationScheduler (as a ref) and in GuaranteedNoEvictScheduler and
      * StaticBatchScheduler (as a const ref).
-     * @param crossKvCacheManager Optional used in GuaranteedNoEvictScheduler and StaticBatchScheduler.
+     * @param crossKvCacheManager Optional cross-attention KV cache manager.  Used by
+     * MaxUtilizationScheduler (mutates: ``startScheduling`` / ``schedulingRemoveSequence``)
+     * and GuaranteedNoEvictScheduler / StaticBatchScheduler (read-only).  Required for
+     * encoder-decoder admission. Encoder-init requests only require this pool
+     * to be configured; decoder context admission budgets blocks from it.
      * @param peftCacheManager Optional used in MaxUtilizationScheduler, GuaranteedNoEvictScheduler and
      * StaticBatchScheduler.
      * @param activeRequests
@@ -168,7 +181,7 @@ class CapacityScheduler : public Algorithm
     [[nodiscard]] std::tuple<RequestVector, RequestVector, RequestVector> operator()(RequestList const& activeRequests,
         OptionalRef<kv_cache_manager::BaseKVCacheManager> kvCacheManager = std::nullopt,
         OptionalRef<BasePeftCacheManager const> peftCacheManager = std::nullopt,
-        OptionalRef<kv_cache_manager::BaseKVCacheManager const> crossKvCacheManager = std::nullopt) const;
+        OptionalRef<kv_cache_manager::BaseKVCacheManager> crossKvCacheManager = std::nullopt) const;
 
     /// @brief Sets the reorder policy to use AgentTreePolicy with the given configuration.
     /// @param agentPercentage The ratio of agent requests to schedule (0.0-1.0, -1.0 for random).
 
@@ -24,6 +24,11 @@ namespace kvc = tensorrt_llm::executor::kv_cache;
 
 #pragma once
 
+namespace tensorrt_llm::testing
+{
+class KVCacheTransferManagerTestAccess; // Forward declaration; befriended by KVCacheTransferManager below.
+} // namespace tensorrt_llm::testing
+
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
@@ -76,10 +81,15 @@ class KVCacheTransferManager
     [[nodiscard]] KvCacheTransferStats getAndResetTransferStats();
 
 private:
+    friend class ::tensorrt_llm::testing::KVCacheTransferManagerTestAccess;
+
     //! \brief Get pointer to pool specified by cache block.
     static tr::ITensor::SharedPtr computeBlockPointer(
         BlockPtr const& block, std::vector<KVCacheBlockPool> const& pools, size_t poolIdx);
 
+    //! \brief Get pool-qualified index for pending transfer tracking.
+    [[nodiscard]] static kernels::KVCacheIndex::UnderlyingType getPendingTransferIndex(BlockPtr const& block);
+
     /*!
      * \brief The key method that copies the src block to the dst block.
      *
@@ -107,8 +117,8 @@ class KVCacheTransferManager
     runtime::BufferManager mOnboardManager;
     runtime::BufferManager mOffloadManager;
 
-    // Track reads and writes for blocks. Note that it is the memory pool index that
-    // identifies the raw memory blocks involved in I/O, not the block Id.
+    // Track reads and writes for blocks. Note that it is the pool-qualified memory pool index
+    // that identifies the raw memory blocks involved in I/O, not the block Id.
     std::unordered_map<kernels::KVCacheIndex::UnderlyingType, tr::CudaEvent> mPendingReads;
     std::unordered_map<kernels::KVCacheIndex::UnderlyingType, tr::CudaEvent> mPendingWrites;
     // Reference to parent loopback agent
 
@@ -665,9 +665,9 @@ class GenericLlmRequest
         return mEncoderUniqueTokens;
     }
 
-    /// @brief Get length of encoder input (could be tokens or features length)
-    /// @return An integer.
-    [[nodiscard]] SizeType32 getEncoderInputLen() const
+    /// @brief Get length of encoder input when present, without throwing for decoder-only requests.
+    /// @return Encoder input length, or nullopt when this request has no encoder side.
+    [[nodiscard]] std::optional<SizeType32> tryGetEncoderInputLen() const
     {
         if (mEncoderInputFeatures.has_value())
         {
@@ -678,19 +678,45 @@ class GenericLlmRequest
             return getEncoderTokens().value()->size();
         }
 
-        TLLM_THROW("GenericLlmRequest::getEncoderInputLen - Do not have encoder length!");
+        return std::nullopt;
     }
 
-    /// @brief Get length of encoder output. Fall back to encoder input length if not present
+    /// @brief Get length of encoder input (could be tokens or features length)
     /// @return An integer.
-    [[nodiscard]] SizeType32 getEncoderOutputLen() const
+    [[nodiscard]] SizeType32 getEncoderInputLen() const
+    {
+        // Delegate to the non-throwing variant; only a missing length is an error here.
+        if (auto const maybeLen = tryGetEncoderInputLen(); maybeLen)
+        {
+            return *maybeLen;
+        }
+
+        TLLM_THROW("GenericLlmRequest::getEncoderInputLen - Do not have encoder length!");
+    }
+
+    /// @brief Get length of encoder output when present, without throwing for decoder-only requests.
+    /// @return Explicit output length if set, else the encoder input length; nullopt when neither is present.
+    [[nodiscard]] std::optional<SizeType32> tryGetEncoderOutputLen() const
     {
         if (mEncoderOutputLength.has_value())
         {
             return mEncoderOutputLength.value();
         }
 
-        return getEncoderInputLen();
+        return tryGetEncoderInputLen();
+    }
+
+    /// @brief Get length of encoder output, or throw if the request has no encoder side.
+    /// @return Explicit encoder output length, or encoder input length when the output length is not present.
+    [[nodiscard]] SizeType32 getEncoderOutputLen() const
+    {
+        auto const encoderOutputLen = tryGetEncoderOutputLen();
+        if (encoderOutputLen.has_value())
+        {
+            return encoderOutputLen.value();
+        }
+
+        TLLM_THROW("GenericLlmRequest::getEncoderOutputLen - Do not have encoder length!");
     }
 
     [[nodiscard]] std::optional<std::shared_ptr<std::vector<SizeType32>>> getPositionIds() const
 
@@ -78,6 +78,13 @@ class OptionalRef
     {
     }
 
+    // Converting constructor (intentionally implicit): OptionalRef<T> -> OptionalRef<T const>; enabled only for const T.
+    template <typename U = T, typename = std::enable_if_t<std::is_const_v<U>>>
+    OptionalRef(OptionalRef<std::remove_const_t<T>> const& other)
+        : opt(other ? std::optional<std::reference_wrapper<T>>(std::cref(*other)) : std::nullopt)
+    {
+    }
+
     T* operator->() const
     {
         return opt ? &(opt->get()) : nullptr;
Original file line number	Diff line number	Diff line change
`@@ -78,6 +78,13 @@ class OptionalRef`
`78`	`78`	`{`
`79`	`79`	`}`
`80`	`80`
	`81`	`+ // Implicit conversion from OptionalRef<non-const T> to OptionalRef<const T>`
	`82`	`+ template <typename U = T, typename = std::enable_if_t<std::is_const_v<U>>>`
	`83`	`+ OptionalRef(OptionalRef<std::remove_const_t<T>> const& other)`
	`84`	`+ : opt(other ? std::optional<std::reference_wrapper<T>>(std::ref(*other)) : std::nullopt)`
	`85`	`+ {`
	`86`	`+ }`
	`87`	`+`
`81`	`88`	`T* operator->() const`
`82`	`89`	`{`
`83`	`90`	`return opt ? &(opt->get()) : nullptr;`