NVIDIA
diff --git a/‎cpp/tensorrt_llm/common/attentionOp.cpp‎
Lines changed: 1 addition & 0 deletions b/‎cpp/tensorrt_llm/common/attentionOp.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎cpp/tensorrt_llm/common/attentionOp.h‎
Lines changed: 10 additions & 7 deletions b/‎cpp/tensorrt_llm/common/attentionOp.h‎
Lines changed: 10 additions & 7 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h‎
Lines changed: 30 additions & 2 deletions b/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h‎
Lines changed: 30 additions & 2 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h‎
Lines changed: 1 addition & 0 deletions b/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h‎
Lines changed: 1 addition & 0 deletions
@@ -3157,6 +3157,7 @@ int AttentionOp::initialize() noexcept
         fixedParams.isSpecDecoding = mIsSpecDecodingEnabled;
         fixedParams.hasAlibi = isALiBi();
         fixedParams.useTllmGenSparseAttention = useTllmGenSparseAttention();
+        fixedParams.specDecodingTargetMaxGenLen = mSpecDecodingTargetMaxGenLen;
 
         mXqaDispatcher.reset(new XqaDispatcher(fixedParams));
 
 
@@ -491,6 +491,8 @@ class AttentionOp
     bool mIsSpecDecTree = true;
     bool mSpecDecodingIsGenerationLengthVariable = false;
     int32_t mSpecDecodingMaxGenerationLength = 1;
+    // Static spec-dec tree length used by FMHA autotuning.
+    int32_t mSpecDecodingTargetMaxGenLen = 0;
     bool mIsMLAEnabled = false;
     bool mIsGenerationMLA = false;
     bool mUseGenFlashMLA = false;
@@ -559,13 +561,14 @@ class AttentionOp
             mCrossAttention, mMaxDistance, mPosShiftEnabled, mPagedContextFMHA, mFP8ContextFMHA, mFP8AttenOutput,
             mFP8ContextMLA, mFP8GenerationMLA, mChunkPrefillBufferBatchSize, mDenseContextFMHA, mHasFullAttentionMask,
             mIsSpecDecodingEnabled, mUseSpecDecoding, mIsSpecDecTree, mSpecDecodingIsGenerationLengthVariable,
-            mSpecDecodingMaxGenerationLength, mIsMLAEnabled, mIsGenerationMLA, mUseGenFlashMLA, mUseSparseAttention,
-            mUseTllmGenSparseAttentionPaged, mUseTllmGenSparseAttention, mMLAParams.data(), mCpSize, mCpRank, mCpGroup,
-            mNumAttnHeads, mNumAttnKVHeads, mNumKVHeadsOrigin, mAttnTpSize, mAttnTpRank, mAttnCpSize, mAttnCpRank,
-            mUlyssesMQABroadcast, mEnableContextFMHA, mFMHAForceFP32Acc, mMultiBlockMode, mEnableXQA, mUseKVCache,
-            mSkipAttn, mFuseFp4Quant, mNbMultiBlockSemaphores, mAttentionChunkSize.value_or(-1),
-            mSkipSoftmaxThresholdScaleFactorPrefill, mSkipSoftmaxThresholdScaleFactorDecode, mSageAttnNumEltsPerBlkQ,
-            mSageAttnNumEltsPerBlkK, mSageAttnNumEltsPerBlkV, mSageAttnQkInt8);
+            mSpecDecodingMaxGenerationLength, mSpecDecodingTargetMaxGenLen, mIsMLAEnabled, mIsGenerationMLA,
+            mUseGenFlashMLA, mUseSparseAttention, mUseTllmGenSparseAttentionPaged, mUseTllmGenSparseAttention,
+            mMLAParams.data(), mCpSize, mCpRank, mCpGroup, mNumAttnHeads, mNumAttnKVHeads, mNumKVHeadsOrigin,
+            mAttnTpSize, mAttnTpRank, mAttnCpSize, mAttnCpRank, mUlyssesMQABroadcast, mEnableContextFMHA,
+            mFMHAForceFP32Acc, mMultiBlockMode, mEnableXQA, mUseKVCache, mSkipAttn, mFuseFp4Quant,
+            mNbMultiBlockSemaphores, mAttentionChunkSize.value_or(-1), mSkipSoftmaxThresholdScaleFactorPrefill,
+            mSkipSoftmaxThresholdScaleFactorDecode, mSageAttnNumEltsPerBlkQ, mSageAttnNumEltsPerBlkK,
+            mSageAttnNumEltsPerBlkV, mSageAttnQkInt8);
     };
 
 private:
 
@@ -997,6 +997,8 @@ class TllmGenFmhaKernel
         options.mIsCustomSpecDecodingGen = !isContext && params.mMaxSeqLenQ > 1 && params.mIsSpecDecTree;
         options.mIsCausalSpecDecodingGen = !isContext && params.mMaxSeqLenQ > 1 && !params.mIsSpecDecTree;
         options.mNumSpecDecodingTokens = !isContext && params.mMaxSeqLenQ > 1 ? params.mMaxSeqLenQ : 0;
+        // Carry static tree length into FMHA kernel selection.
+        options.mSpecDecodingTargetMaxGenLen = params.mSpecDecodingTargetMaxGenLen;
 
         options.mIsTrtllmLayout = true;
     }
@@ -1020,17 +1022,43 @@ class TllmGenFmhaKernel
         // loop. And the number of loops are not the same in different tasks.
         sstream << "\"checksTaskSchedules\": false,\n";
 
+        bool hasCompileDefs = false;
+        auto writeCompileDef = [&](char const* compileDef)
+        {
+            if (!hasCompileDefs)
+            {
+                sstream << "\"compileDefs\": [";
+                hasCompileDefs = true;
+            }
+            else
+            {
+                sstream << ", ";
+            }
+            sstream << "\"" << compileDef << "\"";
+        };
+
         if (options.mIsExportingCubin)
         {
-            sstream << "\"compileDefs\": [\"-DTLLM_EXPORT_CUBIN\"],\n";
+            writeCompileDef("-DTLLM_EXPORT_CUBIN");
         }
 
         // Set compile flags for E2M1 KV kernel benchmark.
         // NOTE(tizheng): This is to be removed after compiler fixes PTX exposure of QMUL4. See Fp4Utils.h for details.
         if (options.mChecksResults == 0 && options.mDtypeKv == tg::Dtype::E2m1)
         {
             TLLM_LOG_INFO("Forcing -DTLLM_BENCHMARK_E2M1_KV_CACHE for E2m1 Kv. The results are not correct.");
-            sstream << "\"compileDefs\": [\"-DTLLM_BENCHMARK_E2M1_KV_CACHE\"],\n";
+            writeCompileDef("-DTLLM_BENCHMARK_E2M1_KV_CACHE");
+        }
+
+        // SwapsMmaAb NVRTC kernels already emit __launch_bounds__; avoid a CUDA 13 .reqntid/.maxntid conflict.
+        if (shouldUseNvrtc(options) && options.mFmhaKernelType == FmhaKernelType::SwapsMmaAbForGeneration)
+        {
+            writeCompileDef("-DTLLM_DISABLE_BLOCK_SIZE");
+        }
+
+        if (hasCompileDefs)
+        {
+            sstream << "],\n";
         }
 
         // Enable programmatic dependent launch.
 
@@ -342,6 +342,7 @@ struct TllmGenFmhaRunnerParams
     // When seqlensQPtr[i] < mPackedMaskMaxSeqLenQ, the packed mask tensor has
     // row stride ceilDiv(mPackedMaskMaxSeqLenQ, 32) rather than ceilDiv(seqLenQ, 32).
     int32_t mPackedMaskMaxSeqLenQ = 0;
+    int32_t mSpecDecodingTargetMaxGenLen = 0;
 
     // set the attention mask type
     TllmGenFmhaRunnerParams& setAttentionMaskType(std::int8_t maskType)