Merge branch 'main' into initial-stats-sweep

BenjaminBraunDev · web-flow · commit 48f80b8df4e1 · 2026-05-26T15:51:01.000-07:00
diff --git a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh
@@ -236,7 +236,6 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output,
         if (!weight_warp)
         {
             cudaGridDependencySynchronize();
-            cudaTriggerProgrammaticLaunchCompletion();
         }
 
         for (int ki = 0; ki < K_LOOPS_DMA; ki++)
@@ -422,6 +421,11 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output,
 
         __syncthreads();
 
+        if (threadIdx.x == 0) // one thread per block suffices according to official code examples
+        {
+            cudaTriggerProgrammaticLaunchCompletion();
+        }
+
         if (warp_id == 0)
         {
 
diff --git a/tensorrt_llm/_torch/models/modeling_laguna.py b/tensorrt_llm/_torch/models/modeling_laguna.py
@@ -21,6 +21,7 @@
 from torch import nn
 from transformers import PretrainedConfig
 
+from tensorrt_llm._utils import get_sm_version
 from tensorrt_llm.functional import PositionEmbeddingType, RotaryScalingType
 
 from ..attention_backend import AttentionMetadata
@@ -247,6 +248,12 @@ def __init__(
         self._use_gating = bool(gating)
         self._gate_per_head = gating == "per-head" or gating is True
 
+        # Temporary workaround: on Hopper, Laguna fails unless RoPE is unfused,
+        # while on Blackwell it has issues when RoPE is unfused.
+        # This check unblocks Blackwell on main until proper fixes are found:
+        # https://nvbugs/6211185
+        rope_fusion = get_sm_version() in (100, 103)
+
         # fuse_qk_norm_rope=False is required: the fused kernel reads
         # partial_rotary_factor and yarn params from pretrained_config
         # globally, ignoring per-layer RopeParams. Laguna has different
@@ -259,7 +266,7 @@ def __init__(
             bias=getattr(config, "qkv_bias", False) or getattr(config, "attention_bias", False),
             pos_embd_params=pos_embd_params,
             fuse_qk_norm_rope=False,
-            rope_fusion=False,
+            rope_fusion=rope_fusion,
             layer_idx=layer_idx,
             dtype=config.torch_dtype,
             dense_bias=False,
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -21,6 +21,7 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[v2_kv_cache=False-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[v2_kv_cache=False-attn_backend=TRTLLM-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[v2_kv_cache=True-attn_backend=TRTLLM-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_nvfp4
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=False]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -97,9 +97,6 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype SKIP (https://nvbugs/6209806)
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4_longseq_trtllm_moe_async_cancel SKIP (https://nvbugs/6160085)
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4_longseq_trtllm_moe_stress SKIP (https://nvbugs/6160085)
-accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_bf16 SKIP (https://nvbugs/6211185)
-accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_fp8 SKIP (https://nvbugs/6211185)
-accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_nvfp4 SKIP (https://nvbugs/6211185)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
@@ -346,7 +343,6 @@ test_e2e.py::test_multi_nodes_eval[Qwen3/Qwen3-235B-A22B-tp16-mmlu] SKIP (https:
 test_e2e.py::test_multi_nodes_eval[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-tp16-mmlu] SKIP (https://nvbugs/6114608)
 test_e2e.py::test_multi_nodes_eval[nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-tp16-mmlu] SKIP (https://nvbugs/6114608)
 test_e2e.py::test_openai_chat_example[trt] SKIP (https://nvbugs/5477444)
-test_e2e.py::test_openai_chat_guided_decoding[openai/gpt-oss-120b] SKIP (https://nvbugs/6168859)
 test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450)
 test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp1pp2-gen_tp1pp2] SKIP (https://nvbugs/6190759)
 test_e2e.py::test_openai_disagg_multi_nodes_completion_service_discovery[http] SKIP (https://nvbugs/6115562)
@@ -398,7 +394,6 @@ unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_t
 unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460)
 unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460)
 unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460)
-unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_cp_tp_broadcast_object[tp_cp_broadcast-list] SKIP (https://nvbugs/6132301)
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingDSv3-swiglu-1024-1024-1] SKIP (https://nvbugs/5908070)
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_qwen_next-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070)
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_topk_4-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070)

Original file line number	Diff line number	Diff line change
`@@ -236,7 +236,6 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output,`
`236`	`236`	`if (!weight_warp)`
`237`	`237`	`{`
`238`	`238`	`cudaGridDependencySynchronize();`
`239`		`- cudaTriggerProgrammaticLaunchCompletion();`
`240`	`239`	`}`
`241`	`240`
`242`	`241`	`for (int ki = 0; ki < K_LOOPS_DMA; ki++)`
`@@ -422,6 +421,11 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output,`
`422`	`421`
`423`	`422`	`__syncthreads();`
`424`	`423`
	`424`	`+ if (threadIdx.x == 0) // one thread per block suffices according to official code examples`
	`425`	`+ {`
	`426`	`+ cudaTriggerProgrammaticLaunchCompletion();`
	`427`	`+ }`
	`428`	`+`
`425`	`429`	`if (warp_id == 0)`
`426`	`430`	`{`
`427`	`431`