Skip to content

Commit 48f80b8

Browse files
Merge branch 'main' into initial-stats-sweep
2 parents fb51f4d + 92e601c commit 48f80b8

4 files changed

Lines changed: 14 additions & 7 deletions

File tree

cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,6 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output,
236236
if (!weight_warp)
237237
{
238238
cudaGridDependencySynchronize();
239-
cudaTriggerProgrammaticLaunchCompletion();
240239
}
241240

242241
for (int ki = 0; ki < K_LOOPS_DMA; ki++)
@@ -422,6 +421,11 @@ __global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output,
422421

423422
__syncthreads();
424423

424+
if (threadIdx.x == 0) // one thread per block suffices according to official code examples
425+
{
426+
cudaTriggerProgrammaticLaunchCompletion();
427+
}
428+
425429
if (warp_id == 0)
426430
{
427431

tensorrt_llm/_torch/models/modeling_laguna.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from torch import nn
2222
from transformers import PretrainedConfig
2323

24+
from tensorrt_llm._utils import get_sm_version
2425
from tensorrt_llm.functional import PositionEmbeddingType, RotaryScalingType
2526

2627
from ..attention_backend import AttentionMetadata
@@ -247,6 +248,12 @@ def __init__(
247248
self._use_gating = bool(gating)
248249
self._gate_per_head = gating == "per-head" or gating is True
249250

251+
# Temporary workaround: on Hopper, Laguna fails unless RoPE is unfused,
252+
# while Blackwell has issues when RoPE is unfused.
253+
# This check unblocks Blackwell on main while we find proper fixes:
254+
# https://nvbugs/6211185
255+
rope_fusion = get_sm_version() in (100, 103)
256+
250257
# fuse_qk_norm_rope=False is required: the fused kernel reads
251258
# partial_rotary_factor and yarn params from pretrained_config
252259
# globally, ignoring per-layer RopeParams. Laguna has different
@@ -259,7 +266,7 @@ def __init__(
259266
bias=getattr(config, "qkv_bias", False) or getattr(config, "attention_bias", False),
260267
pos_embd_params=pos_embd_params,
261268
fuse_qk_norm_rope=False,
262-
rope_fusion=False,
269+
rope_fusion=rope_fusion,
263270
layer_idx=layer_idx,
264271
dtype=config.torch_dtype,
265272
dense_bias=False,

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ l0_b200:
2121
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[v2_kv_cache=False-attn_backend=TRTLLM-torch_compile=False]
2222
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[v2_kv_cache=False-attn_backend=TRTLLM-torch_compile=True]
2323
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[v2_kv_cache=True-attn_backend=TRTLLM-torch_compile=True]
24+
- accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_nvfp4
2425
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=False]
2526
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True]
2627
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=False]

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,6 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
9797
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype SKIP (https://nvbugs/6209806)
9898
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4_longseq_trtllm_moe_async_cancel SKIP (https://nvbugs/6160085)
9999
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_nvfp4_longseq_trtllm_moe_stress SKIP (https://nvbugs/6160085)
100-
accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_bf16 SKIP (https://nvbugs/6211185)
101-
accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_fp8 SKIP (https://nvbugs/6211185)
102-
accuracy/test_llm_api_pytorch.py::TestLagunaXS::test_nvfp4 SKIP (https://nvbugs/6211185)
103100
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
104101
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=False-enable_padding=False-disable_overlap_scheduler=True-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
105102
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_auto_dtype_beam_search[enable_cuda_graph=True-enable_padding=False-disable_overlap_scheduler=False-sampler_async_worker=False] SKIP (https://nvbugs/6141653)
@@ -346,7 +343,6 @@ test_e2e.py::test_multi_nodes_eval[Qwen3/Qwen3-235B-A22B-tp16-mmlu] SKIP (https:
346343
test_e2e.py::test_multi_nodes_eval[Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf-tp16-mmlu] SKIP (https://nvbugs/6114608)
347344
test_e2e.py::test_multi_nodes_eval[nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-tp16-mmlu] SKIP (https://nvbugs/6114608)
348345
test_e2e.py::test_openai_chat_example[trt] SKIP (https://nvbugs/5477444)
349-
test_e2e.py::test_openai_chat_guided_decoding[openai/gpt-oss-120b] SKIP (https://nvbugs/6168859)
350346
test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450)
351347
test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp1pp2-gen_tp1pp2] SKIP (https://nvbugs/6190759)
352348
test_e2e.py::test_openai_disagg_multi_nodes_completion_service_discovery[http] SKIP (https://nvbugs/6115562)
@@ -398,7 +394,6 @@ unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_t
398394
unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens16-_hidden512] SKIP (https://nvbugs/5940460)
399395
unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden32] SKIP (https://nvbugs/5940460)
400396
unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_tokens256-_hidden512] SKIP (https://nvbugs/5940460)
401-
unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_cp_tp_broadcast_object[tp_cp_broadcast-list] SKIP (https://nvbugs/6132301)
402397
unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingDSv3-swiglu-1024-1024-1] SKIP (https://nvbugs/5908070)
403398
unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_qwen_next-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070)
404399
unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_topk_4-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070)

0 commit comments

Comments
 (0)