diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index 28d5cff5c3d8..3dba1084a8cf 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -529,7 +529,8 @@ def __init__( force_dynamic_quantization=config.force_dynamic_quantization, disable_deep_gemm=disable_deep_gemm, use_custom_cublas_mm=use_custom_cublas_mm, - use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm) + use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm, + use_cute_dsl_bf16_gemm=self.use_cute_dsl_bf16_gemm) self.quant_config = config.get_quant_config() self.attn_backend = config.attn_backend @@ -1462,7 +1463,8 @@ def __init__( reduce_output=reduce_output, allreduce_strategy=config.allreduce_strategy, force_dynamic_quantization=config.force_dynamic_quantization, - use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm) + use_cute_dsl_blockscaling_mm=self.use_cute_dsl_blockscaling_mm, + use_cute_dsl_bf16_gemm=self.use_cute_dsl_bf16_gemm) def yarn_get_mscale(scale=1, mscale=1): if scale <= 1: diff --git a/tensorrt_llm/_torch/modules/gated_mlp.py b/tensorrt_llm/_torch/modules/gated_mlp.py index 7e2fdcaeca20..4870db50b44c 100644 --- a/tensorrt_llm/_torch/modules/gated_mlp.py +++ b/tensorrt_llm/_torch/modules/gated_mlp.py @@ -44,6 +44,8 @@ def __init__( self.use_cute_dsl_blockscaling_mm = use_cute_dsl_blockscaling_mm config = config or ModelConfig() + use_cute_dsl_bf16_gemm = getattr(config, "use_cute_dsl_bf16_gemm", + False) self.mapping = config.mapping if overridden_tp_size is not None: assert config.mapping.tp_size % overridden_tp_size == 0 @@ -84,6 +86,7 @@ def __init__( allreduce_strategy=config.allreduce_strategy, force_dynamic_quantization=config.force_dynamic_quantization, use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm, + use_cute_dsl_bf16_gemm=use_cute_dsl_bf16_gemm, disable_deep_gemm=disable_deep_gemm, fused_weight_shard_indices_mapping=gateup_shard_indices_mapping, use_custom_cublas_mm=use_custom_cublas_mm, @@ -114,6 +117,7 @@ def __init__( allreduce_strategy=config.allreduce_strategy, force_dynamic_quantization=config.force_dynamic_quantization, use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm, + use_cute_dsl_bf16_gemm=use_cute_dsl_bf16_gemm, disable_deep_gemm=disable_deep_gemm, use_custom_cublas_mm=use_custom_cublas_mm, ) diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 609e11c5a27a..4f0a624d13f1 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -46,7 +46,8 @@ from tensorrt_llm.lora_helper import (LoraConfig, get_default_trtllm_modules_to_hf_modules) -from .._utils import _str_to_torch_dtype_dict, mpi_rank, prefer_pinned +from .._utils import (_str_to_torch_dtype_dict, is_sm_100f, mpi_rank, + prefer_pinned) # yapf: disable # isort: off @@ -5077,6 +5078,13 @@ def validate_ray_placement_config(self) -> 'TorchLlmArgs': @model_validator(mode='after') def validate_cute_dsl_bf16(self) -> 'TorchLlmArgs': + if (not (self.use_cute_dsl_bf16_bmm and self.use_cute_dsl_bf16_gemm) + and self.pipeline_parallel_size > 1 and is_sm_100f()): + logger.info("Automatically enabling CuTe DSL BF16 BMM and GEMM for " + "SM100/SM103 PP.") + self.use_cute_dsl_bf16_bmm = True + self.use_cute_dsl_bf16_gemm = True + if self.use_cute_dsl_bf16_bmm or self.use_cute_dsl_bf16_gemm: major, minor = torch.cuda.get_device_capability() sm = major * 10 + minor diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 68102bdf0380..f81f8ebe6652 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -15,25 +15,15 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_piecewi accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6084720) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False-v2_kv_cache=True] SKIP (https://nvbugs/6095851) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6278337) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6278337) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/6278337) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/6224637) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/6224637) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/6198785) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2-moe_backend=WIDEEP] SKIP (https://nvbugs/6313993) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_cute_dsl_bf16_gemm_4gpus[tp4-cuda_graph=False] SKIP (https://nvbugs/6224636) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6224637) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] SKIP (https://nvbugs/6224637) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[pp4-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-sampler_async_worker=False] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] SKIP (https://nvbugs/5955773) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/5945081) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6224637) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6278403) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=True] SKIP (https://nvbugs/6272673) -accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6224637) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-low_precision_combine=False-torch_compile=False] SKIP (https://nvbugs/6245394) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_dflash SKIP (https://nvbugs/6156233) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[v2_kv_cache-trtllm-one_model-overlap_scheduler] SKIP (https://nvbugs/6341371)