Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ tensorrt~=10.15.1
torch>=2.10.0,<=2.11.0a0
torchvision
nvidia-modelopt[torch]~=0.37.0
# NcclEP uses nccl4py's nccl.ep package without changing the NCCL wheel constraint.
nccl4py>=0.3

@bobboli bobboli Jun 26, 2026

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's confirm the expected way of pinning the NCCL version with @EmmaQiaoCh. This adds nccl4py, but the same requirements file still pins nvidia-nccl-cu13<=2.29.2, while is_nccl_ep_installed() requires the loaded libnccl.so to be >=2.30.4. @EmmaQiaoCh, are we pinning the NCCL version in the container image, or via the pip package nvidia-nccl-cu13?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that is a known limitation today.
That is why this PR is marked as a draft.
PR #15087 will update NCCL to 2.30.4 (via the pip requirements.txt).

I will schedule a meeting soon about the inclusion of nccl4py

# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-26-02.html#rel-26-02 uses 2.29.2
# torch 2.10.0+cu130 depends on nvidia-nccl-cu13==2.28.9
nvidia-nccl-cu13>=2.28.9,<=2.29.2
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from .communication_factory import CommunicationFactory
from .deep_ep import DeepEP
from .deep_ep_low_latency import DeepEPLowLatency
from .nccl_ep import NcclEP
from .nvlink_one_sided import NVLinkOneSided
from .nvlink_two_sided import NVLinkTwoSided

Expand All @@ -46,6 +47,7 @@
"NVLinkOneSided",
"DeepEP",
"DeepEPLowLatency",
"NcclEP",
# Factory
"CommunicationFactory",
]
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -32,6 +32,7 @@
from .base import Communication
from .deep_ep import DeepEP
from .deep_ep_low_latency import DeepEPLowLatency
from .nccl_ep import NcclEP
from .nvlink_one_sided import NVLinkOneSided
from .nvlink_two_sided import NVLinkTwoSided
from .nvlink_two_sided_flashinfer import NVLinkTwoSidedFlashinfer
Expand Down Expand Up @@ -67,6 +68,7 @@ def create_strategy(
2. Auto-selection (tries in order):
- NVLinkOneSided (highest priority for throughput)
- NVLinkTwoSided (high priority for latency)
- NcclEP (if nccl-ep is available)
- DeepEP (if enabled via TRTLLM_CAN_USE_DEEP_EP)
- DeepEPLowLatency (if enabled via TRTLLM_CAN_USE_DEEP_EP)
- AllGather + ReduceScatter (fallback, always works)
Expand Down Expand Up @@ -129,7 +131,7 @@ def create_strategy(
)

# Auto-selection: Try strategies in priority order using try-catch
# Priority: NVLinkOneSided > NVLinkTwoSided > DeepEP > DeepEPLowLatency > AllGather
# Priority: NVLinkOneSided > NVLinkTwoSided > NcclEP > DeepEP > DeepEPLowLatency > AllGather

try:
enable_eplb = model_config.moe_load_balancer is not None
Expand Down Expand Up @@ -181,6 +183,26 @@ def create_strategy(
except Exception as e:
logger.info(f"NVLinkTwoSided not available: {e}")

# Try NCCL EP (rank-major LL). Falls through to DeepEP/AllGather if
# prerequisites are not met or libnccl_ep.so is not available.
nccl_ep_unavailable_reason = CommunicationFactory._get_nccl_ep_unavailable_reason(act_dtype)
if nccl_ep_unavailable_reason is None:
try:
strategy = NcclEP(
mapping,
num_slots,
hidden_size,
max_num_tokens,
moe_max_num_tokens,
top_k=top_k,
)
logger.info("Selected communication strategy: NcclEP")
return strategy
except RuntimeError as e:
logger.debug(f"NcclEP not available: {e}")
else:
logger.debug(f"NcclEP not available: {nccl_ep_unavailable_reason}")

Comment thread
nv-lschneider marked this conversation as resolved.
# Try DeepEP (if enabled and weight dtype is bfloat16)
if os.environ.get("TRTLLM_CAN_USE_DEEP_EP", "1") == "1" and act_dtype == torch.bfloat16:
try:
Expand Down Expand Up @@ -318,7 +340,29 @@ def _create_forced_method(
use_low_precision_combine,
moe_max_num_tokens,
)
elif method == "NCCL_EP":
nccl_ep_unavailable_reason = CommunicationFactory._get_nccl_ep_unavailable_reason(
act_dtype
)
if nccl_ep_unavailable_reason is not None:
raise ValueError(nccl_ep_unavailable_reason)
return NcclEP(
mapping,
num_slots,
hidden_size,
max_num_tokens,
moe_max_num_tokens,
top_k=top_k,
)
elif method == "ALLGATHER":
return AllGatherReduceScatter(mapping)
else:
raise ValueError(f"Unknown communication method: {method}")

@staticmethod
def _get_nccl_ep_unavailable_reason(
    act_dtype: torch.dtype,
) -> Optional[str]:
    """Explain why NcclEP cannot be used for the given activation dtype.

    Args:
        act_dtype: Activation dtype the MoE communication would operate on.

    Returns:
        ``None`` when ``act_dtype`` is ``torch.bfloat16``; otherwise a
        human-readable message naming the dtype that was received.
    """
    # bfloat16 is the only dtype this check lets through.
    if act_dtype == torch.bfloat16:
        return None
    return f"NcclEP requires act_dtype=torch.bfloat16, got {act_dtype}."
Loading
Loading