NVIDIA
diff --git a/bionemo-recipes/recipes/mixtral_native_te/fused_a2a.py b/bionemo-recipes/recipes/mixtral_native_te/fused_a2a.py
Lines changed: 306 additions & 0 deletions
@@ -0,0 +1,306 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-Apache2
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# --- BEGIN COPIED FILE NOTICE ---
+# This file is copied from: bionemo-recipes/models/mixtral/fused_a2a.py
+# Do not modify this file directly. Instead, modify the source and run:
+#     python ci/scripts/check_copied_files.py --fix
+# --- END COPIED FILE NOTICE ---
+
+# Portions of this code are from DeepSeek DeepEP project
+# Copyright (c) 2025 DeepSeek
+# Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE
+
+import os
+
+
+try:  # deep_ep is optional; HAVE_DEEP_EP records whether the import succeeded
+    from deep_ep import Buffer
+    from deep_ep.utils import EventHandle, EventOverlap
+
+    HAVE_DEEP_EP = True
+    Buffer.set_num_sms(int(os.environ.get("DEEP_EP_SM_NUMS", "20")))  # SM count from DEEP_EP_SM_NUMS, default 20
+except ImportError:
+    HAVE_DEEP_EP = False  # fused_dispatch / fused_combine are set to None at the end of the module in this case
+
+import torch
+
+
+_buffer = None  # Buffer instance cached and reused by get_buffer(); None until the first call
+_nvshmem_available = None  # memoized result of _is_nvshmem_available(); None means "not probed yet"
+
+
+def _is_nvshmem_available() -> bool:
+    """Report whether the installed DeepEP build supports NVSHMEM.
+
+    The probe asks a dispatch config for an RDMA buffer size hint and treats a
+    ``RuntimeError`` as "NVSHMEM disabled". ``is_sm90_compiled()`` is not used
+    because SM90 can be compiled while NVSHMEM is still turned off.
+
+    The answer is computed once and memoized in the module-level
+    ``_nvshmem_available`` variable.
+
+    Returns:
+        bool: True if the probe succeeded, False if it raised ``RuntimeError``.
+    """
+    global _nvshmem_available  # noqa: PLW0603
+    if _nvshmem_available is not None:
+        # Already probed during an earlier call.
+        return _nvshmem_available
+
+    try:
+        probe_config = Buffer.get_dispatch_config(2)
+        probe_config.get_rdma_buffer_size_hint(256, 2)
+    except RuntimeError:
+        _nvshmem_available = False
+    else:
+        _nvshmem_available = True
+    return _nvshmem_available
+
+
+def get_hidden_bytes(x: torch.Tensor) -> int:
+    """Return the per-row byte count used to size the communication buffer.
+
+    The result is the size of dimension 1 of ``x`` multiplied by the element
+    size in bytes, where element sizes below 2 are counted as 2.
+
+    Args:
+        x (torch.Tensor): Input tensor; must have at least two dimensions.
+
+    Returns:
+        int: Number of hidden bytes
+    """
+    hidden_size = x.size(1)
+    # Elements narrower than two bytes are still budgeted as two bytes each.
+    bytes_per_element = max(x.element_size(), 2)
+    return hidden_size * bytes_per_element
+
+
+def get_buffer(group: torch.distributed.ProcessGroup, hidden_bytes: int):
+    """Return the shared all-to-all buffer, (re)creating it when needed.
+
+    The required NVL size is the larger of the hints reported by the dispatch
+    and combine configs for ``group.size()`` ranks. The RDMA size is computed
+    the same way, but only when ``_is_nvshmem_available()`` is true; otherwise
+    it stays 0. The module-level ``_buffer`` is reused when it is bound to the
+    same group and is at least as large as required in both sizes.
+
+    Args:
+        group (torch.distributed.ProcessGroup): Process group for communication
+        hidden_bytes (int): Number of hidden bytes needed
+
+    Returns:
+        Buffer: Communication buffer
+    """
+    global _buffer  # noqa: PLW0603
+    rdma_enabled = _is_nvshmem_available()
+    world_size = group.size()
+    configs = (
+        Buffer.get_dispatch_config(world_size),
+        Buffer.get_combine_config(world_size),
+    )
+
+    min_nvl_bytes = 0
+    min_rdma_bytes = 0
+    for cfg in configs:
+        min_nvl_bytes = max(min_nvl_bytes, cfg.get_nvl_buffer_size_hint(hidden_bytes, world_size))
+        if rdma_enabled:
+            min_rdma_bytes = max(min_rdma_bytes, cfg.get_rdma_buffer_size_hint(hidden_bytes, world_size))
+
+    # Allocate buffer if not existed or not enough buffer
+    # NOTES: the adaptive routing configuration of the network **must be off**
+    needs_new_buffer = (
+        _buffer is None
+        or _buffer.group != group
+        or _buffer.num_nvl_bytes < min_nvl_bytes
+        or _buffer.num_rdma_bytes < min_rdma_bytes
+    )
+    if needs_new_buffer:
+        _buffer = Buffer(group, min_nvl_bytes, min_rdma_bytes)
+    return _buffer
+
+
+class FusedDispatch(torch.autograd.Function):
+    """Autograd function that dispatches MoE tokens through the shared DeepEP buffer.
+
+    Forward calls ``buffer.dispatch``; backward sends the gradients back with
+    ``buffer.combine`` using the handle stored during forward.
+    """
+
+    @staticmethod
+    def forward(
+        ctx,
+        x,
+        token_indices,
+        token_probs,
+        num_experts,
+        group,
+        async_finish=False,
+        allocate_on_comm_stream=False,
+    ):
+        """Compute the dispatch layout and dispatch ``x`` across ``group``.
+
+        Args:
+            ctx: Autograd context; stores the group, handle and both flags for backward.
+            x: Tokens to dispatch; dimension 1 is used to size the buffer.
+            token_indices: Routing indices, passed to the buffer as ``topk_idx``.
+            token_probs: Routing probabilities, passed to the buffer as ``topk_weights``.
+            num_experts: Number of experts, used for the layout computation.
+            group: Process group the communication buffer is bound to.
+            async_finish: Forwarded to the buffer calls; when true, the calls are
+                chained through events and the current stream waits on the final one.
+            allocate_on_comm_stream: Forwarded unchanged to the buffer calls.
+
+        Returns:
+            tuple: ``(recv_x, recv_token_indices, recv_token_probs, tokens_per_expert, handle)``
+            where ``tokens_per_expert`` is a tensor built from the per-expert
+            receive counts returned by the dispatch call.
+        """
+        start_event = EventOverlap(EventHandle()) if async_finish else None
+        comm_buffer = get_buffer(group, get_hidden_bytes(x))
+
+        # Calculate layout before actual dispatch
+        layout = comm_buffer.get_dispatch_layout(
+            token_indices,
+            num_experts,
+            previous_event=start_event,
+            async_finish=async_finish,
+            allocate_on_comm_stream=allocate_on_comm_stream,
+        )
+        per_rank, per_rdma_rank, per_expert, in_rank_mask, layout_event = layout
+
+        # Do MoE dispatch
+        # NOTES: the CPU will wait for GPU's signal to arrive,
+        # so this is not compatible with CUDA graph
+        dispatched = comm_buffer.dispatch(
+            x,
+            topk_idx=token_indices,
+            topk_weights=token_probs,  # DeepEP only supports float32 probs
+            num_tokens_per_rank=per_rank,
+            num_tokens_per_rdma_rank=per_rdma_rank,
+            is_token_in_rank=in_rank_mask,
+            num_tokens_per_expert=per_expert,
+            previous_event=layout_event,  # wait in deepep::intra/inter_dispatch
+            async_finish=async_finish,
+            allocate_on_comm_stream=allocate_on_comm_stream,
+        )
+        recv_x, recv_token_indices, recv_token_probs, recv_counts, handle, done_event = dispatched
+
+        # Make sure current stream is synchronized
+        if async_finish:
+            done_event.current_stream_wait()
+
+        # Save for backward
+        ctx.group = group
+        ctx.handle = handle
+        ctx.async_finish = async_finish
+        ctx.allocate_on_comm_stream = allocate_on_comm_stream
+
+        tokens_per_expert = torch.tensor(recv_counts)
+        return (recv_x, recv_token_indices, recv_token_probs, tokens_per_expert, handle)
+
+    @staticmethod
+    def backward(
+        ctx,
+        grad_output,
+        grad_token_indices,
+        grad_token_probs,
+        grad_tokens_per_expert,
+        grad_handle,
+    ):
+        """Combine the gradients of the dispatched tokens back to their source ranks.
+
+        Only ``grad_output`` and ``grad_token_probs`` are used; the remaining
+        incoming gradients are ignored.
+
+        Returns:
+            tuple: Seven entries matching the forward inputs. Only ``x`` and
+            ``token_probs`` receive gradients; every other slot is ``None``.
+        """
+        comm_buffer = get_buffer(ctx.group, get_hidden_bytes(grad_output))
+        saved_handle = ctx.handle
+        start_event = EventOverlap(EventHandle()) if ctx.async_finish else None
+
+        combined = comm_buffer.combine(
+            grad_output.contiguous(),
+            saved_handle,
+            topk_weights=grad_token_probs.float(),
+            previous_event=start_event,
+            async_finish=ctx.async_finish,
+            allocate_on_comm_stream=ctx.allocate_on_comm_stream,
+        )
+        grad_x, grad_probs, done_event = combined
+
+        # Make sure current stream is synchronized
+        if ctx.async_finish:
+            done_event.current_stream_wait()
+        return grad_x, None, grad_probs, None, None, None, None
+
+
+class FusedCombine(torch.autograd.Function):
+    """Autograd function that combines MoE expert outputs through the shared DeepEP buffer.
+
+    Forward calls ``buffer.combine``; backward sends the gradient back with
+    ``buffer.dispatch`` using the same handle.
+    """
+
+    @staticmethod
+    def forward(ctx, x, group, handle, async_finish=False, allocate_on_comm_stream=False):
+        """Combine ``x`` across ``group`` using a handle from a previous dispatch.
+
+        Args:
+            ctx: Autograd context; stores the handle, group and both flags for backward.
+            x: Tensor to combine; dimension 1 is used to size the buffer.
+            group: Process group the communication buffer is bound to.
+            handle: Communication handle passed to ``buffer.combine``.
+            async_finish: Forwarded to the buffer call; when true, the current
+                stream waits on the event returned by the call.
+            allocate_on_comm_stream: Forwarded unchanged to the buffer call.
+
+        Returns:
+            tuple: ``(combined_x, None)``.
+        """
+        start_event = EventOverlap(EventHandle()) if async_finish else None
+        comm_buffer = get_buffer(group, get_hidden_bytes(x))
+
+        combined = comm_buffer.combine(
+            x,
+            handle=handle,
+            async_finish=async_finish,
+            previous_event=start_event,
+            allocate_on_comm_stream=allocate_on_comm_stream,
+        )
+        combined_x, _, done_event = combined
+
+        # Make sure current stream is synchronized
+        if async_finish:
+            done_event.current_stream_wait()
+
+        ctx.handle = handle
+        ctx.group = group
+        ctx.async_finish = async_finish
+        ctx.allocate_on_comm_stream = allocate_on_comm_stream
+        return combined_x, None
+
+    @staticmethod
+    def backward(ctx, grad_output, previous_event=None):
+        """Dispatch the gradient of the combined output back along the saved handle.
+
+        Args:
+            ctx: Autograd context populated by ``forward``.
+            grad_output: Gradient with respect to ``combined_x``.
+            previous_event: Ignored; the incoming value is overwritten before use.
+
+        Returns:
+            tuple: Five entries matching the forward inputs. Only ``x`` receives
+            a gradient; every other slot is ``None``.
+        """
+        # The incoming argument is discarded: a fresh event is made only in async mode.
+        previous_event = EventOverlap(EventHandle()) if ctx.async_finish else None
+        comm_buffer = get_buffer(ctx.group, get_hidden_bytes(grad_output))
+
+        dispatched = comm_buffer.dispatch(
+            grad_output.contiguous(),
+            handle=ctx.handle,
+            previous_event=previous_event,
+            async_finish=ctx.async_finish,
+            allocate_on_comm_stream=ctx.allocate_on_comm_stream,
+        )
+        grad_x, _, _, _, _, done_event = dispatched
+
+        # Make sure current stream is synchronized
+        if ctx.async_finish:
+            done_event.current_stream_wait()
+        return grad_x, None, None, None, None
+
+
+if not HAVE_DEEP_EP:
+    # Without deep_ep the public entry points are exposed as None.
+    fused_dispatch = None
+    fused_combine = None
+
+else:
+
+    def fused_dispatch(
+        x,
+        token_indices,
+        token_probs,
+        num_experts,
+        group,
+        async_finish=False,
+        allocate_on_comm_stream=False,
+    ):
+        """Dispatch tokens to experts via ``FusedDispatch`` (defined only when deep_ep imports).
+
+        ``x`` is made contiguous before being handed to the autograd function;
+        all other arguments are passed through unchanged.
+
+        Args:
+            x: Input tensor [num_tokens, hidden_size]
+            token_indices: Token routing indices [num_tokens, topk]
+            token_probs: Token routing probabilities [num_tokens, topk]
+            num_experts: Number of experts
+            group: Process group
+            async_finish: Whether to finish asynchronously
+            allocate_on_comm_stream: Whether to allocate on communication stream
+
+        Returns:
+            Result of FusedDispatch
+        """
+        contiguous_x = x.contiguous()
+        return FusedDispatch.apply(
+            contiguous_x,
+            token_indices,
+            token_probs,
+            num_experts,
+            group,
+            async_finish,
+            allocate_on_comm_stream,
+        )
+
+    def fused_combine(x, group, handle, async_finish=False, allocate_on_comm_stream=False):
+        """Combine expert outputs via ``FusedCombine`` (defined only when deep_ep imports).
+
+        Args:
+            x: Input tensor
+            group: Process group
+            handle: Communication handle
+            async_finish: Whether to finish asynchronously
+            allocate_on_comm_stream: Whether to allocate on communication stream
+
+        Returns:
+            Result of FusedCombine
+        """
+        return FusedCombine.apply(
+            x,
+            group,
+            handle,
+            async_finish,
+            allocate_on_comm_stream,
+        )