
Commit 784cc26

Fix Evoformer's multi-arch dispatch root cause (#7881)
Fixes #7863
Replaces #7872
@Flamefire

Issue #7863 reports order-dependent failures in Evoformer when building for mixed CUDA architectures. The guard-only approach prevents some bad outputs but does not solve multi-generation packaging requirements. This PR takes the root-cause direction: produce a correct multi-arch binary that can run on pre-Ampere and Ampere+ and select the right kernel family at runtime.

With TORCH_CUDA_ARCH_LIST='7.0;8.0':

1. The build is no longer pinned by -DGPU_ARCH; it uses runtime arch dispatch (evoformer_attn.py:33, gemm_kernel_utils.h:53).
2. The runtime chooses the implementation by device compute capability (CC):
   - CC >= 80 -> Sm80 (Ampere+ path)
   - CC >= 75 -> Sm75
   - CC >= 70 -> Sm70
3. As a result, pre-Ampere GPUs use pre-Ampere kernels, and Ampere+ GPUs use the Ampere-family kernel path.

---------

Signed-off-by: Masahiro Tanaka <mtanaka@anyscale.com>
Co-authored-by: Olatunji Ruwase <tunji.ruwase@snowflake.com>
1 parent f88d0f8 commit 784cc26

4 files changed

Lines changed: 116 additions & 56 deletions

csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h

Lines changed: 17 additions & 34 deletions
```diff
@@ -42,45 +42,28 @@
 template <typename arch, typename scalar_t>
 struct CheckArch {
     static constexpr bool isPreVolta = arch::kMinComputeCapability < 70;
-    static constexpr bool isPreAmpere =
-        arch::kMinComputeCapability < 80 && arch::kMinComputeCapability >= 70;
+    // DISPATCH_ARCHTAG only binds Sm70/Sm75/Sm80+, so overlap with isPreVolta is unreachable.
+    static constexpr bool isPreAmpere = arch::kMinComputeCapability < 80;
     static constexpr bool isAmpere = arch::kMinComputeCapability >= 80;
-#if defined(__CUDA_ARCH__)
-    static constexpr bool compiler_cc = arch::kMinComputeCapability * 10 <= __CUDA_ARCH__;
-#else
-    static constexpr bool compiler_cc = true;
-#endif
     static constexpr bool value = (isPreVolta && std::is_same_v<scalar_t, float>) ||
                                   (isPreAmpere && !std::is_same_v<scalar_t, cutlass::bfloat16_t>) ||
-                                  isAmpere && compiler_cc;
+                                  isAmpere;
 };
 
-#define DISPATCH_ARCHTAG(CC, func)                                                      \
-    {                                                                                   \
-        if constexpr (GPU_ARCH >= 80) {                                                 \
-            if (CC >= 80) {                                                             \
-                using ArchTag = cutlass::arch::Sm80;                                    \
-                func;                                                                   \
-            } else {                                                                    \
-                EVOFORMER_CHECK(false, "Compile flag error. Unexpected GPU");           \
-            }                                                                           \
-        } else if constexpr (GPU_ARCH >= 75) {                                          \
-            if (CC >= 75) {                                                             \
-                using ArchTag = cutlass::arch::Sm75;                                    \
-                func;                                                                   \
-            } else {                                                                    \
-                EVOFORMER_CHECK(false, "Compile flag error. Unexpected GPU");           \
-            }                                                                           \
-        } else if constexpr (GPU_ARCH >= 70) {                                          \
-            if (CC >= 70) {                                                             \
-                using ArchTag = cutlass::arch::Sm70;                                    \
-                func;                                                                   \
-            } else {                                                                    \
-                EVOFORMER_CHECK(false, "Compile flag error. Unexpected GPU");           \
-            }                                                                           \
-        } else {                                                                        \
-            EVOFORMER_CHECK(false, "Only GPUs with Tensor Core are supported for now"); \
-        }                                                                               \
+#define DISPATCH_ARCHTAG(CC, func)                                                         \
+    {                                                                                      \
+        if ((CC) >= 80) {                                                                  \
+            using ArchTag = cutlass::arch::Sm80;                                           \
+            func;                                                                          \
+        } else if ((CC) >= 75) {                                                           \
+            using ArchTag = cutlass::arch::Sm75;                                           \
+            func;                                                                          \
+        } else if ((CC) >= 70) {                                                           \
+            using ArchTag = cutlass::arch::Sm70;                                           \
+            func;                                                                          \
+        } else {                                                                           \
+            EVOFORMER_CHECK(false, "Only GPUs with Tensor Core (SM >= 70) are supported"); \
+        }                                                                                  \
     }
 
 #define DISPATCH_TYPES(tensor, func) \
```
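The macro's arch selection is now purely runtime: thresholds are checked highest-first against the device's compute capability, with no compile-time `GPU_ARCH` pin. A minimal Python sketch of the same branch logic (hypothetical helper name; the real macro binds a `cutlass::arch` tag and invokes `func` in C++):

```python
def select_arch_tag(cc: int) -> str:
    """Map a device compute capability (e.g. 80 for sm_80) to a kernel family,
    mirroring the runtime branches of DISPATCH_ARCHTAG."""
    if cc >= 80:
        return "Sm80"  # Ampere and newer reuse the Ampere-family kernels
    if cc >= 75:
        return "Sm75"  # Turing
    if cc >= 70:
        return "Sm70"  # Volta
    # Matches the EVOFORMER_CHECK failure branch for pre-Tensor-Core GPUs
    raise RuntimeError("Only GPUs with Tensor Core (SM >= 70) are supported")
```

Because the thresholds are checked highest-first, an SM 8.6 or 9.0 device falls into the `Sm80` branch rather than failing, which is what makes a single binary serviceable across generations.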

docs/_tutorials/ds4sci_evoformerattention.md

Lines changed: 36 additions & 3 deletions
````diff
@@ -26,9 +26,42 @@ export CUTLASS_PATH=/path/to/cutlass
 ```
 The kernels will be compiled when `DS4Sci_EvoformerAttention` is called for the first time.
 
-`DS4Sci_EvoformerAttention` requires GPUs with compute capability 7.0 or higher (NVIDIA V100 or later GPUs) and the minimal CUDA version is 11.3. It is recommended to use CUDA 11.7 or later for better performance. Besides, the performance of backward kernel on V100 kernel is not as good as that on A100 for now.
-The extension checks both requirements and fails if any is not met. To disable the check, for example for cross-compiling in a system without GPUs, you can set the environment variable ```DS_IGNORE_CUDA_DETECTION=TRUE```
-and the environment value ```DS_EVOFORMER_GPU_ARCH={70|75|80}```, which controls the target GPU (80 being the last supported and meaning NVIDIA Ampere and later).
+`DS4Sci_EvoformerAttention` requires GPUs with compute capability 7.0 or higher
+(NVIDIA V100 or later GPUs) and the minimal CUDA version is 11.3. It is
+recommended to use CUDA 11.7 or later for better performance. Besides, the
+performance of backward kernel on V100 is not as good as on A100 for now.
+
+The extension checks both requirements and fails if any is not met. To disable
+the check (for example cross-compiling in a system without GPUs), set
+`DS_IGNORE_CUDA_DETECTION=TRUE`.
+
+### Multi-Arch Build Behavior
+
+Evoformer now supports mixed-architecture packaging directly via
+`TORCH_CUDA_ARCH_LIST`.
+
+Example:
+
+```shell
+CUTLASS_PATH=/path/to/cutlass \
+TORCH_CUDA_ARCH_LIST='7.0;8.0' \
+DS_BUILD_OPS=0 DS_BUILD_EVOFORMER_ATTN=1 \
+pip install -e .
+```
+
+- `TORCH_CUDA_ARCH_LIST` controls generated CUDA slices (order-independent).
+- Targets below `sm_70` are pruned for Evoformer because Tensor Cores are
+  required.
+- `DS_EVOFORMER_GPU_ARCH` is **deprecated** and ignored for Evoformer builds.
+  Use `TORCH_CUDA_ARCH_LIST` instead.
+
+Supported dtype matrix by architecture family:
+
+| Arch family | fp16 | bf16 |
+|-------------|------|------|
+| Sm70 (Volta) | Yes | No |
+| Sm75 (Turing) | Yes | No |
+| Sm80+ (Ampere/Ada/Hopper) | Yes | Yes |
 
 ### 3.2 Unit test and benchmark
 
````
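The dtype matrix added to the docs mirrors the `CheckArch::value` predicate in `gemm_kernel_utils.h`: bf16 requires the Ampere-family path, while fp16 is available on any dispatched (SM >= 70) target. A small sketch of that rule (hypothetical function name; dtypes as strings for illustration):

```python
def dtype_supported(arch_min_cc: int, dtype: str) -> bool:
    """Sketch of the CheckArch dtype rule: bf16 needs SM >= 80, fp16 needs SM >= 70."""
    if arch_min_cc >= 80:       # Sm80+: Ampere/Ada/Hopper support fp16 and bf16
        return dtype in ("fp16", "bf16")
    if arch_min_cc >= 70:       # Sm70/Sm75: pre-Ampere Tensor Cores, fp16 only
        return dtype == "fp16"
    return False                # pre-Volta targets are never dispatched
```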

op_builder/evoformer_attn.py

Lines changed: 17 additions & 19 deletions
```diff
@@ -16,12 +16,6 @@ def __init__(self, name=None):
         name = self.NAME if name is None else name
         super().__init__(name=name)
         self.cutlass_path = os.environ.get("CUTLASS_PATH")
-        # Target GPU architecture.
-        # Current useful values are: 70, 75, 80.
-        # For modern GPUs, 80 is the right value.
-        # No specializations of the kernel beyond Ampere are implemented
-        # See gemm_kernel_utils.h (also in cutlass example for fused attention) and cutlass/arch/arch.h
-        self.gpu_arch = os.environ.get("DS_EVOFORMER_GPU_ARCH")
 
     def absolute_name(self):
         return f"deepspeed.ops.{self.NAME}_op"
@@ -37,19 +31,23 @@ def sources(self):
         return [f"{src_dir}/attention.cpp", f"{src_dir}/attention_back.cu", f"{src_dir}/attention_cu.cu"]
 
     def nvcc_args(self):
-        args = super().nvcc_args()
-        if not self.gpu_arch:
-            try:
-                import torch
-            except ImportError:
-                self.warning("Please install torch if trying to pre-compile kernels")
-                return args
-            major = torch.cuda.get_device_properties(0).major  #ignore-cuda
-            minor = torch.cuda.get_device_properties(0).minor  #ignore-cuda
-            args.append(f"-DGPU_ARCH={major}{minor}")
-        else:
-            args.append(f"-DGPU_ARCH={self.gpu_arch}")
-        return args
+        if os.environ.get("DS_EVOFORMER_GPU_ARCH"):
+            self.warning("DS_EVOFORMER_GPU_ARCH is deprecated and ignored for Evoformer builds. "
+                         "Use TORCH_CUDA_ARCH_LIST to control build targets.")
+        return super().nvcc_args()
+
+    def filter_ccs(self, ccs):
+        """Keep only Tensor Core capable targets (>= 7.0)."""
+        retained = []
+        pruned = []
+        for cc in [cc.split('.') for cc in ccs]:
+            if int(cc[0]) >= 7:
+                retained.append(cc)
+            else:
+                pruned.append(cc)
+        if pruned:
+            self.warning(f"Evoformer: excluding targets below SM 7.0: {pruned}. Tensor Core required.")
+        return retained
 
     def is_compatible(self, verbose=False):
         try:
```
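The pruning performed by the new `filter_ccs` hook can be illustrated standalone. The sketch below re-implements the same logic outside the builder class (a hypothetical free function, since constructing the real builder needs a CUDA toolchain); it splits each `major.minor` entry and keeps only SM 7.0+ targets, preserving any `+PTX` suffix on the minor component:

```python
def filter_ccs(ccs, warn=print):
    """Drop compute capabilities below SM 7.0, as the Evoformer builder now does."""
    retained, pruned = [], []
    for cc in (c.split('.') for c in ccs):
        # cc is ["major", "minor"]; a "+PTX" suffix stays attached to the minor part
        (retained if int(cc[0]) >= 7 else pruned).append(cc)
    if pruned:
        warn(f"Evoformer: excluding targets below SM 7.0: {pruned}. Tensor Core required.")
    return retained
```

For example, with `TORCH_CUDA_ARCH_LIST='6.1;7.0;8.0+PTX'` this retains the 7.0 and 8.0+PTX slices and warns that 6.1 was excluded.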
Lines changed: 46 additions & 0 deletions
```diff
@@ -0,0 +1,46 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+from pathlib import Path
+from unittest.mock import patch
+
+from deepspeed.ops.op_builder.builder import CUDAOpBuilder
+# Import the concrete builder class instead of the accelerator-dispatched alias.
+from deepspeed.ops.op_builder.evoformer_attn import EvoformerAttnBuilder
+
+
+def test_filter_ccs_removes_below_70_and_keeps_ptx_suffix():
+    builder = EvoformerAttnBuilder()
+    result = builder.filter_ccs(["6.0", "6.1", "7.0", "8.0+PTX"])
+
+    majors = [int(cc[0]) for cc in result]
+    assert 6 not in majors
+    assert 7 in majors
+    assert 8 in majors
+
+    ptx_entries = [cc for cc in result if cc[1].endswith("+PTX")]
+    assert len(ptx_entries) == 1
+    assert ptx_entries[0] == ["8", "0+PTX"]
+
+
+def test_nvcc_args_deprecates_env_and_omits_gpu_arch_define():
+    builder = EvoformerAttnBuilder()
+    with patch.dict("os.environ", {"DS_EVOFORMER_GPU_ARCH": "80"}, clear=False):
+        with patch.object(builder, "warning") as warn:
+            with patch.object(CUDAOpBuilder, "nvcc_args", return_value=["-O3", "-lineinfo"]):
+                args = builder.nvcc_args()
+
+    warning_messages = [call.args[0] for call in warn.call_args_list if call.args]
+    assert any("DS_EVOFORMER_GPU_ARCH is deprecated and ignored" in msg for msg in warning_messages)
+    assert all("-DGPU_ARCH=" not in arg for arg in args)
+
+
+def test_no_cuda_arch_in_checkarch():
+    header = Path(__file__).resolve().parents[4] / "csrc/deepspeed4science/evoformer_attn/gemm_kernel_utils.h"
+    text = header.read_text()
+    start = text.index("struct CheckArch")
+    end = text.index("};", start) + 2
+    block = text[start:end]
+    assert "__CUDA_ARCH__" not in block
```
