diff --git a/docs/_tutorials/ds4sci_evoformerattention.md b/docs/_tutorials/ds4sci_evoformerattention.md index bf6956840eb0..9c3f3e1c6051 100644 --- a/docs/_tutorials/ds4sci_evoformerattention.md +++ b/docs/_tutorials/ds4sci_evoformerattention.md @@ -15,14 +15,13 @@ tags: training inference `DS4Sci_EvoformerAttention` is released as part of DeepSpeed >= 0.10.3. -`DS4Sci_EvoformerAttention` is implemented based on [CUTLASS](https://github.com/NVIDIA/cutlass). You need to clone the CUTLASS repository and specify the path to it in the environment variable `CUTLASS_PATH`. +`DS4Sci_EvoformerAttention` is implemented based on [CUTLASS](https://github.com/NVIDIA/cutlass). DeepSpeed automatically searches for CUTLASS in the following locations: the [nvidia-cutlass](https://pypi.org/project/nvidia-cutlass/) Python package; the prefixes of the active Python environment and those listed in `CMAKE_PREFIX_PATH`; the directories listed in compiler include-path environment variables such as `CPATH`; a `cutlass` checkout next to DeepSpeed or in the current working directory; and common system install prefixes such as `/usr/local`. CUTLASS setup detection can be ignored by setting ```CUTLASS_PATH="DS_IGNORE_CUTLASS_DETECTION"```, which is useful if you have a well setup compiler (e.g., compiling in a conda package with cutlass and the cuda compilers installed). -CUTLASS location can be automatically inferred using pypi's [nvidia-cutlass](https://pypi.org/project/nvidia-cutlass/) package by setting ```CUTLASS_PATH="DS_USE_CUTLASS_PYTHON_BINDINGS"```. Note that this is discouraged as ```nvidia-cutlass``` is not maintained anymore and outdated. +If automatic detection does not find the intended installation, set `CUTLASS_PATH` to either the CUTLASS checkout root or its `include` directory. -You can always simply clone cutlass and setup ```CUTLASS_PATH```: +Alternatively, clone CUTLASS next to the DeepSpeed checkout: ```shell git clone https://github.com/NVIDIA/cutlass -export CUTLASS_PATH=/path/to/cutlass ``` The kernels will be compiled when `DS4Sci_EvoformerAttention` is called for the first time. 
@@ -43,7 +42,6 @@ Evoformer now supports mixed-architecture packaging directly via Example: ```shell -CUTLASS_PATH=/path/to/cutlass \ TORCH_CUDA_ARCH_LIST='7.0;8.0' \ DS_BUILD_OPS=0 DS_BUILD_EVOFORMER_ATTN=1 \ pip install -e . diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py index 3686ed373530..90e902f4e191 100644 --- a/op_builder/evoformer_attn.py +++ b/op_builder/evoformer_attn.py @@ -4,18 +4,23 @@ # DeepSpeed Team from .builder import CUDAOpBuilder, installed_cuda_version +import importlib import os from pathlib import Path +import sys class EvoformerAttnBuilder(CUDAOpBuilder): BUILD_VAR = "DS_BUILD_EVOFORMER_ATTN" NAME = "evoformer_attn" + CUTLASS_IGNORE = "DS_IGNORE_CUTLASS_DETECTION" + CUTLASS_PYTHON_BINDINGS = "DS_USE_CUTLASS_PYTHON_BINDINGS" def __init__(self, name=None): name = self.NAME if name is None else name super().__init__(name=name) self.cutlass_path = os.environ.get("CUTLASS_PATH") + self._resolved_cutlass_path = None def absolute_name(self): return f"deepspeed.ops.{self.NAME}_op" @@ -57,21 +62,20 @@ def is_compatible(self, verbose=False): self.warning("Please install torch if trying to pre-compile kernels") return False - if self.cutlass_path is None: - if verbose: - self.warning("Please specify CUTLASS location directory as environment variable CUTLASS_PATH") - self.warning( - "Possible values are: a path, DS_IGNORE_CUTLASS_DETECTION and DS_USE_CUTLASS_PYTHON_BINDINGS") - return False - - if self.cutlass_path != "DS_IGNORE_CUTLASS_DETECTION": + if self.cutlass_path != self.CUTLASS_IGNORE: try: self.include_paths() - except (RuntimeError, ImportError): + except (RuntimeError, ImportError) as exc: + if verbose: + self.warning(str(exc)) return False # Check version in case it is a CUTLASS_PATH points to a CUTLASS checkout - if os.path.exists(f"{self.cutlass_path}/CHANGELOG.md"): - with open(f"{self.cutlass_path}/CHANGELOG.md", "r") as f: + if self._resolved_cutlass_path is not None: + changelog_path = 
self._resolved_cutlass_path / "CHANGELOG.md" + else: + changelog_path = None + if changelog_path is not None and changelog_path.exists(): + with open(changelog_path, "r") as f: if "3.1.0" not in f.read(): if verbose: self.warning("Please use CUTLASS version >= 3.1.0") @@ -94,26 +98,114 @@ def is_compatible(self, verbose=False): cuda_okay = False return super().is_compatible(verbose) and cuda_okay + @staticmethod + def _repo_root(): + return Path(__file__).resolve().parents[1] + + @staticmethod + def _dedupe_paths(paths): + deduped = [] + seen = set() + for path in paths: + path = Path(path).expanduser() + key = str(path) + if key not in seen: + seen.add(key) + deduped.append(path) + return deduped + + @staticmethod + def _env_paths(*names): + paths = [] + for name in names: + value = os.environ.get(name) + if not value: + continue + paths.extend(Path(path) for path in value.split(os.pathsep) if path) + return paths + + @staticmethod + def _python_package_cutlass_paths(): + try: + cutlass_library = importlib.import_module("cutlass_library") + except ImportError: + return [] + + candidates = [] + source_path = getattr(cutlass_library, "source_path", None) + if source_path is not None: + candidates.append(Path(source_path)) + + package_file = getattr(cutlass_library, "__file__", None) + if package_file is not None: + package_dir = Path(package_file).resolve().parent + candidates.extend([package_dir / "source", package_dir.parent, package_dir]) + return candidates + + def _candidate_cutlass_paths(self): + if self.cutlass_path == self.CUTLASS_PYTHON_BINDINGS: + candidates = self._python_package_cutlass_paths() + if candidates: + return candidates + self.warning("Please pip install nvidia-cutlass") + raise ImportError("Unable to locate CUTLASS from the nvidia-cutlass Python package") + + if self.cutlass_path: + return [Path(self.cutlass_path)] + + repo_root = self._repo_root() + python_prefixes = self._dedupe_paths([Path(sys.prefix), Path(sys.exec_prefix), 
Path(sys.base_prefix)]) + prefix_paths = self._env_paths("CUTLASS_ROOT", "CUTLASS_HOME", "CONDA_PREFIX", "VIRTUAL_ENV", + "CMAKE_PREFIX_PATH", "CUDA_HOME", "CUDA_PATH") + include_paths = self._env_paths("CPATH", "CPLUS_INCLUDE_PATH", "C_INCLUDE_PATH") + + return self._dedupe_paths([ + *self._python_package_cutlass_paths(), + *prefix_paths, + *python_prefixes, + *include_paths, + Path.cwd() / "cutlass", + repo_root / "cutlass", + repo_root.parent / "cutlass", + Path("/usr/local/cutlass"), + Path("/opt/cutlass"), + Path("/usr/local"), + Path("/usr"), + ]) + + @staticmethod + def _cutlass_include_dirs(cutlass_path): + cutlass_path = cutlass_path.expanduser().resolve() + if not cutlass_path.is_dir(): + return [] + + if (cutlass_path / "include" / "cutlass" / "cutlass.h").is_file(): + include_root = cutlass_path / "include" + util_include = cutlass_path / "tools" / "util" / "include" + elif (cutlass_path / "cutlass" / "cutlass.h").is_file(): + include_root = cutlass_path + util_include = cutlass_path.parent / "tools" / "util" / "include" + else: + return [] + + include_dirs = [include_root] + if util_include.is_dir(): + include_dirs.append(util_include) + return [str(include_dir) for include_dir in include_dirs] + def include_paths(self): # Assume the user knows best and CUTLASS location is already setup externally - if self.cutlass_path == "DS_IGNORE_CUTLASS_DETECTION": + if self.cutlass_path == self.CUTLASS_IGNORE: return [] - # Use header files vendored with deprecated python packages - if self.cutlass_path == "DS_USE_CUTLASS_PYTHON_BINDINGS": - try: - import cutlass_library - cutlass_path = Path(cutlass_library.__file__).parent / "source" - except ImportError: - self.warning("Please pip install nvidia-cutlass (note that this is deprecated and likely outdated)") - raise - # Use hardcoded path in CUTLASS_PATH - else: - cutlass_path = Path(self.cutlass_path) - cutlass_path = cutlass_path.resolve() - if not cutlass_path.is_dir(): - raise RuntimeError(f"CUTLASS_PATH 
{cutlass_path} does not exist") - include_dirs = cutlass_path / "include", cutlass_path / "tools" / "util" / "include" - include_dirs = [str(include_dir) for include_dir in include_dirs if include_dir.is_dir()] - if not include_dirs: - raise RuntimeError(f"CUTLASS_PATH {cutlass_path} does not contain any include directories") - return include_dirs + + for cutlass_path in self._candidate_cutlass_paths(): + include_dirs = self._cutlass_include_dirs(cutlass_path) + if include_dirs: + self._resolved_cutlass_path = cutlass_path.expanduser().resolve() + return include_dirs + + if self.cutlass_path: + raise RuntimeError(f"CUTLASS_PATH {self.cutlass_path} does not contain CUTLASS headers") + + raise RuntimeError("Unable to locate CUTLASS. Install nvidia-cutlass, clone CUTLASS next to DeepSpeed, " + "or set CUTLASS_PATH to the CUTLASS checkout.") diff --git a/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py b/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py index f85a7e4b6a18..e242e0a3cd05 100644 --- a/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py +++ b/tests/benchmarks/DS4Sci_EvoformerAttention_bench.py @@ -6,7 +6,7 @@ This script is to test the performance of the DS4Sci_EvoformerAttention op. To run the script, 1. Clone the CUTLASS repo. E.g. git clone https://github.com/NVIDIA/cutlass.git -2. Specify the CUTLASS_PATH environment variable. E.g. export CUTLASS_PATH=$(pwd)/cutlass +2. DeepSpeed will detect a local or installed CUTLASS. If needed, set CUTLASS_PATH explicitly. 3. Run the script. E.g. 
python DS4Sci_EvoformerAttention_bench.py """ diff --git a/tests/unit/ops/deepspeed4science/test_evoformer_attn_builder.py b/tests/unit/ops/deepspeed4science/test_evoformer_attn_builder.py index 0808995cd0cc..ff78839c8900 100644 --- a/tests/unit/ops/deepspeed4science/test_evoformer_attn_builder.py +++ b/tests/unit/ops/deepspeed4science/test_evoformer_attn_builder.py @@ -6,11 +6,22 @@ from pathlib import Path from unittest.mock import patch +import pytest + from deepspeed.ops.op_builder.builder import CUDAOpBuilder # Import the concrete builder class instead of the accelerator-dispatched alias. from deepspeed.ops.op_builder.evoformer_attn import EvoformerAttnBuilder +def make_cutlass_checkout(path): + include_dir = path / "include" / "cutlass" + include_dir.mkdir(parents=True) + (include_dir / "cutlass.h").write_text("// cutlass marker\n") + util_dir = path / "tools" / "util" / "include" + util_dir.mkdir(parents=True) + return path + + def test_filter_ccs_removes_below_70_and_keeps_ptx_suffix(): builder = EvoformerAttnBuilder() result = builder.filter_ccs(["6.0", "6.1", "7.0", "8.0+PTX"]) @@ -44,3 +55,64 @@ def test_no_cuda_arch_in_checkarch(): end = text.index("};", start) + 2 block = text[start:end] assert "__CUDA_ARCH__" not in block + + +def test_include_paths_uses_cutlass_path_env(tmp_path): + cutlass_path = make_cutlass_checkout(tmp_path / "cutlass") + + with patch.dict("os.environ", {"CUTLASS_PATH": str(cutlass_path)}, clear=False): + builder = EvoformerAttnBuilder() + + assert builder.include_paths() == [ + str(cutlass_path / "include"), + str(cutlass_path / "tools" / "util" / "include"), + ] + + +def test_include_paths_finds_python_package_candidate_without_env(tmp_path): + cutlass_path = make_cutlass_checkout(tmp_path / "python_package_cutlass") + + with patch.dict("os.environ", {}, clear=True): + builder = EvoformerAttnBuilder() + + with patch.object(EvoformerAttnBuilder, "_python_package_cutlass_paths", return_value=[cutlass_path]): + assert 
builder.include_paths()[0] == str(cutlass_path / "include") + + +def test_include_paths_finds_cutlass_from_cmake_prefix_path(tmp_path): + cutlass_path = make_cutlass_checkout(tmp_path / "prefix") + + with patch.dict("os.environ", {"CMAKE_PREFIX_PATH": str(cutlass_path)}, clear=True): + builder = EvoformerAttnBuilder() + with patch.object(EvoformerAttnBuilder, "_python_package_cutlass_paths", return_value=[]): + assert builder.include_paths()[0] == str(cutlass_path / "include") + + +def test_include_paths_finds_cutlass_from_compiler_include_path(tmp_path): + cutlass_path = make_cutlass_checkout(tmp_path / "prefix") + + with patch.dict("os.environ", {"CPATH": str(cutlass_path / "include")}, clear=True): + builder = EvoformerAttnBuilder() + with patch.object(EvoformerAttnBuilder, "_python_package_cutlass_paths", return_value=[]): + assert builder.include_paths()[0] == str(cutlass_path / "include") + + +def test_include_paths_accepts_cutlass_include_dir_directly(tmp_path): + cutlass_path = make_cutlass_checkout(tmp_path / "cutlass") + + with patch.dict("os.environ", {"CUTLASS_PATH": str(cutlass_path / "include")}, clear=False): + builder = EvoformerAttnBuilder() + + assert builder.include_paths() == [ + str(cutlass_path / "include"), + str(cutlass_path / "tools" / "util" / "include"), + ] + + +def test_include_paths_reports_missing_cutlass(tmp_path): + with patch.dict("os.environ", {}, clear=True): + builder = EvoformerAttnBuilder() + + with patch.object(builder, "_candidate_cutlass_paths", return_value=[tmp_path / "missing"]): + with pytest.raises(RuntimeError, match="Unable to locate CUTLASS"): + builder.include_paths()