Skip to content

Commit 547d284

Browse files
authored
Revert "Expert Parallelism: common C API + NCCL EP backend" (#3126)
Revert "Expert Parallelism: common C API + NCCL EP backend (#3034)"

This reverts commit c3396ee.

Signed-off-by: Tim Moon <tmoon@nvidia.com>
1 parent c3396ee commit 547d284

16 files changed

Lines changed: 7 additions & 2450 deletions

File tree

.gitmodules

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,3 @@
77
[submodule "3rdparty/cutlass"]
88
path = 3rdparty/cutlass
99
url = https://github.com/NVIDIA/cutlass.git
10-
[submodule "3rdparty/nccl"]
11-
path = 3rdparty/nccl
12-
url = https://github.com/NVIDIA/nccl.git

3rdparty/nccl

Lines changed: 0 additions & 1 deletion
This file was deleted.

qa/L1_cpp_distributed/test.sh

Lines changed: 5 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -2,52 +2,16 @@
22
#
33
# See LICENSE for license information.
44

5-
function test_fail() {
6-
RET=1
7-
FAILED_CASES="$FAILED_CASES $1"
8-
echo "Error: sub-test failed: $1"
9-
}
10-
11-
RET=0
12-
FAILED_CASES=""
5+
set -e
136

147
# Find TE
158
: ${TE_PATH:=/opt/transformerengine}
16-
: ${XML_LOG_DIR:=/logs}
17-
mkdir -p "$XML_LOG_DIR"
18-
199
TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}')
2010
export LD_LIBRARY_PATH=$TE_LIB_PATH:$LD_LIBRARY_PATH
2111

2212
if [[ $(nvidia-smi --list-gpus | wc -l) -ge 4 ]]; then
23-
cd $TE_PATH/tests/cpp_distributed
24-
configure_ok=1
25-
cmake -GNinja -S. -Bbuild || { test_fail "configure"; configure_ok=0; }
26-
27-
# Build each suite independently so one suite's build failure does not
28-
# mask the other's results. Skip mpirun when the binary is missing.
29-
if [[ $configure_ok -eq 1 ]]; then
30-
comm_gemm_ok=1
31-
ep_ok=1
32-
cmake --build build --target test_comm_gemm || { test_fail "test_comm_gemm_build"; comm_gemm_ok=0; }
33-
cmake --build build --target test_ep || { test_fail "test_ep_build"; ep_ok=0; }
34-
35-
if [[ $comm_gemm_ok -eq 1 ]]; then
36-
# Per-rank XML to avoid a write race on a shared path.
37-
mpirun --allow-run-as-root --np 4 --oversubscribe bash -c \
38-
"exec ./build/test_comm_gemm --gtest_output=xml:$XML_LOG_DIR/cpp_distributed_test_comm_gemm.rank\${OMPI_COMM_WORLD_RANK}.xml" \
39-
|| test_fail "test_comm_gemm"
40-
fi
41-
42-
if [[ $ep_ok -eq 1 ]]; then
43-
# EP suites; runner self-skips on pre-Hopper GPUs.
44-
GTEST_XML_PREFIX="$XML_LOG_DIR/cpp_distributed_test_ep" \
45-
bash ./run_test_ep.sh 4 ./build || test_fail "test_ep"
46-
fi
47-
fi
48-
fi
49-
50-
if [ "$RET" -ne 0 ]; then
51-
echo "FAILED sub-tests:$FAILED_CASES"
13+
cd $TE_PATH/tests/cpp_distributed
14+
cmake -GNinja -S. -Bbuild
15+
cmake --build build
16+
mpirun --allow-run-as-root --np 4 --oversubscribe ./build/test_comm_gemm
5217
fi
53-
exit $RET

setup.py

Lines changed: 0 additions & 157 deletions
Original file line number | Diff line number | Diff line change
@@ -84,31 +84,6 @@ def setup_common_extension() -> CMakeExtension:
8484
cusolvermp_dir = os.getenv("CUSOLVERMP_HOME", "/usr")
8585
cmake_flags.append(f"-DCUSOLVERMP_DIR={cusolvermp_dir}")
8686

87-
# NCCL EP (Hopper+): on by default; auto-skipped when no arch >= 90 is
88-
# targeted. Set NVTE_WITH_NCCL_EP=0 to force off.
89-
nccl_ep_env = os.getenv("NVTE_WITH_NCCL_EP")
90-
nccl_ep_explicit = nccl_ep_env is not None
91-
build_with_nccl_ep = bool(int(nccl_ep_env if nccl_ep_explicit else "1"))
92-
if build_with_nccl_ep:
93-
arch_tokens = [a.strip() for a in str(archs or "").split(";") if a.strip()]
94-
has_hopper_or_newer = any(
95-
t.lower() == "native" or (t.rstrip("af").isdigit() and int(t.rstrip("af")) >= 90)
96-
for t in arch_tokens
97-
)
98-
if not has_hopper_or_newer:
99-
if nccl_ep_explicit:
100-
raise RuntimeError(
101-
f"NVTE_WITH_NCCL_EP=1 was set but NVTE_CUDA_ARCHS ('{archs}') "
102-
"contains no arch >= 90. NCCL EP requires Hopper or newer."
103-
)
104-
print(f"[NCCL EP] No arch >= 90 in NVTE_CUDA_ARCHS ('{archs}'); skipping build.")
105-
build_with_nccl_ep = False
106-
if build_with_nccl_ep:
107-
nccl_home = build_nccl_ep_submodule()
108-
cmake_flags.append(f"-DNCCL_INCLUDE_DIR={nccl_home}/include")
109-
else:
110-
cmake_flags.append("-DNVTE_WITH_NCCL_EP=OFF")
111-
11287
# Add custom CMake arguments from environment variable
11388
nvte_cmake_extra_args = os.getenv("NVTE_CMAKE_EXTRA_ARGS")
11489
if nvte_cmake_extra_args:
@@ -155,138 +130,6 @@ def setup_requirements() -> Tuple[List[str], List[str]]:
155130
return [remove_dups(reqs) for reqs in [install_reqs, test_reqs]]
156131

157132

158-
def _discover_nccl_home() -> str:
159-
"""Resolve NCCL_HOME: honor env var, else probe well-known prefixes, else ldconfig."""
160-
env_home = os.environ.get("NCCL_HOME")
161-
if env_home:
162-
if (Path(env_home) / "include" / "nccl.h").exists():
163-
return env_home
164-
print(
165-
f"[NCCL EP] WARNING: NCCL_HOME='{env_home}' is set but "
166-
f"'{env_home}/include/nccl.h' was not found; falling back to system probes."
167-
)
168-
169-
lib_names = ("libnccl.so", "libnccl.so.2")
170-
# Include Debian/Ubuntu multiarch subdirs (e.g. lib/aarch64-linux-gnu).
171-
lib_subdirs = ("lib", "lib64", "lib/aarch64-linux-gnu", "lib/x86_64-linux-gnu")
172-
173-
# pip-installed NCCL (nvidia-nccl-cu* wheel) lives under nvidia/nccl in
174-
# site-packages and has no top-level include/lib layout.
175-
try:
176-
import importlib.util
177-
178-
spec = importlib.util.find_spec("nvidia.nccl")
179-
if spec is not None and spec.submodule_search_locations:
180-
pip_root = Path(next(iter(spec.submodule_search_locations)))
181-
if (pip_root / "include" / "nccl.h").exists() and any(
182-
(pip_root / sub / name).exists() for sub in lib_subdirs for name in lib_names
183-
):
184-
return str(pip_root)
185-
except (ImportError, ValueError):
186-
pass
187-
188-
for cand in ("/opt/nvidia/nccl", "/usr/local/nccl", "/usr"):
189-
p = Path(cand)
190-
if (p / "include" / "nccl.h").exists() and any(
191-
(p / sub / name).exists() for sub in lib_subdirs for name in lib_names
192-
):
193-
return str(p)
194-
195-
try:
196-
out = subprocess.check_output(["ldconfig", "-p"], stderr=subprocess.DEVNULL).decode()
197-
for line in out.splitlines():
198-
if "libnccl.so" in line and "=>" in line:
199-
lib_path = Path(line.split("=>")[-1].strip())
200-
# Walk upward so multiarch layouts (.../lib/<triplet>/libnccl.so)
201-
# resolve to the prefix that contains include/nccl.h.
202-
for root in (lib_path.parent.parent, lib_path.parent.parent.parent):
203-
if (root / "include" / "nccl.h").exists():
204-
return str(root)
205-
except (subprocess.CalledProcessError, FileNotFoundError):
206-
pass
207-
208-
raise RuntimeError(
209-
"Could not locate NCCL core (nccl.h + libnccl.so). Set NCCL_HOME to the install prefix."
210-
)
211-
212-
213-
def build_nccl_ep_submodule() -> str:
214-
"""Build libnccl_ep.a from the 3rdparty/nccl submodule and return NCCL_HOME."""
215-
nccl_root = current_file_path / "3rdparty" / "nccl"
216-
if not (nccl_root / "Makefile").exists():
217-
raise RuntimeError(
218-
f"NCCL submodule not found at {nccl_root}. "
219-
"Run `git submodule update --init --recursive`."
220-
)
221-
222-
build_dir = nccl_root / "build"
223-
nccl_ep_lib = build_dir / "lib" / "libnccl_ep.a"
224-
gencode_stamp = build_dir / "lib" / "libnccl_ep.gencode"
225-
226-
# Caller gates on arch >= 90 or "native"; expand "native" to the host's
227-
# actual sm_XX so the build stamp distinguishes machines.
228-
arch_tokens = [a.strip() for a in str(cuda_archs() or "").split(";") if a.strip()]
229-
arch_list: list[str] = []
230-
for t in arch_tokens:
231-
if t.lower() == "native":
232-
try:
233-
out = subprocess.check_output(
234-
["nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader"],
235-
stderr=subprocess.DEVNULL,
236-
).decode()
237-
except (subprocess.CalledProcessError, FileNotFoundError) as e:
238-
raise RuntimeError(
239-
"NVTE_CUDA_ARCHS=native requires nvidia-smi to resolve the host arch."
240-
) from e
241-
for line in out.splitlines():
242-
cap = line.strip().replace(".", "")
243-
if cap.isdigit() and int(cap) >= 90 and cap not in arch_list:
244-
arch_list.append(cap)
245-
else:
246-
bare = t.rstrip("af")
247-
if bare.isdigit() and int(bare) >= 90 and bare not in arch_list:
248-
arch_list.append(bare)
249-
if not arch_list:
250-
raise RuntimeError(
251-
"NCCL EP requires Hopper or newer (SM >= 90); none found in"
252-
f" NVTE_CUDA_ARCHS={cuda_archs()!r}. Re-run with NVTE_WITH_NCCL_EP=0 to skip the NCCL"
253-
" EP build (the rest of TE still builds)."
254-
)
255-
gencode = " ".join(f"-gencode=arch=compute_{a},code=sm_{a}" for a in arch_list)
256-
257-
nproc = os.cpu_count() or 8
258-
env = os.environ.copy()
259-
env["NVCC_GENCODE"] = gencode
260-
# NCCL EP needs the core NCCL headers + libnccl.so; write NCCL EP build
261-
# outputs to the submodule's local build/ tree.
262-
nccl_home = _discover_nccl_home()
263-
env["NCCL_HOME"] = nccl_home
264-
env["NCCL_EP_BUILDDIR"] = str(build_dir)
265-
266-
prev_gencode = gencode_stamp.read_text().strip() if gencode_stamp.exists() else None
267-
if not nccl_ep_lib.exists() or prev_gencode != gencode:
268-
if nccl_ep_lib.exists() and prev_gencode != gencode:
269-
print(
270-
f"[NCCL EP] gencode changed ('{prev_gencode}' -> '{gencode}'); "
271-
"rebuilding libnccl_ep.a"
272-
)
273-
subprocess.check_call(
274-
["make", "-C", "contrib/nccl_ep", "clean"],
275-
cwd=str(nccl_root),
276-
env=env,
277-
)
278-
print(f"[NCCL EP] Building libnccl_ep.a (gencode='{gencode}')")
279-
subprocess.check_call(
280-
["make", "-j", str(nproc), "-C", "contrib/nccl_ep", "lib"],
281-
cwd=str(nccl_root),
282-
env=env,
283-
)
284-
gencode_stamp.parent.mkdir(parents=True, exist_ok=True)
285-
gencode_stamp.write_text(gencode)
286-
287-
return nccl_home
288-
289-
290133
def git_check_submodules() -> None:
291134
"""
292135
Attempt to checkout git submodules automatically during setup.

tests/cpp_distributed/CMakeLists.txt

Lines changed: 1 addition & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -55,32 +55,10 @@ target_include_directories(test_comm_gemm PRIVATE ${test_comm_gemm_INCLUDES})
5555
find_package(CUDAToolkit REQUIRED)
5656
find_package(OpenMP REQUIRED)
5757
find_package(MPI REQUIRED)
58-
59-
# -- NCCL core ----------------------------------------------------------------
60-
# Anchor on libnccl and derive nccl.h from the same install prefix so the
61-
# header and library can't drift across installs.
6258
find_library(NCCL_LIB
6359
NAMES nccl libnccl
64-
HINTS /opt/nvidia/nccl/lib /opt/nvidia/nccl/lib64
65-
/usr/local/nccl/lib /usr/local/nccl/lib64
66-
PATH_SUFFIXES lib lib64
60+
PATH_SUFFIXES lib
6761
REQUIRED)
68-
get_filename_component(_nccl_lib_dir "${NCCL_LIB}" DIRECTORY)
69-
set(NCCL_PREFIX "${_nccl_lib_dir}")
70-
while(NCCL_PREFIX AND NOT EXISTS "${NCCL_PREFIX}/include/nccl.h")
71-
get_filename_component(_nccl_parent "${NCCL_PREFIX}" DIRECTORY)
72-
if(_nccl_parent STREQUAL NCCL_PREFIX)
73-
break()
74-
endif()
75-
set(NCCL_PREFIX "${_nccl_parent}")
76-
endwhile()
77-
find_path(NCCL_INCLUDE_DIR nccl.h
78-
HINTS "${NCCL_PREFIX}/include"
79-
NO_DEFAULT_PATH)
80-
if(NOT NCCL_INCLUDE_DIR)
81-
message(FATAL_ERROR
82-
"nccl.h not found under the prefix of ${NCCL_LIB}.")
83-
endif()
8462
list(APPEND test_comm_gemm_LINKER_LIBS
8563
CUDA::cuda_driver
8664
CUDA::cudart
@@ -96,37 +74,3 @@ target_compile_options(test_comm_gemm PRIVATE -O2 -fopenmp)
9674

9775
include(GoogleTest)
9876
gtest_discover_tests(test_comm_gemm DISCOVERY_TIMEOUT 600)
99-
100-
# -- EP distributed tests ------------------------------------------------------
101-
# Launched via mpirun; ncclUniqueId exchange uses MPI_Bcast (see test_ep_common.h).
102-
# The test binary only uses NCCL core symbols (ncclMemAlloc, ncclCommWindow*);
103-
# all ncclEp* calls live behind TE's public <transformer_engine/ep.h>, which is
104-
# statically linked into libtransformer_engine.so.
105-
message(STATUS "EP test: NCCL headers: ${NCCL_INCLUDE_DIR}")
106-
set(EP_TEST_COMMON_INCLUDES
107-
${NCCL_INCLUDE_DIR}
108-
${MPI_CXX_INCLUDE_PATH}
109-
../../transformer_engine/common/include
110-
../../transformer_engine/common
111-
${CMAKE_CURRENT_SOURCE_DIR})
112-
113-
# nvrtc must follow TE_LIB so symbols referenced from libtransformer_engine.so
114-
# (loaded via dlopen in Python; not in its DT_NEEDED) resolve through nvrtc.
115-
set(EP_TEST_COMMON_LIBS
116-
CUDA::cuda_driver
117-
CUDA::cudart
118-
GTest::gtest
119-
${TE_LIB}
120-
CUDA::nvrtc
121-
${NCCL_LIB}
122-
MPI::MPI_CXX
123-
OpenMP::OpenMP_CXX)
124-
125-
# -- EP distributed tests (per-op + full pipeline + zero-copy symm) -----------
126-
add_executable(test_ep test_ep.cu ../cpp/test_common.cu)
127-
target_include_directories(test_ep PRIVATE ${EP_TEST_COMMON_INCLUDES})
128-
target_link_libraries(test_ep PUBLIC ${EP_TEST_COMMON_LIBS})
129-
130-
# Do NOT use gtest_discover_tests - these binaries require multi-process
131-
# launch via run_test_ep.sh, not direct single-process execution.
132-
message(STATUS "EP distributed tests enabled (NCCL EP statically linked into libtransformer_engine.so)")

tests/cpp_distributed/run_test_ep.sh

Lines changed: 0 additions & 63 deletions
This file was deleted.

0 commit comments

Comments
 (0)