diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000000..d1eb6cc8e92 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,170 @@ +cmake_minimum_required(VERSION 3.25) # ipp6 is using 3.28 + +# Version information +# Read makefiles/version.mk file +file(READ ${CMAKE_SOURCE_DIR}/makefiles/version.mk VERSION_CONTENT) +string(REGEX REPLACE ".*NCCL_MAJOR[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_MAJOR "${VERSION_CONTENT}") +string(REGEX REPLACE ".*NCCL_MINOR[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_MINOR "${VERSION_CONTENT}") +string(REGEX REPLACE ".*NCCL_PATCH[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_PATCH "${VERSION_CONTENT}") +string(REGEX REPLACE ".*NCCL_SUFFIX[ ]*:=[ ]*([a-zA-Z0-9]*).*" "\\1" NCCL_SUFFIX "${VERSION_CONTENT}") +string(REGEX REPLACE ".*PKG_REVISION[ ]*:=[ ]*([0-9]+).*" "\\1" PKG_REVISION "${VERSION_CONTENT}") +math(EXPR NCCL_VERSION_CODE "(${NCCL_MAJOR} * 10000) + (${NCCL_MINOR} * 100) + ${NCCL_PATCH}") + +# Make version information available to C++ source files +add_compile_definitions( + NCCL_USE_CMAKE + NCCL_MAJOR=${NCCL_MAJOR} + NCCL_MINOR=${NCCL_MINOR} + NCCL_PATCH=${NCCL_PATCH} + NCCL_VERSION_CODE=${NCCL_VERSION_CODE} +) + +set(ENV{NCCL_USE_CMAKE} "1") + +project(NCCL VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH} + LANGUAGES CUDA CXX C) + +# Make CMAKE_BUILD_TYPE to release by default if not set +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release") +endif() + +option(VERBOSE "Enable verbose output" OFF) +option(KEEP "Keep intermediate files" OFF) +option(DEBUG "Enable debug build" OFF) +option(ASAN "Enable Address Sanitizer" OFF) +option(UBSAN "Enable Undefined Behavior Sanitizer" OFF) +option(TRACE "Enable tracing" OFF) +option(WERROR "Treat warnings as errors" OFF) +option(PROFAPI "Enable profiling API" ON) +option(NVTX "Enable NVTX" ON) +option(RDMA_CORE "Enable RDMA core" OFF) +option(NET_PROFILER "Enable network profiler" OFF) +option(MLX5DV "Enable MLX5DV" OFF) +option(MAX_EXT_NET_PLUGINS "Maximum external network plugins" 0) + +find_package(CUDAToolkit REQUIRED) + +# CUDA version detection +string(REGEX MATCH "([0-9]+\\.[0-9]+)" CUDA_VERSION "${CUDAToolkit_VERSION}") + +# Extract major and minor version numbers +string(REGEX MATCH "([0-9]+)" CUDA_MAJOR "${CUDA_VERSION}") +string(REGEX MATCH "([0-9]+)$" CUDA_MINOR "${CUDA_VERSION}") +string(REGEX REPLACE ".*\\.([0-9]+)$" "\\1" CUDA_MINOR "${CUDA_VERSION}") + +# Add CUDA version definitions after find_package +add_compile_definitions( + CUDA_MAJOR=${CUDA_MAJOR} + CUDA_MINOR=${CUDA_MINOR} +) + +# CUDA architecture flags +if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "") + message(STATUS "CMAKE_CUDA_ARCHITECTURES not defined or empty, setting default values based on CUDA version") + + if(${CUDA_MAJOR} LESS 9) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61") + elseif(${CUDA_MAJOR} EQUAL 9) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70") + elseif(${CUDA_MAJOR} EQUAL 10) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70") + elseif(${CUDA_MAJOR} EQUAL 11) + if(${CUDA_MINOR} LESS 8) + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70;80") + else() + set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70;80;90") + endif() + elseif(${CUDA_MAJOR} EQUAL 12) + if(${CUDA_MINOR} LESS 8) + set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90") + else() + set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;120") + endif() + elseif(${CUDA_MAJOR} EQUAL 13) + set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;110;120") + else() + # For future CUDA versions, include all architectures up to the latest known + set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;110;120") + endif() +endif() +message(STATUS "Using CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}") + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -Wvla -g") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -fPIC") + +# Sanitizer options +if(ASAN) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address -static-libasan") +endif() + +if(UBSAN) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=undefined -static-libubsan") +endif() + +# Additional options +if(TRACE) + add_definitions(-DENABLE_TRACE) +endif() + +if(NOT NVTX) + add_definitions(-DNVTX_DISABLE) +endif() + +if(WERROR) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror") +endif() + +if(PROFAPI) + add_definitions(-DPROFAPI) +endif() + +set(EXTRA_LIBS) + +# RDMA and MLX5DV are Linux-specific features +if(RDMA_CORE) + add_definitions(-DNCCL_BUILD_RDMA_CORE=1) + find_library(VERBS_LIBRARY NAMES verbs) + if(VERBS_LIBRARY) + list(APPEND EXTRA_LIBS ${VERBS_LIBRARY}) + endif() +endif() + +if(MLX5DV) + add_definitions(-DNCCL_BUILD_MLX5DV=1) + find_library(MLX5_LIBRARY NAMES mlx5) + if(MLX5_LIBRARY) + list(APPEND EXTRA_LIBS ${MLX5_LIBRARY}) + endif() +endif() + +if(NET_PROFILER) + add_definitions(-DNCCL_ENABLE_NET_PROFILING=1) +endif() + +if(MAX_EXT_NET_PLUGINS GREATER 0) + add_definitions(-DNCCL_NET_MAX_PLUGINS=${MAX_EXT_NET_PLUGINS}) +endif() + +add_definitions(-DDOCA_VERBS_USE_CUDA_WRAPPER) +add_definitions(-DDOCA_VERBS_USE_NET_WRAPPER) +add_definitions(-DNCCL_GIN_PROXY_ENABLE=1) + +# Library dependencies +find_library(RT_LIBRARY NAMES rt) +if(RT_LIBRARY) + list(APPEND EXTRA_LIBS ${RT_LIBRARY}) +endif() + +# Debug/Release specific flags +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0") +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS} -O0 -G -g") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3") +set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS} -O3") + +add_subdirectory(ext-net) +add_subdirectory(ext-profiler/example) +add_subdirectory(ext-tuner/example) +add_subdirectory(src) diff --git a/examples/06_device_api/02_gin_alltoall_pure/Makefile b/examples/06_device_api/02_gin_alltoall_pure/Makefile new file mode 100644 index 00000000000..43d65cec1ec --- /dev/null +++ b/examples/06_device_api/02_gin_alltoall_pure/Makefile @@ -0,0 +1,84 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = gin_alltoall_pure_device_api + +# Common utilities +COMMON_INC = ../../common/include +COMMON_SRC = ../../common/src + +# Build configuration +INCLUDES += -I$(COMMON_INC) + +# Source files +SOURCES = main.cu $(COMMON_SRC)/utils.cc +OBJECTS = $(SOURCES:.cu=.o) +OBJECTS := $(OBJECTS:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ +else + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -lpthread -o $@ +endif + @echo "Built target $@" + +# Compile source files +%.o: %.cu + $(NVCC) $(NVCUFLAGS) $(INCLUDES) -c $< -o $@ + +%.o: %.cc +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +else + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +endif + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." +ifeq ($(MPI),1) + @echo "Running with 2 processes" + $(MPIRUN) -np 2 ./$(TARGET) +else + @echo "Running with all available GPUs" + ./$(TARGET) +endif + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: Pure GIN AlltoAll Device API" + @echo "==============================================" + @echo "" + @echo "This example demonstrates pure GPU-Initiated Networking (GIN)" + @echo "for AlltoAll operations without LSA optimizations." + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/06_device_api/02_gin_alltoall_pure/README.md b/examples/06_device_api/02_gin_alltoall_pure/README.md new file mode 100644 index 00000000000..72a0b33b30b --- /dev/null +++ b/examples/06_device_api/02_gin_alltoall_pure/README.md @@ -0,0 +1,178 @@ + + +# NCCL Device API Pure GIN AlltoAll Example + +This example demonstrates NCCL's GPU-Initiated Networking (GIN) capabilities for performing AlltoAll collective operations directly from GPU kernels using only network-based communication. + +## Overview + +This example showcases **pure GIN communication** where all data exchange happens through the network, without any Load Store Access (LSA) optimizations. This is particularly useful for: + +- Multi-node environments where ranks cannot use LSA +- Testing network performance without local optimizations +- Understanding the baseline GIN communication patterns +- Scenarios where all communication must go through the network + +## What This Example Does + +1. **Creates device communicators** using `ncclDevCommCreate` for GPU kernel access to NCCL operations +2. **Registers symmetric memory windows** with `ncclCommWindowRegister` for direct peer-to-peer access +3. **Launches GPU kernel** that performs AlltoAll operations using pure GIN for all peer communication + +## Building and Running + +The advanced examples can be built using either pthread or MPI for parallelization. pthread is the default choice. To use MPI the user needs to set `MPI=1` at build time and can optionally provide a valid MPI installation under `MPI_HOME`. + +### Build +```bash +make [MPI=1] [MPI_HOME=] [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run when compiled for pthreads (default) +```bash +[NTHREADS=N] ./gin_alltoall_pure_device_api +``` + +### Run when compiled for MPI +```bash +mpirun -np ./gin_alltoall_pure_device_api +``` + +## Code Walk-through + +### Device Communicator Creation (Host-side) +The `ncclDevComm` is the core component enabling GPU kernels to perform network communication directly. For pure GIN communication, we configure the device communicator with GIN-specific resources. The `ncclDevCommRequirements` specifies GIN barriers for network synchronization and signals for completion detection. Unlike LSA-based examples, we don't need LSA barriers since all communication goes through the network. + +```cpp +ncclDevComm devComm; +ncclDevCommRequirements reqs; +memset(&reqs, 0, sizeof(reqs)); +// GIN barriers enable cross-node synchronization over the network +reqs.railGinBarrierCount = NCCL_DEVICE_CTA_COUNT; +// GIN signals provide completion notifications for asynchronous operations +reqs.ginSignalCount = 1; + +// Create device communicator with pure GIN support +NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm)); +``` + +### Memory Window Registration (Host-side) +The device API requires symmetric memory windows registered using `NCCL_WIN_COLL_SYMMETRIC`. These windows enable GPU kernels to access remote memory through GIN operations. Unlike LSA which provides direct memory access, GIN windows are accessed through network put/get operations. + +```cpp +ncclWindow_t send_win; +ncclWindow_t recv_win; + +// Register symmetric windows for GIN network access +NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC)); +NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC)); +``` + +### GIN Barriers (Device-side) +GIN barriers enable cross-node synchronization from device code over the network. Each thread block uses `blockIdx.x` to select its dedicated barrier, allowing blocks to progress independently while coordinating with corresponding blocks on other nodes. This is crucial for ensuring all ranks are ready before starting the AlltoAll exchange. + +```cpp +// GIN barriers coordinate GPU threads across different nodes over network +ncclGinBarrierSession bar { + ncclCoopCta(), // Barrier scope: entire CTA (thread block) + gin, // GIN context for network operations + ncclTeamWorld(devComm), // Team spanning all ranks + devComm.railGinBarrier, // GIN barrier handle + blockIdx.x // Barrier index: matches our CTA index +}; +bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed); +``` + +### GIN Put Operations (Device-side) +GIN provides one-sided put operations for direct remote memory writes over the network. Each thread handles a subset of destination ranks, writing its rank's data to the appropriate location in each peer's receive buffer. The `ncclGin_SignalInc` parameter increments a signal counter, enabling asynchronous completion detection. + +```cpp +// Send data to all peers via GIN network operations +const size_t size = count * sizeof(T); +for (int r = tid; r < devComm.nRanks; r += nthreads) { + gin.put(ncclTeamWorld(devComm), r, + recvwin, recvoffset + devComm.rank * size, // Destination: peer r's buffer + sendwin, sendoffset + r * size, // Source: data for peer r + size, ncclGin_SignalInc{signalIndex}); // Signal increment for completion +} +``` + +### Signal-based Completion (Device-side) +GIN uses signals for asynchronous completion detection of network operations. The kernel waits for the signal value to reach the expected count (initial value + number of ranks), indicating all put operations have completed. The `gin.flush()` ensures all pending operations are committed before proceeding. + +```cpp +// Wait for all remote puts to complete +gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + devComm.nRanks); +gin.flush(ncclCoopCta()); // Ensure all operations are committed +``` + +## Expected Output + +``` +Starting Pure GIN AlltoAll initialization + Rank 0 using GPU device 0 + Rank 1 using GPU device 1 + Rank 0 initialized NCCL communicator for 2 total ranks + Rank 1 initialized NCCL communicator for 2 total ranks + Rank 0 initialized send data + Rank 1 initialized send data + Rank 0 created device communicator with GIN support + Rank 1 created device communicator with GIN support +Starting Pure GIN AlltoAll with 1024 elements per rank (2048 total elements, 0 MB) + +=== Executing Pure GIN AlltoAll === + Rank 0 completed pure GIN AlltoAll kernel + Rank 1 completed pure GIN AlltoAll kernel +Pure GIN AlltoAll result: PASSED +``` + +## When to Use + +- **Multi-node environments**: When ranks cannot use LSA +- **Testing network performance**: Without local optimizations +- **Understanding the baseline GIN communication patterns** +- **Scenarios where all communication must go through the network** + +## Performance Considerations + +- **Network overhead**: All communication goes through the network stack +- **Signal-based completion**: Enables asynchronous operation patterns +- **Barrier synchronization**: Ensures proper ordering of network operations +- **Multiple GIN contexts**: Can improve parallel communication performance + +## Common Issues and Solutions + +### Issue: Deadlock at util_broadcast +**Solution:** Ensure you're running with multiple GPUs/processes +```bash +NTHREADS=2 ./gin_alltoall_pure_device_api # For 2 GPUs +``` + +### Issue: CUDA out of memory +**Solution:** Reduce the data size in the example + +### Issue: Network errors +**Solution:** Ensure proper network configuration for multi-node setups + +## Performance Notes + +- These are educational examples, not optimized for performance +- Real implementations should consider: + - Optimal GIN context usage for parallel operations + - Signal pool management for high-throughput scenarios + - Memory coalescing patterns for network operations + - Network topology-aware communication strategies + +## Error Handling + +The example uses comprehensive error checking for CUDA, NCCL, and GIN operations. Device kernels should implement proper error handling for network operations and signal management. + +## Next Steps + +After understanding this example, explore: +- **Custom network protocols**: Implement specialized communication patterns using GIN +- **Performance optimization**: Fine-tune GIN context usage and signal management +- **Hybrid approaches**: Combine GIN with LSA for topology-aware optimizations +- **Integration with compute**: Fuse network communication with computation kernels diff --git a/examples/06_device_api/02_gin_alltoall_pure/main.cu b/examples/06_device_api/02_gin_alltoall_pure/main.cu new file mode 100644 index 00000000000..08fdddf5e6e --- /dev/null +++ b/examples/06_device_api/02_gin_alltoall_pure/main.cu @@ -0,0 +1,251 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + + #include "cuda_runtime.h" + #include "nccl.h" + #include "nccl_device.h" + #include "utils.h" + #include + #include + #include + #include + #include + +/* + * NCCL Device API Pure GIN AlltoAll Example + * + * This example demonstrates NCCL's GPU-Initiated Networking (GIN) capabilities + * for performing AlltoAll collective operations directly from GPU kernels using + * only network-based communication. + * GIN enables GPU kernels to initiate network communication without CPU + * intervention, providing low-latency communication for distributed applications. + * + * Learning Objectives: + * - Understand pure GIN (GPU-Initiated Networking) communication + * - Learn how to use ncclGin for device-initiated network communication + * - See pure GIN AlltoAll implementation for network-based communication + * - Practice GIN barriers and signal-based synchronization + * + * Key GIN Concepts: + * - ncclGin: Device-side networking object for kernel-initiated communication + * - GIN contexts: Network communication channels for parallel operations + * - GIN signals: Completion notifications for asynchronous operations + * - GIN barriers: Network-based synchronization across ranks + * - One-sided put operations: Direct remote memory writes over network + * + * When to Use Pure GIN: + * - Communication between ranks that cannot use LSA (different nodes) + * - Network-based collective operations in multi-node environments + * - Scenarios where all communication must go through the network + * - Testing network performance without local optimizations + * + * Performance Considerations: + * - GIN provides network communication from GPU kernels + * - All communication goes through the network (no local optimizations) + * - Signal-based completion detection enables asynchronous operation + * - Multiple GIN contexts can improve parallel communication performance + */ + +// Device API kernel launch configuration +// CTA count must match railGinBarrierCount for proper barrier synchronization + #define NCCL_DEVICE_CTA_COUNT 1 + #define NCCL_DEVICE_THREADS_PER_CTA 512 + + // ========================================================================== + // Device Kernel Implementations + // ========================================================================== + +// Pure GIN AlltoAll kernel - uses GIN for all peer communication +// This kernel demonstrates network-based AlltoAll using GPU-initiated networking +template +__global__ void PureGinAlltoAllKernel(ncclWindow_t sendwin, size_t sendoffset, + ncclWindow_t recvwin, size_t recvoffset, + size_t count, int root, struct ncclDevComm devComm) { + int ginContext = 0; + unsigned int signalIndex = 0; + ncclGin gin { devComm, ginContext }; + uint64_t signalValue = gin.readSignal(signalIndex); + + // GIN barriers enable coordination between GPU threads across different ranks over network + ncclGinBarrierSession bar { ncclCoopCta(), gin, ncclTeamWorld(devComm), + devComm.railGinBarrier, blockIdx.x }; + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed); + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int nthreads = blockDim.x * gridDim.x; + + // Send to all peers via GIN (GPU-initiated networking) + const size_t size = count * sizeof(T); + for (int r = tid; r < devComm.nRanks; r += nthreads) { + gin.put(ncclTeamWorld(devComm), r, + recvwin, recvoffset + devComm.rank * size, + sendwin, sendoffset + r * size, + size, ncclGin_SignalInc{signalIndex}); + } + + // Wait for all remote puts to complete using signal-based synchronization + gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + devComm.nRanks); + gin.flush(ncclCoopCta()); +} + + // ========================================================================== + // Host-Side Setup and Device API Initialization + // ========================================================================== + +void* pureGinAlltoAll(int my_rank, int total_ranks, int local_device, int devices_per_rank) { + ncclComm_t comm; + ncclUniqueId nccl_unique_id; + + if (my_rank == 0) { + printf("Starting Pure GIN AlltoAll initialization\n"); + } + + // Standard NCCL communicator initialization + if (my_rank == 0) { + NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); + } + + // Distribute unique ID + util_broadcast(0, my_rank, &nccl_unique_id); + + // Set device context for this rank + CUDACHECK(cudaSetDevice(local_device)); + printf(" Rank %d using GPU device %d\n", my_rank, local_device); + + // ========================================================================== + // STEP 2: Initialize NCCL Communicator and Allocate Memory + // ========================================================================== + + // Initialize NCCL communicator + NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank)); + printf(" Rank %d initialized NCCL communicator for %d total ranks\n", my_rank, total_ranks); + + // Allocate memory for AlltoAll operation + size_t count = 1024; // Elements per rank + size_t total_elements = count * total_ranks; + size_t size_bytes = total_elements * sizeof(float); + + float *h_sendbuff = (float*)malloc(size_bytes); + float *h_recvbuff = (float*)malloc(size_bytes); + void* d_sendbuff; + void* d_recvbuff; + ncclWindow_t send_win; + ncclWindow_t recv_win; + + // Device API requires symmetric memory allocation + NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes)); + NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes)); + + // ========================================================================== + // STEP 3: Register Memory Windows for Device-Side Access + // ========================================================================== + + // Register symmetric windows for GIN access + NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC)); + NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC)); + + // Initialize data: each rank sends unique values to each destination + for (size_t i = 0; i < total_elements; i++) { + int dest_rank = i / count; + int element_idx = i % count; + h_sendbuff[i] = (float)(my_rank * 1000 + dest_rank * 100 + element_idx); + } + CUDACHECK(cudaMemcpy(d_sendbuff, h_sendbuff, size_bytes, cudaMemcpyHostToDevice)); + printf(" Rank %d initialized send data\n", my_rank); + + // ========================================================================== + // STEP 4: Create Device Communicator with GIN Support + // ========================================================================== + + // Create stream for kernel execution + cudaStream_t stream; + CUDACHECK(cudaStreamCreate(&stream)); + + // Create device communicator with GIN support + ncclDevComm devComm; + ncclDevCommRequirements reqs; + memset(&reqs, 0, sizeof(reqs)); + reqs.railGinBarrierCount = NCCL_DEVICE_CTA_COUNT; // GIN barriers for network synchronization + reqs.ginSignalCount = 1; // GIN signals for completion detection + NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm)); + printf(" Rank %d created device communicator with GIN support\n", my_rank); + + if (my_rank == 0) { + printf("Starting Pure GIN AlltoAll with %zu elements per rank (%zu total elements, %zu MB)\n", + count, total_elements, size_bytes / (1024 * 1024)); + } + + // ========================================================================== + // STEP 5: Execute Pure GIN AlltoAll Kernel + // ========================================================================== + + if (my_rank == 0) { + printf("\n=== Executing Pure GIN AlltoAll ===\n"); + } + + // Clear receive buffer + CUDACHECK(cudaMemset(d_recvbuff, 0, size_bytes)); + + // Launch pure GIN AlltoAll kernel + PureGinAlltoAllKernel<<>>( + send_win, 0, recv_win, 0, count, 0, devComm); + + // Wait for completion + CUDACHECK(cudaStreamSynchronize(stream)); + printf(" Rank %d completed pure GIN AlltoAll kernel\n", my_rank); + + // ========================================================================== + // STEP 6: Verify Results + // ========================================================================== + + // Verify pure GIN results + CUDACHECK(cudaMemcpy(h_recvbuff, d_recvbuff, size_bytes, cudaMemcpyDeviceToHost)); + bool gin_success = true; + for (int src_rank = 0; src_rank < total_ranks; src_rank++) { + for (size_t i = 0; i < count; i++) { + size_t recv_idx = src_rank * count + i; + float expected = (float)(src_rank * 1000 + my_rank * 100 + i); + if (h_recvbuff[recv_idx] != expected) { + gin_success = false; + printf(" Rank %d: Pure GIN mismatch at [%d][%zu]: got %.0f, expected %.0f\n", + my_rank, src_rank, i, h_recvbuff[recv_idx], expected); + break; + } + } + if (!gin_success) break; + } + + if (my_rank == 0) { + printf("Pure GIN AlltoAll result: %s\n", gin_success ? "PASSED" : "FAILED"); + } + + // ========================================================================== + // STEP 7: Cleanup Resources + // ========================================================================== + + // Cleanup host memory + free(h_sendbuff); + free(h_recvbuff); + + // Device API specific cleanup + NCCLCHECK(ncclDevCommDestroy(comm, &devComm)); + NCCLCHECK(ncclCommWindowDeregister(comm, send_win)); + NCCLCHECK(ncclCommWindowDeregister(comm, recv_win)); + NCCLCHECK(ncclMemFree(d_sendbuff)); + NCCLCHECK(ncclMemFree(d_recvbuff)); + + // Standard NCCL cleanup + CUDACHECK(cudaStreamDestroy(stream)); + NCCLCHECK(ncclCommFinalize(comm)); + NCCLCHECK(ncclCommDestroy(comm)); + + return NULL; +} + +int main(int argc, char* argv[]) { + // Run example using the provided utility framework + return run_example(argc, argv, pureGinAlltoAll); +} diff --git a/examples/06_device_api/03_gin_alltoall_hybrid/Makefile b/examples/06_device_api/03_gin_alltoall_hybrid/Makefile new file mode 100644 index 00000000000..30733120df2 --- /dev/null +++ b/examples/06_device_api/03_gin_alltoall_hybrid/Makefile @@ -0,0 +1,85 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# Include common build rules +include ../../../makefiles/common.mk +include ../../../makefiles/examples.mk + +# Target executable +TARGET = gin_alltoall_hybrid_device_api + +# Common utilities +COMMON_INC = ../../common/include +COMMON_SRC = ../../common/src + +# Build configuration +INCLUDES += -I$(COMMON_INC) + +# Source files +SOURCES = main.cu $(COMMON_SRC)/utils.cc +OBJECTS = $(SOURCES:.cu=.o) +OBJECTS := $(OBJECTS:.cc=.o) + +# Default target +all: $(TARGET) + +# Build executable +$(TARGET): $(OBJECTS) +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@ +else + $(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -lpthread -o $@ +endif + @echo "Built target $@" + +# Compile source files +%.o: %.cu + $(NVCC) $(NVCUFLAGS) $(INCLUDES) -c $< -o $@ + +%.o: %.cc +ifeq ($(MPI),1) + $(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +else + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ +endif + +# Test target +test: $(TARGET) + @echo "Testing $(TARGET)..." +ifeq ($(MPI),1) + @echo "Running with 2 processes" + $(MPIRUN) -np 2 ./$(TARGET) +else + @echo "Running with all available GPUs" + ./$(TARGET) +endif + +# Clean build artifacts +clean: + rm -f $(OBJECTS) $(TARGET) + +# Install target +install: $(TARGET) + @mkdir -p $(PREFIX)/bin + cp $(TARGET) $(PREFIX)/bin/ + +# Help +help: + @echo "NCCL Example: Hybrid AlltoAll Device API" + @echo "=========================================" + @echo "" + @echo "This example demonstrates hybrid communication combining" + @echo "GPU-Initiated Networking (GIN) for remote peers with" + @echo "Load Store Access (LSA) for local peers." + @echo "" + @echo "Targets:" + @echo " all - Build the example (default)" + @echo " test - Build and run test with all GPUs" + @echo " clean - Remove build artifacts" + @echo " install - Install to PREFIX/bin (default: /usr/local/bin)" + @echo " help - Show this help" + +.PHONY: all test clean install help diff --git a/examples/06_device_api/03_gin_alltoall_hybrid/README.md b/examples/06_device_api/03_gin_alltoall_hybrid/README.md new file mode 100644 index 00000000000..4354fcd918d --- /dev/null +++ b/examples/06_device_api/03_gin_alltoall_hybrid/README.md @@ -0,0 +1,228 @@ + + +# NCCL Device API Hybrid AlltoAll Example + +This example shows how to implement AlltoAll operations using a hybrid approach that combines Load Store Access (LSA) for local peers with GPU-Initiated Networking (GIN) for remote peers. We create a device communicator with `ncclDevCommCreate` supporting both LSA and GIN capabilities, enabling optimal communication performance across different peer types. + +## Overview + +This example showcases **hybrid communication** that intelligently selects the optimal communication method for each peer: + +- **LSA (Load Store Access)** for local peers (same node/memory space) +- **GIN (GPU-Initiated Networking)** for remote peers (different nodes) + +## What This Example Does + +1. **Creates hybrid device communicators** using `ncclDevCommCreate` with both LSA and GIN support for optimal peer communication +2. **Registers symmetric memory windows** with `ncclCommWindowRegister` for both LSA direct access and GIN network operations +3. **Launches GPU kernel** that performs AlltoAll operations using LSA for local peers and GIN for remote peers +4. **Demonstrates hybrid synchronization** coordinating both LSA barriers and GIN signals for correctness + +## Building and Running + +The advanced examples can be built using either pthread or MPI for parallelization. pthread is the default choice. To use MPI the user needs to set `MPI=1` at build time and can optionally provide a valid MPI installation under `MPI_HOME`. + +### Build +```bash +make [MPI=1] [MPI_HOME=] [NCCL_HOME=] [CUDA_HOME=] +``` + +### Run when compiled for pthreads (default) +```bash +[NTHREADS=N] ./gin_alltoall_hybrid_device_api +``` + +### Run when compiled for MPI +```bash +mpirun -np ./gin_alltoall_hybrid_device_api +``` + +## Code Walk-through + +### Device Communicator Creation (Host-side) +The `ncclDevComm` is the core component enabling GPU kernels to perform both local and remote communication. For hybrid communication, we configure the device communicator with both LSA and GIN resources. The `ncclDevCommRequirements` specifies LSA barriers for local synchronization, GIN barriers for network synchronization, and GIN signals for completion detection. This dual setup enables optimal communication for each peer type. + +```cpp +ncclDevComm devComm; +ncclDevCommRequirements reqs; +memset(&reqs, 0, sizeof(reqs)); +// LSA barriers enable direct memory access coordination for local peers +reqs.lsaBarrierCount = NCCL_DEVICE_CTA_COUNT; +// GIN barriers enable cross-node synchronization over the network +reqs.railGinBarrierCount = NCCL_DEVICE_CTA_COUNT; +// GIN signals provide completion notifications for asynchronous network operations +reqs.ginSignalCount = 1; + +// Create device communicator with hybrid LSA+GIN support +NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm)); +``` + +### Memory Window Registration (Host-side) +The device API requires symmetric memory windows registered using `NCCL_WIN_COLL_SYMMETRIC`. These windows enable both LSA direct access for local peers and GIN network operations for remote peers. The same memory windows support both communication methods, with the kernel automatically selecting the appropriate access pattern based on peer locality. + +```cpp +ncclWindow_t send_win; +ncclWindow_t recv_win; + +// Register symmetric windows for both LSA and GIN access +NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC)); +NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC)); +``` + +### Hybrid Barriers (Device-side) +Hybrid barriers coordinate both local LSA operations and remote GIN operations. The barrier session uses the world team and GIN context to ensure synchronization across all ranks, regardless of their communication method. This unified barrier approach ensures all peers reach the same synchronization point before proceeding with data exchange. + +```cpp +// Hybrid barriers coordinate both LSA and GIN operations across all ranks +ncclBarrierSession bar { + ncclCoopCta(), // Barrier scope: entire CTA (thread block) + ncclTeamTagWorld(), // Team spanning all ranks (local + remote) + gin, // GIN context for network coordination + blockIdx.x // Barrier index: matches our CTA index +}; +bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed); +``` + +### Peer Classification (Device-side) +The hybrid kernel intelligently classifies peers into local (LSA-accessible) and remote (GIN-only) categories. This classification determines the optimal communication method for each peer. Local peers benefit from direct memory access, while remote peers use network communication. + +```cpp +// Classify peers into local (LSA) and remote (GIN) categories +ncclTeam world = ncclTeamWorld(devComm); // All ranks +ncclTeam lsa = ncclTeamLsa(devComm); // Local ranks only +const int startLsa = world.rank - lsa.rank; // First local rank in world +const int lsaSize = lsa.nRanks; // Number of local peers +``` + +### Memory Access (Device-side) +`ncclGetLsaPointer` allows CUDA kernels to directly access other GPUs' memory within the LSA team, while `gin.put` handles remote communication over the network. The hybrid approach uses the most efficient method for each peer type. + +```cpp +// Handle local peers using direct memory access (LSA) +T* sendLocal = (T*)ncclGetLocalPointer(sendwin, sendoffset); +T* recvPtr = (T*)ncclGetLsaPointer(recvwin, recvoffset, lp); + +// Handle remote peers using network operations (GIN) +gin.put(world, r, recvwin, recvoffset + world.rank * size, + sendwin, sendoffset + r * size, size, ncclGin_SignalInc{signalIndex}); +``` + +## Building and Running + +### Build +```bash +make +``` + +### Run with pthread mode (default) +```bash +# Run with all available GPUs +./gin_alltoall_hybrid_device_api + +# Run with specific number of GPUs +NTHREADS=4 ./gin_alltoall_hybrid_device_api +``` + +### Run with MPI mode +```bash +# Build with MPI support +make MPI=1 + +# Run with MPI across multiple nodes +mpirun -np 4 --hostfile hosts ./gin_alltoall_hybrid_device_api +``` + +### Test +```bash +make test +``` + +## Expected Output + +``` +Starting Hybrid AlltoAll initialization + Rank 0 using GPU device 0 + Rank 1 using GPU device 1 + Rank 2 using GPU device 2 + Rank 3 using GPU device 3 + Rank 0 initialized NCCL communicator for 4 total ranks + Rank 1 initialized NCCL communicator for 4 total ranks + Rank 2 initialized NCCL communicator for 4 total ranks + Rank 3 initialized NCCL communicator for 4 total ranks + Rank 0 initialized send data + Rank 1 initialized send data + Rank 2 initialized send data + Rank 3 initialized send data + Rank 0 created device communicator with hybrid support + Rank 1 created device communicator with hybrid support + Rank 2 created device communicator with hybrid support + Rank 3 created device communicator with hybrid support +Starting Hybrid AlltoAll with 1024 elements per rank (4096 total elements, 0 MB) +Using LSA for local peers and GIN for remote peers + +=== Executing Hybrid AlltoAll === + Rank 0 completed hybrid AlltoAll kernel + Rank 1 completed hybrid AlltoAll kernel + Rank 2 completed hybrid AlltoAll kernel + Rank 3 completed hybrid AlltoAll kernel +Hybrid AlltoAll result: PASSED +✓ All 4096 elements correctly exchanged using hybrid communication +``` + +## When to Use + +- **Multi-node training**: Mixed local/remote communication patterns +- **Large-scale inference**: Optimized for various topologies +- **Production workloads**: Where performance is critical +- **Heterogeneous clusters**: Different node configurations + +## Performance Considerations + +**Advantages:** +- **Reduced Latency**: LSA provides ultra-low latency for local communication +- **Optimal Bandwidth**: GIN efficiently handles remote communication +- **Reduced Network Load**: Local traffic stays off the network +- **Scalable Design**: Efficient across different node configurations + +**Disadvantages:** +- More complex programming model requiring coordination of both LSA and GIN +- Requires careful synchronization between different communication methods +- Higher development complexity compared to pure approaches + +## Common Issues and Solutions + +### Issue: LSA barriers not supported +**Cause:** GPUs not connected through NVLink or PCIe for direct memory access +**Solution:** Verify GPU topology with `nvidia-smi topo -m` and ensure proper LSA-capable connections + +### Issue: Hybrid synchronization failures +**Solution:** Ensure both `lsaBarrierCount` and `railGinBarrierCount` match the number of thread blocks in kernel launch configuration + +### Issue: Peer classification errors +**Solution:** Verify LSA team setup and ensure symmetric memory allocation is properly configured for all ranks + +### Issue: Mixed communication performance issues +**Solution:** Profile LSA vs GIN usage patterns and optimize barrier configurations for your specific topology + +## Performance Notes + +- These are educational examples, not optimized for performance +- Real implementations should consider: + - Optimal balance between LSA and GIN operations based on topology + - Memory coalescing patterns for both LSA and GIN operations + - Barrier synchronization overhead minimization + - Signal pool management for high-throughput GIN scenarios + +## Error Handling + +The example uses comprehensive error checking for CUDA, NCCL, LSA, and GIN operations. Device kernels should implement proper error handling for both direct memory access patterns and network operations. + +## Next Steps + +After understanding this example, explore: +- **Topology-aware optimization**: Fine-tune LSA/GIN balance based on hardware topology +- **Custom hybrid patterns**: Implement specialized communication strategies +- **Performance profiling**: Analyze LSA vs GIN performance characteristics +- **Advanced synchronization**: Optimize barrier usage for complex communication patterns diff --git a/examples/06_device_api/03_gin_alltoall_hybrid/main.cu b/examples/06_device_api/03_gin_alltoall_hybrid/main.cu new file mode 100644 index 00000000000..d1201654957 --- /dev/null +++ b/examples/06_device_api/03_gin_alltoall_hybrid/main.cu @@ -0,0 +1,278 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "cuda_runtime.h" +#include "nccl.h" +#include "nccl_device.h" +#include "utils.h" +#include +#include +#include +#include +#include + +/* + * NCCL Device API Hybrid AlltoAll Example + * + * This example demonstrates NCCL's hybrid communication approach that combines + * GPU-Initiated Networking (GIN) for remote peers with Load Store Access (LSA) + * for local peers, optimizing AlltoAll collective operations. + * + * Learning Objectives: + * - Understand hybrid communication optimization + * - Learn when to use GIN vs LSA for different peer types + * - Practice combining network and memory-based communication + * - See performance optimization through intelligent peer selection + * + * Key Hybrid Concepts: + * - **LSA (Load Store Access)**: Direct memory access for local peers + * - **GIN (GPU-Initiated Networking)**: Network communication for remote peers + * - **Peer classification**: Distinguishing between local and remote peers + * - **Hybrid synchronization**: Combining LSA and GIN completion mechanisms + * - **Performance optimization**: Using the fastest method for each peer type + * + * When to Use Hybrid: + * - Multi-node environments with both local and remote peers + * - Performance-critical applications requiring optimal communication + * - Mixed communication patterns (intra-node + inter-node) + * - Production workloads where efficiency matters + * + * Performance Benefits: + * - LSA provides low-latency local communication + * - GIN handles remote communication efficiently + * - Reduced network traffic for local operations + * - Optimal bandwidth utilization across communication types + */ + +// Device API kernel launch configuration +// CTA count must match railGinBarrierCount for proper barrier synchronization +#define NCCL_DEVICE_CTA_COUNT 16 +#define NCCL_DEVICE_THREADS_PER_CTA 512 + +// ========================================================================== +// Device Kernel Implementation +// ========================================================================== + +// Hybrid AlltoAll kernel - optimizes by using LSA for local peers, GIN for remote +// This kernel demonstrates performance optimization using both communication methods +template +__global__ void HybridAlltoAllKernel(ncclWindow_t sendwin, size_t sendoffset, + ncclWindow_t recvwin, size_t recvoffset, + size_t count, int root, struct ncclDevComm devComm) { + int ginContext = 0; + unsigned int signalIndex = 0; + ncclGin gin { devComm, ginContext }; + uint64_t signalValue = gin.readSignal(signalIndex); + + // GIN barriers for cross-node synchronization + ncclBarrierSession bar { ncclCoopCta(), ncclTeamTagWorld(), gin, blockIdx.x }; + bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed); + + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int nthreads = blockDim.x * gridDim.x; + + ncclTeam world = ncclTeamWorld(devComm); + ncclTeam lsa = ncclTeamLsa(devComm); + const int startLsa = world.rank - lsa.rank; + const int lsaSize = lsa.nRanks; + + // Handle remote peers (i.e., non-LSA) using GIN for network communication + const size_t size = count * sizeof(T); + for (int r = tid; r < startLsa; r += nthreads) { + gin.put(world, r, + recvwin, recvoffset + world.rank * size, + sendwin, sendoffset + r * size, + size, ncclGin_SignalInc{signalIndex}); + } + for (int r = startLsa + lsaSize + tid; r < world.nRanks; r += nthreads) { + gin.put(world, r, + recvwin, recvoffset + world.rank * size, + sendwin, sendoffset + r * size, + size, ncclGin_SignalInc{signalIndex}); + } + + // Handle local peers with LSA (Load Store Access) for optimal performance + T* sendLocal = (T*)ncclGetLocalPointer(sendwin, sendoffset); + for (size_t offset = tid; offset < count; offset += nthreads) { + for (int lp = 0; lp < lsa.nRanks; lp++) { + int wr = startLsa + lp; + T* recvPtr = (T*)ncclGetLsaPointer(recvwin, recvoffset, lp); + recvPtr[world.rank * count + offset] = sendLocal[wr * count + offset]; + } + } + + // Wait for remote GIN operations to complete + int numRemotePeers = world.nRanks - lsa.nRanks; + gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + numRemotePeers); + gin.flush(ncclCoopCta()); + + // Final synchronization barrier + bar.sync(ncclCoopCta(), cuda::memory_order_release, ncclGinFenceLevel::Relaxed); +} + + // ========================================================================== + // Host-Side Setup and Device API Initialization + // ========================================================================== + +void* hybridAlltoAll(int my_rank, int total_ranks, int local_device, int devices_per_rank) { + ncclComm_t comm; + ncclUniqueId nccl_unique_id; + + if (my_rank == 0) { + printf("Starting Hybrid AlltoAll initialization\n"); + } + + // Standard NCCL communicator initialization + if (my_rank == 0) { + NCCLCHECK(ncclGetUniqueId(&nccl_unique_id)); + } + + // Distribute unique ID + util_broadcast(0, my_rank, &nccl_unique_id); + + // Set device context for this rank + CUDACHECK(cudaSetDevice(local_device)); + printf(" Rank %d using GPU device %d\n", my_rank, local_device); + + // ========================================================================== + // STEP 2: Initialize NCCL Communicator and Allocate Memory + // ========================================================================== + + // Initialize NCCL communicator + NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank)); + printf(" Rank %d initialized NCCL communicator for %d total ranks\n", my_rank, total_ranks); + + // Allocate memory for AlltoAll operation + size_t count = 1024; // Elements per rank + size_t total_elements = count * total_ranks; + size_t size_bytes = total_elements * sizeof(float); + + float *h_sendbuff = (float*)malloc(size_bytes); + float *h_recvbuff = (float*)malloc(size_bytes); + void* d_sendbuff; + void* d_recvbuff; + ncclWindow_t send_win; + ncclWindow_t recv_win; + + // Device API requires symmetric memory allocation + NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes)); + NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes)); + + // ========================================================================== + // STEP 3: Register Memory Windows for Device-Side Access + // ========================================================================== + + // Register symmetric windows for both LSA and GIN access + NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC)); + NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC)); + + // Initialize data: each rank sends unique values to each destination + for (size_t i = 0; i < total_elements; i++) { + int dest_rank = i / count; + int element_idx = i % count; + h_sendbuff[i] = (float)(my_rank * 1000 + dest_rank * 100 + element_idx); + } + CUDACHECK(cudaMemcpy(d_sendbuff, h_sendbuff, size_bytes, cudaMemcpyHostToDevice)); + printf(" Rank %d initialized send data\n", my_rank); + + // ========================================================================== + // STEP 4: Create Device Communicator with Hybrid Support + // ========================================================================== + + // Create stream for kernel execution + cudaStream_t stream; + CUDACHECK(cudaStreamCreate(&stream)); + + // Create device communicator with both LSA and GIN support + ncclDevComm devComm; + ncclDevCommRequirements reqs; + memset(&reqs, 0, sizeof(reqs)); + reqs.lsaBarrierCount = NCCL_DEVICE_CTA_COUNT; // LSA barriers for local synchronization + reqs.railGinBarrierCount = NCCL_DEVICE_CTA_COUNT; // GIN barriers for network synchronization + reqs.ginSignalCount = 1; // GIN signals for completion detection + NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm)); + printf(" Rank %d created device communicator with hybrid support\n", my_rank); + + if (my_rank == 0) { + printf("Starting Hybrid AlltoAll with %zu elements per rank (%zu total elements, %zu MB)\n", + count, total_elements, size_bytes / (1024 * 1024)); + printf("Using LSA for local peers and GIN for remote peers\n"); + } + + // ========================================================================== + // STEP 5: Execute Hybrid AlltoAll Kernel + // ========================================================================== + + if (my_rank == 0) { + printf("\n=== Executing Hybrid AlltoAll ===\n"); + } + + // Clear receive buffer + CUDACHECK(cudaMemset(d_recvbuff, 0, size_bytes)); + + // Launch hybrid AlltoAll kernel + HybridAlltoAllKernel<<>>( + send_win, 0, recv_win, 0, count, 0, devComm); + + // Wait for completion + CUDACHECK(cudaStreamSynchronize(stream)); + printf(" Rank %d completed hybrid AlltoAll kernel\n", my_rank); + + // ========================================================================== + // STEP 6: Verify Results + // ========================================================================== + + // Verify hybrid results + CUDACHECK(cudaMemcpy(h_recvbuff, d_recvbuff, size_bytes, cudaMemcpyDeviceToHost)); + bool hybrid_success = true; + for (int src_rank = 0; src_rank < total_ranks; src_rank++) { + for (size_t i = 0; i < count; i++) { + size_t recv_idx = src_rank * count + i; + float expected = (float)(src_rank * 1000 + my_rank * 100 + i); + if (h_recvbuff[recv_idx] != expected) { + hybrid_success = false; + printf(" Rank %d: Hybrid mismatch at [%d][%zu]: got %.0f, expected %.0f\n", + my_rank, src_rank, i, h_recvbuff[recv_idx], expected); + break; + } + } + if (!hybrid_success) break; + } + + if (my_rank == 0) { + printf("Hybrid AlltoAll result: %s\n", hybrid_success ? "PASSED" : "FAILED"); + if (hybrid_success) { + printf("✓ All %zu elements correctly exchanged using hybrid communication\n", total_elements); + } + } + + // ========================================================================== + // STEP 7: Cleanup Resources + // ========================================================================== + + // Cleanup host memory + free(h_sendbuff); + free(h_recvbuff); + + // Device API specific cleanup + NCCLCHECK(ncclDevCommDestroy(comm, &devComm)); + NCCLCHECK(ncclCommWindowDeregister(comm, send_win)); + NCCLCHECK(ncclCommWindowDeregister(comm, recv_win)); + NCCLCHECK(ncclMemFree(d_sendbuff)); + NCCLCHECK(ncclMemFree(d_recvbuff)); + + // Standard NCCL cleanup + CUDACHECK(cudaStreamDestroy(stream)); + NCCLCHECK(ncclCommFinalize(comm)); + NCCLCHECK(ncclCommDestroy(comm)); + + return NULL; +} + +int main(int argc, char* argv[]) { + // Run example using the provided utility framework + return run_example(argc, argv, hybridAlltoAll); +} \ No newline at end of file diff --git a/projects/rccl/Makefile b/projects/rccl/Makefile index 458a507415b..2b1a57c5a53 100644 --- a/projects/rccl/Makefile +++ b/projects/rccl/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -11,6 +11,7 @@ BUILDDIR ?= $(abspath ./build) ABSBUILDDIR := $(abspath $(BUILDDIR)) TARGETS := src pkg clean: ${TARGETS:%=%.clean} +examples.build: src.build LICENSE_FILES := LICENSE.txt LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%) lic: $(LICENSE_TARGETS) @@ -23,6 +24,9 @@ ${BUILDDIR}/%.txt: %.txt src.%: ${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR} +examples: src.build + ${MAKE} -C examples NCCL_HOME=${ABSBUILDDIR} + pkg.%: ${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR} diff --git a/projects/rccl/cmake/rocmIb.cmake b/projects/rccl/cmake/rocmIb.cmake index f6566778fc5..4842b6fe932 100644 --- a/projects/rccl/cmake/rocmIb.cmake +++ b/projects/rccl/cmake/rocmIb.cmake @@ -264,6 +264,13 @@ execute_process( COMMAND bash -c "sed -i 's/ncclIbSetNetAttr/rocmNetIbSetNetAttr/g' ${ROCM_NETIB_FILE}" WORKING_DIRECTORY ${RCCL_SRC_DIR} ) +# Rename GIN functions to avoid duplicate symbols with net_ib.cc +# Note: We rename ncclGinIb* to rocmGinIb*, then restore the struct name +# since ncclGinIbCollComm is defined in net_ib_gin.h (not renamed) +execute_process( + COMMAND bash -c "sed -i -e 's/ncclGinIb/rocmGinIb/g' -e 's/rocmGinIbCollComm/ncclGinIbCollComm/g' ${ROCM_NETIB_FILE}" + WORKING_DIRECTORY ${RCCL_SRC_DIR} +) execute_process( COMMAND bash -c "sed -i 's/cuMemGetHandleForAddressRange/hipMemGetHandleForAddressRange/g' ${ROCM_NETIB_FILE}" WORKING_DIRECTORY ${RCCL_SRC_DIR} diff --git a/projects/rccl/ext-src/rocm_netib.patch b/projects/rccl/ext-src/rocm_netib.patch index 882c8ecf369..ef61c811de2 100644 --- a/projects/rccl/ext-src/rocm_netib.patch +++ b/projects/rccl/ext-src/rocm_netib.patch @@ -172,10 +172,11 @@ struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS]; struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1]; // Each dev correlates to a mergedIbDev -@@ -1370,6 +1457,7 @@ +@@ -1370,7 +1457,8 @@ struct ncclIbRemSizesFifo remSizesFifo; uint64_t fifoHead; int ar; // Use adaptive routing when all merged devices have it enabled + uint64_t putSignalScratchpad; + bool useCtsOffload; }; // The SendFifo needs to be 32-byte aligned and each element needs @@ -393,8 +394,8 @@ // Local ibDevN ibDevN = rComm->devs[devIndex].base.ibDevN; ibDev = ncclIbDevs + ibDevN; -- NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail); -+ NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp, channel_id, false, q, remMeta.isP2p), ret, fail); +- NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC, &rComm->base.stats, qp), ret, fail); ++ NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC, &rComm->base.stats, qp, channel_id, false, q, remMeta.isP2p), ret, fail); qp->devIndex = devIndex; devIndex = (devIndex + 1) % rComm->base.vProps.ndevs; diff --git a/projects/rccl/ext-tuner/example/plugin.c b/projects/rccl/ext-tuner/example/plugin.c index 5e4ca9e4bae..9eba0f55df2 100644 --- a/projects/rccl/ext-tuner/example/plugin.c +++ b/projects/rccl/ext-tuner/example/plugin.c @@ -307,7 +307,7 @@ __hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks, // Set Ring/Simple base network latency to 280 constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 280.0; } - + TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext)); if (!ctx) return ncclSystemError; diff --git a/projects/rccl/ext-tuner/example/test/test_plugin.c b/projects/rccl/ext-tuner/example/test/test_plugin.c index a74386731e5..746cb8ff782 100644 --- a/projects/rccl/ext-tuner/example/test/test_plugin.c +++ b/projects/rccl/ext-tuner/example/test/test_plugin.c @@ -744,16 +744,16 @@ int test_nvl_domain_info() { .minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck) .maxRanksPerNvlDomain = 5 // maximum ranks across all domains (capacity) }; - + void* context = NULL; ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL); TEST_ASSERT(result == ncclSuccess, "Plugin init with NVLink domains should succeed"); - + // Validate NVLD info structure TEST_ASSERT(nvl_domain.nNvlDomains == 2, "Should have 2 domains (nodes)"); TEST_ASSERT(nvl_domain.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain"); TEST_ASSERT(nvl_domain.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain"); - + // Clean up pluginFinalize(context); printf("NVLink domain info test passed!\n"); diff --git a/projects/rccl/makefiles/common.mk b/projects/rccl/makefiles/common.mk index f8f455dec66..2b1d1c4b383 100644 --- a/projects/rccl/makefiles/common.mk +++ b/projects/rccl/makefiles/common.mk @@ -20,7 +20,7 @@ NET_PROFILER ?= 0 MLX5DV ?= 0 MAX_EXT_NET_PLUGINS ?= 0 -NVCC = $(CUDA_HOME)/bin/nvcc +NVCC ?= $(CUDA_HOME)/bin/nvcc CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include @@ -85,6 +85,8 @@ NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) $(CXXSTD) --expt-extended-lambda -Xp # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt +NVCUFLAGS_SYM := + ########## GCOV ########## GCOV ?= 0 # disable by default. GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # only gcov=1 and debug =1 @@ -158,3 +160,8 @@ endif ifneq ($(MAX_EXT_NET_PLUGINS), 0) CXXFLAGS += -DNCCL_NET_MAX_PLUGINS=$(MAX_EXT_NET_PLUGINS) endif + +CXXFLAGS += -DDOCA_VERBS_USE_CUDA_WRAPPER -DDOCA_VERBS_USE_NET_WRAPPER +NVCUFLAGS += -DDOCA_VERBS_USE_CUDA_WRAPPER -DDOCA_VERBS_USE_NET_WRAPPER + +CXXFLAGS += -DNCCL_GIN_PROXY_ENABLE=1 diff --git a/projects/rccl/makefiles/examples.mk b/projects/rccl/makefiles/examples.mk new file mode 100644 index 00000000000..6f3a520f3d0 --- /dev/null +++ b/projects/rccl/makefiles/examples.mk @@ -0,0 +1,31 @@ +# +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +# Make sure NCCL headers are found and libraries are linked +ifneq ($(NCCL_HOME), "") +NVCUFLAGS += -I$(NCCL_HOME)/include/ +NVLDFLAGS += -L$(NCCL_HOME)/lib +endif + +# Build configuration +INCLUDES = -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include +LIBRARIES = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib +LDFLAGS = -lcudart -lnccl -Wl,-rpath,$(NCCL_HOME)/lib + + +# MPI configuration +ifeq ($(MPI), 1) + +ifdef MPI_HOME +MPICXX ?= $(MPI_HOME)/bin/mpicxx +MPIRUN ?= $(MPI_HOME)/bin/mpirun +else +MPICXX ?= mpicxx +MPIRUN ?= mpirun +endif + +CXXFLAGS += -DMPI_SUPPORT +endif diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk index d0e97c06576..08c9dc78071 100644 --- a/projects/rccl/makefiles/version.mk +++ b/projects/rccl/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 28 -NCCL_PATCH := 3 +NCCL_PATCH := 9 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/projects/rccl/src/CMakeLists.txt b/projects/rccl/src/CMakeLists.txt index bf423a5c80c..9f0d41d4123 100644 --- a/projects/rccl/src/CMakeLists.txt +++ b/projects/rccl/src/CMakeLists.txt @@ -94,11 +94,14 @@ set(SRC_FILES include/debug.h include/dev_runtime.h include/device.h + include/env.h include/enqueue.h include/gdrwrap.h include/git_version.h include/graph.h include/group.h + include/gin/gin_host.h + include/gin/gin_host_proxy.h include/hip_rocm_version_info.h include/ibvcore.h include/ibvsymbols.h @@ -108,7 +111,7 @@ set(SRC_FILES include/mnnvl.h include/nccl_common.h include/nccl_device.h - include/net_device.h + include/nccl_device/net_device.h include/net.h include/net_ib_cast_inspect.h include/nvmlwrap.h @@ -149,8 +152,18 @@ set(SRC_FILES include/nccl_device/comm.h include/nccl_device/coop.h include/nccl_device/core.h + include/nccl_device/barrier.h + include/nccl_device/gin.h + include/nccl_device/gin_barrier.h + include/nccl_device/gin/gdaki/gin_gdaki.h + include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h + include/nccl_device/gin/gin_device_api.h + include/nccl_device/gin/gin_device_common.h + include/nccl_device/gin/gin_device_host_common.h + include/nccl_device/gin/proxy/gin_proxy.h + include/nccl_device/gin/proxy/gin_proxy_device_host_common.h include/nccl_device/ll_a2a.h - include/nccl_device/mem_barrier.h + include/nccl_device/lsa_barrier.h include/nccl_device/ptr.h include/nccl_device/rccl_ptr.h include/nccl_device/utility.h @@ -158,10 +171,16 @@ set(SRC_FILES include/nccl_device/impl/comm__types.h include/nccl_device/impl/core__funcs.h include/nccl_device/impl/core__types.h + include/nccl_device/impl/barrier__funcs.h + include/nccl_device/impl/barrier__types.h + include/nccl_device/impl/gin__funcs.h + include/nccl_device/impl/gin__types.h + include/nccl_device/impl/gin_barrier__funcs.h + include/nccl_device/impl/gin_barrier__types.h include/nccl_device/impl/ll_a2a__funcs.h include/nccl_device/impl/ll_a2a__types.h - include/nccl_device/impl/mem_barrier__funcs.h - include/nccl_device/impl/mem_barrier__types.h + include/nccl_device/impl/lsa_barrier__funcs.h + include/nccl_device/impl/lsa_barrier__types.h include/nccl_device/impl/ptr__funcs.h include/nccl_device/impl/ptr__types.h include/npkit/npkit.h @@ -205,6 +224,8 @@ set(SRC_FILES include/plugin/nccl_net.h include/plugin/nccl_profiler.h include/plugin/nccl_tuner.h + include/plugin/nccl_env.h + include/plugin/env/env_v1.h include/plugin/plugin.h include/plugin/net/net_v6.h include/plugin/net/net_v7.h @@ -253,9 +274,12 @@ set(SRC_FILES misc/utils.cc misc/proxy_trace/proxy_trace.cc nccl_device/core.cc + nccl_device/gin_barrier.cc nccl_device/ll_a2a.cc - nccl_device/mem_barrier.cc + nccl_device/lsa_barrier.cc plugin/net.cc + plugin/env.cc + plugin/env/env_v1.cc plugin/plugin_open.cc plugin/profiler.cc plugin/tuner.cc @@ -291,13 +315,15 @@ set(SRC_FILES transport/net.cc transport/net_ib.cc transport/net_ib_cast.cc - # net_ib_rocm.cc is generated by rocmIb.cmake directly into the hipify staging area - # so it is not listed here - it's added to HIP_SOURCES separately below + transport/net_ib_gin.h transport/net_socket.cc transport/nvls.cc transport/p2p.cc transport/profiler.cc transport/shm.cc + gin/gin_host.cc + gin/gin_host_proxy.cc + transport/gdaki/gin_host_gdaki.h include/latency_profiler/CollTrace.h include/latency_profiler/CollTraceEvent.h include/latency_profiler/CollTraceFunc.h @@ -321,6 +347,8 @@ if(USE_AMDSMI) ) else() set(SMI_SOURCES + src/include/amdsmi_wrap.h + src/misc/amdsmi_wrap.cc src/include/rocm_smi_wrap.h src/misc/rocm_smi_wrap.cc ) @@ -384,6 +412,18 @@ foreach(SRC_FILE ${SRC_FILES}) endforeach() set(NCCL_DEVICE_HEADER "${PROJECT_BINARY_DIR}/include/rccl/nccl_device.h") +# Copy hip_compat.h as-is (no hipification — contains both CUDA and HIP paths) +set(HIP_COMPAT_SRC "${RCCL_SOURCE_DIR}/src/include/nccl_device/hip_compat.h") +set(HIP_COMPAT_DST "${HIPIFY_DIR}/src/include/nccl_device/hip_compat.h") +add_custom_command( + OUTPUT ${HIP_COMPAT_DST} + COMMAND ${CMAKE_COMMAND} -E copy ${HIP_COMPAT_SRC} ${HIP_COMPAT_DST} + MAIN_DEPENDENCY ${HIP_COMPAT_SRC} + COMMENT "Copying hip_compat.h (no hipification)" +) +list(APPEND HIP_SOURCES ${HIP_COMPAT_DST}) +list(APPEND NCCL_DEVICE_HIP_FILES ${HIP_COMPAT_DST}) + add_custom_command( OUTPUT ${NCCL_DEVICE_HEADER} COMMAND ${CMAKE_COMMAND} -E make_directory "${PROJECT_BINARY_DIR}/include" @@ -1011,6 +1051,10 @@ if (HAVE_KERNARG_PRELOAD AND NOT ENABLE_DEVICE_LINKER) target_link_options(rccl PRIVATE "SHELL:-Xoffload-linker -mllvm=-amdgpu-kernarg-preload-count=16") endif() +if(ENABLE_MSCCLPP) + include(${RCCL_SOURCE_DIR}/cmake/MSCCLPP.cmake) +endif() + ## Track linking time set_property(TARGET rccl PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time") diff --git a/projects/rccl/src/Makefile b/projects/rccl/src/Makefile index be026cc2670..471a0335ef7 100644 --- a/projects/rccl/src/Makefile +++ b/projects/rccl/src/Makefile @@ -8,7 +8,7 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_device.h \ - $(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h)) + $(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/*/*.h include/nccl_device/*/*/*.h)) LIBSRCFILES := \ bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \ @@ -16,13 +16,16 @@ LIBSRCFILES := \ $(wildcard graph/*.cc) \ $(wildcard misc/*.cc) \ $(wildcard transport/*.cc) \ + $(wildcard transport/gdaki/*.cc) \ $(wildcard register/*.cc) \ $(wildcard plugin/*.cc) \ $(wildcard plugin/net/*.cc) \ $(wildcard plugin/tuner/*.cc) \ $(wildcard plugin/profiler/*.cc) \ + $(wildcard plugin/env/*.cc) \ $(wildcard nccl_device/*.cc) \ $(wildcard scheduler/*.cc) \ + $(wildcard gin/*.cc) \ $(filter-out ras/client.cc,$(wildcard ras/*.cc)) BINSRCFILES := ras/client.cc @@ -40,6 +43,7 @@ LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj PKGDIR := $(BUILDDIR)/lib/pkgconfig BINDIR := $(BUILDDIR)/bin + ##### target files CUDARTLIB ?= cudart_static @@ -61,6 +65,17 @@ INCPLUGIN := include/plugin DEVMANIFEST := $(BUILDDIR)/obj/device/manifest +# DOCA GPUNetIO definitions +DOCA_HOME ?= transport/gdaki/doca-gpunetio +DOCA_INC_INSTALL := $(INCDIR)/nccl_device/gin/gdaki/doca_gpunetio +DOCA_OBJDIR := $(OBJDIR)/transport/gdaki/doca-gpunetio +DOCA_INCLUDES := $(DOCA_HOME)/include/doca_gpunetio_device.h $(wildcard $(DOCA_HOME)/include/common/*.h) $(wildcard $(DOCA_HOME)/include/device/*.cuh) +DOCA_INCTARGETS := $(DOCA_INCLUDES:$(DOCA_HOME)/include/%=$(DOCA_INC_INSTALL)/%) +INCTARGETS += $(DOCA_INCTARGETS) +DOCA_LIBSRC := doca_verbs_qp.cpp doca_verbs_cq.cpp doca_verbs_device_attr.cpp doca_verbs_umem.cpp doca_verbs_srq.cpp doca_verbs_uar.cpp doca_gpunetio.cpp doca_gpunetio_log.cpp doca_gpunetio_high_level.cpp doca_verbs_cuda_wrapper.cpp doca_verbs_mlx5dv_wrapper.cpp doca_verbs_ibv_wrapper.cpp doca_gpunetio_gdrcopy.cpp +DOCA_LIBOBJ := $(DOCA_LIBSRC:%.cpp=$(DOCA_OBJDIR)/%.o) +LIBOBJ += $(DOCA_LIBOBJ) + ##### rules build : lib staticlib binary @@ -94,7 +109,7 @@ $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVMANIFEST) @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ mkdir -p $(LIBDIR) - $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS) + $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS) -Wl,--version-script=libnccl.map ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) @@ -137,6 +152,36 @@ $(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h mkdir -p $(INCDIR)/nccl_device/impl install -m 644 $< $@ +$(INCDIR)/nccl_device/gin/%.h: include/nccl_device/gin/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/gin + install -m 644 $< $@ + +$(INCDIR)/nccl_device/gin/gdaki/%.h: include/nccl_device/gin/gdaki/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/gin/gdaki + install -m 644 $< $@ + +$(INCDIR)/nccl_device/gin/proxy/%.h: include/nccl_device/gin/proxy/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR)/nccl_device/gin/proxy + install -m 644 $< $@ + +$(DOCA_INC_INSTALL)/%.h: $(DOCA_HOME)/include/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(DOCA_INC_INSTALL) + install -m 644 $< $@ + +$(DOCA_INC_INSTALL)/common/%.h: $(DOCA_HOME)/include/common/%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(DOCA_INC_INSTALL)/common + install -m 644 $< $@ + +$(DOCA_INC_INSTALL)/device/%.cuh: $(DOCA_HOME)/include/device/%.cuh + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(DOCA_INC_INSTALL)/device + install -m 644 $< $@ + $(PKGDIR)/%.pc : %.pc @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(PKGDIR) @@ -145,8 +190,18 @@ $(PKGDIR)/%.pc : %.pc $(OBJDIR)/%.o : %.cc $(INCTARGETS) @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@ - @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -I$(DOCA_HOME)/include -c $< -o $@ + @$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -I$(DOCA_HOME)/include -M $< > $(@:%.o=%.d.tmp) + @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) + @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ + sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) + @rm -f $(@:%.o=%.d.tmp) + +$(DOCA_OBJDIR)/%.o : $(DOCA_HOME)/src/%.cpp + @printf "Compiling %-35s > %s\n" $< $@ + mkdir -p `dirname $@` + $(CXX) -I$(DOCA_HOME)/src -I$(DOCA_HOME)/include $(CXXFLAGS) -c $< -o $@ + @$(CXX) -I$(DOCA_HOME)/src -I$(DOCA_HOME)/include $(CXXFLAGS) -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) diff --git a/projects/rccl/src/bootstrap.cc b/projects/rccl/src/bootstrap.cc index 7ed3000d5b3..1e0bab43f37 100644 --- a/projects/rccl/src/bootstrap.cc +++ b/projects/rccl/src/bootstrap.cc @@ -227,6 +227,21 @@ static ncclResult_t socketSendRecv(struct ncclSocket* sendSock, void* sendData, return ncclSuccess; } +static ncclResult_t socketDoubleSendRecv(struct ncclSocketOp ops[4]) { + // ops synchronously exchange size then asynchronously exchange data in send->recv->send->recv order + int senderRecvSize1, senderRecvSize2; + NCCLCHECK(ncclSocketSendRecv(ops[0].sock, &ops[0].size, sizeof(int), ops[1].sock, &senderRecvSize1, sizeof(int))); + NCCLCHECK(ncclSocketSendRecv(ops[2].sock, &ops[2].size, sizeof(int), ops[3].sock, &senderRecvSize2, sizeof(int))); + if (senderRecvSize1 > ops[1].size || senderRecvSize2 > ops[3].size) { + WARN("Message truncated : received %d,%d bytes instead of %d,%d", senderRecvSize1, senderRecvSize2, ops[1].size, ops[3].size); + return ncclInternalError; + } + ops[1].size = std::min(ops[1].size, senderRecvSize1); + ops[3].size = std::min(ops[3].size, senderRecvSize2); + NCCLCHECK(ncclSocketMultiOp(ops, 4)); + return ncclSuccess; +} + union ringConnectInfo { union ncclSocketAddress addr; char handle[NCCL_NET_HANDLE_MAXSIZE]; @@ -1012,22 +1027,40 @@ static ncclResult_t netRingAllGather(ncclNet_t* net, void* sendComm, void* recvC if (recvDataHandle) netDereg(net, recvComm, &recvDataHandle); return res; } -static ncclResult_t socketRingAllGather(struct ncclSocket* sendSock, struct ncclSocket* recvSock, int rank, int nranks, char* data, int size) { +static ncclResult_t socketRingAllGather(struct ncclSocket* nextSock, struct ncclSocket* prevSock, int rank, int nranks, char* data, int size) { ncclResult_t res = ncclSuccess; uint64_t tFirst = 0, tRest = 0; /* Simple ring based AllGather * At each step i receive data from (rank-i-1) from prev * and send previous step's data from (rank-i) to next */ - TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started"); + TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started: rank=%d nranks=%d", rank, nranks); + int totalSteps = nranks / 2; + TRACE(NCCL_BOOTSTRAP, "bidirectional bootstrap: totalSteps=%d", totalSteps); BOOTSTRAP_PROF_OPEN(tFirst); - for (int i = 0; i < nranks - 1; i++) { - size_t rslice = (rank - i - 1 + nranks) % nranks; - size_t sslice = (rank - i + nranks) % nranks; - void* recv_data = data + rslice * size; - void* send_data = data + sslice * size; - NCCLCHECKGOTO(socketSendRecv(sendSock, send_data, size, recvSock, recv_data, size), res, exit); - if (i == 0) { + for (int step = 0; step < totalSteps; step++) { + // N ranks requires (N-1)/2 steps for the double ring algorithm. If N is even, the last step is requires a single send/recv + bool isFinalUnidirectional = (step == totalSteps - 1) && (nranks % 2 == 0); + // Ring0: ring from previous to next + int sendSliceRing0 = (rank - step + nranks) % nranks; // Send this slice to next neighbor + int recvSliceRing0 = (rank - step - 1 + nranks) % nranks; // Receive this slice from prev neighbor + // Ring1: ring from next to previous + int sendSliceRing1 = (rank + step) % nranks; // Send this slice to prev neighbor + int recvSliceRing1 = (rank + step + 1) % nranks; // Receive this slice from next neighbor + if (isFinalUnidirectional) { + // Final unidirectional step, only Ring0 is used + NCCLCHECKGOTO(socketSendRecv(nextSock, data + sendSliceRing0 * size, size, prevSock, data + recvSliceRing0 * size, size), res, exit); + } else { + // Bidirectional step: Ring0 and Ring1 are used simultaneously + struct ncclSocketOp ops[4] = { + {NCCL_SOCKET_SEND, nextSock, data + sendSliceRing0 * size, size, 0}, // Ring0: send to next + {NCCL_SOCKET_RECV, prevSock, data + recvSliceRing0 * size, size, 0}, // Ring0: recv from prev + {NCCL_SOCKET_SEND, prevSock, data + sendSliceRing1 * size, size, 0}, // Ring1: send to prev + {NCCL_SOCKET_RECV, nextSock, data + recvSliceRing1 * size, size, 0} // Ring1: recv from next + }; + NCCLCHECKGOTO(socketDoubleSendRecv(ops), res, exit); + } + if (step == 0) { BOOTSTRAP_PROF_CLOSE(tFirst); BOOTSTRAP_PROF_OPEN(tRest); } diff --git a/projects/rccl/src/ce_coll.cc b/projects/rccl/src/ce_coll.cc index 1caf65fcbc9..53e33ca8162 100644 --- a/projects/rccl/src/ce_coll.cc +++ b/projects/rccl/src/ce_coll.cc @@ -87,13 +87,13 @@ ncclResult_t ncclCeInit(struct ncclComm* comm) { ncclResult_t ncclCeFinalize(struct ncclComm* comm) { ncclResult_t ret = ncclSuccess; - + // Clean up ceInitTaskQueue while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) { struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue); free(task); } - + // Clean up CE resources if (comm->ceColl.baseUCSymReadyPtr != NULL) { if (comm->ceColl.ceSyncWin && comm->ceColl.ceSyncWin->vidmem) { @@ -153,7 +153,7 @@ ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, hipStreamBat void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank]; size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr; NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, offset, ncclTeamLsa(comm), &mcDstPtr), ret, fail); - + // Write our own ready/complete flag to the multi-cast address CUDACHECKGOTO(cudaMemcpyAsync( mcDstPtr, @@ -233,7 +233,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { // Get pointers to the ready and complete synchronization arrays uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr; uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr; - + // Allocate enough slots for all possible ops size_t batchSize = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks; size_t opIdx = 0; @@ -262,7 +262,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { opIdx++; } } - + // Execute all memory operations in a single batch CUCHECKGOTO(hipStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail); @@ -278,7 +278,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) { ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) { ncclResult_t ret = ncclSuccess; - + params->srcs = nullptr; params->dsts = nullptr; params->sizes = nullptr; @@ -289,7 +289,7 @@ ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int n params->attrIdxs = nullptr; params->numAttrs = 0; #endif - + NCCLCHECKGOTO(ncclCalloc(¶ms->srcs, nRanks), ret, fail); NCCLCHECKGOTO(ncclCalloc(¶ms->dsts, nRanks), ret, fail); NCCLCHECKGOTO(ncclCalloc(¶ms->sizes, nRanks), ret, fail); @@ -326,6 +326,7 @@ ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsPa int driverVersion; NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail); + //--------------Graph capture-------------- // cudaMemcpyBatchAsync is not supported during CUDA graph capture if (capturing) { @@ -430,7 +431,7 @@ ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsPa ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of each rank's data chunk const size_t chunkBytes = args->nElts * args->eltSize; uint8_t* mySendBuff = (uint8_t*)args->sendBuff; @@ -481,7 +482,7 @@ ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of data each rank sends to every other rank const size_t chunkBytes = args->nElts * args->eltSize; uint8_t* mySendBuff = (uint8_t*)args->sendBuff; @@ -500,7 +501,7 @@ ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, int dstRank = (comm->rank + r) % comm->nRanks; uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes; uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes; - + if (dstRank == comm->rank) { // Local copy for own data batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr; @@ -536,7 +537,7 @@ ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of data root sends to each rank const size_t chunkBytes = args->nElts * args->eltSize; uint8_t* mySendBuff = (uint8_t*)args->sendBuff; @@ -596,7 +597,7 @@ ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, c ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) { ncclResult_t ret = ncclSuccess; - + // Calculate the size of data each rank sends to root const size_t chunkBytes = args->nElts * args->eltSize; uint8_t* mySendBuff = (uint8_t*)args->sendBuff; diff --git a/projects/rccl/src/debug.cc b/projects/rccl/src/debug.cc index 11129f010d5..2efefe4a2df 100644 --- a/projects/rccl/src/debug.cc +++ b/projects/rccl/src/debug.cc @@ -15,10 +15,12 @@ #include #include #include "param.h" +#include +#include "env.h" #define NCCL_DEBUG_RESET_TRIGGERED (-2) -int ncclDebugLevel = -1; +__attribute__((visibility("default"))) int ncclDebugLevel = -1; static uint32_t ncclDebugTimestampLevels = 0; // bitmaps of levels that have timestamps turned on static char ncclDebugTimestampFormat[256]; // with space for subseconds static int ncclDebugTimestampSubsecondsStart; // index where the subseconds starts @@ -28,7 +30,7 @@ static int pid = -1; static char hostname[1024]; thread_local int ncclDebugNoWarn = 0; char ncclLastError[1024] = ""; // Global string for the last error in human readable form -uint64_t ncclDebugMask = 0; +__attribute__((visibility("default"))) uint64_t ncclDebugMask = 0; FILE *ncclDebugFile = stdout; static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER; static std::chrono::steady_clock::time_point ncclEpoch; @@ -36,9 +38,12 @@ static bool ncclWarnSetDebugInfo = false; static __thread int tid = -1; +typedef const char* (*ncclGetEnvFunc_t)(const char*); + // This function must be called with ncclDebugLock locked! static void ncclDebugInit() { - const char* nccl_debug = ncclGetEnv("NCCL_DEBUG"); + ncclGetEnvFunc_t getEnvFunc = ncclEnvPluginInitialized() ? ncclGetEnv : (ncclGetEnvFunc_t)getenv; + const char* nccl_debug = getEnvFunc("NCCL_DEBUG"); int tempNcclDebugLevel = -1; uint64_t tempNcclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask if (ncclDebugLevel == NCCL_DEBUG_RESET_TRIGGERED && ncclDebugFile != stdout) { @@ -46,6 +51,7 @@ static void ncclDebugInit() { fclose(ncclDebugFile); ncclDebugFile = stdout; } + if (nccl_debug == NULL) { tempNcclDebugLevel = NCCL_LOG_ERROR; } else if (strcasecmp(nccl_debug, "NONE") == 0) { @@ -66,7 +72,7 @@ static void ncclDebugInit() { * This can be a comma separated list such as INIT,COLL * or ^INIT,COLL etc */ - const char* ncclDebugSubsysEnv = ncclGetEnv("NCCL_DEBUG_SUBSYS"); + const char* ncclDebugSubsysEnv = getEnvFunc("NCCL_DEBUG_SUBSYS"); if (ncclDebugSubsysEnv != NULL) { int invert = 0; if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; } @@ -120,7 +126,7 @@ static void ncclDebugInit() { free(ncclDebugSubsys); } - const char* ncclWarnSetDebugInfoEnv = ncclGetEnv("NCCL_WARN_ENABLE_DEBUG_INFO"); + const char* ncclWarnSetDebugInfoEnv = getEnvFunc("NCCL_WARN_ENABLE_DEBUG_INFO"); if (ncclWarnSetDebugInfoEnv != NULL && strlen(ncclWarnSetDebugInfoEnv) > 0) { int64_t value; errno = 0; @@ -130,7 +136,7 @@ static void ncclDebugInit() { } // Determine which debug levels will have timestamps. - const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS"); + const char* timestamps = getEnvFunc("NCCL_DEBUG_TIMESTAMP_LEVELS"); if (timestamps == nullptr) { ncclDebugTimestampLevels = (1< VERSION */ - const char* ncclDebugFileEnv = ncclGetEnv("NCCL_DEBUG_FILE"); + const char* ncclDebugFileEnv = getEnvFunc("NCCL_DEBUG_FILE"); if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) { int c = 0; char debugFn[PATH_MAX+1] = ""; @@ -419,4 +425,4 @@ void ncclSetThreadName(pthread_t thread, const char *fmt, ...) { va_end(vargs); pthread_setname_np(thread, threadName); #endif -} \ No newline at end of file +} diff --git a/projects/rccl/src/dev_runtime.cc b/projects/rccl/src/dev_runtime.cc index 8e44316023c..9c6d658a784 100644 --- a/projects/rccl/src/dev_runtime.cc +++ b/projects/rccl/src/dev_runtime.cc @@ -18,8 +18,11 @@ struct ncclDevrMemory { int refCount; struct ncclDevrMemory* next; CUmemGenericAllocationHandle memHandle; + void* primaryAddr; // What we hope is the VA of this memory's first mapping. size_t size; size_t bigOffset; // offset in big VA space + void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]; + ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]; }; struct ncclDevrWindowSorted { @@ -56,12 +59,21 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) { struct ncclDevrState* devr = &comm->devrState; if (devr->bigSize != 0) return ncclSuccess; - bool lsaIsLocal = true; - for (int i=0; i < comm->localRanks; i++) { - lsaIsLocal &= comm->localRankToRank[i] == comm->localRankToRank[0] + i; + // LSA needs to be the same size for all ranks, and it needs to represent + // a consecutive set of ranks. + int lsaSize = 0; + int nodeSize = 1; + for (int r=1; r < comm->nRanks; r++) { + if (comm->rankToNode[r] == comm->rankToNode[r-1]) { + nodeSize += 1; + } else { + lsaSize = gcd(lsaSize, nodeSize); + nodeSize = 1; + } } - devr->lsaSelf = lsaIsLocal ? comm->localRank : 0; - devr->lsaSize = lsaIsLocal ? comm->localRanks : 1; + lsaSize = gcd(lsaSize, nodeSize); + devr->lsaSize = lsaSize; + devr->lsaSelf = comm->rank % lsaSize; devr->lsaRankList = (int*)malloc(devr->lsaSize*sizeof(int)); for (int i=0; i < devr->lsaSize; i++) { devr->lsaRankList[i] = comm->rank + (i - devr->lsaSelf); @@ -87,7 +99,7 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) { } devr->bigSize = alignUp(devr->bigSize, size_t(1)<<32); INFO(NCCL_INIT, "Symmetric VA size=%ldGB", (long)devr->bigSize>>30); - + ncclSpaceConstruct(&devr->bigSpace); ncclShadowPoolConstruct(&devr->shadows); return ncclSuccess; @@ -98,6 +110,7 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) { } static void symTeamDestroyAll(struct ncclComm* comm); // Further down +static void symMemoryDropRef(struct ncclComm* comm, struct ncclDevrMemory* mem); // Further down ncclResult_t ncclDevrFinalize(struct ncclComm* comm) { struct ncclDevrState* devr = &comm->devrState; @@ -107,7 +120,7 @@ ncclResult_t ncclDevrFinalize(struct ncclComm* comm) { struct ncclDevrRegTask* task = ncclIntruQueueDequeue(&devr->regTaskQueue); free(task); } - + symTeamDestroyAll(comm); { // delete windowTable cudaStream_t stream; @@ -124,10 +137,20 @@ ncclResult_t ncclDevrFinalize(struct ncclComm* comm) { CUDACHECKIGNORE(cudaStreamDestroy(stream)); } } - CUdeviceptr flatAddr = reinterpret_cast(devr->lsaFlatBase); + // Drain leaked windows so every per-peer slice is unmapped before VA free. + // Without this, on HIP cuMemAddressFree over a still-mapped range returns + // hipErrorInvalidValue, which then cascades into ibv_dealloc_pd EBUSY at teardown. + while (devr->memHead != nullptr) { + struct ncclDevrMemory* m = devr->memHead; + m->refCount = 1; // force drop on the next call + symMemoryDropRef(comm, m); + } + if (devr->lsaFlatBase != nullptr) { + CUdeviceptr flatAddr = reinterpret_cast(devr->lsaFlatBase); // Returns error: invalid argument. Already unmapped by symMemoryDropRef // CUCHECKIGNORE(cuMemUnmap(flatAddr, devr->lsaSize*devr->bigSize)); - CUCHECKIGNORE(cuMemAddressFree(flatAddr, devr->lsaSize*devr->bigSize)); + CUCHECKIGNORE(cuMemAddressFree(flatAddr, devr->lsaSize*devr->bigSize)); + } ncclShadowPoolDestruct(&devr->shadows); ncclSpaceDestruct(&devr->bigSpace); free(devr->lsaRankList); @@ -343,11 +366,17 @@ static void symTeamDestroyAll(struct ncclComm* comm) { } } +static ncclResult_t symMemoryRegisterGin(struct ncclComm* comm, struct ncclDevrMemory* mem) { + NCCLCHECK(ncclGinConnectOnce(comm)); + NCCLCHECK(ncclGinRegister(comm, mem->primaryAddr, mem->size, mem->ginHostWins, mem->ginDevWins)); + return ncclSuccess; +} + // On success we take caller's reference on memHandle. // Due to multicast binds for each pre-exiting team, this function requires // caller do a world barrier before returning to user. static ncclResult_t symMemoryObtain( - struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, size_t size, + struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, void* memAddr, size_t size, struct ncclDevrMemory** outMem ) { ncclResult_t ret = ncclSuccess; @@ -362,12 +391,14 @@ static ncclResult_t symMemoryObtain( } mem = mem->next; } + // New memory. mem = (struct ncclDevrMemory*)malloc(sizeof(struct ncclDevrMemory)); mem->refCount = 0; mem->memHandle = memHandle; + mem->primaryAddr = memAddr; mem->size = size; - + // Grab offset in the big space. NCCLCHECKGOTO(ncclSpaceAlloc(&devr->bigSpace, devr->bigSize, size, devr->granularity, &bigOffset), ret, fail_mem); mem->bigOffset = bigOffset; @@ -375,10 +406,20 @@ static ncclResult_t symMemoryObtain( // Map unicast addresses into flat VA space for lsa team. NCCLCHECKGOTO(symMemoryMapLsaTeam(comm, memHandle, size, bigOffset), ret, fail_mem_space); + // If our caller doesn't have a VA then we'll use the LSA mapping. + if (mem->primaryAddr == nullptr) { + mem->primaryAddr = (char*)devr->lsaFlatBase + devr->lsaSelf*devr->bigSize + mem->bigOffset; + } + // Bind new memory with each existing team. for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { NCCLCHECKGOTO(symBindTeamMemory(comm, t, mem), ret, fail_mem_space_teams); } + + if (devr->ginEnabled) { + NCCLCHECKGOTO(symMemoryRegisterGin(comm, mem), ret, fail_mem_space_teams); + } + // Add to list of mems. mem->next = devr->memHead; devr->memHead = mem; @@ -405,6 +446,9 @@ static void symMemoryDropRef( ) { if (mem != nullptr && 0 == --mem->refCount) { struct ncclDevrState* devr = &comm->devrState; + if (devr->ginEnabled) { + ncclGinDeregister(comm, mem->ginHostWins); + } for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) { symUnbindTeamMemory(comm, t, mem); } @@ -470,18 +514,22 @@ static ncclResult_t symWindowCreate( winDevHost->lsaRank = devr->lsaSelf; winDevHost->worldRank = comm->rank; winDevHost->winHost = (void*)win; + winDevHost->ginOffset4K = memOffset>>12; + for (int i=0; i < NCCL_GIN_MAX_CONTEXTS; i++) { + winDevHost->ginWins[i] = mem->ginDevWins[i]; + } CUDACHECK(cudaMemcpyAsync(winDev, winDevHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream)); NCCLCHECK(symWindowTableInitOnce(comm, stream)); // ensure devr->windowTable exists struct ncclDevCommWindowTable* tableDev = devr->windowTable; - struct ncclDevCommWindowTable* tableHost; - NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost)); while (true) { + struct ncclDevCommWindowTable* tableHost; + NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost)); int i = 0; while (i < 32 && tableHost->entries[i].window != nullptr) i += 1; if (i < 32) { tableHost->entries[i].base = userAddr; - tableHost->entries[i].size = userAddr + userSize; + tableHost->entries[i].size = userSize; tableHost->entries[i].window = winDev; CUDACHECK(cudaMemcpyAsync(&tableDev->entries[i], &tableHost->entries[i], sizeof(tableHost->entries[i]), cudaMemcpyHostToDevice, stream)); break; @@ -491,7 +539,6 @@ static ncclResult_t symWindowCreate( CUDACHECK(cudaMemcpyAsync(&tableDev->next, &tableHost->next, sizeof(tableHost->next), cudaMemcpyHostToDevice, stream)); } tableDev = tableHost->next; - NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost)); } { // insert into winSorted[] @@ -520,9 +567,9 @@ static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclWindow_vi symMemoryDropRef(comm, winHost->memory); { struct ncclDevCommWindowTable* tableDev = devr->windowTable; - struct ncclDevCommWindowTable* tableHost; - NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost), ret, remove_winSorted); while (true) { + struct ncclDevCommWindowTable* tableHost; + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost), ret, remove_winSorted); int i = 0; while (i < 32 && tableHost->entries[i].window != winDev) i += 1; if (i < 32) { @@ -532,7 +579,6 @@ static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclWindow_vi } if (tableHost->next == nullptr) break; // Error didn't find window in table tableDev = tableHost->next; - NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost), ret, remove_winSorted); } } NCCLCHECKGOTO(ncclShadowPoolFree(&devr->shadows, winDev, stream), ret, remove_winSorted); @@ -588,7 +634,7 @@ ncclResult_t ncclDevrWindowRegisterInGroup( CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, reinterpret_cast(memAddr)), ret, fail_locReg); // Trade cumem handle for ncclDevrMemory* - NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, memSize, &mem), ret, fail_locReg_memHandle); + NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, (void*)memAddr, memSize, &mem), ret, fail_locReg_memHandle); memHandle = 0x0; // symMemoryObtain took our reference CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail); @@ -597,7 +643,7 @@ ncclResult_t ncclDevrWindowRegisterInGroup( comm, mem, memOffset, userPtr, userSize, winFlags, localRegHandle, outWinDev, nullptr, stream ), ret, fail_locReg_memHandle_mem_stream); mem = nullptr; // symWindowCreate took our reference - + CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail_locReg_memHandle_mem_stream_win); // symWindowCreate needs barrier. @@ -689,15 +735,35 @@ ncclResult_t ncclDevrCommCreateInternal( struct ncclDevrState* devr = &comm->devrState; struct ncclTeam world = ncclTeamWorld(comm); struct ncclTeam lsa = ncclTeamInnerFactor(world, devr->lsaSize); + bool ginActivated = false; struct ncclDevrTeam* tmLsa; size_t bufSizeTotal; + int nGinContexts = 0; + int ginSignalTotal = 0, ginCounterTotal = 0; struct ncclDevResourceRequirements* resReqsHead; struct ncclDevResourceRequirements lsaBarReq; cudaStream_t stream = nullptr; + struct ncclDevResourceRequirements railGinBarrierReq; CUmemGenericAllocationHandle memHandle = 0x0; struct ncclDevrMemory* mem = nullptr; struct ncclDevrWindow* win = nullptr; struct ncclWindow_vidmem* winHost = nullptr; + size_t ginSignalShadowsOffset = 0; + + if (comm->nNodes > 1 || reqs->ginForceEnable || reqs->ginCounterCount != 0 || reqs->ginSignalCount != 0) { + ginActivated = !devr->ginEnabled; + devr->ginEnabled = true; + } + + if (ginActivated) { + NCCLCHECKGOTO(ncclGinConnectOnce(comm), ret, fail); + // Register all preexisting memories with GIN. Update the windows later when + // we have a stream. + for (struct ncclDevrMemory* mem = devr->memHead; mem != nullptr; mem = mem->next) { + NCCLCHECKGOTO(symMemoryRegisterGin(comm, mem), ret, fail); + } + } + if (devr->ginEnabled) nGinContexts = comm->sharedRes->ginState.ginCommCount; memset(outDevComm, 0, sizeof(*outDevComm)); outDevComm->rank = comm->rank; @@ -723,25 +789,52 @@ ncclResult_t ncclDevrCommCreateInternal( resReqsHead = reqs->resourceRequirementsList; - ncclLsaBarrierCreateRequirement(lsa, reqs->lsaBarrierCount, &outDevComm->lsaBarrier, &lsaBarReq); + ncclLsaBarrierCreateRequirement(lsa, std::max(reqs->barrierCount, reqs->lsaBarrierCount), &outDevComm->lsaBarrier, &lsaBarReq); lsaBarReq.next = resReqsHead; resReqsHead = &lsaBarReq; + ncclGinBarrierCreateRequirement(comm, ncclTeamRail(comm), std::max(reqs->barrierCount, reqs->railGinBarrierCount), &outDevComm->railGinBarrier, &railGinBarrierReq); + railGinBarrierReq.next = resReqsHead; + resReqsHead = &railGinBarrierReq; + { struct ncclDevResourceRequirements* rr = resReqsHead; bufSizeTotal = 0; + ginSignalTotal = reqs->ginSignalCount; + ginCounterTotal = reqs->ginCounterCount; while (rr != nullptr) { bufSizeTotal = alignUp(bufSizeTotal, std::max(128, rr->bufferAlign)); if (rr->outBufferHandle != nullptr) *rr->outBufferHandle = bufSizeTotal/128; + if (rr->outGinSignalStart != nullptr) *rr->outGinSignalStart = ginSignalTotal; + if (rr->outGinCounterStart != nullptr) *rr->outGinCounterStart = ginCounterTotal; bufSizeTotal += rr->bufferSize; + ginSignalTotal += rr->ginSignalCount; + ginCounterTotal += rr->ginCounterCount; rr = rr->next; } + bufSizeTotal= alignUp(bufSizeTotal, 128); + ginSignalShadowsOffset = bufSizeTotal; + bufSizeTotal += nGinContexts*ginSignalTotal*sizeof(uint64_t); // include signal shadows bufSizeTotal = alignUp(bufSizeTotal, devr->granularity); } CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail); - NCCLCHECKGOTO(symWindowTableInitOnce(comm, stream), ret, fail); // ensure devr->windowTable exists - outDevComm->windowTable = comm->devrState.windowTable; + if (ginActivated) { + // Now update the GIN handles in all existing windows. Registration of memories happened above. + for (int i=0; i < devr->winSortedCount; i++) { + struct ncclDevrWindow* win = devr->winSorted[i].win; + struct ncclWindow_vidmem* winHost; + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, win->vidmem, &winHost), ret, fail_stream); + winHost->ginOffset4K = (win->bigOffset - win->memory->bigOffset)>>12; + for (int i=0; i < NCCL_GIN_MAX_CONTEXTS; i++) { + winHost->ginWins[i] = win->memory->ginDevWins[i]; + } + CUDACHECKGOTO(cudaMemcpyAsync(win->vidmem, winHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream), ret, fail_stream); + } + } + + NCCLCHECKGOTO(symWindowTableInitOnce(comm, stream), ret, fail_stream); // ensure devr->windowTable exists + outDevComm->windowTable = devr->windowTable; if (bufSizeTotal == 0) { outDevComm->resourceWindow = nullptr; @@ -755,45 +848,65 @@ ncclResult_t ncclDevrCommCreateInternal( #endif memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE; memProp.requestedHandleType = ncclCuMemHandleType; + // We have to assume that if GIN is possible it might be requested in the future, + // even on single node. + memProp.allocFlags.gpuDirectRDMACapable = comm->sharedRes->ginState.ncclGin != nullptr ? 1 : 0; memProp.location.id = comm->cudaDev; - CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail); + CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail_stream); - NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, bufSizeTotal, &mem), ret, fail); + NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, NULL, bufSizeTotal, &mem), ret, fail_stream_mem); memHandle = 0x0; // Reference given to symMemoryObtain NCCLCHECKGOTO(symWindowCreate( // Requires world barrier afterward. comm, mem, /*memOffset=*/0, nullptr, bufSizeTotal, /*winFlags=*/0, /*localReg=*/nullptr, &outDevComm->resourceWindow, &win, - stream), ret, fail); + stream), ret, fail_stream_mem); mem = nullptr; // Reference given to symWindowCreate - NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, win->vidmem, &winHost), ret, fail); + NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, win->vidmem, &winHost), ret, fail_stream_mem_win); outDevComm->resourceWindow_inlined = *winHost; + outDevComm->ginSignalShadows = (uint64_t*)add4G((char*)winHost->lsaFlatBase + ginSignalShadowsOffset, winHost->lsaRank*winHost->stride4G); - CUDACHECKGOTO(cudaMemsetAsync(win->userPtr, 0, bufSizeTotal, stream), ret, fail); + CUDACHECKGOTO(cudaMemsetAsync(win->userPtr, 0, bufSizeTotal, stream), ret, fail_stream_mem_win); } - CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail); + if (devr->ginEnabled) { + outDevComm->ginContextCount = nGinContexts; + outDevComm->ginSignalCount = ginSignalTotal; + outDevComm->ginCounterCount = ginCounterTotal; + NCCLCHECKGOTO(ncclGinAllocSignalsCounters(comm, + ginSignalTotal, &outDevComm->ginSignalBase, + ginCounterTotal, &outDevComm->ginCounterBase + ), ret, fail_stream_mem_win); + + for (int ctx=0; ctx < nGinContexts; ctx++) { + outDevComm->ginTypes[ctx] = (int)comm->sharedRes->ginState.ginDevHandles[ctx]->netDeviceType; + outDevComm->ginHandles[ctx] = comm->sharedRes->ginState.ginDevHandles[ctx]->handle; + } + } - NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail); + CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail_stream_mem_win_signals); - CUDACHECKIGNORE(cudaStreamDestroy(stream)); + NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail_stream_mem_win_signals); + CUDACHECKGOTO(cudaStreamDestroy(stream), ret, fail_stream_mem_win_signals); return ret; -fail: - if (win != nullptr) { - symWindowDestroy(comm, win->vidmem, stream); - CUDACHECKIGNORE(cudaStreamSynchronize(stream)); - } - if (mem != nullptr) { - symMemoryDropRef(comm, mem); - } - if (memHandle != 0x0) { - CUCHECKIGNORE(cuMemRelease(memHandle)); - } - if (stream != nullptr) { - CUDACHECKIGNORE(cudaStreamDestroy(stream)); +fail_stream_mem_win_signals: + if (devr->ginEnabled) { + ncclGinFreeSignalsCounters(comm, + outDevComm->ginSignalBase, outDevComm->ginSignalCount, + outDevComm->ginCounterBase, outDevComm->ginCounterCount + ); } +fail_stream_mem_win: + symWindowDestroy(comm, win->vidmem, stream); + cudaStreamSynchronize(stream); +fail_stream_mem: + if (memHandle != 0x0) { CUCHECKIGNORE(cuMemRelease(memHandle)); } + symMemoryDropRef(comm, mem); +fail_stream: + cudaStreamDestroy(stream); +fail: return ret; } @@ -919,7 +1032,13 @@ NCCL_API(ncclResult_t, ncclDevCommDestroy, ncclComm_t comm, ncclDevComm_t const* ncclResult_t ncclDevCommDestroy( struct ncclComm* comm, struct ncclDevComm const* devComm ) { - //struct ncclDevrState* devr = &comm->devrState; + struct ncclDevrState* devr = &comm->devrState; + if (devr->ginEnabled) { + ncclGinFreeSignalsCounters(comm, + devComm->ginSignalBase, devComm->ginSignalCount, + devComm->ginCounterBase, devComm->ginCounterCount + ); + } if (devComm->resourceWindow != nullptr) { NCCLCHECK(ncclCommWindowDeregister(comm, devComm->resourceWindow)); } @@ -934,7 +1053,7 @@ ncclResult_t ncclDevrGetLsaRankPtr(struct ncclComm* comm, struct ncclDevrWindow* } struct ncclDevrState* devr = &comm->devrState; - + // Validate lsaRank is within bounds if (lsaRank < 0 || lsaRank >= devr->lsaSize) { return ncclInvalidArgument; @@ -963,7 +1082,7 @@ ncclResult_t ncclDevrGetLsaTeamPtrMC(struct ncclComm* comm, struct ncclDevrWindo bool multimem = true; struct ncclDevrTeam* tm; NCCLCHECK(symTeamObtain(comm, lsaTeam, multimem, &tm)); - + // Return the base multicast address for this team with offset *outPtr = (void*)((uintptr_t)tm->mcBasePtr + winHost->bigOffset + offset); return ncclSuccess; diff --git a/projects/rccl/src/device/CMakeLists.txt b/projects/rccl/src/device/CMakeLists.txt index 98447428df0..acaa9b65ddb 100644 --- a/projects/rccl/src/device/CMakeLists.txt +++ b/projects/rccl/src/device/CMakeLists.txt @@ -50,9 +50,9 @@ set_target_properties(nccl_device PROPERTIES # Set include directories for the target target_include_directories(nccl_device PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_BINARY_DIR}/include ${CMAKE_SOURCE_DIR}/src/include ${CMAKE_SOURCE_DIR}/src/include/plugin - ${CMAKE_BINARY_DIR}/include ${CUDAToolkit_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIRS}/cccl ) diff --git a/projects/rccl/src/device/Makefile b/projects/rccl/src/device/Makefile index fd8f2759d4c..cf0fa0637f1 100644 --- a/projects/rccl/src/device/Makefile +++ b/projects/rccl/src/device/Makefile @@ -23,12 +23,13 @@ INCFLAGS = -I. -I.. -I$(BUILDDIR)/include -I../include -I../include/plugin NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" CXXFLAGS += $(INCFLAGS) -NVCUFLAGS_SYM := -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all +NVCUFLAGS_SYM += -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all NVCUFLAGS_SYM += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden" SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1 +COMPILE.kernel = $(NVCC) $(NVCUFLAGS) -dw $2 -o $1 COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1 define COMPILE @$(SAY) "Compiling" $2;\ diff --git a/projects/rccl/src/device/network/unpack/unpack.h b/projects/rccl/src/device/network/unpack/unpack.h index 44098977d35..2489437cd36 100644 --- a/projects/rccl/src/device/network/unpack/unpack.h +++ b/projects/rccl/src/device/network/unpack/unpack.h @@ -248,7 +248,7 @@ inline __device__ void ncclNetDeviceUnpackInner( for (int x = 0; x < iter_meta_cnt; x++) { int meta_idx = x + w * PPW; - + // load page offs loadShmem128(shmemCvtPtr((uint64_t*) (s_meta + meta_idx)), meta.r64[0], meta.r64[1]); diff --git a/projects/rccl/src/device/reduce_kernel.h b/projects/rccl/src/device/reduce_kernel.h index 593f868eae1..d98fa356376 100755 --- a/projects/rccl/src/device/reduce_kernel.h +++ b/projects/rccl/src/device/reduce_kernel.h @@ -841,7 +841,7 @@ struct FuncSumPostDiv { using UintType = typename std::conditional::type; uint32_t divisor:31, isSigned:1; UintType recip; - + __device__ __forceinline__ FuncSumPostDiv(uint64_t opArg=0) { isSigned = opArg & 1; divisor = opArg >> 1; diff --git a/projects/rccl/src/device/symmetric/all_gather.cuh b/projects/rccl/src/device/symmetric/all_gather.cuh index f57c17cb2e5..be1f0e7face 100644 --- a/projects/rccl/src/device/symmetric/all_gather.cuh +++ b/projects/rccl/src/device/symmetric/all_gather.cuh @@ -356,7 +356,7 @@ static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const* char* blockInput = input.localPtr(); char* blockOutput = output.localPtr(); - uint32_t lowBits = nElts; + uint32_t lowBits = nAllElts; lowBits |= (uintptr_t)blockInput; lowBits |= (uintptr_t)blockOutput; if (__builtin_expect(lowBits%8 == 0, true)) { diff --git a/projects/rccl/src/device/symmetric/generate.py b/projects/rccl/src/device/symmetric/generate.py index 45958d5908a..594b403b93f 100755 --- a/projects/rccl/src/device/symmetric/generate.py +++ b/projects/rccl/src/device/symmetric/generate.py @@ -222,12 +222,20 @@ def partition(vals, keyfn): emitln(f, '') emitln(f, 'extern int const ncclSymkKernelCount = %d;' % len(list(enumerate_kernels()))) - emitln(f, 'extern void* const ncclSymkKernelList[] = {') + emitln(f, 'void* ncclSymkKernelList[] = {') for k in enumerate_kernels(): emitln(f, '(void*){cname},'.format(cname=kernel_cname(k))) emitln(f, 'nullptr};') emitln(f, '') + emitln(f, 'int ncclSymkKernelRequirements[] = {') + for index,k in enumerate(enumerate_kernels()): + cudart, _, _ = required_cuda(k) + sym = kernel_cname(k) + emitln(f, ' %7d, /*%4d %s*/' % (cudart or 0, index, sym)); + emitln(f, '};') + emitln(f, '') + emitln(f, 'void* ncclSymkGetKernelPtr(ncclSymkKernelId id, int red, ncclDataType_t ty) {') indents += 1 emitln(f, 'switch (id) {') diff --git a/projects/rccl/src/device/symmetric/primitives.cuh b/projects/rccl/src/device/symmetric/primitives.cuh index 343f354e588..6d0c3352fd8 100644 --- a/projects/rccl/src/device/symmetric/primitives.cuh +++ b/projects/rccl/src/device/symmetric/primitives.cuh @@ -60,13 +60,14 @@ struct ncclSymkArgsHandler { workLo++; fracLo = 0; } - struct ncclSymkDevWork const& dw = devWork[workLo]; - indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell; + struct ncclSymkDevWork const& dwLo = devWork[workLo]; + indexLo = ((fracLo * divUp(dwLo.nElts, EltPerCell)) >> 16) * EltPerCell; // Where the work ends workHi = channelWorkRange[block].workHi; fracHi = channelWorkRange[block].fracHi + 1; - indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts); + struct ncclSymkDevWork const& dwHi = devWork[workHi]; + indexHi = min(((fracHi * divUp(dwHi.nElts, EltPerCell)) >> 16) * EltPerCell, dwHi.nElts); } template @@ -82,7 +83,7 @@ struct ncclSymkArgsHandler { lastBlock = dw.sChannelId+dw.nChannels-1; // Where the work begins - fracLo = (dw.sChannelId==0) ? 0 : ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF); + fracLo = (dw.sChannelId>0 && channelWorkRange[dw.sChannelId-1].workHi == w) ? ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF) : 0; indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell; fracHi = (channelWorkRange[lastBlock].workHi == w) ? channelWorkRange[lastBlock].fracHi + 1 : 0x10000; indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts); @@ -95,16 +96,16 @@ struct ncclSymkArgsHandler { getWorkRange(blockIdx.x, workLo, indexLo, workHi, indexHi); - size_t currentIndexLo = indexLo; #pragma unroll 1 for (int w = workLo; w <= workHi; w++) { struct ncclSymkDevWork const& dw = devWork[w]; size_t const& nAllElts = dw.nElts; - size_t currentIndexHi; + size_t currentIndexLo, currentIndexHi; int block, nBlocks; if (blockIdx.x >= dw.sChannelId && blockIdx.x < dw.sChannelId + dw.nChannels) { getWorkRangeFused(blockIdx.x, w, block, nBlocks, currentIndexLo, currentIndexHi); } else { + currentIndexLo = (w > workLo) ? 0 : indexLo; currentIndexHi = (w < workHi) ? nAllElts : indexHi; block = 0; nBlocks = 1; diff --git a/projects/rccl/src/device/symmetric/reduce_scatter.cuh b/projects/rccl/src/device/symmetric/reduce_scatter.cuh index 9c149c8f225..c9ce8f56d4b 100644 --- a/projects/rccl/src/device/symmetric/reduce_scatter.cuh +++ b/projects/rccl/src/device/symmetric/reduce_scatter.cuh @@ -245,7 +245,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(ncclSymkDevWorkArgs threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); int tn = nBlocks*blockDim.x; - reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nElts, output, nElts); + reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nAllElts, output, nElts); waitNeeded = false; } @@ -327,7 +327,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(ncclSymkDevWorkAr threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE); int tn = nBlocks*blockDim.x; - reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nElts, output.localPtr(), nElts); + reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nAllElts, output.localPtr(), nElts); } ); @@ -406,7 +406,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(ncclSymkDevWorkArgs T* input = (T*)inputPtr.localPtr(); T* output = (T*)outputPtr.localPtr(); - uint32_t lowBits = nElts*sizeof(T); + uint32_t lowBits = nAllElts*sizeof(T); lowBits |= (uintptr_t)input; lowBits |= (uintptr_t)output; if (__builtin_expect(lowBits%8 == 0, true)) { diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index d10e0ba4696..b81c489e727 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -120,7 +120,10 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma if (fn == nullptr) continue; cudaError_t errcode = cudaFuncGetAttributes(&attr, fn); - if (errcode != cudaSuccess) continue; // Silently ignore failures + if (errcode != cudaSuccess) { + cudaGetLastError(); // Drain error code + continue; // Silently ignore failures + } if (maxStackSize) { if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes; } @@ -207,6 +210,9 @@ static void addWorkBatchToPlan( newBatch |= (comm->nNodes > 2 && batchP2P)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1); for (int i=0; i < chan->wipBatch.nP2ps; i++) { newBatch |= p2pRound == chan->wipBatch.p2pRounds[i]; + // Make sure we only aggregate p2p operations within the same p2p round epoch (one epoch is NCCL_MAX_DEV_WORK_P2P_PER_BATCH ops). + // This enforces uniform batching accross ranks in the communicator and prevents hangs. + newBatch |= (p2pRound / NCCL_MAX_DEV_WORK_P2P_PER_BATCH) != (chan->wipBatch.p2pRounds[i] / NCCL_MAX_DEV_WORK_P2P_PER_BATCH); } } } @@ -3177,16 +3183,21 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) { } ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + // Early-out on invalid or revoked communicator + ncclResult_t ret = CommCheck(info->comm, info->opName, "comm"); + if (ret != ncclSuccess) return ncclGroupErrCheck(ret); + if (info->comm->revokedFlag) { + WARN("%s: communicator was revoked", info->opName); + return ncclGroupErrCheck(ncclInvalidUsage); + } // Profiler - If a group API event has already started, update the profilerGroupDepth so that the depth // updates correctly for implicit ncclGroupStartInternal and ncclGroupEndInternal calls if (ncclProfilerApiState.profilerGroupDepth > 0) { ncclProfilerApiState.profilerGroupDepth++; } NCCLCHECK(ncclGroupStartInternal()); - ncclResult_t ret = ncclSuccess; + ret = ncclSuccess; int devOld = -1; - - NCCLCHECKGOTO(CommCheck(info->comm, info->opName, "comm"), ret, fail); // Check whether communicator is ready to communicate NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail); diff --git a/projects/rccl/src/gin/CMakeLists.txt b/projects/rccl/src/gin/CMakeLists.txt new file mode 100644 index 00000000000..e20d7ddf38d --- /dev/null +++ b/projects/rccl/src/gin/CMakeLists.txt @@ -0,0 +1,8 @@ +# Gin sources +set(GIN_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/gin_host.cc + ${CMAKE_CURRENT_SOURCE_DIR}/gin_host_proxy.cc +) + +# Add gin sources to parent scope +set(GIN_SOURCES ${GIN_SOURCES} PARENT_SCOPE) diff --git a/projects/rccl/src/gin/gin_host.cc b/projects/rccl/src/gin/gin_host.cc new file mode 100644 index 00000000000..b42f88fdeb0 --- /dev/null +++ b/projects/rccl/src/gin/gin_host.cc @@ -0,0 +1,277 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "comm.h" +#include "param.h" +#include "graph.h" +#include "transport.h" +#include "register_inline.h" +#include "gin/gin_host.h" +#include "gin/gin_host_proxy.h" + +NCCL_PARAM(GinEnable, "GIN_ENABLE", 1); +NCCL_PARAM(GinType, "GIN_TYPE", -1); +NCCL_PARAM(GinSignalPoolSize, "GIN_SIGNAL_POOL_SIZE", 64 << 10); +NCCL_PARAM(GinCounterPoolSize, "GIN_COUNTER_POOL_SIZE", 64 << 10); + +void* ncclGinProgress(void* ginState_) { + struct ncclGinState* ginState = (struct ncclGinState*)ginState_; + while (1) { + pthread_mutex_lock(&ginState->threadLock); + if (ginState->ginProgress == 1) { + pthread_mutex_unlock(&ginState->threadLock); + for (int n=0; nginCommCount; n++) { + ncclResult_t ret; + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + ret = ncclGinProxyProgress(ginState->ncclGin, ginState->ginCtx[n]); + } else { + ret = ginState->ncclGin->ginProgress(ginState->ginComms[n]); + } + if (ret != ncclSuccess) { + __atomic_store_n(&ginState->asyncResult, ret, __ATOMIC_RELEASE); + INFO(NCCL_ALL,"%s:%d -> %d [GIN Progress Thread]", __FILE__, __LINE__, ret); + ginState->ginProgress = -2; + return NULL; + } + } + sched_yield(); + } else if (ginState->ginProgress == -1) { + pthread_mutex_unlock(&ginState->threadLock); + return NULL; + } else if (ginState->ginProgress == 0) { + pthread_cond_wait(&ginState->threadCond, &ginState->threadLock); + pthread_mutex_unlock(&ginState->threadLock); + } else { + pthread_mutex_unlock(&ginState->threadLock); + INFO(NCCL_ALL,"%s:%d -> [GIN Progress Thread] state unknown %d", __FILE__, __LINE__, ginState->ginProgress); + ginState->ginProgress = -2; + return NULL; + } + } +} + +NCCL_PARAM(GinNcontexts, "GIN_NCONTEXTS", NCCL_GIN_MAX_CONTEXTS); + +ncclResult_t ncclGinConnectOnce(struct ncclComm* comm) { + ncclResult_t ret = ncclSuccess; + struct ncclGinState* ginState = &comm->sharedRes->ginState; + if (ginState->ncclGin == NULL) { + WARN("GIN not supported."); + return ncclInvalidUsage; + } + if (ncclParamGinEnable() == 0) { + WARN("GIN is disabled."); + return ncclInternalError; + } + if (ginState->connected) return ncclSuccess; + + NCCLCHECK(ginState->ncclGin->init(&ginState->ginInstance, comm->commHash, ncclDebugLog)); + + int ndev = 0; + NCCLCHECK(ginState->ncclGin->devices(&ndev)); + if (ndev <= 0) { + WARN("No GIN-capable devices found."); + return ncclInternalError; + } + + ncclNetProperties_t props; + NCCLCHECK(ginState->ncclGin->getProperties(0, &props)); + ginState->ginType = props.netDeviceType; + if ((ncclParamGinType() != -1) && (ginState->ginType != ncclParamGinType())) { + WARN("GIN-capable device type mismatch."); + return ncclInternalError; + } + + int nLocalNets; + int64_t localNets[NCCL_TOPO_MAX_NODES]; + NCCLCHECK(ncclTopoGetLocalNets(comm->topo, comm->rank, localNets, &nLocalNets)); + + void** handles = NULL; + char* allHandles = NULL; + + ginState->ginCommCount = std::min(NCCL_GIN_MAX_CONTEXTS, ncclParamGinNcontexts()); + + NCCLCHECKGOTO(ncclCalloc(&allHandles, (size_t)comm->nRanks * NCCL_NET_HANDLE_MAXSIZE), ret, fail); + NCCLCHECKGOTO(ncclCalloc(&handles, comm->nRanks), ret, fail); + for (int r = 0; r < comm->nRanks; r++) handles[r] = allHandles + r * NCCL_NET_HANDLE_MAXSIZE; + + ginState->signalSpaceSize = ncclParamGinSignalPoolSize(); + if (ginState->signalSpaceSize < 0 || (1 << 30) <= ginState->signalSpaceSize) { + WARN("NCCL_GIN_SIGNAL_POOL_SIZE has invalid value."); + ginState->signalSpaceSize = 64 << 10; + } + ginState->counterSpaceSize = ncclParamGinCounterPoolSize(); + if (ginState->counterSpaceSize < 0 || (1 << 30) <= ginState->counterSpaceSize) { + WARN("NCCL_GIN_COUNTER_POOL_SIZE has invalid value."); + ginState->counterSpaceSize = 64 << 10; + } + + for (int n = 0; n < ginState->ginCommCount; n++) { + void* listenComm; + NCCLCHECKGOTO( + ginState->ncclGin->listen(ginState->ginInstance, localNets[n%nLocalNets], + allHandles + NCCL_NET_HANDLE_MAXSIZE * comm->rank, &listenComm), + ret, fail); + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allHandles, NCCL_NET_HANDLE_MAXSIZE), ret, + fail); + NCCLCHECKGOTO(ginState->ncclGin->connect(comm->ginContext, handles, comm->nRanks, comm->rank, + listenComm, ginState->ginComms + n), + ret, fail); + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + NCCLCHECKGOTO(ncclGinProxyCreateContext(comm, ginState->ginComms[n], localNets[n%nLocalNets], + ginState->signalSpaceSize, ginState->counterSpaceSize, + &ginState->ginCtx[n], &ginState->ginDevHandles[n]), + ret, fail); + } else { + NCCLCHECKGOTO(ginState->ncclGin->createContext( + ginState->ginComms[n], ginState->signalSpaceSize, ginState->counterSpaceSize, + &ginState->ginCtx[n], &ginState->ginDevHandles[n]), + ret, fail); + } + NCCLCHECKGOTO(ginState->ncclGin->closeListen(listenComm), ret, fail); + } + free(handles); + handles = NULL; + free(allHandles); + allHandles = NULL; + + // Check whether we need proxy progress and if so, start / wake up the progress thread. + ginState->needsProxyProgress = 0; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginDevHandles[n]->needsProxyProgress) ginState->needsProxyProgress = 1; + } + if (ginState->needsProxyProgress) { + ginState->ginProgress = 1; + pthread_mutex_init(&ginState->threadLock, NULL); + pthread_cond_init(&ginState->threadCond, NULL); + PTHREADCHECK(pthread_create(&ginState->thread, NULL, ncclGinProgress, ginState), "pthread_create"); + ncclSetThreadName(ginState->thread, "NCCL GIN Progress%2d", comm->cudaDev); + } + + ncclSpaceConstruct(&ginState->counterSpace); + ncclSpaceConstruct(&ginState->signalSpace); + +exit: + if (ret == ncclSuccess) ginState->connected = true; + return ret; +fail: + free(allHandles); + free(handles); + goto exit; +} + +ncclResult_t ncclGinFinalize(struct ncclComm* comm) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + if (!ginState->connected) return ncclSuccess; + + if (ginState->needsProxyProgress) { + pthread_mutex_lock(&ginState->threadLock); + comm->sharedRes->ginState.ginProgress = -1; + pthread_cond_signal(&ginState->threadCond); + pthread_mutex_unlock(&ginState->threadLock); + PTHREADCHECK(pthread_join(ginState->thread, NULL), "pthread_join"); + } + + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginCtx[n] != NULL) { + NCCLCHECK(ncclGinProxyDestroyContext(ginState->ncclGin, ginState->ginCtx[n])); + ginState->ginCtx[n] = NULL; + } + } + } + + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginCtx[n] != NULL) { + NCCLCHECK(ginState->ncclGin->destroyContext(ginState->ginCtx[n])); + ginState->ginCtx[n] = NULL; + } + if (ginState->ginComms[n] != NULL) { + NCCLCHECK(ginState->ncclGin->closeColl(ginState->ginComms[n])); + ginState->ginComms[n] = NULL; + } + } + NCCLCHECK(ginState->ncclGin->finalize(ginState->ginInstance)); + memset(ginState, 0, sizeof(*ginState)); + return ncclSuccess; +} + +ncclResult_t ncclGinRegister(struct ncclComm* comm, void* address, size_t size, + void* ginHostWins[NCCL_GIN_MAX_CONTEXTS], + ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + NCCLCHECK(ncclGinProxyRegister(ginState->ncclGin, ginState->ginCtx[n], address, size, + NCCL_PTR_CUDA, 0, &ginHostWins[n], &ginDevWins[n])); + } else { + NCCLCHECK(ginState->ncclGin->regMrSym(ginState->ginComms[n], address, size, NCCL_PTR_CUDA, 0, + &ginHostWins[n], &ginDevWins[n])); + } + if (ginHostWins[n] == NULL) { + WARN("rank %d - GIN Symmetric register failed: buff %p, size %ld", comm->rank, address, size); + return ncclSystemError; + } + } + return ncclSuccess; +} + +ncclResult_t ncclGinDeregister(struct ncclComm* comm, void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) { + NCCLCHECK(ncclGinProxyDeregister(ginState->ncclGin, ginState->ginCtx[n], ginHostWins[n])); + } else { + NCCLCHECK(ginState->ncclGin->deregMrSym(ginState->ginComms[n], ginHostWins[n])); + } + } + return ncclSuccess; +} + +ncclResult_t ncclGinAllocSignalsCounters(struct ncclComm* comm, int nSignals, uint32_t* outSignal0, + int nCounters, uint32_t* outCounter0) { + ncclResult_t ret = ncclSuccess; + struct ncclGinState* ginState = &comm->sharedRes->ginState; + int64_t start; + if (nSignals != 0) { + NCCLCHECKGOTO( + ncclSpaceAlloc(&ginState->signalSpace, ginState->signalSpaceSize, nSignals, 1, &start), ret, + fail); + *outSignal0 = (uint32_t)start; + } + if (nCounters != 0) { + NCCLCHECKGOTO( + ncclSpaceAlloc(&ginState->counterSpace, ginState->counterSpaceSize, nCounters, 1, &start), + ret, fail_signals); + *outCounter0 = (uint32_t)start; + } + return ncclSuccess; +fail_signals: + if (nSignals != 0) ncclSpaceFree(&ginState->signalSpace, *outSignal0, nSignals); +fail: + return ret; +} + +ncclResult_t ncclGinFreeSignalsCounters(struct ncclComm* comm, uint32_t signal0, int nSignals, + uint32_t counter0, int nCounters) { + struct ncclGinState* ginState = &comm->sharedRes->ginState; + if (nSignals != 0) ncclSpaceFree(&ginState->signalSpace, signal0, nSignals); + if (nCounters != 0) ncclSpaceFree(&ginState->counterSpace, counter0, nCounters); + return ncclSuccess; +} + +ncclResult_t ncclGinQueryLastError(struct ncclGinState* ginState, bool* hasError) { + bool hasError_ = false; + for (int n = 0; n < ginState->ginCommCount; n++) { + if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) + NCCLCHECK(ncclGinProxyQueryLastError(ginState->ncclGin, ginState->ginCtx[n], &hasError_)); + else + NCCLCHECK(ginState->ncclGin->queryLastError(ginState->ginCtx[n], &hasError_)); + if (hasError_) break; + } + *hasError = hasError_; + return ncclSuccess; +} diff --git a/projects/rccl/src/gin/gin_host_proxy.cc b/projects/rccl/src/gin/gin_host_proxy.cc new file mode 100644 index 00000000000..511e38b409e --- /dev/null +++ b/projects/rccl/src/gin/gin_host_proxy.cc @@ -0,0 +1,501 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include "nccl.h" +#include "comm.h" +#include "gin/gin_host.h" +#include "alloc.h" +#include "checks.h" +#include "gdrwrap.h" +#include "plugin/nccl_net.h" +#include "nccl_device/gin/proxy/gin_proxy_device_host_common.h" + +NCCL_PARAM(GinProxyQueueSize, "GIN_PROXY_QUEUE_SIZE", -1); +extern int64_t ncclParamIbDataDirect(); +extern int64_t ncclParamDmaBufEnable(); + +struct ginProxyGfdState { + ncclGinProxyOp_t op; + uint16_t counterId; + int done; + void *request; +}; + +// a member might be on the GPU, if it has a *GdrHandle counterpart +struct ginProxyHostGpuCtx { + size_t queueSize; + + // size = nRanks * queueSize + ncclGinProxyGfd_t *queues; + void *cisGdrHandle; + // Consumed Indices, one per rank + uint32_t *cis; + // to decrease the number of reads/writes to cis which might be on the GPU + uint32_t *cisShadow; + // Seen Indices one per rank + uint32_t *sis; + + // same size as queues + struct ginProxyGfdState *states; + // same size as queues + uint64_t *inlines; + // inlines is registered as a memory region with the GIN plugin + void *inlinesMhandle; + void *inlinesGinHandle; +}; + +struct ginProxyCtx { + struct ncclComm *comm; + void *collComm; + ncclNetDeviceHandle_v11_t *devHandle; + ncclNetProperties_t props; + + // GPU queues, if GDR on the GPU, else on the CPU + // Queue size, must be a power of 2 + struct ginProxyHostGpuCtx *hostGpuCtx; + + void *countersGdrHandle; + uint64_t *counters; + uint64_t *countersDev; + CUmemGenericAllocationHandle signalsCumemhandle; + void *signalsMhandle; + void *signalsGinHandle; + uint64_t *signalsDev; + int hasError; +}; + +// Depending on GDR, allocate memory on the CPU or GPU. +// host_flags is not used for now, but it is here for future use. +template +static ncclResult_t allocMemCPUAccessible(T **ptr, T **devPtr, size_t nelem, int host_flags, + void **gdrHandle, bool forceHost = false) { + if (ncclGdrCopy && !forceHost) { + NCCLCHECK(ncclGdrCudaCalloc(ptr, devPtr, nelem, gdrHandle)); + } else { + NCCLCHECK(ncclCuMemHostAlloc((void **)ptr, NULL, nelem * sizeof(T))); + memset((void *)*ptr, 0, nelem * sizeof(T)); + *devPtr = *ptr; + if (gdrHandle) *gdrHandle = NULL; // Mark as host allocated by nulling GDR handle + } + return ncclSuccess; +} + +// Depending on GDR, free memory on the CPU or GPU. +template +static ncclResult_t freeMemCPUAccessible(T *ptr, void *gdrHandle) { + if (gdrHandle != NULL) { // If a GDR handle exists, it was GDR memory + NCCLCHECK(ncclGdrCudaFree(gdrHandle)); + } else { // Otherwise, it was host memory (or GDR was off) + NCCLCHECK(ncclCuMemHostFree(ptr)); + } + return ncclSuccess; +} + +static ncclResult_t getDmaBufFd(void *addr, size_t length, int *fd, + bool forceNonDataDirect = false) { + if (ncclParamDmaBufEnable() == 0) return ncclInvalidUsage; + +#if CUDA_VERSION >= 11070 + static size_t hostPageSize = sysconf(_SC_PAGESIZE); + size_t alignedSize = length; + ALIGN_SIZE(alignedSize, hostPageSize); + +#if CUDA_VERSION >= 12080 + if (ncclParamIbDataDirect() && !forceNonDataDirect) { + CUresult status = pfn_cuMemGetHandleForAddressRange( + (void *)fd, (CUdeviceptr)addr, alignedSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, + CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE); + if (status == CUDA_SUCCESS) return ncclSuccess; + } +#endif + CUresult status = pfn_cuMemGetHandleForAddressRange((void *)fd, (CUdeviceptr)addr, alignedSize, + CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0); + if (status == CUDA_SUCCESS) return ncclSuccess; +#endif + + return ncclInvalidUsage; +} + +static ncclResult_t proxyGinPollCompletions(ncclGin_t *ginComm, void *collComm, + struct ginProxyCtx *ctx, + struct ginProxyHostGpuCtx *hostGpuCtx) { + for (int targetRank = 0; targetRank < ctx->comm->nRanks; targetRank++) { + // loop on all seen but unconsumed GFDs + for (uint32_t i = hostGpuCtx->cisShadow[targetRank]; i < hostGpuCtx->sis[targetRank]; i++) { + uint32_t idx = i & (hostGpuCtx->queueSize - 1); + struct ginProxyGfdState *state = + &hostGpuCtx->states[targetRank * hostGpuCtx->queueSize + idx]; + // no need to poll if already done + if (!state->done) { + ginComm->test(collComm, state->request, &state->done); + if (state->done) { + TRACE(NCCL_NET, "GFD completed - stateIdx: %lu, request: %p", state - hostGpuCtx->states, + state->request); + // update the counter specified in the GFD + if (state->op & ncclGinProxyOpWithCounter) { + __atomic_store_n(&ctx->counters[state->counterId], ctx->counters[state->counterId] + 1, + __ATOMIC_RELAXED); + TRACE(NCCL_NET, "Updated counter %d to %ld", state->counterId, + ctx->counters[state->counterId]); + } + } + } + // allow holes in the CI space to get resolved + if (state->done && i == hostGpuCtx->cisShadow[targetRank]) { + // tell the GPU that we have consumed the GFD + __atomic_store_n(&hostGpuCtx->cis[targetRank], ++hostGpuCtx->cisShadow[targetRank], + __ATOMIC_RELAXED); + TRACE(NCCL_NET, "Updated cis[%u] to %u", targetRank, hostGpuCtx->cisShadow[targetRank]); + } + } + } + + return ncclSuccess; +} + +static int proxyGinPollGfd(struct ginProxyCtx *ctx, ginProxyHostGpuCtx *hostGpuCtx, int targetRank, + ncclGinProxyGfd_t *gfd, struct ginProxyGfdState **state) { + ncclGinProxyGfd_t *q = hostGpuCtx->queues + targetRank * hostGpuCtx->queueSize; + uint32_t idx = hostGpuCtx->sis[targetRank] & (hostGpuCtx->queueSize - 1); + ncclGinProxyQword_t qword; + __atomic_load(&q[idx].qword[ncclGinProxyGfdHeader].raw, &qword.raw, __ATOMIC_RELAXED); + if (qword.flag.v == 0) { + return 0; + } + + // We know for sure that the first qword is there, copy it. + gfd->qword[ncclGinProxyGfdHeader] = q[idx].qword[ncclGinProxyGfdHeader]; + // Wait for and copy the other qwords. + for (int k = 1; k < ncclGinProxyGfdQwords; k++) { + do { + __atomic_load(&q[idx].qword[k].raw, &qword.raw, __ATOMIC_RELAXED); + } while (qword.flag.v == 0); + gfd->qword[k] = qword; + } + // Now we have the full GFD in the local struct. + + // Reset the GFD in the queue. This lets the producer know that the GFD is consumed. + for (int k = 0; k < ncclGinProxyGfdQwords; k++) { + __atomic_store_n(&q[idx].qword[k].raw, 0, __ATOMIC_RELAXED); + } + + // set the counter_id into the state + uint32_t stateIdx = targetRank * hostGpuCtx->queueSize + idx; + *state = &hostGpuCtx->states[stateIdx]; + (*state)->op = (ncclGinProxyOp_t)(gfd->qword[ncclGinProxyGfdHeader].header.op); + (*state)->counterId = gfd->qword[ncclGinProxyGfdCompletion].completion.counterId; + (*state)->done = 0; + (*state)->request = NULL; + + TRACE(NCCL_NET, + "GFD to target PE %d raw idx: %u, idx: %u - op: %#lx, size: %lu, srcOff: %lu, dstOff: %lu, " + "srcHandle: %lu, dstHandle: %lu, counterId: %u, signalId: %u, stateIdx: %u", + targetRank, hostGpuCtx->sis[targetRank], idx, gfd->qword[ncclGinProxyGfdHeader].header.op, + gfd->qword[ncclGinProxyGfdHeader].header.size, + gfd->qword[ncclGinProxyGfdSrcOff].srcOff.srcOff, + gfd->qword[ncclGinProxyGfdDstOff].dstOff.dstOff, + gfd->qword[ncclGinProxyGfdSrcHandle].srcHandle.srcHandle, + gfd->qword[ncclGinProxyGfdDstHandle].dstHandle.dstHandle, + gfd->qword[ncclGinProxyGfdCompletion].completion.counterId, + gfd->qword[ncclGinProxyGfdCompletion].completion.signalId, stateIdx); + + hostGpuCtx->sis[targetRank]++; + + return 1; +} + +static int mapGfdOpToCollNetOp(ncclGinProxyGfd_t *gfd) { + switch (gfd->qword[ncclGinProxyGfdHeader].header.op & + (ncclGinProxyOpComplMask & ~ncclGinProxyOpWithCounter)) { + case ncclGinProxyOpWithSignalInc: + return NCCL_NET_SIGNAL_OP_INC; + case ncclGinProxyOpWithSignalAdd: + return NCCL_NET_SIGNAL_OP_ADD; + default: + return -1; + } +} + +static ncclResult_t proxyGinProcessGfd(ncclGin_t *ginComm, void *collComm, struct ginProxyCtx *ctx, + struct ginProxyHostGpuCtx *hostGpuCtx, int targetRank, + ncclGinProxyGfd_t *gfd, struct ginProxyGfdState *state) { + int signalOp; + uint64_t signalVal; + + uint64_t size = gfd->qword[ncclGinProxyGfdHeader].header.size; + uint64_t srcOff; + void *srcHandle; + if (gfd->qword[ncclGinProxyGfdHeader].header.op & ncclGinProxyOpWithInline) { + uint64_t *inlineVal = &hostGpuCtx->inlines[gfd - hostGpuCtx->queues]; + srcOff = (uint64_t)&inlineVal[0] - (uint64_t)hostGpuCtx->inlines; + // reconstruct the inline value from the two qwords + *inlineVal = gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.inlineValLow; + if (size == 8) { + *inlineVal |= (uint64_t)gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.inlineValLow2 << 32; + *inlineVal |= (uint64_t)gfd->qword[ncclGinProxyGfdInlineHigh].inlineHigh.inlineValHigh << 48; + } + srcHandle = hostGpuCtx->inlinesMhandle; + } else { + srcOff = gfd->qword[ncclGinProxyGfdSrcOff].srcOff.srcOff; + srcHandle = (void *)(uint64_t)gfd->qword[ncclGinProxyGfdSrcHandle].srcHandle.srcHandle; + } + uint64_t dstOff = gfd->qword[ncclGinProxyGfdDstOff].dstOff.dstOff; + void *dstHandle = (void *)(uint64_t)gfd->qword[ncclGinProxyGfdDstHandle].dstHandle.dstHandle; + + switch (gfd->qword[ncclGinProxyGfdHeader].header.op & ncclGinProxyOpBaseMask) { + case ncclGinProxyOpPut: + signalOp = mapGfdOpToCollNetOp(gfd); + if (signalOp == -1) { + // First cast from 63 bits to 64 bits and then to void * to avoid warnings + NCCLCHECK(ginComm->iput(collComm, srcOff, srcHandle, size, dstOff, dstHandle, + targetRank, &state->request)); + } else { + // reconstruct the signal value from the two qwords + signalVal = gfd->qword[ncclGinProxyGfdCompletion].completion.signalValLow; + signalVal |= (uint64_t)gfd->qword[ncclGinProxyGfdSignalVal].signalVal.signalValLow2 << 16; + signalVal |= (uint64_t)gfd->qword[ncclGinProxyGfdSignalVal].signalVal.signalValHigh << 32; + uint64_t signalOff = + gfd->qword[ncclGinProxyGfdCompletion].completion.signalId * sizeof(uint64_t); + NCCLCHECK(ginComm->iputSignal(collComm, srcOff, srcHandle, size, dstOff, dstHandle, + targetRank, signalOff, ctx->signalsGinHandle, signalVal, + signalOp, &state->request)); + } + break; + default: + // this error should already have been checked in pollGfd + assert(0); + } + TRACE(NCCL_NET, "GFD submitted into GIN plugin - stateIdx: %lu, request: %p", + state - hostGpuCtx->states, state->request); + return ncclSuccess; +} + +static uint64_t isPowerOfTwo(uint64_t n) { return (n > 0) && ((n & (n - 1)) == 0); } + +// Check if the GIN plugin supports DMA-BUF, if so we can try to get the DMA-BUF handle from CUDA, +// if that fails we fallback to non-DMA-BUF +static ncclResult_t ncclGinProxyRegMrSym(ncclGin_t *ginComm, struct ginProxyCtx *ctx, void *addr, + size_t size, int type, int mr_flags, void **mhandle, + void **ginHandle) { + if (type == NCCL_PTR_HOST) { + NCCLCHECK(ginComm->regMrSym(ctx->collComm, addr, size, type, mr_flags, mhandle, ginHandle)); + } else if (type == NCCL_PTR_CUDA) { + ncclResult_t dmabufResult = ncclInvalidUsage; + if (ncclParamDmaBufEnable() && (ctx->props.ptrSupport & NCCL_PTR_DMABUF)) { + ncclResult_t registrationResult = ncclSuccess; + int dmabufFd = -1; + dmabufResult = getDmaBufFd(addr, size, &dmabufFd); + if (dmabufResult == ncclSuccess) { + registrationResult = ginComm->regMrSymDmaBuf(ctx->collComm, addr, size, type, 0, dmabufFd, + mr_flags, mhandle, ginHandle); + close(dmabufFd); + } + if (registrationResult != ncclSuccess) { + dmabufFd = -1; + dmabufResult = getDmaBufFd(addr, size, &dmabufFd, true); + if (dmabufResult == ncclSuccess) { + NCCLCHECK(ginComm->regMrSymDmaBuf(ctx->collComm, addr, size, type, 0, dmabufFd, + mr_flags, mhandle, ginHandle)); + close(dmabufFd); + } + } + } + // Fallback to non-DMA-BUF if the DMA-BUF handle is not supported + if (dmabufResult != ncclSuccess) { + NCCLCHECK(ginComm->regMrSym(ctx->collComm, addr, size, type, mr_flags, mhandle, ginHandle)); + } + } else { + return ncclInvalidUsage; + } + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyCreateContext(struct ncclComm *comm, void *collComm, int devId, + int nSignals, int nCounters, void **outGinCtx, + ncclNetDeviceHandle_v11_t **outDevHandle) { + ncclGin_t *ginComm = (ncclGin_t *)comm->sharedRes->ginState.ncclGin; + + if (!ncclGdrCopy) + INFO(NCCL_NET, "GIN Proxy will not be using GDRCopy"); + + struct ginProxyCtx *proxyCtx = NULL; + NCCLCHECK(ncclCalloc(&proxyCtx, 1)); + + proxyCtx->comm = comm; + proxyCtx->collComm = collComm; + + // Sanitize the queue size + NCCLCHECK(ginComm->getProperties(devId, &proxyCtx->props)); + uint64_t queueSize = ncclParamGinProxyQueueSize(); + uint32_t maxRequests = NCCL_NET_MAX_REQUESTS * proxyCtx->props.maxRecvs; + if (queueSize == -1) { + queueSize = maxRequests; + } + if (queueSize > maxRequests) { + INFO(NCCL_NET, + "NCCL_GIN_PROXY_QUEUE_SIZE is greater than the maximum outstanding requests in the GIN " + "plugin (%d), using the default/maximum value instead", + maxRequests); + queueSize = maxRequests; + } + if (queueSize < 1) { + INFO(NCCL_NET, + "NCCL_GIN_PROXY_QUEUE_SIZE is less than 1, using the default/maximum value instead"); + queueSize = maxRequests; + } + if (!isPowerOfTwo(queueSize)) { + INFO( + NCCL_NET, + "NCCL_GIN_PROXY_QUEUE_SIZE is not a power of two, using the default/maximum value instead"); + queueSize = maxRequests; + } + + // Allocate the counters on the GPU or CPU depending on GDR + NCCLCHECK(allocMemCPUAccessible(&proxyCtx->counters, &proxyCtx->countersDev, nCounters, + CU_MEMHOSTALLOC_WRITECOMBINED, + &proxyCtx->countersGdrHandle)); + + // Allocate the signals on the GPU and then register the memory region with the GIN plugin. + // Enforcing strong ordering on the signals mr is vital to ensure ordering between puts and + // signals. + size_t signalsBufSize = nSignals * sizeof(uint64_t); + NCCLCHECK(ncclCuMemAlloc((void **)&proxyCtx->signalsDev, &proxyCtx->signalsCumemhandle, + CU_MEM_HANDLE_TYPE_NONE, signalsBufSize)); + CUDACHECK(cudaMemset(proxyCtx->signalsDev, 0, signalsBufSize)); + NCCLCHECK(ncclGinProxyRegMrSym(ginComm, proxyCtx, proxyCtx->signalsDev, signalsBufSize, + NCCL_PTR_CUDA, NCCL_NET_MR_FLAG_FORCE_SO, + &proxyCtx->signalsMhandle, &proxyCtx->signalsGinHandle)); + + NCCLCHECK(ncclCalloc(&proxyCtx->hostGpuCtx, 1)); + struct ginProxyHostGpuCtx *hostGpuCtx = proxyCtx->hostGpuCtx; + hostGpuCtx->queueSize = queueSize; + size_t queuesLength = hostGpuCtx->queueSize * comm->nRanks; + NCCLCHECK(ncclCalloc(&hostGpuCtx->states, queuesLength)); + NCCLCHECK(ncclCalloc(&hostGpuCtx->cisShadow, comm->nRanks)); + NCCLCHECK(ncclCalloc(&hostGpuCtx->sis, comm->nRanks)); + NCCLCHECK(ncclCalloc(&hostGpuCtx->inlines, queuesLength)); + NCCLCHECK(ncclGinProxyRegMrSym(ginComm, proxyCtx, hostGpuCtx->inlines, + queuesLength * sizeof(uint64_t), NCCL_PTR_HOST, 0, + &hostGpuCtx->inlinesMhandle, &hostGpuCtx->inlinesGinHandle)); + + ncclGinProxyGpuCtx_t devGpuCtx_h; + devGpuCtx_h.nranks = comm->nRanks; + devGpuCtx_h.queueSize = hostGpuCtx->queueSize; + devGpuCtx_h.counters = proxyCtx->countersDev; + devGpuCtx_h.signals = proxyCtx->signalsDev; + NCCLCHECK(ncclCudaCalloc(&devGpuCtx_h.pis, comm->nRanks)); + + // Allocate the GFD queues, CIs, counters, signals and test/wait variables on the either the CPU + // or GPU. + NCCLCHECK(allocMemCPUAccessible(&hostGpuCtx->queues, &devGpuCtx_h.queues, queuesLength, 0, + NULL, true /*forceHost*/)); + NCCLCHECK(allocMemCPUAccessible(&hostGpuCtx->cis, &devGpuCtx_h.cis, comm->nRanks, + CU_MEMHOSTALLOC_WRITECOMBINED, &hostGpuCtx->cisGdrHandle)); + + ncclGinProxyGpuCtx_t *devGpuCtx_d = NULL; + NCCLCHECK(ncclCudaCalloc(&devGpuCtx_d, 1)); + // Copy the proxy's devGpuCtx to the GPU + NCCLCHECK(ncclCudaMemcpy(devGpuCtx_d, &devGpuCtx_h, 1)); + + ncclNetDeviceHandle_v11_t *devHandle = NULL; + NCCLCHECK(ncclCalloc(&devHandle, 1)); + devHandle->netDeviceType = NCCL_NET_DEVICE_GIN_PROXY; + devHandle->netDeviceVersion = NCCL_GIN_PROXY_VERSION; + devHandle->handle = (void *)devGpuCtx_d; + devHandle->size = 0; + devHandle->needsProxyProgress = 1; + + proxyCtx->devHandle = devHandle; + + *outDevHandle = devHandle; + *outGinCtx = proxyCtx; + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyRegister(ncclGin_t *ginComm, void *ginCtx, void *addr, size_t size, + int type, int mr_flags, void **mhandle, void **ginHandle) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + // Register the memory region with the GIN plugin + NCCLCHECK(ncclGinProxyRegMrSym(ginComm, ctx, addr, size, type, mr_flags, mhandle, ginHandle)); + return ncclSuccess; +} + +ncclResult_t ncclGinProxyDeregister(ncclGin_t *ginComm, void *ginCtx, void *mhandle) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + // Deregister the memory region with the GIN plugin + NCCLCHECK(ginComm->deregMrSym(ctx->collComm, mhandle)); + return ncclSuccess; +} + +ncclResult_t ncclGinProxyDestroyContext(ncclGin_t *ginComm, void *ginCtx) { + if (!ginCtx) return ncclSuccess; + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + + // Free counters + if (ctx) { + if (ctx->counters || ctx->countersGdrHandle) + freeMemCPUAccessible(ctx->counters, ctx->countersGdrHandle); + + // Free signals + if (ginComm && ctx->collComm && ctx->signalsMhandle) + ginComm->deregMrSym(ctx->collComm, ctx->signalsMhandle); + if (ctx->signalsDev) ncclCudaFree(ctx->signalsDev); + + // Free hostGpuCtx and its allocations + struct ginProxyHostGpuCtx *hostGpuCtx = ctx->hostGpuCtx; + if (hostGpuCtx) { + if (hostGpuCtx->cisShadow) free(hostGpuCtx->cisShadow); + if (hostGpuCtx->sis) free(hostGpuCtx->sis); + if (hostGpuCtx->states) free(hostGpuCtx->states); + if (hostGpuCtx->inlines) free(hostGpuCtx->inlines); + if (ginComm && ctx->collComm && hostGpuCtx->inlinesMhandle) + ginComm->deregMrSym(ctx->collComm, hostGpuCtx->inlinesMhandle); + if (hostGpuCtx->queues) freeMemCPUAccessible(hostGpuCtx->queues, NULL); + if (hostGpuCtx->cis || hostGpuCtx->cisGdrHandle) + freeMemCPUAccessible(hostGpuCtx->cis, hostGpuCtx->cisGdrHandle); + free(hostGpuCtx); + } + + ncclNetDeviceHandle_v11_t *devHandle = (ncclNetDeviceHandle_v11_t *)ctx->devHandle; + if (devHandle) { + if (devHandle->handle) ncclCudaFree((void *)devHandle->handle); + free(devHandle); + } + + free(ctx); + } + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyProgress(ncclGin_t *ginComm, void *ginCtx) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + + NCCLCHECK(proxyGinPollCompletions(ginComm, ctx->collComm, ctx, ctx->hostGpuCtx)); + for (int targetRank = 0; targetRank < ctx->comm->nRanks; targetRank++) { + // Poll on the GFD queue + ncclGinProxyGfd_t gfd; + struct ginProxyGfdState *state = NULL; + if (proxyGinPollGfd(ctx, ctx->hostGpuCtx, targetRank, &gfd, &state)) { + ncclResult_t ret = + proxyGinProcessGfd(ginComm, ctx->collComm, ctx, ctx->hostGpuCtx, targetRank, &gfd, state); + if (ret) ctx->hasError = ret; + NCCLCHECK(ret); + } + if (ginComm->ginProgress) ginComm->ginProgress(ctx->collComm); + } + + return ncclSuccess; +} + +ncclResult_t ncclGinProxyQueryLastError(ncclGin_t *ginComm, void *ginCtx, bool *hasError) { + struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx; + *hasError = ctx->hasError; + return ncclSuccess; +} diff --git a/projects/rccl/src/graph/paths.cc b/projects/rccl/src/graph/paths.cc index ae44d4c0fc0..a9517f9531f 100644 --- a/projects/rccl/src/graph/paths.cc +++ b/projects/rccl/src/graph/paths.cc @@ -271,14 +271,18 @@ ncclResult_t ncclGetUserP2pLevel(int* level) { return ncclSuccess; } +// Tests two ranks for CUDA P2P connectivity. +// *cudaP2p returns 1 if CUDA P2P between the ranks is supported. +// *p2p returns 1 only if the distance between the ranks is no greater than NCCL_P2P_LEVEL. The connection may go through an intermediate rank. ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, - int* p2p, int *read, int* intermediateRank) { + int* p2p, int *read, int* intermediateRank, int* cudaP2p) { int mnnvl = 0; struct ncclPeerInfo* info1 = NULL; struct ncclPeerInfo* info2 = NULL; *p2p = 0; if (read) *read = 0; if (intermediateRank) *intermediateRank = -1; + if (cudaP2p) *cudaP2p = 0; // Rule out different nodes / isolated containers if (comm) { @@ -341,10 +345,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__) if (*p2p == 1) { - // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to - // validate against NVML at all since they are pretending to be on other hw. - if (g1 != g2 && (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash && - info1->hostHash == info2->hostHash)) && ncclParamIgnoreDisabledP2p() != 2) { + if (checkNvml) { int indexes[3] = {-1,-1,-1}; int verticeN = 0; NCCLCHECK(ncclNvmlEnsureInitialized()); @@ -381,6 +382,26 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1; } +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__) + if (cudaP2p) { + if (checkNvml) { + int n1, n2; + n1 = system->nodes[GPU].nodes[g1].gpu.dev; + n2 = system->nodes[GPU].nodes[g2].gpu.dev; + *cudaP2p = (ncclNvmlDevicePairs[n1][n2].p2pStatusRead == NVML_P2P_STATUS_OK && + ncclNvmlDevicePairs[n1][n2].p2pStatusWrite == NVML_P2P_STATUS_OK); + } else { + // We assume P2P connectivity in case the ranks are connected using MNNVL or are on the same host. + *cudaP2p = (mnnvl || comm == NULL || info1->hostHash == info2->hostHash); + } + } +#else + if (cudaP2p) { + // On AMD/HIP, assume P2P connectivity based on MNNVL or same host + *cudaP2p = (mnnvl || comm == NULL || info1->hostHash == info2->hostHash); + } +#endif + return ncclSuccess; } @@ -632,7 +653,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, struct ncclTopoSystem* system = comm->topo; *nranks = 0; *intermediateRanks = NULL; - if (system->nodes[NET].count == 0) return ncclSuccess; + if (system->inter == 0) return ncclSuccess; int nr = 0; int* ranks = NULL; @@ -715,7 +736,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm for (int p=0; pnodes[GPU].count; p++) { int p2p; NCCLCHECK(ncclTopoCheckP2p(comm, system, system->nodes[GPU].nodes[p].gpu.rank, - system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL)); + system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL, NULL)); if (p2p == 0) { // Divert all traffic through the CPU int cpu; @@ -926,6 +947,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* for (int n=system->nodes[NET].count-1; n>=0; n--) NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail); } + system->inter = system->nodes[GPU].count == comm->nRanks ? 0 : 1; exit: free(domains); if (ids) free(ids); diff --git a/projects/rccl/src/graph/rings.cc b/projects/rccl/src/graph/rings.cc index 553554e2b79..382ba32ea19 100644 --- a/projects/rccl/src/graph/rings.cc +++ b/projects/rccl/src/graph/rings.cc @@ -31,6 +31,11 @@ void dumpLine(int* values, int nranks, const char* prefix) { } ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { + ncclResult_t ret = ncclSuccess; + uint64_t* rankFound; + int rankFoundSize = DIVUP(nranks, 64); + NCCLCHECK(ncclCalloc(&rankFound, rankFoundSize)); + for (int r=0; rmaxBw = 0.0; system->totalBw = 0.0; - int inter = system->nodes[NET].count; + int inter = system->inter; if (inter == 0 && system->nodes[GPU].count == 1) { system->maxBw = LOC_BW; system->totalBw = LOC_BW; @@ -533,14 +533,14 @@ static ncclResult_t ncclTopoPrefNetsChannelFirst(struct ncclTopoSystem* system, return ncclSuccess; } -// Build a sorted list of the NETs to try. +// Build a sorted list of the NETs to try, the list will follow the NETDEVS_POLICY set by the user. // -// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu -// index when trying to get back to the NIC. +// The value of "gpu" can be set to -1 to build a list suitable for all GPUs (for example for the search start). +// The value of "gpu" can be set to the desired index when trying to get back to the NIC. // // The list is built the following way: -// 1. Select NETs starting with those close to GPU(s), based on paths[n].type. -// 2. add other NETs satisfying typeInter but not already in the list. +// 1. First gather the preferred NETs for each of the GPU(s), based on the NETDEVS_POLICY and the connection. +// 2. If the NETDEV_policy allows it, add all the other NETs satisfying typeInter but not already in the list of preferred NETs. NCCL_PARAM(ScatterEnable, "MNNVL_SCATTER_NETS_ENABLE", 1); ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) { ncclResult_t ret = ncclSuccess; @@ -555,9 +555,19 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in NCCLCHECK(ncclTopoPrefNetsChannelFirst(system, gpu, nets, &netCount)); } + // Get the maximum of network devices allowed, depending on the policy. + // If the policy is not MAX, then allow all devices. + int maxDevCount = 0; + enum netDevsPolicy netDevsPolicy; + NCCLCHECK(ncclTopoGetNetDevsPolicy(&netDevsPolicy, &maxDevCount)); + if (gpu == -1) maxDevCount *= system->nodes[GPU].count; + if (netDevsPolicy != NETDEVS_POLICY_MAX) maxDevCount = NCCL_TOPO_MAX_NODES; + if (netCount >= maxDevCount) goto exit; + // Then add others satisfying typeInter for (int t=0; t <= typeInter; t++) { for (int g = 0; g < system->nodes[GPU].count; g++) { + // do not consider this GPU is it's not the GPU we asked for if (gpu != -1 && gpu != g) continue; int localNetCount = 0, localNets[MAXCHANNELS]; struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; @@ -569,16 +579,37 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in for (int i=0; i= maxDevCount) goto exit; } } } +exit: *netCountRet = netCount; return ret; } +NCCL_PARAM(MnnvlRailPerHost, "MNNVL_RAIL_PER_HOST", 0); + +static bool ncclTopoSearchCheckNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* startNet, int n, int step) { + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) return false; // Trees are symmetric + if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) { + if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels - 1) * 2]) return false; + } else if (graph->crossNic == 0) { + if (ncclParamMnnvlRailPerHost() && NCCL_TOPO_ID_SYSTEM_ID(net->id) != NCCL_TOPO_ID_SYSTEM_ID(startNet->id)) { + // Different hosts in an MNNVL system: rail are per host and identified with the PCI id. + if (net->net.pciId != startNet->net.pciId || net->net.port != startNet->net.port) return false; + } else { + if (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port) return false; + } + } + if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE && step != 0 && net->id != graph->inter[graph->nChannels*2+1]) return false; + return true; +} + ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) { if ((*time) <= 0) return ncclSuccess; (*time)--; @@ -604,7 +635,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo int nets[NCCL_TOPO_MAX_NODES]; if (step == backToNet) { // first get back to NIC - if (system->nodes[NET].count) { + if (system->inter) { int startNetIndex; NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex)); struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex; @@ -612,24 +643,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount)); for (int i=0; inodes[NET].nodes+n; - if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric - if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) { - if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue; - } else { - if (graph->crossNic == 0 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue; - } - + if (!ncclTopoSearchCheckNet(system, graph, startNet, n, step)) continue; // Balanced Tree : count half of the bandwidth on first two GPUs int nextBackToNet = -1; float bwInterSave = graph->bwInter; if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) { // Count half of the bandwidth on each of the first two GPUs if (step == 0) nextBackToNet = 1; - else if (net->id != graph->inter[graph->nChannels*2+1]) continue; graph->bwInter /= 2; } + struct ncclTopoNode* net; NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net)); graph->bwInter = bwInterSave; if (net) { @@ -927,7 +951,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struc int* intra = graph->intra+ngpus*c; NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel)); struct ncclXmlNode* node; - if (system->nodes[NET].count) { + if (system->inter) { NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); NCCLCHECK(xmlSetAttrLong(node, "dev", inter[0])); } @@ -947,7 +971,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struc NCCLCHECK(xmlSetAttrLong(node, "dev", dev)); if (graph->id == 3) break; // NVLS graphs only use the first GPU } - if (system->nodes[NET].count) { + if (system->inter) { NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node)); NCCLCHECK(xmlSetAttrLong(node, "dev", inter[1])); } @@ -1039,7 +1063,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra)); NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra)); } - if (system->nodes[NET].count > 0) { + if (system->inter) { NCCLCHECK(ncclTopoGetGpuMinPath(system, NET, &minTypeInter)); NCCLCHECK(ncclTopoGetGpuMaxPath(system, NET, &maxTypeInter)); maxTypeIntra = maxTypeInter; @@ -1124,7 +1148,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE; - if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { + if (system->inter == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) { // Force intra-node NVLS algorithm to pull evenly from all GPUs. graph->minChannels = graph->maxChannels; } @@ -1144,7 +1168,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph // First try crossnic, then decrease bw and finally increase bwIntra. int nspeeds = 0; float* speedArray = NULL; - if (system->nodes[NET].count == 0) { + if (system->inter == 0) { nspeeds = ccMin >= 100 ? NSPEEDSINTRA_SM100 : (ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA); speedArray = ccMin >= 100 ? sm100SpeedArrayIntra : (ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra); } else { @@ -1204,14 +1228,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } tmpGraph.pattern = graph->pattern; - int maxIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : maxTypeIntra; + int maxIntra = system->inter ? tmpGraph.typeInter : maxTypeIntra; if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) { tmpGraph.typeIntra += 1; if (tmpGraph.typeIntra < PATH_DIS) goto search; } tmpGraph.typeIntra = minTypeIntra; - if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { + if (system->inter && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) { tmpGraph.typeInter += 1; if (tmpGraph.typeInter < PATH_DIS) goto search; } diff --git a/projects/rccl/src/graph/topo.cc b/projects/rccl/src/graph/topo.cc index 326929d6cf9..781c84366cd 100644 --- a/projects/rccl/src/graph/topo.cc +++ b/projects/rccl/src/graph/topo.cc @@ -363,26 +363,39 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s int dev; NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev)); + int64_t netId = NCCL_TOPO_ID(systemId, dev); struct ncclTopoNode* net; - NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID(systemId, dev))); + NCCLCHECK(ncclTopoCreateNode(system, &net, NET, netId)); net->net.dev = dev; const char* str; + // if not guid is present use the net->id unique id instead, which will be unique within the node/NVLD NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str)); - if (str) sscanf(str, "0x%lx", &net->net.asic); - else net->net.asic = dev; + net->net.asic = (str) ? strtoull(str, NULL, 16) : netId; + - ncclDebugNoWarn = NCCL_GRAPH; int mbps; - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0)); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0), NCCL_GRAPH); if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1 net->net.bw = mbps / 8000.0; - if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0; - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0)); - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0)); - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS)); - NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0)); - net->net.busId = busId; - ncclDebugNoWarn = 0; + ncclResult_t ret; + NOWARN(ret = xmlGetAttrFloat(xmlNet, "latency", &net->net.latency), NCCL_GRAPH); + if (ret != ncclSuccess) net->net.latency = 0; + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0), NCCL_GRAPH); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0), NCCL_GRAPH); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS), NCCL_GRAPH); + NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0), NCCL_GRAPH); + net->net.busId = busId; // RCCL: keep this + + // build the PCI id using the parent PCI link + uint64_t hacc[2] = {1, 1}; + const char* pciBusId = NULL; + struct ncclXmlNode* parent = xmlNet->parent; + while (parent != NULL && strcmp(parent->name, "pci") != 0) parent = parent->parent; + if (parent) NCCLCHECK(xmlGetAttr(parent, "busid", &pciBusId)); + // If we fail to find the PCIe path, we use the GUID instead. + if (pciBusId) eatHash(hacc, pciBusId, strlen(pciBusId)); + else eatHash(hacc, &net->net.asic); + net->net.pciId = digestHash(hacc); NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw)); NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw)); @@ -1087,7 +1100,8 @@ ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, struct ncclTopoNetInfo* netIn // Trigger the merge, then get the new device's properties int vDevIndex = 0; - ncclResult_t ret = netInfo->makeVDevice(&vDevIndex, vProps); + ncclResult_t ret; + NOWARN(ret = netInfo->makeVDevice(&vDevIndex, vProps), NCCL_GRAPH|NCCL_INIT|NCCL_NET); if (ret != ncclSuccess) { INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.", vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]); @@ -1686,16 +1700,8 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c return ncclSuccess; } -enum netDevsPolicy { - NETDEVS_POLICY_AUTO = 0x0, - NETDEVS_POLICY_ALL = 0x1, - NETDEVS_POLICY_MAX = 0x2, - NETDEVS_POLICY_UNDEF = 0xffffffff -}; - -static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF; static int netDevsPolicyNum = -1; - +static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF; static void getNetDevsPolicyOnce() { const char* envStr = ncclGetEnv("NCCL_NETDEVS_POLICY"); if (envStr) { @@ -1718,6 +1724,18 @@ static void getNetDevsPolicyOnce() { if (netDevsPolicy == NETDEVS_POLICY_UNDEF) netDevsPolicy = NETDEVS_POLICY_AUTO; } +ncclResult_t ncclTopoGetNetDevsPolicy(enum netDevsPolicy* policy, int* policyNum) { + static pthread_once_t onceNetDevsPolicy = PTHREAD_ONCE_INIT; + pthread_once(&onceNetDevsPolicy, getNetDevsPolicyOnce); + if (netDevsPolicy == NETDEVS_POLICY_MAX && netDevsPolicyNum <= 0) { + WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum); + return ncclInternalError; + } + if (policy) *policy = netDevsPolicy; + if (policyNum && netDevsPolicyNum >= 0) *policyNum = netDevsPolicyNum; + return ncclSuccess; +} + ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) { int gpu; NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true)); @@ -1732,22 +1750,19 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch return ncclInternalError; } - static pthread_once_t once = PTHREAD_ONCE_INIT; - pthread_once(&once,getNetDevsPolicyOnce); int netsPerGpu = 0; - if (netDevsPolicy == NETDEVS_POLICY_AUTO) { + int policyCount = 0; + enum netDevsPolicy policy; + NCCLCHECK(ncclTopoGetNetDevsPolicy(&policy, &policyCount)); + if (policy == NETDEVS_POLICY_AUTO) { int localGpus[NCCL_TOPO_MAX_NODES]; int localGpuCount; NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL)); netsPerGpu = DIVUP(localNetCount, localGpuCount); - } else if (netDevsPolicy == NETDEVS_POLICY_ALL) { + } else if (policy == NETDEVS_POLICY_ALL) { netsPerGpu = localNetCount; - } else if (netDevsPolicy == NETDEVS_POLICY_MAX) { - if (netDevsPolicyNum <= 0) { - WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum); - return ncclInternalError; - } - netsPerGpu = std::min(netDevsPolicyNum, localNetCount); + } else if (policy == NETDEVS_POLICY_MAX) { + netsPerGpu = std::min(policyCount, localNetCount); } else { WARN("Unknown netDevs policy"); return ncclInternalError; @@ -1761,6 +1776,21 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch return ncclSuccess; } +ncclResult_t ncclTopoGetLocalNets(struct ncclTopoSystem* system, int rank, int64_t* localNets, int* localNetCount) { + int gpu; + NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true)); + int localNetIndexes[NCCL_TOPO_MAX_NODES]; + NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNetIndexes, localNetCount, NULL)); + + if (*localNetCount == 0) { + WARN("Could not find any local path from gpu %d to net.", gpu); + return ncclInternalError; + } + // Convert index to ids + for (int n=0; n<*localNetCount; n++) localNets[n] = system->nodes[NET].nodes[localNetIndexes[n]].id; + return ncclSuccess; +} + ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) { ncclResult_t ret = ncclSuccess; int netIndex; diff --git a/projects/rccl/src/graph/topo.h b/projects/rccl/src/graph/topo.h index 9a9fd5618a9..9e8c1f527c1 100644 --- a/projects/rccl/src/graph/topo.h +++ b/projects/rccl/src/graph/topo.h @@ -164,6 +164,7 @@ struct ncclTopoNode { }gpu; struct { int dev; // Plugin dev number + uint64_t pciId; uint64_t asic; int port; float bw; @@ -221,6 +222,7 @@ struct ncclTopoSystem { // [RCCL] Track hostIdx to support rail-optimized rings/trees int hostIdx; bool useRailOptimizedTrees; + int inter; /* RCCL Rome / GIO preset: RCCL_ROME_TOPO_PRESET_MODEL_IDX_* sentinels or romeTopoModels[] index */ int romeTopoModelIdx; /* Preset matchers assume uniform ranks per host; otherwise use generic search in ncclTopoCompute */ diff --git a/projects/rccl/src/graph/xml.cc b/projects/rccl/src/graph/xml.cc index ecf4d7dc608..368f55f41f3 100644 --- a/projects/rccl/src/graph/xml.cc +++ b/projects/rccl/src/graph/xml.cc @@ -591,32 +591,28 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* const char* busId; NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); char* path = NULL; - ncclDebugNoWarn = NCCL_GRAPH; - getPciPath(busId, &path); - ncclDebugNoWarn = 0; + NOWARN(getPciPath(busId, &path), NCCL_GRAPH); if (path) { NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); } int index; - ncclDebugNoWarn = NCCL_GRAPH; - NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "vendor", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"), NCCL_GRAPH); } - NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "device", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"), NCCL_GRAPH); } - NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"), NCCL_GRAPH); } - NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index)); + NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "subsystem_device", &index), NCCL_GRAPH); if (index == -1) { - if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"); + if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"), NCCL_GRAPH); } - ncclDebugNoWarn = 0; NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index)); if (index == -1) { if (path) { @@ -658,7 +654,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* NCCLCHECK(xmlGetAttr(pciNode, "vendor", &vendor)); if (vendor != NULL && strcmp(vendor, "0x1000") == 0) { // BCM switch, look for P2P connections int nlinks; - char* peers; + char* peers = NULL; NCCLCHECK(getBcmLinks(busId, &nlinks, &peers)); for (int l=0; lparent; @@ -980,9 +977,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev const char* busId; NCCLCHECK(xmlGetAttr(sub, "target", &busId)); char* path; - ncclDebugNoWarn = NCCL_GRAPH; - getPciPath(busId, &path); - ncclDebugNoWarn = 0; + NOWARN(getPciPath(busId, &path), NCCL_GRAPH); if (path == NULL || strcmp(busId, "fffffff:ffff:ff") == 0) { // Remote NVLink device is not visible inside this VM. Assume NVSwitch. NCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000")); diff --git a/projects/rccl/src/include/allocator.h b/projects/rccl/src/include/allocator.h index 05da29a62a9..eccb5b5cd72 100644 --- a/projects/rccl/src/include/allocator.h +++ b/projects/rccl/src/include/allocator.h @@ -7,6 +7,10 @@ #ifndef NCCL_ALLOCATOR_H_ #define NCCL_ALLOCATOR_H_ +#include "nccl.h" +#include +#include + //////////////////////////////////////////////////////////////////////////////// // ncclSpace: Allocates contiguous segments of non-negative integers. Useful // as a memory allocator when we can't put allocator state within the memory diff --git a/projects/rccl/src/include/checks.h b/projects/rccl/src/include/checks.h index 50c8f4c3ba2..f060ca1ed67 100644 --- a/projects/rccl/src/include/checks.h +++ b/projects/rccl/src/include/checks.h @@ -135,6 +135,21 @@ } \ } while (0) +#define NCCLCHECKNOWARN(call, FLAGS) do { \ + ncclResult_t RES; \ + NOWARN(RES = call, FLAGS); \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + return RES; \ + } \ +} while (0) + +#define NCCLCHECKGOTONOWARN(call, RES, label, FLAGS) do { \ + NOWARN(RES = call, FLAGS); \ + if (RES != ncclSuccess && RES != ncclInProgress) { \ + goto label; \ + } \ +} while (0) + #define NCCLWAIT(call, cond, abortFlagPtr) do { \ uint32_t* tmpAbortFlag = (abortFlagPtr); \ ncclResult_t RES = call; \ diff --git a/projects/rccl/src/include/comm.h b/projects/rccl/src/include/comm.h index fc677175c6a..2c6418d725a 100644 --- a/projects/rccl/src/include/comm.h +++ b/projects/rccl/src/include/comm.h @@ -154,6 +154,9 @@ struct ncclSharedResources { /* proxy related shared res */ struct ncclProxyState* proxyState; + + // GIN state + struct ncclGinState ginState; }; /** @@ -511,6 +514,7 @@ struct ncclComm { ncclNet_t* ncclNet; void* netContext; + void* ginContext; int netPluginIndex; int ncclNetVer; ncclNetDeviceType netDeviceType; @@ -524,7 +528,7 @@ struct ncclComm { int maxTreePattern; bool initAlgoChannels[NCCL_NUM_ALGORITHMS]; bool runtimeConn; // if dynamic connection is supported - bool directMode; + bool directMode; // if any process manages more than one local rank int cuMemSupport; uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches. @@ -624,6 +628,7 @@ struct ncclComm { uint32_t* childAbortFlag; uint32_t* childAbortFlagDev; uint32_t destroyFlag; + uint32_t revokedFlag; // Flags for enable P2P NET uint32_t p2pNet; @@ -757,7 +762,8 @@ struct ncclComm { // buffer registration cache struct ncclRegCache regCache; int isAllNvlink; - bool isAllDirectP2p; + bool isAllDirectP2p; // Subject to NCCL_P2P_LEVEL (for local ranks only). + bool isAllCudaP2p; // Raw CUDA capability (for local ranks only). int symmetricSupport; bool useNetPXN; bool useGdr; diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h index 457ba57e3d2..ae7731bc880 100644 --- a/projects/rccl/src/include/debug.h +++ b/projects/rccl/src/include/debug.h @@ -29,8 +29,29 @@ extern char ncclLastError[]; #define ERROR(...) ncclDebugLog(NCCL_LOG_ERROR, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) -#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) -#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__) + +#define NOWARN(EXPR, FLAGS) \ + do { \ + int oldNoWarn = ncclDebugNoWarn; \ + ncclDebugNoWarn = FLAGS; \ + (EXPR); \ + ncclDebugNoWarn = oldNoWarn; \ + } while(0) + +#define INFO(FLAGS, ...) \ + do{ \ + int level = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); \ + if((level >= NCCL_LOG_INFO && ((unsigned long)(FLAGS) & ncclDebugMask)) || (level < 0)) \ + ncclDebugLog(NCCL_LOG_INFO, (unsigned long)(FLAGS), __func__, __LINE__, __VA_ARGS__); \ + } while(0) + +#define TRACE_CALL(...) \ + do { \ + int level = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); \ + if((level >= NCCL_LOG_TRACE && (NCCL_CALL & ncclDebugMask)) || (level < 0)) { \ + ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__); \ + } \ + } while (0) #ifdef ENABLE_TRACE #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) diff --git a/projects/rccl/src/include/dev_runtime.h b/projects/rccl/src/include/dev_runtime.h index 5f6e66e3387..70bf77496be 100644 --- a/projects/rccl/src/include/dev_runtime.h +++ b/projects/rccl/src/include/dev_runtime.h @@ -52,6 +52,7 @@ struct ncclDevrState { int* lsaRankList; size_t granularity; // cuMemGetAllocationGranularity + bool ginEnabled; struct ncclDevrMemory* memHead; struct ncclDevrWindowSorted* winSorted; int winSortedCapacity, winSortedCount; diff --git a/projects/rccl/src/include/device.h b/projects/rccl/src/include/device.h index 7cfd5bcdc74..b65972b979f 100644 --- a/projects/rccl/src/include/device.h +++ b/projects/rccl/src/include/device.h @@ -82,7 +82,7 @@ extern const char* funcNames[]; #define NCCL_CUDA_ARCH_FAMILY_SPECIFIC 0 #endif -#include "net_device.h" +#include "nccl_device/net_device.h" enum ncclDevRedOp_t { ncclDevSum, ncclDevProd, ncclDevMinMax, @@ -245,6 +245,7 @@ struct ncclProxyConnector { int sameProcess; struct ncclProxyConnection* connection; ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary + ncclResult_t (*proxyGinProgress)(struct ncclProxyState* proxyState); }; struct ncclConnector { @@ -804,7 +805,8 @@ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) { // Host-side table of kernel function pointers. extern int const ncclDevKernelCount; -extern void* const ncclDevKernelList[/*ncclDevKernelCount*/]; +extern void* ncclDevKernelList[/*ncclDevKernelCount*/]; +extern int ncclDevKernelRequirements[/*ncclDevKernelCount*/]; // Table of most specialized kernel function to run given func index. extern int const ncclDevFuncRowToId[]; diff --git a/projects/rccl/src/include/env.h b/projects/rccl/src/include/env.h new file mode 100644 index 00000000000..0e00b31448a --- /dev/null +++ b/projects/rccl/src/include/env.h @@ -0,0 +1,23 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INT_ENV_H_ +#define NCCL_INT_ENV_H_ + +#include "nccl_env.h" + +// Initialize Env Plugin +ncclResult_t ncclEnvPluginInit(void); +// Finalize Env Plugin +void ncclEnvPluginFinalize(void); +// Env plugin get function for NCCL params, called in ncclGetEnv() +const char* ncclEnvPluginGetEnv(const char* name); + +bool ncclEnvPluginInitialized(void); + +ncclResult_t ncclInitEnv(void); + +#endif diff --git a/projects/rccl/src/include/gin/gin_host.h b/projects/rccl/src/include/gin/gin_host.h new file mode 100644 index 00000000000..d82a7950524 --- /dev/null +++ b/projects/rccl/src/include/gin/gin_host.h @@ -0,0 +1,54 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_GIN_HOST_H_ +#define _NCCL_GIN_HOST_H_ + +#include "allocator.h" +#include "nccl.h" +#include "nccl_net.h" +#include "nccl_device/gin/gin_device_host_common.h" +#include + +struct ncclGinState { + ncclGin_t* ncclGin; + void* ginInstance; + bool connected; + int ginType; + int ginCommCount; + void* ginComms[NCCL_GIN_MAX_CONTEXTS]; + void* ginCtx[NCCL_GIN_MAX_CONTEXTS]; + ncclNetDeviceHandle_t* ginDevHandles[NCCL_GIN_MAX_CONTEXTS]; + int needsProxyProgress; // Whether we need to progress GIN operations with the proxy + int ginProgress; // GIN progress is enabled + pthread_t thread; + pthread_mutex_t threadLock; + pthread_cond_t threadCond; + ncclResult_t asyncResult; + + int signalSpaceSize; + int counterSpaceSize; + ncclSpace signalSpace; + ncclSpace counterSpace; +}; + +extern int64_t ncclParamGinType(); + +// FIXME change to ncclGinState instead of ncclComm, no need to pass comm +ncclResult_t ncclGinConnectOnce(struct ncclComm* comm); +ncclResult_t ncclGinFinalize(struct ncclComm* comm); +ncclResult_t ncclGinProgress(struct ncclGinState* ginState); +ncclResult_t ncclGinRegister(struct ncclComm* comm, void* address, size_t size, + void* ginHostWins[NCCL_GIN_MAX_CONTEXTS], + ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]); +ncclResult_t ncclGinDeregister(struct ncclComm* comm, void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]); +ncclResult_t ncclGinAllocSignalsCounters(struct ncclComm* comm, int nSignals, uint32_t* outSignal0, + int nCounters, uint32_t* outCounter0); +ncclResult_t ncclGinFreeSignalsCounters(struct ncclComm* comm, uint32_t signal0, int nSignals, + uint32_t counter0, int nCounters); +ncclResult_t ncclGinQueryLastError(struct ncclGinState* ginState, bool* hasError); + +#endif diff --git a/projects/rccl/src/include/gin/gin_host_proxy.h b/projects/rccl/src/include/gin/gin_host_proxy.h new file mode 100644 index 00000000000..14e8b93ca15 --- /dev/null +++ b/projects/rccl/src/include/gin/gin_host_proxy.h @@ -0,0 +1,28 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef GIN_HOST_PROXY_H_ +#define GIN_HOST_PROXY_H_ + +#include +#include +#include +#include +#include "nccl.h" +#include "gin/gin_host.h" +#include "plugin/nccl_net.h" + +ncclResult_t ncclGinProxyCreateContext(struct ncclComm *comm, void *collComm, int devId, + int nSignals, int nCounters, void **outGinCtx, + ncclNetDeviceHandle_v11_t **outDevHandle); +ncclResult_t ncclGinProxyRegister(ncclGin_t *ginComm, void *ginCtx, void *addr, size_t size, + int type, int mr_flags, void **mhandle, void **ginHandle); +ncclResult_t ncclGinProxyDeregister(ncclGin_t *ginComm, void *ginCtx, void *mhandle); +ncclResult_t ncclGinProxyDestroyContext(ncclGin_t *ginComm, void *ginCtx); +ncclResult_t ncclGinProxyProgress(ncclGin_t *ginComm, void *ginCtx); +ncclResult_t ncclGinProxyQueryLastError(ncclGin_t *ginComm, void *ginCtx, bool *hasError); + +#endif diff --git a/projects/rccl/src/include/graph.h b/projects/rccl/src/include/graph.h index cfae66faf8a..910fa9f034c 100644 --- a/projects/rccl/src/include/graph.h +++ b/projects/rccl/src/include/graph.h @@ -35,7 +35,7 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm); // Query topology ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank); -ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank); +ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank, int* cudaP2p); ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret); enum ncclTopoGdrMode { ncclTopoGdrModeDisable = 0, @@ -80,9 +80,18 @@ ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count); ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev); +ncclResult_t ncclTopoGetLocalNets(struct ncclTopoSystem* system, int rank, int64_t* localNets, int* localNetCount); ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex); ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count); +enum netDevsPolicy { + NETDEVS_POLICY_AUTO = 0x0, + NETDEVS_POLICY_ALL = 0x1, + NETDEVS_POLICY_MAX = 0x2, + NETDEVS_POLICY_UNDEF = 0xffffffff +}; +ncclResult_t ncclTopoGetNetDevsPolicy(enum netDevsPolicy* policy, int* policyNum); + // Allows for up to 32 NICs per node on GB200-NVL72 #define NCCL_TOPO_MAX_NODES 64 ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType); diff --git a/projects/rccl/src/include/group.h b/projects/rccl/src/include/group.h index 8d5b072991d..3fcbca6f70d 100644 --- a/projects/rccl/src/include/group.h +++ b/projects/rccl/src/include/group.h @@ -78,6 +78,10 @@ extern __thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum]; extern __thread struct ncclComm* ncclGroupCommPreconnectHead; extern __thread int ncclGroupBlocking; +inline bool ncclGroupEnabled() { + return ncclGroupDepth != 0; +} + inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) { if (ncclGroupDepth > 0) { if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret; diff --git a/projects/rccl/src/include/nccl_device.h b/projects/rccl/src/include/nccl_device.h index 88b2531d19d..35e216c6288 100644 --- a/projects/rccl/src/include/nccl_device.h +++ b/projects/rccl/src/include/nccl_device.h @@ -4,12 +4,12 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "nccl_device/impl/comm__funcs.h" #include "nccl_device/coop.h" +#include "nccl_device/impl/barrier__funcs.h" +#include "nccl_device/impl/comm__funcs.h" #include "nccl_device/impl/core__funcs.h" #include "nccl_device/impl/ll_a2a__funcs.h" -#include "nccl_device/impl/mem_barrier__funcs.h" -//#include "nccl_device/net_barrier__funcs.h" -//#include "nccl_device/net_scratch_a2a__funcs.h" -//#include "nccl_device/barrier__funcs.h" +#include "nccl_device/impl/lsa_barrier__funcs.h" +#include "nccl_device/impl/gin__funcs.h" +#include "nccl_device/impl/gin_barrier__funcs.h" #include "nccl_device/impl/ptr__funcs.h" diff --git a/projects/rccl/src/include/nccl_device/barrier.h b/projects/rccl/src/include/nccl_device/barrier.h new file mode 100644 index 00000000000..0c11f6e5c2c --- /dev/null +++ b/projects/rccl/src/include/nccl_device/barrier.h @@ -0,0 +1,47 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_BARRIER_H_ +#define _NCCL_DEVICE_BARRIER_H_ +#include "impl/core__types.h" +#include "impl/lsa_barrier__types.h" +#include "impl/gin_barrier__types.h" + +#if __CUDACC__ +template +struct ncclBarrierSession_internal; + +template +struct ncclBarrierSession: ncclBarrierSession_internal { + // Full featured constructor: + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeam innerTeam, ncclTeam outerTeam, ncclGin, + ncclLsaBarrierHandle innerBarHandle, + ncclGinBarrierHandle outerBarHandle, + uint32_t index, + bool multimem=false, ncclMultimemHandle innerMmHandle={} + ); + // Convenience constructors for baked in teams: + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeamTagWorld, ncclGin, uint32_t index, bool multimem=false + ); + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeamTagLsa, ncclDevComm const&, uint32_t index, bool multimem=false + ); + NCCL_DEVICE_INLINE ncclBarrierSession( + Coop, ncclTeamTagRail, ncclGin, uint32_t index + ); + + ncclBarrierSession(ncclBarrierSession const&) = delete; // Sessions are not copyable + + NCCL_DEVICE_INLINE ncclLsaBarrierSession& lsaBarrier(); + NCCL_DEVICE_INLINE ncclGinBarrierSession& ginBarrier(); + + NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order, ncclGinFenceLevel); +}; +#endif + +#endif // _NCCL_DEVICE_BARRIER_H_ diff --git a/projects/rccl/src/include/nccl_device/coop.h b/projects/rccl/src/include/nccl_device/coop.h index 7f3b33fca65..adcd31d9e3f 100644 --- a/projects/rccl/src/include/nccl_device/coop.h +++ b/projects/rccl/src/include/nccl_device/coop.h @@ -53,7 +53,7 @@ struct ncclCoopTile { // An aligned pow2 set of threads within the warp. } NCCL_DEVICE_INLINE void sync() { #if ROCM_VERSION >= 70000 - __syncwarp(laneMask()); + if (nThreadsPow2 > 1) __syncwarp(laneMask()); #else __syncthreads(); #endif @@ -69,7 +69,7 @@ typedef ncclCoopTile ncclCoopWarp; #if __CUDACC__ struct ncclCoopLanes { // Some lanes of this warp. ncclCoopMask_t lmask; - + NCCL_DEVICE_INLINE constexpr ncclCoopLanes(ncclCoopMask_t lmask = ncclCoopFullMask): lmask(lmask) {} NCCL_DEVICE_INLINE int thread_rank() const { @@ -101,7 +101,7 @@ struct ncclCoopWarpSpan { NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id): warp0(warp0), nWarps(nWarps), id(id) { } - + NCCL_DEVICE_INLINE int thread_rank() const { return threadIdx.x - WARP_SIZE*warp0; } @@ -160,6 +160,14 @@ NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return fa NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; } #endif +#if __CUDACC__ +template +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopTile) { return true; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopLanes) { return true; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopWarpSpan) { return false; } +NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopCta) { return false; } +#endif + #if __CUDACC__ // Pick threads of our warp that are safe to use collectively. NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() { @@ -187,4 +195,55 @@ NCCL_DEVICE_INLINE ncclCoopTile ncclCoopCoalesced(ncclCoopTile +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopTile, T value, int root, bool entrySync=true) { + constexpr int n = (sizeof(T)+4-1)/4; + union { uint32_t u[n]; T v; }; + v = value; + #pragma unroll + for (int i=0; i < n; i++) u[i] = __shfl_sync(-1u, u[i], root, nThreads); + return v; +} +template +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopLanes coop, T value, int root, bool entrySync=true) { + uint32_t m = coop.lmask; + uint32_t r = root == 0 ? __ffs(m)-1 : __fns(m, 0, 1+root); + constexpr int n = (sizeof(T)+4-1)/4; + union { uint32_t u[n]; T v; }; + v = value; + #pragma unroll + for (int i=0; i < n; i++) u[i] = __shfl_sync(m, u[i], r); + return v; +} + +NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_WarpSpan_stash() { + __shared__ ulong2 stash[15]; + return stash; +} + +template +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopWarpSpan coop, T value, int root, bool entrySync=true) { + static_assert(sizeof(T) <= sizeof(ncclCoopBcast_WarpSpan_stash()[0]), "Required"); + if (entrySync) coop.sync(); + if (coop.thread_rank() == root) *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id] = value; + coop.sync(); + return *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id]; +} + +NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_Cta_stash() { + __shared__ ulong2 stash; + return &stash; +} + +template +NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopCta coop, T value, int root, bool entrySync=true) { + static_assert(sizeof(T) <= sizeof(*ncclCoopBcast_Cta_stash()), "Required"); + if (entrySync) coop.sync(); + if (coop.thread_rank() == root) *(T*)ncclCoopBcast_Cta_stash() = value; + coop.sync(); + return *(T*)ncclCoopBcast_Cta_stash(); +} +#endif + #endif diff --git a/projects/rccl/src/include/nccl_device/core.h b/projects/rccl/src/include/nccl_device/core.h index dd41d692507..9b0061a72d6 100644 --- a/projects/rccl/src/include/nccl_device/core.h +++ b/projects/rccl/src/include/nccl_device/core.h @@ -24,9 +24,15 @@ typedef struct ncclMultimemHandle ncclMultimemHandle_t; typedef uint32_t ncclDevResourceHandle; typedef ncclDevResourceHandle ncclDevResourceHandle_t; +typedef uint32_t ncclGinSignal_t; +typedef uint32_t ncclGinCounter_t; + struct ncclLsaBarrierHandle; typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t; +struct ncclGinBarrierHandle; +typedef struct ncclGinBarrierHandle ncclGinBarrierHandle_t; + struct ncclLLA2AHandle; typedef struct ncclLLA2AHandle ncclLLA2AHandle_t; @@ -59,13 +65,26 @@ struct ncclDevCommRequirements { bool lsaMultimem; // Enable multimem on lsa team + int barrierCount; int lsaBarrierCount; + int railGinBarrierCount; + + int lsaLLA2ABlockCount, lsaLLA2ASlotCount; + + bool ginForceEnable; + int ginContextCount; // This is a hint, the actual context count in the devcomm may not match. + int ginSignalCount; // Guaranteed to start at id=0 + int ginCounterCount; // Guaranteed to start at id=0 }; struct ncclDevResourceRequirements { ncclDevResourceRequirements_t* next; size_t bufferSize, bufferAlign; ncclDevResourceHandle_t* outBufferHandle; // If non-null, target assigned during ncclDevCommCreate. + int ginSignalCount; + int ginCounterCount; + ncclGinSignal_t* outGinSignalStart; + ncclGinCounter_t* outGinCounterStart; }; struct ncclTeamRequirements { diff --git a/projects/rccl/src/include/nccl_device/gin.h b/projects/rccl/src/include/nccl_device/gin.h new file mode 100644 index 00000000000..0f5643f206d --- /dev/null +++ b/projects/rccl/src/include/nccl_device/gin.h @@ -0,0 +1,207 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_GIN_SESSION_H_ +#define _NCCL_DEVICE_GIN_SESSION_H_ +#include "core_tmp.h" +#include "gin/gin_device_common.h" + +#if __CUDACC__ +struct ncclGinCtx; // Definition in nccl_device/gin/gin_device_host_common.h +template struct ncclGinCtx_M; // ... + +struct ncclGinDescriptorSmem; // A type user allocates in __shared__ memory + +// Used as completion actions for ncclGinSession::put +struct ncclGin_None {}; + +struct ncclGin_SignalAdd { ncclGinSignal_t signal; uint64_t value; }; +// SignalInc: equivalent to SignalAdd{+1} except it may not be mixed with any +// other signal operator without intervening signal reset(). Formally: for a +// given signal, all operations between successive reset()'s of that signal must +// either all be SignalInc or all not SignalInc. +struct ncclGin_SignalInc { ncclGinSignal_t signal; }; +// Support deferred: +// struct ncclGin_SignalSet { ncclGinSignal_t signal; uint64_t value; }; +struct ncclGin_CounterInc { ncclGinCounter_t counter; }; + +struct ncclGin_DescriptorSmem { ncclGinDescriptorSmem* descriptor; }; + +template +struct ncclGin_BackendMask; + +template +using ncclGin_BackendOne = ncclGin_BackendMask<(1u<<(int)backend)>; + +using ncclGin = ncclGin_BackendMask; + +#endif + +#if __CUDACC__ +template +struct ncclGin_BackendMask { + ncclDevComm const& comm; + uint32_t nContexts:8, contextId:8, _ginBackend:8; + + // Loads GIN context into registers. Each context has one QP per peer. + NCCL_DEVICE_INLINE ncclGin_BackendMask(ncclDevComm const&, int contextIndex); + + template< + // Action to take on peer when put completes. If a signalling action is used + // then that signal will be visible only after the payload of this put as well as + // the payloads of preceding puts on this netContext to the same peer are settled. + typename RemoteAction = ncclGin_None, // one of ncclGin_{None|SignalInc|SignalAdd|SignalSet} + // Action to take locally when source has been consumed. + typename LocalAction = ncclGin_None, // one of ncclGin_{None|CounterInc} + // Set of threads participating in this put. Must be a subset of Coop. + typename Coop = ncclCoopThread, + // Optional smem descriptor space to use. Either ncclGin_{None|DescriptorSmem} + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void put( + ncclTeam, int peer, + ncclWindow_t dstWnd, size_t dstOffset, + ncclWindow_t srcWnd, size_t srcOffset, size_t bytes, + RemoteAction remoteAction = ncclGin_None{}, + LocalAction localAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template< + typename T, + // Action to take on peer when put completes. If a signalling action is used + // then that signal will be visible only after the payload of this put as well as + // the payloads of preceding puts on this context to the same peer are settled. + typename RemoteAction = ncclGin_None, // one of ncclGin_{None|SignalInc|SignalAdd|SignalSet} + // Action to take locally when source has been consumed. + typename LocalAction = ncclGin_None, // one of ncclGin_{None|CounterInc} + // Set of threads participating in this put. Must be a subset of Coop. + typename Coop = ncclCoopThread, + // Optional smem descriptor space to use. Either ncclGin_{None|DescriptorSmem} + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void put( + ncclTeam, int peer, + ncclSymPtr dstElts, ncclSymPtr srcElts, size_t nElts, + RemoteAction remoteAction = ncclGin_None{}, + LocalAction localAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template< + typename T, // requires sizeof(T) <= 8 + // See put() for all template arguments. + typename RemoteAction = ncclGin_None, + typename Coop = ncclCoopThread, + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void putValue( + ncclTeam, int peer, + ncclWindow_t dstWnd, size_t dstOffset, T value, + RemoteAction remoteAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template< + typename T, // requires sizeof(T) <= 8 + // See put() for all template arguments. + typename RemoteAction = ncclGin_None, + typename Coop = ncclCoopThread, + typename DescriptorSmem = ncclGin_None + > + NCCL_DEVICE_INLINE void putValue( + ncclTeam, int peer, + ncclSymPtr dst, T value, + RemoteAction remoteAction = ncclGin_None{}, + Coop coop = ncclCoopThread{}, + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + template + NCCL_DEVICE_INLINE void signal( + ncclTeam, int peer, RemoteAction remoteAction, + Coop coop = ncclCoopThread(), + DescriptorSmem descriptor = ncclGin_None{}, + cuda::thread_scope alreadyReleased = cuda::thread_scope_thread, + cuda::thread_scope expected_scope = cuda::thread_scope_device + ) const; + + // All source buffers from put's from any thread in this coop will be safe to reuse. + // Flush does not guarantee that data has settled in remote memory. + template + NCCL_DEVICE_INLINE void flush(Coop, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Counter and signal wait use "rolling" comparison logic of a given bit-width + // such that unsigned overflow does not disturb the property that: x < x+1. + // + // bool rolling_less_equal(uint64_t a, uint64_t b, int bits) { + // uint64_t m = uint64_t(-1)>>(64-bits); + // return ((b-a) & m) <= (m>>1); + // } + // + // The condition waited for is that the supplied value is rolling_less_equal + // to the internal value. + // + // Counters are restricted to using a maximum of 56 bits despite that being fewer + // than a uint64_t can carry. + + NCCL_DEVICE_INLINE uint64_t readCounter(ncclGinCounter_t counter, int bits=56, cuda::memory_order ord = cuda::memory_order_acquire) const; + + template + NCCL_DEVICE_INLINE void waitCounter(Coop, ncclGinCounter_t counter, uint64_t least, int bits=56, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Each signal has a dedicated "shadow" which the user is free to manipulate for + // any reason. The only calls which manipulate the shadow are `increaseSignalShadow` + // and `resetSignal`. + NCCL_DEVICE_INLINE uint64_t* getSignalShadowPtr(ncclGinSignal_t signal) const; + NCCL_DEVICE_INLINE void increaseSignalShadow(ncclGinSignal_t signal, uint64_t delta) const; + + // Returns current value of signal with all but bottom bits set to zero. + NCCL_DEVICE_INLINE uint64_t readSignal(ncclGinSignal_t signal, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Wait for signal to meet or exceed value. + template + NCCL_DEVICE_INLINE void waitSignal(Coop, ncclGinSignal_t signal, uint64_t least, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Wait for signal to meet or exceed shadow value. + template + NCCL_DEVICE_INLINE void waitSignalMeetShadow(Coop, ncclGinSignal_t signal, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Wait until signal exceeds shadow by `leastDelta` (typically 1), updates shadow + // with latest value, and returns with `before` equal to previous shadow value + // and `delta` equal to difference. + template + NCCL_DEVICE_INLINE void waitSignalFollowShadow(Coop, ncclGinSignal_t signal, Uint leastDelta, Uint* before, Uint* delta, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const; + + // Sets to zero. May not race with concurrent modifications to counter. + NCCL_DEVICE_INLINE void resetCounter(ncclGinCounter_t counter) const; + // Sets signal and shadow to zero. May not race with concurrent modifcations to signal. + NCCL_DEVICE_INLINE void resetSignal(ncclGinSignal_t signal) const; + + ////////////////////////////////////////////////////////////////////////////// + // internal: + + void* _ginHandle; + uint64_t* _signalShadows; + + NCCL_DEVICE_INLINE ncclGinCtx_M _makeCtx() const; +}; +#endif + +#endif // _NCCL_DEVICE_GIN_SESSION_H_ diff --git a/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki.h b/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki.h new file mode 100644 index 00000000000..c14a5e2923f --- /dev/null +++ b/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki.h @@ -0,0 +1,214 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_GIN_GDAKI_H_ +#define _NCCL_DEVICE_GIN_GDAKI_H_ + +#ifndef DOCA_VERBS_USE_CUDA_WRAPPER +#define DOCA_VERBS_USE_CUDA_WRAPPER +#endif + +#ifndef DOCA_VERBS_USE_NET_WRAPPER +#define DOCA_VERBS_USE_NET_WRAPPER +#endif + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG +#define DOCA_GPUNETIO_VERBS_ENABLE_DEBUG 1 +#endif + +#include "../gin_device_common.h" +#include "gin_gdaki_device_host_common.h" +#include "doca_gpunetio/doca_gpunetio_device.h" + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG +#include +#endif + +template <> +struct ncclGinApi_Put { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, bool hasWins, + ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin, + size_t srcOff, size_t bytes, bool hasSignal, + ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp, + uint64_t signalOpArg, bool hasCounter, + ncclGinCounter_t counterId, bool hasDescriptor, + ncclGinDescriptorSmem* descriptor, + cuda::thread_scope required, cuda::thread_scope given) { + using nccl::utility::loadConst; + + coop.sync(); + if (coop.thread_rank() == 0) { + ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle; + doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer; + doca_gpu_dev_verbs_qp* companion_qp; + ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin; + ncclGinGdakiMemHandle* srcMh = (ncclGinGdakiMemHandle*)srcWin; + + doca_gpu_dev_verbs_addr raddr, laddr; + if (hasWins) { + raddr.addr = dstOff; + raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer); + laddr.addr = srcOff, laddr.key = loadConst(&srcMh->lkey); + } + + doca_gpu_dev_verbs_addr sig_raddr, sig_laddr; + if (hasSignal) { + if (signalOp == ncclGinSignalInc) signalOpArg = 1; + sig_raddr.addr = sizeof(uint64_t) * signalId; + sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer); + sig_laddr.addr = 0; + sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey); + } + + doca_gpu_dev_verbs_addr counter_raddr, counter_laddr; + if (hasCounter) { + companion_qp = loadConst(&gdaki->companion_gdqp) + peer; + counter_raddr.addr = sizeof(uint64_t) * counterId; + counter_raddr.key = loadConst(loadConst(&gdaki->counters_table.rkeys) + ctx.rank); + counter_laddr.addr = 0; + counter_laddr.key = loadConst(&gdaki->sink_buffer_lkey); + } + + // cuda::thread_scope_system has the lowest value + if ((required == cuda::thread_scope_system) && (given > required)) { + doca_gpu_dev_verbs_fence_release(); + } + + if (hasWins) { + if (hasSignal && hasCounter) { + doca_gpu_dev_verbs_put_signal_counter( + qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr, + counter_laddr, 1); + } else if (hasSignal) { + doca_gpu_dev_verbs_put_signal( + qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg); + } else if (hasCounter) { + doca_gpu_dev_verbs_put_counter(qp, raddr, laddr, bytes, companion_qp, counter_raddr, + counter_laddr, 1); + } else { + doca_gpu_dev_verbs_put(qp, raddr, laddr, bytes); + } + } else { + if (hasCounter) { + doca_gpu_dev_verbs_signal_counter( + qp, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr, counter_laddr, 1); + } else { + doca_gpu_dev_verbs_signal( + qp, sig_raddr, sig_laddr, signalOpArg); + } + } + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG + doca_gpu_dev_verbs_wait(qp); + if (hasCounter) doca_gpu_dev_verbs_wait(companion_qp); +#endif + } + coop.sync(); + } +}; + +template <> +struct ncclGinApi_PutValue { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, ncclGinWindow_t dstWin, + size_t dstOff, T srcVal, bool hasSignal, + ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp, + uint64_t signalOpArg, bool hasDescriptor, + ncclGinDescriptorSmem* descriptor, + cuda::thread_scope required, cuda::thread_scope given) { + using nccl::utility::loadConst; + + coop.sync(); + if (coop.thread_rank() == 0) { + ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle; + doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer; + ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin; + + doca_gpu_dev_verbs_addr raddr; + raddr.addr = dstOff; + raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer); + + doca_gpu_dev_verbs_addr sig_raddr, sig_laddr; + if (hasSignal) { + if (signalOp == ncclGinSignalInc) signalOpArg = 1; + sig_raddr.addr = sizeof(uint64_t) * signalId; + sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer); + sig_laddr.addr = 0; + sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey); + } + + // cuda::thread_scope_system has the lowest value + if ((required == cuda::thread_scope_system) && (given > required)) { + doca_gpu_dev_verbs_fence_release(); + } + + if (hasSignal) { + doca_gpu_dev_verbs_p_signal( + qp, raddr, srcVal, sig_raddr, sig_laddr, signalOpArg); + } else { + doca_gpu_dev_verbs_p(qp, raddr, srcVal); + } + +#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG + doca_gpu_dev_verbs_wait(qp); +#endif + } + coop.sync(); + } +}; + +template <> +struct ncclGinApi_ResetCounter { + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinCounter_t counterId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + loadConst(&gdaki->counters_table.buffer)[counterId] = 0; + } +}; + +template <> +struct ncclGinApi_ResetSignal { + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinSignal_t signalId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + loadConst(&gdaki->signals_table.buffer)[signalId] = 0; + } +}; + +template <> +struct ncclGinApi_GetCounterPtr { + NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinCounter_t counterId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + return loadConst(&gdaki->counters_table.buffer) + counterId; + } +}; + +template <> +struct ncclGinApi_GetSignalPtr { + NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinSignal_t signalId) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + return loadConst(&gdaki->signals_table.buffer) + signalId; + } +}; + +template <> +struct ncclGinApi_Flush { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, cuda::memory_order ord) { + using nccl::utility::loadConst; + ncclGinGdakiGPUContext* gdaki = (ncclGinGdakiGPUContext*)ctx.handle; + doca_gpu_dev_verbs_qp* qps = loadConst(&gdaki->gdqp); +#pragma unroll 1 + for (int peer = coop.thread_rank(); peer < ctx.nRanks; peer += coop.size()) { + doca_gpu_dev_verbs_wait(qps + peer); + } + } +}; + +#endif /* _NCCL_DEVICE_GIN_GDAKI_H_ */ diff --git a/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h b/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h new file mode 100644 index 00000000000..20299346f31 --- /dev/null +++ b/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h @@ -0,0 +1,36 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_ +#define _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_ + +#include + +#define NCCL_GIN_GDAKI_VERSION 100 + +template +struct ncclGinGdakiGlobalGPUBufferTable { + T *buffer; + __be32 *rkeys; + __be32 lkey; +}; + +struct ncclGinGdakiGPUContext { + struct doca_gpu_dev_verbs_qp *gdqp; + struct doca_gpu_dev_verbs_qp *companion_gdqp; + struct ncclGinGdakiGlobalGPUBufferTable counters_table; + struct ncclGinGdakiGlobalGPUBufferTable signals_table; + + // Local buffer we don't consume but is required for some operations. + __be32 sink_buffer_lkey; +}; + +struct ncclGinGdakiMemHandle { + __be32 *rkeys; + __be32 lkey; +}; + +#endif /* _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_ */ diff --git a/projects/rccl/src/include/nccl_device/gin/gin_device_api.h b/projects/rccl/src/include/nccl_device/gin/gin_device_api.h new file mode 100644 index 00000000000..20dde3af30d --- /dev/null +++ b/projects/rccl/src/include/nccl_device/gin/gin_device_api.h @@ -0,0 +1,18 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ +#ifndef _NCCL_GIN_DEVICE_API_H_ +#define _NCCL_GIN_DEVICE_API_H_ + +#include "gin_device_common.h" + +#if NCCL_GIN_GDAKI_ENABLE +#include "gdaki/gin_gdaki.h" +#endif +#if NCCL_GIN_PROXY_ENABLE +#include "proxy/gin_proxy.h" +#endif + +#endif diff --git a/projects/rccl/src/include/nccl_device/gin/gin_device_common.h b/projects/rccl/src/include/nccl_device/gin/gin_device_common.h new file mode 100644 index 00000000000..4e0798c0c5b --- /dev/null +++ b/projects/rccl/src/include/nccl_device/gin/gin_device_common.h @@ -0,0 +1,122 @@ +/************************************************************************* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef _NCCL_GIN_DEVICE_COMMON_H_ +#define _NCCL_GIN_DEVICE_COMMON_H_ + +#include "../net_device.h" +#include "../utility.h" +#include "gin_device_host_common.h" + +#if CUDA_VERSION >= 12080 && __CUDA_ARCH__ >= 900 +#define NCCL_GIN_HAS_FENCE_ACQUIRE_RELEASE_PTX 1 +#endif + +#ifndef NCCL_GIN_PROXY_ENABLE +#define NCCL_GIN_PROXY_ENABLE 1 +#endif + +#ifndef NCCL_GIN_GDAKI_ENABLE +#if defined(__HIP_PLATFORM_AMD__) +#define NCCL_GIN_GDAKI_ENABLE 0 +#elif CUDA_VERSION >= 12020 && __CUDA_ARCH__ >= 700 +#define NCCL_GIN_GDAKI_ENABLE 1 +#else +#define NCCL_GIN_GDAKI_ENABLE 0 +#endif +#endif + +#define NCCL_GIN_BACKEND_MASK_ALL \ + (((NCCL_GIN_PROXY_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_PROXY | \ + ((NCCL_GIN_GDAKI_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_GDAKI) + +struct ncclGinCtx { + ncclNetDeviceType backend; + int rank; + int nRanks; + void* handle; +}; + +template +struct ncclGinCtx_M : ncclGinCtx {}; + +struct ncclGinDescriptorSmem { + alignas(16) char space[64]; +}; + +#if __CUDACC__ +template +struct ncclGinApi_Put { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, bool hasWins, + ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin, + size_t srcOff, size_t bytes, bool hasSignal, + ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp, + uint64_t signalOpArg, bool hasCounter, + ncclGinCounter_t counterId, bool hasDescriptor, + ncclGinDescriptorSmem* descriptor, + cuda::thread_scope required, cuda::thread_scope given); +}; + +template +struct ncclGinApi_PutValue { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, ncclGinWindow_t dstWin, + size_t dstOff, T srcData, bool hasSignal, + ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp, + uint64_t signalOpArg, bool hasDescriptor, + ncclGinDescriptorSmem* descriptor, + cuda::thread_scope required, cuda::thread_scope given); +}; + +template +struct ncclGinApi_GetSignalPtr { + NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, int peer, ncclGinSignal_t signalId); +}; +template +struct ncclGinApi_GetCounterPtr { + NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, int peer, ncclGinCounter_t counterId); +}; + +template +struct ncclGinApi_ResetSignal { + NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinSignal_t signalId); +}; + +template +struct ncclGinApi_ResetCounter { + NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinCounter_t counterId); +}; + +template +struct ncclGinApi_Flush { + template + NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop, cuda::memory_order ord); +}; +#endif + +#if __CUDACC__ +template