diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000000..d1eb6cc8e92
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,170 @@
+cmake_minimum_required(VERSION 3.25) # ipp6 is using 3.28
+
+# Version information
+# Read makefiles/version.mk next to this file (CURRENT dir, so add_subdirectory() consumers work too)
+file(READ "${CMAKE_CURRENT_SOURCE_DIR}/makefiles/version.mk" VERSION_CONTENT)
+string(REGEX REPLACE ".*NCCL_MAJOR[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_MAJOR "${VERSION_CONTENT}")
+string(REGEX REPLACE ".*NCCL_MINOR[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_MINOR "${VERSION_CONTENT}")
+string(REGEX REPLACE ".*NCCL_PATCH[ ]*:=[ ]*([0-9]+).*" "\\1" NCCL_PATCH "${VERSION_CONTENT}")
+string(REGEX REPLACE ".*NCCL_SUFFIX[ ]*:=[ ]*([a-zA-Z0-9]*).*" "\\1" NCCL_SUFFIX "${VERSION_CONTENT}")
+string(REGEX REPLACE ".*PKG_REVISION[ ]*:=[ ]*([0-9]+).*" "\\1" PKG_REVISION "${VERSION_CONTENT}")
+math(EXPR NCCL_VERSION_CODE "(${NCCL_MAJOR} * 10000) + (${NCCL_MINOR} * 100) + ${NCCL_PATCH}")
+
+# Directory-scope definitions: expose NCCL_USE_CMAKE and the parsed version numbers to the sources built below
+add_compile_definitions(
+    NCCL_USE_CMAKE
+    NCCL_MAJOR=${NCCL_MAJOR}
+    NCCL_MINOR=${NCCL_MINOR}
+    NCCL_PATCH=${NCCL_PATCH}
+    NCCL_VERSION_CODE=${NCCL_VERSION_CODE}
+)
+# NOTE: set(ENV{...}) only changes the environment of this CMake configure run, not of the later build step
+set(ENV{NCCL_USE_CMAKE} "1")
+
+project(NCCL VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}
+        LANGUAGES CUDA CXX C)
+
+# Default to a Release build when the user did not choose a build type
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Release")
+endif()
+# Build switches, overridable with -D<NAME>=<value>. option() is boolean-only, so the numeric one is a STRING.
+option(VERBOSE "Enable verbose output" OFF)
+option(KEEP "Keep intermediate files" OFF)
+option(DEBUG "Enable debug build" OFF)
+option(ASAN "Enable Address Sanitizer" OFF)
+option(UBSAN "Enable Undefined Behavior Sanitizer" OFF)
+option(TRACE "Enable tracing" OFF)
+option(WERROR "Treat warnings as errors" OFF)
+option(PROFAPI "Enable profiling API" ON)
+option(NVTX "Enable NVTX" ON)
+option(RDMA_CORE "Enable RDMA core" OFF)
+option(NET_PROFILER "Enable network profiler" OFF)
+option(MLX5DV "Enable MLX5DV" OFF)
+set(MAX_EXT_NET_PLUGINS 0 CACHE STRING "Maximum external network plugins")
+
+find_package(CUDAToolkit REQUIRED)
+
+# CUDA version detection: keep only "<major>.<minor>" of the toolkit version
+string(REGEX MATCH "([0-9]+\\.[0-9]+)" CUDA_VERSION "${CUDAToolkit_VERSION}")
+
+# Extract major and minor version numbers
+string(REGEX MATCH "([0-9]+)" CUDA_MAJOR "${CUDA_VERSION}")
+# Minor version: the digits after the last dot of CUDA_VERSION
+string(REGEX REPLACE ".*\\.([0-9]+)$" "\\1" CUDA_MINOR "${CUDA_VERSION}")
+
+# Add CUDA version definitions after find_package
+add_compile_definitions(
+    CUDA_MAJOR=${CUDA_MAJOR}
+    CUDA_MINOR=${CUDA_MINOR}
+)
+
+# CUDA architecture flags: choose a default list from CUDA_MAJOR/CUDA_MINOR when the user supplied none
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "")
+    message(STATUS "CMAKE_CUDA_ARCHITECTURES not defined or empty, setting default values based on CUDA version")
+    # NOTE(review): project(... LANGUAGES CUDA) runs earlier and may already initialize CMAKE_CUDA_ARCHITECTURES (policy CMP0104) - confirm this branch is reachable
+    if(${CUDA_MAJOR} LESS 9)
+        set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61")
+    elseif(${CUDA_MAJOR} EQUAL 9)
+        set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70")
+    elseif(${CUDA_MAJOR} EQUAL 10)
+        set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70")
+    elseif(${CUDA_MAJOR} EQUAL 11)
+        if(${CUDA_MINOR} LESS 8)
+            set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70;80")
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "35;50;60;61;70;80;90")
+        endif()
+    elseif(${CUDA_MAJOR} EQUAL 12)
+        if(${CUDA_MINOR} LESS 8)
+            set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90")
+        else()
+            set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;120")
+        endif()
+    elseif(${CUDA_MAJOR} EQUAL 13)
+        set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;110;120")
+    else()
+        # CUDA versions newer than 13 reuse the CUDA 13 list (latest architectures known to this file)
+        set(CMAKE_CUDA_ARCHITECTURES "50;60;61;70;80;90;100;110;120")
+    endif()
+endif()
+message(STATUS "Using CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")
+
+string(APPEND CMAKE_CXX_FLAGS " -fPIC -fvisibility=hidden -Wall -Wno-unused-function -Wno-sign-compare -Wvla -g")
+string(APPEND CMAKE_CUDA_FLAGS " --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all -fPIC")
+
+# Sanitizers: each one extends the C++ compile flags and the executable link flags
+if(ASAN)
+    string(APPEND CMAKE_CXX_FLAGS " -fsanitize=address")
+    string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=address -static-libasan")
+endif()
+
+if(UBSAN)
+    string(APPEND CMAKE_CXX_FLAGS " -fsanitize=undefined")
+    string(APPEND CMAKE_EXE_LINKER_FLAGS " -fsanitize=undefined -static-libubsan")
+endif()
+
+# Feature toggles mapped onto preprocessor definitions / warning policy
+if(TRACE)
+    add_definitions(-DENABLE_TRACE)
+endif()
+
+if(NOT NVTX)
+    add_definitions(-DNVTX_DISABLE)
+endif()
+
+if(WERROR)
+    string(APPEND CMAKE_CXX_FLAGS " -Werror")
+endif()
+
+if(PROFAPI)
+    add_definitions(-DPROFAPI)
+endif()
+
+set(EXTRA_LIBS)
+
+# RDMA and MLX5DV are Linux-specific features.
+# When one is explicitly enabled its library is mandatory: REQUIRED turns a missing
+# library into a configure-time error instead of silently leaving it out of EXTRA_LIBS.
+if(RDMA_CORE)
+    add_definitions(-DNCCL_BUILD_RDMA_CORE=1)
+    find_library(VERBS_LIBRARY NAMES verbs REQUIRED)
+    list(APPEND EXTRA_LIBS ${VERBS_LIBRARY})
+endif()
+
+if(MLX5DV)
+    add_definitions(-DNCCL_BUILD_MLX5DV=1)
+    # Same policy as for verbs above: fail early when mlx5 cannot be located
+    find_library(MLX5_LIBRARY NAMES mlx5 REQUIRED)
+    list(APPEND EXTRA_LIBS ${MLX5_LIBRARY})
+endif()
+
+
+if(NET_PROFILER)
+    add_definitions(-DNCCL_ENABLE_NET_PROFILING=1)
+endif()
+# NCCL_NET_MAX_PLUGINS is only defined when a positive plugin count was requested
+if(MAX_EXT_NET_PLUGINS GREATER 0)
+    add_definitions(-DNCCL_NET_MAX_PLUGINS=${MAX_EXT_NET_PLUGINS})
+endif()
+# Definitions that are added unconditionally
+add_definitions(-DDOCA_VERBS_USE_CUDA_WRAPPER)
+add_definitions(-DDOCA_VERBS_USE_NET_WRAPPER)
+add_definitions(-DNCCL_GIN_PROXY_ENABLE=1)
+
+# Library dependencies: librt is appended to EXTRA_LIBS only when find_library locates it
+find_library(RT_LIBRARY NAMES rt)
+if(RT_LIBRARY)
+    list(APPEND EXTRA_LIBS ${RT_LIBRARY})
+endif()
+
+# Debug/Release specific flags. NOTE(review): CMake already combines CMAKE_<LANG>_FLAGS with the per-config variable, so the base flags are likely passed twice - confirm
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -O0")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS} -O0 -G -g")
+set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3")
+set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS} -O3")
+
+add_subdirectory(ext-net)
+add_subdirectory(ext-profiler/example)
+add_subdirectory(ext-tuner/example)
+add_subdirectory(src)
diff --git a/examples/06_device_api/02_gin_alltoall_pure/Makefile b/examples/06_device_api/02_gin_alltoall_pure/Makefile
new file mode 100644
index 00000000000..43d65cec1ec
--- /dev/null
+++ b/examples/06_device_api/02_gin_alltoall_pure/Makefile
@@ -0,0 +1,84 @@
+#
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Include common build rules (presumably the source of NVCC, NVCUFLAGS, LIBRARIES, MPICXX, PREFIX, ... - they are not set in this file)
+include ../../../makefiles/common.mk
+include ../../../makefiles/examples.mk
+
+# Target executable
+TARGET = gin_alltoall_pure_device_api
+
+# Common utilities
+COMMON_INC = ../../common/include
+COMMON_SRC = ../../common/src
+
+# Build configuration
+INCLUDES += -I$(COMMON_INC)
+
+# Source files; OBJECTS is SOURCES with the .cu and .cc suffixes replaced by .o
+SOURCES = main.cu $(COMMON_SRC)/utils.cc
+OBJECTS = $(SOURCES:.cu=.o)
+OBJECTS := $(OBJECTS:.cc=.o)
+
+# Default target
+all: $(TARGET)
+
+# Build executable: link with MPICXX when MPI=1, otherwise with CXX plus -lpthread
+$(TARGET): $(OBJECTS)
+ifeq ($(MPI),1)
+	$(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@
+else
+	$(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -lpthread -o $@
+endif
+	@echo "Built target $@"
+
+# Compile source files: .cu through NVCC, .cc through MPICXX (MPI=1) or CXX
+%.o: %.cu
+	$(NVCC) $(NVCUFLAGS) $(INCLUDES) -c $< -o $@
+
+%.o: %.cc
+ifeq ($(MPI),1)
+	$(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+else
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+endif
+
+# Test target: runs the binary directly, or through MPIRUN with 2 processes when MPI=1
+test: $(TARGET)
+	@echo "Testing $(TARGET)..."
+ifeq ($(MPI),1)
+	@echo "Running with 2 processes"
+	$(MPIRUN) -np 2 ./$(TARGET)
+else
+	@echo "Running with all available GPUs"
+	./$(TARGET)
+endif
+
+# Clean build artifacts (object files, including the shared utils object, and the executable)
+clean:
+	rm -f $(OBJECTS) $(TARGET)
+
+# Install target: copies the executable into $(PREFIX)/bin, creating the directory if needed
+install: $(TARGET)
+	@mkdir -p $(PREFIX)/bin
+	cp $(TARGET) $(PREFIX)/bin/
+
+# Help: prints a short description of the example and of the available targets
+help:
+	@echo "NCCL Example: Pure GIN AlltoAll Device API"
+	@echo "=============================================="
+	@echo ""
+	@echo "This example demonstrates pure GPU-Initiated Networking (GIN)"
+	@echo "for AlltoAll operations without LSA optimizations."
+	@echo ""
+	@echo "Targets:"
+	@echo "  all       - Build the example (default)"
+	@echo "  test      - Build and run test with all GPUs"
+	@echo "  clean     - Remove build artifacts"
+	@echo "  install   - Install to PREFIX/bin (default: /usr/local/bin)"
+	@echo "  help      - Show this help"
+
+.PHONY: all test clean install help
diff --git a/examples/06_device_api/02_gin_alltoall_pure/README.md b/examples/06_device_api/02_gin_alltoall_pure/README.md
new file mode 100644
index 00000000000..72a0b33b30b
--- /dev/null
+++ b/examples/06_device_api/02_gin_alltoall_pure/README.md
@@ -0,0 +1,178 @@
+<!-- Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+See LICENSE.txt for license information -->
+
+# NCCL Device API Pure GIN AlltoAll Example
+
+This example demonstrates NCCL's GPU-Initiated Networking (GIN) capabilities for performing AlltoAll collective operations directly from GPU kernels using only network-based communication.
+
+## Overview
+
+This example showcases **pure GIN communication** where all data exchange happens through the network, without any Load Store Access (LSA) optimizations. This is particularly useful for:
+
+- Multi-node environments where ranks cannot use LSA
+- Testing network performance without local optimizations  
+- Understanding the baseline GIN communication patterns
+- Scenarios where all communication must go through the network
+
+## What This Example Does
+
+1. **Creates device communicators** using `ncclDevCommCreate` for GPU kernel access to NCCL operations
+2. **Registers symmetric memory windows** with `ncclCommWindowRegister` so that peers can be reached through GIN network operations
+3. **Launches GPU kernel** that performs AlltoAll operations using pure GIN for all peer communication
+
+## Building and Running
+
+The advanced examples can be built with either pthreads or MPI for parallelization; pthreads is the default. To use MPI, set `MPI=1` at build time and, optionally, point `MPI_HOME` at a valid MPI installation.
+
+### Build
+```bash
+make [MPI=1] [MPI_HOME=<path-to-mpi>] [NCCL_HOME=<path-to-nccl>] [CUDA_HOME=<path-to-cuda>]
+```
+
+### Run when compiled for pthreads (default)
+```bash
+[NTHREADS=N] ./gin_alltoall_pure_device_api
+```
+
+### Run when compiled for MPI
+```bash
+mpirun -np <num_processes> ./gin_alltoall_pure_device_api
+```
+
+## Code Walk-through
+
+### Device Communicator Creation (Host-side)
+The `ncclDevComm` is the core component enabling GPU kernels to perform network communication directly. For pure GIN communication, we configure the device communicator with GIN-specific resources. The `ncclDevCommRequirements` specifies GIN barriers for network synchronization and signals for completion detection. Unlike LSA-based examples, we don't need LSA barriers since all communication goes through the network.
+
+```cpp
+ncclDevComm devComm;
+ncclDevCommRequirements reqs;
+memset(&reqs, 0, sizeof(reqs));
+// GIN barriers enable cross-node synchronization over the network
+reqs.railGinBarrierCount = NCCL_DEVICE_CTA_COUNT;  
+// GIN signals provide completion notifications for asynchronous operations
+reqs.ginSignalCount = 1;
+
+// Create device communicator with pure GIN support
+NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm));
+```
+
+### Memory Window Registration (Host-side)
+The device API requires symmetric memory windows registered using `NCCL_WIN_COLL_SYMMETRIC`. These windows enable GPU kernels to access remote memory through GIN operations. Unlike LSA, which provides direct memory access, GIN windows are accessed through network put/get operations.
+
+```cpp
+ncclWindow_t send_win;
+ncclWindow_t recv_win;
+
+// Register symmetric windows for GIN network access
+NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC));
+NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC));
+```
+
+### GIN Barriers (Device-side)
+GIN barriers enable cross-node synchronization from device code over the network. Each thread block uses `blockIdx.x` to select its dedicated barrier, allowing blocks to progress independently while coordinating with corresponding blocks on other nodes. This is crucial for ensuring all ranks are ready before starting the AlltoAll exchange.
+
+```cpp
+// GIN barriers coordinate GPU threads across different nodes over network
+ncclGinBarrierSession<ncclCoopCta> bar { 
+    ncclCoopCta(),                    // Barrier scope: entire CTA (thread block)
+    gin,                              // GIN context for network operations
+    ncclTeamWorld(devComm),          // Team spanning all ranks
+    devComm.railGinBarrier,          // GIN barrier handle
+    blockIdx.x                       // Barrier index: matches our CTA index
+};
+bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed);
+```
+
+### GIN Put Operations (Device-side)
+GIN provides one-sided put operations for direct remote memory writes over the network. Each thread handles a subset of destination ranks, writing its rank's data to the appropriate location in each peer's receive buffer. The `ncclGin_SignalInc` parameter increments a signal counter, enabling asynchronous completion detection.
+
+```cpp
+// Send data to all peers via GIN network operations
+const size_t size = count * sizeof(T);
+for (int r = tid; r < devComm.nRanks; r += nthreads) {
+    gin.put(ncclTeamWorld(devComm), r,
+        recvwin, recvoffset + devComm.rank * size,  // Destination: peer r's buffer
+        sendwin, sendoffset + r * size,             // Source: data for peer r
+        size, ncclGin_SignalInc{signalIndex});      // Signal increment for completion
+}
+```
+
+### Signal-based Completion (Device-side)
+GIN uses signals for asynchronous completion detection of network operations. The kernel waits for the signal value to reach the expected count (initial value + number of ranks), indicating that all put operations have completed. The `gin.flush()` call then ensures all pending operations are committed before proceeding.
+
+```cpp
+// Wait for all remote puts to complete
+gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + devComm.nRanks);
+gin.flush(ncclCoopCta());  // Ensure all operations are committed
+```
+
+## Expected Output
+
+```
+Starting Pure GIN AlltoAll initialization
+  Rank 0 using GPU device 0
+  Rank 1 using GPU device 1
+  Rank 0 initialized NCCL communicator for 2 total ranks
+  Rank 1 initialized NCCL communicator for 2 total ranks
+  Rank 0 initialized send data
+  Rank 1 initialized send data
+  Rank 0 created device communicator with GIN support
+  Rank 1 created device communicator with GIN support
+Starting Pure GIN AlltoAll with 1024 elements per rank (2048 total elements, 0 MB)
+
+=== Executing Pure GIN AlltoAll ===
+  Rank 0 completed pure GIN AlltoAll kernel
+  Rank 1 completed pure GIN AlltoAll kernel
+Pure GIN AlltoAll result: PASSED
+```
+
+## When to Use
+
+- **Multi-node environments**: When ranks cannot use LSA
+- **Testing network performance**: Without local optimizations  
+- **Understanding the baseline GIN communication patterns**
+- **Scenarios where all communication must go through the network**
+
+## Performance Considerations
+
+- **Network overhead**: All communication goes through the network stack
+- **Signal-based completion**: Enables asynchronous operation patterns
+- **Barrier synchronization**: Ensures proper ordering of network operations
+- **Multiple GIN contexts**: Can improve parallel communication performance
+
+## Common Issues and Solutions
+
+### Issue: Deadlock at util_broadcast
+**Solution:** Ensure you're running with multiple GPUs/processes
+```bash
+NTHREADS=2 ./gin_alltoall_pure_device_api  # For 2 GPUs
+```
+
+### Issue: CUDA out of memory
+**Solution:** Reduce the data size in the example
+
+### Issue: Network errors
+**Solution:** Ensure proper network configuration for multi-node setups
+
+## Performance Notes
+
+- These are educational examples, not optimized for performance
+- Real implementations should consider:
+  - Optimal GIN context usage for parallel operations
+  - Signal pool management for high-throughput scenarios
+  - Memory coalescing patterns for network operations
+  - Network topology-aware communication strategies
+
+## Error Handling
+
+The host code wraps its CUDA runtime and NCCL calls in the `CUDACHECK` and `NCCLCHECK` macros. The example kernel itself performs no error checking; production device kernels should implement proper error handling for network operations and signal management.
+
+## Next Steps
+
+After understanding this example, explore:
+- **Custom network protocols**: Implement specialized communication patterns using GIN
+- **Performance optimization**: Fine-tune GIN context usage and signal management
+- **Hybrid approaches**: Combine GIN with LSA for topology-aware optimizations
+- **Integration with compute**: Fuse network communication with computation kernels
diff --git a/examples/06_device_api/02_gin_alltoall_pure/main.cu b/examples/06_device_api/02_gin_alltoall_pure/main.cu
new file mode 100644
index 00000000000..08fdddf5e6e
--- /dev/null
+++ b/examples/06_device_api/02_gin_alltoall_pure/main.cu
@@ -0,0 +1,251 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+ #include "cuda_runtime.h"
+ #include "nccl.h"
+ #include "nccl_device.h"
+ #include "utils.h"
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <sys/time.h>
+ #include <unistd.h>
+ 
+/*
+ * NCCL Device API Pure GIN AlltoAll Example
+ *
+ * This example demonstrates NCCL's GPU-Initiated Networking (GIN) capabilities
+ * for performing AlltoAll collective operations directly from GPU kernels using
+ * only network-based communication.
+ * GIN enables GPU kernels to initiate network communication without CPU
+ * intervention, providing low-latency communication for distributed applications.
+ *
+ * Learning Objectives:
+ * - Understand pure GIN (GPU-Initiated Networking) communication
+ * - Learn how to use ncclGin for device-initiated network communication
+ * - See pure GIN AlltoAll implementation for network-based communication
+ * - Practice GIN barriers and signal-based synchronization
+ *
+ * Key GIN Concepts:
+ * - ncclGin: Device-side networking object for kernel-initiated communication
+ * - GIN contexts: Network communication channels for parallel operations
+ * - GIN signals: Completion notifications for asynchronous operations
+ * - GIN barriers: Network-based synchronization across ranks
+ * - One-sided put operations: Direct remote memory writes over network
+ *
+ * When to Use Pure GIN:
+ * - Communication between ranks that cannot use LSA (different nodes)
+ * - Network-based collective operations in multi-node environments
+ * - Scenarios where all communication must go through the network
+ * - Testing network performance without local optimizations
+ *
+ * Performance Considerations:
+ * - GIN provides network communication from GPU kernels
+ * - All communication goes through the network (no local optimizations)
+ * - Signal-based completion detection enables asynchronous operation
+ * - Multiple GIN contexts can improve parallel communication performance
+ */
+ 
+// Device API kernel launch configuration: the kernel is launched with NCCL_DEVICE_CTA_COUNT blocks of NCCL_DEVICE_THREADS_PER_CTA threads.
+// NCCL_DEVICE_CTA_COUNT is also passed as railGinBarrierCount, and each block picks its barrier by blockIdx.x, so the two must match.
+ #define NCCL_DEVICE_CTA_COUNT 1
+ #define NCCL_DEVICE_THREADS_PER_CTA 512
+ 
+ // ==========================================================================
+ // Device Kernel Implementations
+ // ==========================================================================
+ 
+// Pure GIN AlltoAll kernel: every rank in [0, nRanks), including this rank itself, is reached with gin.put; no LSA path is used.
+// T is the element type and count the number of elements exchanged per peer; the root parameter is not used by the body.
+template <typename T>
+__global__ void PureGinAlltoAllKernel(ncclWindow_t sendwin, size_t sendoffset, 
+                                      ncclWindow_t recvwin, size_t recvoffset, 
+                                      size_t count, int root, struct ncclDevComm devComm) {
+  int ginContext = 0;
+  unsigned int signalIndex = 0;
+  ncclGin gin { devComm, ginContext };
+  uint64_t signalValue = gin.readSignal(signalIndex);
+  // signalValue (read above, before any put) is the baseline for the waitSignal at the end of the kernel
+  // GIN barrier over the world team, executed before any put is issued; the barrier index is this block's blockIdx.x
+  ncclGinBarrierSession<ncclCoopCta> bar { ncclCoopCta(), gin, ncclTeamWorld(devComm),
+                                           devComm.railGinBarrier, blockIdx.x };
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed);
+
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int nthreads = blockDim.x * gridDim.x;
+  // Peers are strided over threads: thread tid handles ranks tid, tid + nthreads, ...
+  // Each put covers `size` bytes: source offset r * size in sendwin, destination offset devComm.rank * size in recvwin for peer r
+  const size_t size = count * sizeof(T);
+  for (int r = tid; r < devComm.nRanks; r += nthreads) {
+    gin.put(ncclTeamWorld(devComm), r,
+        recvwin, recvoffset + devComm.rank * size,
+        sendwin, sendoffset + r * size,
+        size, ncclGin_SignalInc{signalIndex});
+  }
+
+  // Wait until the signal has advanced by nRanks relative to the value read at kernel start, then flush GIN
+  gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + devComm.nRanks);
+  gin.flush(ncclCoopCta());
+}
+ 
+ // ==========================================================================
+ // Host-Side Setup and Device API Initialization
+ // ==========================================================================
+// Function handed to run_example: sets up NCCL, runs the kernel once, verifies the result and cleans up. devices_per_rank is unused; always returns NULL.
+void* pureGinAlltoAll(int my_rank, int total_ranks, int local_device, int devices_per_rank) {
+  ncclComm_t comm;
+  ncclUniqueId nccl_unique_id;
+
+  if (my_rank == 0) {
+    printf("Starting Pure GIN AlltoAll initialization\n");
+  }
+
+  // STEP 1: Standard NCCL setup - rank 0 creates the unique ID that all ranks need for ncclCommInitRank
+  if (my_rank == 0) {
+    NCCLCHECK(ncclGetUniqueId(&nccl_unique_id));
+  }
+
+  // Distribute unique ID
+  util_broadcast(0, my_rank, &nccl_unique_id);
+
+  // Set device context for this rank
+  CUDACHECK(cudaSetDevice(local_device));
+  printf("  Rank %d using GPU device %d\n", my_rank, local_device);
+
+  // ==========================================================================
+  // STEP 2: Initialize NCCL Communicator and Allocate Memory
+  // ==========================================================================
+
+  // Initialize NCCL communicator
+  NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank));
+  printf("  Rank %d initialized NCCL communicator for %d total ranks\n", my_rank, total_ranks);
+
+  // Allocate memory for AlltoAll operation
+  size_t count = 1024; // Elements per rank
+  size_t total_elements = count * total_ranks;
+  size_t size_bytes = total_elements * sizeof(float);
+
+  float *h_sendbuff = (float*)malloc(size_bytes);
+  float *h_recvbuff = (float*)malloc(size_bytes);
+  void* d_sendbuff;
+  void* d_recvbuff;
+  ncclWindow_t send_win;
+  ncclWindow_t recv_win;
+
+  // Device API requires symmetric memory allocation
+  NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes));
+  NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes));
+
+  // ==========================================================================
+  // STEP 3: Register Memory Windows for Device-Side Access
+  // ==========================================================================
+
+  // Register symmetric windows for GIN access
+  NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC));
+  NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC));
+
+  // Initialize data: element i of the slice for dest_rank is my_rank * 1000 + dest_rank * 100 + element_idx
+  for (size_t i = 0; i < total_elements; i++) {
+    int dest_rank = i / count;
+    int element_idx = i % count;
+    h_sendbuff[i] = (float)(my_rank * 1000 + dest_rank * 100 + element_idx);
+  }
+  CUDACHECK(cudaMemcpy(d_sendbuff, h_sendbuff, size_bytes, cudaMemcpyHostToDevice));
+  printf("  Rank %d initialized send data\n", my_rank);
+
+  // ==========================================================================
+  // STEP 4: Create Device Communicator with GIN Support
+  // ==========================================================================
+
+  // Create stream for kernel execution
+  cudaStream_t stream;
+  CUDACHECK(cudaStreamCreate(&stream));
+
+  // Create device communicator with GIN support
+  ncclDevComm devComm;
+  ncclDevCommRequirements reqs;
+  memset(&reqs, 0, sizeof(reqs));
+  reqs.railGinBarrierCount = NCCL_DEVICE_CTA_COUNT;  // GIN barriers for network synchronization
+  reqs.ginSignalCount = 1;  // GIN signals for completion detection
+  NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm));
+  printf("  Rank %d created device communicator with GIN support\n", my_rank);
+
+  if (my_rank == 0) {
+    printf("Starting Pure GIN AlltoAll with %zu elements per rank (%zu total elements, %zu MB)\n",
+            count, total_elements, size_bytes / (1024 * 1024));
+  }
+
+  // ==========================================================================
+  // STEP 5: Execute Pure GIN AlltoAll Kernel
+  // ==========================================================================
+
+  if (my_rank == 0) {
+    printf("\n=== Executing Pure GIN AlltoAll ===\n");
+  }
+
+    // Clear receive buffer
+    CUDACHECK(cudaMemset(d_recvbuff, 0, size_bytes));
+
+  // Launch pure GIN AlltoAll kernel
+  PureGinAlltoAllKernel<float><<<NCCL_DEVICE_CTA_COUNT, NCCL_DEVICE_THREADS_PER_CTA, 0, stream>>>(
+      send_win, 0, recv_win, 0, count, 0, devComm);
+
+  // Wait for completion
+  CUDACHECK(cudaStreamSynchronize(stream));
+  printf("  Rank %d completed pure GIN AlltoAll kernel\n", my_rank);
+
+  // ==========================================================================
+  // STEP 6: Verify Results
+  // ==========================================================================
+
+  // Verify: slice src_rank must hold src_rank * 1000 + my_rank * 100 + i; checking stops at the first mismatch
+  CUDACHECK(cudaMemcpy(h_recvbuff, d_recvbuff, size_bytes, cudaMemcpyDeviceToHost));
+  bool gin_success = true;
+  for (int src_rank = 0; src_rank < total_ranks; src_rank++) {
+    for (size_t i = 0; i < count; i++) {
+      size_t recv_idx = src_rank * count + i;
+      float expected = (float)(src_rank * 1000 + my_rank * 100 + i);
+      if (h_recvbuff[recv_idx] != expected) {
+        gin_success = false;
+        printf("  Rank %d: Pure GIN mismatch at [%d][%zu]: got %.0f, expected %.0f\n", 
+                my_rank, src_rank, i, h_recvbuff[recv_idx], expected);
+        break;
+      }
+    }
+    if (!gin_success) break;
+  }
+  // Only rank 0 prints a verdict, and it reflects rank 0's own check; other ranks only print their mismatch line
+  if (my_rank == 0) {
+    printf("Pure GIN AlltoAll result: %s\n", gin_success ? "PASSED" : "FAILED");
+  }
+
+  // ==========================================================================
+  // STEP 7: Cleanup Resources
+  // ==========================================================================
+
+  // Cleanup host memory
+  free(h_sendbuff);
+  free(h_recvbuff);
+
+  // Device API specific cleanup
+  NCCLCHECK(ncclDevCommDestroy(comm, &devComm));
+  NCCLCHECK(ncclCommWindowDeregister(comm, send_win));
+  NCCLCHECK(ncclCommWindowDeregister(comm, recv_win));
+  NCCLCHECK(ncclMemFree(d_sendbuff));
+  NCCLCHECK(ncclMemFree(d_recvbuff));
+
+  // Standard NCCL cleanup
+  CUDACHECK(cudaStreamDestroy(stream));
+  NCCLCHECK(ncclCommFinalize(comm));
+  NCCLCHECK(ncclCommDestroy(comm));
+
+  return NULL;
+}
+ 
+int main(int argc, char* argv[]) {
+  // Delegate to run_example, passing pureGinAlltoAll as the function to run; its result is the process exit code
+  return run_example(argc, argv, pureGinAlltoAll);
+}
diff --git a/examples/06_device_api/03_gin_alltoall_hybrid/Makefile b/examples/06_device_api/03_gin_alltoall_hybrid/Makefile
new file mode 100644
index 00000000000..30733120df2
--- /dev/null
+++ b/examples/06_device_api/03_gin_alltoall_hybrid/Makefile
@@ -0,0 +1,85 @@
+#
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Include common build rules (presumably the source of NVCC, NVCUFLAGS, LIBRARIES, MPICXX, PREFIX, ... - they are not set in this file)
+include ../../../makefiles/common.mk
+include ../../../makefiles/examples.mk
+
+# Target executable
+TARGET = gin_alltoall_hybrid_device_api
+
+# Common utilities
+COMMON_INC = ../../common/include
+COMMON_SRC = ../../common/src
+
+# Build configuration
+INCLUDES += -I$(COMMON_INC)
+
+# Source files; OBJECTS is SOURCES with the .cu and .cc suffixes replaced by .o
+SOURCES = main.cu $(COMMON_SRC)/utils.cc
+OBJECTS = $(SOURCES:.cu=.o)
+OBJECTS := $(OBJECTS:.cc=.o)
+
+# Default target
+all: $(TARGET)
+
+# Build executable: link with MPICXX when MPI=1, otherwise with CXX plus -lpthread
+$(TARGET): $(OBJECTS)
+ifeq ($(MPI),1)
+	$(MPICXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -o $@
+else
+	$(CXX) $(CXXFLAGS) $(OBJECTS) $(LIBRARIES) $(LDFLAGS) -lpthread -o $@
+endif
+	@echo "Built target $@"
+
+# Compile source files: .cu through NVCC, .cc through MPICXX (MPI=1) or CXX
+%.o: %.cu
+	$(NVCC) $(NVCUFLAGS) $(INCLUDES) -c $< -o $@
+
+%.o: %.cc
+ifeq ($(MPI),1)
+	$(MPICXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+else
+	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@
+endif
+
+# Test target: runs the binary directly, or through MPIRUN with 2 processes when MPI=1
+test: $(TARGET)
+	@echo "Testing $(TARGET)..."
+ifeq ($(MPI),1)
+	@echo "Running with 2 processes"
+	$(MPIRUN) -np 2 ./$(TARGET)
+else
+	@echo "Running with all available GPUs"
+	./$(TARGET)
+endif
+
+# Clean build artifacts (object files, including the shared utils object, and the executable)
+clean:
+	rm -f $(OBJECTS) $(TARGET)
+
+# Install target: copies the executable into $(PREFIX)/bin, creating the directory if needed
+install: $(TARGET)
+	@mkdir -p $(PREFIX)/bin
+	cp $(TARGET) $(PREFIX)/bin/
+
+# Help: prints a short description of the example and of the available targets
+help:
+	@echo "NCCL Example: Hybrid AlltoAll Device API"
+	@echo "========================================="
+	@echo ""
+	@echo "This example demonstrates hybrid communication combining"
+	@echo "GPU-Initiated Networking (GIN) for remote peers with"
+	@echo "Load Store Access (LSA) for local peers."
+	@echo ""
+	@echo "Targets:"
+	@echo "  all       - Build the example (default)"
+	@echo "  test      - Build and run test with all GPUs"
+	@echo "  clean     - Remove build artifacts"
+	@echo "  install   - Install to PREFIX/bin (default: /usr/local/bin)"
+	@echo "  help      - Show this help"
+
+.PHONY: all test clean install help
diff --git a/examples/06_device_api/03_gin_alltoall_hybrid/README.md b/examples/06_device_api/03_gin_alltoall_hybrid/README.md
new file mode 100644
index 00000000000..4354fcd918d
--- /dev/null
+++ b/examples/06_device_api/03_gin_alltoall_hybrid/README.md
@@ -0,0 +1,228 @@
+<!-- Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+See LICENSE.txt for license information -->
+
+# NCCL Device API Hybrid AlltoAll Example
+
+This example shows how to implement AlltoAll operations using a hybrid approach that combines Load Store Access (LSA) for local peers with GPU-Initiated Networking (GIN) for remote peers. We create a device communicator with `ncclDevCommCreate` supporting both LSA and GIN capabilities, enabling optimal communication performance across different peer types.
+
+## Overview
+
+This example showcases **hybrid communication** that intelligently selects the optimal communication method for each peer:
+
+- **LSA (Load Store Access)** for local peers (same node/memory space)
+- **GIN (GPU-Initiated Networking)** for remote peers (different nodes)
+
+## What This Example Does
+
+1. **Creates hybrid device communicators** using `ncclDevCommCreate` with both LSA and GIN support for optimal peer communication
+2. **Registers symmetric memory windows** with `ncclCommWindowRegister` for both LSA direct access and GIN network operations  
+3. **Launches GPU kernel** that performs AlltoAll operations using LSA for local peers and GIN for remote peers
+4. **Demonstrates hybrid synchronization** coordinating both LSA barriers and GIN signals for correctness
+
+## Building and Running
+
+This example can be built with either pthreads (the default) or MPI to launch the ranks. To use MPI, set `MPI=1` at build time; optionally set `MPI_HOME` to the MPI installation whose `mpicxx` and `mpirun` should be used.
+
+### Build
+```bash
+make [MPI=1] [MPI_HOME=<path-to-mpi>] [NCCL_HOME=<path-to-nccl>] [CUDA_HOME=<path-to-cuda>]
+```
+
+### Run when compiled for pthreads (default)
+```bash
+[NTHREADS=N] ./gin_alltoall_hybrid_device_api
+```
+
+### Run when compiled for MPI
+```bash
+mpirun -np <num_processes> ./gin_alltoall_hybrid_device_api
+```
+
+## Code Walk-through
+
+### Device Communicator Creation (Host-side)
+The `ncclDevComm` is the core component enabling GPU kernels to perform both local and remote communication. For hybrid communication, we configure the device communicator with both LSA and GIN resources. The `ncclDevCommRequirements` specifies LSA barriers for local synchronization, GIN barriers for network synchronization, and GIN signals for completion detection. This dual setup enables optimal communication for each peer type.
+
+```cpp
+ncclDevComm devComm;
+ncclDevCommRequirements reqs;
+memset(&reqs, 0, sizeof(reqs));
+// LSA barriers enable direct memory access coordination for local peers
+reqs.lsaBarrierCount = NCCL_DEVICE_CTA_COUNT;
+// GIN barriers enable cross-node synchronization over the network  
+reqs.railGinBarrierCount = NCCL_DEVICE_CTA_COUNT;
+// GIN signals provide completion notifications for asynchronous network operations
+reqs.ginSignalCount = 1;
+
+// Create device communicator with hybrid LSA+GIN support
+NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm));
+```
+
+### Memory Window Registration (Host-side)
+The device API requires symmetric memory windows registered using `NCCL_WIN_COLL_SYMMETRIC`. These windows enable both LSA direct access for local peers and GIN network operations for remote peers. The same memory windows support both communication methods, with the kernel automatically selecting the appropriate access pattern based on peer locality.
+
+```cpp
+ncclWindow_t send_win;
+ncclWindow_t recv_win;
+
+// Register symmetric windows for both LSA and GIN access
+NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC));
+NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC));
+```
+
+### Hybrid Barriers (Device-side)
+Hybrid barriers coordinate both local LSA operations and remote GIN operations. The barrier session uses the world team and GIN context to ensure synchronization across all ranks, regardless of their communication method. This unified barrier approach ensures all peers reach the same synchronization point before proceeding with data exchange.
+
+```cpp
+// Hybrid barriers coordinate both LSA and GIN operations across all ranks
+ncclBarrierSession<ncclCoopCta> bar { 
+    ncclCoopCta(),              // Barrier scope: entire CTA (thread block)
+    ncclTeamTagWorld(),         // Team spanning all ranks (local + remote)
+    gin,                        // GIN context for network coordination
+    blockIdx.x                  // Barrier index: matches our CTA index
+};
+bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed);
+```
+
+### Peer Classification (Device-side)
+The hybrid kernel intelligently classifies peers into local (LSA-accessible) and remote (GIN-only) categories. This classification determines the optimal communication method for each peer. Local peers benefit from direct memory access, while remote peers use network communication.
+
+```cpp
+// Classify peers into local (LSA) and remote (GIN) categories
+ncclTeam world = ncclTeamWorld(devComm);  // All ranks
+ncclTeam lsa = ncclTeamLsa(devComm);      // Local ranks only
+const int startLsa = world.rank - lsa.rank;  // First local rank in world
+const int lsaSize = lsa.nRanks;              // Number of local peers
+```
+
+### Memory Access (Device-side)
+`ncclGetLsaPointer` allows CUDA kernels to directly access other GPUs' memory within the LSA team, while `gin.put` handles remote communication over the network. The hybrid approach uses the most efficient method for each peer type.
+
+```cpp
+// Handle local peers using direct memory access (LSA)
+T* sendLocal = (T*)ncclGetLocalPointer(sendwin, sendoffset);
+T* recvPtr = (T*)ncclGetLsaPointer(recvwin, recvoffset, lp);
+
+// Handle remote peers using network operations (GIN)
+gin.put(world, r, recvwin, recvoffset + world.rank * size,
+        sendwin, sendoffset + r * size, size, ncclGin_SignalInc{signalIndex});
+```
+
+## Build, Run, and Test Commands
+
+### Build
+```bash
+make
+```
+
+### Run with pthread mode (default)
+```bash
+# Run with all available GPUs
+./gin_alltoall_hybrid_device_api
+
+# Run with specific number of GPUs
+NTHREADS=4 ./gin_alltoall_hybrid_device_api
+```
+
+### Run with MPI mode
+```bash
+# Build with MPI support
+make MPI=1
+
+# Run with MPI across multiple nodes
+mpirun -np 4 --hostfile hosts ./gin_alltoall_hybrid_device_api
+```
+
+### Test
+```bash
+make test
+```
+
+## Expected Output
+
+```
+Starting Hybrid AlltoAll initialization
+  Rank 0 using GPU device 0
+  Rank 1 using GPU device 1
+  Rank 2 using GPU device 2
+  Rank 3 using GPU device 3
+  Rank 0 initialized NCCL communicator for 4 total ranks
+  Rank 1 initialized NCCL communicator for 4 total ranks
+  Rank 2 initialized NCCL communicator for 4 total ranks
+  Rank 3 initialized NCCL communicator for 4 total ranks
+  Rank 0 initialized send data
+  Rank 1 initialized send data
+  Rank 2 initialized send data
+  Rank 3 initialized send data
+  Rank 0 created device communicator with hybrid support
+  Rank 1 created device communicator with hybrid support
+  Rank 2 created device communicator with hybrid support
+  Rank 3 created device communicator with hybrid support
+Starting Hybrid AlltoAll with 1024 elements per rank (4096 total elements, 0 MB)
+Using LSA for local peers and GIN for remote peers
+
+=== Executing Hybrid AlltoAll ===
+  Rank 0 completed hybrid AlltoAll kernel
+  Rank 1 completed hybrid AlltoAll kernel
+  Rank 2 completed hybrid AlltoAll kernel
+  Rank 3 completed hybrid AlltoAll kernel
+Hybrid AlltoAll result: PASSED
+✓ All 4096 elements correctly exchanged using hybrid communication
+```
+
+## When to Use
+
+- **Multi-node training**: Mixed local/remote communication patterns
+- **Large-scale inference**: Optimized for various topologies
+- **Production workloads**: Where performance is critical
+- **Heterogeneous clusters**: Different node configurations
+
+## Performance Considerations
+
+**Advantages:**
+- **Reduced Latency**: LSA provides ultra-low latency for local communication
+- **Optimal Bandwidth**: GIN efficiently handles remote communication
+- **Reduced Network Load**: Local traffic stays off the network
+- **Scalable Design**: Efficient across different node configurations
+
+**Disadvantages:**
+- More complex programming model requiring coordination of both LSA and GIN
+- Requires careful synchronization between different communication methods
+- Higher development complexity compared to pure approaches
+
+## Common Issues and Solutions
+
+### Issue: LSA barriers not supported
+**Cause:** GPUs not connected through NVLink or PCIe for direct memory access
+**Solution:** Verify GPU topology with `nvidia-smi topo -m` and ensure proper LSA-capable connections
+
+### Issue: Hybrid synchronization failures
+**Solution:** Ensure both `lsaBarrierCount` and `railGinBarrierCount` match the number of thread blocks in kernel launch configuration
+
+### Issue: Peer classification errors
+**Solution:** Verify LSA team setup and ensure symmetric memory allocation is properly configured for all ranks
+
+### Issue: Mixed communication performance issues
+**Solution:** Profile LSA vs GIN usage patterns and optimize barrier configurations for your specific topology
+
+## Performance Notes
+
+- These are educational examples, not optimized for performance
+- Real implementations should consider:
+  - Optimal balance between LSA and GIN operations based on topology
+  - Memory coalescing patterns for both LSA and GIN operations
+  - Barrier synchronization overhead minimization
+  - Signal pool management for high-throughput GIN scenarios
+
+## Error Handling
+
+The host code wraps CUDA and NCCL API calls in the `CUDACHECK` and `NCCLCHECK` macros. The device kernel itself performs no error checking; production kernels should implement proper error handling for both direct memory access patterns and network operations.
+
+## Next Steps
+
+After understanding this example, explore:
+- **Topology-aware optimization**: Fine-tune LSA/GIN balance based on hardware topology
+- **Custom hybrid patterns**: Implement specialized communication strategies
+- **Performance profiling**: Analyze LSA vs GIN performance characteristics
+- **Advanced synchronization**: Optimize barrier usage for complex communication patterns
diff --git a/examples/06_device_api/03_gin_alltoall_hybrid/main.cu b/examples/06_device_api/03_gin_alltoall_hybrid/main.cu
new file mode 100644
index 00000000000..d1201654957
--- /dev/null
+++ b/examples/06_device_api/03_gin_alltoall_hybrid/main.cu
@@ -0,0 +1,278 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "cuda_runtime.h"
+#include "nccl.h"
+#include "nccl_device.h"
+#include "utils.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <unistd.h>
+ 
+/*
+ * NCCL Device API Hybrid AlltoAll Example
+ *
+ * This example demonstrates NCCL's hybrid communication approach that combines
+ * GPU-Initiated Networking (GIN) for remote peers with Load Store Access (LSA)
+ * for local peers, optimizing AlltoAll collective operations.
+ *
+ * Learning Objectives:
+ * - Understand hybrid communication optimization
+ * - Learn when to use GIN vs LSA for different peer types
+ * - Practice combining network and memory-based communication
+ * - See performance optimization through intelligent peer selection
+ *
+ * Key Hybrid Concepts:
+ * - **LSA (Load Store Access)**: Direct memory access for local peers
+ * - **GIN (GPU-Initiated Networking)**: Network communication for remote peers
+ * - **Peer classification**: Distinguishing between local and remote peers
+ * - **Hybrid synchronization**: Combining LSA and GIN completion mechanisms
+ * - **Performance optimization**: Using the fastest method for each peer type
+ *
+ * When to Use Hybrid:
+ * - Multi-node environments with both local and remote peers
+ * - Performance-critical applications requiring optimal communication
+ * - Mixed communication patterns (intra-node + inter-node)
+ * - Production workloads where efficiency matters
+ *
+ * Performance Benefits:
+ * - LSA provides low-latency local communication
+ * - GIN handles remote communication efficiently
+ * - Reduced network traffic for local operations
+ * - Optimal bandwidth utilization across communication types
+ */
+ 
+// Device API kernel launch configuration
+// CTA count must match the lsaBarrierCount and railGinBarrierCount requested on the host (one barrier per CTA)
+#define NCCL_DEVICE_CTA_COUNT 16
+#define NCCL_DEVICE_THREADS_PER_CTA 512
+ 
+// ==========================================================================
+// Device Kernel Implementation
+// ==========================================================================
+
+// Hybrid AlltoAll kernel: direct LSA stores for ranks in the LSA team, gin.put for every other rank.
+// sendwin/recvwin each hold one block of `count` elements of T per world rank; `root` is unused.
+template <typename T>
+__global__ void HybridAlltoAllKernel(ncclWindow_t sendwin, size_t sendoffset, 
+                                      ncclWindow_t recvwin, size_t recvoffset, 
+                                      size_t count, int root, struct ncclDevComm devComm) {
+  int ginContext = 0;
+  unsigned int signalIndex = 0;  // Single signal shared by every put issued below
+  ncclGin gin { devComm, ginContext };
+  uint64_t signalValue = gin.readSignal(signalIndex);  // Baseline for the completion wait; NOTE(review): sampled per CTA before the barrier - confirm no peer put can land first
+
+  // Per-CTA barrier session (index blockIdx.x) over the world team; synchronize before exchanging data
+  ncclBarrierSession<ncclCoopCta> bar { ncclCoopCta(), ncclTeamTagWorld(), gin, blockIdx.x };
+  bar.sync(ncclCoopCta(), cuda::memory_order_relaxed, ncclGinFenceLevel::Relaxed);
+
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;  // Global thread index across the grid
+  int nthreads = blockDim.x * gridDim.x;  // Total number of threads in the grid
+
+  ncclTeam world = ncclTeamWorld(devComm);
+  ncclTeam lsa = ncclTeamLsa(devComm);
+  const int startLsa = world.rank - lsa.rank;  // World rank of LSA rank 0 (assumes LSA ranks are contiguous in world order - TODO confirm)
+  const int lsaSize = lsa.nRanks;
+
+  // Handle remote peers (world ranks outside [startLsa, startLsa + lsaSize)) with one GIN put per peer
+  const size_t size = count * sizeof(T);  // Bytes exchanged with each peer
+  for (int r = tid; r < startLsa; r += nthreads) {
+    gin.put(world, r,
+        recvwin, recvoffset + world.rank * size,
+        sendwin, sendoffset + r * size,
+        size, ncclGin_SignalInc{signalIndex});
+  }
+  for (int r = startLsa + lsaSize + tid; r < world.nRanks; r += nthreads) {
+    gin.put(world, r,
+        recvwin, recvoffset + world.rank * size,
+        sendwin, sendoffset + r * size,
+        size, ncclGin_SignalInc{signalIndex});
+  }
+
+  // Handle local peers with LSA stores: every LSA rank is written (the loop does not skip this rank's own slot)
+  T* sendLocal = (T*)ncclGetLocalPointer(sendwin, sendoffset);
+  for (size_t offset = tid; offset < count; offset += nthreads) {
+    for (int lp = 0; lp < lsa.nRanks; lp++) {
+      int wr = startLsa + lp;  // World rank of LSA peer lp; selects the block of our send buffer destined for it
+      T* recvPtr = (T*)ncclGetLsaPointer(recvwin, recvoffset, lp);
+      recvPtr[world.rank * count + offset] = sendLocal[wr * count + offset];
+    }
+  }
+
+  // Wait until the signal has advanced by one per non-LSA peer (presumably one increment per incoming put - confirm)
+  int numRemotePeers = world.nRanks - lsa.nRanks;
+  gin.waitSignal(ncclCoopCta(), signalIndex, signalValue + numRemotePeers);
+  gin.flush(ncclCoopCta());
+
+  // Final synchronization barrier
+  bar.sync(ncclCoopCta(), cuda::memory_order_release, ncclGinFenceLevel::Relaxed);
+}
+ 
+// ==========================================================================
+// Host-Side Setup and Device API Initialization
+// ==========================================================================
+// hybridAlltoAll: per-rank body of the example, invoked through run_example().
+//   my_rank / total_ranks: this rank's index and the communicator size.
+//   local_device: CUDA device selected for this rank; devices_per_rank is unused here.
+// Always returns NULL; the verification outcome is only printed (summary on rank 0).
+void* hybridAlltoAll(int my_rank, int total_ranks, int local_device, int devices_per_rank) {
+  ncclComm_t comm;
+  ncclUniqueId nccl_unique_id;
+
+  // Rank 0 announces the run and generates the unique ID used to bootstrap the communicator
+  if (my_rank == 0) {
+    printf("Starting Hybrid AlltoAll initialization\n");
+    NCCLCHECK(ncclGetUniqueId(&nccl_unique_id));
+  }
+
+  // Distribute unique ID
+  util_broadcast(0, my_rank, &nccl_unique_id);
+
+  // Set device context for this rank
+  CUDACHECK(cudaSetDevice(local_device));
+  printf("  Rank %d using GPU device %d\n", my_rank, local_device);
+
+  // ==========================================================================
+  // STEP 2: Initialize NCCL Communicator and Allocate Memory
+  // ==========================================================================
+
+  NCCLCHECK(ncclCommInitRank(&comm, total_ranks, nccl_unique_id, my_rank));
+  printf("  Rank %d initialized NCCL communicator for %d total ranks\n", my_rank, total_ranks);
+
+  // Allocate memory for AlltoAll operation
+  size_t count = 1024; // Elements per rank
+  size_t total_elements = count * total_ranks;
+  size_t size_bytes = total_elements * sizeof(float);
+
+  float *h_sendbuff = (float*)malloc(size_bytes);
+  float *h_recvbuff = (float*)malloc(size_bytes);
+  if (h_sendbuff == NULL || h_recvbuff == NULL) {
+    fprintf(stderr, "  Rank %d failed to allocate %zu bytes of host memory\n", my_rank, size_bytes);
+    exit(EXIT_FAILURE);
+  }
+  void* d_sendbuff;
+  void* d_recvbuff;
+  ncclWindow_t send_win;
+  ncclWindow_t recv_win;
+
+  // Device API requires symmetric memory allocation
+  NCCLCHECK(ncclMemAlloc(&d_sendbuff, size_bytes));
+  NCCLCHECK(ncclMemAlloc(&d_recvbuff, size_bytes));
+
+  // ==========================================================================
+  // STEP 3: Register Memory Windows for Device-Side Access
+  // ==========================================================================
+
+  // Register symmetric windows for both LSA and GIN access
+  NCCLCHECK(ncclCommWindowRegister(comm, d_sendbuff, size_bytes, &send_win, NCCL_WIN_COLL_SYMMETRIC));
+  NCCLCHECK(ncclCommWindowRegister(comm, d_recvbuff, size_bytes, &recv_win, NCCL_WIN_COLL_SYMMETRIC));
+
+  // Initialize data: each rank sends unique values to each destination
+  for (size_t i = 0; i < total_elements; i++) {
+    int dest_rank = i / count;
+    int element_idx = i % count;
+    h_sendbuff[i] = (float)(my_rank * 1000 + dest_rank * 100 + element_idx);
+  }
+  CUDACHECK(cudaMemcpy(d_sendbuff, h_sendbuff, size_bytes, cudaMemcpyHostToDevice));
+  printf("  Rank %d initialized send data\n", my_rank);
+
+  // ==========================================================================
+  // STEP 4: Create Device Communicator with Hybrid Support
+  // ==========================================================================
+
+  cudaStream_t stream;
+  CUDACHECK(cudaStreamCreate(&stream));
+
+  // Create device communicator with both LSA and GIN support
+  ncclDevComm devComm;
+  ncclDevCommRequirements reqs;
+  memset(&reqs, 0, sizeof(reqs));
+  reqs.lsaBarrierCount = NCCL_DEVICE_CTA_COUNT;  // LSA barriers for local synchronization
+  reqs.railGinBarrierCount = NCCL_DEVICE_CTA_COUNT;  // GIN barriers for network synchronization
+  reqs.ginSignalCount = 1;  // GIN signals for completion detection
+  NCCLCHECK(ncclDevCommCreate(comm, &reqs, &devComm));
+  printf("  Rank %d created device communicator with hybrid support\n", my_rank);
+
+  if (my_rank == 0) {
+    printf("Starting Hybrid AlltoAll with %zu elements per rank (%zu total elements, %zu MB)\n",
+           count, total_elements, size_bytes / (1024 * 1024));
+    printf("Using LSA for local peers and GIN for remote peers\n");
+  }
+
+  // ==========================================================================
+  // STEP 5: Execute Hybrid AlltoAll Kernel
+  // ==========================================================================
+
+  if (my_rank == 0) {
+    printf("\n=== Executing Hybrid AlltoAll ===\n");
+  }
+
+  // Clear receive buffer
+  CUDACHECK(cudaMemset(d_recvbuff, 0, size_bytes));
+
+  // Launch hybrid AlltoAll kernel
+  HybridAlltoAllKernel<float><<<NCCL_DEVICE_CTA_COUNT, NCCL_DEVICE_THREADS_PER_CTA, 0, stream>>>(
+      send_win, 0, recv_win, 0, count, 0, devComm);
+  CUDACHECK(cudaGetLastError());  // Report launch failures (e.g. invalid configuration) right away
+  // Wait for completion
+  CUDACHECK(cudaStreamSynchronize(stream));
+  printf("  Rank %d completed hybrid AlltoAll kernel\n", my_rank);
+
+  // ==========================================================================
+  // STEP 6: Verify Results
+  // ==========================================================================
+
+  CUDACHECK(cudaMemcpy(h_recvbuff, d_recvbuff, size_bytes, cudaMemcpyDeviceToHost));
+  bool hybrid_success = true;
+  for (int src_rank = 0; src_rank < total_ranks; src_rank++) {
+    for (size_t i = 0; i < count; i++) {
+      size_t recv_idx = src_rank * count + i;
+      float expected = (float)(src_rank * 1000 + my_rank * 100 + i);
+      if (h_recvbuff[recv_idx] != expected) {
+        hybrid_success = false;
+        printf("  Rank %d: Hybrid mismatch at [%d][%zu]: got %.0f, expected %.0f\n", 
+               my_rank, src_rank, i, h_recvbuff[recv_idx], expected);
+        break;
+      }
+    }
+    if (!hybrid_success) break;
+  }
+
+  if (my_rank == 0) {
+    printf("Hybrid AlltoAll result: %s\n", hybrid_success ? "PASSED" : "FAILED");
+    if (hybrid_success) {
+      printf("✓ All %zu elements correctly exchanged using hybrid communication\n", total_elements);
+    }
+  }
+
+  // ==========================================================================
+  // STEP 7: Cleanup Resources
+  // ==========================================================================
+  // Cleanup host memory
+  free(h_sendbuff);
+  free(h_recvbuff);
+
+  // Device API specific cleanup
+  NCCLCHECK(ncclDevCommDestroy(comm, &devComm));
+  NCCLCHECK(ncclCommWindowDeregister(comm, send_win));
+  NCCLCHECK(ncclCommWindowDeregister(comm, recv_win));
+  NCCLCHECK(ncclMemFree(d_sendbuff));
+  NCCLCHECK(ncclMemFree(d_recvbuff));
+
+  // Standard NCCL cleanup
+  CUDACHECK(cudaStreamDestroy(stream));
+  NCCLCHECK(ncclCommFinalize(comm));
+  NCCLCHECK(ncclCommDestroy(comm));
+
+  return NULL;
+}
+ 
+int main(int argc, char* argv[]) {
+  // Run example using the provided utility framework
+  return run_example(argc, argv, hybridAlltoAll);
+}
diff --git a/projects/rccl/Makefile b/projects/rccl/Makefile
index 458a507415b..2b1a57c5a53 100644
--- a/projects/rccl/Makefile
+++ b/projects/rccl/Makefile
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -11,6 +11,7 @@ BUILDDIR ?= $(abspath ./build)
 ABSBUILDDIR := $(abspath $(BUILDDIR))
 TARGETS := src pkg
 clean: ${TARGETS:%=%.clean}
+examples.build: src.build
 LICENSE_FILES := LICENSE.txt
 LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
 lic: $(LICENSE_TARGETS)
@@ -23,6 +24,9 @@ ${BUILDDIR}/%.txt: %.txt
 src.%:
 	${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
 
+examples: src.build
+	${MAKE} -C examples NCCL_HOME=${ABSBUILDDIR}
+
 pkg.%:
 	${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}
 
diff --git a/projects/rccl/cmake/rocmIb.cmake b/projects/rccl/cmake/rocmIb.cmake
index f6566778fc5..4842b6fe932 100644
--- a/projects/rccl/cmake/rocmIb.cmake
+++ b/projects/rccl/cmake/rocmIb.cmake
@@ -264,6 +264,13 @@ execute_process(
   COMMAND bash -c "sed -i 's/ncclIbSetNetAttr/rocmNetIbSetNetAttr/g' ${ROCM_NETIB_FILE}"
   WORKING_DIRECTORY ${RCCL_SRC_DIR}
 )
+# Rename GIN functions to avoid duplicate symbols with net_ib.cc
+# Note: We rename ncclGinIb* to rocmGinIb*, then restore the struct name
+# since ncclGinIbCollComm is defined in net_ib_gin.h (not renamed)
+execute_process(
+  COMMAND bash -c "sed -i -e 's/ncclGinIb/rocmGinIb/g' -e 's/rocmGinIbCollComm/ncclGinIbCollComm/g' ${ROCM_NETIB_FILE}"
+  WORKING_DIRECTORY ${RCCL_SRC_DIR}
+)
 execute_process(
   COMMAND bash -c "sed -i 's/cuMemGetHandleForAddressRange/hipMemGetHandleForAddressRange/g' ${ROCM_NETIB_FILE}"
   WORKING_DIRECTORY ${RCCL_SRC_DIR}
diff --git a/projects/rccl/ext-src/rocm_netib.patch b/projects/rccl/ext-src/rocm_netib.patch
index 882c8ecf369..ef61c811de2 100644
--- a/projects/rccl/ext-src/rocm_netib.patch
+++ b/projects/rccl/ext-src/rocm_netib.patch
@@ -172,10 +172,11 @@
    struct ibv_sge sges[NCCL_NET_IB_MAX_RECVS];
    struct ibv_send_wr wrs[NCCL_NET_IB_MAX_RECVS + 1];
    // Each dev correlates to a mergedIbDev
-@@ -1370,6 +1457,7 @@
+@@ -1370,7 +1457,8 @@
    struct ncclIbRemSizesFifo remSizesFifo;
    uint64_t fifoHead;
    int ar; // Use adaptive routing when all merged devices have it enabled
+   uint64_t putSignalScratchpad;
 +  bool useCtsOffload;
  };
  // The SendFifo needs to be 32-byte aligned and each element needs
@@ -393,8 +394,8 @@
      // Local ibDevN
      ibDevN = rComm->devs[devIndex].base.ibDevN;
      ibDev = ncclIbDevs + ibDevN;
--    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail);
-+    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp, channel_id, false, q, remMeta.isP2p), ret, fail);
+-    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC, &rComm->base.stats, qp), ret, fail);
++    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC, &rComm->base.stats, qp, channel_id, false, q, remMeta.isP2p), ret, fail);
      qp->devIndex = devIndex;
      devIndex = (devIndex + 1) % rComm->base.vProps.ndevs;
  
diff --git a/projects/rccl/ext-tuner/example/plugin.c b/projects/rccl/ext-tuner/example/plugin.c
index 5e4ca9e4bae..9eba0f55df2 100644
--- a/projects/rccl/ext-tuner/example/plugin.c
+++ b/projects/rccl/ext-tuner/example/plugin.c
@@ -307,7 +307,7 @@ __hidden ncclResult_t pluginInit(void** context, uint64_t commId, size_t nRanks,
     // Set Ring/Simple base network latency to 280
     constants->hwLatencies[NCCL_HW_NET][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = 280.0;
   }
-  
+
   TunerContext* ctx = (TunerContext*)malloc(sizeof(TunerContext));
   if (!ctx) return ncclSystemError;
 
diff --git a/projects/rccl/ext-tuner/example/test/test_plugin.c b/projects/rccl/ext-tuner/example/test/test_plugin.c
index a74386731e5..746cb8ff782 100644
--- a/projects/rccl/ext-tuner/example/test/test_plugin.c
+++ b/projects/rccl/ext-tuner/example/test/test_plugin.c
@@ -744,16 +744,16 @@ int test_nvl_domain_info() {
     .minRanksPerNvlDomain = 3, // minimum ranks across all domains (bottleneck)
     .maxRanksPerNvlDomain = 5  // maximum ranks across all domains (capacity)
   };
-  
+
   void* context = NULL;
   ncclResult_t result = pluginInit(&context, 0, 8, 2, mock_logger, &nvl_domain, NULL);
   TEST_ASSERT(result == ncclSuccess, "Plugin init with NVLink domains should succeed");
-  
+
   // Validate NVLD info structure
   TEST_ASSERT(nvl_domain.nNvlDomains == 2, "Should have 2 domains (nodes)");
   TEST_ASSERT(nvl_domain.minRanksPerNvlDomain == 3, "Should have minimum 3 ranks per domain");
   TEST_ASSERT(nvl_domain.maxRanksPerNvlDomain == 5, "Should have maximum 5 ranks per domain");
-  
+
   // Clean up
   pluginFinalize(context);
   printf("NVLink domain info test passed!\n");
diff --git a/projects/rccl/makefiles/common.mk b/projects/rccl/makefiles/common.mk
index f8f455dec66..2b1d1c4b383 100644
--- a/projects/rccl/makefiles/common.mk
+++ b/projects/rccl/makefiles/common.mk
@@ -20,7 +20,7 @@ NET_PROFILER ?= 0
 MLX5DV ?= 0
 MAX_EXT_NET_PLUGINS ?= 0
 
-NVCC = $(CUDA_HOME)/bin/nvcc
+NVCC ?= $(CUDA_HOME)/bin/nvcc
 
 CUDA_LIB ?= $(CUDA_HOME)/lib64
 CUDA_INC ?= $(CUDA_HOME)/include
@@ -85,6 +85,8 @@ NVCUFLAGS  := -ccbin $(CXX) $(NVCC_GENCODE) $(CXXSTD) --expt-extended-lambda -Xp
 # Use addprefix so that we can specify more than one path
 NVLDFLAGS  := -L${CUDA_LIB} -lcudart -lrt
 
+NVCUFLAGS_SYM :=
+
 ########## GCOV ##########
 GCOV ?= 0 # disable by default.
 GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # only gcov=1 and debug =1
@@ -158,3 +160,8 @@ endif
 ifneq ($(MAX_EXT_NET_PLUGINS), 0)
 CXXFLAGS += -DNCCL_NET_MAX_PLUGINS=$(MAX_EXT_NET_PLUGINS)
 endif
+
+CXXFLAGS += -DDOCA_VERBS_USE_CUDA_WRAPPER -DDOCA_VERBS_USE_NET_WRAPPER
+NVCUFLAGS += -DDOCA_VERBS_USE_CUDA_WRAPPER -DDOCA_VERBS_USE_NET_WRAPPER
+
+CXXFLAGS += -DNCCL_GIN_PROXY_ENABLE=1
diff --git a/projects/rccl/makefiles/examples.mk b/projects/rccl/makefiles/examples.mk
new file mode 100644
index 00000000000..6f3a520f3d0
--- /dev/null
+++ b/projects/rccl/makefiles/examples.mk
@@ -0,0 +1,31 @@
+#
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+# Add NCCL header/library paths only when NCCL_HOME is set (make compares text literally, so "" never equals an empty value)
+ifneq ($(strip $(NCCL_HOME)),)
+NVCUFLAGS += -I$(NCCL_HOME)/include/
+NVLDFLAGS += -L$(NCCL_HOME)/lib
+endif
+
+# Build configuration
+INCLUDES = -I$(CUDA_HOME)/include -I$(NCCL_HOME)/include
+LIBRARIES = -L$(CUDA_HOME)/lib64 -L$(NCCL_HOME)/lib
+LDFLAGS = -lcudart -lnccl -Wl,-rpath,$(NCCL_HOME)/lib
+
+
+# MPI configuration
+ifeq ($(MPI), 1)
+
+ifdef MPI_HOME
+MPICXX ?= $(MPI_HOME)/bin/mpicxx
+MPIRUN ?= $(MPI_HOME)/bin/mpirun
+else
+MPICXX ?= mpicxx
+MPIRUN ?= mpirun
+endif
+
+CXXFLAGS += -DMPI_SUPPORT
+endif
diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk
index d0e97c06576..08c9dc78071 100644
--- a/projects/rccl/makefiles/version.mk
+++ b/projects/rccl/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR   := 2
 NCCL_MINOR   := 28
-NCCL_PATCH   := 3
+NCCL_PATCH   := 9
 NCCL_SUFFIX  :=
 PKG_REVISION := 1
diff --git a/projects/rccl/src/CMakeLists.txt b/projects/rccl/src/CMakeLists.txt
index bf423a5c80c..9f0d41d4123 100644
--- a/projects/rccl/src/CMakeLists.txt
+++ b/projects/rccl/src/CMakeLists.txt
@@ -94,11 +94,14 @@ set(SRC_FILES
   include/debug.h
   include/dev_runtime.h
   include/device.h
+  include/env.h
   include/enqueue.h
   include/gdrwrap.h
   include/git_version.h
   include/graph.h
   include/group.h
+  include/gin/gin_host.h
+  include/gin/gin_host_proxy.h
   include/hip_rocm_version_info.h
   include/ibvcore.h
   include/ibvsymbols.h
@@ -108,7 +111,7 @@ set(SRC_FILES
   include/mnnvl.h
   include/nccl_common.h
   include/nccl_device.h
-  include/net_device.h
+  include/nccl_device/net_device.h
   include/net.h
   include/net_ib_cast_inspect.h
   include/nvmlwrap.h
@@ -149,8 +152,18 @@ set(SRC_FILES
   include/nccl_device/comm.h
   include/nccl_device/coop.h
   include/nccl_device/core.h
+  include/nccl_device/barrier.h
+  include/nccl_device/gin.h
+  include/nccl_device/gin_barrier.h
+  include/nccl_device/gin/gdaki/gin_gdaki.h
+  include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h
+  include/nccl_device/gin/gin_device_api.h
+  include/nccl_device/gin/gin_device_common.h
+  include/nccl_device/gin/gin_device_host_common.h
+  include/nccl_device/gin/proxy/gin_proxy.h
+  include/nccl_device/gin/proxy/gin_proxy_device_host_common.h
   include/nccl_device/ll_a2a.h
-  include/nccl_device/mem_barrier.h
+  include/nccl_device/lsa_barrier.h
   include/nccl_device/ptr.h
   include/nccl_device/rccl_ptr.h
   include/nccl_device/utility.h
@@ -158,10 +171,16 @@ set(SRC_FILES
   include/nccl_device/impl/comm__types.h
   include/nccl_device/impl/core__funcs.h
   include/nccl_device/impl/core__types.h
+  include/nccl_device/impl/barrier__funcs.h
+  include/nccl_device/impl/barrier__types.h
+  include/nccl_device/impl/gin__funcs.h
+  include/nccl_device/impl/gin__types.h
+  include/nccl_device/impl/gin_barrier__funcs.h
+  include/nccl_device/impl/gin_barrier__types.h
   include/nccl_device/impl/ll_a2a__funcs.h
   include/nccl_device/impl/ll_a2a__types.h
-  include/nccl_device/impl/mem_barrier__funcs.h
-  include/nccl_device/impl/mem_barrier__types.h
+  include/nccl_device/impl/lsa_barrier__funcs.h
+  include/nccl_device/impl/lsa_barrier__types.h
   include/nccl_device/impl/ptr__funcs.h
   include/nccl_device/impl/ptr__types.h
   include/npkit/npkit.h
@@ -205,6 +224,8 @@ set(SRC_FILES
   include/plugin/nccl_net.h
   include/plugin/nccl_profiler.h
   include/plugin/nccl_tuner.h
+  include/plugin/nccl_env.h
+  include/plugin/env/env_v1.h
   include/plugin/plugin.h
   include/plugin/net/net_v6.h
   include/plugin/net/net_v7.h
@@ -253,9 +274,12 @@ set(SRC_FILES
   misc/utils.cc
   misc/proxy_trace/proxy_trace.cc
   nccl_device/core.cc
+  nccl_device/gin_barrier.cc
   nccl_device/ll_a2a.cc
-  nccl_device/mem_barrier.cc
+  nccl_device/lsa_barrier.cc
   plugin/net.cc
+  plugin/env.cc
+  plugin/env/env_v1.cc
   plugin/plugin_open.cc
   plugin/profiler.cc
   plugin/tuner.cc
@@ -291,13 +315,15 @@ set(SRC_FILES
   transport/net.cc
   transport/net_ib.cc
   transport/net_ib_cast.cc
-  # net_ib_rocm.cc is generated by rocmIb.cmake directly into the hipify staging area
-  # so it is not listed here - it's added to HIP_SOURCES separately below
+  transport/net_ib_gin.h
   transport/net_socket.cc
   transport/nvls.cc
   transport/p2p.cc
   transport/profiler.cc
   transport/shm.cc
+  gin/gin_host.cc
+  gin/gin_host_proxy.cc
+  transport/gdaki/gin_host_gdaki.h
   include/latency_profiler/CollTrace.h
   include/latency_profiler/CollTraceEvent.h
   include/latency_profiler/CollTraceFunc.h
@@ -321,6 +347,8 @@ if(USE_AMDSMI)
   )
 else()
   set(SMI_SOURCES
+    src/include/amdsmi_wrap.h
+    src/misc/amdsmi_wrap.cc
     src/include/rocm_smi_wrap.h
     src/misc/rocm_smi_wrap.cc
   )
@@ -384,6 +412,18 @@ foreach(SRC_FILE ${SRC_FILES})
 endforeach()
 
 set(NCCL_DEVICE_HEADER "${PROJECT_BINARY_DIR}/include/rccl/nccl_device.h")
+# Copy hip_compat.h into the hipify staging tree unmodified (no hipification — it contains both CUDA and HIP paths)
+set(HIP_COMPAT_SRC "${RCCL_SOURCE_DIR}/src/include/nccl_device/hip_compat.h")
+set(HIP_COMPAT_DST "${HIPIFY_DIR}/src/include/nccl_device/hip_compat.h")
+add_custom_command(
+  OUTPUT "${HIP_COMPAT_DST}"
+  COMMAND "${CMAKE_COMMAND}" -E copy "${HIP_COMPAT_SRC}" "${HIP_COMPAT_DST}"
+  MAIN_DEPENDENCY "${HIP_COMPAT_SRC}"
+  COMMENT "Copying hip_compat.h (no hipification)" VERBATIM  # VERBATIM: platform-independent argument escaping
+)
+list(APPEND HIP_SOURCES "${HIP_COMPAT_DST}")
+list(APPEND NCCL_DEVICE_HIP_FILES "${HIP_COMPAT_DST}")
+
 add_custom_command(
   OUTPUT ${NCCL_DEVICE_HEADER}
   COMMAND ${CMAKE_COMMAND} -E make_directory "${PROJECT_BINARY_DIR}/include"
@@ -1011,6 +1051,10 @@ if (HAVE_KERNARG_PRELOAD AND NOT ENABLE_DEVICE_LINKER)
   target_link_options(rccl PRIVATE "SHELL:-Xoffload-linker -mllvm=-amdgpu-kernarg-preload-count=16")
 endif()
 
+if(ENABLE_MSCCLPP)  # cmake/MSCCLPP.cmake is only pulled in when this option is set
+  include(${RCCL_SOURCE_DIR}/cmake/MSCCLPP.cmake)
+endif()
+
 ## Track linking time
 set_property(TARGET rccl PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time")
 
diff --git a/projects/rccl/src/Makefile b/projects/rccl/src/Makefile
index be026cc2670..471a0335ef7 100644
--- a/projects/rccl/src/Makefile
+++ b/projects/rccl/src/Makefile
@@ -8,7 +8,7 @@ include ../makefiles/version.mk
 
 ##### src files
 INCEXPORTS  := nccl.h nccl_device.h \
-	$(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/impl/*.h))
+	$(patsubst include/%,%,$(wildcard include/nccl_device/*.h include/nccl_device/*/*.h include/nccl_device/*/*/*.h))
 
 LIBSRCFILES := \
 	bootstrap.cc channel.cc collectives.cc debug.cc enqueue.cc group.cc \
@@ -16,13 +16,16 @@ LIBSRCFILES := \
 	$(wildcard graph/*.cc) \
 	$(wildcard misc/*.cc) \
 	$(wildcard transport/*.cc) \
+	$(wildcard transport/gdaki/*.cc) \
 	$(wildcard register/*.cc) \
 	$(wildcard plugin/*.cc) \
 	$(wildcard plugin/net/*.cc) \
 	$(wildcard plugin/tuner/*.cc) \
 	$(wildcard plugin/profiler/*.cc) \
+	$(wildcard plugin/env/*.cc) \
 	$(wildcard nccl_device/*.cc) \
 	$(wildcard scheduler/*.cc) \
+	$(wildcard gin/*.cc) \
 	$(filter-out ras/client.cc,$(wildcard ras/*.cc))
 BINSRCFILES := ras/client.cc
 
@@ -40,6 +43,7 @@ LIBDIR := $(BUILDDIR)/lib
 OBJDIR := $(BUILDDIR)/obj
 PKGDIR := $(BUILDDIR)/lib/pkgconfig
 BINDIR := $(BUILDDIR)/bin
+
 ##### target files
 CUDARTLIB  ?= cudart_static
 
@@ -61,6 +65,17 @@ INCPLUGIN  := include/plugin
 
 DEVMANIFEST := $(BUILDDIR)/obj/device/manifest
 
+# DOCA GPUNetIO definitions
+DOCA_HOME        ?= transport/gdaki/doca-gpunetio
+DOCA_INC_INSTALL := $(INCDIR)/nccl_device/gin/gdaki/doca_gpunetio
+DOCA_OBJDIR      := $(OBJDIR)/transport/gdaki/doca-gpunetio
+DOCA_INCLUDES    := $(DOCA_HOME)/include/doca_gpunetio_device.h $(wildcard $(DOCA_HOME)/include/common/*.h) $(wildcard $(DOCA_HOME)/include/device/*.cuh)
+DOCA_INCTARGETS  := $(DOCA_INCLUDES:$(DOCA_HOME)/include/%=$(DOCA_INC_INSTALL)/%)
+INCTARGETS       += $(DOCA_INCTARGETS)
+DOCA_LIBSRC      := doca_verbs_qp.cpp doca_verbs_cq.cpp doca_verbs_device_attr.cpp doca_verbs_umem.cpp doca_verbs_srq.cpp doca_verbs_uar.cpp doca_gpunetio.cpp doca_gpunetio_log.cpp doca_gpunetio_high_level.cpp doca_verbs_cuda_wrapper.cpp doca_verbs_mlx5dv_wrapper.cpp doca_verbs_ibv_wrapper.cpp doca_gpunetio_gdrcopy.cpp
+DOCA_LIBOBJ      := $(DOCA_LIBSRC:%.cpp=$(DOCA_OBJDIR)/%.o)
+LIBOBJ           += $(DOCA_LIBOBJ)
+
 ##### rules
 build : lib staticlib binary
 
@@ -94,7 +109,7 @@ $(INCDIR)/nccl.h : nccl.h.in ../makefiles/version.mk
 $(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVMANIFEST)
 	@printf "Linking    %-35s > %s\n" $(LIBTARGET) $@
 	mkdir -p $(LIBDIR)
-	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $$(cat $(DEVMANIFEST)) $(LDFLAGS) -Wl,--version-script=libnccl.map
 	ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME)
 	ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME)
 
@@ -137,6 +152,36 @@ $(INCDIR)/nccl_device/impl/%.h: include/nccl_device/impl/%.h
 	mkdir -p $(INCDIR)/nccl_device/impl
 	install -m 644 $< $@
 
+$(INCDIR)/nccl_device/gin/%.h: include/nccl_device/gin/%.h # $(@D) keeps this correct even when the stem carries a sub-directory
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(@D)
+	install -m 644 $< $@
+
+$(INCDIR)/nccl_device/gin/gdaki/%.h: include/nccl_device/gin/gdaki/%.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(@D)
+	install -m 644 $< $@
+
+$(INCDIR)/nccl_device/gin/proxy/%.h: include/nccl_device/gin/proxy/%.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(@D)
+	install -m 644 $< $@
+
+$(DOCA_INC_INSTALL)/%.h: $(DOCA_HOME)/include/%.h # $(@D) keeps this correct even when the stem carries a sub-directory
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(@D)
+	install -m 644 $< $@
+
+$(DOCA_INC_INSTALL)/common/%.h: $(DOCA_HOME)/include/common/%.h
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(@D)
+	install -m 644 $< $@
+
+$(DOCA_INC_INSTALL)/device/%.cuh: $(DOCA_HOME)/include/device/%.cuh
+	@printf "Grabbing   %-35s > %s\n" $< $@
+	mkdir -p $(@D)
+	install -m 644 $< $@
+
 $(PKGDIR)/%.pc : %.pc
 	@printf "Grabbing   %-35s > %s\n" $< $@
 	mkdir -p $(PKGDIR)
@@ -145,8 +190,18 @@ $(PKGDIR)/%.pc : %.pc
 $(OBJDIR)/%.o : %.cc $(INCTARGETS)
 	@printf "Compiling  %-35s > %s\n" $< $@
 	mkdir -p `dirname $@`
-	$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -c $< -o $@
-	@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -M $< > $(@:%.o=%.d.tmp)
+	$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -I$(DOCA_HOME)/include -c $< -o $@
+	@$(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -I$(INCPLUGIN) -I$(DOCA_HOME)/include -M $< > $(@:%.o=%.d.tmp)
+	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
+	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
+                sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
+	@rm -f $(@:%.o=%.d.tmp)
+
+$(DOCA_OBJDIR)/%.o : $(DOCA_HOME)/src/%.cpp
+	@printf "Compiling  %-35s > %s\n" $< $@
+	mkdir -p `dirname $@`
+	$(CXX) -I$(DOCA_HOME)/src -I$(DOCA_HOME)/include $(CXXFLAGS) -c $< -o $@
+	@$(CXX) -I$(DOCA_HOME)/src -I$(DOCA_HOME)/include $(CXXFLAGS) -M $< > $(@:%.o=%.d.tmp)
 	@sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d)
 	@sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \
                 sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d)
diff --git a/projects/rccl/src/bootstrap.cc b/projects/rccl/src/bootstrap.cc
index 7ed3000d5b3..1e0bab43f37 100644
--- a/projects/rccl/src/bootstrap.cc
+++ b/projects/rccl/src/bootstrap.cc
@@ -227,6 +227,21 @@ static ncclResult_t socketSendRecv(struct ncclSocket* sendSock, void* sendData,
   return ncclSuccess;
 }
 
+static ncclResult_t socketDoubleSendRecv(struct ncclSocketOp ops[4]) {
+  // ops = {send, recv, send, recv}: exchange sizes synchronously, validate them, then hand all four data ops to ncclSocketMultiOp.
+  int peerSize1 = 0, peerSize2 = 0;
+  NCCLCHECK(ncclSocketSendRecv(ops[0].sock, &ops[0].size, sizeof(int), ops[1].sock, &peerSize1, sizeof(int)));
+  NCCLCHECK(ncclSocketSendRecv(ops[2].sock, &ops[2].size, sizeof(int), ops[3].sock, &peerSize2, sizeof(int)));
+  if (peerSize1 < 0 || peerSize2 < 0 || peerSize1 > ops[1].size || peerSize2 > ops[3].size) {  // also reject negative (corrupt) sizes
+    WARN("Message truncated : received %d,%d bytes instead of %d,%d", peerSize1, peerSize2, ops[1].size, ops[3].size);
+    return ncclInternalError;
+  }
+  ops[1].size = peerSize1;  // validated above to lie in [0, ops[1].size], so no clamping is needed
+  ops[3].size = peerSize2;  // likewise for the second receive
+  NCCLCHECK(ncclSocketMultiOp(ops, 4));
+  return ncclSuccess;
+}
+
 union ringConnectInfo {
   union ncclSocketAddress addr;
   char handle[NCCL_NET_HANDLE_MAXSIZE];
@@ -1012,22 +1027,40 @@ static ncclResult_t netRingAllGather(ncclNet_t* net, void* sendComm, void* recvC
   if (recvDataHandle) netDereg(net, recvComm, &recvDataHandle);
   return res;
 }
-static ncclResult_t socketRingAllGather(struct ncclSocket* sendSock, struct ncclSocket* recvSock, int rank, int nranks, char* data, int size) {
+static ncclResult_t socketRingAllGather(struct ncclSocket* nextSock, struct ncclSocket* prevSock, int rank, int nranks, char* data, int size) {
   ncclResult_t res = ncclSuccess;
   uint64_t tFirst = 0, tRest = 0;
   /* Simple ring based AllGather
    * At each step i receive data from (rank-i-1) from prev
    * and send previous step's data from (rank-i) to next
    */
-  TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started");
+  TRACE(NCCL_BOOTSTRAP, "socketRingAllGather started: rank=%d nranks=%d", rank, nranks);
+  int totalSteps = nranks / 2;
+  TRACE(NCCL_BOOTSTRAP, "bidirectional bootstrap: totalSteps=%d", totalSteps);
   BOOTSTRAP_PROF_OPEN(tFirst);
-  for (int i = 0; i < nranks - 1; i++) {
-    size_t rslice = (rank - i - 1 + nranks) % nranks;
-    size_t sslice = (rank - i + nranks) % nranks;
-    void* recv_data = data + rslice * size;
-    void* send_data = data + sslice * size;
-    NCCLCHECKGOTO(socketSendRecv(sendSock, send_data, size, recvSock, recv_data, size), res, exit);
-    if (i == 0) {
+  for (int step = 0; step < totalSteps; step++) {
+    // N ranks require N/2 steps (integer division) for the double ring algorithm. If N is even, the last step requires only a single send/recv
+    bool isFinalUnidirectional = (step == totalSteps - 1) && (nranks % 2 == 0);
+    // Ring0: ring from previous to next
+    int sendSliceRing0 = (rank - step + nranks) % nranks;      // Send this slice to next neighbor
+    int recvSliceRing0 = (rank - step - 1 + nranks) % nranks;  // Receive this slice from prev neighbor
+    // Ring1: ring from next to previous
+    int sendSliceRing1 = (rank + step) % nranks;               // Send this slice to prev neighbor
+    int recvSliceRing1 = (rank + step + 1) % nranks;           // Receive this slice from next neighbor
+    if (isFinalUnidirectional) {
+      // Final unidirectional step, only Ring0 is used
+      NCCLCHECKGOTO(socketSendRecv(nextSock, data + sendSliceRing0 * size, size, prevSock, data + recvSliceRing0 * size, size), res, exit);
+    } else {
+      // Bidirectional step: Ring0 and Ring1 are used simultaneously
+      struct ncclSocketOp ops[4] = {
+        {NCCL_SOCKET_SEND, nextSock, data + sendSliceRing0 * size, size, 0},  // Ring0: send to next
+        {NCCL_SOCKET_RECV, prevSock, data + recvSliceRing0 * size, size, 0},  // Ring0: recv from prev
+        {NCCL_SOCKET_SEND, prevSock, data + sendSliceRing1 * size, size, 0},  // Ring1: send to prev
+        {NCCL_SOCKET_RECV, nextSock, data + recvSliceRing1 * size, size, 0}   // Ring1: recv from next
+      };
+      NCCLCHECKGOTO(socketDoubleSendRecv(ops), res, exit);
+    }
+    if (step == 0) {
       BOOTSTRAP_PROF_CLOSE(tFirst);
       BOOTSTRAP_PROF_OPEN(tRest);
     }
diff --git a/projects/rccl/src/ce_coll.cc b/projects/rccl/src/ce_coll.cc
index 1caf65fcbc9..53e33ca8162 100644
--- a/projects/rccl/src/ce_coll.cc
+++ b/projects/rccl/src/ce_coll.cc
@@ -87,13 +87,13 @@ ncclResult_t ncclCeInit(struct ncclComm* comm) {
 
 ncclResult_t ncclCeFinalize(struct ncclComm* comm) {
   ncclResult_t ret = ncclSuccess;
-  
+
   // Clean up ceInitTaskQueue
   while (!ncclIntruQueueEmpty(&comm->ceInitTaskQueue)) {
     struct ncclCeInitTask* task = ncclIntruQueueDequeue(&comm->ceInitTaskQueue);
     free(task);
   }
-  
+
   // Clean up CE resources
   if (comm->ceColl.baseUCSymReadyPtr != NULL) {
     if (comm->ceColl.ceSyncWin && comm->ceColl.ceSyncWin->vidmem) {
@@ -153,7 +153,7 @@ ncclResult_t ncclPrepMCSync(struct ncclComm* comm, bool isComplete, hipStreamBat
   void* dstPtr = isComplete ? (void*)&completePtrs[comm->rank] : (void*)&readyPtrs[comm->rank];
   size_t offset = (uint8_t*)dstPtr - (uint8_t*)comm->ceColl.ceSyncWin->userPtr;
   NCCLCHECKGOTO(ncclDevrGetLsaTeamPtrMC(comm, comm->ceColl.ceSyncWin, offset, ncclTeamLsa(comm), &mcDstPtr), ret, fail);
-  
+
   // Write our own ready/complete flag to the multi-cast address
   CUDACHECKGOTO(cudaMemcpyAsync(
     mcDstPtr,
@@ -233,7 +233,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
   // Get pointers to the ready and complete synchronization arrays
   uint32_t* readyPtrs = (uint32_t*)comm->ceColl.baseUCSymReadyPtr;
   uint32_t* completePtrs = (uint32_t*)comm->ceColl.baseUCSymComplPtr;
-  
+
   // Allocate enough slots for all possible ops
   size_t batchSize = (comm->nvlsSupport ? NCCL_CE_SYNC_OPS_PER_RANK_MC : NCCL_CE_SYNC_OPS_PER_RANK_UC) * comm->nRanks;
   size_t opIdx = 0;
@@ -262,7 +262,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
       opIdx++;
     }
   }
-  
+
   // Execute all memory operations in a single batch
   CUCHECKGOTO(hipStreamBatchMemOp(stream, opIdx, batchParams, 0), ret, fail);
 
@@ -278,7 +278,7 @@ ncclResult_t ncclMemOpSync(struct ncclComm* comm, cudaStream_t stream) {
 
 ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int nRanks) {
   ncclResult_t ret = ncclSuccess;
-  
+
   params->srcs = nullptr;
   params->dsts = nullptr;
   params->sizes = nullptr;
@@ -289,7 +289,7 @@ ncclResult_t ncclCeInitBatchOpsParams(struct ncclCeBatchOpsParams* params, int n
   params->attrIdxs = nullptr;
   params->numAttrs = 0;
 #endif
-  
+
   NCCLCHECKGOTO(ncclCalloc(&params->srcs, nRanks), ret, fail);
   NCCLCHECKGOTO(ncclCalloc(&params->dsts, nRanks), ret, fail);
   NCCLCHECKGOTO(ncclCalloc(&params->sizes, nRanks), ret, fail);
@@ -326,6 +326,7 @@ ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsPa
 
   int driverVersion;
   NCCLCHECKGOTO(ncclCudaDriverVersion(&driverVersion), ret, fail);
+
   //--------------Graph capture--------------
   // cudaMemcpyBatchAsync is not supported during CUDA graph capture
   if (capturing) {
@@ -430,7 +431,7 @@ ncclResult_t ncclCeLaunchBatchOps(struct ncclComm* comm, struct ncclCeBatchOpsPa
 
 ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
   ncclResult_t ret = ncclSuccess;
-  
+
   // Calculate the size of each rank's data chunk
   const size_t chunkBytes = args->nElts * args->eltSize;
   uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
@@ -481,7 +482,7 @@ ncclResult_t ncclCeAllGather(struct ncclComm* comm, struct ncclCeCollArgs* args,
 
 ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
   ncclResult_t ret = ncclSuccess;
-  
+
   // Calculate the size of data each rank sends to every other rank
   const size_t chunkBytes = args->nElts * args->eltSize;
   uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
@@ -500,7 +501,7 @@ ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args,
     int dstRank = (comm->rank + r) % comm->nRanks;
     uint8_t* srcPtr = mySendBuff + dstRank * chunkBytes;
     uint8_t* dstPtr = myRecvBuff + comm->rank * chunkBytes;
-    
+
     if (dstRank == comm->rank) {
       // Local copy for own data
       batchOpsParams.srcs[batchOpsParams.numOps] = (void*)srcPtr;
@@ -536,7 +537,7 @@ ncclResult_t ncclCeAlltoAll(struct ncclComm* comm, struct ncclCeCollArgs* args,
 
 ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
   ncclResult_t ret = ncclSuccess;
-  
+
   // Calculate the size of data root sends to each rank
   const size_t chunkBytes = args->nElts * args->eltSize;
   uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
@@ -596,7 +597,7 @@ ncclResult_t ncclCeScatter(struct ncclComm* comm, struct ncclCeCollArgs* args, c
 
 ncclResult_t ncclCeGather(struct ncclComm* comm, struct ncclCeCollArgs* args, cudaStream_t stream) {
   ncclResult_t ret = ncclSuccess;
-  
+
   // Calculate the size of data each rank sends to root
   const size_t chunkBytes = args->nElts * args->eltSize;
   uint8_t* mySendBuff = (uint8_t*)args->sendBuff;
diff --git a/projects/rccl/src/debug.cc b/projects/rccl/src/debug.cc
index 11129f010d5..2efefe4a2df 100644
--- a/projects/rccl/src/debug.cc
+++ b/projects/rccl/src/debug.cc
@@ -15,10 +15,12 @@
 #include <sys/syscall.h>
 #include <chrono>
 #include "param.h"
+#include <mutex>
+#include "env.h"
 
 #define NCCL_DEBUG_RESET_TRIGGERED (-2)
 
-int ncclDebugLevel = -1;
+__attribute__((visibility("default"))) int ncclDebugLevel = -1;
 static uint32_t ncclDebugTimestampLevels = 0;     // bitmaps of levels that have timestamps turned on
 static char ncclDebugTimestampFormat[256];        // with space for subseconds
 static int ncclDebugTimestampSubsecondsStart;     // index where the subseconds starts
@@ -28,7 +30,7 @@ static int pid = -1;
 static char hostname[1024];
 thread_local int ncclDebugNoWarn = 0;
 char ncclLastError[1024] = ""; // Global string for the last error in human readable form
-uint64_t ncclDebugMask = 0;
+__attribute__((visibility("default"))) uint64_t ncclDebugMask = 0;
 FILE *ncclDebugFile = stdout;
 static pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
 static std::chrono::steady_clock::time_point ncclEpoch;
@@ -36,9 +38,12 @@ static bool ncclWarnSetDebugInfo = false;
 
 static __thread int tid = -1;
 
+typedef const char* (*ncclGetEnvFunc_t)(const char*);
+
 // This function must be called with ncclDebugLock locked!
 static void ncclDebugInit() {
-  const char* nccl_debug = ncclGetEnv("NCCL_DEBUG");
+  ncclGetEnvFunc_t getEnvFunc = ncclEnvPluginInitialized() ? ncclGetEnv : (ncclGetEnvFunc_t)getenv;
+  const char* nccl_debug = getEnvFunc("NCCL_DEBUG");
   int tempNcclDebugLevel = -1;
   uint64_t tempNcclDebugMask = NCCL_INIT | NCCL_BOOTSTRAP | NCCL_ENV; // Default debug sub-system mask
   if (ncclDebugLevel == NCCL_DEBUG_RESET_TRIGGERED && ncclDebugFile != stdout) {
@@ -46,6 +51,7 @@ static void ncclDebugInit() {
     fclose(ncclDebugFile);
     ncclDebugFile = stdout;
   }
+
   if (nccl_debug == NULL) {
     tempNcclDebugLevel = NCCL_LOG_ERROR;
   } else if (strcasecmp(nccl_debug, "NONE") == 0) {
@@ -66,7 +72,7 @@ static void ncclDebugInit() {
    * This can be a comma separated list such as INIT,COLL
    * or ^INIT,COLL etc
    */
-  const char* ncclDebugSubsysEnv = ncclGetEnv("NCCL_DEBUG_SUBSYS");
+  const char* ncclDebugSubsysEnv = getEnvFunc("NCCL_DEBUG_SUBSYS");
   if (ncclDebugSubsysEnv != NULL) {
     int invert = 0;
     if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
@@ -120,7 +126,7 @@ static void ncclDebugInit() {
     free(ncclDebugSubsys);
   }
 
-  const char* ncclWarnSetDebugInfoEnv = ncclGetEnv("NCCL_WARN_ENABLE_DEBUG_INFO");
+  const char* ncclWarnSetDebugInfoEnv = getEnvFunc("NCCL_WARN_ENABLE_DEBUG_INFO");
   if (ncclWarnSetDebugInfoEnv != NULL && strlen(ncclWarnSetDebugInfoEnv) > 0) {
     int64_t value;
     errno = 0;
@@ -130,7 +136,7 @@ static void ncclDebugInit() {
   }
 
   // Determine which debug levels will have timestamps.
-  const char* timestamps = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_LEVELS");
+  const char* timestamps = getEnvFunc("NCCL_DEBUG_TIMESTAMP_LEVELS");
   if (timestamps == nullptr) {
     ncclDebugTimestampLevels = (1<<NCCL_LOG_WARN);
   } else {
@@ -166,7 +172,7 @@ static void ncclDebugInit() {
   }
 
   // Store a copy of the timestamp format with space for the subseconds, if used.
-  const char* tsFormat = ncclGetEnv("NCCL_DEBUG_TIMESTAMP_FORMAT");
+  const char* tsFormat = getEnvFunc("NCCL_DEBUG_TIMESTAMP_FORMAT");
   if (tsFormat == nullptr) tsFormat = "[%F %T] ";
   ncclDebugTimestampSubsecondsStart = -1;
   // Find where the subseconds are in the format.
@@ -219,7 +225,7 @@ static void ncclDebugInit() {
    * then create the debug file. But don't bother unless the
    * NCCL_DEBUG level is > VERSION
    */
-  const char* ncclDebugFileEnv = ncclGetEnv("NCCL_DEBUG_FILE");
+  const char* ncclDebugFileEnv = getEnvFunc("NCCL_DEBUG_FILE");
   if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
     int c = 0;
     char debugFn[PATH_MAX+1] = "";
@@ -419,4 +425,4 @@ void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
   va_end(vargs);
   pthread_setname_np(thread, threadName);
 #endif
-}
\ No newline at end of file
+}
diff --git a/projects/rccl/src/dev_runtime.cc b/projects/rccl/src/dev_runtime.cc
index 8e44316023c..9c6d658a784 100644
--- a/projects/rccl/src/dev_runtime.cc
+++ b/projects/rccl/src/dev_runtime.cc
@@ -18,8 +18,11 @@ struct ncclDevrMemory {
   int refCount;
   struct ncclDevrMemory* next;
   CUmemGenericAllocationHandle memHandle;
+  void* primaryAddr; // What we hope is the VA of this memory's first mapping.
   size_t size;
   size_t bigOffset; // offset in big VA space
+  void* ginHostWins[NCCL_GIN_MAX_CONTEXTS];
+  ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS];
 };
 
 struct ncclDevrWindowSorted {
@@ -56,12 +59,21 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) {
   struct ncclDevrState* devr = &comm->devrState;
   if (devr->bigSize != 0) return ncclSuccess;
 
-  bool lsaIsLocal = true;
-  for (int i=0; i < comm->localRanks; i++) {
-    lsaIsLocal &= comm->localRankToRank[i] == comm->localRankToRank[0] + i;
+  // LSA needs to be the same size for all ranks, and it needs to represent
+  // a consecutive set of ranks.
+  int lsaSize = 0;
+  int nodeSize = 1;
+  for (int r=1; r < comm->nRanks; r++) {
+    if (comm->rankToNode[r] == comm->rankToNode[r-1]) {
+      nodeSize += 1;
+    } else {
+      lsaSize = gcd(lsaSize, nodeSize);
+      nodeSize = 1;
+    }
   }
-  devr->lsaSelf = lsaIsLocal ? comm->localRank : 0;
-  devr->lsaSize = lsaIsLocal ? comm->localRanks : 1;
+  lsaSize = gcd(lsaSize, nodeSize);
+  devr->lsaSize = lsaSize;
+  devr->lsaSelf = comm->rank % lsaSize;
   devr->lsaRankList = (int*)malloc(devr->lsaSize*sizeof(int));
   for (int i=0; i < devr->lsaSize; i++) {
     devr->lsaRankList[i] = comm->rank + (i - devr->lsaSelf);
@@ -87,7 +99,7 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) {
   }
   devr->bigSize = alignUp(devr->bigSize, size_t(1)<<32);
   INFO(NCCL_INIT, "Symmetric VA size=%ldGB", (long)devr->bigSize>>30);
-  
+
   ncclSpaceConstruct(&devr->bigSpace);
   ncclShadowPoolConstruct(&devr->shadows);
   return ncclSuccess;
@@ -98,6 +110,7 @@ ncclResult_t ncclDevrInitOnce(struct ncclComm* comm) {
 }
 
 static void symTeamDestroyAll(struct ncclComm* comm); // Further down
+static void symMemoryDropRef(struct ncclComm* comm, struct ncclDevrMemory* mem); // Further down
 
 ncclResult_t ncclDevrFinalize(struct ncclComm* comm) {
   struct ncclDevrState* devr = &comm->devrState;
@@ -107,7 +120,7 @@ ncclResult_t ncclDevrFinalize(struct ncclComm* comm) {
     struct ncclDevrRegTask* task = ncclIntruQueueDequeue(&devr->regTaskQueue);
     free(task);
   }
-  
+
   symTeamDestroyAll(comm);
   { // delete windowTable
     cudaStream_t stream;
@@ -124,10 +137,20 @@ ncclResult_t ncclDevrFinalize(struct ncclComm* comm) {
       CUDACHECKIGNORE(cudaStreamDestroy(stream));
     }
   }
-  CUdeviceptr flatAddr = reinterpret_cast<CUdeviceptr>(devr->lsaFlatBase);
+  // Drain leaked windows so every per-peer slice is unmapped before VA free.
+  // Without this, on HIP cuMemAddressFree over a still-mapped range returns
+  // hipErrorInvalidValue, which then cascades into ibv_dealloc_pd EBUSY at teardown.
+  while (devr->memHead != nullptr) {
+    struct ncclDevrMemory* m = devr->memHead;
+    m->refCount = 1; // force drop on the next call
+    symMemoryDropRef(comm, m);
+  }
+  if (devr->lsaFlatBase != nullptr) {
+    CUdeviceptr flatAddr = reinterpret_cast<CUdeviceptr>(devr->lsaFlatBase);
   // Returns error: invalid argument. Already unmapped by symMemoryDropRef
   // CUCHECKIGNORE(cuMemUnmap(flatAddr, devr->lsaSize*devr->bigSize));
-  CUCHECKIGNORE(cuMemAddressFree(flatAddr, devr->lsaSize*devr->bigSize));
+    CUCHECKIGNORE(cuMemAddressFree(flatAddr, devr->lsaSize*devr->bigSize));
+  }
   ncclShadowPoolDestruct(&devr->shadows);
   ncclSpaceDestruct(&devr->bigSpace);
   free(devr->lsaRankList);
@@ -343,11 +366,17 @@ static void symTeamDestroyAll(struct ncclComm* comm) {
   }
 }
 
+static ncclResult_t symMemoryRegisterGin(struct ncclComm* comm, struct ncclDevrMemory* mem) { // Registers mem's [primaryAddr, primaryAddr+size) range with GIN.
+  NCCLCHECK(ncclGinConnectOnce(comm)); // presumably a no-op after the first successful connect — TODO confirm against ncclGinConnectOnce
+  NCCLCHECK(ncclGinRegister(comm, mem->primaryAddr, mem->size, mem->ginHostWins, mem->ginDevWins)); // presumably fills both window arrays — verify
+  return ncclSuccess;
+}
+
 // On success we take caller's reference on memHandle.
 // Due to multicast binds for each pre-exiting team, this function requires
 // caller do a world barrier before returning to user.
 static ncclResult_t symMemoryObtain(
-    struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, size_t size,
+    struct ncclComm* comm, CUmemGenericAllocationHandle memHandle, void* memAddr, size_t size,
     struct ncclDevrMemory** outMem
   ) {
   ncclResult_t ret = ncclSuccess;
@@ -362,12 +391,14 @@ static ncclResult_t symMemoryObtain(
     }
     mem = mem->next;
   }
+
   // New memory.
   mem = (struct ncclDevrMemory*)malloc(sizeof(struct ncclDevrMemory));
   mem->refCount = 0;
   mem->memHandle = memHandle;
+  mem->primaryAddr = memAddr;
   mem->size = size;
- 
+
   // Grab offset in the big space.
   NCCLCHECKGOTO(ncclSpaceAlloc(&devr->bigSpace, devr->bigSize, size, devr->granularity, &bigOffset), ret, fail_mem);
   mem->bigOffset = bigOffset;
@@ -375,10 +406,20 @@ static ncclResult_t symMemoryObtain(
   // Map unicast addresses into flat VA space for lsa team.
   NCCLCHECKGOTO(symMemoryMapLsaTeam(comm, memHandle, size, bigOffset), ret, fail_mem_space);
 
+  // If our caller doesn't have a VA then we'll use the LSA mapping.
+  if (mem->primaryAddr == nullptr) {
+    mem->primaryAddr = (char*)devr->lsaFlatBase + devr->lsaSelf*devr->bigSize + mem->bigOffset;
+  }
+
   // Bind new memory with each existing team.
   for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) {
     NCCLCHECKGOTO(symBindTeamMemory(comm, t, mem), ret, fail_mem_space_teams);
   }
+
+  if (devr->ginEnabled) {
+    NCCLCHECKGOTO(symMemoryRegisterGin(comm, mem), ret, fail_mem_space_teams);
+  }
+
   // Add to list of mems.
   mem->next = devr->memHead;
   devr->memHead = mem;
@@ -405,6 +446,9 @@ static void symMemoryDropRef(
   ) {
   if (mem != nullptr && 0 == --mem->refCount) {
     struct ncclDevrState* devr = &comm->devrState;
+    if (devr->ginEnabled) {
+      ncclGinDeregister(comm, mem->ginHostWins);
+    }
     for (struct ncclDevrTeam* t = devr->teamHead; t != nullptr; t = t->next) {
       symUnbindTeamMemory(comm, t, mem);
     }
@@ -470,18 +514,22 @@ static ncclResult_t symWindowCreate(
   winDevHost->lsaRank = devr->lsaSelf;
   winDevHost->worldRank = comm->rank;
   winDevHost->winHost = (void*)win;
+  winDevHost->ginOffset4K = memOffset>>12;
+  for (int i=0; i < NCCL_GIN_MAX_CONTEXTS; i++) {
+    winDevHost->ginWins[i] = mem->ginDevWins[i];
+  }
   CUDACHECK(cudaMemcpyAsync(winDev, winDevHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream));
 
   NCCLCHECK(symWindowTableInitOnce(comm, stream)); // ensure devr->windowTable exists
   struct ncclDevCommWindowTable* tableDev = devr->windowTable;
-  struct ncclDevCommWindowTable* tableHost;
-  NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost));
   while (true) {
+    struct ncclDevCommWindowTable* tableHost;
+    NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost));
     int i = 0;
     while (i < 32 && tableHost->entries[i].window != nullptr) i += 1;
     if (i < 32) {
       tableHost->entries[i].base = userAddr;
-      tableHost->entries[i].size = userAddr + userSize;
+      tableHost->entries[i].size = userSize;
       tableHost->entries[i].window = winDev;
       CUDACHECK(cudaMemcpyAsync(&tableDev->entries[i], &tableHost->entries[i], sizeof(tableHost->entries[i]), cudaMemcpyHostToDevice, stream));
       break;
@@ -491,7 +539,6 @@ static ncclResult_t symWindowCreate(
       CUDACHECK(cudaMemcpyAsync(&tableDev->next, &tableHost->next, sizeof(tableHost->next), cudaMemcpyHostToDevice, stream));
     }
     tableDev = tableHost->next;
-    NCCLCHECK(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost));
   }
 
   { // insert into winSorted[]
@@ -520,9 +567,9 @@ static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclWindow_vi
   symMemoryDropRef(comm, winHost->memory);
 
   { struct ncclDevCommWindowTable* tableDev = devr->windowTable;
-    struct ncclDevCommWindowTable* tableHost;
-    NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost), ret, remove_winSorted);
     while (true) {
+      struct ncclDevCommWindowTable* tableHost;
+      NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableDev, &tableHost), ret, remove_winSorted);
       int i = 0;
       while (i < 32 && tableHost->entries[i].window != winDev) i += 1;
       if (i < 32) {
@@ -532,7 +579,6 @@ static ncclResult_t symWindowDestroy(struct ncclComm* comm, struct ncclWindow_vi
       }
       if (tableHost->next == nullptr) break; // Error didn't find window in table
       tableDev = tableHost->next;
-      NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, tableHost->next, &tableHost), ret, remove_winSorted);
     }
   }
   NCCLCHECKGOTO(ncclShadowPoolFree(&devr->shadows, winDev, stream), ret, remove_winSorted);
@@ -588,7 +634,7 @@ ncclResult_t ncclDevrWindowRegisterInGroup(
   CUCHECKGOTO(cuMemRetainAllocationHandle(&memHandle, reinterpret_cast<void*>(memAddr)), ret, fail_locReg);
 
   // Trade cumem handle for ncclDevrMemory*
-  NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, memSize, &mem), ret, fail_locReg_memHandle);
+  NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, (void*)memAddr, memSize, &mem), ret, fail_locReg_memHandle);
   memHandle = 0x0; // symMemoryObtain took our reference
 
   CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail);
@@ -597,7 +643,7 @@ ncclResult_t ncclDevrWindowRegisterInGroup(
       comm, mem, memOffset, userPtr, userSize, winFlags, localRegHandle, outWinDev, nullptr, stream
     ), ret, fail_locReg_memHandle_mem_stream);
   mem = nullptr; // symWindowCreate took our reference
-  
+
   CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail_locReg_memHandle_mem_stream_win);
 
   // symWindowCreate needs barrier.
@@ -689,15 +735,35 @@ ncclResult_t ncclDevrCommCreateInternal(
   struct ncclDevrState* devr = &comm->devrState;
   struct ncclTeam world = ncclTeamWorld(comm);
   struct ncclTeam lsa = ncclTeamInnerFactor(world, devr->lsaSize);
+  bool ginActivated = false;
   struct ncclDevrTeam* tmLsa;
   size_t bufSizeTotal;
+  int nGinContexts = 0;
+  int ginSignalTotal = 0, ginCounterTotal = 0;
   struct ncclDevResourceRequirements* resReqsHead;
   struct ncclDevResourceRequirements lsaBarReq;
   cudaStream_t stream = nullptr;
+  struct ncclDevResourceRequirements railGinBarrierReq;
   CUmemGenericAllocationHandle memHandle = 0x0;
   struct ncclDevrMemory* mem = nullptr;
   struct ncclDevrWindow* win = nullptr;
   struct ncclWindow_vidmem* winHost = nullptr;
+  size_t ginSignalShadowsOffset = 0;
+
+  if (comm->nNodes > 1 || reqs->ginForceEnable || reqs->ginCounterCount != 0 || reqs->ginSignalCount != 0) {
+    ginActivated = !devr->ginEnabled;
+    devr->ginEnabled = true;
+  }
+
+  if (ginActivated) {
+    NCCLCHECKGOTO(ncclGinConnectOnce(comm), ret, fail);
+    // Register all preexisting memories with GIN. Update the windows later when
+    // we have a stream.
+    for (struct ncclDevrMemory* mem = devr->memHead; mem != nullptr; mem = mem->next) {
+      NCCLCHECKGOTO(symMemoryRegisterGin(comm, mem), ret, fail);
+    }
+  }
+  if (devr->ginEnabled) nGinContexts = comm->sharedRes->ginState.ginCommCount;
 
   memset(outDevComm, 0, sizeof(*outDevComm));
   outDevComm->rank = comm->rank;
@@ -723,25 +789,52 @@ ncclResult_t ncclDevrCommCreateInternal(
 
   resReqsHead = reqs->resourceRequirementsList;
 
-  ncclLsaBarrierCreateRequirement(lsa, reqs->lsaBarrierCount, &outDevComm->lsaBarrier, &lsaBarReq);
+  ncclLsaBarrierCreateRequirement(lsa, std::max(reqs->barrierCount, reqs->lsaBarrierCount), &outDevComm->lsaBarrier, &lsaBarReq);
   lsaBarReq.next = resReqsHead;
   resReqsHead = &lsaBarReq;
 
+  ncclGinBarrierCreateRequirement(comm, ncclTeamRail(comm), std::max(reqs->barrierCount, reqs->railGinBarrierCount), &outDevComm->railGinBarrier, &railGinBarrierReq);
+  railGinBarrierReq.next = resReqsHead;
+  resReqsHead = &railGinBarrierReq;
+
   { struct ncclDevResourceRequirements* rr = resReqsHead;
     bufSizeTotal = 0;
+    ginSignalTotal = reqs->ginSignalCount; // directly requested signals are numbered first; each requirement follows
+    ginCounterTotal = reqs->ginCounterCount; // likewise for counters
     while (rr != nullptr) {
       bufSizeTotal = alignUp(bufSizeTotal, std::max<size_t>(128, rr->bufferAlign));
       if (rr->outBufferHandle != nullptr) *rr->outBufferHandle = bufSizeTotal/128;
+      if (rr->outGinSignalStart != nullptr) *rr->outGinSignalStart = ginSignalTotal;
+      if (rr->outGinCounterStart != nullptr) *rr->outGinCounterStart = ginCounterTotal;
       bufSizeTotal += rr->bufferSize;
+      ginSignalTotal += rr->ginSignalCount;
+      ginCounterTotal += rr->ginCounterCount;
       rr = rr->next;
     }
+    bufSizeTotal = alignUp(bufSizeTotal, 128);
+    ginSignalShadowsOffset = bufSizeTotal; // signal shadows start at this 128-byte aligned offset
+    bufSizeTotal += (size_t)nGinContexts*ginSignalTotal*sizeof(uint64_t); // include signal shadows; widen first so the int*int product cannot overflow
     bufSizeTotal = alignUp(bufSizeTotal, devr->granularity);
   }
 
   CUDACHECKGOTO(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking), ret, fail);
 
-  NCCLCHECKGOTO(symWindowTableInitOnce(comm, stream), ret, fail); // ensure devr->windowTable exists
-  outDevComm->windowTable = comm->devrState.windowTable;
+  if (ginActivated) {
+    // Now update the GIN handles in all existing windows. Registration of memories happened above.
+    for (int i=0; i < devr->winSortedCount; i++) {
+      struct ncclDevrWindow* win = devr->winSorted[i].win;
+      struct ncclWindow_vidmem* winHost;
+      NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, win->vidmem, &winHost), ret, fail_stream);
+      winHost->ginOffset4K = (win->bigOffset - win->memory->bigOffset)>>12;
+      for (int i=0; i < NCCL_GIN_MAX_CONTEXTS; i++) {
+        winHost->ginWins[i] = win->memory->ginDevWins[i];
+      }
+      CUDACHECKGOTO(cudaMemcpyAsync(win->vidmem, winHost, sizeof(struct ncclWindow_vidmem), cudaMemcpyHostToDevice, stream), ret, fail_stream);
+    }
+  }
+
+  NCCLCHECKGOTO(symWindowTableInitOnce(comm, stream), ret, fail_stream); // ensure devr->windowTable exists
+  outDevComm->windowTable = devr->windowTable;
 
   if (bufSizeTotal == 0) {
     outDevComm->resourceWindow = nullptr;
@@ -755,45 +848,65 @@ ncclResult_t ncclDevrCommCreateInternal(
 #endif
     memProp.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
     memProp.requestedHandleType = ncclCuMemHandleType;
+    // We have to assume that if GIN is possible it might be requested in the future,
+    // even on single node.
+    memProp.allocFlags.gpuDirectRDMACapable = comm->sharedRes->ginState.ncclGin != nullptr ? 1 : 0;
     memProp.location.id = comm->cudaDev;
 
-    CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail);
+    CUCHECKGOTO(cuMemCreate(&memHandle, bufSizeTotal, &memProp, 0), ret, fail_stream);
 
-    NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, bufSizeTotal, &mem), ret, fail);
+    NCCLCHECKGOTO(symMemoryObtain(comm, memHandle, NULL, bufSizeTotal, &mem), ret, fail_stream_mem);
     memHandle = 0x0; // Reference given to symMemoryObtain
 
     NCCLCHECKGOTO(symWindowCreate( // Requires world barrier afterward.
       comm, mem, /*memOffset=*/0, nullptr, bufSizeTotal, /*winFlags=*/0,
       /*localReg=*/nullptr, &outDevComm->resourceWindow, &win,
-      stream), ret, fail);
+      stream), ret, fail_stream_mem);
     mem = nullptr; // Reference given to symWindowCreate
-    NCCLCHECKGOTO(ncclShadowPoolToHost(&comm->devrState.shadows, win->vidmem, &winHost), ret, fail);
+    NCCLCHECKGOTO(ncclShadowPoolToHost(&devr->shadows, win->vidmem, &winHost), ret, fail_stream_mem_win);
     outDevComm->resourceWindow_inlined = *winHost;
+    outDevComm->ginSignalShadows = (uint64_t*)add4G((char*)winHost->lsaFlatBase + ginSignalShadowsOffset, winHost->lsaRank*winHost->stride4G);
 
-    CUDACHECKGOTO(cudaMemsetAsync(win->userPtr, 0, bufSizeTotal, stream), ret, fail);
+    CUDACHECKGOTO(cudaMemsetAsync(win->userPtr, 0, bufSizeTotal, stream), ret, fail_stream_mem_win);
   }
 
-  CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail);
+  if (devr->ginEnabled) {
+    outDevComm->ginContextCount = nGinContexts;
+    outDevComm->ginSignalCount = ginSignalTotal;
+    outDevComm->ginCounterCount = ginCounterTotal;
+    NCCLCHECKGOTO(ncclGinAllocSignalsCounters(comm,
+      ginSignalTotal, &outDevComm->ginSignalBase,
+      ginCounterTotal, &outDevComm->ginCounterBase
+    ), ret, fail_stream_mem_win);
+
+    for (int ctx=0; ctx < nGinContexts; ctx++) {
+      outDevComm->ginTypes[ctx] = (int)comm->sharedRes->ginState.ginDevHandles[ctx]->netDeviceType;
+      outDevComm->ginHandles[ctx] = comm->sharedRes->ginState.ginDevHandles[ctx]->handle;
+    }
+  }
 
-  NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail);
+  CUDACHECKGOTO(cudaStreamSynchronize(stream), ret, fail_stream_mem_win_signals);
 
-  CUDACHECKIGNORE(cudaStreamDestroy(stream));
+  NCCLCHECKGOTO(bootstrapBarrier(comm->bootstrap, comm->rank, comm->nRanks, 0xbeef), ret, fail_stream_mem_win_signals);
+  CUDACHECKGOTO(cudaStreamDestroy(stream), ret, fail_stream_mem_win_signals);
   return ret;
 
-fail:
-  if (win != nullptr) {
-    symWindowDestroy(comm, win->vidmem, stream);
-    CUDACHECKIGNORE(cudaStreamSynchronize(stream));
-  }
-  if (mem != nullptr) {
-    symMemoryDropRef(comm, mem);
-  }
-  if (memHandle != 0x0) {
-    CUCHECKIGNORE(cuMemRelease(memHandle));
-  }
-  if (stream != nullptr) {
-    CUDACHECKIGNORE(cudaStreamDestroy(stream));
+fail_stream_mem_win_signals:
+  if (devr->ginEnabled) {
+    ncclGinFreeSignalsCounters(comm,
+      outDevComm->ginSignalBase, outDevComm->ginSignalCount,
+      outDevComm->ginCounterBase, outDevComm->ginCounterCount
+    );
   }
+fail_stream_mem_win:
+  if (win != nullptr) symWindowDestroy(comm, win->vidmem, stream); // win stays null when no resource window was created
+  CUDACHECKIGNORE(cudaStreamSynchronize(stream));
+fail_stream_mem:
+  if (memHandle != 0x0) { CUCHECKIGNORE(cuMemRelease(memHandle)); }
+  if (mem != nullptr) symMemoryDropRef(comm, mem); // mem is null if symMemoryObtain failed or symWindowCreate took the reference
+fail_stream:
+  CUDACHECKIGNORE(cudaStreamDestroy(stream));
+fail:
   return ret;
 }
 
@@ -919,7 +1032,13 @@ NCCL_API(ncclResult_t, ncclDevCommDestroy, ncclComm_t comm, ncclDevComm_t const*
 ncclResult_t ncclDevCommDestroy(
     struct ncclComm* comm, struct ncclDevComm const* devComm
   ) {
-  //struct ncclDevrState* devr = &comm->devrState;
+  struct ncclDevrState* devr = &comm->devrState;
+  if (devr->ginEnabled) {
+    ncclGinFreeSignalsCounters(comm,
+      devComm->ginSignalBase, devComm->ginSignalCount,
+      devComm->ginCounterBase, devComm->ginCounterCount
+    );
+  }
   if (devComm->resourceWindow != nullptr) {
     NCCLCHECK(ncclCommWindowDeregister(comm, devComm->resourceWindow));
   }
@@ -934,7 +1053,7 @@ ncclResult_t ncclDevrGetLsaRankPtr(struct ncclComm* comm, struct ncclDevrWindow*
   }
 
   struct ncclDevrState* devr = &comm->devrState;
-  
+
   // Validate lsaRank is within bounds
   if (lsaRank < 0 || lsaRank >= devr->lsaSize) {
     return ncclInvalidArgument;
@@ -963,7 +1082,7 @@ ncclResult_t ncclDevrGetLsaTeamPtrMC(struct ncclComm* comm, struct ncclDevrWindo
   bool multimem = true;
   struct ncclDevrTeam* tm;
   NCCLCHECK(symTeamObtain(comm, lsaTeam, multimem, &tm));
-    
+
   // Return the base multicast address for this team with offset
   *outPtr = (void*)((uintptr_t)tm->mcBasePtr + winHost->bigOffset + offset);
   return ncclSuccess;
diff --git a/projects/rccl/src/device/CMakeLists.txt b/projects/rccl/src/device/CMakeLists.txt
index 98447428df0..acaa9b65ddb 100644
--- a/projects/rccl/src/device/CMakeLists.txt
+++ b/projects/rccl/src/device/CMakeLists.txt
@@ -50,9 +50,9 @@ set_target_properties(nccl_device PROPERTIES
 # Set include directories for the target
 target_include_directories(nccl_device PUBLIC
     ${CMAKE_CURRENT_SOURCE_DIR}
+    ${CMAKE_BINARY_DIR}/include
     ${CMAKE_SOURCE_DIR}/src/include
     ${CMAKE_SOURCE_DIR}/src/include/plugin
-    ${CMAKE_BINARY_DIR}/include
     ${CUDAToolkit_INCLUDE_DIRS}
     ${CUDAToolkit_INCLUDE_DIRS}/cccl
 )
diff --git a/projects/rccl/src/device/Makefile b/projects/rccl/src/device/Makefile
index fd8f2759d4c..cf0fa0637f1 100644
--- a/projects/rccl/src/device/Makefile
+++ b/projects/rccl/src/device/Makefile
@@ -23,12 +23,13 @@ INCFLAGS  = -I. -I.. -I$(BUILDDIR)/include -I../include -I../include/plugin
 NVCUFLAGS += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
 CXXFLAGS  += $(INCFLAGS)
 
-NVCUFLAGS_SYM := -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all
+NVCUFLAGS_SYM += -ccbin $(CXX) $(CXXSTD) --expt-extended-lambda -Xptxas -maxrregcount=128 -Xfatbin -compress-all
 NVCUFLAGS_SYM += $(INCFLAGS) --compiler-options "-fPIC -fvisibility=hidden"
 
 SAY = @bash -c 'path="$$2"; [[ "$$(realpath "$$2")" =~ ^$(subst .,\.,$(abspath $(NCCLDIR)))/(.*)$$ ]] && path="$${BASH_REMATCH[1]}"; printf "%-15s %s\n" "$$1" "$$path"' SAY
 
 COMPILE.cu = $(NVCC) $(NVCUFLAGS) -dc $2 -o $1
+COMPILE.kernel = $(NVCC) $(NVCUFLAGS) -dw $2 -o $1
 COMPILE.cc = $(CXX) $(CXXFLAGS) -c $2 -o $1
 define COMPILE
 @$(SAY) "Compiling" $2;\
diff --git a/projects/rccl/src/device/network/unpack/unpack.h b/projects/rccl/src/device/network/unpack/unpack.h
index 44098977d35..2489437cd36 100644
--- a/projects/rccl/src/device/network/unpack/unpack.h
+++ b/projects/rccl/src/device/network/unpack/unpack.h
@@ -248,7 +248,7 @@ inline __device__ void ncclNetDeviceUnpackInner(
 
     for (int x = 0; x < iter_meta_cnt; x++) {
       int meta_idx = x + w * PPW;
-      
+
       // load page offs
       loadShmem128(shmemCvtPtr((uint64_t*) (s_meta + meta_idx)), meta.r64[0], meta.r64[1]);
 
diff --git a/projects/rccl/src/device/reduce_kernel.h b/projects/rccl/src/device/reduce_kernel.h
index 593f868eae1..d98fa356376 100755
--- a/projects/rccl/src/device/reduce_kernel.h
+++ b/projects/rccl/src/device/reduce_kernel.h
@@ -841,7 +841,7 @@ struct FuncSumPostDiv {
   using UintType = typename std::conditional<sizeof(T)==8, uint64_t, uint32_t>::type;
   uint32_t divisor:31, isSigned:1;
   UintType recip;
-  
+
   __device__ __forceinline__ FuncSumPostDiv(uint64_t opArg=0) {
     isSigned = opArg & 1;
     divisor = opArg >> 1;
diff --git a/projects/rccl/src/device/symmetric/all_gather.cuh b/projects/rccl/src/device/symmetric/all_gather.cuh
index f57c17cb2e5..be1f0e7face 100644
--- a/projects/rccl/src/device/symmetric/all_gather.cuh
+++ b/projects/rccl/src/device/symmetric/all_gather.cuh
@@ -356,7 +356,7 @@ static __device__ void ncclSymkRun_AllGather_LL_impl(ncclSymkDevWorkArgs const*
         char* blockInput = input.localPtr();
         char* blockOutput = output.localPtr();
 
-        uint32_t lowBits = nElts;
+        uint32_t lowBits = nAllElts;
         lowBits |= (uintptr_t)blockInput;
         lowBits |= (uintptr_t)blockOutput;
         if (__builtin_expect(lowBits%8 == 0, true)) {
diff --git a/projects/rccl/src/device/symmetric/generate.py b/projects/rccl/src/device/symmetric/generate.py
index 45958d5908a..594b403b93f 100755
--- a/projects/rccl/src/device/symmetric/generate.py
+++ b/projects/rccl/src/device/symmetric/generate.py
@@ -222,12 +222,20 @@ def partition(vals, keyfn):
   emitln(f, '')
 
   emitln(f, 'extern int const ncclSymkKernelCount = %d;' % len(list(enumerate_kernels())))
-  emitln(f, 'extern void* const ncclSymkKernelList[] = {')
+  emitln(f, 'void* ncclSymkKernelList[] = {')
   for k in enumerate_kernels():
     emitln(f, '(void*){cname},'.format(cname=kernel_cname(k)))
   emitln(f, 'nullptr};')
   emitln(f, '')
 
+  emitln(f, 'int ncclSymkKernelRequirements[] = {')
+  for index,k in enumerate(enumerate_kernels()):
+    cudart, _, _ = required_cuda(k)
+    sym = kernel_cname(k)
+    emitln(f, '  %7d, /*%4d %s*/' % (cudart or 0, index, sym));
+  emitln(f, '};')
+  emitln(f, '')
+
   emitln(f, 'void* ncclSymkGetKernelPtr(ncclSymkKernelId id, int red, ncclDataType_t ty) {')
   indents += 1
   emitln(f, 'switch (id) {')
diff --git a/projects/rccl/src/device/symmetric/primitives.cuh b/projects/rccl/src/device/symmetric/primitives.cuh
index 343f354e588..6d0c3352fd8 100644
--- a/projects/rccl/src/device/symmetric/primitives.cuh
+++ b/projects/rccl/src/device/symmetric/primitives.cuh
@@ -60,13 +60,14 @@ struct ncclSymkArgsHandler {
       workLo++;
       fracLo = 0;
     }
-    struct ncclSymkDevWork const& dw = devWork[workLo];
-    indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell;
+    struct ncclSymkDevWork const& dwLo = devWork[workLo];
+    indexLo = ((fracLo * divUp(dwLo.nElts, EltPerCell)) >> 16) * EltPerCell;
 
     // Where the work ends
     workHi = channelWorkRange[block].workHi;
     fracHi = channelWorkRange[block].fracHi + 1;
-    indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts);
+    struct ncclSymkDevWork const& dwHi = devWork[workHi];
+    indexHi = min(((fracHi * divUp(dwHi.nElts, EltPerCell)) >> 16) * EltPerCell, dwHi.nElts);
   }
 
   template<typename T>
@@ -82,7 +83,7 @@ struct ncclSymkArgsHandler {
     lastBlock = dw.sChannelId+dw.nChannels-1;
 
     // Where the work begins
-    fracLo = (dw.sChannelId==0) ? 0 : ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF);
+    fracLo = (dw.sChannelId>0 && channelWorkRange[dw.sChannelId-1].workHi == w) ? ((channelWorkRange[dw.sChannelId-1].fracHi + 1) & 0xFFFF) : 0;
     indexLo = ((fracLo * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell;
     fracHi = (channelWorkRange[lastBlock].workHi == w) ? channelWorkRange[lastBlock].fracHi + 1 : 0x10000;
     indexHi = min(((fracHi * divUp(dw.nElts, EltPerCell)) >> 16) * EltPerCell, dw.nElts);
@@ -95,16 +96,16 @@ struct ncclSymkArgsHandler {
 
       getWorkRange<T>(blockIdx.x, workLo, indexLo, workHi, indexHi);
 
-      size_t currentIndexLo = indexLo;
       #pragma unroll 1
       for (int w = workLo; w <= workHi; w++) {
         struct ncclSymkDevWork const& dw = devWork[w];
         size_t const& nAllElts = dw.nElts;
-        size_t currentIndexHi;
+        size_t currentIndexLo, currentIndexHi;
         int block, nBlocks;
         if (blockIdx.x >= dw.sChannelId && blockIdx.x < dw.sChannelId + dw.nChannels) {
           getWorkRangeFused<T>(blockIdx.x, w, block, nBlocks, currentIndexLo, currentIndexHi);
         } else {
+          currentIndexLo = (w > workLo) ? 0 : indexLo;
           currentIndexHi = (w < workHi) ? nAllElts : indexHi;
           block = 0;
           nBlocks = 1;
diff --git a/projects/rccl/src/device/symmetric/reduce_scatter.cuh b/projects/rccl/src/device/symmetric/reduce_scatter.cuh
index 9c149c8f225..c9ce8f56d4b 100644
--- a/projects/rccl/src/device/symmetric/reduce_scatter.cuh
+++ b/projects/rccl/src/device/symmetric/reduce_scatter.cuh
@@ -245,7 +245,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LD(ncclSymkDevWorkArgs
                           threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
         int tn = nBlocks*blockDim.x;
 
-        reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nElts, output, nElts);
+        reduce(handler, tn, t, nBlocks, waitNeeded, bar, red, input + rank*nAllElts, output, nElts);
 
         waitNeeded = false;
       }
@@ -327,7 +327,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LDMC(ncclSymkDevWorkAr
                           threadIdx.x/WARP_SIZE, blockDim.x/WARP_SIZE);
         int tn = nBlocks*blockDim.x;
 
-        reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nElts, output.localPtr(), nElts);
+        reduceMultimem(tn, t, red, input.multimemPtr(multimem) + rank*nAllElts, output.localPtr(), nElts);
       }
     );
 
@@ -406,7 +406,7 @@ __device__ __forceinline__ void ncclSymkRun_ReduceScatter_LL(ncclSymkDevWorkArgs
         T* input = (T*)inputPtr.localPtr();
         T* output = (T*)outputPtr.localPtr();
 
-        uint32_t lowBits = nElts*sizeof(T);
+        uint32_t lowBits = nAllElts*sizeof(T);
         lowBits |= (uintptr_t)input;
         lowBits |= (uintptr_t)output;
         if (__builtin_expect(lowBits%8 == 0, true)) {
diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc
index d10e0ba4696..b81c489e727 100644
--- a/projects/rccl/src/enqueue.cc
+++ b/projects/rccl/src/enqueue.cc
@@ -120,7 +120,10 @@ ncclResult_t ncclInitKernelsForDevice(int cudaArch, int maxSharedMem, size_t* ma
       if (fn == nullptr) continue;
 
       cudaError_t errcode = cudaFuncGetAttributes(&attr, fn);
-      if (errcode != cudaSuccess) continue; // Silently ignore failures
+      if (errcode != cudaSuccess) {
+		  cudaGetLastError(); // Drain error code
+		  continue; // Silently ignore failures
+	  }
       if (maxStackSize) {
         if (attr.localSizeBytes > *maxStackSize) *maxStackSize = attr.localSizeBytes;
       }
@@ -207,6 +210,9 @@ static void addWorkBatchToPlan(
       newBatch |= (comm->nNodes > 2 && batchP2P)? (chan->wipBatch.nP2ps == NCCL_MAX_DEV_WORK_P2P_PER_BATCH) : (chan->wipBatch.nP2ps == 1);
       for (int i=0; i < chan->wipBatch.nP2ps; i++) {
         newBatch |= p2pRound == chan->wipBatch.p2pRounds[i];
+        // Make sure we only aggregate p2p operations within the same p2p round epoch (one epoch is NCCL_MAX_DEV_WORK_P2P_PER_BATCH ops).
+        // This enforces uniform batching across ranks in the communicator and prevents hangs.
+        newBatch |= (p2pRound / NCCL_MAX_DEV_WORK_P2P_PER_BATCH) != (chan->wipBatch.p2pRounds[i] / NCCL_MAX_DEV_WORK_P2P_PER_BATCH);
       }
     }
   }
@@ -3177,16 +3183,21 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
 }
 
 ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
+  // Early-out on invalid or revoked communicator
+  ncclResult_t ret = CommCheck(info->comm, info->opName, "comm");
+  if (ret != ncclSuccess) return ncclGroupErrCheck(ret);
+  if (info->comm->revokedFlag) {
+    WARN("%s: communicator was revoked", info->opName);
+    return ncclGroupErrCheck(ncclInvalidUsage);
+  }
   // Profiler - If a group API event has already started, update the profilerGroupDepth so that the depth
   // updates correctly for implicit ncclGroupStartInternal and ncclGroupEndInternal calls
   if (ncclProfilerApiState.profilerGroupDepth > 0) {
     ncclProfilerApiState.profilerGroupDepth++;
   }
   NCCLCHECK(ncclGroupStartInternal());
-  ncclResult_t ret = ncclSuccess;
+  ret = ncclSuccess;
   int devOld = -1;
-
-  NCCLCHECKGOTO(CommCheck(info->comm, info->opName, "comm"), ret, fail);
   // Check whether communicator is ready to communicate
   NCCLCHECKGOTO(ncclCommEnsureReady(info->comm), ret, fail);
 
diff --git a/projects/rccl/src/gin/CMakeLists.txt b/projects/rccl/src/gin/CMakeLists.txt
new file mode 100644
index 00000000000..e20d7ddf38d
--- /dev/null
+++ b/projects/rccl/src/gin/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Gin sources
+set(GIN_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/gin_host.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/gin_host_proxy.cc
+)
+
+# Add gin sources to parent scope
+set(GIN_SOURCES ${GIN_SOURCES} PARENT_SCOPE)
diff --git a/projects/rccl/src/gin/gin_host.cc b/projects/rccl/src/gin/gin_host.cc
new file mode 100644
index 00000000000..b42f88fdeb0
--- /dev/null
+++ b/projects/rccl/src/gin/gin_host.cc
@@ -0,0 +1,277 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "comm.h"
+#include "param.h"
+#include "graph.h"
+#include "transport.h"
+#include "register_inline.h"
+#include "gin/gin_host.h"
+#include "gin/gin_host_proxy.h"
+
+NCCL_PARAM(GinEnable, "GIN_ENABLE", 1);
+NCCL_PARAM(GinType, "GIN_TYPE", -1);
+NCCL_PARAM(GinSignalPoolSize, "GIN_SIGNAL_POOL_SIZE", 64 << 10);
+NCCL_PARAM(GinCounterPoolSize, "GIN_COUNTER_POOL_SIZE", 64 << 10);
+
+void* ncclGinProgress(void* ginState_) {
+  struct ncclGinState* ginState = (struct ncclGinState*)ginState_;
+  while (1) {
+    pthread_mutex_lock(&ginState->threadLock);
+    if (ginState->ginProgress == 1) {
+      pthread_mutex_unlock(&ginState->threadLock);
+      for (int n=0; n<ginState->ginCommCount; n++) {
+        ncclResult_t ret;
+        if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) {
+          ret = ncclGinProxyProgress(ginState->ncclGin, ginState->ginCtx[n]);
+        } else {
+          ret = ginState->ncclGin->ginProgress(ginState->ginComms[n]);
+        }
+        if (ret != ncclSuccess) {
+          __atomic_store_n(&ginState->asyncResult, ret, __ATOMIC_RELEASE);
+          INFO(NCCL_ALL,"%s:%d -> %d [GIN Progress Thread]", __FILE__, __LINE__, ret);
+          ginState->ginProgress = -2;
+          return NULL;
+        }
+      }
+      sched_yield();
+    } else if (ginState->ginProgress == -1) {
+      pthread_mutex_unlock(&ginState->threadLock);
+      return NULL;
+    } else if (ginState->ginProgress == 0) {
+      pthread_cond_wait(&ginState->threadCond, &ginState->threadLock);
+      pthread_mutex_unlock(&ginState->threadLock);
+    } else {
+      pthread_mutex_unlock(&ginState->threadLock);
+      INFO(NCCL_ALL,"%s:%d -> [GIN Progress Thread] state unknown %d", __FILE__, __LINE__, ginState->ginProgress);
+      ginState->ginProgress = -2;
+      return NULL;
+    }
+  }
+}
+
+NCCL_PARAM(GinNcontexts, "GIN_NCONTEXTS", NCCL_GIN_MAX_CONTEXTS);
+
+ncclResult_t ncclGinConnectOnce(struct ncclComm* comm) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclGinState* ginState = &comm->sharedRes->ginState;
+  if (ginState->ncclGin == NULL) {
+    WARN("GIN not supported.");
+    return ncclInvalidUsage;
+  }
+  if (ncclParamGinEnable() == 0) {
+    WARN("GIN is disabled.");
+    return ncclInternalError;
+  }
+  if (ginState->connected) return ncclSuccess;
+
+  NCCLCHECK(ginState->ncclGin->init(&ginState->ginInstance, comm->commHash, ncclDebugLog));
+
+  int ndev = 0;
+  NCCLCHECK(ginState->ncclGin->devices(&ndev));
+  if (ndev <= 0) {
+    WARN("No GIN-capable devices found.");
+    return ncclInternalError;
+  }
+
+  ncclNetProperties_t props;
+  NCCLCHECK(ginState->ncclGin->getProperties(0, &props));
+  ginState->ginType = props.netDeviceType;
+  if ((ncclParamGinType() != -1) && (ginState->ginType != ncclParamGinType())) {
+    WARN("GIN-capable device type mismatch.");
+    return ncclInternalError;
+  }
+
+  int nLocalNets;
+  int64_t localNets[NCCL_TOPO_MAX_NODES];
+  NCCLCHECK(ncclTopoGetLocalNets(comm->topo, comm->rank, localNets, &nLocalNets));
+
+  void** handles = NULL;
+  char* allHandles = NULL;
+
+  ginState->ginCommCount = std::min<int>(NCCL_GIN_MAX_CONTEXTS, ncclParamGinNcontexts());
+
+  NCCLCHECKGOTO(ncclCalloc(&allHandles, (size_t)comm->nRanks * NCCL_NET_HANDLE_MAXSIZE), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&handles, comm->nRanks), ret, fail);
+  for (int r = 0; r < comm->nRanks; r++) handles[r] = allHandles + r * NCCL_NET_HANDLE_MAXSIZE;
+
+  ginState->signalSpaceSize = ncclParamGinSignalPoolSize();
+  if (ginState->signalSpaceSize < 0 || (1 << 30) <= ginState->signalSpaceSize) {
+    WARN("NCCL_GIN_SIGNAL_POOL_SIZE has invalid value.");
+    ginState->signalSpaceSize = 64 << 10;
+  }
+  ginState->counterSpaceSize = ncclParamGinCounterPoolSize();
+  if (ginState->counterSpaceSize < 0 || (1 << 30) <= ginState->counterSpaceSize) {
+    WARN("NCCL_GIN_COUNTER_POOL_SIZE has invalid value.");
+    ginState->counterSpaceSize = 64 << 10;
+  }
+
+  for (int n = 0; n < ginState->ginCommCount; n++) {
+    void* listenComm;
+    NCCLCHECKGOTO(
+      ginState->ncclGin->listen(ginState->ginInstance, localNets[n%nLocalNets],
+                                allHandles + NCCL_NET_HANDLE_MAXSIZE * comm->rank, &listenComm),
+      ret, fail);
+    NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allHandles, NCCL_NET_HANDLE_MAXSIZE), ret,
+                  fail);
+    NCCLCHECKGOTO(ginState->ncclGin->connect(comm->ginContext, handles, comm->nRanks, comm->rank,
+                                             listenComm, ginState->ginComms + n),
+                  ret, fail);
+    if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) {
+      NCCLCHECKGOTO(ncclGinProxyCreateContext(comm, ginState->ginComms[n], localNets[n%nLocalNets],
+                                              ginState->signalSpaceSize, ginState->counterSpaceSize,
+                                              &ginState->ginCtx[n], &ginState->ginDevHandles[n]),
+                    ret, fail);
+    } else {
+      NCCLCHECKGOTO(ginState->ncclGin->createContext(
+                      ginState->ginComms[n], ginState->signalSpaceSize, ginState->counterSpaceSize,
+                      &ginState->ginCtx[n], &ginState->ginDevHandles[n]),
+                    ret, fail);
+    }
+    NCCLCHECKGOTO(ginState->ncclGin->closeListen(listenComm), ret, fail);
+  }
+  free(handles);
+  handles = NULL;
+  free(allHandles);
+  allHandles = NULL;
+
+  // Check whether we need proxy progress and if so, start / wake up the progress thread.
+  ginState->needsProxyProgress = 0;
+  for (int n = 0; n < ginState->ginCommCount; n++) {
+    if (ginState->ginDevHandles[n]->needsProxyProgress) ginState->needsProxyProgress = 1;
+  }
+  if (ginState->needsProxyProgress) {
+    ginState->ginProgress = 1;
+    pthread_mutex_init(&ginState->threadLock, NULL);
+    pthread_cond_init(&ginState->threadCond, NULL);
+    PTHREADCHECK(pthread_create(&ginState->thread, NULL, ncclGinProgress, ginState), "pthread_create");
+    ncclSetThreadName(ginState->thread, "NCCL GIN Progress%2d", comm->cudaDev);
+  }
+
+  ncclSpaceConstruct(&ginState->counterSpace);
+  ncclSpaceConstruct(&ginState->signalSpace);
+
+exit:
+  if (ret == ncclSuccess) ginState->connected = true;
+  return ret;
+fail:
+  free(allHandles);
+  free(handles);
+  goto exit;
+}
+
+ncclResult_t ncclGinFinalize(struct ncclComm* comm) {
+  struct ncclGinState* ginState = &comm->sharedRes->ginState;
+  if (!ginState->connected) return ncclSuccess;
+
+  if (ginState->needsProxyProgress) {
+    pthread_mutex_lock(&ginState->threadLock);
+    comm->sharedRes->ginState.ginProgress = -1;
+    pthread_cond_signal(&ginState->threadCond);
+    pthread_mutex_unlock(&ginState->threadLock);
+    PTHREADCHECK(pthread_join(ginState->thread, NULL), "pthread_join");
+  }
+
+  if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) {
+    for (int n = 0; n < ginState->ginCommCount; n++) {
+      if (ginState->ginCtx[n] != NULL) {
+        NCCLCHECK(ncclGinProxyDestroyContext(ginState->ncclGin, ginState->ginCtx[n]));
+        ginState->ginCtx[n] = NULL;
+      }
+    }
+  }
+
+  for (int n = 0; n < ginState->ginCommCount; n++) {
+    if (ginState->ginCtx[n] != NULL) {
+      NCCLCHECK(ginState->ncclGin->destroyContext(ginState->ginCtx[n]));
+      ginState->ginCtx[n] = NULL;
+    }
+    if (ginState->ginComms[n] != NULL) {
+      NCCLCHECK(ginState->ncclGin->closeColl(ginState->ginComms[n]));
+      ginState->ginComms[n] = NULL;
+    }
+  }
+  NCCLCHECK(ginState->ncclGin->finalize(ginState->ginInstance));
+  memset(ginState, 0, sizeof(*ginState));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinRegister(struct ncclComm* comm, void* address, size_t size,
+                             void* ginHostWins[NCCL_GIN_MAX_CONTEXTS],
+                             ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]) {
+  struct ncclGinState* ginState = &comm->sharedRes->ginState;
+  for (int n = 0; n < ginState->ginCommCount; n++) { // one registration per GIN context
+    if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) {
+      NCCLCHECK(ncclGinProxyRegister(ginState->ncclGin, ginState->ginCtx[n], address, size,
+                                     NCCL_PTR_CUDA, 0, &ginHostWins[n], &ginDevWins[n]));
+    } else {
+      NCCLCHECK(ginState->ncclGin->regMrSym(ginState->ginComms[n], address, size, NCCL_PTR_CUDA, 0,
+                                            &ginHostWins[n], &ginDevWins[n]));
+    }
+    if (ginHostWins[n] == NULL) { // the call reported success but produced no host window
+      WARN("rank %d - GIN Symmetric register failed: buff %p, size %zu", comm->rank, address, size);
+      return ncclSystemError;
+    }
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinDeregister(struct ncclComm* comm, void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]) {
+  struct ncclGinState* ginState = &comm->sharedRes->ginState;
+  for (int n = 0; n < ginState->ginCommCount; n++) {
+    if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY) {
+      NCCLCHECK(ncclGinProxyDeregister(ginState->ncclGin, ginState->ginCtx[n], ginHostWins[n]));
+    } else {
+      NCCLCHECK(ginState->ncclGin->deregMrSym(ginState->ginComms[n], ginHostWins[n]));
+    }
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinAllocSignalsCounters(struct ncclComm* comm, int nSignals, uint32_t* outSignal0,
+                                         int nCounters, uint32_t* outCounter0) {
+  ncclResult_t ret = ncclSuccess;
+  struct ncclGinState* ginState = &comm->sharedRes->ginState;
+  int64_t start;
+  if (nSignals != 0) {
+    NCCLCHECKGOTO(
+      ncclSpaceAlloc(&ginState->signalSpace, ginState->signalSpaceSize, nSignals, 1, &start), ret,
+      fail);
+    *outSignal0 = (uint32_t)start;
+  }
+  if (nCounters != 0) {
+    NCCLCHECKGOTO(
+      ncclSpaceAlloc(&ginState->counterSpace, ginState->counterSpaceSize, nCounters, 1, &start),
+      ret, fail_signals);
+    *outCounter0 = (uint32_t)start;
+  }
+  return ncclSuccess;
+fail_signals:
+  if (nSignals != 0) ncclSpaceFree(&ginState->signalSpace, *outSignal0, nSignals);
+fail:
+  return ret;
+}
+
+ncclResult_t ncclGinFreeSignalsCounters(struct ncclComm* comm, uint32_t signal0, int nSignals,
+                                        uint32_t counter0, int nCounters) {
+  struct ncclGinState* ginState = &comm->sharedRes->ginState;
+  if (nSignals != 0) ncclSpaceFree(&ginState->signalSpace, signal0, nSignals);
+  if (nCounters != 0) ncclSpaceFree(&ginState->counterSpace, counter0, nCounters);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinQueryLastError(struct ncclGinState* ginState, bool* hasError) {
+  bool hasError_ = false;
+  for (int n = 0; n < ginState->ginCommCount; n++) {
+    if (ginState->ginType == NCCL_NET_DEVICE_GIN_PROXY)
+      NCCLCHECK(ncclGinProxyQueryLastError(ginState->ncclGin, ginState->ginCtx[n], &hasError_));
+    else
+      NCCLCHECK(ginState->ncclGin->queryLastError(ginState->ginCtx[n], &hasError_));
+    if (hasError_) break;
+  }
+  *hasError = hasError_;
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/gin/gin_host_proxy.cc b/projects/rccl/src/gin/gin_host_proxy.cc
new file mode 100644
index 00000000000..511e38b409e
--- /dev/null
+++ b/projects/rccl/src/gin/gin_host_proxy.cc
@@ -0,0 +1,501 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <assert.h>
+#include "nccl.h"
+#include "comm.h"
+#include "gin/gin_host.h"
+#include "alloc.h"
+#include "checks.h"
+#include "gdrwrap.h"
+#include "plugin/nccl_net.h"
+#include "nccl_device/gin/proxy/gin_proxy_device_host_common.h"
+
+NCCL_PARAM(GinProxyQueueSize, "GIN_PROXY_QUEUE_SIZE", -1);
+extern int64_t ncclParamIbDataDirect();
+extern int64_t ncclParamDmaBufEnable();
+
+struct ginProxyGfdState {  // host-side tracking of one GFD slot between submission and completion
+  ncclGinProxyOp_t op;  // op bits copied from the GFD header
+  uint16_t counterId;   // index into ginProxyCtx::counters, taken from the GFD completion qword
+  int done;             // written by the plugin's test(); non-zero once the request completed
+  void *request;        // request handle filled in by the plugin's iput/iputSignal
+};
+
+// Host-side view of the queues shared with the GPU. A member might be on the GPU if it has a
+// *GdrHandle counterpart; a NULL handle means the member is host memory.
+struct ginProxyHostGpuCtx {
+  size_t queueSize;  // entries per rank; used as an index mask, so it must be a power of two
+  // GFD queues, one per rank, laid out back to back: size = nRanks * queueSize
+  ncclGinProxyGfd_t *queues;
+  void *cisGdrHandle;  // GDR handle for cis, NULL when cis is host memory
+  // Consumed Indices, one per rank; written by the proxy to tell the GPU a GFD was consumed
+  uint32_t *cis;
+  // Host copy of cis, to decrease the number of reads/writes to cis which might be on the GPU
+  uint32_t *cisShadow;
+  // Seen Indices, one per rank: count of GFDs the proxy has picked up from that rank's queue
+  uint32_t *sis;
+
+  // Per-slot request state; same size and indexing as queues
+  struct ginProxyGfdState *states;
+  // Per-slot staging for inline payloads; same size as queues
+  uint64_t *inlines;
+  // inlines is registered as a memory region with the GIN plugin
+  void *inlinesMhandle;
+  void *inlinesGinHandle;
+};
+
+// Per-context state of the GIN host proxy, filled in by ncclGinProxyCreateContext.
+struct ginProxyCtx {
+  struct ncclComm *comm;
+  void *collComm;                        // plugin comm handle passed to every plugin call
+  ncclNetDeviceHandle_v11_t *devHandle;  // device handle also returned by CreateContext
+  ncclNetProperties_t props;             // plugin properties obtained through getProperties()
+  // GPU queues, if GDR on the GPU, else on the CPU
+  // Queue size, must be a power of 2
+  struct ginProxyHostGpuCtx *hostGpuCtx;
+
+  void *countersGdrHandle;  // GDR handle for counters, NULL when they are host memory
+  uint64_t *counters;       // completion counters as addressed by the proxy (CPU side)
+  uint64_t *countersDev;    // the same counters as handed to the GPU
+  CUmemGenericAllocationHandle signalsCumemhandle;
+  void *signalsMhandle;  // plugin registration handles for signalsDev
+  void *signalsGinHandle;
+  uint64_t *signalsDev;  // signal array in GPU memory, registered with the plugin
+  int hasError;          // last error recorded by the proxy progress path, 0 if none
+};
+
+// Allocate nelem CPU-accessible elements of T: through ncclGdrCudaCalloc when GDRCopy is
+// available and forceHost is false, else as zeroed host memory. host_flags is not used for now.
+template <typename T>
+static ncclResult_t allocMemCPUAccessible(T **ptr, T **devPtr, size_t nelem, int host_flags,
+                                          void **gdrHandle, bool forceHost = false) {
+  if (!ncclGdrCopy || forceHost) {
+    NCCLCHECK(ncclCuMemHostAlloc((void **)ptr, NULL, nelem * sizeof(T)));
+    memset((void *)*ptr, 0, nelem * sizeof(T));
+    *devPtr = *ptr;
+    if (gdrHandle != NULL) *gdrHandle = NULL;  // a NULL GDR handle marks host-allocated memory
+    return ncclSuccess;
+  }
+  NCCLCHECK(ncclGdrCudaCalloc(ptr, devPtr, nelem, gdrHandle));
+  return ncclSuccess;
+}
+
+// Release memory from allocMemCPUAccessible; the GDR handle decides which free path is used.
+template <typename T>
+static ncclResult_t freeMemCPUAccessible(T *ptr, void *gdrHandle) {
+  if (gdrHandle == NULL) {  // no GDR handle: it was host memory (or GDR was off)
+    NCCLCHECK(ncclCuMemHostFree(ptr));
+  } else {  // a GDR handle exists: it was GDR memory
+    NCCLCHECK(ncclGdrCudaFree(gdrHandle));
+  }
+  return ncclSuccess;
+}
+
+// Export the CUDA range starting at addr (length aligned to the host page size) as a dma-buf
+// file descriptor stored in *fd. Returns ncclInvalidUsage if DMA-BUF is disabled by parameter,
+// unavailable in this CUDA version, or if the driver call does not succeed.
+static ncclResult_t getDmaBufFd(void *addr, size_t length, int *fd,
+                                bool forceNonDataDirect = false) {
+  if (!ncclParamDmaBufEnable()) return ncclInvalidUsage;
+#if CUDA_VERSION >= 11070
+  static size_t hostPageSize = sysconf(_SC_PAGESIZE);
+  size_t rangeSize = length;
+  ALIGN_SIZE(rangeSize, hostPageSize);
+#if CUDA_VERSION >= 12080  // PCIe mapping type is tried first, unless forceNonDataDirect is set
+  if (ncclParamIbDataDirect() && !forceNonDataDirect) {
+    CUresult pcieRes = pfn_cuMemGetHandleForAddressRange(
+      (void *)fd, (CUdeviceptr)addr, rangeSize, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+      CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
+    if (pcieRes == CUDA_SUCCESS) return ncclSuccess;
+  }
+#endif
+  CUresult res = pfn_cuMemGetHandleForAddressRange((void *)fd, (CUdeviceptr)addr, rangeSize,
+                                                   CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0);
+  if (res == CUDA_SUCCESS) return ncclSuccess;
+#endif
+  return ncclInvalidUsage;
+}
+
+// Poll the plugin for every GFD that was seen but not yet consumed, bump the GFD's counter when
+// it completes, and advance the per-rank consumed index over the completed prefix. A failing
+// test() is recorded in ctx->hasError and returned instead of being silently ignored.
+static ncclResult_t proxyGinPollCompletions(ncclGin_t *ginComm, void *collComm,
+                                            struct ginProxyCtx *ctx,
+                                            struct ginProxyHostGpuCtx *hostGpuCtx) {
+  for (int targetRank = 0; targetRank < ctx->comm->nRanks; targetRank++) {
+    uint32_t *ciShadow = &hostGpuCtx->cisShadow[targetRank];
+    // Walk [cisShadow, sis); comparing with != keeps the walk correct across 32-bit wrap-around.
+    for (uint32_t i = *ciShadow; i != hostGpuCtx->sis[targetRank]; i++) {
+      uint32_t idx = i & (hostGpuCtx->queueSize - 1);
+      struct ginProxyGfdState *state =
+        &hostGpuCtx->states[targetRank * hostGpuCtx->queueSize + idx];
+      if (!state->done) {  // no need to poll if already done
+        ncclResult_t testRes = ginComm->test(collComm, state->request, &state->done);
+        if (testRes != ncclSuccess) ctx->hasError = testRes;
+        NCCLCHECK(testRes);
+        if (state->done) {
+          TRACE(NCCL_NET, "GFD completed - stateIdx: %lu, request: %p", state - hostGpuCtx->states,
+                state->request);
+          if (state->op & ncclGinProxyOpWithCounter) {  // bump the counter named in the GFD
+            uint64_t *counter = &ctx->counters[state->counterId];
+            __atomic_store_n(counter, *counter + 1, __ATOMIC_RELAXED);
+            TRACE(NCCL_NET, "Updated counter %d to %ld", state->counterId, *counter);
+          }
+        }
+      }
+      // Advance the CI only over a completed prefix, so holes get resolved on a later pass.
+      if (state->done && i == *ciShadow) {
+        __atomic_store_n(&hostGpuCtx->cis[targetRank], ++*ciShadow, __ATOMIC_RELAXED);
+        TRACE(NCCL_NET, "Updated cis[%u] to %u", targetRank, *ciShadow);
+      }
+    }
+  }
+  return ncclSuccess;
+}
+
+// Try to dequeue the next GFD posted by the GPU for targetRank. Returns 0 when the slot at the
+// seen index is still empty. Otherwise copies the GFD into *gfd, zeroes the queue slot, resets
+// the state entry for that slot (returned in *state), increments sis[targetRank] and returns 1.
+static int proxyGinPollGfd(struct ginProxyCtx *ctx, ginProxyHostGpuCtx *hostGpuCtx, int targetRank,
+                           ncclGinProxyGfd_t *gfd, struct ginProxyGfdState **state) {
+  ncclGinProxyGfd_t *q = hostGpuCtx->queues + targetRank * hostGpuCtx->queueSize;
+  uint32_t idx = hostGpuCtx->sis[targetRank] & (hostGpuCtx->queueSize - 1);
+  ncclGinProxyQword_t qword;
+  __atomic_load(&q[idx].qword[ncclGinProxyGfdHeader].raw, &qword.raw, __ATOMIC_RELAXED);
+  if (qword.flag.v == 0) {
+    return 0;
+  }
+  // We know for sure that the first qword is there, copy it.
+  gfd->qword[ncclGinProxyGfdHeader] = q[idx].qword[ncclGinProxyGfdHeader];
+  // Wait for and copy the other qwords: spin on each one until its flag is set.
+  for (int k = 1; k < ncclGinProxyGfdQwords; k++) {
+    do {
+      __atomic_load(&q[idx].qword[k].raw, &qword.raw, __ATOMIC_RELAXED);
+    } while (qword.flag.v == 0);
+    gfd->qword[k] = qword;
+  }
+  // Now we have the full GFD in the local struct.
+
+  // Reset the GFD in the queue. This lets the producer know that the GFD is consumed.
+  for (int k = 0; k < ncclGinProxyGfdQwords; k++) {
+    __atomic_store_n(&q[idx].qword[k].raw, 0, __ATOMIC_RELAXED);
+  }
+
+  // Initialize the state entry sharing this slot's index; op and counterId come from the GFD.
+  uint32_t stateIdx = targetRank * hostGpuCtx->queueSize + idx;
+  *state = &hostGpuCtx->states[stateIdx];
+  (*state)->op = (ncclGinProxyOp_t)(gfd->qword[ncclGinProxyGfdHeader].header.op);
+  (*state)->counterId = gfd->qword[ncclGinProxyGfdCompletion].completion.counterId;
+  (*state)->done = 0;
+  (*state)->request = NULL;
+
+  TRACE(NCCL_NET,
+        "GFD to target PE %d raw idx: %u, idx: %u - op: %#lx, size: %lu, srcOff: %lu, dstOff: %lu, "
+        "srcHandle: %lu, dstHandle: %lu, counterId: %u, signalId: %u, stateIdx: %u",
+        targetRank, hostGpuCtx->sis[targetRank], idx, gfd->qword[ncclGinProxyGfdHeader].header.op,
+        gfd->qword[ncclGinProxyGfdHeader].header.size,
+        gfd->qword[ncclGinProxyGfdSrcOff].srcOff.srcOff,
+        gfd->qword[ncclGinProxyGfdDstOff].dstOff.dstOff,
+        gfd->qword[ncclGinProxyGfdSrcHandle].srcHandle.srcHandle,
+        gfd->qword[ncclGinProxyGfdDstHandle].dstHandle.dstHandle,
+        gfd->qword[ncclGinProxyGfdCompletion].completion.counterId,
+        gfd->qword[ncclGinProxyGfdCompletion].completion.signalId, stateIdx);
+  hostGpuCtx->sis[targetRank]++;  // the GFD is now seen; completion is tracked through *state
+  return 1;
+}
+
+// Map the signal bits of a GFD's op (completion mask with the counter bit removed) to the
+// plugin's NCCL_NET_SIGNAL_OP_* code. Returns -1 for any other value, which the caller treats
+// as a put without signal.
+static int mapGfdOpToCollNetOp(ncclGinProxyGfd_t *gfd) {
+  const auto signalBits = gfd->qword[ncclGinProxyGfdHeader].header.op &
+                          (ncclGinProxyOpComplMask & ~ncclGinProxyOpWithCounter);
+  int netOp = -1;
+  if (signalBits == ncclGinProxyOpWithSignalInc) netOp = NCCL_NET_SIGNAL_OP_INC;
+  else if (signalBits == ncclGinProxyOpWithSignalAdd) netOp = NCCL_NET_SIGNAL_OP_ADD;
+  return netOp;
+}
+
+// Submit one dequeued GFD to the GIN plugin as a put towards targetRank (with a signal when the
+// op asks for one) and store the plugin request in state->request. Returns the plugin's error,
+// or ncclInternalError when the base op is not a put.
+static ncclResult_t proxyGinProcessGfd(ncclGin_t *ginComm, void *collComm, struct ginProxyCtx *ctx,
+                                       struct ginProxyHostGpuCtx *hostGpuCtx, int targetRank,
+                                       ncclGinProxyGfd_t *gfd, struct ginProxyGfdState *state) {
+  // proxyGinPollGfd does not validate the op and assert() vanishes under NDEBUG, so reject
+  // unknown base ops explicitly rather than report success with nothing submitted.
+  if ((gfd->qword[ncclGinProxyGfdHeader].header.op & ncclGinProxyOpBaseMask) != ncclGinProxyOpPut) {
+    WARN("GIN proxy: unsupported base op in GFD for rank %d", targetRank);
+    return ncclInternalError;
+  }
+  uint64_t size = gfd->qword[ncclGinProxyGfdHeader].header.size;
+  uint64_t srcOff;
+  void *srcHandle;
+  if (gfd->qword[ncclGinProxyGfdHeader].header.op & ncclGinProxyOpWithInline) {
+    // gfd is the caller's local copy, so its address is not a queue slot. states and inlines are
+    // indexed like queues, so (state - states) is the slot this GFD was dequeued from.
+    uint64_t *inlineVal = &hostGpuCtx->inlines[state - hostGpuCtx->states];
+    srcOff = (uint64_t)inlineVal - (uint64_t)hostGpuCtx->inlines;
+    // reconstruct the inline value from the two qwords
+    *inlineVal = gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.inlineValLow;
+    if (size == 8) {
+      *inlineVal |= (uint64_t)gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.inlineValLow2 << 32;
+      *inlineVal |= (uint64_t)gfd->qword[ncclGinProxyGfdInlineHigh].inlineHigh.inlineValHigh << 48;
+    }
+    srcHandle = hostGpuCtx->inlinesMhandle;
+  } else {
+    srcOff = gfd->qword[ncclGinProxyGfdSrcOff].srcOff.srcOff;
+    // First cast from 63 bits to 64 bits and then to void * to avoid warnings
+    srcHandle = (void *)(uint64_t)gfd->qword[ncclGinProxyGfdSrcHandle].srcHandle.srcHandle;
+  }
+  uint64_t dstOff = gfd->qword[ncclGinProxyGfdDstOff].dstOff.dstOff;
+  void *dstHandle = (void *)(uint64_t)gfd->qword[ncclGinProxyGfdDstHandle].dstHandle.dstHandle;
+  int signalOp = mapGfdOpToCollNetOp(gfd);
+  if (signalOp == -1) {
+    NCCLCHECK(ginComm->iput(collComm, srcOff, srcHandle, size, dstOff, dstHandle, targetRank,
+                            &state->request));
+  } else {
+    // reconstruct the signal value from the two qwords
+    uint64_t signalVal = gfd->qword[ncclGinProxyGfdCompletion].completion.signalValLow;
+    signalVal |= (uint64_t)gfd->qword[ncclGinProxyGfdSignalVal].signalVal.signalValLow2 << 16;
+    signalVal |= (uint64_t)gfd->qword[ncclGinProxyGfdSignalVal].signalVal.signalValHigh << 32;
+    uint64_t signalOff =
+      gfd->qword[ncclGinProxyGfdCompletion].completion.signalId * sizeof(uint64_t);
+    NCCLCHECK(ginComm->iputSignal(collComm, srcOff, srcHandle, size, dstOff, dstHandle,
+                                  targetRank, signalOff, ctx->signalsGinHandle, signalVal,
+                                  signalOp, &state->request));
+  }
+  TRACE(NCCL_NET, "GFD submitted into GIN plugin - stateIdx: %lu, request: %p",
+        state - hostGpuCtx->states, state->request);
+  return ncclSuccess;
+}
+
+static uint64_t isPowerOfTwo(uint64_t n) { return n != 0 && (n & (n - 1)) == 0; }  // 1 iff n is 2^k
+
+// Register [addr, addr + size) with the GIN plugin. CUDA memory is registered through a dma-buf
+// fd when DMA-BUF is enabled and supported by the plugin (retried once with forceNonDataDirect),
+// falling back to plain regMrSym when no fd is available. Unknown pointer types are rejected.
+static ncclResult_t ncclGinProxyRegMrSym(ncclGin_t *ginComm, struct ginProxyCtx *ctx, void *addr,
+                                         size_t size, int type, int mr_flags, void **mhandle,
+                                         void **ginHandle) {
+  if (type == NCCL_PTR_HOST) {
+    NCCLCHECK(ginComm->regMrSym(ctx->collComm, addr, size, type, mr_flags, mhandle, ginHandle));
+  } else if (type == NCCL_PTR_CUDA) {
+    ncclResult_t dmabufResult = ncclInvalidUsage;
+    if (ncclParamDmaBufEnable() && (ctx->props.ptrSupport & NCCL_PTR_DMABUF)) {
+      ncclResult_t registrationResult = ncclSuccess;
+      int dmabufFd = -1;
+      dmabufResult = getDmaBufFd(addr, size, &dmabufFd);
+      if (dmabufResult == ncclSuccess) {
+        registrationResult = ginComm->regMrSymDmaBuf(ctx->collComm, addr, size, type, 0, dmabufFd,
+                                                     mr_flags, mhandle, ginHandle);
+        close(dmabufFd);
+      }
+      if (registrationResult != ncclSuccess) {
+        dmabufFd = -1;
+        dmabufResult = getDmaBufFd(addr, size, &dmabufFd, true);
+        if (dmabufResult == ncclSuccess) {
+          registrationResult = ginComm->regMrSymDmaBuf(ctx->collComm, addr, size, type, 0,
+                                                       dmabufFd, mr_flags, mhandle, ginHandle);
+          close(dmabufFd);  // close first: returning from NCCLCHECK would otherwise leak the fd
+          NCCLCHECK(registrationResult);
+        }
+      }
+    }
+    // Fallback to non-DMA-BUF if the DMA-BUF handle is not supported
+    if (dmabufResult != ncclSuccess)
+      NCCLCHECK(ginComm->regMrSym(ctx->collComm, addr, size, type, mr_flags, mhandle, ginHandle));
+  } else {
+    return ncclInvalidUsage;
+  }
+  return ncclSuccess;
+}
+
+// Build the host-proxy GIN context for comm on plugin device devId: pick a valid GFD queue size,
+// allocate the counters, signals and per-rank queues, register the signal and inline buffers
+// with the plugin and upload the device-side descriptor. On success *outGinCtx receives the
+// context and *outDevHandle the device handle.
+// NOTE(review): most failure returns below do not release earlier allocations - confirm callers.
+ncclResult_t ncclGinProxyCreateContext(struct ncclComm *comm, void *collComm, int devId,
+                                       int nSignals, int nCounters, void **outGinCtx,
+                                       ncclNetDeviceHandle_v11_t **outDevHandle) {
+  ncclGin_t *ginComm = (ncclGin_t *)comm->sharedRes->ginState.ncclGin;
+  if (!ncclGdrCopy) INFO(NCCL_NET, "GIN Proxy will not be using GDRCopy");
+  struct ginProxyCtx *proxyCtx = NULL;
+  NCCLCHECK(ncclCalloc(&proxyCtx, 1));
+  proxyCtx->comm = comm;
+  proxyCtx->collComm = collComm;
+
+  // Sanitize the queue size
+  NCCLCHECK(ginComm->getProperties(devId, &proxyCtx->props));
+  uint32_t maxRequests = NCCL_NET_MAX_REQUESTS * proxyCtx->props.maxRecvs;
+  // Queue slots are addressed with a (queueSize - 1) mask, so the default/maximum itself must be
+  // a power of two: round it down by clearing low set bits until a single bit remains.
+  while (maxRequests != 0 && !isPowerOfTwo(maxRequests)) maxRequests &= maxRequests - 1;
+  if (maxRequests == 0) {
+    WARN("GIN Proxy: maxRecvs %d leaves no room for requests", proxyCtx->props.maxRecvs);
+    free(proxyCtx);
+    return ncclInternalError;
+  }
+  int64_t requestedSize = ncclParamGinProxyQueueSize();  // -1 means "not set"
+  uint64_t queueSize = (requestedSize == -1) ? maxRequests : (uint64_t)requestedSize;
+  if (queueSize > maxRequests) {
+    INFO(NCCL_NET,
+         "NCCL_GIN_PROXY_QUEUE_SIZE is greater than the maximum outstanding requests in the GIN "
+         "plugin (%d), using the default/maximum value instead",
+         maxRequests);
+    queueSize = maxRequests;
+  }
+  if (queueSize < 1) {
+    INFO(NCCL_NET,
+         "NCCL_GIN_PROXY_QUEUE_SIZE is less than 1, using the default/maximum value instead");
+    queueSize = maxRequests;
+  }
+  if (!isPowerOfTwo(queueSize)) {
+    INFO(
+      NCCL_NET,
+      "NCCL_GIN_PROXY_QUEUE_SIZE is not a power of two, using the default/maximum value instead");
+    queueSize = maxRequests;
+  }
+
+  // Allocate the counters on the GPU or CPU depending on GDR
+  NCCLCHECK(allocMemCPUAccessible(&proxyCtx->counters, &proxyCtx->countersDev, nCounters,
+                                  CU_MEMHOSTALLOC_WRITECOMBINED, &proxyCtx->countersGdrHandle));
+
+  // Allocate the signals on the GPU and then register the memory region with the GIN plugin.
+  // Enforcing strong ordering on the signals mr is vital to ensure ordering between puts and
+  // signals.
+  size_t signalsBufSize = nSignals * sizeof(uint64_t);
+  NCCLCHECK(ncclCuMemAlloc((void **)&proxyCtx->signalsDev, &proxyCtx->signalsCumemhandle,
+                           CU_MEM_HANDLE_TYPE_NONE, signalsBufSize));
+  CUDACHECK(cudaMemset(proxyCtx->signalsDev, 0, signalsBufSize));
+  NCCLCHECK(ncclGinProxyRegMrSym(ginComm, proxyCtx, proxyCtx->signalsDev, signalsBufSize,
+                                 NCCL_PTR_CUDA, NCCL_NET_MR_FLAG_FORCE_SO,
+                                 &proxyCtx->signalsMhandle, &proxyCtx->signalsGinHandle));
+
+  NCCLCHECK(ncclCalloc(&proxyCtx->hostGpuCtx, 1));
+  struct ginProxyHostGpuCtx *hostGpuCtx = proxyCtx->hostGpuCtx;
+  hostGpuCtx->queueSize = queueSize;
+  size_t queuesLength = hostGpuCtx->queueSize * comm->nRanks;
+  NCCLCHECK(ncclCalloc(&hostGpuCtx->states, queuesLength));
+  NCCLCHECK(ncclCalloc(&hostGpuCtx->cisShadow, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&hostGpuCtx->sis, comm->nRanks));
+  NCCLCHECK(ncclCalloc(&hostGpuCtx->inlines, queuesLength));
+  NCCLCHECK(ncclGinProxyRegMrSym(ginComm, proxyCtx, hostGpuCtx->inlines,
+                                 queuesLength * sizeof(uint64_t), NCCL_PTR_HOST, 0,
+                                 &hostGpuCtx->inlinesMhandle, &hostGpuCtx->inlinesGinHandle));
+
+  ncclGinProxyGpuCtx_t devGpuCtx_h;
+  devGpuCtx_h.nranks = comm->nRanks;
+  devGpuCtx_h.queueSize = hostGpuCtx->queueSize;
+  devGpuCtx_h.counters = proxyCtx->countersDev;
+  devGpuCtx_h.signals = proxyCtx->signalsDev;
+  NCCLCHECK(ncclCudaCalloc(&devGpuCtx_h.pis, comm->nRanks));
+  // Allocate the GFD queues (forced to host memory) and the CIs (on either the CPU or the GPU,
+  // depending on GDR).
+  NCCLCHECK(allocMemCPUAccessible(&hostGpuCtx->queues, &devGpuCtx_h.queues, queuesLength, 0,
+                                  NULL, true /*forceHost*/));
+  NCCLCHECK(allocMemCPUAccessible(&hostGpuCtx->cis, &devGpuCtx_h.cis, comm->nRanks,
+                                  CU_MEMHOSTALLOC_WRITECOMBINED, &hostGpuCtx->cisGdrHandle));
+  ncclGinProxyGpuCtx_t *devGpuCtx_d = NULL;
+  NCCLCHECK(ncclCudaCalloc(&devGpuCtx_d, 1));
+  // Copy the proxy's devGpuCtx to the GPU
+  NCCLCHECK(ncclCudaMemcpy(devGpuCtx_d, &devGpuCtx_h, 1));
+  ncclNetDeviceHandle_v11_t *devHandle = NULL;
+  NCCLCHECK(ncclCalloc(&devHandle, 1));
+  devHandle->netDeviceType = NCCL_NET_DEVICE_GIN_PROXY;
+  devHandle->netDeviceVersion = NCCL_GIN_PROXY_VERSION;
+  devHandle->handle = (void *)devGpuCtx_d;
+  devHandle->size = 0;
+  devHandle->needsProxyProgress = 1;
+  proxyCtx->devHandle = devHandle;
+  *outDevHandle = devHandle;
+  *outGinCtx = proxyCtx;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinProxyRegister(ncclGin_t *ginComm, void *ginCtx, void *addr, size_t size,
+                                  int type, int mr_flags, void **mhandle, void **ginHandle) {
+  // Thin wrapper: the opaque ginCtx is the proxy context expected by ncclGinProxyRegMrSym.
+  NCCLCHECK(ncclGinProxyRegMrSym(ginComm, (struct ginProxyCtx *)ginCtx, addr, size, type,
+                                 mr_flags, mhandle, ginHandle));
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinProxyDeregister(ncclGin_t *ginComm, void *ginCtx, void *mhandle) {
+  // Hand mhandle back to the GIN plugin for deregistration, using the collComm stored in the
+  // proxy context that ginCtx points to.
+  NCCLCHECK(ginComm->deregMrSym(((struct ginProxyCtx *)ginCtx)->collComm, mhandle));
+  return ncclSuccess;
+}
+
+// Best-effort teardown of a context built by ncclGinProxyCreateContext. A NULL ginCtx is a no-op
+// and members that were never set are skipped; errors from individual frees are ignored so the
+// remaining resources still get released. Always returns ncclSuccess.
+ncclResult_t ncclGinProxyDestroyContext(ncclGin_t *ginComm, void *ginCtx) {
+  if (!ginCtx) return ncclSuccess;
+  struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx;
+  // Free counters
+  if (ctx->counters || ctx->countersGdrHandle)
+    freeMemCPUAccessible(ctx->counters, ctx->countersGdrHandle);
+
+  // Free signals: deregister first, then release the memory
+  if (ginComm && ctx->collComm && ctx->signalsMhandle)
+    ginComm->deregMrSym(ctx->collComm, ctx->signalsMhandle);
+  if (ctx->signalsDev) ncclCudaFree(ctx->signalsDev);
+
+  // Free hostGpuCtx and its allocations
+  struct ginProxyHostGpuCtx *hostGpuCtx = ctx->hostGpuCtx;
+  if (hostGpuCtx) {
+    // Deregister the inline buffer before freeing it, so the plugin never holds a registration
+    // over released memory (same order as for the signals).
+    if (ginComm && ctx->collComm && hostGpuCtx->inlinesMhandle)
+      ginComm->deregMrSym(ctx->collComm, hostGpuCtx->inlinesMhandle);
+    free(hostGpuCtx->inlines);  // free(NULL) is a no-op, so no guards are needed
+    free(hostGpuCtx->cisShadow);
+    free(hostGpuCtx->sis);
+    free(hostGpuCtx->states);
+    if (hostGpuCtx->queues) freeMemCPUAccessible(hostGpuCtx->queues, NULL);
+    if (hostGpuCtx->cis || hostGpuCtx->cisGdrHandle)
+      freeMemCPUAccessible(hostGpuCtx->cis, hostGpuCtx->cisGdrHandle);
+    free(hostGpuCtx);
+  }
+
+  ncclNetDeviceHandle_v11_t *devHandle = (ncclNetDeviceHandle_v11_t *)ctx->devHandle;
+  if (devHandle) {
+    if (devHandle->handle) ncclCudaFree((void *)devHandle->handle);
+    free(devHandle);
+  }
+  free(ctx);
+  return ncclSuccess;
+}
+
+// One progress pass: reap completions of submitted GFDs, then for every rank dequeue at most one
+// new GFD and submit it. A submission failure is stored in ctx->hasError and returned.
+ncclResult_t ncclGinProxyProgress(ncclGin_t *ginComm, void *ginCtx) {
+  struct ginProxyCtx *ctx = (struct ginProxyCtx *)ginCtx;
+  NCCLCHECK(proxyGinPollCompletions(ginComm, ctx->collComm, ctx, ctx->hostGpuCtx));
+  for (int rank = 0; rank < ctx->comm->nRanks; rank++) {
+    ncclGinProxyGfd_t gfd;
+    struct ginProxyGfdState *state = NULL;
+    if (proxyGinPollGfd(ctx, ctx->hostGpuCtx, rank, &gfd, &state) != 0) {
+      ncclResult_t res =
+        proxyGinProcessGfd(ginComm, ctx->collComm, ctx, ctx->hostGpuCtx, rank, &gfd, state);
+      if (res != ncclSuccess) ctx->hasError = res;
+      NCCLCHECK(res);
+    }
+    // Let the plugin make progress when it exposes a progress hook.
+    if (ginComm->ginProgress != NULL) ginComm->ginProgress(ctx->collComm);
+  }
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinProxyQueryLastError(ncclGin_t *ginComm, void *ginCtx, bool *hasError) {
+  // Report whether an error was recorded in this proxy context; ginComm is not used.
+  *hasError = (((struct ginProxyCtx *)ginCtx)->hasError != 0);
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/graph/paths.cc b/projects/rccl/src/graph/paths.cc
index ae44d4c0fc0..a9517f9531f 100644
--- a/projects/rccl/src/graph/paths.cc
+++ b/projects/rccl/src/graph/paths.cc
@@ -271,14 +271,18 @@ ncclResult_t ncclGetUserP2pLevel(int* level) {
   return ncclSuccess;
 }
 
+// Tests two ranks for CUDA P2P connectivity.
+// *cudaP2p returns 1 if CUDA P2P between the ranks is supported.
+// *p2p returns 1 only if the distance between the ranks is no greater than NCCL_P2P_LEVEL.  The connection may go through an intermediate rank.
 ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2,
-                              int* p2p, int *read, int* intermediateRank) {
+                              int* p2p, int *read, int* intermediateRank, int* cudaP2p) {
   int mnnvl = 0;
   struct ncclPeerInfo* info1 = NULL;
   struct ncclPeerInfo* info2 = NULL;
   *p2p = 0;
   if (read) *read = 0;
   if (intermediateRank) *intermediateRank = -1;
+  if (cudaP2p) *cudaP2p = 0;
 
   // Rule out different nodes / isolated containers
   if (comm) {
@@ -341,10 +345,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst
 
 #if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
   if (*p2p == 1) {
-    // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
-    // validate against NVML at all since they are pretending to be on other hw.
-    if (g1 != g2 && (comm == NULL || (info1->hostHash == comm->peerInfo[comm->rank].hostHash &&
-                                      info1->hostHash == info2->hostHash)) && ncclParamIgnoreDisabledP2p() != 2) {
+    if (checkNvml) {
       int indexes[3] = {-1,-1,-1};
       int verticeN = 0;
       NCCLCHECK(ncclNvmlEnsureInitialized());
@@ -381,6 +382,26 @@ ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* syst
     if (read && (gpu1->gpu.cudaCompCap == gpu2->gpu.cudaCompCap) && (gpu1->gpu.cudaCompCap == 80)) *read = 1;
   }
 
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
+  if (cudaP2p) {
+    if (checkNvml) {
+      int n1, n2;
+      n1 = system->nodes[GPU].nodes[g1].gpu.dev;
+      n2 = system->nodes[GPU].nodes[g2].gpu.dev;
+      *cudaP2p = (ncclNvmlDevicePairs[n1][n2].p2pStatusRead == NVML_P2P_STATUS_OK &&
+                  ncclNvmlDevicePairs[n1][n2].p2pStatusWrite == NVML_P2P_STATUS_OK);
+    } else {
+      // We assume P2P connectivity in case the ranks are connected using MNNVL or are on the same host.
+      *cudaP2p = (mnnvl || comm == NULL || info1->hostHash == info2->hostHash);
+    }
+  }
+#else
+  if (cudaP2p) {
+    // On AMD/HIP, assume P2P connectivity based on MNNVL or same host
+    *cudaP2p = (mnnvl || comm == NULL || info1->hostHash == info2->hostHash);
+  }
+#endif
+
   return ncclSuccess;
 }
 
@@ -632,7 +653,7 @@ ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks,
   struct ncclTopoSystem* system = comm->topo;
   *nranks = 0;
   *intermediateRanks = NULL;
-  if (system->nodes[NET].count == 0) return ncclSuccess;
+  if (system->inter == 0) return ncclSuccess;
 
   int nr = 0;
   int* ranks = NULL;
@@ -715,7 +736,7 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm
     for (int p=0; p<system->nodes[GPU].count; p++) {
       int p2p;
       NCCLCHECK(ncclTopoCheckP2p(comm, system, system->nodes[GPU].nodes[p].gpu.rank,
-                                 system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL));
+                                 system->nodes[GPU].nodes[g].gpu.rank, &p2p, NULL, NULL, NULL));
       if (p2p == 0) {
         // Divert all traffic through the CPU
         int cpu;
@@ -926,6 +947,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
     for (int n=system->nodes[NET].count-1; n>=0; n--)
       NCCLCHECKGOTO(ncclTopoRemoveNode(system, NET, n), ret, fail);
   }
+  system->inter = system->nodes[GPU].count == comm->nRanks ? 0 : 1;
 exit:
   free(domains);
   if (ids) free(ids);
diff --git a/projects/rccl/src/graph/rings.cc b/projects/rccl/src/graph/rings.cc
index 553554e2b79..382ba32ea19 100644
--- a/projects/rccl/src/graph/rings.cc
+++ b/projects/rccl/src/graph/rings.cc
@@ -31,6 +31,11 @@ void dumpLine(int* values, int nranks, const char* prefix) {
 }
 
 ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
+  ncclResult_t ret = ncclSuccess;
+  uint64_t* rankFound;
+  int rankFoundSize = DIVUP(nranks, 64);
+  NCCLCHECK(ncclCalloc(&rankFound, rankFoundSize));
+
   for (int r=0; r<nrings; r++) {
     char prefix[40];
     /*sprintf(prefix, "[%d] Channel %d Prev : ", rank, r);
@@ -40,6 +45,7 @@ ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* p
 
     int current = rank;
     for (int i=0; i<nranks; i++) {
+      rankFound[current/64] |= (1ULL<<(current%64)); // 64-bit shift: the bit index can reach 63
       rings[r*nranks+i] = current;
       current = next[r*nranks+current];
     }
@@ -47,24 +53,25 @@ ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* p
     if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
     if (current != rank) {
       WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
-      return ncclInternalError;
+      ret = ncclInternalError;
+      goto end;
     }
     // Check that all ranks are there
     for (int i=0; i<nranks; i++) {
-      int found = 0;
-      for (int j=0; j<nranks; j++) {
-        if (rings[r*nranks+j] == i) {
-          found = 1;
-          break;
-        }
-      }
-      if (found == 0) {
+      uint64_t bits = rankFound[i/64], mask = 1ULL<<(i%64); // must match the 64-bit shift used when marking
+      // Fast check 64 ranks at a time
+      if (mask == 1 && bits == 0xffffffffffffffff) { i += 63; continue; }
+      if ((bits & mask) == 0) {
         WARN("Error : ring %d does not contain rank %d", r, i);
-        return ncclInternalError;
+        ret = ncclInternalError;
+        goto end;
       }
     }
+    memset(rankFound, 0, rankFoundSize*sizeof(uint64_t));
   }
-  return ncclSuccess;
+end:
+  free(rankFound);
+  return ret;
 }
 
 /**
diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc
index 897e0f8f1a8..cdd2eb57de7 100644
--- a/projects/rccl/src/graph/search.cc
+++ b/projects/rccl/src/graph/search.cc
@@ -42,7 +42,7 @@ static float getTotalBw(struct ncclTopoSystem* system, struct ncclTopoNode* gpu)
 ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system) {
   system->maxBw = 0.0;
   system->totalBw = 0.0;
-  int inter = system->nodes[NET].count;
+  int inter = system->inter;
   if (inter == 0 && system->nodes[GPU].count == 1) {
     system->maxBw = LOC_BW;
     system->totalBw = LOC_BW;
@@ -533,14 +533,14 @@ static ncclResult_t ncclTopoPrefNetsChannelFirst(struct ncclTopoSystem* system,
   return ncclSuccess;
 }
 
-// Build a sorted list of the NETs to try.
+// Build a sorted list of the NETs to try, the list will follow the NETDEVS_POLICY set by the user.
 //
-// "gpu" can be set to -1 to build a list suitable for all GPUs (search start) or to a given gpu
-//  index when trying to get back to the NIC.
+// The value of "gpu" can be set to -1 to build a list suitable for all GPUs (for example for the search start).
+// The value of "gpu" can be set to the desired index when trying to get back to the NIC.
 //
 // The list is built the following way:
-// 1. Select NETs starting with those close to GPU(s), based on paths[n].type.
-// 2. add other NETs satisfying typeInter but not already in the list.
+// 1. First gather the preferred NETs for each of the GPU(s), based on the NETDEVS_POLICY and the connection.
+// 2. If the NETDEVS_POLICY allows it, add all the other NETs satisfying typeInter but not already in the list of preferred NETs.
 NCCL_PARAM(ScatterEnable, "MNNVL_SCATTER_NETS_ENABLE", 1);
 ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, int gpu, int nets[NCCL_TOPO_MAX_NODES], int* netCountRet) {
   ncclResult_t ret = ncclSuccess;
@@ -555,9 +555,19 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
     NCCLCHECK(ncclTopoPrefNetsChannelFirst(system, gpu, nets, &netCount));
   }
 
+  // Get the maximum number of network devices allowed, depending on the policy.
+  // If the policy is not MAX, then allow all devices.
+  int maxDevCount = 0;
+  enum netDevsPolicy netDevsPolicy;
+  NCCLCHECK(ncclTopoGetNetDevsPolicy(&netDevsPolicy, &maxDevCount));
+  if (gpu == -1) maxDevCount *= system->nodes[GPU].count;
+  if (netDevsPolicy != NETDEVS_POLICY_MAX) maxDevCount = NCCL_TOPO_MAX_NODES;
+  if (netCount >= maxDevCount) goto exit;
+
   // Then add others satisfying typeInter
   for (int t=0; t <= typeInter; t++) {
     for (int g = 0; g < system->nodes[GPU].count; g++) {
+      // do not consider this GPU if it's not the GPU we asked for
       if (gpu != -1 && gpu != g) continue;
       int localNetCount = 0, localNets[MAXCHANNELS];
       struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
@@ -569,16 +579,37 @@ ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int typeInter, in
       for (int i=0; i<localNetCount; i++) {
         int n = localNets[i];
         int found = 0;
-        while (found<netCount && nets[found] != n) found++;
+        while (found < netCount && nets[found] != n) found++;
         if (found == netCount) nets[netCount++] = n;
+        if (netCount >= maxDevCount) goto exit;
       }
     }
   }
 
+exit:
   *netCountRet = netCount;
   return ret;
 }
 
+NCCL_PARAM(MnnvlRailPerHost, "MNNVL_RAIL_PER_HOST", 0);
+
+// Tell whether NET index n is an acceptable NIC to get back to for a channel that started on startNet, given the graph pattern and crossNic.
+static bool ncclTopoSearchCheckNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoNode* startNet, int n, int step) {
+  const struct ncclTopoNode* cand = system->nodes[NET].nodes+n;
+  const int pattern = graph->pattern;
+  if (pattern == NCCL_TOPO_PATTERN_TREE && cand->id != startNet->id) return false; // Trees are symmetric
+  if (pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) {
+    if ((graph->nChannels & 1) && cand->id != graph->inter[(graph->nChannels - 1) * 2]) return false;
+  } else if (graph->crossNic == 0) {
+    // Different hosts in an MNNVL system: rails are per host and identified with the PCI id; otherwise the ASIC identifies the rail.
+    const bool perHostRail = ncclParamMnnvlRailPerHost() && NCCL_TOPO_ID_SYSTEM_ID(cand->id) != NCCL_TOPO_ID_SYSTEM_ID(startNet->id);
+    const bool sameDevice = perHostRail ? (cand->net.pciId == startNet->net.pciId) : (cand->net.asic == startNet->net.asic);
+    if (!sameDevice || cand->net.port != startNet->net.port) return false;
+  }
+  if (pattern == NCCL_TOPO_PATTERN_BALANCED_TREE && step != 0 && cand->id != graph->inter[graph->nChannels*2+1]) return false;
+  return true;
+}
+
 ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, struct ncclTopoNode* gpu, int step, int backToNet, int backToFirstRank, int forcedOrder, int *time) {
   if ((*time) <= 0) return ncclSuccess;
   (*time)--;
@@ -604,7 +635,7 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
   int nets[NCCL_TOPO_MAX_NODES];
   if (step == backToNet) {
     // first get back to NIC
-    if (system->nodes[NET].count) {
+    if (system->inter) {
       int startNetIndex;
       NCCLCHECK(getNetIndex(system, graph->inter[graph->nChannels*2], &startNetIndex));
       struct ncclTopoNode* startNet = system->nodes[NET].nodes+startNetIndex;
@@ -612,24 +643,17 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
       NCCLCHECK(ncclTopoSelectNets(system, graph->typeInter, g, nets, &netCount));
       for (int i=0; i<netCount; i++) {
         int n = nets[i];
-        struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-        if (graph->pattern == NCCL_TOPO_PATTERN_TREE && net->id != startNet->id) continue; // Trees are symmetric
-        if (graph->pattern == NCCL_TOPO_PATTERN_RING && graph->crossNic == 2) {
-          if (graph->nChannels & 1 && net->id != graph->inter[(graph->nChannels-1)*2]) continue;
-        } else {
-          if (graph->crossNic == 0 && (net->net.asic != startNet->net.asic || net->net.port != startNet->net.port)) continue;
-        }
-
+        if (!ncclTopoSearchCheckNet(system, graph, startNet, n, step)) continue;
         // Balanced Tree : count half of the bandwidth on first two GPUs
         int nextBackToNet = -1;
         float bwInterSave = graph->bwInter;
         if (graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE) {
           // Count half of the bandwidth on each of the first two GPUs
           if (step == 0) nextBackToNet = 1;
-          else if (net->id != graph->inter[graph->nChannels*2+1]) continue;
           graph->bwInter /= 2;
         }
 
+        struct ncclTopoNode* net;
         NCCLCHECK(ncclTopoFollowPath(system, graph, GPU, g, NET, n, 1, &net));
         graph->bwInter = bwInterSave;
         if (net) {
@@ -927,7 +951,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struc
   int* intra = graph->intra+ngpus*c;
   NCCLCHECK(xmlAddNode(xml, parent, "channel", &xmlChannel));
   struct ncclXmlNode* node;
-  if (system->nodes[NET].count) {
+  if (system->inter) {
     NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
     NCCLCHECK(xmlSetAttrLong(node, "dev", inter[0]));
   }
@@ -947,7 +971,7 @@ ncclResult_t ncclTopoGetXmlFromChannel(struct ncclTopoGraph* graph, int c, struc
     NCCLCHECK(xmlSetAttrLong(node, "dev", dev));
     if (graph->id == 3) break; // NVLS graphs only use the first GPU
   }
-  if (system->nodes[NET].count) {
+  if (system->inter) {
     NCCLCHECK(xmlAddNode(xml, xmlChannel, "net", &node));
     NCCLCHECK(xmlSetAttrLong(node, "dev", inter[1]));
   }
@@ -1039,7 +1063,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
     NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra));
     NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra));
   }
-  if (system->nodes[NET].count > 0) {
+  if (system->inter) {
     NCCLCHECK(ncclTopoGetGpuMinPath(system, NET, &minTypeInter));
     NCCLCHECK(ncclTopoGetGpuMaxPath(system, NET, &maxTypeInter));
     maxTypeIntra = maxTypeInter;
@@ -1124,7 +1148,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
 
   if (ngpus == 1) if (graph->pattern != NCCL_TOPO_PATTERN_RING) graph->pattern = NCCL_TOPO_PATTERN_TREE;
 
-  if (system->nodes[NET].count == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
+  if (system->inter == 0 && graph->pattern == NCCL_TOPO_PATTERN_NVLS) {
     // Force intra-node NVLS algorithm to pull evenly from all GPUs.
     graph->minChannels = graph->maxChannels;
   }
@@ -1144,7 +1168,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
   // First try crossnic, then decrease bw and finally increase bwIntra.
   int nspeeds = 0;
   float* speedArray = NULL;
-  if (system->nodes[NET].count == 0) {
+  if (system->inter == 0) {
     nspeeds = ccMin >= 100 ? NSPEEDSINTRA_SM100 : (ccMin >= 90 ? NSPEEDSINTRA_SM90 : NSPEEDSINTRA);
     speedArray = ccMin >= 100 ? sm100SpeedArrayIntra : (ccMin >= 90 ? sm90SpeedArrayIntra : speedArrayIntra);
   } else {
@@ -1204,14 +1228,14 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
     }
     tmpGraph.pattern = graph->pattern;
 
-    int maxIntra = system->nodes[NET].count > 0 ? tmpGraph.typeInter : maxTypeIntra;
+    int maxIntra = system->inter ? tmpGraph.typeInter : maxTypeIntra;
     if (tmpGraph.typeIntra < maxIntra && (graph->nChannels == 0 || tmpGraph.typeIntra < graph->typeIntra)) {
       tmpGraph.typeIntra += 1;
       if (tmpGraph.typeIntra < PATH_DIS) goto search;
     }
     tmpGraph.typeIntra = minTypeIntra;
 
-    if (system->nodes[NET].count > 0 && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
+    if (system->inter && tmpGraph.typeInter < maxTypeInter && (graph->nChannels == 0 || tmpGraph.typeInter < graph->typeInter || tmpGraph.typeInter < PATH_PXN)) {
       tmpGraph.typeInter += 1;
       if (tmpGraph.typeInter < PATH_DIS) goto search;
     }
diff --git a/projects/rccl/src/graph/topo.cc b/projects/rccl/src/graph/topo.cc
index 326929d6cf9..781c84366cd 100644
--- a/projects/rccl/src/graph/topo.cc
+++ b/projects/rccl/src/graph/topo.cc
@@ -363,26 +363,39 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
   int dev;
   NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));
 
+  int64_t netId = NCCL_TOPO_ID(systemId, dev);
   struct ncclTopoNode* net;
-  NCCLCHECK(ncclTopoCreateNode(system, &net, NET, NCCL_TOPO_ID(systemId, dev)));
+  NCCLCHECK(ncclTopoCreateNode(system, &net, NET, netId));
   net->net.dev = dev;
   const char* str;
+  // If no guid is present, use the net->id unique id instead, which will be unique within the node/NVLD
   NCCLCHECK(xmlGetAttr(xmlNet, "guid", &str));
-  if (str) sscanf(str, "0x%lx", &net->net.asic);
-  else net->net.asic = dev;
+  net->net.asic = (str) ? strtoull(str, NULL, 16) : netId;
+
 
-  ncclDebugNoWarn = NCCL_GRAPH;
   int mbps;
-  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0));
+  NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "speed", &mbps, 0), NCCL_GRAPH);
   if (mbps <= 0) mbps = 10000; // Some NICs define speed = -1
   net->net.bw = mbps / 8000.0;
-  if (xmlGetAttrFloat(xmlNet, "latency", &net->net.latency) != ncclSuccess) net->net.latency = 0;
-  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0));
-  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0));
-  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS));
-  NCCLCHECK(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0));
-  net->net.busId = busId;
-  ncclDebugNoWarn = 0;
+  ncclResult_t ret;
+  NOWARN(ret = xmlGetAttrFloat(xmlNet, "latency", &net->net.latency), NCCL_GRAPH);
+  if (ret != ncclSuccess) net->net.latency = 0;
+  NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "port", &net->net.port, 0), NCCL_GRAPH);
+  NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "gdr", &net->net.gdrSupport, 0), NCCL_GRAPH);
+  NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "maxconn", &net->net.maxChannels, MAXCHANNELS), NCCL_GRAPH);
+  NCCLCHECKNOWARN(xmlGetAttrIntDefault(xmlNet, "coll", &net->net.collSupport, 0), NCCL_GRAPH);
+  net->net.busId = busId;  // RCCL: keep this
+
+  // build the PCI id using the parent PCI link
+  uint64_t hacc[2] = {1, 1};
+  const char* pciBusId = NULL;
+  struct ncclXmlNode* parent = xmlNet->parent;
+  while (parent != NULL && strcmp(parent->name, "pci") != 0) parent = parent->parent;
+  if (parent) NCCLCHECK(xmlGetAttr(parent, "busid", &pciBusId));
+  // If we fail to find the PCIe path, we use the GUID instead.
+  if (pciBusId) eatHash(hacc, pciBusId, strlen(pciBusId));
+  else eatHash(hacc, &net->net.asic);
+  net->net.pciId = digestHash(hacc);
 
   NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.bw));
   NCCLCHECK(ncclTopoConnectNodes(net, nic, LINK_NET, net->net.bw));
@@ -1087,7 +1100,8 @@ ncclResult_t ncclTopoMakeVnic(struct ncclXml* xml, struct ncclTopoNetInfo* netIn
 
   // Trigger the merge, then get the new device's properties
   int vDevIndex = 0;
-  ncclResult_t ret = netInfo->makeVDevice(&vDevIndex, vProps);
+  ncclResult_t ret;
+  NOWARN(ret = netInfo->makeVDevice(&vDevIndex, vProps), NCCL_GRAPH|NCCL_INIT|NCCL_NET);
   if (ret != ncclSuccess) {
     INFO(NCCL_GRAPH|NCCL_INIT|NCCL_NET, "TOPO/NET : Tried merging multiple devices together and failed. vProps={ndevs=%d, devs=[%d %d %d %d]}. Set NCCL_NET_MERGE_LEVEL=LOC to disable NIC fusion.",
       vProps->ndevs, vProps->devs[0], vProps->devs[1], vProps->devs[2], vProps->devs[3]);
@@ -1686,16 +1700,8 @@ ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *c
   return ncclSuccess;
 }
 
-enum netDevsPolicy {
-  NETDEVS_POLICY_AUTO = 0x0,
-  NETDEVS_POLICY_ALL = 0x1,
-  NETDEVS_POLICY_MAX = 0x2,
-  NETDEVS_POLICY_UNDEF = 0xffffffff
-};
-
-static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF;
 static int netDevsPolicyNum = -1;
-
+static enum netDevsPolicy netDevsPolicy = NETDEVS_POLICY_UNDEF;
 static void getNetDevsPolicyOnce() {
   const char* envStr = ncclGetEnv("NCCL_NETDEVS_POLICY");
   if (envStr) {
@@ -1718,6 +1724,18 @@ static void getNetDevsPolicyOnce() {
   if (netDevsPolicy == NETDEVS_POLICY_UNDEF) netDevsPolicy = NETDEVS_POLICY_AUTO;
 }
 
+ncclResult_t ncclTopoGetNetDevsPolicy(enum netDevsPolicy* policy, int* policyNum) { // Both output pointers are optional (NULL is skipped).
+  static pthread_once_t onceNetDevsPolicy = PTHREAD_ONCE_INIT;
+  pthread_once(&onceNetDevsPolicy, getNetDevsPolicyOnce); // Reads NCCL_NETDEVS_POLICY via getNetDevsPolicyOnce, run through pthread_once.
+  if (netDevsPolicy == NETDEVS_POLICY_MAX && netDevsPolicyNum <= 0) {
+    WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum);
+    return ncclInternalError;
+  }
+  if (policy) *policy = netDevsPolicy;
+  if (policyNum && netDevsPolicyNum >= 0) *policyNum = netDevsPolicyNum; // *policyNum is left untouched while the stored count is negative.
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev) {
   int gpu;
   NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true));
@@ -1732,22 +1750,19 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
     return ncclInternalError;
   }
 
-  static pthread_once_t once = PTHREAD_ONCE_INIT;
-  pthread_once(&once,getNetDevsPolicyOnce);
   int netsPerGpu = 0;
-  if (netDevsPolicy == NETDEVS_POLICY_AUTO) {
+  int policyCount = 0;
+  enum netDevsPolicy policy;
+  NCCLCHECK(ncclTopoGetNetDevsPolicy(&policy, &policyCount));
+  if (policy == NETDEVS_POLICY_AUTO) {
     int localGpus[NCCL_TOPO_MAX_NODES];
     int localGpuCount;
     NCCLCHECK(ncclTopoGetLocal(system, NET, localNets[0], GPU, localGpus, &localGpuCount, NULL));
     netsPerGpu = DIVUP(localNetCount, localGpuCount);
-  } else if (netDevsPolicy == NETDEVS_POLICY_ALL) {
+  } else if (policy == NETDEVS_POLICY_ALL) {
     netsPerGpu = localNetCount;
-  } else if (netDevsPolicy == NETDEVS_POLICY_MAX) {
-    if (netDevsPolicyNum <= 0) {
-      WARN("Invalid number of network devices = %d for policy MAX", netDevsPolicyNum);
-      return ncclInternalError;
-    }
-    netsPerGpu = std::min(netDevsPolicyNum, localNetCount);
+  } else if (policy == NETDEVS_POLICY_MAX) {
+    netsPerGpu = std::min(policyCount, localNetCount);
   } else {
     WARN("Unknown netDevs policy");
     return ncclInternalError;
@@ -1761,6 +1776,21 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int ch
   return ncclSuccess;
 }
 
+ncclResult_t ncclTopoGetLocalNets(struct ncclTopoSystem* system, int rank, int64_t* localNets, int* localNetCount) { // Writes the ids of the NETs local to rank's GPU into localNets.
+  int gpu;
+  NCCLCHECK(ncclTopoRankToIndex(system, rank, &gpu, /*showWarn=*/true));
+  int localNetIndexes[NCCL_TOPO_MAX_NODES]; // NOTE(review): localNets presumably needs room for NCCL_TOPO_MAX_NODES entries too; confirm with callers.
+  NCCLCHECK(ncclTopoGetLocal(system, GPU, gpu, NET, localNetIndexes, localNetCount, NULL));
+
+  if (*localNetCount == 0) {
+    WARN("Could not find any local path from gpu %d to net.", gpu);
+    return ncclInternalError;
+  }
+  // Convert NET node indexes to NET ids
+  for (int n=0; n<*localNetCount; n++) localNets[n] = system->nodes[NET].nodes[localNetIndexes[n]].id;
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex) {
   ncclResult_t ret = ncclSuccess;
   int netIndex;
diff --git a/projects/rccl/src/graph/topo.h b/projects/rccl/src/graph/topo.h
index 9a9fd5618a9..9e8c1f527c1 100644
--- a/projects/rccl/src/graph/topo.h
+++ b/projects/rccl/src/graph/topo.h
@@ -164,6 +164,7 @@ struct ncclTopoNode {
     }gpu;
     struct {
       int dev; // Plugin dev number
+      uint64_t pciId;
       uint64_t asic;
       int port;
       float bw;
@@ -221,6 +222,7 @@ struct ncclTopoSystem {
   // [RCCL] Track hostIdx to support rail-optimized rings/trees
   int hostIdx;
   bool useRailOptimizedTrees;
+  int inter;
   /* RCCL Rome / GIO preset: RCCL_ROME_TOPO_PRESET_MODEL_IDX_* sentinels or romeTopoModels[] index */
   int romeTopoModelIdx;
   /* Preset matchers assume uniform ranks per host; otherwise use generic search in ncclTopoCompute */
diff --git a/projects/rccl/src/graph/xml.cc b/projects/rccl/src/graph/xml.cc
index ecf4d7dc608..368f55f41f3 100644
--- a/projects/rccl/src/graph/xml.cc
+++ b/projects/rccl/src/graph/xml.cc
@@ -591,32 +591,28 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
   const char* busId;
   NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
   char* path = NULL;
-  ncclDebugNoWarn = NCCL_GRAPH;
-  getPciPath(busId, &path);
-  ncclDebugNoWarn = 0;
+  NOWARN(getPciPath(busId, &path), NCCL_GRAPH);
 
   if (path) {
     NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
   }
   int index;
-  ncclDebugNoWarn = NCCL_GRAPH;
-  NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
+  NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "vendor", &index), NCCL_GRAPH);
   if (index == -1) {
-    if (path) ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor");
+    if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"), NCCL_GRAPH);
   }
-  NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
+  NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "device", &index), NCCL_GRAPH);
   if (index == -1) {
-    if (path) ncclTopoSetAttrFromSys(pciNode, path, "device", "device");
+    if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"), NCCL_GRAPH);
   }
-  NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
+  NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index), NCCL_GRAPH);
   if (index == -1) {
-    if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor");
+    if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"), NCCL_GRAPH);
   }
-  NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
+  NCCLCHECKNOWARN(xmlGetAttrIndex(pciNode, "subsystem_device", &index), NCCL_GRAPH);
   if (index == -1) {
-    if (path) ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device");
+    if (path) NOWARN(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"), NCCL_GRAPH);
   }
-  ncclDebugNoWarn = 0;
   NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
   if (index == -1) {
     if (path) {
@@ -658,7 +654,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
   NCCLCHECK(xmlGetAttr(pciNode, "vendor", &vendor));
   if (vendor != NULL && strcmp(vendor, "0x1000") == 0) { // BCM switch, look for P2P connections
     int nlinks;
-    char* peers;
+    char* peers = NULL;
     NCCLCHECK(getBcmLinks(busId, &nlinks, &peers));
     for (int l=0; l<nlinks; l++) {
       char* target = peers+l*BUSID_SIZE;
@@ -669,6 +665,7 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
         NCCLCHECK(xmlSetAttr(linkNode, "target", target));
       }
     }
+    free(peers);
   }
 
   struct ncclXmlNode* parent = pciNode->parent;
@@ -980,9 +977,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev
       const char* busId;
       NCCLCHECK(xmlGetAttr(sub, "target", &busId));
       char* path;
-      ncclDebugNoWarn = NCCL_GRAPH;
-      getPciPath(busId, &path);
-      ncclDebugNoWarn = 0;
+      NOWARN(getPciPath(busId, &path), NCCL_GRAPH);
       if (path == NULL || strcmp(busId, "fffffff:ffff:ff") == 0) {
         // Remote NVLink device is not visible inside this VM. Assume NVSwitch.
         NCCLCHECK(xmlSetAttr(sub, "tclass", "0x068000"));
diff --git a/projects/rccl/src/include/allocator.h b/projects/rccl/src/include/allocator.h
index 05da29a62a9..eccb5b5cd72 100644
--- a/projects/rccl/src/include/allocator.h
+++ b/projects/rccl/src/include/allocator.h
@@ -7,6 +7,10 @@
 #ifndef NCCL_ALLOCATOR_H_
 #define NCCL_ALLOCATOR_H_
 
+#include "nccl.h"
+#include <stdint.h>
+#include <cuda_runtime.h>
+
 ////////////////////////////////////////////////////////////////////////////////
 // ncclSpace: Allocates contiguous segments of non-negative integers. Useful
 // as a memory allocator when we can't put allocator state within the memory
diff --git a/projects/rccl/src/include/checks.h b/projects/rccl/src/include/checks.h
index 50c8f4c3ba2..f060ca1ed67 100644
--- a/projects/rccl/src/include/checks.h
+++ b/projects/rccl/src/include/checks.h
@@ -135,6 +135,21 @@
   } \
 } while (0)
 
+#define NCCLCHECKNOWARN(call, FLAGS) do { /* Run call under NOWARN(FLAGS); return its result from the enclosing function unless it is ncclSuccess or ncclInProgress. */ \
+  ncclResult_t RES; \
+  NOWARN(RES = call, FLAGS); \
+  if (RES != ncclSuccess && RES != ncclInProgress) { \
+    return RES; \
+  } \
+} while (0)
+// Same check, but stores the result in the caller-supplied RES and jumps to label instead of returning.
+#define NCCLCHECKGOTONOWARN(call, RES, label, FLAGS) do { \
+  NOWARN(RES = call, FLAGS); \
+  if (RES != ncclSuccess && RES != ncclInProgress) { \
+    goto label; \
+  } \
+} while (0)
+
 #define NCCLWAIT(call, cond, abortFlagPtr) do {         \
   uint32_t* tmpAbortFlag = (abortFlagPtr);     \
   ncclResult_t RES = call;                \
diff --git a/projects/rccl/src/include/comm.h b/projects/rccl/src/include/comm.h
index fc677175c6a..2c6418d725a 100644
--- a/projects/rccl/src/include/comm.h
+++ b/projects/rccl/src/include/comm.h
@@ -154,6 +154,9 @@ struct ncclSharedResources {
 
   /* proxy related shared res */
   struct ncclProxyState* proxyState;
+
+  // GIN state
+  struct ncclGinState ginState;
 };
 
  /**
@@ -511,6 +514,7 @@ struct ncclComm {
 
   ncclNet_t* ncclNet;
   void* netContext;
+  void* ginContext;
   int netPluginIndex;
   int ncclNetVer;
   ncclNetDeviceType netDeviceType;
@@ -524,7 +528,7 @@ struct ncclComm {
   int maxTreePattern;
   bool initAlgoChannels[NCCL_NUM_ALGORITHMS];
   bool runtimeConn; // if dynamic connection is supported
-  bool directMode;
+  bool directMode; // if any process manages more than one local rank
   int cuMemSupport;
 
   uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
@@ -624,6 +628,7 @@ struct ncclComm {
   uint32_t* childAbortFlag;
   uint32_t* childAbortFlagDev;
   uint32_t destroyFlag;
+  uint32_t revokedFlag;
 
   // Flags for enable P2P NET
   uint32_t p2pNet;
@@ -757,7 +762,8 @@ struct ncclComm {
   // buffer registration cache
   struct ncclRegCache regCache;
   int isAllNvlink;
-  bool isAllDirectP2p;
+  bool isAllDirectP2p; // Subject to NCCL_P2P_LEVEL (for local ranks only).
+  bool isAllCudaP2p; // Raw CUDA capability (for local ranks only).
   int symmetricSupport;
   bool useNetPXN;
   bool useGdr;
diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h
index 457ba57e3d2..ae7731bc880 100644
--- a/projects/rccl/src/include/debug.h
+++ b/projects/rccl/src/include/debug.h
@@ -29,8 +29,29 @@ extern char ncclLastError[];
 #define ERROR(...) ncclDebugLog(NCCL_LOG_ERROR, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
 #define VERSION(...) ncclDebugLog(NCCL_LOG_VERSION, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
 #define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
-#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
-#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)
+// NOWARN(EXPR, FLAGS): evaluates EXPR with ncclDebugNoWarn temporarily set to FLAGS, then restores the previous value.
+#define NOWARN(EXPR, FLAGS) \
+  do { \
+    int oldNoWarn = ncclDebugNoWarn; /* saved so that the prior setting is restored afterwards */ \
+    ncclDebugNoWarn = FLAGS; \
+    (EXPR); \
+    ncclDebugNoWarn = oldNoWarn; \
+  } while(0)
+// INFO(FLAGS, fmt, ...): calls ncclDebugLog only if the debug level is >= NCCL_LOG_INFO and FLAGS intersects ncclDebugMask, or the level is negative.
+#define INFO(FLAGS, ...) \
+    do{ \
+        int ncclInfoLevel_ = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); /* not named `level`: it must not capture a caller variable used in __VA_ARGS__ */ \
+        if((ncclInfoLevel_ >= NCCL_LOG_INFO && ((unsigned long)(FLAGS) & ncclDebugMask)) || (ncclInfoLevel_ < 0)) \
+            ncclDebugLog(NCCL_LOG_INFO, (unsigned long)(FLAGS), __func__, __LINE__, __VA_ARGS__); \
+    } while(0)
+// TRACE_CALL(fmt, ...): calls ncclDebugLog only if the debug level is >= NCCL_LOG_TRACE and NCCL_CALL is set in ncclDebugMask, or the level is negative.
+#define TRACE_CALL(...) \
+    do { \
+        int ncclTraceLevel_ = __atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE); /* not named `level`: it must not capture a caller variable used in __VA_ARGS__ */ \
+        if((ncclTraceLevel_ >= NCCL_LOG_TRACE && (NCCL_CALL & ncclDebugMask)) || (ncclTraceLevel_ < 0)) { \
+            ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__); \
+        } \
+    } while (0)
 
 #ifdef ENABLE_TRACE
 #define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
diff --git a/projects/rccl/src/include/dev_runtime.h b/projects/rccl/src/include/dev_runtime.h
index 5f6e66e3387..70bf77496be 100644
--- a/projects/rccl/src/include/dev_runtime.h
+++ b/projects/rccl/src/include/dev_runtime.h
@@ -52,6 +52,7 @@ struct ncclDevrState {
   int* lsaRankList;
 
   size_t granularity; // cuMemGetAllocationGranularity
+  bool ginEnabled;
   struct ncclDevrMemory* memHead;
   struct ncclDevrWindowSorted* winSorted;
   int winSortedCapacity, winSortedCount;
diff --git a/projects/rccl/src/include/device.h b/projects/rccl/src/include/device.h
index 7cfd5bcdc74..b65972b979f 100644
--- a/projects/rccl/src/include/device.h
+++ b/projects/rccl/src/include/device.h
@@ -82,7 +82,7 @@ extern const char* funcNames[];
   #define NCCL_CUDA_ARCH_FAMILY_SPECIFIC 0
 #endif
 
-#include "net_device.h"
+#include "nccl_device/net_device.h"
 
 enum ncclDevRedOp_t {
   ncclDevSum, ncclDevProd, ncclDevMinMax,
@@ -245,6 +245,7 @@ struct ncclProxyConnector {
   int sameProcess;
   struct ncclProxyConnection* connection;
   ncclResult_t (*proxyProgress)(struct ncclProxyState* proxyState, struct ncclProxyArgs*); // Copied from transport if necessary
+  ncclResult_t (*proxyGinProgress)(struct ncclProxyState* proxyState);
 };
 
 struct ncclConnector {
@@ -804,7 +805,8 @@ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
 
 // Host-side table of kernel function pointers.
 extern int const ncclDevKernelCount;
-extern void* const ncclDevKernelList[/*ncclDevKernelCount*/];
+extern void* ncclDevKernelList[/*ncclDevKernelCount*/];
+extern int ncclDevKernelRequirements[/*ncclDevKernelCount*/];
 
 // Table of most specialized kernel function to run given func index.
 extern int const ncclDevFuncRowToId[];
diff --git a/projects/rccl/src/include/env.h b/projects/rccl/src/include/env.h
new file mode 100644
index 00000000000..0e00b31448a
--- /dev/null
+++ b/projects/rccl/src/include/env.h
@@ -0,0 +1,23 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_INT_ENV_H_
+#define NCCL_INT_ENV_H_
+
+#include "nccl_env.h"
+
+// Initialize Env Plugin
+ncclResult_t ncclEnvPluginInit(void);
+// Finalize Env Plugin
+void ncclEnvPluginFinalize(void);
+// Env plugin get function for NCCL params, called in ncclGetEnv()
+const char* ncclEnvPluginGetEnv(const char* name);
+
+bool ncclEnvPluginInitialized(void);
+
+ncclResult_t ncclInitEnv(void);
+
+#endif
diff --git a/projects/rccl/src/include/gin/gin_host.h b/projects/rccl/src/include/gin/gin_host.h
new file mode 100644
index 00000000000..d82a7950524
--- /dev/null
+++ b/projects/rccl/src/include/gin/gin_host.h
@@ -0,0 +1,54 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_GIN_HOST_H_
+#define _NCCL_GIN_HOST_H_
+
+#include "allocator.h"
+#include "nccl.h"
+#include "nccl_net.h"
+#include "nccl_device/gin/gin_device_host_common.h"
+#include <pthread.h>
+
+struct ncclGinState {
+  ncclGin_t* ncclGin;
+  void* ginInstance;
+  bool connected;
+  int ginType;
+  int ginCommCount;
+  void* ginComms[NCCL_GIN_MAX_CONTEXTS];
+  void* ginCtx[NCCL_GIN_MAX_CONTEXTS];
+  ncclNetDeviceHandle_t* ginDevHandles[NCCL_GIN_MAX_CONTEXTS];
+  int needsProxyProgress;  // Whether we need to progress GIN operations with the proxy
+  int ginProgress;         // GIN progress is enabled
+  pthread_t thread;
+  pthread_mutex_t threadLock;
+  pthread_cond_t threadCond;
+  ncclResult_t asyncResult;
+
+  int signalSpaceSize;
+  int counterSpaceSize;
+  ncclSpace signalSpace;
+  ncclSpace counterSpace;
+};
+
+extern int64_t ncclParamGinType();
+
+// FIXME: change these to take ncclGinState instead of ncclComm; there is no need to pass the comm.
+ncclResult_t ncclGinConnectOnce(struct ncclComm* comm);
+ncclResult_t ncclGinFinalize(struct ncclComm* comm);
+ncclResult_t ncclGinProgress(struct ncclGinState* ginState);
+ncclResult_t ncclGinRegister(struct ncclComm* comm, void* address, size_t size,
+                             void* ginHostWins[NCCL_GIN_MAX_CONTEXTS],
+                             ncclGinWindow_t ginDevWins[NCCL_GIN_MAX_CONTEXTS]);
+ncclResult_t ncclGinDeregister(struct ncclComm* comm, void* ginHostWins[NCCL_GIN_MAX_CONTEXTS]);
+ncclResult_t ncclGinAllocSignalsCounters(struct ncclComm* comm, int nSignals, uint32_t* outSignal0,
+                                         int nCounters, uint32_t* outCounter0);
+ncclResult_t ncclGinFreeSignalsCounters(struct ncclComm* comm, uint32_t signal0, int nSignals,
+                                        uint32_t counter0, int nCounters);
+ncclResult_t ncclGinQueryLastError(struct ncclGinState* ginState, bool* hasError);
+
+#endif
diff --git a/projects/rccl/src/include/gin/gin_host_proxy.h b/projects/rccl/src/include/gin/gin_host_proxy.h
new file mode 100644
index 00000000000..14e8b93ca15
--- /dev/null
+++ b/projects/rccl/src/include/gin/gin_host_proxy.h
@@ -0,0 +1,28 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef GIN_HOST_PROXY_H_
+#define GIN_HOST_PROXY_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <linux/types.h>
+#include "nccl.h"
+#include "gin/gin_host.h"
+#include "plugin/nccl_net.h"
+
+ncclResult_t ncclGinProxyCreateContext(struct ncclComm *comm, void *collComm, int devId,
+                                       int nSignals, int nCounters, void **outGinCtx,
+                                       ncclNetDeviceHandle_v11_t **outDevHandle);
+ncclResult_t ncclGinProxyRegister(ncclGin_t *ginComm, void *ginCtx, void *addr, size_t size,
+                                  int type, int mr_flags, void **mhandle, void **ginHandle);
+ncclResult_t ncclGinProxyDeregister(ncclGin_t *ginComm, void *ginCtx, void *mhandle);
+ncclResult_t ncclGinProxyDestroyContext(ncclGin_t *ginComm, void *ginCtx);
+ncclResult_t ncclGinProxyProgress(ncclGin_t *ginComm, void *ginCtx);
+ncclResult_t ncclGinProxyQueryLastError(ncclGin_t *ginComm, void *ginCtx, bool *hasError);
+
+#endif
diff --git a/projects/rccl/src/include/graph.h b/projects/rccl/src/include/graph.h
index cfae66faf8a..910fa9f034c 100644
--- a/projects/rccl/src/include/graph.h
+++ b/projects/rccl/src/include/graph.h
@@ -35,7 +35,7 @@ ncclResult_t ncclTopoComputeCommCPU(struct ncclComm* comm);
 
 // Query topology
 ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int64_t* id, int* dev, int* proxyRank);
-ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank);
+ncclResult_t ncclTopoCheckP2p(struct ncclComm* comm, struct ncclTopoSystem* system, int rank1, int rank2, int* p2p, int *read, int* intermediateRank, int* cudaP2p);
 ncclResult_t ncclTopoCheckMNNVL(struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2, int* ret);
 enum ncclTopoGdrMode {
   ncclTopoGdrModeDisable = 0,
@@ -80,9 +80,18 @@ ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
 ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
 ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
 ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int64_t* id, int* dev);
+ncclResult_t ncclTopoGetLocalNets(struct ncclTopoSystem* system, int rank, int64_t* localNets, int* localNetCount);
 ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int64_t netId, int* gpuIndex);
 ncclResult_t getLocalNetCountByBw(struct ncclTopoSystem* system, int gpu, int *count);
 
+enum netDevsPolicy { // How many local NETs ncclTopoGetLocalNet spreads a GPU's channels over.
+  NETDEVS_POLICY_AUTO = 0x0, // DIVUP(local NET count, GPUs local to the first local NET)
+  NETDEVS_POLICY_ALL = 0x1, // every local NET
+  NETDEVS_POLICY_MAX = 0x2, // min(configured count, local NET count); the count must be > 0
+  NETDEVS_POLICY_UNDEF = 0xffffffff // initial value; resolved to AUTO if still unset after the policy is read
+};
+ncclResult_t ncclTopoGetNetDevsPolicy(enum netDevsPolicy* policy, int* policyNum);
+
 // Allows for up to 32 NICs per node on GB200-NVL72
 #define NCCL_TOPO_MAX_NODES 64
 ncclResult_t ncclTopoGetLocal(struct ncclTopoSystem* system, int type, int index, int resultType, int locals[NCCL_TOPO_MAX_NODES], int* localCount, int* pathType);
diff --git a/projects/rccl/src/include/group.h b/projects/rccl/src/include/group.h
index 8d5b072991d..3fcbca6f70d 100644
--- a/projects/rccl/src/include/group.h
+++ b/projects/rccl/src/include/group.h
@@ -78,6 +78,10 @@ extern __thread struct ncclComm* ncclGroupCommHead[ncclGroupTaskTypeNum];
 extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
 extern __thread int ncclGroupBlocking;
 
+inline bool ncclGroupEnabled() { // True whenever the thread-local ncclGroupDepth is non-zero.
+  return ncclGroupDepth != 0;
+}
+
 inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
   if (ncclGroupDepth > 0) {
     if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret;
diff --git a/projects/rccl/src/include/nccl_device.h b/projects/rccl/src/include/nccl_device.h
index 88b2531d19d..35e216c6288 100644
--- a/projects/rccl/src/include/nccl_device.h
+++ b/projects/rccl/src/include/nccl_device.h
@@ -4,12 +4,12 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 
-#include "nccl_device/impl/comm__funcs.h"
 #include "nccl_device/coop.h"
+#include "nccl_device/impl/barrier__funcs.h"
+#include "nccl_device/impl/comm__funcs.h"
 #include "nccl_device/impl/core__funcs.h"
 #include "nccl_device/impl/ll_a2a__funcs.h"
-#include "nccl_device/impl/mem_barrier__funcs.h"
-//#include "nccl_device/net_barrier__funcs.h"
-//#include "nccl_device/net_scratch_a2a__funcs.h"
-//#include "nccl_device/barrier__funcs.h"
+#include "nccl_device/impl/lsa_barrier__funcs.h"
+#include "nccl_device/impl/gin__funcs.h"
+#include "nccl_device/impl/gin_barrier__funcs.h"
 #include "nccl_device/impl/ptr__funcs.h"
diff --git a/projects/rccl/src/include/nccl_device/barrier.h b/projects/rccl/src/include/nccl_device/barrier.h
new file mode 100644
index 00000000000..0c11f6e5c2c
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/barrier.h
@@ -0,0 +1,47 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_BARRIER_H_
+#define _NCCL_DEVICE_BARRIER_H_
+#include "impl/core__types.h"
+#include "impl/lsa_barrier__types.h"
+#include "impl/gin_barrier__types.h"
+
+#if __CUDACC__
+template<typename Coop>
+struct ncclBarrierSession_internal;
+
+template<typename Coop>
+struct ncclBarrierSession: ncclBarrierSession_internal<Coop> {
+  // Full featured constructor: explicit inner/outer teams plus their LSA and GIN barrier handles.
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeam innerTeam, ncclTeam outerTeam, ncclGin,
+    ncclLsaBarrierHandle innerBarHandle,
+    ncclGinBarrierHandle outerBarHandle,
+    uint32_t index,
+    bool multimem=false, ncclMultimemHandle innerMmHandle={}
+  );
+  // Convenience constructors for baked in teams, selected by the ncclTeamTag{World,Lsa,Rail} argument:
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeamTagWorld, ncclGin, uint32_t index, bool multimem=false
+  );
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeamTagLsa, ncclDevComm const&, uint32_t index, bool multimem=false
+  );
+  NCCL_DEVICE_INLINE ncclBarrierSession(
+    Coop, ncclTeamTagRail, ncclGin, uint32_t index
+  );
+
+  ncclBarrierSession(ncclBarrierSession const&) = delete; // Sessions are not copyable
+
+  NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>& lsaBarrier();  // Accessor for the LSA barrier session (returned by reference).
+  NCCL_DEVICE_INLINE ncclGinBarrierSession<Coop>& ginBarrier();  // Accessor for the GIN barrier session (returned by reference).
+
+  NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order, ncclGinFenceLevel);
+};
+#endif
+
+#endif // _NCCL_DEVICE_BARRIER_H_
diff --git a/projects/rccl/src/include/nccl_device/coop.h b/projects/rccl/src/include/nccl_device/coop.h
index 7f3b33fca65..adcd31d9e3f 100644
--- a/projects/rccl/src/include/nccl_device/coop.h
+++ b/projects/rccl/src/include/nccl_device/coop.h
@@ -53,7 +53,7 @@ struct ncclCoopTile { // An aligned pow2 set of threads within the warp.
   }
   NCCL_DEVICE_INLINE void sync() {
 #if ROCM_VERSION >= 70000
-    __syncwarp(laneMask());
+    if (nThreadsPow2 > 1) __syncwarp(laneMask());
 #else
     __syncthreads();
 #endif
@@ -69,7 +69,7 @@ typedef ncclCoopTile<WARP_SIZE> ncclCoopWarp;
 #if __CUDACC__
 struct ncclCoopLanes { // Some lanes of this warp.
   ncclCoopMask_t lmask;
-  
+
   NCCL_DEVICE_INLINE constexpr ncclCoopLanes(ncclCoopMask_t lmask = ncclCoopFullMask): lmask(lmask) {}
 
   NCCL_DEVICE_INLINE int thread_rank() const {
@@ -101,7 +101,7 @@ struct ncclCoopWarpSpan {
   NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id):
     warp0(warp0), nWarps(nWarps), id(id) {
   }
-  
+
   NCCL_DEVICE_INLINE int thread_rank() const {
     return threadIdx.x - WARP_SIZE*warp0;
   }
@@ -160,6 +160,14 @@ NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return fa
 NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; }
 #endif
 
+#if __CUDACC__
+template<int nThreads>  // ncclCoopWithinWarp: constant predicate, true for the tile and lanes coops, false for the warp-span and CTA coops.
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopTile<nThreads>) { return true; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopLanes) { return true; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopWarpSpan) { return false; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopCta) { return false; }
+#endif
+
 #if __CUDACC__
 // Pick threads of our warp that are safe to use collectively.
 NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() {
@@ -187,4 +195,55 @@ NCCL_DEVICE_INLINE ncclCoopTile<nThreads> ncclCoopCoalesced(ncclCoopTile<nThread
 }
 #endif
 
+#if __CUDACC__
+template<int nThreads, typename T>  // Broadcast `value` from lane `root` of the tile to every lane of the tile via register shuffles.
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopTile<nThreads>, T value, int root, bool entrySync=true) {  // entrySync is not used by this overload.
+  constexpr int n = (sizeof(T)+4-1)/4;  // Number of 32-bit words needed to hold a T (rounded up).
+  union { uint32_t u[n]; T v; };  // Lets the value be shuffled one 32-bit word at a time.
+  v = value;
+  #pragma unroll
+  for (int i=0; i < n; i++) u[i] = __shfl_sync(-1u, u[i], root, nThreads);  // NOTE(review): -1u is a 32-bit mask; confirm it is accepted where ncclCoopMask_t is wider.
+  return v;
+}
+template<typename T>  // Broadcast `value` from the root-th participating lane of `coop.lmask` to the lanes in the mask.
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopLanes coop, T value, int root, bool entrySync=true) {  // entrySync is not used by this overload.
+  uint32_t m = coop.lmask;  // NOTE(review): lmask is declared as ncclCoopMask_t; confirm it fits in 32 bits on every target.
+  uint32_t r = root == 0 ? __ffs(m)-1 : __fns(m, 0, 1+root);  // Physical lane of the root-th (0-based) set bit in the mask.
+  constexpr int n = (sizeof(T)+4-1)/4;  // Number of 32-bit words needed to hold a T (rounded up).
+  union { uint32_t u[n]; T v; };  // Lets the value be shuffled one 32-bit word at a time.
+  v = value;
+  #pragma unroll
+  for (int i=0; i < n; i++) u[i] = __shfl_sync(m, u[i], r);
+  return v;
+}
+
+NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_WarpSpan_stash() {  // Returns the shared-memory staging array used by the warp-span broadcast.
+  __shared__ ulong2 stash[15];  // Indexed by coop.id in ncclCoopBcast(ncclCoopWarpSpan, ...); presumably one slot per possible id — TODO confirm the id range.
+  return stash;
+}
+
+template<typename T>  // Broadcast `value` from thread `root` of the warp span through the shared-memory slot stash[coop.id].
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopWarpSpan coop, T value, int root, bool entrySync=true) {
+  static_assert(sizeof(T) <= sizeof(ncclCoopBcast_WarpSpan_stash()[0]), "Required");  // T must fit in one ulong2 slot.
+  if (entrySync) coop.sync();  // Optional barrier executed before the root writes the slot.
+  if (coop.thread_rank() == root) *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id] = value;
+  coop.sync();  // Barrier between the root's write and every thread's read.
+  return *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id];
+}
+
+NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_Cta_stash() {  // Returns the single shared-memory staging slot used by the CTA-wide broadcast.
+  __shared__ ulong2 stash;
+  return &stash;
+}
+
+template<typename T>  // Broadcast `value` from thread `root` of the CTA coop through the single shared-memory stash slot.
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopCta coop, T value, int root, bool entrySync=true) {
+  static_assert(sizeof(T) <= sizeof(*ncclCoopBcast_Cta_stash()), "Required");  // T must fit in the ulong2 slot.
+  if (entrySync) coop.sync();  // Optional barrier executed before the root writes the slot.
+  if (coop.thread_rank() == root) *(T*)ncclCoopBcast_Cta_stash() = value;
+  coop.sync();  // Barrier between the root's write and every thread's read.
+  return *(T*)ncclCoopBcast_Cta_stash();
+}
+#endif
+
 #endif
diff --git a/projects/rccl/src/include/nccl_device/core.h b/projects/rccl/src/include/nccl_device/core.h
index dd41d692507..9b0061a72d6 100644
--- a/projects/rccl/src/include/nccl_device/core.h
+++ b/projects/rccl/src/include/nccl_device/core.h
@@ -24,9 +24,15 @@ typedef struct ncclMultimemHandle ncclMultimemHandle_t;
 typedef uint32_t ncclDevResourceHandle;
 typedef ncclDevResourceHandle ncclDevResourceHandle_t;
 
+typedef uint32_t ncclGinSignal_t;
+typedef uint32_t ncclGinCounter_t;
+
 struct ncclLsaBarrierHandle;
 typedef struct ncclLsaBarrierHandle ncclLsaBarrierHandle_t;
 
+struct ncclGinBarrierHandle;
+typedef struct ncclGinBarrierHandle ncclGinBarrierHandle_t;
+
 struct ncclLLA2AHandle;
 typedef struct ncclLLA2AHandle ncclLLA2AHandle_t;
 
@@ -59,13 +65,26 @@ struct ncclDevCommRequirements {
 
   bool lsaMultimem; // Enable multimem on lsa team
 
+  int barrierCount;
   int lsaBarrierCount;
+  int railGinBarrierCount;
+
+  int lsaLLA2ABlockCount, lsaLLA2ASlotCount;
+
+  bool ginForceEnable;
+  int ginContextCount; // This is a hint, the actual context count in the devcomm may not match.
+  int ginSignalCount; // Guaranteed to start at id=0
+  int ginCounterCount; // Guaranteed to start at id=0
 };
 
 struct ncclDevResourceRequirements {
   ncclDevResourceRequirements_t* next;
   size_t bufferSize, bufferAlign;
   ncclDevResourceHandle_t* outBufferHandle; // If non-null, target assigned during ncclDevCommCreate.
+  int ginSignalCount;
+  int ginCounterCount;
+  ncclGinSignal_t* outGinSignalStart;
+  ncclGinCounter_t* outGinCounterStart;
 };
 
 struct ncclTeamRequirements {
diff --git a/projects/rccl/src/include/nccl_device/gin.h b/projects/rccl/src/include/nccl_device/gin.h
new file mode 100644
index 00000000000..0f5643f206d
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/gin.h
@@ -0,0 +1,207 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_SESSION_H_
+#define _NCCL_DEVICE_GIN_SESSION_H_
+#include "core_tmp.h"
+#include "gin/gin_device_common.h"
+
+#if __CUDACC__
+struct ncclGinCtx; // Definition in nccl_device/gin/gin_device_host_common.h
+template<unsigned> struct ncclGinCtx_M; // ...
+
+struct ncclGinDescriptorSmem; // A type user allocates in __shared__ memory
+
+// Used as completion actions for ncclGinSession::put
+struct ncclGin_None {};
+
+struct ncclGin_SignalAdd { ncclGinSignal_t signal; uint64_t value; };
+// SignalInc: equivalent to SignalAdd{+1} except it may not be mixed with any
+// other signal operator without intervening signal reset(). Formally: for a
+// given signal, all operations between successive reset()'s of that signal must
+// either all be SignalInc or all not SignalInc.
+struct ncclGin_SignalInc { ncclGinSignal_t signal; };
+// Support deferred:
+// struct ncclGin_SignalSet { ncclGinSignal_t signal; uint64_t value; };
+struct ncclGin_CounterInc { ncclGinCounter_t counter; };
+
+struct ncclGin_DescriptorSmem { ncclGinDescriptorSmem* descriptor; };
+
+template<unsigned backendMask>
+struct ncclGin_BackendMask;
+
+template<ncclNetDeviceType backend>
+using ncclGin_BackendOne = ncclGin_BackendMask<(1u<<(int)backend)>;
+
+using ncclGin = ncclGin_BackendMask<NCCL_GIN_BACKEND_MASK_ALL>;
+
+#endif
+
+#if __CUDACC__
+template<unsigned backendMask>
+struct ncclGin_BackendMask {
+  ncclDevComm const& comm;
+  uint32_t nContexts:8, contextId:8, _ginBackend:8;
+
+  // Loads GIN context into registers. Each context has one QP per peer.
+  NCCL_DEVICE_INLINE ncclGin_BackendMask(ncclDevComm const&, int contextIndex);
+
+  template<
+    // Action to take on peer when put completes. If a signalling action is used
+    // then that signal will be visible only after the payload of this put as well as
+    // the payloads of preceding puts on this context to the same peer are settled.
+    typename RemoteAction = ncclGin_None, // one of ncclGin_{None|SignalInc|SignalAdd|SignalSet}
+    // Action to take locally when source has been consumed.
+    typename LocalAction = ncclGin_None, // one of ncclGin_{None|CounterInc}
+    // Set of threads participating in this put. Must be a subset of Coop.
+    typename Coop = ncclCoopThread,
+    // Optional smem descriptor space to use. Either ncclGin_{None|DescriptorSmem}
+    typename DescriptorSmem = ncclGin_None
+  >
+  NCCL_DEVICE_INLINE void put(
+    ncclTeam, int peer,
+    ncclWindow_t dstWnd, size_t dstOffset,
+    ncclWindow_t srcWnd, size_t srcOffset, size_t bytes,
+    RemoteAction remoteAction = ncclGin_None{},
+    LocalAction localAction = ncclGin_None{},
+    Coop coop = ncclCoopThread{},
+    DescriptorSmem descriptor = ncclGin_None{},
+    cuda::thread_scope alreadyReleased = cuda::thread_scope_thread,
+    cuda::thread_scope expected_scope = cuda::thread_scope_device
+  ) const;
+
+  template<
+    typename T,
+    // Action to take on peer when put completes. If a signalling action is used
+    // then that signal will be visible only after the payload of this put as well as
+    // the payloads of preceding puts on this context to the same peer are settled.
+    typename RemoteAction = ncclGin_None, // one of ncclGin_{None|SignalInc|SignalAdd|SignalSet}
+    // Action to take locally when source has been consumed.
+    typename LocalAction = ncclGin_None, // one of ncclGin_{None|CounterInc}
+    // Set of threads participating in this put. Must be a subset of Coop.
+    typename Coop = ncclCoopThread,
+    // Optional smem descriptor space to use. Either ncclGin_{None|DescriptorSmem}
+    typename DescriptorSmem = ncclGin_None
+  >
+  NCCL_DEVICE_INLINE void put(
+    ncclTeam, int peer,
+    ncclSymPtr<T> dstElts, ncclSymPtr<T> srcElts, size_t nElts,
+    RemoteAction remoteAction = ncclGin_None{},
+    LocalAction localAction = ncclGin_None{},
+    Coop coop = ncclCoopThread{},
+    DescriptorSmem descriptor = ncclGin_None{},
+    cuda::thread_scope alreadyReleased = cuda::thread_scope_thread,
+    cuda::thread_scope expected_scope = cuda::thread_scope_device
+  ) const;
+
+  template<
+    typename T, // requires sizeof(T) <= 8
+    // See put() for all template arguments.
+    typename RemoteAction = ncclGin_None,
+    typename Coop = ncclCoopThread,
+    typename DescriptorSmem = ncclGin_None
+  >
+  NCCL_DEVICE_INLINE void putValue(
+    ncclTeam, int peer,
+    ncclWindow_t dstWnd, size_t dstOffset, T value,
+    RemoteAction remoteAction = ncclGin_None{},
+    Coop coop = ncclCoopThread{},
+    DescriptorSmem descriptor = ncclGin_None{},
+    cuda::thread_scope alreadyReleased = cuda::thread_scope_thread,
+    cuda::thread_scope expected_scope = cuda::thread_scope_device
+  ) const;
+
+  template<
+    typename T, // requires sizeof(T) <= 8
+    // See put() for all template arguments.
+    typename RemoteAction = ncclGin_None,
+    typename Coop = ncclCoopThread,
+    typename DescriptorSmem = ncclGin_None
+  >
+  NCCL_DEVICE_INLINE void putValue(
+    ncclTeam, int peer,
+    ncclSymPtr<T> dst, T value,
+    RemoteAction remoteAction = ncclGin_None{},
+    Coop coop = ncclCoopThread{},
+    DescriptorSmem descriptor = ncclGin_None{},
+    cuda::thread_scope alreadyReleased = cuda::thread_scope_thread,
+    cuda::thread_scope expected_scope = cuda::thread_scope_device
+  ) const;
+
+  template<typename RemoteAction,
+           typename Coop = ncclCoopThread,
+           typename DescriptorSmem = ncclGin_None>
+  NCCL_DEVICE_INLINE void signal(
+    ncclTeam, int peer, RemoteAction remoteAction,
+    Coop coop = ncclCoopThread(),
+    DescriptorSmem descriptor = ncclGin_None{},
+    cuda::thread_scope alreadyReleased = cuda::thread_scope_thread,
+    cuda::thread_scope expected_scope = cuda::thread_scope_device
+  ) const;
+
+  // All source buffers from put's from any thread in this coop will be safe to reuse.
+  // Flush does not guarantee that data has settled in remote memory.
+  template<typename Coop>
+  NCCL_DEVICE_INLINE void flush(Coop, cuda::memory_order ord = cuda::memory_order_acquire) const;
+
+  // Counter and signal wait use "rolling" comparison logic of a given bit-width
+  // such that unsigned overflow does not disturb the property that: x < x+1.
+  //
+  // bool rolling_less_equal(uint64_t a, uint64_t b, int bits) {
+  //   uint64_t m = uint64_t(-1)>>(64-bits);
+  //   return ((b-a) & m) <= (m>>1);
+  // }
+  //
+  // The condition waited for is that the supplied value is rolling_less_equal
+  // to the internal value.
+  //
+  // Counters are restricted to using a maximum of 56 bits despite that being fewer
+  // than a uint64_t can carry.
+
+  NCCL_DEVICE_INLINE uint64_t readCounter(ncclGinCounter_t counter, int bits=56, cuda::memory_order ord = cuda::memory_order_acquire) const;
+
+  template<typename Coop>
+  NCCL_DEVICE_INLINE void waitCounter(Coop, ncclGinCounter_t counter, uint64_t least, int bits=56, cuda::memory_order ord = cuda::memory_order_acquire) const;
+
+  // Each signal has a dedicated "shadow" which the user is free to manipulate for
+  // any reason. The only calls which manipulate the shadow are `increaseSignalShadow`
+  // and `resetSignal`.
+  NCCL_DEVICE_INLINE uint64_t* getSignalShadowPtr(ncclGinSignal_t signal) const;
+  NCCL_DEVICE_INLINE void increaseSignalShadow(ncclGinSignal_t signal, uint64_t delta) const;
+
+  // Returns current value of signal with all but bottom bits set to zero.
+  NCCL_DEVICE_INLINE uint64_t readSignal(ncclGinSignal_t signal, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const;
+
+  // Wait for signal to meet or exceed value.
+  template<typename Coop>
+  NCCL_DEVICE_INLINE void waitSignal(Coop, ncclGinSignal_t signal, uint64_t least, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const;
+
+  // Wait for signal to meet or exceed shadow value.
+  template<typename Coop>
+  NCCL_DEVICE_INLINE void waitSignalMeetShadow(Coop, ncclGinSignal_t signal, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const;
+
+  // Wait until signal exceeds shadow by `leastDelta` (typically 1), updates shadow
+  // with latest value, and returns with `before` equal to previous shadow value
+  // and `delta` equal to difference.
+  template<typename Coop, typename Uint>
+  NCCL_DEVICE_INLINE void waitSignalFollowShadow(Coop, ncclGinSignal_t signal, Uint leastDelta, Uint* before, Uint* delta, int bits=64, cuda::memory_order ord = cuda::memory_order_acquire) const;
+
+  // Sets to zero. May not race with concurrent modifications to counter.
+  NCCL_DEVICE_INLINE void resetCounter(ncclGinCounter_t counter) const;
+  // Sets signal and shadow to zero. May not race with concurrent modifications to signal.
+  NCCL_DEVICE_INLINE void resetSignal(ncclGinSignal_t signal) const;
+
+  //////////////////////////////////////////////////////////////////////////////
+  // internal:
+
+  void* _ginHandle;
+  uint64_t* _signalShadows;
+
+  NCCL_DEVICE_INLINE ncclGinCtx_M<backendMask> _makeCtx() const;
+};
+#endif
+
+#endif // _NCCL_DEVICE_GIN_SESSION_H_
diff --git a/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki.h b/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki.h
new file mode 100644
index 00000000000..c14a5e2923f
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki.h
@@ -0,0 +1,214 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_GDAKI_H_
+#define _NCCL_DEVICE_GIN_GDAKI_H_
+
+#ifndef DOCA_VERBS_USE_CUDA_WRAPPER
+#define DOCA_VERBS_USE_CUDA_WRAPPER
+#endif
+
+#ifndef DOCA_VERBS_USE_NET_WRAPPER
+#define DOCA_VERBS_USE_NET_WRAPPER
+#endif
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+#define DOCA_GPUNETIO_VERBS_ENABLE_DEBUG 1
+#endif
+
+#include "../gin_device_common.h"
+#include "gin_gdaki_device_host_common.h"
+#include "doca_gpunetio/doca_gpunetio_device.h"
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+#include <stdio.h>
+#endif
+
+template <>
+struct ncclGinApi_Put<NCCL_NET_DEVICE_GIN_GDAKI> {  // GDAKI backend: posts a put (hasWins) or a signal-only operation (!hasWins) on the peer's QP.
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, bool hasWins,
+                                      ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin,
+                                      size_t srcOff, size_t bytes, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasCounter,
+                                      ncclGinCounter_t counterId, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,  // hasDescriptor/descriptor are not used by this backend.
+                                      cuda::thread_scope required, cuda::thread_scope given) {
+    using nccl::utility::loadConst;
+
+    coop.sync();  // All coop threads rendezvous; only thread_rank 0 issues the operation below.
+    if (coop.thread_rank() == 0) {
+      ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle;
+      doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer;  // QP array is indexed by peer.
+      doca_gpu_dev_verbs_qp* companion_qp;  // Only assigned (and only used) when hasCounter is true.
+      ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin;
+      ncclGinGdakiMemHandle* srcMh = (ncclGinGdakiMemHandle*)srcWin;
+
+      doca_gpu_dev_verbs_addr raddr, laddr;
+      if (hasWins) {
+        raddr.addr = dstOff;
+        raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer);  // Destination key is looked up per peer.
+        laddr.addr = srcOff, laddr.key = loadConst(&srcMh->lkey);
+      }
+
+      doca_gpu_dev_verbs_addr sig_raddr, sig_laddr;
+      if (hasSignal) {
+        if (signalOp == ncclGinSignalInc) signalOpArg = 1;  // An increment is issued as an add of 1.
+        sig_raddr.addr = sizeof(uint64_t) * signalId;  // Byte offset of the signal slot in the signals table.
+        sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer);
+        sig_laddr.addr = 0;
+        sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey);
+      }
+
+      doca_gpu_dev_verbs_addr counter_raddr, counter_laddr;
+      if (hasCounter) {
+        companion_qp = loadConst(&gdaki->companion_gdqp) + peer;
+        counter_raddr.addr = sizeof(uint64_t) * counterId;  // Byte offset of the counter slot in the counters table.
+        counter_raddr.key = loadConst(loadConst(&gdaki->counters_table.rkeys) + ctx.rank);  // Keyed by this rank (ctx.rank), not by `peer`.
+        counter_laddr.addr = 0;
+        counter_laddr.key = loadConst(&gdaki->sink_buffer_lkey);
+      }
+
+      // cuda::thread_scope_system has the lowest value, so this fires when system scope is required and `given` is any other scope.
+      if ((required == cuda::thread_scope_system) && (given > required)) {
+        doca_gpu_dev_verbs_fence_release<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_SYS>();
+      }
+
+      if (hasWins) {
+        if (hasSignal && hasCounter) {
+          doca_gpu_dev_verbs_put_signal_counter<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+            qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr,
+            counter_laddr, 1);
+        } else if (hasSignal) {
+          doca_gpu_dev_verbs_put_signal<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+            qp, raddr, laddr, bytes, sig_raddr, sig_laddr, signalOpArg);
+        } else if (hasCounter) {
+          doca_gpu_dev_verbs_put_counter(qp, raddr, laddr, bytes, companion_qp, counter_raddr,
+                                              counter_laddr, 1);
+        } else {
+          doca_gpu_dev_verbs_put(qp, raddr, laddr, bytes);
+        }
+      } else {  // NOTE(review): sig_raddr/sig_laddr are only initialized when hasSignal is true — confirm callers always set hasSignal on this path.
+        if (hasCounter) {
+          doca_gpu_dev_verbs_signal_counter<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+            qp, sig_raddr, sig_laddr, signalOpArg, companion_qp, counter_raddr, counter_laddr, 1);
+        } else {
+          doca_gpu_dev_verbs_signal<DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+            qp, sig_raddr, sig_laddr, signalOpArg);
+        }
+      }
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+      doca_gpu_dev_verbs_wait(qp);  // Debug builds wait on the QP right after posting.
+      if (hasCounter) doca_gpu_dev_verbs_wait(companion_qp);
+#endif
+    }
+    coop.sync();
+  }
+};
+
+template <>
+struct ncclGinApi_PutValue<NCCL_NET_DEVICE_GIN_GDAKI> {  // GDAKI backend: writes the immediate value `srcVal` to the peer, optionally with a signal.
+  template <typename Coop, typename T>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, ncclGinWindow_t dstWin,
+                                      size_t dstOff, T srcVal, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,  // hasDescriptor/descriptor are not used by this backend.
+                                      cuda::thread_scope required, cuda::thread_scope given) {
+    using nccl::utility::loadConst;
+
+    coop.sync();  // All coop threads rendezvous; only thread_rank 0 issues the operation below.
+    if (coop.thread_rank() == 0) {
+      ncclGinGdakiGPUContext* gdaki = (struct ncclGinGdakiGPUContext*)ctx.handle;
+      doca_gpu_dev_verbs_qp* qp = loadConst(&gdaki->gdqp) + peer;  // QP array is indexed by peer.
+      ncclGinGdakiMemHandle* dstMh = (ncclGinGdakiMemHandle*)dstWin;
+
+      doca_gpu_dev_verbs_addr raddr;
+      raddr.addr = dstOff;
+      raddr.key = loadConst(loadConst(&dstMh->rkeys) + peer);  // Destination key is looked up per peer.
+
+      doca_gpu_dev_verbs_addr sig_raddr, sig_laddr;
+      if (hasSignal) {
+        if (signalOp == ncclGinSignalInc) signalOpArg = 1;  // An increment is issued as an add of 1.
+        sig_raddr.addr = sizeof(uint64_t) * signalId;  // Byte offset of the signal slot in the signals table.
+        sig_raddr.key = loadConst(loadConst(&gdaki->signals_table.rkeys) + peer);
+        sig_laddr.addr = 0;
+        sig_laddr.key = loadConst(&gdaki->sink_buffer_lkey);
+      }
+
+      // cuda::thread_scope_system has the lowest value, so this fires when system scope is required and `given` is any other scope.
+      if ((required == cuda::thread_scope_system) && (given > required)) {
+        doca_gpu_dev_verbs_fence_release<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_SYS>();
+      }
+
+      if (hasSignal) {
+        doca_gpu_dev_verbs_p_signal<T, DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD>(
+          qp, raddr, srcVal, sig_raddr, sig_laddr, signalOpArg);
+      } else {
+        doca_gpu_dev_verbs_p(qp, raddr, srcVal);
+      }
+
+#ifdef NCCL_DEVICE_GIN_GDAKI_ENABLE_DEBUG
+      doca_gpu_dev_verbs_wait(qp);  // Debug builds wait on the QP right after posting.
+#endif
+    }
+    coop.sync();
+  }
+};
+
+template <>
+struct ncclGinApi_ResetCounter<NCCL_NET_DEVICE_GIN_GDAKI> {
+  // Zeroes slot `counterId` of this context's counters table.
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinCounter_t counterId) {
+    auto* gpuCtx = static_cast<ncclGinGdakiGPUContext*>(ctx.handle);
+    nccl::utility::loadConst(&gpuCtx->counters_table.buffer)[counterId] = 0;
+  }
+};
+
+template <>
+struct ncclGinApi_ResetSignal<NCCL_NET_DEVICE_GIN_GDAKI> {
+  // Zeroes slot `signalId` of this context's signals table.
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinSignal_t signalId) {
+    auto* gpuCtx = static_cast<ncclGinGdakiGPUContext*>(ctx.handle);
+    nccl::utility::loadConst(&gpuCtx->signals_table.buffer)[signalId] = 0;
+  }
+};
+
+template <>
+struct ncclGinApi_GetCounterPtr<NCCL_NET_DEVICE_GIN_GDAKI> {
+  // Address of slot `counterId` in this context's counters table.
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinCounter_t counterId) {
+    auto* gpuCtx = static_cast<ncclGinGdakiGPUContext*>(ctx.handle);
+    return &nccl::utility::loadConst(&gpuCtx->counters_table.buffer)[counterId];
+  }
+};
+
+template <>
+struct ncclGinApi_GetSignalPtr<NCCL_NET_DEVICE_GIN_GDAKI> {
+  // Address of slot `signalId` in this context's signals table.
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinSignal_t signalId) {
+    auto* gpuCtx = static_cast<ncclGinGdakiGPUContext*>(ctx.handle);
+    return &nccl::utility::loadConst(&gpuCtx->signals_table.buffer)[signalId];
+  }
+};
+
+template <>
+struct ncclGinApi_Flush<NCCL_NET_DEVICE_GIN_GDAKI> {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, cuda::memory_order ord) {  // Each coop thread waits on a strided subset of the per-peer QPs; `ord` is unused here.
+    using nccl::utility::loadConst;
+    auto* gpuCtx = static_cast<ncclGinGdakiGPUContext*>(ctx.handle);
+    doca_gpu_dev_verbs_qp* qpBase = loadConst(&gpuCtx->gdqp);
+#pragma unroll 1
+    for (int p = coop.thread_rank(); p < ctx.nRanks; p += coop.size()) {
+      doca_gpu_dev_verbs_wait(&qpBase[p]);
+    }
+  }
+};
+
+#endif /* _NCCL_DEVICE_GIN_GDAKI_H_ */
diff --git a/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h b/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h
new file mode 100644
index 00000000000..20299346f31
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/gin/gdaki/gin_gdaki_device_host_common.h
@@ -0,0 +1,36 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_
+#define _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_
+
+#include <linux/types.h>
+
+#define NCCL_GIN_GDAKI_VERSION 100
+
+template <typename T>  // A buffer of T slots together with the keys used to address it.
+struct ncclGinGdakiGlobalGPUBufferTable {
+  T *buffer;  // Local slots; gin_gdaki.h indexes this by signal/counter id.
+  __be32 *rkeys;  // Remote keys; gin_gdaki.h indexes this by peer (signals) or by own rank (counters).
+  __be32 lkey;
+};
+
+struct ncclGinGdakiGPUContext {
+  struct doca_gpu_dev_verbs_qp *gdqp;  // QP array, indexed by peer in gin_gdaki.h.
+  struct doca_gpu_dev_verbs_qp *companion_gdqp;  // Indexed by peer; gin_gdaki.h uses it only when an operation carries a counter update.
+  struct ncclGinGdakiGlobalGPUBufferTable<uint64_t> counters_table;
+  struct ncclGinGdakiGlobalGPUBufferTable<uint64_t> signals_table;
+
+  // Local buffer we don't consume but is required for some operations.
+  __be32 sink_buffer_lkey;  // Used as the local key of signal and counter operations in gin_gdaki.h.
+};
+
+struct ncclGinGdakiMemHandle {  // gin_gdaki.h casts ncclGinWindow_t values to a pointer to this struct.
+  __be32 *rkeys;  // Remote keys, indexed by peer.
+  __be32 lkey;  // Local key, used for the source side of a put.
+};
+
+#endif /* _NCCL_DEVICE_GIN_GDAKI_DEVICE_HOST_COMMON_H_ */
diff --git a/projects/rccl/src/include/nccl_device/gin/gin_device_api.h b/projects/rccl/src/include/nccl_device/gin/gin_device_api.h
new file mode 100644
index 00000000000..20dde3af30d
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/gin/gin_device_api.h
@@ -0,0 +1,18 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef _NCCL_GIN_DEVICE_API_H_
+#define _NCCL_GIN_DEVICE_API_H_
+
+#include "gin_device_common.h"
+
+#if NCCL_GIN_GDAKI_ENABLE
+#include "gdaki/gin_gdaki.h"
+#endif
+#if NCCL_GIN_PROXY_ENABLE
+#include "proxy/gin_proxy.h"
+#endif
+
+#endif
diff --git a/projects/rccl/src/include/nccl_device/gin/gin_device_common.h b/projects/rccl/src/include/nccl_device/gin/gin_device_common.h
new file mode 100644
index 00000000000..4e0798c0c5b
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/gin/gin_device_common.h
@@ -0,0 +1,122 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_GIN_DEVICE_COMMON_H_
+#define _NCCL_GIN_DEVICE_COMMON_H_
+
+#include "../net_device.h"
+#include "../utility.h"
+#include "gin_device_host_common.h"
+
+#if CUDA_VERSION >= 12080 && __CUDA_ARCH__ >= 900
+#define NCCL_GIN_HAS_FENCE_ACQUIRE_RELEASE_PTX 1
+#endif
+
+#ifndef NCCL_GIN_PROXY_ENABLE
+#define NCCL_GIN_PROXY_ENABLE 1
+#endif
+
+#ifndef NCCL_GIN_GDAKI_ENABLE
+#if defined(__HIP_PLATFORM_AMD__)
+#define NCCL_GIN_GDAKI_ENABLE 0
+#elif CUDA_VERSION >= 12020 && __CUDA_ARCH__ >= 700
+#define NCCL_GIN_GDAKI_ENABLE 1
+#else
+#define NCCL_GIN_GDAKI_ENABLE 0
+#endif
+#endif
+
+#define NCCL_GIN_BACKEND_MASK_ALL                                               \
+  (((NCCL_GIN_PROXY_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_PROXY | \
+   ((NCCL_GIN_GDAKI_ENABLE) ? 1u : 0u) << (unsigned)NCCL_NET_DEVICE_GIN_GDAKI)
+
+struct ncclGinCtx {
+  ncclNetDeviceType backend;  // Read by ncclGinCall to pick the backend when the mask allows more than one.
+  int rank;
+  int nRanks;
+  void* handle;  // Backend-specific context pointer; the GDAKI backend casts it to ncclGinGdakiGPUContext*.
+};
+
+template <unsigned backendMask>
+struct ncclGinCtx_M : ncclGinCtx {};
+
+struct ncclGinDescriptorSmem {
+  alignas(16) char space[64];
+};
+
+#if __CUDACC__
+template <ncclNetDeviceType backend>
+struct ncclGinApi_Put {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, bool hasWins,
+                                      ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin,
+                                      size_t srcOff, size_t bytes, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasCounter,
+                                      ncclGinCounter_t counterId, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_PutValue {
+  template <typename Coop, typename T>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop coop, int peer, ncclGinWindow_t dstWin,
+                                      size_t dstOff, T srcData, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_GetSignalPtr {  // Returns a pointer to the signal slot `signalId`; implemented by the backend specializations.
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, ncclGinSignal_t signalId);  // No `peer` argument: matches the GDAKI specialization's signature.
+};
+template <ncclNetDeviceType backend>
+struct ncclGinApi_GetCounterPtr {  // Returns a pointer to the counter slot `counterId`; implemented by the backend specializations.
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx, ncclGinCounter_t counterId);  // No `peer` argument: matches the GDAKI specialization's signature.
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_ResetSignal {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinSignal_t signalId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_ResetCounter {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, ncclGinCounter_t counterId);
+};
+
+template <ncclNetDeviceType backend>
+struct ncclGinApi_Flush {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx, Coop, cuda::memory_order ord);
+};
+#endif
+
+#if __CUDACC__
+template <template <ncclNetDeviceType> typename ApiFn, unsigned beMask, typename... Arg>  // Forwards to ApiFn<backend>::call for the backend named by the mask or, if several are allowed, by ctx.backend.
+NCCL_DEVICE_INLINE static decltype(auto) ncclGinCall(ncclGinCtx_M<beMask> ctx, Arg&&... arg) {
+  constexpr bool oneBackendOnly = (beMask & (beMask - 1)) == 0;  // Mask has at most one bit set
+  switch (oneBackendOnly ? __popc(beMask - 1) : (int)ctx.backend) {
+#if NCCL_GIN_PROXY_ENABLE
+    case (int)NCCL_NET_DEVICE_GIN_PROXY:
+      if (((beMask >> (int)NCCL_NET_DEVICE_GIN_PROXY) & 1u) == 0) __builtin_unreachable();
+      return ApiFn<NCCL_NET_DEVICE_GIN_PROXY>::call(ctx, static_cast<Arg&&>(arg)...);
+#endif
+#if NCCL_GIN_GDAKI_ENABLE
+    case (int)NCCL_NET_DEVICE_GIN_GDAKI:
+      if (((beMask >> (int)NCCL_NET_DEVICE_GIN_GDAKI) & 1u) == 0) __builtin_unreachable();
+      return ApiFn<NCCL_NET_DEVICE_GIN_GDAKI>::call(ctx, static_cast<Arg&&>(arg)...);
+#endif
+    default:
+      __builtin_unreachable();
+  }
+}
+#endif
+
+#endif
diff --git a/projects/rccl/src/include/nccl_device/gin/gin_device_host_common.h b/projects/rccl/src/include/nccl_device/gin/gin_device_host_common.h
new file mode 100644
index 00000000000..639a7eb1a58
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/gin/gin_device_host_common.h
@@ -0,0 +1,24 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_GIN_DEVICE_HOST_COMMON_H_
+#define _NCCL_GIN_DEVICE_HOST_COMMON_H_
+
+#include <cuda.h>
+#include "../net_device.h"
+#include "../core.h"  // for ncclGin{Signal|Counter}_t
+
+#define NCCL_GIN_MAX_CONTEXTS 4  // presumably the upper bound on GIN contexts - not used in this header, confirm
+
+typedef struct ncclGinGpuCtx *ncclGinGpuCtx_t;  // opaque handle: struct ncclGinGpuCtx is not defined here
+typedef void *ncclGinWindow_t;  // untyped window handle
+
+typedef enum ncclGinSignalOp_t {  // which signal update a put requests
+  ncclGinSignalInc = 0,  // the proxy device code maps this to ncclGinProxyOpWithSignalInc
+  ncclGinSignalAdd,      // the proxy device code maps this to ncclGinProxyOpWithSignalAdd
+} ncclGinSignalOp_t;
+
+#endif
diff --git a/projects/rccl/src/include/nccl_device/gin/proxy/gin_proxy.h b/projects/rccl/src/include/nccl_device/gin/proxy/gin_proxy.h
new file mode 100644
index 00000000000..1233f8eba5f
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/gin/proxy/gin_proxy.h
@@ -0,0 +1,235 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_PROXY_H_
+#define _NCCL_DEVICE_GIN_PROXY_H_
+
+//#include <config.h>
+
+#include <cstdint>
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+#include "nccl.h"
+#include "nccl_device/utility.h"
+#include "../gin_device_host_common.h"
+#include "gin_proxy_device_host_common.h"
+
+namespace nccl {
+namespace gin {
+namespace proxy {
+// Spins until peer queue `pe`'s consumer index has reached the producer index sampled on entry.
+NCCL_DEVICE_INLINE void flush(ncclGinProxyGpuCtx_t* proxyCtx, uint32_t pe, cuda::memory_order ord) {
+  using nccl::utility::loadConst;
+  cuda::atomic_ref<uint32_t, cuda::thread_scope_system> producer(loadConst(&proxyCtx->pis)[pe]);
+  cuda::atomic_ref<uint32_t, cuda::thread_scope_system> consumer(loadConst(&proxyCtx->cis)[pe]);
+  // Both indices keep moving (other threads post, the CPU consumes), so use the rolling comparison.
+  const uint32_t target = producer.load(cuda::memory_order_relaxed);
+#pragma unroll 1
+  for (;;) {
+    if (nccl::utility::rollingLessEq<uint32_t>(target, consumer.load(ord))) break;
+  }
+}
+
+
+// Thread 0 of `coop` claims the next slot of peer `pe`'s GFD queue, waits for queue credit, then
+// copies the 64-byte descriptor `gfd` into that slot. The other threads of `coop` do nothing here.
+template <typename Coop>
+NCCL_DEVICE_INLINE void postGfd(Coop coop, ncclGinProxyGpuCtx_t* proxyCtx, ncclGinProxyGfd_t* gfd,
+                                uint32_t pe) {
+  using nccl::utility::loadConst;
+  cuda::atomic_ref<uint32_t, cuda::thread_scope_system> pi(loadConst(&proxyCtx->pis)[pe]);
+  cuda::atomic_ref<uint32_t, cuda::thread_scope_system> ci(loadConst(&proxyCtx->cis)[pe]);
+  // Read queueSize once and index in size_t so that pe * queueSize cannot wrap at 32 bits.
+  const uint32_t queueSize = loadConst(&proxyCtx->queueSize);
+  ncclGinProxyGfd_t* q = loadConst(&proxyCtx->queues) + (size_t)pe * queueSize;
+  if (coop.thread_rank() == 0) {
+    // Claim a slot; idx is the pre-increment producer index, masked to a ring position only below.
+    uint32_t idx = pi.fetch_add(1, cuda::memory_order_relaxed);
+    while (queueSize <= idx - ci.load(cuda::memory_order_relaxed)) {}  // wait for credits
+    idx &= queueSize - 1;  // ring position; assumes queueSize is a power of two - TODO confirm
+    // 4x16 byte store with the write-through cache hint
+#pragma unroll
+    for (uint8_t i = 0; i < 4; i++) {
+      __stwt((uint4*)&q[idx] + i, ((uint4*)gfd)[i]);
+    }
+  }
+}
+
+template <typename T>
+// Fills `gfd` for one proxy operation. Descriptor must be at least GWQ_GFD_SIZE bytes and aligned.
+__device__ __forceinline__ void buildGfd(ncclGinProxyGfd_t* gfd, ncclGinProxyOp_t op, T srcVal,
+                                         bool hasInline, size_t srcOff, ncclGinWindow_t srcHandle,
+                                         size_t dstOff, ncclGinWindow_t dstHandle, size_t size,
+                                         ncclGinCounter_t counterId, ncclGinSignal_t signalId,
+                                         uint64_t signalVal) {
+  gfd->qword[ncclGinProxyGfdHeader].header.flag = 1;  // every one of the 8 qwords gets its flag bit set
+  gfd->qword[ncclGinProxyGfdHeader].header.op = op;
+  gfd->qword[ncclGinProxyGfdHeader].header.size = (uint64_t)size;
+  // Qwords 1-2 hold either the inline value or the source offset/handle (the two layouts share slots).
+  if (hasInline) {
+    gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.flag = 1;
+    gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.inlineValLow = (uint32_t)srcVal;  // bits 0..31
+    gfd->qword[ncclGinProxyGfdInlineHigh].inlineHigh.flag = 1;
+    if (sizeof(T) > 4)  // bits 32..47 (16-bit field); not written at all for smaller T
+      gfd->qword[ncclGinProxyGfdInlineLow].inlineLow.inlineValLow2 = (uint64_t)srcVal >> 32;
+    if (sizeof(T) > 6)  // bits 48..63; not written at all for smaller T
+      gfd->qword[ncclGinProxyGfdInlineHigh].inlineHigh.inlineValHigh = (uint64_t)srcVal >> 48;
+  } else {
+    gfd->qword[ncclGinProxyGfdSrcOff].srcOff.flag = 1;
+    gfd->qword[ncclGinProxyGfdSrcOff].srcOff.srcOff = (uint64_t)srcOff;
+    gfd->qword[ncclGinProxyGfdSrcHandle].srcHandle.flag = 1;
+    gfd->qword[ncclGinProxyGfdSrcHandle].srcHandle.srcHandle = (uint64_t)srcHandle;
+  }
+  // NOTE(review): the casts of srcVal above are value conversions, so T is assumed integral - confirm.
+  gfd->qword[ncclGinProxyGfdDstOff].dstOff.flag = 1;
+  gfd->qword[ncclGinProxyGfdDstOff].dstOff.dstOff = (uint64_t)dstOff;
+  gfd->qword[ncclGinProxyGfdDstHandle].dstHandle.flag = 1;
+  gfd->qword[ncclGinProxyGfdDstHandle].dstHandle.dstHandle = (uint64_t)dstHandle;
+  // Completion ids are narrowed to 16 bits; put() in this file passes 0 when none is requested.
+  gfd->qword[ncclGinProxyGfdCompletion].completion.flag = 1;
+  gfd->qword[ncclGinProxyGfdCompletion].completion.counterId = (uint16_t)counterId;
+  gfd->qword[ncclGinProxyGfdCompletion].completion.signalId = (uint16_t)signalId;
+
+  // The signal value is split between two qwords, as the signal value is a full 64 bits
+  gfd->qword[ncclGinProxyGfdCompletion].completion.signalValLow = (uint16_t)signalVal;  // bits 0..15
+  gfd->qword[ncclGinProxyGfdSignalVal].signalVal.flag = 1;
+  gfd->qword[ncclGinProxyGfdSignalVal].signalVal.signalValLow2 = (uint16_t)(signalVal >> 16);  // 16..31
+  gfd->qword[ncclGinProxyGfdSignalVal].signalVal.signalValHigh = (uint32_t)(signalVal >> 32);  // 32..63
+  // The last qword carries nothing but its flag bit.
+  gfd->qword[ncclGinProxyGfdReserved].flag.v = 1;
+}
+
+// Computes the opcode for a proxy put and stores it in `op`.
+//   hasInline  - OR in ncclGinProxyOpWithInline
+//   hasCounter - OR in ncclGinProxyOpWithCounter
+//   hasSignal  - OR in the flag selected by signalOp (Inc or Add); signalOp is not looked at otherwise
+// The base ncclGinProxyOpPut bit is always set.
+__device__ __forceinline__ void constructProxyOp(ncclGinProxyOp_t& op, bool hasInline,
+                                                 bool hasSignal, ncclGinSignalOp_t signalOp,
+                                                 bool hasCounter) {
+  uint8_t bits = static_cast<uint8_t>(ncclGinProxyOpPut);
+  if (hasInline) bits |= static_cast<uint8_t>(ncclGinProxyOpWithInline);
+  if (hasCounter) bits |= static_cast<uint8_t>(ncclGinProxyOpWithCounter);
+  if (hasSignal) {
+    switch (signalOp) {
+      case ncclGinSignalInc:
+        bits |= static_cast<uint8_t>(ncclGinProxyOpWithSignalInc);
+        break;
+      case ncclGinSignalAdd:
+        bits |= static_cast<uint8_t>(ncclGinProxyOpWithSignalAdd);
+        break;
+      default:
+        __builtin_unreachable();  // no other ncclGinSignalOp_t value is expected here
+    }
+  }
+  op = static_cast<ncclGinProxyOp_t>(bits);
+}
+
+template <typename Coop, typename T>  // builds the GFD for one put in *gfd and posts it to `peer`'s queue
+NCCL_DEVICE_INLINE void put(Coop coop, ncclGinProxyGfd_t* gfd, ncclGinProxyGpuCtx_t* proxyCtx,
+                            int peer, ncclGinWindow_t dstWnd, size_t dstOff, T srcVal,
+                            bool hasInline, ncclGinWindow_t srcWnd, size_t srcOff, size_t bytes,
+                            bool hasSignal, ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                            uint64_t signalVal, bool hasCounter, ncclGinCounter_t counterId,
+                            cuda::thread_scope required, cuda::thread_scope given) {
+  // Fence unless the caller already gives system scope; tested by inequality, not by enum numbering.
+  if (given != cuda::thread_scope_system)
+    cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system);
+  ncclGinProxyOp_t op;
+  constructProxyOp(op, hasInline, hasSignal, signalOp, hasCounter);
+  nccl::gin::proxy::buildGfd(gfd, op, srcVal, hasInline, srcOff, srcWnd, dstOff, dstWnd, bytes,
+                             hasCounter ? counterId : 0, hasSignal ? signalId : 0, signalVal);
+  nccl::gin::proxy::postGfd<Coop>(coop, proxyCtx, gfd, peer);
+}
+}  // namespace proxy
+}  // namespace gin
+}  // namespace nccl
+
+template <>  // proxy backend: address of one slot in the context's counter array
+struct ncclGinApi_GetCounterPtr<NCCL_NET_DEVICE_GIN_PROXY> {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinCounter_t counterId) {
+    // ctx.handle is treated as a pointer to the proxy GPU context.
+    ncclGinProxyGpuCtx_t* gpuCtx = (ncclGinProxyGpuCtx_t*)ctx.handle;
+    return nccl::utility::loadConst(&gpuCtx->counters) + counterId;
+  }
+};
+
+template <>  // proxy backend: zero one counter slot with a plain store
+struct ncclGinApi_ResetCounter<NCCL_NET_DEVICE_GIN_PROXY> {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinCounter_t counterId) {
+    ncclGinProxyGpuCtx_t* gpuCtx = (ncclGinProxyGpuCtx_t*)ctx.handle;
+    uint64_t* counterBase = nccl::utility::loadConst(&gpuCtx->counters);
+    counterBase[counterId] = 0;
+  }
+};
+
+template <>  // proxy backend: address of one slot in the context's signal array
+struct ncclGinApi_GetSignalPtr<NCCL_NET_DEVICE_GIN_PROXY> {
+  NCCL_DEVICE_INLINE static uint64_t* call(ncclGinCtx ctx, ncclGinSignal_t signalId) {
+    // ctx.handle is treated as a pointer to the proxy GPU context.
+    ncclGinProxyGpuCtx_t* gpuCtx = (ncclGinProxyGpuCtx_t*)ctx.handle;
+    return nccl::utility::loadConst(&gpuCtx->signals) + signalId;
+  }
+};
+
+template <>  // proxy backend: zero one signal slot with a plain store
+struct ncclGinApi_ResetSignal<NCCL_NET_DEVICE_GIN_PROXY> {
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, ncclGinSignal_t signalId) {
+    ncclGinProxyGpuCtx_t* gpuCtx = (ncclGinProxyGpuCtx_t*)ctx.handle;
+    uint64_t* signalBase = nccl::utility::loadConst(&gpuCtx->signals);
+    signalBase[signalId] = 0;
+  }
+};
+
+template <>  // proxy backend: flush every peer queue, with the peers divided among the threads of `coop`
+struct ncclGinApi_Flush<NCCL_NET_DEVICE_GIN_PROXY> {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, cuda::memory_order ord) {
+    ncclGinProxyGpuCtx_t* gpuCtx = (ncclGinProxyGpuCtx_t*)ctx.handle;
+    // Thread r handles peers r, r + size, r + 2*size, ... below ctx.nRanks.
+#pragma unroll 1
+    for (int peer = coop.thread_rank(); peer < ctx.nRanks; peer += coop.size())
+      nccl::gin::proxy::flush(gpuCtx, peer, ord);
+  }
+};
+
+template <>  // proxy backend: window-to-window put (no inline payload); GFD built in `descriptor` if given
+struct ncclGinApi_Put<NCCL_NET_DEVICE_GIN_PROXY> {
+  template <typename Coop>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, bool hasWins,
+                                      ncclGinWindow_t dstWin, size_t dstOff, ncclGinWindow_t srcWin,
+                                      size_t srcOff, size_t bytes, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasCounter,
+                                      ncclGinCounter_t counterId, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given) {
+    ncclGinProxyGfd_t scratch, *gfd = hasDescriptor ? (ncclGinProxyGfd_t*)descriptor : &scratch;
+    nccl::gin::proxy::put<Coop, uint64_t>(
+      coop, gfd, (ncclGinProxyGpuCtx_t*)ctx.handle, peer, dstWin, dstOff, /*srcVal=*/0,
+      /*hasInline=*/false, srcWin, srcOff, bytes, hasSignal, signalId, signalOp, signalOpArg,
+      hasCounter, counterId, required, given);
+  }
+};
+
+template <>  // proxy backend: put of an inline value of sizeof(T) bytes; no source window, no counter
+struct ncclGinApi_PutValue<NCCL_NET_DEVICE_GIN_PROXY> {
+  template <typename Coop, typename T>
+  NCCL_DEVICE_INLINE static void call(ncclGinCtx ctx, Coop coop, int peer, ncclGinWindow_t dstWin,
+                                      size_t dstOff, T srcVal, bool hasSignal,
+                                      ncclGinSignal_t signalId, ncclGinSignalOp_t signalOp,
+                                      uint64_t signalOpArg, bool hasDescriptor,
+                                      ncclGinDescriptorSmem* descriptor,
+                                      cuda::thread_scope required, cuda::thread_scope given) {
+    ncclGinProxyGfd_t scratch, *gfd = hasDescriptor ? (ncclGinProxyGfd_t*)descriptor : &scratch;
+    nccl::gin::proxy::put<Coop, T>(
+      coop, gfd, (ncclGinProxyGpuCtx_t*)ctx.handle, peer, dstWin, dstOff, srcVal, /*hasInline=*/true,
+      /*srcWnd=*/nullptr, /*srcOff=*/0, /*bytes=*/sizeof(T), hasSignal, signalId, signalOp, signalOpArg,
+      /*hasCounter=*/false, /*counterId=*/0, required, given);
+  }
+};
+
+#endif
diff --git a/projects/rccl/src/include/nccl_device/gin/proxy/gin_proxy_device_host_common.h b/projects/rccl/src/include/nccl_device/gin/proxy/gin_proxy_device_host_common.h
new file mode 100644
index 00000000000..8466f88747c
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/gin/proxy/gin_proxy_device_host_common.h
@@ -0,0 +1,125 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+#ifndef GIN_PROXY_DEFS_H
+#define GIN_PROXY_DEFS_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#define NCCL_GIN_PROXY_VERSION 100
+
+typedef enum {  // opcode bits stored in the 6-bit header.op field of a GFD
+  ncclGinProxyOpPut = 1 << 0,            // base operation
+  ncclGinProxyOpBaseMask = 1 << 0,       // same bit as Put: selects the base operation
+  ncclGinProxyOpWithInline = 1 << 1,     // payload is carried inline in the descriptor
+  ncclGinProxyOpWithCounter = 1 << 2,    // a counter id accompanies the operation
+  ncclGinProxyOpWithSignalInc = 1 << 3,  // set for ncclGinSignalInc
+  ncclGinProxyOpWithSignalAdd = 1 << 4,  // set for ncclGinSignalAdd
+  ncclGinProxyOpComplMask = ~ncclGinProxyOpPut,  // every bit except the Put bit
+} ncclGinProxyOp_t;
+
+static_assert(sizeof(void *) == sizeof(uint64_t) && sizeof(size_t) == sizeof(uint64_t),
+              "The proxy code is built on the assumption that the pointer size is 64 bits and at "
+              "most 57 bits are used for the actual pointer.");
+
+typedef union {  // one 64-bit word of a GFD; each member struct is a different view of the same 8 bytes
+  uint64_t raw;
+  struct {
+    uint64_t v : 1;  // every view below starts with this same 1-bit flag
+    uint64_t resv : 63;
+  } __attribute__((packed)) flag;
+  struct {
+    uint64_t flag : 1;
+    uint64_t op : 6;  // receives an ncclGinProxyOp_t value in buildGfd
+    uint64_t size : 57;  // receives the transfer size in buildGfd
+  } __attribute__((packed)) header;
+  struct {
+    // one bit is taken by the flag, leaving 63 bits for the source offset
+    uint64_t flag : 1;
+    uint64_t srcOff : 63;
+  } __attribute__((packed)) srcOff;
+  struct {
+    // one bit is taken by the flag, leaving 63 bits for the source window handle
+    uint64_t flag : 1;
+    uint64_t srcHandle : 63;
+  } __attribute__((packed)) srcHandle;
+  struct {
+    uint8_t flag : 1;
+    uint8_t resv : 7;
+    uint32_t inlineValLow;  // inline value bits 0..31 (as written by buildGfd)
+    uint16_t inlineValLow2;  // inline value bits 32..47
+  } __attribute__((packed)) inlineLow;
+  // NOTE(review): was "max of 96 bit / 12 byte values", but the inline fields here total 64 bits - confirm
+  struct {
+    uint8_t flag : 1;
+    uint8_t resv : 7;
+    uint16_t inlineValHigh;  // inline value bits 48..63
+    uint8_t resv1;
+    uint32_t resv2;
+  } __attribute__((packed)) inlineHigh;
+  struct {
+    // one bit is taken by the flag, leaving 63 bits for the destination offset
+    uint64_t flag : 1;
+    uint64_t dstOff : 63;
+  } __attribute__((packed)) dstOff;
+  struct {
+    // one bit is taken by the flag, leaving 63 bits for the destination window handle
+    uint64_t flag : 1;
+    uint64_t dstHandle : 63;
+  } __attribute__((packed)) dstHandle;
+  struct {
+    uint8_t flag : 1;
+    uint8_t resv1 : 7;
+    // must be non-zero if WITH_COUNTER is set
+    uint16_t counterId;
+    // must be non-zero if WITH_SIGNAL_INC or WITH_SIGNAL_ADD is set (WITH_SIGNAL_SET is not in the op enum)
+    uint16_t signalId;
+    uint16_t signalValLow;  // signal value bits 0..15
+    uint8_t resv2;
+  } __attribute__((packed)) completion;
+  struct {
+    uint8_t flag : 1;
+    uint8_t resv : 7;
+    uint16_t signalValLow2;  // signal value bits 16..31
+    uint32_t signalValHigh;  // signal value bits 32..63
+  } __attribute__((packed)) signalVal;
+} ncclGinProxyQword_t;
+static_assert(sizeof(ncclGinProxyQword_t) == sizeof(uint64_t),
+              "sizeof(ncclGinProxyQword_t) != sizeof(uint64_t)");
+
+typedef enum {  // index of each qword inside ncclGinProxyGfd_t::qword[]
+  ncclGinProxyGfdHeader = 0,
+  ncclGinProxyGfdInlineLow = 1,   // slots 1-2 are shared: either the inline value ...
+  ncclGinProxyGfdInlineHigh = 2,
+  ncclGinProxyGfdSrcOff = 1,      // ... or the source offset and handle
+  ncclGinProxyGfdSrcHandle = 2,
+  ncclGinProxyGfdDstOff = 3,
+  ncclGinProxyGfdDstHandle = 4,
+  ncclGinProxyGfdCompletion = 5,  // counter id, signal id, low 16 bits of the signal value
+  ncclGinProxyGfdSignalVal = 6,   // remaining 48 bits of the signal value
+  ncclGinProxyGfdReserved = 7,    // only its flag bit is written by buildGfd
+  ncclGinProxyGfdQwords = 8,      // number of qwords; used as the array length
+} ncclGinProxyGfdQwordIdx_t;
+
+typedef struct __attribute__((packed)) {  // one descriptor: 8 qwords, indexed by ncclGinProxyGfdQwordIdx_t
+  ncclGinProxyQword_t qword[ncclGinProxyGfdQwords];
+} ncclGinProxyGfd_t;
+static_assert(sizeof(ncclGinProxyGfd_t) == 64,  // the device side copies a GFD as four 16-byte stores
+              "sizeof(ncclGinProxyGfd_t) != 64 - it is crucial the GFD is 64 bytes!");
+
+typedef struct {  // state the device-side proxy code reads through ctx.handle
+  int nranks;  // not read by the device code in gin_proxy.h, which uses ctx.nRanks instead
+  uint32_t queueSize;  // entries per peer queue; device code masks with queueSize - 1
+  ncclGinProxyGfd_t *queues;  // peer `pe`'s queue starts at queues[pe * queueSize]
+  uint32_t *pis;  // producer index per peer, indexed by peer
+  // The consumer indices will reside in CPU or GPU memory depending on the availability of GDR
+  uint32_t *cis;  // consumer index per peer, indexed by peer
+  // Completion slots, indexed by counter id and signal id respectively.
+  uint64_t *counters;
+  uint64_t *signals;
+} ncclGinProxyGpuCtx_t;
+
+#endif
diff --git a/projects/rccl/src/include/nccl_device/gin_barrier.h b/projects/rccl/src/include/nccl_device/gin_barrier.h
new file mode 100644
index 00000000000..68f3c5a638d
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/gin_barrier.h
@@ -0,0 +1,37 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_BARRIER_H_
+#define _NCCL_DEVICE_GIN_BARRIER_H_
+#include "core_tmp.h"
+#include "gin.h"
+
+struct ncclGinBarrierHandle;  // forward declaration only; the definition is not in this header
+// Host API; results come back through outHandle and outReq (declaration only - semantics not visible here).
+NCCL_EXTERN_C __host__ ncclResult_t ncclGinBarrierCreateRequirement(ncclComm_t, ncclTeam_t, int nBarriers, ncclGinBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq);
+
+#if __CUDACC__
+enum class ncclGinFenceLevel {  // passed to ncclGinBarrierSession::sync; only one level exists so far
+  Relaxed
+};
+
+template<typename Coop>
+struct ncclGinBarrierSession_internal;  // declared only; defined elsewhere
+// Device-side barrier session. Members are declared here and defined elsewhere.
+template<typename Coop>
+struct ncclGinBarrierSession: ncclGinBarrierSession_internal<Coop> {
+  NCCL_DEVICE_INLINE ncclGinBarrierSession(Coop, ncclGin, ncclTeam, ncclGinBarrierHandle, uint32_t index);
+  NCCL_DEVICE_INLINE ncclGinBarrierSession(Coop, ncclGin, ncclTeamTagRail, uint32_t index);  // no handle argument
+
+  NCCL_DEVICE_INLINE ~ncclGinBarrierSession();
+
+  ncclGinBarrierSession(ncclGinBarrierSession const&) = delete; // Sessions are not copyable
+
+  NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order, ncclGinFenceLevel);
+};
+#endif
+
+#endif // _NCCL_DEVICE_GIN_BARRIER_H_
diff --git a/projects/rccl/src/include/nccl_device/hip_compat.h b/projects/rccl/src/include/nccl_device/hip_compat.h
new file mode 100644
index 00000000000..1d01910c5a9
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/hip_compat.h
@@ -0,0 +1,572 @@
+#ifndef _NCCL_DEVICE_HIP_COMPAT_H_
+#define _NCCL_DEVICE_HIP_COMPAT_H_
+
+/*
+ * This header provides compatibility between CUDA and HIP for the nccl_device
+ * headers. It defines unified macros and provides HIP implementations of
+ * CUDA-specific constructs.
+ *
+ * Usage: Include this header FIRST in nccl_device headers, then use
+ * NCCL_DEVICE_COMPILE instead of __CUDACC__ for device code guards.
+ *
+ * "NO HIP EQUIVALENT:" marks NVIDIA-only features (e.g. NVLINK multicast).
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// Unified device compiler detection
+//
+// NCCL_DEVICE_COMPILE: always defined; 1 under a HIP or CUDA compiler, 0 otherwise (test with #if)
+// NCCL_HIP_PLATFORM:   defined (as 1) when targeting AMD/HIP
+// NCCL_CUDA_PLATFORM:  defined (as 1) when targeting NVIDIA/CUDA
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__HIPCC__) || defined(__HIP_PLATFORM_AMD__)  // HIP is tested first, so it wins over CUDA
+  #define NCCL_HIP_PLATFORM 1
+  #define NCCL_DEVICE_COMPILE 1
+#elif defined(__CUDACC__)  // CUDA compiler
+  #define NCCL_CUDA_PLATFORM 1
+  #define NCCL_DEVICE_COMPILE 1
+#else  // plain host compiler: neither platform macro gets defined
+  #define NCCL_DEVICE_COMPILE 0
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Device function qualifiers
+////////////////////////////////////////////////////////////////////////////////
+
+#if NCCL_DEVICE_COMPILE  // device-capable compiler: use the real qualifiers
+  #define NCCL_DEVICE_INLINE __device__ __forceinline__
+  #define NCCL_HOST_DEVICE_INLINE __host__ __device__ __forceinline__
+#else  // host-only compiler: make the qualifiers vanish so shared declarations still parse
+  #ifndef __host__
+    #define __host__
+  #endif
+  #ifndef __device__
+    #define __device__
+  #endif
+  #define NCCL_DEVICE_INLINE
+  #define NCCL_HOST_DEVICE_INLINE inline __attribute__((always_inline))
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Architecture detection
+//
+// NCCL_DEVICE_ARCH: Non-zero when compiling for device
+// Use this instead of __CUDA_ARCH__ or __HIP_DEVICE_COMPILE__
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(__CUDA_ARCH__)
+  #define NCCL_DEVICE_ARCH __CUDA_ARCH__
+#elif defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
+  // Map HIP GFX targets to a number. NOTE(review): __CUDA_ARCH__ uses values like 900, while three of
+  // the four values below are two-digit, so thresholds written for CUDA will not carry over - confirm.
+  #if defined(__gfx942__) || defined(__gfx950__)
+    #define NCCL_DEVICE_ARCH 942  // MI300 class
+  #elif defined(__gfx90a__)
+    #define NCCL_DEVICE_ARCH 90   // MI200 class
+  #elif defined(__gfx908__)
+    #define NCCL_DEVICE_ARCH 80   // MI100 class
+  #else
+    #define NCCL_DEVICE_ARCH 70   // Generic GCN
+  #endif
+#else  // host pass of either platform, or a plain host compiler
+  #define NCCL_DEVICE_ARCH 0
+#endif
+
+// Hopper+ features (multimem, etc.) - NVIDIA only; always 0 on HIP because __CUDA_ARCH__ is tested
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
+  #define NCCL_ARCH_HAS_MULTIMEM 1
+#else
+  #define NCCL_ARCH_HAS_MULTIMEM 0
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Wave/Warp size abstraction
+////////////////////////////////////////////////////////////////////////////////
+
+#if defined(NCCL_HIP_PLATFORM)
+  // 32 if one of the wave32 target macros below is defined; otherwise 64, also when none is defined
+  #if defined(__GFX10__) || defined(__GFX11__) || defined(__gfx1100__) || \
+      defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
+    #define NCCL_WARP_SIZE 32
+  #else
+    #define NCCL_WARP_SIZE 64
+  #endif
+#else  // CUDA and plain host builds
+  #define NCCL_WARP_SIZE 32
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory ordering types for atomic operations
+//
+// These provide a unified interface matching cuda::memory_order
+////////////////////////////////////////////////////////////////////////////////
+
+#if NCCL_DEVICE_COMPILE
+
+#if defined(NCCL_HIP_PLATFORM)
+
+namespace nccl_hip {
+
+// Memory order enumeration matching cuda::memory_order; values are the compiler's __ATOMIC_* constants
+enum memory_order {  // so they are handed to the __atomic_* / __hip_atomic_* builtins unchanged
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+};
+
+// Thread scope enumeration; values mirror libcu++ (system smallest) as ported code compares them as ints
+enum thread_scope {
+  thread_scope_thread   = 10,
+  thread_scope_block    = 2,
+  thread_scope_device   = 1,
+  thread_scope_system   = 0
+};
+
+// Translates a thread_scope into the matching __HIP_MEMORY_SCOPE_* constant.
+// Any value that is not thread, block or device - including thread_scope_system itself -
+// yields __HIP_MEMORY_SCOPE_SYSTEM. The mapping goes by enumerator, not by numeric value.
+// constexpr so the result can be used where the HIP atomic builtins take their scope argument.
+NCCL_DEVICE_INLINE constexpr int toHipMemoryScope(thread_scope scope) {
+  return scope == thread_scope_thread ? __HIP_MEMORY_SCOPE_SINGLETHREAD
+       : scope == thread_scope_block  ? __HIP_MEMORY_SCOPE_WORKGROUP
+       : scope == thread_scope_device ? __HIP_MEMORY_SCOPE_AGENT
+                                      : __HIP_MEMORY_SCOPE_SYSTEM;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// atomic_ref implementation for HIP
+//
+// Provides cuda::atomic_ref-compatible interface using HIP atomics
+////////////////////////////////////////////////////////////////////////////////
+
+template<typename T, thread_scope Scope = thread_scope_system>
+struct atomic_ref {
+  T* ptr;  // address of the referenced object; not owned
+
+  NCCL_DEVICE_INLINE explicit atomic_ref(T& ref) : ptr(&ref) {}
+
+  NCCL_DEVICE_INLINE void store(T val, memory_order order = memory_order_seq_cst) const {
+    if constexpr (sizeof(T) == 8) {
+      __hip_atomic_store(reinterpret_cast<unsigned long long*>(ptr),
+                         *reinterpret_cast<unsigned long long*>(&val),
+                         order, toHipMemoryScope(Scope));
+    } else if constexpr (sizeof(T) == 4) {
+      __hip_atomic_store(reinterpret_cast<unsigned int*>(ptr),
+                         *reinterpret_cast<unsigned int*>(&val),
+                         order, toHipMemoryScope(Scope));
+    } else {
+      __atomic_store_n(ptr, val, order);  // other sizes: builtin without a scope argument
+    }
+  }
+
+  // Atomic load. 4- and 8-byte values are read as same-width unsigned integers at scope `Scope`
+  // and reinterpreted as T; other sizes use __atomic_load_n, which takes no scope argument.
+  NCCL_DEVICE_INLINE T load(memory_order order = memory_order_seq_cst) const {
+    if constexpr (sizeof(T) == 8) {
+      unsigned long long raw = __hip_atomic_load(reinterpret_cast<unsigned long long*>(ptr),
+                                                 order, toHipMemoryScope(Scope));
+      return *reinterpret_cast<T*>(&raw);
+    } else if constexpr (sizeof(T) == 4) {
+      unsigned int raw = __hip_atomic_load(reinterpret_cast<unsigned int*>(ptr),
+                                           order, toHipMemoryScope(Scope));
+      return *reinterpret_cast<T*>(&raw);
+    } else {
+      return __atomic_load_n(ptr, order);
+    }
+  }
+
+  // Atomic add that returns the builtin's result; 4- and 8-byte T use the scoped HIP builtin,
+  // every other size the unscoped __atomic_fetch_add.
+  NCCL_DEVICE_INLINE T fetch_add(T val, memory_order order = memory_order_seq_cst) const {
+    if constexpr (sizeof(T) == 4 || sizeof(T) == 8) {
+      return __hip_atomic_fetch_add(ptr, val, order, toHipMemoryScope(Scope));
+    } else {
+      return __atomic_fetch_add(ptr, val, order);
+    }
+  }
+};
+
+// __builtin_amdgcn_fence requires compile-time constant arguments, so each (order, scope) pair is
+// written out with literal arguments: the helper macro expands the scope switch once per order.
+#define NCCL_HIP_FENCE_SCOPE(ORD, scope) do { \
+    switch (scope) { \
+      case thread_scope_thread:  __atomic_signal_fence(ORD); break; \
+      case thread_scope_block:   __builtin_amdgcn_fence(ORD, "workgroup"); break; \
+      case thread_scope_device:  __builtin_amdgcn_fence(ORD, "agent"); break; \
+      case thread_scope_system: default: __builtin_amdgcn_fence(ORD, ""); break; \
+    } \
+  } while(0)
+// Default scope is system, as for cuda::atomic_thread_fence, so one-argument calls are not weakened.
+NCCL_DEVICE_INLINE void atomic_thread_fence(memory_order order, thread_scope scope = thread_scope_system) {
+  switch (order) {
+    case memory_order_relaxed: break;  // a relaxed fence orders nothing
+    case memory_order_acquire: NCCL_HIP_FENCE_SCOPE(__ATOMIC_ACQUIRE, scope); break;
+    case memory_order_release: NCCL_HIP_FENCE_SCOPE(__ATOMIC_RELEASE, scope); break;
+    case memory_order_acq_rel: NCCL_HIP_FENCE_SCOPE(__ATOMIC_ACQ_REL, scope); break;
+    case memory_order_seq_cst: default: NCCL_HIP_FENCE_SCOPE(__ATOMIC_SEQ_CST, scope); break;
+  }
+}
+
+#undef NCCL_HIP_FENCE_SCOPE
+
+} // namespace nccl_hip
+
+// Re-export the nccl_hip names under cuda:: so code written against the cuda:: spellings compiles on HIP
+namespace cuda {
+  using nccl_hip::memory_order;
+  using nccl_hip::memory_order_relaxed;
+  using nccl_hip::memory_order_acquire;
+  using nccl_hip::memory_order_release;
+  using nccl_hip::memory_order_acq_rel;
+  using nccl_hip::memory_order_seq_cst;
+  using nccl_hip::thread_scope;
+  using nccl_hip::thread_scope_thread;
+  using nccl_hip::thread_scope_block;
+  using nccl_hip::thread_scope_device;
+  using nccl_hip::thread_scope_system;
+  using nccl_hip::atomic_ref;  // provides store, load and fetch_add only
+  using nccl_hip::atomic_thread_fence;
+}
+
+#else // CUDA platform
+
+// Include CUDA's atomic header when available
+// #include <cuda/atomic>  // Uncomment when CUDA 11+ is required
+
+#endif // NCCL_HIP_PLATFORM
+
+#endif // NCCL_DEVICE_COMPILE
+
+////////////////////////////////////////////////////////////////////////////////
+// Warp/Wave synchronization primitives
+////////////////////////////////////////////////////////////////////////////////
+
+#if NCCL_DEVICE_COMPILE
+
+#if defined(NCCL_HIP_PLATFORM)
+
+// Lane ID - index of the calling lane within its wave, as reported by __lane_id()
+NCCL_DEVICE_INLINE int nccl_lane_id() {
+  return __lane_id();
+}
+
+// Mask of the lanes below the caller, limited to 32 bits: bit i is set when lane i < caller.
+// Under NCCL_WARP_SIZE == 64, lanes 32-63 get 0xFFFFFFFF because all of the low 32 lanes
+// precede them; the upper half of a wave64 mask does not fit this return type.
+NCCL_DEVICE_INLINE unsigned int nccl_lanemask_lt() {
+  const int self = __lane_id();
+#if NCCL_WARP_SIZE == 64
+  // Lanes in the upper half saturate to a full 32-bit mask.
+  if (self >= 32) return ~0u;
+#endif
+  const unsigned int bit = 1u << self;
+  return bit - 1;
+}
+
+// Warp sync - intentionally a no-op on HIP: per the original note, AMD CDNA waves execute in
+// lockstep (no divergence-convergence like post-Volta NVIDIA). Matches the RCCL convention in
+// common.h (#define __syncwarp()). `mask` is accepted for signature parity and ignored.
+NCCL_DEVICE_INLINE void nccl_syncwarp(unsigned int mask = 0xffffffff) {
+  (void)mask;
+}
+
+// Active mask - get mask of active lanes, narrowed to this function's unsigned int return type
+NCCL_DEVICE_INLINE unsigned int nccl_activemask() {
+  // __ballot(1) returns mask of lanes where predicate is true
+  // NOTE(review): if __ballot yields a 64-bit mask on wave64, lanes 32-63 are dropped here - confirm
+  return __ballot(1);
+}
+
+// Named barrier with count.  CUDA has __barrier_sync_count(id, count) for
+// sub-block synchronization; this HIP version ignores both arguments and calls
+// __syncthreads(). NOTE(review): assumes every thread of the block reaches the call -
+// confirm call sites that pass a count smaller than the block size.
+NCCL_DEVICE_INLINE void nccl_barrier_sync_count(int id, int count) {
+  (void)id;
+  (void)count;
+  __syncthreads();
+}
+
+// Population count - number of set bits in a 32-bit value (forwards to __popc)
+NCCL_DEVICE_INLINE int nccl_popc(unsigned int x) {
+  return __popc(x);
+}
+
+// Population count - number of set bits in a 64-bit value (forwards to __popcll)
+NCCL_DEVICE_INLINE int nccl_popcll(unsigned long long x) {
+  return __popcll(x);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Wave64-compatible warp intrinsic wrappers
+//
+// CUDA warp intrinsics take an explicit lane mask; AMD CDNA waves execute in
+// lockstep so the mask is implicit.  These wrappers let NCCL source use the
+// CUDA signatures unchanged.
+////////////////////////////////////////////////////////////////////////////////
+
+NCCL_DEVICE_INLINE int __shfl_sync(unsigned int mask, int val, int srcLane, int width = NCCL_WARP_SIZE) {
+  (void)mask;  // the lane mask is ignored; all three overloads forward to HIP's __shfl
+  return __shfl(val, srcLane, width);
+}
+NCCL_DEVICE_INLINE unsigned int __shfl_sync(unsigned int mask, unsigned int val, int srcLane, int width = NCCL_WARP_SIZE) {
+  (void)mask;  // ignored, as above
+  return __shfl(val, srcLane, width);
+}
+NCCL_DEVICE_INLINE float __shfl_sync(unsigned int mask, float val, int srcLane, int width = NCCL_WARP_SIZE) {
+  (void)mask;  // ignored, as above
+  return __shfl(val, srcLane, width);
+}
+
+NCCL_DEVICE_INLINE unsigned long long __ballot_sync(unsigned int mask, int pred) {  // 64-bit result type
+  (void)mask;  // the lane mask is ignored; forwards to HIP's __ballot
+  return __ballot(pred);
+}
+
+// __syncwarp(mask) - CUDA warp sync with lane mask.
+// Deliberately a no-op here (the original note: AMD CDNA waves are lockstep).  common.h defines
+// __syncwarp() (zero-arg macro) but coop.h needs the 1-arg form.
+NCCL_DEVICE_INLINE void __syncwarp(unsigned int mask) {
+  (void)mask;
+}
+
+// __activemask — HIP provides this natively (amd_warp_functions.h)
+// __fns — HIP provides this natively (amd_device_functions.h)
+
+// __barrier_sync_count - CUDA named barrier; here both arguments are ignored and __syncthreads() is used.
+NCCL_DEVICE_INLINE void __barrier_sync_count(int id, int count) {
+  (void)id; (void)count;
+  __syncthreads();
+}
+
+#else // CUDA platform
+
+NCCL_DEVICE_INLINE int nccl_lane_id() {  // reads the PTX %laneid special register
+  int ret;
+  asm("mov.u32 %0, %%laneid;" : "=r"(ret));
+  return ret;
+}
+
+NCCL_DEVICE_INLINE unsigned int nccl_lanemask_lt() {  // reads the PTX %lanemask_lt special register
+  unsigned int ret;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret));
+  return ret;
+}
+
+NCCL_DEVICE_INLINE void nccl_syncwarp(unsigned int mask = 0xffffffff) {  // forwards to __syncwarp(mask)
+  __syncwarp(mask);
+}
+
+NCCL_DEVICE_INLINE unsigned int nccl_activemask() {  // forwards to __activemask()
+  return __activemask();
+}
+
+NCCL_DEVICE_INLINE void nccl_barrier_sync_count(int id, int count) {  // forwards both arguments unchanged
+  __barrier_sync_count(id, count);
+}
+
+NCCL_DEVICE_INLINE int nccl_popc(unsigned int x) {  // set bits in a 32-bit value
+  return __popc(x);
+}
+
+NCCL_DEVICE_INLINE int nccl_popcll(unsigned long long x) {  // set bits in a 64-bit value
+  return __popcll(x);
+}
+
+#endif // NCCL_HIP_PLATFORM
+
+#endif // NCCL_DEVICE_COMPILE
+
+////////////////////////////////////////////////////////////////////////////////
+// Math intrinsics
+////////////////////////////////////////////////////////////////////////////////
+
+#if NCCL_DEVICE_COMPILE
+
+// Device builds: the HIP and CUDA definitions were identical (both call the __umulhi and
+// __umul64hi intrinsics), so one definition serves both platforms.
+
+// High 32 bits of the 64-bit product a * b.
+NCCL_DEVICE_INLINE uint32_t nccl_umulhi(uint32_t a, uint32_t b) {
+  return __umulhi(a, b);
+}
+
+// High 64 bits of the 128-bit product a * b.
+NCCL_DEVICE_INLINE uint64_t nccl_umul64hi(uint64_t a, uint64_t b) {
+  return __umul64hi(a, b);
+}
+
+#else // Host fallbacks
+
+// This header is meant to be included first, so the host path must not rely on an earlier
+// include for the fixed-width integer types used below.
+#include <stdint.h>
+
+// High 32 bits of the 64-bit product a * b.
+inline uint32_t nccl_umulhi(uint32_t a, uint32_t b) {
+  // Widen first so the product is computed in 64 bits, then keep the upper half.
+  return uint64_t(a) * b >> 32;
+}
+
+// High 64 bits of the 128-bit product a * b.
+// Relies on the unsigned __int128 compiler extension.
+inline uint64_t nccl_umul64hi(uint64_t a, uint64_t b) {
+  // Widen first so the product is computed in 128 bits, then keep the upper half.
+  return (uint64_t)(((unsigned __int128)a) * b >> 64);
+}
+
+#endif // NCCL_DEVICE_COMPILE
+
+////////////////////////////////////////////////////////////////////////////////
+// Memory operations
+////////////////////////////////////////////////////////////////////////////////
+
+#if NCCL_DEVICE_COMPILE
+
+#if defined(NCCL_HIP_PLATFORM)
+
+// Load through constant cache (ldg) - HIP doesn't have direct equivalent
+// Just use regular load
+// Returns *ptr by value; T is copied with an ordinary dereference.
+template<typename T>
+NCCL_DEVICE_INLINE T nccl_ldg(const T* ptr) {
+  return *ptr;
+}
+
+// 128-bit volatile vector loads/stores for LL protocol.
+//
+// DWORDX4 detection replicated from rccl_ptr.h (hip_compat.h must be
+// self-contained and cannot include rccl_ptr.h).
+//
+// NCCL_COMPAT_HAVE_DWORDX4 is 1 only when all of the following hold:
+//   - this is a HIP device compilation pass (__HIP_DEVICE_COMPILE__),
+//   - the target is gfx942 or gfx950,
+//   - both 128-bit global load/store builtins are available,
+//   - DWORDX4_INTRINSICS_FORCE_OFF is not defined.
+// Otherwise it is 0.
+#if defined(__HIP_DEVICE_COMPILE__)
+  #if (defined(__gfx942__) || defined(__gfx950__)) \
+      && __has_builtin(__builtin_amdgcn_global_load_b128) \
+      && __has_builtin(__builtin_amdgcn_global_store_b128) \
+      && !defined(DWORDX4_INTRINSICS_FORCE_OFF)
+    #define NCCL_COMPAT_HAVE_DWORDX4 1
+  #else
+    #define NCCL_COMPAT_HAVE_DWORDX4 0
+  #endif
+#else
+  #define NCCL_COMPAT_HAVE_DWORDX4 0
+#endif
+
+// Vector of four unsigned ints (16 bytes), the operand type of the b128 builtins.
+typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int nccl_compat_v4u;
+// Pointer to nccl_compat_v4u in address space 1, as required by the global
+// load/store builtins.
+typedef __attribute__((address_space(1))) nccl_compat_v4u* nccl_compat_v4u_gptr;
+
+// Stores the four 32-bit words v0..v3 to the 16 bytes at `ptr`.
+// With NCCL_COMPAT_HAVE_DWORDX4 a single 128-bit global store builtin is used;
+// otherwise the payload is written as two 64-bit nontemporal stores
+// (words 0-1 first, then words 2-3). `ptr` is cast to an address_space(1)
+// pointer in both paths.
+// NOTE(review): the fallback is two separate 64-bit stores rather than one
+// 128-bit access — confirm LL consumers tolerate that on these targets.
+NCCL_DEVICE_INLINE void nccl_st_volatile_v4_u32(void* ptr, uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3) {
+  // The u64 view lets the fallback read the payload through the union instead
+  // of reinterpret_cast-ing the uint32_t array to uint64_t*, which broke
+  // strict-aliasing rules.
+  union { nccl_compat_v4u vec; uint32_t u32[4]; uint64_t u64[2]; } u;
+  u.u32[0] = v0; u.u32[1] = v1; u.u32[2] = v2; u.u32[3] = v3;
+  #if NCCL_COMPAT_HAVE_DWORDX4
+    __builtin_amdgcn_global_store_b128((nccl_compat_v4u_gptr)ptr, u.vec, "");
+  #else
+    typedef __attribute__((address_space(1))) uint64_t* u64_gptr_t;
+    uint64_t* p64 = reinterpret_cast<uint64_t*>(ptr);
+    __builtin_nontemporal_store(u.u64[0], (u64_gptr_t)p64);
+    __builtin_nontemporal_store(u.u64[1], (u64_gptr_t)(p64 + 1));
+  #endif
+}
+
+// Loads the 16 bytes at `ptr` and returns them as four 32-bit words v0..v3.
+// With NCCL_COMPAT_HAVE_DWORDX4 a single 128-bit global load builtin is used;
+// otherwise two 64-bit nontemporal loads are issued (words 0-1, then 2-3).
+// `ptr` is cast to an address_space(1) pointer in both paths.
+// NOTE(review): the fallback is two separate 64-bit loads rather than one
+// 128-bit access — confirm LL consumers tolerate that on these targets.
+NCCL_DEVICE_INLINE void nccl_ld_volatile_v4_u32(const void* ptr, uint32_t& v0, uint32_t& v1, uint32_t& v2, uint32_t& v3) {
+  // The u64 view lets the fallback write the payload through the union instead
+  // of reinterpret_cast-ing the uint32_t array to uint64_t*, which broke
+  // strict-aliasing rules.
+  union { nccl_compat_v4u vec; uint32_t u32[4]; uint64_t u64[2]; } u;
+  #if NCCL_COMPAT_HAVE_DWORDX4
+    u.vec = __builtin_amdgcn_global_load_b128((nccl_compat_v4u_gptr)ptr, "");
+  #else
+    typedef __attribute__((address_space(1))) uint64_t* u64_gptr_t;
+    const uint64_t* p64 = reinterpret_cast<const uint64_t*>(ptr);
+    u.u64[0] = __builtin_nontemporal_load((u64_gptr_t)p64);
+    u.u64[1] = __builtin_nontemporal_load((u64_gptr_t)(p64 + 1));
+  #endif
+  v0 = u.u32[0]; v1 = u.u32[1]; v2 = u.u32[2]; v3 = u.u32[3];
+}
+
+// GPU-scope (agent) acquire fence
+// Emits __builtin_amdgcn_fence with __ATOMIC_ACQUIRE at "agent" scope.
+NCCL_DEVICE_INLINE void nccl_fence_acq_gpu() {
+  __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent");
+}
+
+// GPU-scope (agent) release fence
+// Emits __builtin_amdgcn_fence with __ATOMIC_RELEASE at "agent" scope.
+NCCL_DEVICE_INLINE void nccl_fence_rel_gpu() {
+  __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent");
+}
+
+// Multimem reduction - NVLINK multicast, NO HIP EQUIVALENT
+// These are stubs that will compile but do nothing
+// WARNING: callers get no increment at `ptr` on HIP; any code that waits for
+// the reduction's effect must take a different path on this platform.
+NCCL_DEVICE_INLINE void nccl_multimem_red_release_add_u32(void* ptr) {
+  // NO HIP EQUIVALENT: Requires NVLINK multicast hardware
+  (void)ptr;
+}
+
+// Relaxed-order counterpart of the stub above; also a no-op on HIP.
+NCCL_DEVICE_INLINE void nccl_multimem_red_relaxed_add_u32(void* ptr) {
+  // NO HIP EQUIVALENT: Requires NVLINK multicast hardware
+  (void)ptr;
+}
+
+#else // CUDA platform
+
+// CUDA: forwards to the __ldg() intrinsic and returns the loaded value.
+template<typename T>
+NCCL_DEVICE_INLINE T nccl_ldg(const T* ptr) {
+  return __ldg(ptr);
+}
+
+// Stores the four 32-bit words v0..v3 to `ptr` with one st.volatile.v4.u32.
+// The "memory" clobber tells the compiler this asm writes memory, so ordinary
+// loads/stores around the call are not reordered across it or cached in
+// registers (asm volatile alone does not guarantee that).
+NCCL_DEVICE_INLINE void nccl_st_volatile_v4_u32(void* ptr, uint32_t v0, uint32_t v1, uint32_t v2, uint32_t v3) {
+  asm volatile("st.volatile.v4.u32 [%0],{%1,%2,%3,%4};" ::
+    "l"(ptr), "r"(v0), "r"(v1), "r"(v2), "r"(v3) : "memory");
+}
+
+// Loads four 32-bit words from `ptr` with one ld.volatile.v4.u32 and returns
+// them in v0..v3.
+// The "memory" clobber tells the compiler this asm reads memory, so ordinary
+// loads/stores around the call are not reordered across it (asm volatile alone
+// does not guarantee that).
+NCCL_DEVICE_INLINE void nccl_ld_volatile_v4_u32(const void* ptr, uint32_t& v0, uint32_t& v1, uint32_t& v2, uint32_t& v3) {
+  asm volatile("ld.volatile.v4.u32 {%0,%1,%2,%3},[%4];"
+    : "=r"(v0), "=r"(v1), "=r"(v2), "=r"(v3)
+    : "l"(ptr)
+    : "memory");
+}
+
+// GPU-scope acquire: issues an ld.acquire.gpu load from a function-local
+// __device__ variable, with a "memory" clobber on the asm.
+// NOTE(review): the loaded value is stored back into `dummy`, presumably to
+// keep the load observable; every calling thread writes that same location —
+// confirm this is intended.
+NCCL_DEVICE_INLINE void nccl_fence_acq_gpu() {
+  static __device__ int dummy;
+  int tmp;
+  asm volatile("ld.acquire.gpu.s32 %0,[%1];" : "=r"(tmp) : "l"(&dummy) : "memory");
+  dummy = tmp;
+}
+
+// GPU-scope release: a cuda::atomic_thread_fence with memory_order_release at
+// thread_scope_device.
+NCCL_DEVICE_INLINE void nccl_fence_rel_gpu() {
+  cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_device);
+}
+
+// Adds 1 to the u32 at multimem address `ptr` using
+// multimem.red.release.sys.add.u32. Compiled to a no-op when
+// __CUDA_ARCH__ < 900.
+// The "memory" clobber keeps the compiler from sinking earlier stores below
+// this release-ordered reduction.
+NCCL_DEVICE_INLINE void nccl_multimem_red_release_add_u32(void* ptr) {
+  #if __CUDA_ARCH__ >= 900
+    asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(ptr) : "memory");
+  #else
+    (void)ptr;
+  #endif
+}
+
+// Adds 1 to the u32 at multimem address `ptr` using
+// multimem.red.relaxed.sys.add.u32. Compiled to a no-op when
+// __CUDA_ARCH__ < 900.
+// The "memory" clobber declares that the asm modifies memory, so the compiler
+// does not assume the target location is unchanged across the call.
+NCCL_DEVICE_INLINE void nccl_multimem_red_relaxed_add_u32(void* ptr) {
+  #if __CUDA_ARCH__ >= 900
+    asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(ptr) : "memory");
+  #else
+    (void)ptr;
+  #endif
+}
+
+#endif // NCCL_HIP_PLATFORM
+
+#endif // NCCL_DEVICE_COMPILE
+
+////////////////////////////////////////////////////////////////////////////////
+// Diagnostic macros for NYI features
+////////////////////////////////////////////////////////////////////////////////
+
+// Both macros take a `feature` token that is used only inside a comment, so in
+// every configuration they expand to an empty do { } while(0) statement and
+// emit no diagnostic. They serve as greppable source markers.
+#if defined(NCCL_HIP_PLATFORM) && NCCL_DEVICE_COMPILE
+  // Mark features that are not yet implemented for HIP
+  #define NCCL_HIP_NYI(feature) \
+    do { /* NYI: feature */ } while(0)
+
+  // Warning for features that have no HIP equivalent
+  #define NCCL_HIP_NO_EQUIVALENT(feature) \
+    do { /* NO HIP EQUIVALENT: feature */ } while(0)
+#else
+  #define NCCL_HIP_NYI(feature) do {} while(0)
+  #define NCCL_HIP_NO_EQUIVALENT(feature) do {} while(0)
+#endif
+
+#endif // _NCCL_DEVICE_HIP_COMPAT_H_
diff --git a/projects/rccl/src/include/nccl_device/impl/barrier__funcs.h b/projects/rccl/src/include/nccl_device/impl/barrier__funcs.h
new file mode 100644
index 00000000000..69e1dfda503
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/impl/barrier__funcs.h
@@ -0,0 +1,94 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_BARRIER__FUNCS_H_
+#define _NCCL_DEVICE_BARRIER__FUNCS_H_
+#include "barrier__types.h"
+#include "lsa_barrier__funcs.h"
+#include "gin_barrier__funcs.h"
+#include "../utility.h"
+
+#if __CUDACC__
+// General constructor: the caller supplies both teams and both barrier handles.
+// All three optional members of ncclBarrierSession_internal (gin, inner LSA
+// barrier, outer GIN barrier) are initialized from nccl::utility::present(...)
+// argument packs:
+//   - inner LSA barrier: (coop, gin.comm, innerTeam, innerHandle, index,
+//     multimem, innerMmHandle)
+//   - outer GIN barrier: (coop, gin, outerTeam, outerHandle, index)
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclBarrierSession<Coop>::ncclBarrierSession(
+    Coop coop, ncclTeam innerTeam, ncclTeam outerTeam, ncclGin gin,
+    ncclLsaBarrierHandle innerHandle, ncclGinBarrierHandle outerHandle,
+    uint32_t index, bool multimem, ncclMultimemHandle innerMmHandle
+  ):
+  ncclBarrierSession_internal<Coop>(coop,
+    nccl::utility::present(gin),
+    nccl::utility::present(coop, gin.comm, innerTeam, innerHandle, index, multimem, innerMmHandle),
+    nccl::utility::present(coop, gin, outerTeam, outerHandle, index)
+  ) {
+}
+#endif
+
+#if __CUDACC__
+// World-team constructor (tag ncclTeamTagWorld): delegates to the general
+// constructor using the communicator's defaults — ncclTeamLsa(gin.comm) as the
+// inner team, ncclTeamRail(gin.comm) as the outer team, and the comm's
+// lsaBarrier, railGinBarrier and lsaMultimem handles.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclBarrierSession<Coop>::ncclBarrierSession(
+    Coop coop, ncclTeamTagWorld, ncclGin gin, uint32_t index, bool multimem
+  ):
+  ncclBarrierSession<Coop>(
+    coop, ncclTeamLsa(gin.comm), ncclTeamRail(gin.comm), gin,
+    gin.comm.lsaBarrier, gin.comm.railGinBarrier,
+    index, multimem, gin.comm.lsaMultimem
+  ) {
+}
+#endif
+
+#if __CUDACC__
+// LSA-only constructor (tag ncclTeamTagLsa): only the inner LSA barrier is
+// initialized, from the comm's LSA team, lsaBarrier and lsaMultimem handles.
+// The gin and outer GIN barrier members are initialized with
+// nccl::utility::Absent().
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclBarrierSession<Coop>::ncclBarrierSession(
+    Coop coop, ncclTeamTagLsa, ncclDevComm const& comm, uint32_t index, bool multimem
+  ):
+  ncclBarrierSession_internal<Coop>(coop,
+    nccl::utility::Absent(),
+    nccl::utility::present(coop, comm, ncclTeamLsa(comm), comm.lsaBarrier, index, multimem, comm.lsaMultimem),
+    nccl::utility::Absent()
+  ) {
+}
+#endif
+
+#if __CUDACC__
+// Rail-only constructor (tag ncclTeamTagRail): the gin member and the outer GIN
+// barrier are initialized (rail team and railGinBarrier taken from gin.comm);
+// the inner LSA barrier is initialized with nccl::utility::Absent().
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclBarrierSession<Coop>::ncclBarrierSession(
+    Coop coop, ncclTeamTagRail, ncclGin gin, uint32_t index
+  ):
+  ncclBarrierSession_internal<Coop>(coop,
+    nccl::utility::present(gin),
+    nccl::utility::Absent(),
+    nccl::utility::present(coop, gin, ncclTeamRail(gin.comm), gin.comm.railGinBarrier, index)
+  ) {
+}
+#endif
+
+#if __CUDACC__
+// Returns a reference to the inner LSA barrier session stored in innerLsaBar.
+// The `present` flag is not checked here.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>& ncclBarrierSession<Coop>::lsaBarrier() {
+  return this->innerLsaBar.thing;
+}
+#endif
+
+#if __CUDACC__
+// Returns a reference to the outer GIN barrier session stored in outerGinBar.
+// The `present` flag is not checked here.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclGinBarrierSession<Coop>& ncclBarrierSession<Coop>::ginBarrier() {
+  return this->outerGinBar.thing;
+}
+#endif
+
+#if __CUDACC__
+// Synchronizes through whichever barriers this session holds: the inner LSA
+// barrier first, then the outer GIN barrier.
+// When both are present the requested order is split across the two stages:
+// the inner stage uses releaseOrderOf(ord) and the outer stage uses
+// acquireOrderOf(ord). When only one is present it receives `ord` unchanged.
+// `fence` is forwarded to the GIN barrier only. The unnamed Coop argument is
+// unused; the session's stored coop is passed on instead.
+template<typename Coop>
+NCCL_DEVICE_INLINE void ncclBarrierSession<Coop>::sync(Coop, cuda::memory_order ord, ncclGinFenceLevel fence) {
+  bool const hasInner = this->innerLsaBar.present;
+  bool const hasOuter = this->outerGinBar.present;
+
+  if (hasInner) {
+    auto const innerOrd = hasOuter ? nccl::utility::releaseOrderOf(ord) : ord;
+    this->innerLsaBar.thing.sync(this->coop, innerOrd);
+  }
+
+  if (hasOuter) {
+    auto const outerOrd = hasInner ? nccl::utility::acquireOrderOf(ord) : ord;
+    this->outerGinBar.thing.sync(this->coop, outerOrd, fence);
+  }
+}
+#endif
+
+#endif // _NCCL_DEVICE_BARRIER__FUNCS_H_
diff --git a/projects/rccl/src/include/nccl_device/impl/barrier__types.h b/projects/rccl/src/include/nccl_device/impl/barrier__types.h
new file mode 100644
index 00000000000..2a262a856ae
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/impl/barrier__types.h
@@ -0,0 +1,29 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_BARRIER__TYPES_H_
+#define _NCCL_DEVICE_BARRIER__TYPES_H_
+#include "../barrier.h"
+#include "../utility.h"
+
+#if __CUDACC__
+// Storage base for ncclBarrierSession: the cooperative group plus three
+// optional components. Each Optional is brace-initialized from whatever
+// initializer object the derived constructor passes (in this file:
+// nccl::utility::present(...) or nccl::utility::Absent()).
+template<typename Coop>
+struct ncclBarrierSession_internal {
+  Coop coop;                                                        // group that participates in the barrier
+  nccl::utility::Optional<ncclGin> gin;                             // GIN handle, if any
+  nccl::utility::Optional<ncclLsaBarrierSession<Coop>> innerLsaBar; // inner (LSA) barrier session, if any
+  nccl::utility::Optional<ncclGinBarrierSession<Coop>> outerGinBar; // outer (GIN) barrier session, if any
+
+  // Forwards each initializer to the matching Optional member.
+  template<typename GinInit, typename InnerInit, typename OuterInit>
+  NCCL_DEVICE_INLINE ncclBarrierSession_internal(
+      Coop coop, GinInit ginInit, InnerInit innerInit, OuterInit outerInit
+    ):
+    coop(coop), gin{ginInit}, innerLsaBar{innerInit}, outerGinBar{outerInit} {
+  }
+};
+#endif
+
+#endif // _NCCL_DEVICE_BARRIER__TYPES_H_
diff --git a/projects/rccl/src/include/nccl_device/impl/comm__types.h b/projects/rccl/src/include/nccl_device/impl/comm__types.h
index 058d1013591..5183e8e6386 100644
--- a/projects/rccl/src/include/nccl_device/impl/comm__types.h
+++ b/projects/rccl/src/include/nccl_device/impl/comm__types.h
@@ -8,8 +8,9 @@
 #define _NCCL_DEVICE_COMM__TYPES_H_
 #include "../comm_tmp.h"
 #include "core__types.h"
-#include "mem_barrier__types.h"
 #include "ll_a2a__types.h"
+#include "lsa_barrier__types.h"
+#include "gin_barrier__types.h"
 
 struct ncclDevCommWindowTable;
 #if __cplusplus
@@ -35,6 +36,16 @@ struct ncclDevComm {
 
   ncclMultimemHandle_t lsaMultimem;
   ncclLsaBarrierHandle_t lsaBarrier;
+  ncclGinBarrierHandle_t railGinBarrier;
+
+  uint8_t ginContextCount;
+  uint8_t ginTypes[4];
+  void* ginHandles[4];
+  uint32_t ginSignalBase;
+  int ginSignalCount;
+  uint32_t ginCounterBase;
+  int ginCounterCount;
+  uint64_t* ginSignalShadows;
 };
 
 #endif // _NCCL_DEVICE_COMM__TYPES_H_
diff --git a/projects/rccl/src/include/nccl_device/impl/core__funcs.h b/projects/rccl/src/include/nccl_device/impl/core__funcs.h
index 1087cd28924..b7a7de342ef 100644
--- a/projects/rccl/src/include/nccl_device/impl/core__funcs.h
+++ b/projects/rccl/src/include/nccl_device/impl/core__funcs.h
@@ -153,6 +153,38 @@ NCCL_DEVICE_INLINE void* ncclGetLsaMultimemPointer(ncclWindow_t w, size_t offset
 }
 #endif
 
+#if __CUDACC__
+// Finds the window whose [base, base+size) range contains `ptr` by walking the
+// communicator's chain of window tables.
+//
+// Each table is scanned cooperatively: thread r of the coalesced group checks
+// entries r, r+size, r+2*size, ... below 32. Entries with a zero base, size or
+// window are treated as empty. The group then votes; if any lane matched, the
+// lowest voting lane broadcasts its entry index and every thread returns that
+// entry's window. Otherwise the search continues with t->next.
+//
+// NOTE(review): there is no termination check on `t`; if `ptr` lies in no
+// registered window the loop dereferences whatever `next` holds at the end of
+// the chain — callers presumably guarantee `ptr` is registered.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclWindow_t ncclFindWindow(Coop coop, ncclDevComm const& comm, void const *ptr) {
+  using nccl::utility::loadConst;
+  auto coalesced = ncclCoopCoalesced(coop);
+  uintptr_t const uptr = reinterpret_cast<uintptr_t>(ptr); // loop-invariant
+  ncclDevCommWindowTable* t = comm.windowTable;
+  while (true) {
+    bool found = false;
+    int index = coalesced.thread_rank();
+    #pragma unroll 1
+    while (index < 32) {
+      ncclDevCommWindowTable::Entry e = loadConst(&t->entries[index]);
+      if ((e.base != 0) && (e.size != 0) && (e.window != 0)) {
+        // Unsigned subtraction: a ptr below base wraps to a huge value and
+        // fails the comparison, so one test covers both bounds.
+        if (uptr - uintptr_t(e.base) < uintptr_t(e.size)) {
+          found = true;
+          break;
+        }
+      }
+      index += coalesced.size();
+    }
+    // Keep the full vote width: the HIP compatibility __ballot_sync returns
+    // unsigned long long, and a 32-bit variable would drop lanes 32..63.
+    unsigned long long const votes = __ballot_sync(ncclCoopGetLaneMask(coalesced), found);
+    if (votes != 0) {
+      // Lowest set bit = lowest lane that matched. (__popc(votes-1) is only
+      // correct when exactly one lane voted; with several voters it can name
+      // a lane that did not match, whose `index` is >= 32.)
+      int const source = __ffsll(static_cast<long long>(votes)) - 1;
+      index = __shfl_sync(ncclCoopGetLaneMask(coalesced), index, source);
+      return loadConst(&t->entries[index].window);
+    }
+    t = loadConst(&t->next);
+  }
+}
+#endif
+
 NCCL_HOST_DEVICE_INLINE size_t ncclGetResourceBufferOffset(ncclDevResourceHandle_t h) {
   return ((size_t)h)*128;
 }
diff --git a/projects/rccl/src/include/nccl_device/impl/core__types.h b/projects/rccl/src/include/nccl_device/impl/core__types.h
index 227f15ba8af..a97a0c7f009 100644
--- a/projects/rccl/src/include/nccl_device/impl/core__types.h
+++ b/projects/rccl/src/include/nccl_device/impl/core__types.h
@@ -7,16 +7,18 @@
 #ifndef _NCCL_DEVICE_CORE__TYPES_H_
 #define _NCCL_DEVICE_CORE__TYPES_H_
 #include "../core_tmp.h"
+#include "nccl_device/gin/gin_device_host_common.h"
 
 // nccl.h has: typedef ncclWindow_vidmem* ncclWindow_t;
 struct ncclWindow_vidmem {
   void* winHost;
-  //ncclGinWindow_t ginWin;
   char* lsaFlatBase; // pointer to first byte for rank 0 of lsa team
   int lsaRank;
   int worldRank;
   uint32_t stride4G;
   uint32_t mcOffset4K;
+  uint32_t ginOffset4K; // window offset in 4096-byte units; GIN puts add 4096*ginOffset4K to the caller's offset
+  ncclGinWindow_t ginWins[NCCL_GIN_MAX_CONTEXTS]; // one GIN window per context, indexed by the GIN contextId
 };
 
 struct ncclMultimemHandle {
diff --git a/projects/rccl/src/include/nccl_device/impl/gin__funcs.h b/projects/rccl/src/include/nccl_device/impl/gin__funcs.h
new file mode 100644
index 00000000000..668674282c6
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/impl/gin__funcs.h
@@ -0,0 +1,412 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_SESSION__FUNCS_H_
+#define _NCCL_DEVICE_GIN_SESSION__FUNCS_H_
+#include "gin__types.h"
+#include "ptr__types.h"
+
+#if __CUDACC__
+#include "nccl_device/gin/gin_device_api.h"
+#endif
+
+#if __CUDACC__
+// Binds this GIN object to one of the communicator's GIN contexts.
+// `contextIndex` is reduced modulo comm.ginContextCount without a general
+// division: a count of 3 uses an unsigned %3, any other count is assumed to be
+// a power of two and uses a bit mask. The selected context's backend type,
+// handle and signal-shadow slice (contextId*ginSignalCount entries into
+// comm.ginSignalShadows) are cached in the object.
+// NOTE(review): with ginContextCount == 0 the mask becomes all ones and
+// contextId is left equal to contextIndex, which then indexes ginTypes /
+// ginHandles unchecked — callers presumably construct this only when at least
+// one context exists.
+template<unsigned beMask>
+NCCL_DEVICE_INLINE ncclGin_BackendMask<beMask>::ncclGin_BackendMask(ncclDevComm const& comm, int contextIndex):
+  comm(comm) {
+  this->nContexts = comm.ginContextCount;
+
+  static_assert(NCCL_GIN_MAX_CONTEXTS == 4, "Required for following modulo hack to work.");
+  // this->contextId = contextIndex % comm.ginContextCount;
+  this->contextId = comm.ginContextCount == 3
+    ? uint32_t(contextIndex)%3 // 3 is only non power of 2
+    : contextIndex & (comm.ginContextCount-1); // powers of 2
+
+  this->_ginBackend = comm.ginTypes[this->contextId];
+  this->_ginHandle = comm.ginHandles[this->contextId];
+  this->_signalShadows = comm.ginSignalShadows + this->contextId*comm.ginSignalCount;
+}
+#endif
+
+#if __CUDACC__
+// Assembles the context object handed to ncclGinCall: the cached backend type
+// and handle plus the communicator's rank and rank count.
+template<unsigned beMask>
+NCCL_DEVICE_INLINE ncclGinCtx_M<beMask> ncclGin_BackendMask<beMask>::_makeCtx() const {
+  ncclGinCtx_M<beMask> ctx;
+  // Backend identity first, then the rank information.
+  ctx.backend = (ncclNetDeviceType)this->_ginBackend;
+  ctx.handle = this->_ginHandle;
+  ctx.rank = this->comm.rank;
+  ctx.nRanks = this->comm.nRanks;
+  return ctx;
+}
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// ncclGin descriptor helpers:
+
+#if __CUDACC__
+// Generic fallbacks: any argument type other than ncclGin_DescriptorSmem means
+// "no descriptor" (false / nullptr).
+template<typename Descriptor>
+NCCL_DEVICE_INLINE constexpr bool ncclGin_isDescriptor(Descriptor) { return false; }
+template<typename Descriptor>
+NCCL_DEVICE_INLINE constexpr ncclGinDescriptorSmem* ncclGin_getDescriptor(Descriptor) { return nullptr; }
+
+// Overloads for ncclGin_DescriptorSmem: a descriptor is present and its
+// `descriptor` pointer is returned.
+NCCL_DEVICE_INLINE constexpr bool ncclGin_isDescriptor(ncclGin_DescriptorSmem) { return true; }
+NCCL_DEVICE_INLINE constexpr ncclGinDescriptorSmem* ncclGin_getDescriptor(ncclGin_DescriptorSmem arg) { return arg.descriptor; }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// ncclGin signal helpers:
+
+#if __CUDACC__
+// Generic fallbacks for remote actions that are not signals: not a signal,
+// id -1u, op value 0, op argument 0. The overloads for ncclGin_SignalInc and
+// ncclGin_SignalAdd override these.
+template<typename RemoteAction>
+NCCL_DEVICE_INLINE constexpr bool ncclGin_isSignal(RemoteAction) { return false; }
+template<typename RemoteAction>
+NCCL_DEVICE_INLINE constexpr ncclGinSignal_t ncclGin_getSignalId(ncclGin const&, RemoteAction) { return -1u; }
+template<typename RemoteAction>
+NCCL_DEVICE_INLINE constexpr ncclGinSignalOp_t ncclGin_getSignalOp(RemoteAction) { return (ncclGinSignalOp_t)0; }
+template<typename RemoteAction>
+NCCL_DEVICE_INLINE constexpr uint64_t ncclGin_getSignalOpArg(RemoteAction) { return 0; }
+#endif
+
+#if __CUDACC__
+// ncclGin_SignalInc: a signal action with op ncclGinSignalInc and argument 1.
+// The backend signal id is the comm's ginSignalBase plus the user's signal index.
+NCCL_DEVICE_INLINE constexpr bool ncclGin_isSignal(ncclGin_SignalInc) { return true; }
+NCCL_DEVICE_INLINE constexpr ncclGinSignal_t ncclGin_getSignalId(
+    ncclGin const& net, ncclGin_SignalInc arg
+  ) {
+  return net.comm.ginSignalBase + arg.signal;
+}
+NCCL_DEVICE_INLINE constexpr ncclGinSignalOp_t ncclGin_getSignalOp(ncclGin_SignalInc arg) {
+  return ncclGinSignalInc;
+}
+NCCL_DEVICE_INLINE constexpr uint64_t ncclGin_getSignalOpArg(ncclGin_SignalInc) { return 1; }
+#endif
+
+#if __CUDACC__
+// ncclGin_SignalAdd: a signal action with op ncclGinSignalAdd and argument
+// arg.value. The backend signal id is the comm's ginSignalBase plus the user's
+// signal index.
+NCCL_DEVICE_INLINE constexpr bool ncclGin_isSignal(ncclGin_SignalAdd) { return true; }
+NCCL_DEVICE_INLINE constexpr ncclGinSignal_t ncclGin_getSignalId(
+    ncclGin const& net, ncclGin_SignalAdd arg
+  ) {
+  return net.comm.ginSignalBase + arg.signal;
+}
+NCCL_DEVICE_INLINE constexpr ncclGinSignalOp_t ncclGin_getSignalOp(ncclGin_SignalAdd arg) {
+  return ncclGinSignalAdd;
+}
+NCCL_DEVICE_INLINE constexpr uint64_t ncclGin_getSignalOpArg(ncclGin_SignalAdd arg) { return arg.value; }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// ncclGin counter helpers:
+
+#if __CUDACC__
+// Generic fallbacks for local actions that are not counters: not a counter,
+// id -1u.
+template<typename LocalAction>
+NCCL_DEVICE_INLINE constexpr bool ncclGin_isCounter(LocalAction) { return false; }
+template<typename LocalAction>
+NCCL_DEVICE_INLINE constexpr ncclGinSignal_t ncclGin_getCounterId(ncclGin const&, LocalAction) { return -1u; }
+#endif
+
+#if __CUDACC__
+// ncclGin_CounterInc: a counter action; the backend counter id is the comm's
+// ginCounterBase plus the user's counter index.
+NCCL_DEVICE_INLINE constexpr bool ncclGin_isCounter(ncclGin_CounterInc) { return true; }
+NCCL_DEVICE_INLINE constexpr ncclGinSignal_t ncclGin_getCounterId(ncclGin const& net, ncclGin_CounterInc arg) { return net.comm.ginCounterBase + arg.counter; }
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+#if __CUDACC__
+// Issues a GIN put of `bytes` bytes from (srcWin, srcOffset) to
+// (dstWin, dstOffset) on `peer` of `team`.
+//
+// The whole coop synchronizes before and after; only thread_rank 0 issues the
+// backend call, and it does so as a single-thread coop (ncclCoopThread()).
+// `peer` is translated to a world rank with ncclTeamRankToWorld. Each window
+// contributes its per-context GIN window (ginWins[contextId]) and an offset of
+// 4096*ginOffset4K plus the caller's offset. The remote action, local action
+// and descriptor are decomposed through the ncclGin_is*/ncclGin_get* helpers.
+template<unsigned beMask>
+template<
+  typename RemoteAction, // one of ncclGin_{None|SignalInc}
+  typename LocalAction, // one of ncclGin_{None|CounterInc}
+  typename Coop,
+  typename DescriptorSmem
+>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::put(
+    ncclTeam team, int peer,
+    ncclWindow_t dstWin, size_t dstOffset,
+    ncclWindow_t srcWin, size_t srcOffset, size_t bytes,
+    RemoteAction remoteAction, LocalAction localAction,
+    Coop coop,
+    DescriptorSmem descriptor,
+    cuda::thread_scope requiredRelease,  cuda::thread_scope givenRelease
+  ) const {
+  using nccl::utility::loadConst;
+  ncclGinCtx_M<beMask> ctx = this->_makeCtx();
+  coop.sync();
+  if (coop.thread_rank() == 0) {
+    ncclGinCall<ncclGinApi_Put>(ctx,
+      ncclCoopThread(), ncclTeamRankToWorld(this->comm, team, peer), /*hasWins=*/true,
+      loadConst(&dstWin->ginWins[this->contextId]),
+      4096*size_t(loadConst(&dstWin->ginOffset4K)) + dstOffset,
+      loadConst(&srcWin->ginWins[this->contextId]),
+      4096*size_t(loadConst(&srcWin->ginOffset4K)) + srcOffset, bytes,
+      ncclGin_isSignal(remoteAction),
+      ncclGin_getSignalId(*this, remoteAction),
+      ncclGin_getSignalOp(remoteAction),
+      ncclGin_getSignalOpArg(remoteAction),
+      ncclGin_isCounter(localAction),
+      ncclGin_getCounterId(*this, localAction),
+      ncclGin_isDescriptor(descriptor),
+      ncclGin_getDescriptor(descriptor),
+      requiredRelease,
+      givenRelease
+    );
+  }
+  coop.sync();
+}
+#endif
+
+#if __CUDACC__
+// Typed convenience overload: unpacks the symmetric pointers into
+// (window, offset) pairs, converts the element count to bytes
+// (nElts*sizeof(T)) and forwards to the window/offset put().
+template<unsigned beMask>
+template<
+  typename T,
+  typename RemoteAction, // one of ncclGin_{None|SignalInc}
+  typename LocalAction, // one of ncclGin_{None|CounterInc}
+  typename Coop,
+  typename DescriptorSmem
+>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::put(
+    ncclTeam team, int peer,
+    ncclSymPtr<T> dstElts, ncclSymPtr<T> srcElts, size_t nElts,
+    RemoteAction remoteAction, LocalAction localAction,
+    Coop coop,
+    DescriptorSmem descriptor,
+    cuda::thread_scope requiredRelease,
+    cuda::thread_scope givenRelease
+  ) const {
+  this->put(
+    team, peer, dstElts.window, dstElts.offset, srcElts.window, srcElts.offset, nElts*sizeof(T),
+    remoteAction, localAction, coop, descriptor, requiredRelease, givenRelease
+  );
+}
+#endif
+
+#if __CUDACC__
+// Writes an immediate `value` (sizeof(T) <= 8, enforced at compile time) to
+// (dstWin, dstOffset) on `peer` of `team`.
+//
+// The whole coop synchronizes before and after; only thread_rank 0 issues the
+// backend call, as a single-thread coop. The destination uses the window's
+// per-context GIN window and an offset of 4096*ginOffset4K + dstOffset.
+// There is no local (counter) action for this operation.
+template<unsigned beMask>
+template<
+  typename T,
+  typename RemoteAction, // one of ncclGin_{None|SignalInc}
+  typename Coop,
+  typename DescriptorSmem
+>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::putValue(
+    ncclTeam team, int peer,
+    ncclWindow_t dstWin, size_t dstOffset, T value,
+    RemoteAction remoteAction,
+    Coop coop,
+    DescriptorSmem descriptor,
+    cuda::thread_scope requiredRelease,
+    cuda::thread_scope givenRelease
+  ) const {
+  static_assert(sizeof(T) <= 8, "Required: sizeof(T) <= 8");
+  using nccl::utility::loadConst;
+  coop.sync();
+  if (coop.thread_rank() == 0) {
+    ncclGinCall<ncclGinApi_PutValue>(this->_makeCtx(),
+      ncclCoopThread(), ncclTeamRankToWorld(this->comm, team, peer),
+      loadConst(&dstWin->ginWins[this->contextId]),
+      4096*size_t(loadConst(&dstWin->ginOffset4K)) + dstOffset,
+      value,
+      ncclGin_isSignal(remoteAction),
+      ncclGin_getSignalId(*this, remoteAction),
+      ncclGin_getSignalOp(remoteAction),
+      ncclGin_getSignalOpArg(remoteAction),
+      ncclGin_isDescriptor(descriptor),
+      ncclGin_getDescriptor(descriptor),
+      requiredRelease, givenRelease
+    );
+  }
+  coop.sync();
+}
+#endif
+
+#if __CUDACC__
+// Typed convenience overload: unpacks the symmetric pointer into
+// (window, offset) and forwards to the window/offset putValue().
+template<unsigned beMask>
+template<
+  typename T,
+  typename RemoteAction, // one of ncclGin_{None|SignalInc}
+  typename Coop,
+  typename DescriptorSmem
+>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::putValue(
+    ncclTeam team, int peer,
+    ncclSymPtr<T> dst, T value,
+    RemoteAction remoteAction,
+    Coop coop,
+    DescriptorSmem descriptor,
+    cuda::thread_scope requiredRelease,
+    cuda::thread_scope givenRelease
+  ) const {
+  this->putValue(
+    team, peer, dst.window, dst.offset, value, remoteAction, coop, descriptor, requiredRelease, givenRelease
+  );
+}
+#endif
+
+#if __CUDACC__
+// Sends only a remote action (no payload) to `peer` of `team`.
+// Implemented as a Put call with hasWins=false, null windows, zero offsets and
+// zero bytes, and with no counter. The whole coop synchronizes before and
+// after; only thread_rank 0 issues the backend call, as a single-thread coop.
+template<unsigned beMask>
+template<typename RemoteAction, typename Coop, typename DescriptorSmem>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::signal(
+    ncclTeam team, int peer, RemoteAction action, Coop coop, DescriptorSmem descriptor,
+    cuda::thread_scope requiredRelease,
+    cuda::thread_scope givenRelease
+  ) const {
+  coop.sync();
+  if (coop.thread_rank() == 0) {
+    ncclGinCall<ncclGinApi_Put>(this->_makeCtx(),
+      ncclCoopThread(), ncclTeamRankToWorld(this->comm, team, peer),
+      /*hasWins=*/false, nullptr, 0, nullptr, 0, 0,
+      ncclGin_isSignal(action),
+      ncclGin_getSignalId(*this, action),
+      ncclGin_getSignalOp(action),
+      ncclGin_getSignalOpArg(action),
+      /*hasCounter=*/false, 0,
+      ncclGin_isDescriptor(descriptor),
+      ncclGin_getDescriptor(descriptor),
+      requiredRelease, givenRelease
+    );
+  }
+  coop.sync();
+}
+#endif
+
+#if __CUDACC__
+// Invokes the backend Flush operation. Unlike put/putValue/signal, every
+// thread of the coop makes the call and the full `coop` is passed through; the
+// call is bracketed by coop.sync().
+template<unsigned beMask>
+template<typename Coop>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::flush(Coop coop, cuda::memory_order ord) const {
+  coop.sync();
+  ncclGinCall<ncclGinApi_Flush>(this->_makeCtx(), coop, ord);
+  coop.sync();
+}
+#endif
+
+#if __CUDACC__
+// Blocks the coop until the local counter has reached at least `least`,
+// compared as a rolling value of `bits` significant bits.
+//
+// The coop synchronizes on entry and exit; only thread_rank 0 polls. The
+// counter is addressed as comm.ginCounterBase + counter and loaded with
+// memory order `ord` until nccl::utility::rollingLessEq(least, got, bits)
+// holds.
+//
+// `bits` is now forwarded to rollingLessEq, matching waitSignal and
+// waitSignalMeetShadow; previously it was accepted but ignored, so a caller's
+// explicit width had no effect on the comparison.
+template<unsigned beMask>
+template<typename Coop>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::waitCounter(
+    Coop coop, ncclGinCounter_t counter, uint64_t least, int bits, cuda::memory_order ord
+  ) const {
+  coop.sync();
+  if (coop.thread_rank() == 0) {
+    uint64_t* ptr = ncclGinCall<ncclGinApi_GetCounterPtr>(this->_makeCtx(), this->comm.ginCounterBase + counter);
+    uint64_t got;
+    #pragma unroll 1
+    do got = cuda::atomic_ref<uint64_t>{*ptr}.load(ord);
+    while (!nccl::utility::rollingLessEq(least, got, bits));
+  }
+  coop.sync();
+}
+#endif
+
+#if __CUDACC__
+// Returns the current counter value (comm.ginCounterBase + counter), loaded
+// with memory order `ord` and truncated to its low `bits` bits.
+// NOTE(review): the mask is computed as uint64_t(-1)>>(64-bits), which is only
+// well defined for 1 <= bits <= 64 — callers presumably stay in that range.
+template<unsigned beMask>
+NCCL_DEVICE_INLINE uint64_t ncclGin_BackendMask<beMask>::readCounter(ncclGinCounter_t counter, int bits, cuda::memory_order ord) const {
+  uint64_t* ptr = ncclGinCall<ncclGinApi_GetCounterPtr>(this->_makeCtx(), this->comm.ginCounterBase + counter);
+  uint64_t mask = uint64_t(-1)>>(64-bits);
+  return mask & cuda::atomic_ref<uint64_t>{*ptr}.load(ord);
+}
+#endif
+
+#if __CUDACC__
+// Returns the address of the shadow slot for `signal` within this context's
+// shadow array. The index is the caller's signal number as given
+// (ginSignalBase is not added).
+template<unsigned beMask>
+NCCL_DEVICE_INLINE uint64_t* ncclGin_BackendMask<beMask>::getSignalShadowPtr(ncclGinSignal_t signal) const {
+  return &this->_signalShadows[signal];
+}
+#endif
+
+#if __CUDACC__
+// Atomically adds `delta` to the shadow slot for `signal` with relaxed
+// ordering: workgroup scope via __hip_atomic_fetch_add on AMD, and a
+// red.relaxed.cta.add.u64 instruction otherwise.
+template<unsigned beMask>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::increaseSignalShadow(ncclGinSignal_t signal, uint64_t delta) const {
+#if defined(__HIP_PLATFORM_AMD__)
+  __hip_atomic_fetch_add(this->_signalShadows + signal, delta, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP);
+#else
+  asm volatile("red.relaxed.cta.add.u64 [%0],%1;" :: "l"(this->_signalShadows + signal), "l"(delta) : "memory");
+#endif
+}
+#endif
+
+#if __CUDACC__
+// Returns the current signal value (comm.ginSignalBase + signal), loaded with
+// memory order `ord` and truncated to its low `bits` bits.
+// NOTE(review): the mask is computed as uint64_t(-1)>>(64-bits), which is only
+// well defined for 1 <= bits <= 64 — callers presumably stay in that range.
+template<unsigned beMask>
+NCCL_DEVICE_INLINE uint64_t ncclGin_BackendMask<beMask>::readSignal(ncclGinSignal_t signal, int bits, cuda::memory_order ord) const {
+  uint64_t* ptr = ncclGinCall<ncclGinApi_GetSignalPtr>(this->_makeCtx(), this->comm.ginSignalBase + signal);
+  uint64_t mask = uint64_t(-1)>>(64-bits);
+  return mask & cuda::atomic_ref<uint64_t>{*ptr}.load(ord);
+}
+#endif
+
+#if __CUDACC__
+// Blocks the coop until the signal has reached at least `least`, compared as a
+// rolling value of `bits` bits. The coop synchronizes on entry and exit; only
+// thread_rank 0 polls the signal (comm.ginSignalBase + signal) with memory
+// order `ord`.
+template<unsigned beMask>
+template<typename Coop>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::waitSignal(Coop coop, ncclGinSignal_t signal, uint64_t least, int bits, cuda::memory_order ord) const {
+  coop.sync();
+  if (coop.thread_rank() == 0) {
+    uint64_t* ptr = ncclGinCall<ncclGinApi_GetSignalPtr>(this->_makeCtx(), this->comm.ginSignalBase + signal);
+    uint64_t got;
+    #pragma unroll 1
+    do got = cuda::atomic_ref<uint64_t>{*ptr}.load(ord);
+    while (!nccl::utility::rollingLessEq(least, got, bits));
+  }
+  coop.sync();
+}
+#endif
+
+#if __CUDACC__
+// Blocks the coop until the signal has caught up with its shadow value: the
+// threshold is read once from _signalShadows[signal] and then the signal is
+// polled until rollingLessEq(shadow, got, bits) holds. The coop synchronizes
+// on entry and exit; only thread_rank 0 reads the shadow and polls.
+template<unsigned beMask>
+template<typename Coop>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::waitSignalMeetShadow(Coop coop, ncclGinSignal_t signal, int bits, cuda::memory_order ord) const {
+  coop.sync();
+  if (coop.thread_rank() == 0) {
+    uint64_t* ptr = ncclGinCall<ncclGinApi_GetSignalPtr>(this->_makeCtx(), this->comm.ginSignalBase + signal);
+    uint64_t least = this->_signalShadows[signal];
+    uint64_t got;
+    #pragma unroll 1
+    do got = cuda::atomic_ref<uint64_t>{*ptr}.load(ord);
+    while (!nccl::utility::rollingLessEq(least, got, bits));
+  }
+  coop.sync();
+}
+#endif
+
+#if __CUDACC__
+// Waits until the signal has advanced at least `leastDelta` beyond its shadow
+// value, then reports the result to every thread of the coop.
+//
+// Outputs (both masked to the low `bits` bits):
+//   *before - the shadow value read on entry,
+//   *delta  - the observed signal value minus that shadow value.
+//
+// Every thread reads the shadow after the entry sync; only thread_rank 0 polls
+// the signal. The observed value is then broadcast from rank 0 with
+// ncclCoopBcast (entrySync=false). When the coop fits in a warp and
+// bits <= 32, only the low 32 bits are broadcast.
+//
+// NOTE(review): this function does not write _signalShadows[signal]; the
+// caller presumably advances the shadow (e.g. via increaseSignalShadow).
+// NOTE(review): the masks use shifts of (32-bits)/(64-bits), which are only
+// well defined for bits >= 1.
+template<unsigned beMask>
+template<typename Coop, typename Uint>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::waitSignalFollowShadow(Coop coop, ncclGinSignal_t signal, Uint leastDelta, Uint* before, Uint* delta, int bits, cuda::memory_order ord) const {
+  coop.sync();
+  uint64_t before64 = this->_signalShadows[signal];
+  // Initialized so that non-leader threads do not read an indeterminate value
+  // when passing it to the broadcast below; only rank 0's value is used.
+  uint64_t after64 = 0;
+  if (coop.thread_rank() == 0) {
+    uint64_t* ptr = ncclGinCall<ncclGinApi_GetSignalPtr>(this->_makeCtx(), this->comm.ginSignalBase + signal);
+    #pragma unroll 1
+    do after64 = cuda::atomic_ref<uint64_t>{*ptr}.load(ord);
+    while (!nccl::utility::rollingLessEq(before64 + leastDelta, after64, bits));
+  }
+  if (ncclCoopWithinWarp(coop) && bits <= 32) { // do a single __shfl_sync instead of 2
+    uint32_t mask = uint32_t(-1)>>(32-bits);
+    after64 = ncclCoopBcast(coop, (uint32_t)after64, 0, /*entrySync=*/false);
+    *before = (Uint)(mask & before64);
+    *delta = (Uint)(mask & (after64 - before64));
+  } else {
+    uint64_t mask = uint64_t(-1)>>(64-bits);
+    after64 = ncclCoopBcast(coop, after64, 0, /*entrySync=*/false);
+    *before = (Uint)(mask & before64);
+    *delta = (Uint)(mask & (after64 - before64));
+  }
+}
+#endif
+
+#if __CUDACC__
+// Invokes the backend ResetCounter operation on comm.ginCounterBase + counter.
+// No coop synchronization is performed here.
+template<unsigned beMask>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::resetCounter(ncclGinCounter_t counter) const {
+  ncclGinCall<ncclGinApi_ResetCounter>(this->_makeCtx(), this->comm.ginCounterBase + counter);
+}
+#endif
+
+#if __CUDACC__
+// Invokes the backend ResetSignal operation on comm.ginSignalBase + signal and
+// clears the matching shadow slot with a plain store.
+// No coop synchronization is performed here.
+template<unsigned beMask>
+NCCL_DEVICE_INLINE void ncclGin_BackendMask<beMask>::resetSignal(ncclGinSignal_t signal) const {
+  ncclGinCall<ncclGinApi_ResetSignal>(this->_makeCtx(), this->comm.ginSignalBase + signal);
+  this->_signalShadows[signal] = 0;
+}
+#endif
+
+#endif // _NCCL_DEVICE_GIN_SESSION__FUNCS_H_
diff --git a/projects/rccl/src/include/nccl_device/impl/gin__types.h b/projects/rccl/src/include/nccl_device/impl/gin__types.h
new file mode 100644
index 00000000000..e096d52e3bf
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/impl/gin__types.h
@@ -0,0 +1,10 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_SESSION__TYPES_H_
+#define _NCCL_DEVICE_GIN_SESSION__TYPES_H_
+#include "../gin.h"
+#endif
diff --git a/projects/rccl/src/include/nccl_device/impl/gin_barrier__funcs.h b/projects/rccl/src/include/nccl_device/impl/gin_barrier__funcs.h
new file mode 100644
index 00000000000..3579329c996
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/impl/gin_barrier__funcs.h
@@ -0,0 +1,63 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_BARRIER__FUNCS_H_
+#define _NCCL_DEVICE_GIN_BARRIER__FUNCS_H_
+#include "gin_barrier__types.h"
+#include "comm__types.h"
+
+#if __CUDACC__
+// Opens a session on barrier `barrierIndex` of `handle`.
+// The session signal is handle.signal0 + barrierIndex. The starting epoch is
+// read from this rank's local resource buffer at slot
+// barrierIndex*NCCL_GIN_MAX_CONTEXTS + net.contextId, which is the slot the
+// destructor writes the final epoch back to.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclGinBarrierSession<Coop>::ncclGinBarrierSession(
+    Coop coop, ncclGin net, ncclTeam team, ncclGinBarrierHandle handle, uint32_t barrierIndex
+  ):
+  ncclGinBarrierSession_internal<Coop>{coop, net, team, handle, (int)barrierIndex} {
+  this->signal = handle.signal0 + barrierIndex;
+  uint32_t* savedEpochs = (uint32_t*)ncclGetResourceBufferLocalPointer(net.comm, handle.bufHandle);
+  this->epoch = savedEpochs[barrierIndex*NCCL_GIN_MAX_CONTEXTS + net.contextId];
+}
+
+// Convenience constructor selected by the ncclTeamTagRail tag: delegates to the
+// general constructor with ncclTeamRail(net.comm) as the team and
+// net.comm.railGinBarrier as the barrier handle.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclGinBarrierSession<Coop>::ncclGinBarrierSession(
+    Coop coop, ncclGin net, ncclTeamTagRail, uint32_t barrierIndex
+  ):
+  ncclGinBarrierSession(coop, net, ncclTeamRail(net.comm), net.comm.railGinBarrier, barrierIndex) {
+}
+
+// Closes the session: thread 0 of the coop stores the current epoch into the
+// local resource buffer slot index*NCCL_GIN_MAX_CONTEXTS + net.contextId, then
+// the whole coop synchronizes.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclGinBarrierSession<Coop>::~ncclGinBarrierSession() {
+  bool const isLeader = (this->coop.thread_rank() == 0);
+  if (isLeader) {
+    uint32_t* savedEpochs = (uint32_t*)ncclGetResourceBufferLocalPointer(this->net.comm, this->handle.bufHandle);
+    savedEpochs[this->index*NCCL_GIN_MAX_CONTEXTS + this->net.contextId] = this->epoch;
+  }
+  this->coop.sync();
+}
+#endif
+
+#if __CUDACC__
+// Barrier across all ranks of the session's team, built on GIN signals.
+//  - The Coop parameter is unnamed and unused; the session's stored coop is used.
+//  - ord is split with releaseOrderOf()/acquireOrderOf() into the signalling
+//    and waiting halves.
+//  - fence is not referenced by this implementation.
+template<typename Coop>
+NCCL_DEVICE_INLINE void ncclGinBarrierSession<Coop>::sync(Coop, cuda::memory_order ord, ncclGinFenceLevel fence) {
+  this->coop.sync();
+  // The nRanks-1 peers are spread over the coop's threads: thread t handles
+  // i = t, t+size, ... and issues each signal on its own (ncclCoopThread()).
+  #pragma unroll 1
+  for (int i=this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) {
+    // Peers are visited starting at rank+1 and wrap around at nRanks, so the
+    // local rank itself is never signalled.
+    int peer = 1 + this->team.rank + i;
+    if (this->team.nRanks <= peer) peer -= this->team.nRanks;
+    // Last argument: thread scope when a release ordering was requested,
+    // system scope when the release half of ord is relaxed.
+    // NOTE(review): presumably the scope at which prior writes are already
+    // released — confirm against ncclGin::signal.
+    this->net.signal(
+      this->team, peer, ncclGin_SignalInc{this->signal}, ncclCoopThread(), ncclGin_None(),
+      nccl::utility::releaseOrderOf(ord) != cuda::memory_order_relaxed
+        ? cuda::thread_scope_thread
+        : cuda::thread_scope_system
+    );
+  }
+  // This rank sent one SignalInc to each of the other nRanks-1 ranks; the
+  // expected value of the local signal advances by the same amount.
+  this->epoch += this->team.nRanks-1;
+  // Only thread 0 waits for the signal to reach the new epoch; the trailing
+  // coop.sync() holds the remaining threads until that wait returns.
+  // NOTE(review): 32 is presumably the number of significant bits compared —
+  // confirm against ncclGin::waitSignal.
+  if (this->coop.thread_rank() == 0) {
+    this->net.waitSignal(ncclCoopThread(), this->signal, this->epoch, 32, nccl::utility::acquireOrderOf(ord));
+  }
+  this->coop.sync();
+}
+#endif
+
+#endif // _NCCL_DEVICE_GIN_BARRIER__FUNCS_H_
diff --git a/projects/rccl/src/include/nccl_device/impl/gin_barrier__types.h b/projects/rccl/src/include/nccl_device/impl/gin_barrier__types.h
new file mode 100644
index 00000000000..83e2f636cf6
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/impl/gin_barrier__types.h
@@ -0,0 +1,31 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_GIN_BARRIER__TYPES_H_
+#define _NCCL_DEVICE_GIN_BARRIER__TYPES_H_
+#include "../gin_barrier.h"
+#include "core__types.h"
+#include "gin__types.h"
+
+// Identifies the resources backing a group of GIN barriers; consumed by
+// ncclGinBarrierSession.
+struct ncclGinBarrierHandle {
+  // Base signal id: the session for barrier i uses signal0 + i.
+  ncclGinSignal_t signal0;
+  // Resource buffer in which sessions load and store their epoch values.
+  ncclDevResourceHandle_t bufHandle;
+};
+
+#if __CUDACC__
+// State carried by ncclGinBarrierSession<Coop>. The session constructor
+// aggregate-initializes the first five fields in declaration order
+// (coop, net, team, handle, index) and assigns epoch and signal in its body,
+// so the field order must not change.
+template<typename Coop>
+struct ncclGinBarrierSession_internal {
+  Coop coop;                    // cooperative group running the session
+  ncclGin net;                  // GIN object used for signal()/waitSignal()
+  ncclTeam team;                // ranks participating in the barrier
+  ncclGinBarrierHandle handle;  // barrier resources (base signal, epoch buffer)
+  int index;                    // barrier index within the handle
+  uint32_t epoch;               // signal value expected after the last completed sync
+  ncclGinSignal_t signal;       // handle.signal0 + index
+};
+#endif
+
+#endif // _NCCL_DEVICE_GIN_BARRIER__TYPES_H_
diff --git a/projects/rccl/src/include/nccl_device/impl/lsa_barrier__funcs.h b/projects/rccl/src/include/nccl_device/impl/lsa_barrier__funcs.h
new file mode 100644
index 00000000000..3bc755b1b54
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/impl/lsa_barrier__funcs.h
@@ -0,0 +1,128 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
+#define _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
+#include "lsa_barrier__types.h"
+#include "comm__types.h"
+
+// Force __CUDACC__ to 0 so the `#if __CUDACC__` sections of this header are
+// compiled out. Undefine it first: if the compiler (or an earlier header)
+// already defines the macro with another value, a bare #define would be a
+// macro redefinition. The override remains in effect for code included later.
+#undef __CUDACC__
+#define __CUDACC__ 0
+
+#if __CUDACC__
+// Opens a session on barrier `index` of `handle`.
+// The base aggregate is initialized in field order with epoch set to 0; the
+// real epoch is then loaded from this rank's local state buffer. Multimem and
+// unicast sessions keep separate saved epochs: slot `index` when multimem is
+// set, slot `nBarriers + index` otherwise.
+// With CUDART_VERSION < 12060 the multimem argument is ignored and the session
+// always runs in unicast mode.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>::ncclLsaBarrierSession(
+    Coop coop, ncclDevComm const& comm, ncclTeam team,
+    ncclLsaBarrierHandle handle, uint32_t index,
+    bool multimem, ncclMultimemHandle mmHandle
+  ):
+  ncclLsaBarrierSession_internal<Coop>{
+    coop, comm, team, handle, (int)index,
+#if CUDART_VERSION >= 12060
+    multimem,
+#else // WAR for an issue with ptxas in CTK < 12.6
+    /*multimem=*/false,
+#endif
+    mmHandle, /*epoch=*/0
+  } {
+  uint32_t* state = (uint32_t*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle);
+  // Reads the member this->multimem (not the parameter), so the workaround
+  // above also selects the slot.
+  this->epoch = state[(this->multimem ? 0 : 1)*this->handle.nBarriers + this->index];
+}
+#endif
+
+#if __CUDACC__
+// Convenience constructor selected by the ncclTeamTagLsa tag: delegates to the
+// general constructor with ncclTeamLsa(comm) as the team, comm.lsaBarrier as
+// the barrier handle and comm.lsaMultimem as the multimem handle.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>::ncclLsaBarrierSession(
+    Coop coop, ncclDevComm const& comm, ncclTeamTagLsa, uint32_t index, bool multimem
+  ): ncclLsaBarrierSession(
+    coop, comm, ncclTeamLsa(comm), comm.lsaBarrier, index, multimem, comm.lsaMultimem
+  ) {
+}
+#endif
+
+#if __CUDACC__
+// Closes the session: thread 0 stores the current epoch back into the local
+// state buffer slot the constructor read from
+// ((multimem ? 0 : 1)*nBarriers + index), then the whole coop synchronizes.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclLsaBarrierSession<Coop>::~ncclLsaBarrierSession() {
+  uint32_t* state = (uint32_t*)ncclGetResourceBufferLocalPointer(this->comm, this->handle.bufHandle);
+  if (this->coop.thread_rank() == 0) {
+#if __CUDA_ARCH__ == 1200 && CUDART_VERSION < 13000
+    // WAR for a compiler issue with CTK < 13.0
+    // The index == 0 case is written with the "+ index" term dropped; both
+    // branches store to the same slot for that index.
+    if (this->index == 0)
+      state[(this->multimem ? 0 : 1)*this->handle.nBarriers] = this->epoch;
+    else
+#endif
+    state[(this->multimem ? 0 : 1)*this->handle.nBarriers + this->index] = this->epoch;
+  }
+  this->coop.sync();
+}
+#endif
+
+#if __CUDACC__
+// Arrival half of the barrier: announces to the other ranks that this rank has
+// reached the barrier. The Coop parameter is unnamed and unused; the session's
+// stored coop is used. The release half of `order` (releaseOrderOf) decides
+// whether the announcement is a release or a relaxed operation.
+template<typename Coop>
+NCCL_DEVICE_INLINE void ncclLsaBarrierSession<Coop>::arrive(Coop, cuda::memory_order order) {
+  this->coop.sync();
+  if (this->multimem) {
+  // Multimem path: thread 0 issues one multimem.red add of 1 to the multicast
+  // inbox address. Only compiled for __CUDA_ARCH__ >= 900; on other
+  // architectures this branch is empty.
+  #if __CUDA_ARCH__ >= 900
+    if (this->coop.thread_rank() == 0) {
+      uint32_t* inbox = this->mcInbox(/*multimem=*/true);
+      if (nccl::utility::releaseOrderOf(order) != cuda::memory_order_relaxed) {
+        asm volatile("multimem.red.release.sys.add.u32 [%0],1;" :: "l"(inbox));
+      } else {
+        asm volatile("multimem.red.relaxed.sys.add.u32 [%0],1;" :: "l"(inbox));
+      }
+    }
+  #endif
+  } else {
+    // Unicast path: the nRanks-1 peers are spread over the coop's threads. For
+    // each peer, store epoch+1 into the slot reserved for this rank inside
+    // that peer's inbox (ucInbox(owner=peer, peer=team.rank)).
+    #pragma unroll 1
+    for (int i = this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) {
+      // Map i in [0, nRanks-1) onto all ranks except our own.
+      int peer = i + (this->team.rank <= i ? 1 : 0);
+      cuda::atomic_ref<uint32_t> inbox(*this->ucInbox(peer, this->team.rank));
+      inbox.store(this->epoch+1, nccl::utility::releaseOrderOf(order));
+    }
+  }
+}
+#endif
+
+#if __CUDACC__
+// Waiting half of the barrier: spins until the arrivals for the next epoch are
+// visible, advances the epoch, then synchronizes the coop. The Coop parameter
+// is unnamed and unused. Loads use the acquire half of `order`
+// (acquireOrderOf).
+// Both loops test `got - target <= uint32_t(-1)>>1`, i.e. "got has reached
+// target" evaluated on the unsigned difference so it tolerates counter
+// wrap-around.
+template<typename Coop>
+NCCL_DEVICE_INLINE void ncclLsaBarrierSession<Coop>::wait(Coop, cuda::memory_order order) {
+  if (this->multimem) {
+  // Multimem path: thread 0 polls the local (unicast) view of the multicast
+  // inbox until it has advanced by nRanks. Note that epoch is advanced only
+  // on thread 0 here, and only when compiled for __CUDA_ARCH__ >= 900.
+  #if __CUDA_ARCH__ >= 900
+    if (this->coop.thread_rank() == 0) {
+      cuda::atomic_ref<uint32_t> inbox(*this->mcInbox(/*multimem=*/false));
+      #pragma unroll 1
+      while (true) {
+        uint32_t got = inbox.load(nccl::utility::acquireOrderOf(order));
+        if (got - (this->epoch + this->team.nRanks) <= uint32_t(-1)>>1) break;
+      }
+      this->epoch += this->team.nRanks;
+    }
+  #endif
+  } else {
+    // Unicast path: the nRanks-1 peers are spread over the coop's threads; each
+    // thread polls, in this rank's own inbox, the slot written by that peer
+    // (ucInbox(owner=team.rank, peer)) until it reaches epoch+1.
+    #pragma unroll 1
+    for (int i = this->coop.thread_rank(); i < this->team.nRanks-1; i += this->coop.size()) {
+      // Map i in [0, nRanks-1) onto all ranks except our own.
+      int peer = i + (this->team.rank <= i ? 1 : 0);
+      cuda::atomic_ref<uint32_t> inbox(*this->ucInbox(this->team.rank, peer));
+      #pragma unroll 1
+      while (true) {
+        uint32_t got = inbox.load(nccl::utility::acquireOrderOf(order));
+        if (got - (this->epoch + 1) <= uint32_t(-1)>>1) break;
+      }
+    }
+    // Every thread advances its own copy of the epoch.
+    this->epoch += 1;
+  }
+  this->coop.sync();
+}
+#endif
+
+#if __CUDACC__
+// Full barrier: announce arrival, then wait for the other ranks.
+// `coop` and `order` are forwarded unchanged to both phases.
+template<typename Coop>
+NCCL_DEVICE_INLINE void ncclLsaBarrierSession<Coop>::sync(Coop coop, cuda::memory_order order) {
+  arrive(coop, order);
+  wait(coop, order);
+}
+#endif
+
+#endif // _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
diff --git a/projects/rccl/src/include/nccl_device/impl/lsa_barrier__types.h b/projects/rccl/src/include/nccl_device/impl/lsa_barrier__types.h
new file mode 100644
index 00000000000..b06e704b560
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/impl/lsa_barrier__types.h
@@ -0,0 +1,48 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
+#define _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
+#include "../lsa_barrier.h"
+#include "core__types.h"
+
+// Force __CUDACC__ to 0 so the `#if __CUDACC__` sections of this header are
+// compiled out. Undefine it first: if the compiler (or an earlier header)
+// already defines the macro with another value, a bare #define would be a
+// macro redefinition. The override remains in effect for code included later.
+#undef __CUDACC__
+#define __CUDACC__ 0
+
+// Identifies the state buffer backing a group of LSA barriers.
+// As indexed by ncclLsaBarrierSession, the buffer holds uint32_t words laid
+// out as:
+//   [0, nBarriers)             saved epochs of multimem sessions
+//   [nBarriers, 2*nBarriers)   saved epochs of unicast sessions
+//   [2*nBarriers, 3*nBarriers) multimem inboxes, one per barrier
+//   [3*nBarriers, ...)         unicast inboxes, at index*nRanks + peer
+struct ncclLsaBarrierHandle {
+  ncclDevResourceHandle_t bufHandle;  // resource buffer holding the words above
+  int nBarriers;                      // number of barriers sharing the buffer
+};
+
+#if __CUDACC__
+// State and address helpers behind ncclLsaBarrierSession<Coop>. The session
+// constructor aggregate-initializes the fields in declaration order, so the
+// field order must not change.
+template<typename Coop>
+struct ncclLsaBarrierSession_internal {
+  Coop coop;                    // cooperative group running the session
+  ncclDevComm const& comm;      // communicator, held by reference
+  ncclTeam team;                // ranks participating in the barrier
+  ncclLsaBarrierHandle handle;  // state buffer and barrier count
+  int index;                    // barrier index within the handle
+  bool multimem;                // true: multimem inbox path; false: per-peer unicast inboxes
+  ncclMultimemHandle mmHandle;  // used to resolve the multimem pointer in mcInbox()
+  uint32_t epoch;               // value of the last completed barrier round
+
+  // Address of this barrier's multimem inbox word (offset 2*nBarriers + index).
+  // multimem == true returns it through the multimem mapping of the buffer,
+  // false through this rank's local mapping. The parameter shadows the member
+  // of the same name.
+  NCCL_DEVICE_INLINE uint32_t* mcInbox(bool multimem) {
+    uint32_t* state;
+    if (multimem) { // multicast
+      state = (uint32_t*)ncclGetResourceBufferMultimemPointer(comm, handle.bufHandle, mmHandle);
+    } else { // unicast
+      state = (uint32_t*)ncclGetResourceBufferLocalPointer(comm, handle.bufHandle);
+    }
+    return state + 2*handle.nBarriers + index;
+  }
+
+  // Address, inside rank `owner`'s copy of the buffer, of the unicast inbox
+  // slot assigned to rank `peer` for this barrier
+  // (offset 3*nBarriers + index*nRanks + peer).
+  NCCL_DEVICE_INLINE uint32_t* ucInbox(int owner, int peer) {
+    uint32_t* state = (uint32_t*)ncclGetResourceBufferPeerPointer(comm, handle.bufHandle, team, owner);
+    return state + 3*handle.nBarriers + index*team.nRanks + peer;
+  }
+};
+#endif
+
+#endif // _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
diff --git a/projects/rccl/src/include/nccl_device/impl/mem_barrier__funcs.h b/projects/rccl/src/include/nccl_device/impl/mem_barrier__funcs.h
index 115d0b640ac..44f050520c9 100644
--- a/projects/rccl/src/include/nccl_device/impl/mem_barrier__funcs.h
+++ b/projects/rccl/src/include/nccl_device/impl/mem_barrier__funcs.h
@@ -6,7 +6,7 @@
 
 #ifndef _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
 #define _NCCL_DEVICE_MEM_BARRIER__FUNCS_H_
-#include "mem_barrier__types.h"
+#include "lsa_barrier__types.h"
 #include "comm__types.h"
 #include <atomic>
 
diff --git a/projects/rccl/src/include/nccl_device/impl/mem_barrier__types.h b/projects/rccl/src/include/nccl_device/impl/mem_barrier__types.h
index 8498cd6ba70..ee58ab19a52 100644
--- a/projects/rccl/src/include/nccl_device/impl/mem_barrier__types.h
+++ b/projects/rccl/src/include/nccl_device/impl/mem_barrier__types.h
@@ -6,7 +6,7 @@
 
 #ifndef _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
 #define _NCCL_DEVICE_MEM_BARRIER__TYPES_H_
-#include "../mem_barrier.h"
+#include "../lsa_barrier.h"
 #include "core__types.h"
 
 struct ncclLsaBarrierHandle {
diff --git a/projects/rccl/src/include/nccl_device/ll_a2a.h b/projects/rccl/src/include/nccl_device/ll_a2a.h
index db3a517b752..6d516a47c3d 100644
--- a/projects/rccl/src/include/nccl_device/ll_a2a.h
+++ b/projects/rccl/src/include/nccl_device/ll_a2a.h
@@ -25,7 +25,7 @@ struct ncclLLA2ASession: ncclLLA2ASession_internal<Coop> {
   NCCL_DEVICE_INLINE ~ncclLLA2ASession();
 
   ncclLLA2ASession(ncclLLA2ASession const&) = delete; // Sessions are not copyable
-  
+
   template<typename T>
   NCCL_DEVICE_INLINE void send(int peer, int slot, T data);
 
@@ -41,7 +41,7 @@ struct ncclLLA2ASession: ncclLLA2ASession_internal<Coop> {
   template<int Unroll, typename Elt, typename EltToAcc, typename Reduce>
   NCCL_DEVICE_INLINE auto recvReduce(int eltStart, int eltCount, int eltStride, EltToAcc eltToAcc, Reduce red)
     -> decltype(eltToAcc(nccl::utility::declval<Elt>())) ;
-  
+
   // End an alltoall region. For every peer in team you must have done both of the
   // following each of which can be accomplished using any thread in coop:
   //  1. Targeted that peer with at least one send().
diff --git a/projects/rccl/src/include/nccl_device/lsa_barrier.h b/projects/rccl/src/include/nccl_device/lsa_barrier.h
new file mode 100644
index 00000000000..a21e08435a1
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/lsa_barrier.h
@@ -0,0 +1,39 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_MEM_BARRIER_H_
+#define _NCCL_DEVICE_MEM_BARRIER_H_
+#include "impl/core__types.h"
+#include "core_tmp.h"
+
+// Force __CUDACC__ to 0 so the `#if __CUDACC__` section below (the device-side
+// session API) is compiled out. The override remains in effect for everything
+// included after this header.
+#undef __CUDACC__
+#define __CUDACC__ 0
+
+// Forward declaration; the definition lives in impl/lsa_barrier__types.h.
+struct ncclLsaBarrierHandle;
+
+// Host-side API.
+// NOTE(review): presumably fills *outHandle and *outReq with the handle and the
+// device resource requirements for nBarriers barriers on the given team —
+// confirm against the implementation.
+NCCL_EXTERN_C __host__ ncclResult_t ncclLsaBarrierCreateRequirement(ncclTeam_t, int nBarriers, ncclLsaBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq);
+
+#if __CUDACC__
+// Defined in impl/lsa_barrier__types.h.
+template<typename Coop>
+struct ncclLsaBarrierSession_internal;
+
+// Device-side session on one LSA barrier. Construction loads the barrier's
+// saved epoch and destruction stores it back (see impl/lsa_barrier__funcs.h).
+template<typename Coop>
+struct ncclLsaBarrierSession: ncclLsaBarrierSession_internal<Coop> {
+  // General form: explicit team and barrier handle; multimem selects the
+  // multimem inbox path and mmHandle is the multimem handle it uses.
+  NCCL_DEVICE_INLINE ncclLsaBarrierSession(Coop, ncclDevComm const&, ncclTeam, ncclLsaBarrierHandle, uint32_t index, bool multimem=false, ncclMultimemHandle mmHandle={});
+
+  // Tag form: uses the communicator's LSA team, barrier handle and multimem handle.
+  NCCL_DEVICE_INLINE ncclLsaBarrierSession(Coop, ncclDevComm const&, ncclTeamTagLsa, uint32_t index, bool multimem=false);
+
+  NCCL_DEVICE_INLINE ~ncclLsaBarrierSession();
+
+  ncclLsaBarrierSession(ncclLsaBarrierSession const&) = delete; // Sessions are not copyable
+
+  // arrive() announces this rank, wait() blocks until the other ranks have
+  // arrived, sync() is arrive() followed by wait().
+  NCCL_DEVICE_INLINE void arrive(Coop, cuda::memory_order);
+  NCCL_DEVICE_INLINE void wait(Coop, cuda::memory_order);
+  NCCL_DEVICE_INLINE void sync(Coop, cuda::memory_order);
+};
+#endif
+
+#endif // _NCCL_DEVICE_MEM_BARRIER_H_
diff --git a/projects/rccl/src/include/nccl_device/net_device.h b/projects/rccl/src/include/nccl_device/net_device.h
new file mode 100644
index 00000000000..423f1027abd
--- /dev/null
+++ b/projects/rccl/src/include/nccl_device/net_device.h
@@ -0,0 +1,38 @@
+/*************************************************************************
+ * Copyright (c) 2023-2023, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NET_DEVICE_H_
+#define NCCL_NET_DEVICE_H_
+
+#define NCCL_NET_DEVICE_INVALID_VERSION      0x0
+#define NCCL_NET_MTU_SIZE                    4096
+
+// Arbitrary version number - A given NCCL build will only be compatible with a single device networking plugin
+// version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
+#define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
+
+// Network offload type reported in a device handle (netDeviceType).
+// NOTE(review): GIN_PROXY / GIN_GDAKI presumably correspond to the
+// ncclGinIbProxy / ncclGinIbGdaki backends — confirm.
+typedef enum {
+  NCCL_NET_DEVICE_HOST=0,
+  NCCL_NET_DEVICE_UNPACK=1,
+  NCCL_NET_DEVICE_GIN_PROXY=2,
+  NCCL_NET_DEVICE_GIN_GDAKI=3,
+} ncclNetDeviceType;
+
+// Device offload handle, version 7 layout (reused unchanged by later versions).
+// NOTE(review): handle/size presumably describe an opaque plugin-owned object
+// and its size in bytes, and needsProxyProgress whether the host proxy must
+// keep progressing it — confirm against the plugin API documentation.
+typedef struct {
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+  void* handle;
+  size_t size;
+  int needsProxyProgress;
+} ncclNetDeviceHandle_v7_t;
+
+// Versions 8 through 11 reuse the v7 layout; each is an alias of the previous
+// one. ncclNetDeviceHandle_t is the current (v11) name.
+typedef ncclNetDeviceHandle_v7_t ncclNetDeviceHandle_v8_t;
+typedef ncclNetDeviceHandle_v8_t ncclNetDeviceHandle_v9_t;
+typedef ncclNetDeviceHandle_v9_t ncclNetDeviceHandle_v10_t;
+typedef ncclNetDeviceHandle_v10_t ncclNetDeviceHandle_v11_t;
+typedef ncclNetDeviceHandle_v11_t ncclNetDeviceHandle_t;
+
+#endif
diff --git a/projects/rccl/src/include/nccl_device/utility.h b/projects/rccl/src/include/nccl_device/utility.h
index eac75a218aa..08690659906 100644
--- a/projects/rccl/src/include/nccl_device/utility.h
+++ b/projects/rccl/src/include/nccl_device/utility.h
@@ -1,5 +1,6 @@
 /*************************************************************************
  * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ * Modifications Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/
@@ -7,16 +8,7 @@
 #ifndef _NCCL_DEVICE_UTILITY_H_
 #define _NCCL_DEVICE_UTILITY_H_
 
-#if __CUDACC__
-  #define NCCL_DEVICE_INLINE __device__ __forceinline__
-  #define NCCL_HOST_DEVICE_INLINE __host__ __device__ __forceinline__
-#else
-  #ifndef __host__
-    #define __host__
-  #endif
-  #define NCCL_DEVICE_INLINE
-  #define NCCL_HOST_DEVICE_INLINE inline __attribute__((always_inline))
-#endif
+#include "hip_compat.h"
 
 #if __cplusplus
 #define NCCL_EXTERN_C extern "C"
@@ -44,6 +36,19 @@ NCCL_HOST_DEVICE_INLINE T&& declval() noexcept {
   static_assert(sizeof(T)!=sizeof(T), "You can't evaluate declval.");
 }
 
+// Lifts the compile-time constant `value_` of type T into a distinct type;
+// the constant is available again as ValueAsType<T, value_>::value.
+template<typename T, T value_>
+struct ValueAsType { static constexpr T value = value_; };
+
+// Returns the value zero but the compiler cannot prove that it is zero so it
+// is useful to inhibit compiler optimizations.
+#if __CUDACC__
+// NOTE(review): the defaulted template parameter presumably exists only to make
+// this a template so the function-local static can live in a header — confirm.
+template<typename=void>
+NCCL_DEVICE_INLINE int opaqueZero() {
+  // The zero is read back from a static device variable through __ldg instead
+  // of being returned as a literal.
+  __device__ static int zero = 0;
+  return __ldg(&zero);
+}
+#endif
+
 template<typename X, typename Y, typename Z = decltype(X()+Y())>
 NCCL_HOST_DEVICE_INLINE constexpr Z divUp(X x, Y y) {
   return (x+y-1)/y;
@@ -102,6 +107,17 @@ NCCL_HOST_DEVICE_INLINE constexpr bool isPow2(Int x) {
   return (x & (x-1)) == 0;
 }
 
+// Wrap-around aware "a <= b" for counters that are only meaningful in their
+// low nBits bits: true when the forward distance from a to b, taken modulo
+// 2^nBits, lies in the lower half of that range.
+// nBits defaults to the full width of Uint.
+template<typename Uint>
+NCCL_HOST_DEVICE_INLINE bool rollingLessEq(Uint a, Uint b, int nBits = 8*sizeof(Uint)) {
+  static_assert(Uint(0) < Uint(-1), "Uint must be unsigned.");
+  // Low nBits bits set.
+  Uint const mask = Uint(-1) >> (8*sizeof(Uint) - nBits);
+  // Forward distance from a to b within the nBits-wide window.
+  Uint const distance = (b-a) & mask;
+  Uint const halfRange = mask>>1;
+  return distance <= halfRange;
+}
+// Wrap-around aware "a < b": the negation of rollingLessEq(b, a, nBits).
+template<typename Uint>
+NCCL_HOST_DEVICE_INLINE bool rollingLessThan(Uint a, Uint b, int nBits = 8*sizeof(Uint)) {
+  bool const bLessEqA = rollingLessEq(b, a, nBits);
+  return !bLessEqA;
+}
+
 // Produce the reciprocal of x for use in idivByRcp
 NCCL_HOST_DEVICE_INLINE constexpr uint32_t idivRcp32(uint32_t x) {
   return uint32_t(-1)/x + isPow2(x);
@@ -111,15 +127,15 @@ NCCL_HOST_DEVICE_INLINE constexpr uint64_t idivRcp64(uint64_t x) {
 }
 
 NCCL_HOST_DEVICE_INLINE uint32_t mul32hi(uint32_t a, uint32_t b) {
-#if __CUDA_ARCH__
-  return __umulhi(a, b);
+#if NCCL_DEVICE_ARCH
+  return nccl_umulhi(a, b);
 #else
   return uint64_t(a)*b >> 32;
 #endif
 }
 NCCL_HOST_DEVICE_INLINE uint64_t mul64hi(uint64_t a, uint64_t b) {
-#if __CUDA_ARCH__
-  return __umul64hi(a, b);
+#if NCCL_DEVICE_ARCH
+  return nccl_umul64hi(a, b);
 #else
   return (uint64_t)(((unsigned __int128)a)*b >> 64);
 #endif
@@ -183,7 +199,7 @@ NCCL_HOST_DEVICE_INLINE uint32_t imodFast64(uint64_t x, uint64_t y, uint64_t yrc
   return r;
 }
 
-#if __CUDACC__
+#if NCCL_DEVICE_COMPILE
 // Precomputed integer reciprocoals for denominator values 1..64 inclusive.
 // Pass these to idivFast64() for fast division on the GPU.
 NCCL_DEVICE_INLINE uint64_t idivRcp64_upto64(int x) {
@@ -210,33 +226,12 @@ NCCL_DEVICE_INLINE uint64_t idivRcp64_upto64(int x) {
 }
 #endif
 
-#if __CUDACC__
+#if NCCL_DEVICE_COMPILE
 NCCL_DEVICE_INLINE uint32_t idivRcp32_upto64(int x) {
   return idivRcp64_upto64(x)>>32;
 }
 #endif
 
-#if __CUDACC__
-NCCL_DEVICE_INLINE void fenceAcquireGpu() {
-  static __device__ int dummy;
-  int tmp;
-#if __HIP_PLATFORM_AMD__
-  tmp = __atomic_load_n(&dummy, __ATOMIC_ACQUIRE);
-  __threadfence();
-#else
-  asm volatile("ld.acquire.gpu.s32 %0,[%1];" : "=r"(tmp) : "l"(&dummy) : "memory");
-#endif
-  dummy = tmp;
-}
-NCCL_DEVICE_INLINE void fenceReleaseGpu() {
-#if __HIP_PLATFORM_AMD__
-  __threadfence();
-#else
-  cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_device);
-#endif
-}
-#endif
-
 #if __CUDACC__
 #if __HIP_PLATFORM_AMD__
 NCCL_HOST_DEVICE_INLINE constexpr std::memory_order acquireOrderOf(std::memory_order ord) {
@@ -260,6 +255,28 @@ NCCL_HOST_DEVICE_INLINE constexpr int toAtomicBuiltinOrder(std::memory_order ord
     default: return __ATOMIC_SEQ_CST;
   }
 }
+
+// Acquire half of a memory order: release becomes relaxed, acq_rel becomes
+// acquire, every other order is returned unchanged.
+NCCL_HOST_DEVICE_INLINE constexpr cuda::memory_order acquireOrderOf(cuda::memory_order ord) {
+  if (ord == cuda::memory_order_release) return cuda::memory_order_relaxed;
+  if (ord == cuda::memory_order_acq_rel) return cuda::memory_order_acquire;
+  return ord;
+}
+// Release half of a memory order: acquire becomes relaxed, acq_rel becomes
+// release, every other order is returned unchanged.
+NCCL_HOST_DEVICE_INLINE constexpr cuda::memory_order releaseOrderOf(cuda::memory_order ord) {
+  if (ord == cuda::memory_order_acquire) return cuda::memory_order_relaxed;
+  if (ord == cuda::memory_order_acq_rel) return cuda::memory_order_release;
+  return ord;
+}
+
+// Converts a std::memory_order into the cuda::memory_order of the same name.
+// Any order without an explicit mapping below is treated as seq_cst.
+NCCL_HOST_DEVICE_INLINE constexpr cuda::memory_order toCudaOrder(std::memory_order ord) {
+  if (ord == std::memory_order_relaxed) return cuda::memory_order_relaxed;
+  if (ord == std::memory_order_acquire) return cuda::memory_order_acquire;
+  if (ord == std::memory_order_release) return cuda::memory_order_release;
+  if (ord == std::memory_order_acq_rel) return cuda::memory_order_acq_rel;
+  // seq_cst itself and every remaining value.
+  return cuda::memory_order_seq_cst;
+}
 #else
 NCCL_HOST_DEVICE_INLINE constexpr cuda::memory_order acquireOrderOf(cuda::memory_order ord) {
   return ord == cuda::memory_order_release ? cuda::memory_order_relaxed :
@@ -275,41 +292,96 @@ NCCL_HOST_DEVICE_INLINE constexpr cuda::memory_order releaseOrderOf(cuda::memory
 #endif
 
 #if __CUDACC__
+// Acquire fence built from an acquire load of a dummy device variable:
+//  - AMD: __atomic_load_n(..., __ATOMIC_ACQUIRE) followed by __threadfence().
+//  - otherwise: a PTX ld.acquire.gpu load with a "memory" clobber.
+NCCL_DEVICE_INLINE void fenceAcquireGpu() {
+  static __device__ int dummy;
+  int tmp;
+#if __HIP_PLATFORM_AMD__
+  tmp = __atomic_load_n(&dummy, __ATOMIC_ACQUIRE);
+  __threadfence();
+#else
+  asm volatile("ld.acquire.gpu.s32 %0,[%1];" : "=r"(tmp) : "l"(&dummy) : "memory");
+#endif
+  // NOTE(review): the loaded value is stored back, presumably so the load has
+  // a visible use and is not discarded — confirm.
+  dummy = tmp;
+}
+// Release fence: __threadfence() on AMD, otherwise a device-scope
+// cuda::atomic_thread_fence with memory_order_release.
+NCCL_DEVICE_INLINE void fenceReleaseGpu() {
+#if __HIP_PLATFORM_AMD__
+  __threadfence();
+#else
+  cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_device);
+#endif
+}
+#endif
+
+#if __CUDACC__
+// Atomically loads *ptr with memory order `ord`, choosing the
+// cuda::atomic_ref scope at run time from `scope`.
+// `scope` must be one of thread, block, device or system; any other value is
+// declared unreachable.
+template<typename T>
+NCCL_DEVICE_INLINE T atomicLoad(T* ptr, cuda::memory_order ord, cuda::thread_scope scope) {
+  if (scope == cuda::thread_scope_thread) {
+    return cuda::atomic_ref<T, cuda::thread_scope_thread>{*ptr}.load(ord);
+  }
+  if (scope == cuda::thread_scope_block) {
+    return cuda::atomic_ref<T, cuda::thread_scope_block>{*ptr}.load(ord);
+  }
+  if (scope == cuda::thread_scope_device) {
+    return cuda::atomic_ref<T, cuda::thread_scope_device>{*ptr}.load(ord);
+  }
+  if (scope == cuda::thread_scope_system) {
+    return cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.load(ord);
+  }
+  __builtin_unreachable();
+}
+#endif
+
+#if __CUDACC__
+// Atomically stores `val` to *ptr with memory order `ord`, choosing the
+// cuda::atomic_ref scope at run time from `scope`.
+// `scope` must be one of thread, block, device or system; any other value is
+// declared unreachable.
+template<typename T>
+NCCL_DEVICE_INLINE void atomicStore(T* ptr, T val, cuda::memory_order ord, cuda::thread_scope scope) {
+  if (scope == cuda::thread_scope_thread) {
+    cuda::atomic_ref<T, cuda::thread_scope_thread>{*ptr}.store(val, ord);
+  } else if (scope == cuda::thread_scope_block) {
+    cuda::atomic_ref<T, cuda::thread_scope_block>{*ptr}.store(val, ord);
+  } else if (scope == cuda::thread_scope_device) {
+    cuda::atomic_ref<T, cuda::thread_scope_device>{*ptr}.store(val, ord);
+  } else if (scope == cuda::thread_scope_system) {
+    cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.store(val, ord);
+  } else {
+    __builtin_unreachable();
+  }
+}
+#endif
+
+#if NCCL_DEVICE_COMPILE
 NCCL_DEVICE_INLINE int lane() {
-  int ret;
-  asm("mov.u32 %0, %%laneid;" : "=r"(ret));
-  return ret;
+  return nccl_lane_id();
 }
 NCCL_DEVICE_INLINE unsigned int lanemask_lt() {
-  unsigned int ret;
-  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(ret));
-  return ret;
+  return nccl_lanemask_lt();
 }
 #endif
 
-#if __CUDACC__
+#if NCCL_DEVICE_COMPILE
 // Load anything, but cache like its constant memory.
 template<typename T>
 NCCL_DEVICE_INLINE T loadConst(T const *p) {
   if (alignof(T) == 1) {
     union { uint8_t part[sizeof(T)]; T ret; };
-    for (int i=0; i < (int)sizeof(T); i++) part[i] = __ldg((uint8_t const*)p + i);
+    for (int i=0; i < (int)sizeof(T); i++) part[i] = nccl_ldg((uint8_t const*)p + i);
     return ret;
   } else if (alignof(T) == 2) {
     union { uint16_t part[sizeof(T)/2]; T ret; };
-    for (int i=0; i < (int)sizeof(T)/2; i++) part[i] = __ldg((uint16_t const*)p + i);
+    for (int i=0; i < (int)sizeof(T)/2; i++) part[i] = nccl_ldg((uint16_t const*)p + i);
     return ret;
   } else if (alignof(T) == 4) {
     union { uint32_t part[sizeof(T)/4]; T ret; };
-    for (int i=0; i < (int)sizeof(T)/4; i++) part[i] = __ldg((uint32_t const*)p + i);
+    for (int i=0; i < (int)sizeof(T)/4; i++) part[i] = nccl_ldg((uint32_t const*)p + i);
     return ret;
   } else if (alignof(T) == 8) {
     union { uint64_t part[sizeof(T)/8]; T ret; };
-    for (int i=0; i < (int)sizeof(T)/8; i++) part[i] = __ldg((uint64_t const*)p + i);
+    for (int i=0; i < (int)sizeof(T)/8; i++) part[i] = nccl_ldg((uint64_t const*)p + i);
     return ret;
   } else { // alignof(T) >= 16
     union { ulonglong2 part[sizeof(T)/16]; T ret; };
-    for (int i=0; i < (int)sizeof(T)/16; i++) part[i] = __ldg((ulonglong2 const*)p + i);
+    for (int i=0; i < (int)sizeof(T)/16; i++) part[i] = nccl_ldg((ulonglong2 const*)p + i);
     return ret;
   }
 }
@@ -376,7 +448,7 @@ struct Optional {
   // Construct with present thing:
   template<typename ...Arg>
   NCCL_HOST_DEVICE_INLINE Optional(Present<Arg...> args):
-    Optional(args, IntSeqUpTo<sizeof...(Arg), 0>::Type()) {
+    Optional(args, typename IntSeqUpTo<sizeof...(Arg), 0>::Type()) {
   }
 
   NCCL_HOST_DEVICE_INLINE ~Optional() {
diff --git a/projects/rccl/src/include/net.h b/projects/rccl/src/include/net.h
index 550c6ed3236..da92381f127 100644
--- a/projects/rccl/src/include/net.h
+++ b/projects/rccl/src/include/net.h
@@ -17,6 +17,7 @@
 typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
 
 ncclResult_t ncclNetInit(struct ncclComm* comm);
+ncclResult_t ncclNetInitFromParent(struct ncclComm* comm, struct ncclComm* parent);
 ncclResult_t ncclNetFinalize(struct ncclComm* comm);
 ncclResult_t ncclNetGetDevCount(int netPluginIndex, int* nPhysDev, int* nVirtDev);
 ncclResult_t ncclNetSetVirtDevCount(int netPluginIndex, int nVirtDev);
@@ -28,6 +29,8 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport);
 
 extern ncclNet_t ncclNetIb;
 extern ncclNet_t ncclNetSocket;
+extern ncclGin_t ncclGinIbGdaki;
+extern ncclGin_t ncclGinIbProxy;
 
 extern ncclResult_t rcclNetP2pPolicy(void* handle, int isP2p);
 extern int64_t ncclParamDmaBufEnable();
diff --git a/projects/rccl/src/include/net_device.h b/projects/rccl/src/include/net_device.h
index 99ae9c38bde..423f1027abd 100644
--- a/projects/rccl/src/include/net_device.h
+++ b/projects/rccl/src/include/net_device.h
@@ -14,7 +14,12 @@
 // version. NCCL will check the supplied version number from net->getProperties() and compare to its internal version.
 #define NCCL_NET_DEVICE_UNPACK_VERSION 0x7
 
-typedef enum {NCCL_NET_DEVICE_HOST=0, NCCL_NET_DEVICE_UNPACK=1} ncclNetDeviceType;
+// Network offload type reported in a device handle (netDeviceType).
+// NOTE(review): GIN_PROXY / GIN_GDAKI presumably correspond to the
+// ncclGinIbProxy / ncclGinIbGdaki backends — confirm.
+typedef enum {
+  NCCL_NET_DEVICE_HOST=0,
+  NCCL_NET_DEVICE_UNPACK=1,
+  NCCL_NET_DEVICE_GIN_PROXY=2,
+  NCCL_NET_DEVICE_GIN_GDAKI=3,
+} ncclNetDeviceType;
 
 typedef struct {
   ncclNetDeviceType netDeviceType; // Network offload type
diff --git a/projects/rccl/src/include/nvtx.h b/projects/rccl/src/include/nvtx.h
index 1146a0f8565..207b045813f 100644
--- a/projects/rccl/src/include/nvtx.h
+++ b/projects/rccl/src/include/nvtx.h
@@ -40,10 +40,11 @@
 #define NVTX_SID_CommSplit            18
 #define NVTX_SID_CommFinalize         19
 #define NVTX_SID_CommShrink           20
+#define NVTX_SID_CommRevoke           21
 // When adding new schema IDs, DO NOT re-use/overlap with the enum schema ID below!
 
 // Define static schema ID for the reduction operation.
-#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 21 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
+#define NVTX_PAYLOAD_ENTRY_NCCL_REDOP 22 + NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START
 
 extern const nvtxDomainHandle_t ncclNvtxDomainHandle;
 
diff --git a/projects/rccl/src/include/nvtx_payload_schemas.h b/projects/rccl/src/include/nvtx_payload_schemas.h
index 1e6db095484..87005ce3ab2 100644
--- a/projects/rccl/src/include/nvtx_payload_schemas.h
+++ b/projects/rccl/src/include/nvtx_payload_schemas.h
@@ -52,11 +52,12 @@ NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommInitRank, static c
   )
 )
 // The typedef and payload schema for ncclCommInitRank is also used for,
-// ncclCommInitRankConfig, ncclCommInitRankScalable, ncclCommDestroy, and ncclCommAbort.
+// ncclCommInitRankConfig, ncclCommInitRankScalable, ncclCommDestroy, ncclCommAbort, and ncclCommRevoke.
 typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankConfig;
 typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommInitRankScalable;
 typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommAbort;
 typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommDestroy;
+typedef NcclNvtxParamsCommInitRank NcclNvtxParamsCommRevoke;
 
 NCCL_NVTX_DEFINE_STRUCT_WITH_SCHEMA_ENTRIES(NcclNvtxParamsCommSplit, static constexpr,
   NCCL_NVTX_PAYLOAD_ENTRIES(
diff --git a/projects/rccl/src/include/plugin/env/env_v1.h b/projects/rccl/src/include/plugin/env/env_v1.h
new file mode 100644
index 00000000000..eb6aa09ae9f
--- /dev/null
+++ b/projects/rccl/src/include/plugin/env/env_v1.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ */
+
+#ifndef ENV_V1_H_
+#define ENV_V1_H_
+
+#include "nccl.h"
+
+// Environment plugin interface, version 1: a table of callbacks through which
+// NCCL initializes the plugin, queries variables and shuts the plugin down.
+// NOTE(review): uint8_t needs <stdint.h>; this header only includes "nccl.h"
+// and relies on it to provide the type — confirm.
+typedef struct {
+  // NOTE(review): presumably the plugin's name, used for logging — confirm.
+  const char* name;
+  // Initialize the environment plugin
+  // Input
+  //  - ncclMajor: NCCL major version number
+  //  - ncclMinor: NCCL minor version number
+  //  - ncclPatch: NCCL patch version number
+  //  - suffix: NCCL version suffix string
+  ncclResult_t (*init)(uint8_t ncclMajor, uint8_t ncclMinor, uint8_t ncclPatch, const char* suffix);
+  // Finalize the environment plugin
+  ncclResult_t (*finalize)(void);
+  // Get environment variable value
+  // Input
+  //  - name: environment variable name
+  // Output
+  //  - returns: pointer to environment variable value string, or NULL if not found. The plugin is responsible for keeping the
+  //             returned value (address) valid until it is no longer needed by NCCL. This happens when NCCL calls ``finalize``
+  //             or ``getEnv`` again on the same variable name. In any other case, modifying the variable (e.g., through
+  //             ``setenv``) is considered undefined behavior since NCCL might access the returned address after the plugin has
+  //             reset the variable.
+  const char* (*getEnv)(const char* name);
+} ncclEnv_v1_t;
+
+#endif
diff --git a/projects/rccl/src/include/plugin/nccl_env.h b/projects/rccl/src/include/plugin/nccl_env.h
new file mode 100644
index 00000000000..3cb85a6ee13
--- /dev/null
+++ b/projects/rccl/src/include/plugin/nccl_env.h
@@ -0,0 +1,16 @@
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_ENV_H_
+#define NCCL_ENV_H_
+
+#include "env/env_v1.h"
+
+// Current environment plugin interface: an alias of the v1 table.
+typedef ncclEnv_v1_t ncclEnv_t;
+
+// Name of the versioned plugin symbol.
+// NOTE(review): presumably the symbol looked up in the plugin library to
+// obtain its ncclEnv_v1_t table — confirm against the plugin loader.
+#define NCCL_ENV_PLUGIN_SYMBOL ncclEnvPlugin_v1
+
+#endif // end include guard
diff --git a/projects/rccl/src/include/plugin/nccl_net.h b/projects/rccl/src/include/plugin/nccl_net.h
index a234e53ae72..f310c778b50 100644
--- a/projects/rccl/src/include/plugin/nccl_net.h
+++ b/projects/rccl/src/include/plugin/nccl_net.h
@@ -9,7 +9,7 @@
 
 #include "nccl.h"
 #include "nccl_common.h"
-#include "net_device.h"
+#include "nccl_device/net_device.h"
 #include <stdint.h>
 #include <dlfcn.h>
 
@@ -26,6 +26,10 @@
 #define NCCL_PTR_CUDA 0x2
 #define NCCL_PTR_DMABUF 0x4
 
+#define NCCL_NET_MR_FLAG_FORCE_SO (1 << 0)
+#define NCCL_NET_SIGNAL_OP_INC 0x1
+#define NCCL_NET_SIGNAL_OP_ADD 0x2
+
 // Maximum number of requests per comm object
 #define NCCL_NET_MAX_REQUESTS 32
 
@@ -45,6 +49,7 @@
 
 typedef ncclNet_v11_t ncclNet_t;
 typedef ncclCollNet_v11_t ncclCollNet_t;
+typedef ncclGin_v11_t ncclGin_t;
 typedef ncclNetSGE_v11_t ncclNetSGE_t;
 typedef ncclNetProperties_v11_t ncclNetProperties_t;
 typedef ncclNetAttr_v11_t ncclNetAttr_t;
diff --git a/projects/rccl/src/include/plugin/net/net_v11.h b/projects/rccl/src/include/plugin/net/net_v11.h
index 68e100637e6..b7ea1c7e7d2 100644
--- a/projects/rccl/src/include/plugin/net/net_v11.h
+++ b/projects/rccl/src/include/plugin/net/net_v11.h
@@ -17,7 +17,6 @@ typedef struct {
   int trafficClass;
 } ncclNetCommConfig_v11_t;
 
-
 typedef struct {
   char* name;                      // Used mostly for logging.
   char* pciPath;                   // Path to the PCI device in /sys.
@@ -185,4 +184,53 @@ typedef struct {
   ncclResult_t (*finalize)(void* ctx);
 } ncclCollNet_v11_t;
 
+typedef struct {  // Function table for GIN support (v11 plugin API); nccl_net.h aliases it as ncclGin_t.
+  // Name of the GIN support (mainly for logs)
+  const char* name;
+  // Initialize the GIN support. *ctx is presumably the context later given to listen(), connect() and finalize().
+  ncclResult_t (*init)(void** ctx, uint64_t commId, ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing GIN operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v11_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(void* ctx, int dev, void* handle, void** listenComm);
+  // Create a group for GIN operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* ctx, void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Create device-side GIN context. devHandle will be passed to device code.
+  // This function is not used in GIN_PROXY mode.
+  ncclResult_t (*createContext)(void* collComm, int nSignals, int nCounters, void** ginCtx, ncclNetDeviceHandle_v11_t** devHandle);
+  // Collective memory registration. The DmaBuf variant additionally takes the buffer's fd and offset.
+  ncclResult_t (*regMrSym)(void* collComm, void* data, size_t size, int type, uint64_t mrFlags, void** mhandle, void **ginHandle);
+  ncclResult_t (*regMrSymDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, uint64_t mrFlags, void** mhandle, void **ginHandle);
+  ncclResult_t (*deregMrSym)(void* collComm, void* mhandle);
+  // Close and free collective comm objects: the ginCtx, collComm and listenComm produced by the calls above.
+  ncclResult_t (*destroyContext)(void* ginCtx);
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Put operations. *request is the handle to poll with test(); signalOp is presumably a NCCL_NET_SIGNAL_OP_* value.
+  ncclResult_t (*iput)(void* collComm, uint64_t srcOff, void* srcMhandle, size_t size,
+      uint64_t dstOff, void* dstMhandle, uint32_t rank, void** request);
+  ncclResult_t (*iputSignal)(void* collComm, uint64_t srcOff, void* srcMhandle,
+      size_t size, uint64_t dstOff, void* dstMhandle,
+      uint32_t rank, uint64_t signalOff, void *signalMhandle,
+      uint64_t signalValue, uint32_t signalOp, void** request);
+
+  // Test whether a request is complete.
+  ncclResult_t (*test)(void* collComm, void* request, int* done);
+
+  // Progress function. Will be called if non-NULL in GIN_PROXY mode, or if devHandle.needsProxyProgress=1.
+  ncclResult_t (*ginProgress)(void* collComm);
+
+  // Query the last error for the GIN support. Particularly important when ginProgress is not used, to report errors.
+  ncclResult_t (*queryLastError)(void* ginCtx, bool *hasError);
+
+  // Finalize the GIN support
+  ncclResult_t (*finalize)(void* ctx);
+} ncclGin_v11_t;
 #endif // end include guard
diff --git a/projects/rccl/src/include/plugin/plugin.h b/projects/rccl/src/include/plugin/plugin.h
index 83b58e985ab..ff1ca27c848 100644
--- a/projects/rccl/src/include/plugin/plugin.h
+++ b/projects/rccl/src/include/plugin/plugin.h
@@ -13,11 +13,13 @@ enum ncclPluginType {
   ncclPluginTypeNet,
   ncclPluginTypeTuner,
   ncclPluginTypeProfiler,
+  ncclPluginTypeEnv,
 };
 
 void* ncclOpenNetPluginLib(const char* name);
 void* ncclOpenTunerPluginLib(const char* name);
 void* ncclOpenProfilerPluginLib(const char* name);
+void* ncclOpenEnvPluginLib(const char* name);
 void* ncclGetNetPluginLib(enum ncclPluginType type);
 ncclResult_t ncclClosePluginLib(void* handle, enum ncclPluginType type);
 
diff --git a/projects/rccl/src/include/proxy.h b/projects/rccl/src/include/proxy.h
index ecc116a3cda..f4687b1c595 100644
--- a/projects/rccl/src/include/proxy.h
+++ b/projects/rccl/src/include/proxy.h
@@ -19,6 +19,7 @@
 #include "p2p.h"
 #include "collectives.h"
 #include "proxy_trace/proxy_trace.h"
+#include "gin/gin_host.h"
 
 typedef enum : uint8_t {
   ncclPatternRing,
@@ -351,6 +352,8 @@ struct ncclProxyState {
   bool dmaBufSupport;
   ncclNet_t* ncclNet;
   ncclCollNet_t* ncclCollNet;
+  struct ncclGinState* ginState;
+
   uint32_t* abortFlag;
   bool directMode;
   // Service threads
diff --git a/projects/rccl/src/include/register.h b/projects/rccl/src/include/register.h
index edfc722deee..938432fe1db 100644
--- a/projects/rccl/src/include/register.h
+++ b/projects/rccl/src/include/register.h
@@ -51,6 +51,9 @@ struct ncclReg {
   uintptr_t caddrs[NCCL_MAX_LOCAL_RANKS]; /* use to check if NVLS buffers match among intra-node ranks */
   // collnet reg
   void* collnetHandle;
+  // gin reg
+  void** ginMhandles;
+  void** ginHandles;
   struct ncclProxyConnector* collnetProxyconn;
   // general ipc reg
   struct ncclPeerRegIpcAddr regIpcAddrs;
diff --git a/projects/rccl/src/include/socket.h b/projects/rccl/src/include/socket.h
index adeae9b2a7b..bc7df9a3da8 100644
--- a/projects/rccl/src/include/socket.h
+++ b/projects/rccl/src/include/socket.h
@@ -66,7 +66,13 @@ struct ncclSocket {
   int finalizeCounter; // Used to keep track of initial handshake for async sockets.
   char finalizeBuffer[sizeof(uint64_t)]; // Used to keep track of initial handshake for async sockets.
 };
-
+struct ncclSocketOp {        // One entry of the ops array passed to ncclSocketMultiOp(ops, numOps)
+  int op;                    // NCCL_SOCKET_SEND or NCCL_SOCKET_RECV
+  struct ncclSocket* sock;   // Socket to operate on
+  void* ptr;                 // Data pointer
+  int size;                  // Size of data
+  int offset;                // Current progress offset
+};
 const char *ncclSocketToString(const union ncclSocketAddress *addr, char *buf, const int numericHostForm = 1);
 ncclResult_t ncclSocketGetAddrFromString(union ncclSocketAddress* ua, const char* ip_port_pair);
 ncclResult_t ncclFindInterfaceMatchSubnet(char* ifName, union ncclSocketAddress* localAddr,
@@ -96,6 +102,7 @@ ncclResult_t ncclSocketWait(int op, struct ncclSocket* sock, void* ptr, int size
 ncclResult_t ncclSocketSend(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketRecv(struct ncclSocket* sock, void* ptr, int size);
 ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int sendSize, struct ncclSocket* recvSock, void* recvPtr, int recvSize);
+ncclResult_t ncclSocketMultiOp(struct ncclSocketOp* ops, int numOps);
 ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking);
 ncclResult_t ncclSocketShutdown(struct ncclSocket* sock, int how);
 ncclResult_t ncclSocketClose(struct ncclSocket* sock, bool wait = false);
diff --git a/projects/rccl/src/include/sym_kernels.h b/projects/rccl/src/include/sym_kernels.h
index 406a29d3187..7d94c94260a 100644
--- a/projects/rccl/src/include/sym_kernels.h
+++ b/projects/rccl/src/include/sym_kernels.h
@@ -112,7 +112,8 @@ ncclResult_t ncclSymkMakeDevWork(struct ncclComm* comm, struct ncclTaskColl* tas
 
 // Generated by src/device/symmetric/generate.py
 extern int const ncclSymkKernelCount;
-extern void* const ncclSymkKernelList[];
+extern void* ncclSymkKernelList[];
+extern int ncclSymkKernelRequirements[/*ncclSymkKernelCount*/];
 void* ncclSymkGetKernelPtr(ncclSymkKernelId kernelId, int/*ncclDevRedOp_t*/ red, ncclDataType_t ty);
 const char* ncclSymkKernelIdToString(int kernelId);
 
diff --git a/projects/rccl/src/include/transport.h b/projects/rccl/src/include/transport.h
index 71fbf282c7e..5a78b14822b 100644
--- a/projects/rccl/src/include/transport.h
+++ b/projects/rccl/src/include/transport.h
@@ -110,8 +110,9 @@ struct ncclTransport {
 
 ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, int channelId, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex);
 ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex, bool* needsProxy=NULL);
-ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode);
+ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode, bool* isAllCudaP2p);
 ncclResult_t ncclTransportIsAllDirectP2p(struct ncclComm* comm, int* isAllDirectP2p);
+bool ncclP2pUsesMemcpy();
 
 ncclResult_t ncclNvlsInit(struct ncclComm* comm);
 ncclResult_t ncclNvlsSetup(struct ncclComm* comm, struct ncclComm* parent);
@@ -153,4 +154,13 @@ ncclResult_t ncclNvlsGroupCreate(struct ncclComm *comm, CUmulticastObjectProp *p
 ncclResult_t ncclNvlsGroupConnect(struct ncclComm *comm, char *shareableHandle, int rank, CUmemGenericAllocationHandle *mcHandle);
 #endif
 
+ncclResult_t ncclIpcSymmetricInit(struct ncclComm* comm);
+ncclResult_t ncclIpcMapSymmetric(struct ncclComm* comm, size_t offset, size_t size, CUmemGenericAllocationHandle memHandle, void** symPtr);
+ncclResult_t ncclIpcFreeSymmetric(struct ncclComm* comm, size_t size, void* symPtr);
+ncclResult_t ncclIpcSymmetricFinalize(struct ncclComm* comm);
+ncclResult_t ncclNvlsSymmetricInit(struct ncclComm* comm);
+ncclResult_t ncclNvlsMapSymmetric(struct ncclComm* comm, size_t offset, size_t ucsize, void* ucaddr);
+ncclResult_t ncclNvlsFreeSymmetric(struct ncclComm* comm, size_t ucsize, void* ucaddr);
+ncclResult_t ncclNvlsSymmetricFinalize(struct ncclComm* comm);
+
 #endif
diff --git a/projects/rccl/src/include/utils.h b/projects/rccl/src/include/utils.h
index 85c0cc054bd..b7e4eaa5540 100644
--- a/projects/rccl/src/include/utils.h
+++ b/projects/rccl/src/include/utils.h
@@ -68,6 +68,16 @@ inline ncclResult_t getRandomData(void* buffer, size_t bytes) {
   return ret;
 }
 
+static inline int gcd(int a, int b) {
+  // Euclidean algorithm: replace (a, b) by (b, a % b) until b is 0, so gcd(x, 0) == x and gcd(0, 0) == 0.
+  while (b != 0) {
+    int temp = b;
+    b = a % b;
+    a = temp;
+  }
+  return a;  // Can be negative when an input is negative, because % keeps the sign of the dividend.
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 template<typename Int>
diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc
index 29470d0480b..b2b4f38f785 100644
--- a/projects/rccl/src/init.cc
+++ b/projects/rccl/src/init.cc
@@ -50,6 +50,7 @@
 #include <unordered_map>
 #include "ce_coll.h"
 #include "nvtx.h"
+#include "env.h"
 
 // [RCCL]
 #include "git_version.h"
@@ -278,6 +279,19 @@ static ncclResult_t ncclInit() {
   return initResult;
 }
 
+static ncclResult_t envInitResult = ncclSuccess;  // Result of the one-time env plugin init, returned by ncclInitEnv().
+static std::once_flag envInitOnceFlag;            // Guards envInitOnceFunc() via std::call_once.
+
+static void envInitOnceFunc() {  // call_once target: stores the ncclEnvPluginInit() result in envInitResult.
+  NCCLCHECKGOTO(ncclEnvPluginInit(), envInitResult, exit);
+exit:;
+}
+
+ncclResult_t ncclInitEnv() {  // Runs the env plugin init on the first call only; every call returns that first result.
+  std::call_once(envInitOnceFlag, envInitOnceFunc);
+  return envInitResult;
+}
+
 NCCL_API(ncclResult_t, ncclGetVersion, int* version);
 ncclResult_t ncclGetVersion_impl(int* version) {
   Recorder::instance().record("GetVersion");
@@ -288,6 +302,7 @@ ncclResult_t ncclGetVersion_impl(int* version) {
 
 NCCL_API(ncclResult_t, ncclGetUniqueId, ncclUniqueId* out);
 ncclResult_t ncclGetUniqueId_impl(ncclUniqueId* out) {
+  NCCLCHECK(ncclInitEnv());
   NCCLCHECK(ncclInit());
   NCCLCHECK(PtrCheck(out, "GetUniqueId", "out"));
   struct ncclBootstrapHandle handle;
@@ -586,8 +601,13 @@ static ncclResult_t commFree(ncclComm_t comm) {
   if (comm->doneEvent != NULL)
     CUDACHECK(hipEventDestroy(comm->doneEvent));
 
+  // GIN may use proxy. We need to finalize it before destroying the proxy.
+  NCCLCHECK(ncclGinFinalize(comm));
+
+  int sharedResRefCount = 0;
   if (comm->sharedRes) {
-    if (ncclAtomicRefCountDecrement(&comm->sharedRes->refCount) == 0) {
+    sharedResRefCount = ncclAtomicRefCountDecrement(&comm->sharedRes->refCount);
+    if (sharedResRefCount == 0) {
       for (int c=0; c<MAXCHANNELS; c++) {
         if (comm->sharedRes->peers[c]) free(comm->sharedRes->peers[c]);
         if (comm->sharedRes->devPeers[c]) ncclCudaFree(comm->sharedRes->devPeers[c]);
@@ -636,7 +656,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
 
   commPoison(comm); // poison comm before free to avoid comm reuse.
   NCCLCHECK(ncclProfilerPluginFinalize(comm));
-  NCCLCHECK(ncclNetFinalize(comm));
+  if (sharedResRefCount == 0) NCCLCHECK(ncclNetFinalize(comm));
   if (ncclParamLaunchOrderImplicit()) {
     ncclCudaContextDrop(comm->context);
     INFO(NCCL_INIT, "cudaDev %d context tracking destroyed", comm->cudaDev);
@@ -722,10 +742,27 @@ static ncclResult_t commAlloc(struct ncclComm* comm, struct ncclComm* parent, in
   comm->hierarchicalInterComm = nullptr;
   comm->hierarchicalCommsInitialized = false;
   comm->hierarchicalAGTempBuffer = nullptr;
-  // Enable PAT for interComm hierarchical AG
   comm->forcePatEnable = (parent != nullptr) ? parent->forcePatEnable : false;
 
-  NCCLCHECK(ncclNetInit(comm));
+  if (parent == NULL || !parent->shareResources) {
+    struct ncclSharedResources* sharedRes = NULL;
+    NCCLCHECK(ncclCalloc(&sharedRes, 1));
+    sharedRes->owner = comm;
+    sharedRes->tpNRanks = comm->nRanks;
+    NCCLCHECK(ncclCalloc(&sharedRes->tpRankToLocalRank, comm->nRanks));
+    NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->deviceStream));
+    NCCLCHECK(ncclStrongStreamConstruct(&sharedRes->hostStream));
+    CUDACHECK(cudaEventCreateWithFlags(&sharedRes->launchEvent, cudaEventDisableTiming));
+    CUDACHECK(cudaEventCreateWithFlags(&sharedRes->scratchEvent, cudaEventDisableTiming));
+    comm->sharedRes = sharedRes;
+    sharedRes->refCount = 1;
+    NCCLCHECK(ncclNetInit(comm));
+  } else {
+    comm->sharedRes = parent->sharedRes;
+    ncclAtomicRefCountIncrement(&parent->sharedRes->refCount);
+    NCCLCHECK(ncclNetInitFromParent(comm, parent));
+  }
+
   INFO(NCCL_INIT, "Using network %s", comm->ncclNet->name);
 
   if (parent && parent->shareResources) {
@@ -1236,6 +1273,8 @@ NCCL_PARAM(MNNVLEnable, "MNNVL_ENABLE", 2);
 #define TIMER_INIT_ALLOC 7
 #define TIMERS_INIT_COUNT 8
 
+extern int64_t ncclParamWinStride();
+
 static ncclResult_t initNvlDomainInfo(struct ncclComm* comm) {
   // Initialize NVLink domain info
   comm->nvlDomainInfo.nNvlDomains = comm->nNodes;
@@ -1270,6 +1309,76 @@ static bool uniformRanksPerHost(const struct ncclComm* comm, int nranks) {
   return total == nranks && ranksPerHost > 0;
 }
 
+NCCL_PARAM(GroupSize, "P2P_SCHEDULE_GROUP_SIZE", NCCL_MAX_DEV_WORK_P2P_PER_BATCH);
+
+static ncclResult_t ncclP2pSchedule(struct ncclComm* comm) {
+  // Fill comm->p2pSchedule with one send/recv peer pair per round. Scratch tables are freed on every exit path.
+  ncclResult_t ret = ncclSuccess;
+  struct ncclNodeRanks* nodeRanks = comm->nodeRanks;
+  int *groupToNode = NULL, *groupToLocal = NULL;  // group index -> owning node / first local rank of the group
+  int groupCount = 0, round = 0;
+  uint32_t groupRound = 0, groupDelta = 0;
+  int groupSize = (comm->nNodes > 1) ? ncclParamGroupSize() : comm->maxLocalRanks;
+  // A zero or negative size would make the % and / below undefined or meaningless: use the parameter's default.
+  if (groupSize <= 0) groupSize = NCCL_MAX_DEV_WORK_P2P_PER_BATCH;
+  for (int node = 0; node < comm->nNodes; node++) {  // Shrink groupSize until it divides every node's rank count.
+    int localRanks = nodeRanks[node].localRanks;
+    if (localRanks % groupSize != 0 || localRanks < groupSize) groupSize = gcd(groupSize, localRanks);
+  }
+  // p2pChannelShiftSize is set by rcclCommSetP2pShiftSize() before this function;
+  // do not overwrite the RCCL-specific channel mapping (defaults to bit-reversal = -1).
+
+  int local = comm->localRank % groupSize;
+  int group = comm->localRank / groupSize;
+  int nGroups = comm->nRanks / groupSize;
+  int nGroupsPow2 = pow2Up(nGroups);
+  NCCLCHECKGOTO(ncclCalloc(&groupToNode, nGroups), ret, exit);
+  NCCLCHECKGOTO(ncclCalloc(&groupToLocal, nGroups), ret, exit);
+  for (int n = 0; n < comm->nNodes; ++n) {
+    if (0 != nodeRanks[n].localRanks % groupSize) {
+      WARN("nLocals = %d should be a diviser of the number of ranks in node %d = %d", groupSize, n, comm->nodeRanks[n].localRanks);
+      ret = ncclInternalError;
+      goto exit;
+    }
+    int nGroupsInNode = nodeRanks[n].localRanks / groupSize;
+    for (int g = 0; g < nGroupsInNode; ++g, ++groupCount) {
+      groupToLocal[groupCount] = g * groupSize;
+      groupToNode[groupCount] = n;
+    }
+    if (n < comm->node) group += nGroupsInNode;  // Turn the node-local group index into a global one.
+  }
+  if (groupCount != nGroups) {
+    WARN("Group creation failed: %d vs %d", groupCount, nGroups);
+    ret = ncclInternalError;
+    goto exit;
+  }
+  INFO(NCCL_GRAPH,"%s: group size used is %d",__func__,groupSize);
+
+  do {  // Group deltas follow the quadratic sequence (x*x+x)/2 mod nGroupsPow2; values >= nGroups are skipped.
+    if (groupDelta < nGroups) {
+      int sendGroup = (group + groupDelta) % nGroups;
+      int recvGroup = (group - groupDelta + nGroups) % nGroups;
+      for (int delta = 0; delta < groupSize; delta++) {
+        int sendLocal = groupToLocal[sendGroup] + (local + delta) % groupSize;
+        int recvLocal = groupToLocal[recvGroup] + (local - delta + groupSize) % groupSize;
+        comm->p2pSchedule[round].sendRank = nodeRanks[groupToNode[sendGroup]].localRankToRank[sendLocal];
+        comm->p2pSchedule[round].recvRank = nodeRanks[groupToNode[recvGroup]].localRankToRank[recvLocal];
+        round += 1;
+      }
+    }
+    groupRound += 1;
+    groupDelta = (groupDelta + groupRound) & (nGroupsPow2 - 1);
+  } while (groupRound != nGroupsPow2);
+  if (round != comm->nRanks) {
+    WARN("P2p schedule creation has bugs.");
+    ret = ncclInternalError;
+  }
+exit:
+  free(groupToNode);
+  free(groupToLocal);
+  return ret;
+}
+
 static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
   // We use 2 AllGathers
   // 1. { peerInfo, comm, compCap}
@@ -1357,13 +1466,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   do {
     // Compute intra-process ranks
     int intraProcRank0 = -1, intraProcRank = -1, intraProcRanks = 0;
-    for (int i = 0; i < nranks; i++) comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap);
-    for (int i = 0; i < nranks; i++) comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap);
 
     comm->nvlsRegSupport = 1;
     for (int i = 0; i < nranks; i++) {
-      if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash)
-          && (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
+      comm->minCompCap = std::min(comm->minCompCap, comm->peerInfo[i].cudaCompCap);
+      comm->maxCompCap = std::max(comm->maxCompCap, comm->peerInfo[i].cudaCompCap);
+      if ((comm->peerInfo[i].hostHash == comm->peerInfo[rank].hostHash) &&
+          (comm->peerInfo[i].pidHash == comm->peerInfo[rank].pidHash)) {
         // Rank is in same process
         if (intraProcRanks == 0) intraProcRank0 = i;
         if (i == rank) intraProcRank = intraProcRanks;
@@ -1877,7 +1986,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   // Profiler plugin context has to be initialized before proxy thread
   NCCLCHECK(ncclProfilerPluginInit(comm));
 
-  NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, &comm->directMode), ret, fail);
+  NCCLCHECKGOTO(ncclTransportCheckP2pType(comm, &comm->isAllDirectP2p, &comm->directMode, &comm->isAllCudaP2p), ret, fail);
   // Launch proxy service thread, after this, the proxy calls can be used.
   if (parent && parent->shareResources) {
     comm->proxyState = parent->sharedRes->proxyState;
@@ -1888,61 +1997,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
   NCCLCHECKGOTO(ncclCalloc(&comm->gproxyConn, comm->nRanks), ret, fail);
 
   timers[TIMER_INIT_CONNECT] = clockNano();
-  do { // Build p2p schedule
-    int node = comm->node;
-    int nNodes = comm->nNodes;
-    int nRanks = comm->nRanks;
-    int local = comm->localRank;
-    int nLocals = comm->maxLocalRanks;
-    struct ncclNodeRanks* nodeRanks = comm->nodeRanks;
-    bool flat = false;
-    for (int node = 0; node < nNodes; node++) {
-      if (nodeRanks[node].localRanks != nLocals) {
-        flat = true;
-        nNodes = 1; node = 0;
-        nLocals = nRanks; local = rank;
-        break;
-      }
-    }
-    int nNodesPow2 = pow2Up(nNodes);
-    int nLocalsPow2 = pow2Up(nLocals);
-    comm->p2pSchedule = ncclMemoryStackAlloc<ncclComm::P2pSchedulePair>(&comm->memPermanent, nRanks);
-    comm->planner.peers = ncclMemoryStackAlloc<ncclKernelPlanner::Peer>(&comm->memPermanent, nRanks);
-    uint32_t nodeRound = 0;
-    uint32_t nodeDelta = 0;
-    int round = 0;
-    // When enumerating peer deltas we use the quadratic formula (x*x+x)/2 mod N.
-    // Since that formula only produces valid permutations when N is a pow of 2,
-    // we let N = pow2Up(n) and filter out results greater-eq to n.
-    // Example sequence for 16 ranks: 0, 1, 3, 6, 10, 15, 5, 12, 4, 13, 7, 2, 14, 11, 9, 8
-    do {
-      if (nodeDelta < nNodes) { // Filter nonsensical node deltas
-        int sendNode = (node + nodeDelta) % nNodes;
-        int recvNode = (node - nodeDelta + nNodes) % nNodes;
-        uint32_t localRound = 0;
-        uint32_t localDelta = 0;
-        do {
-          if (localDelta < nLocals) { // Filter nonsensical node-local deltas
-            int sendLocal = (local + localDelta) % nLocals;
-            int recvLocal = (local - localDelta + nLocals) % nLocals;
-            comm->p2pSchedule[round].sendRank = flat ? sendLocal : nodeRanks[sendNode].localRankToRank[sendLocal];
-            comm->p2pSchedule[round].recvRank = flat ? recvLocal : nodeRanks[recvNode].localRankToRank[recvLocal];
-            round += 1;
-          }
-          localRound += 1;
-          localDelta = (localDelta + localRound) & (nLocalsPow2 - 1); // Quadratic update
-        } while (localRound != nLocalsPow2);
-      }
-      nodeRound += 1;
-      nodeDelta = (nodeDelta + nodeRound) & (nNodesPow2 - 1); // Quadratic update
-    } while (nodeRound != nNodesPow2);
-
-    if (round != nRanks) {
-      WARN("P2p schedule creation has bugs.");
-      ret = ncclInternalError;
-      goto fail;
-    }
-  } while (0);
+  // Build p2p schedule; a failure leaves through the fail label, as the inline code this replaces did.
+  comm->p2pSchedule = ncclMemoryStackAlloc<ncclComm::P2pSchedulePair>(&comm->memPermanent, comm->nRanks);
+  comm->planner.peers = ncclMemoryStackAlloc<ncclKernelPlanner::Peer>(&comm->memPermanent, comm->nRanks);
+  NCCLCHECKGOTO(ncclP2pSchedule(comm), ret, fail);
 
   comm->runtimeConn = comm->cuMemSupport && ncclParamRuntimeConnect();
   if (comm->runtimeConn) {
@@ -2062,7 +2120,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
     }
   }
 
-  comm->symmetricSupport = comm->isAllDirectP2p && comm->nNodes == 1 && ncclParamWinEnable() && ncclCuMemEnable();
+  comm->symmetricSupport = comm->isAllCudaP2p && ncclParamWinEnable() && ncclCuMemEnable();
   comm->devrState.bigSize = 0;
 
   comm->ceColl.baseUCSymReadyPtr = NULL;
@@ -2113,7 +2171,7 @@ NCCL_PARAM(MinCTAs, "MIN_CTAS", NCCL_CONFIG_UNDEF_INT);
 #define NCCL_MAX_CGA_CLUSTER_SIZE 8
 
 NCCL_PARAM(NChannelsPerNetPeer, "NCHANNELS_PER_NET_PEER", NCCL_CONFIG_UNDEF_INT);
-NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", 0);
+NCCL_PARAM(NvlinkUtilCentricSchedEnable, "NVLINK_UTIL_CENTRIC_SCHED_ENABLE", NCCL_CONFIG_UNDEF_INT);
 
 
 #define NCCL_COMMINIT_FUNCNAME_LEN 128
@@ -2491,6 +2549,8 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
 
   cgaClusterSizeEnv = ncclParamCGAClusterSize();
   if (0 <= cgaClusterSizeEnv && cgaClusterSizeEnv <= NCCL_MAX_CGA_CLUSTER_SIZE) {
+    if (comm->config.cgaClusterSize != NCCL_CONFIG_UNDEF_INT)  // Report that the env var replaces an explicitly set config value.
+      INFO(NCCL_ENV, "Comm config cgaClusterSize reset to NCCL_CGA_CLUSTER_SIZE=%d", cgaClusterSizeEnv);
     comm->config.cgaClusterSize = cgaClusterSizeEnv;
   } else if (cgaClusterSizeEnv > NCCL_MAX_CGA_CLUSTER_SIZE) {
     INFO(NCCL_ENV, "NCCL_CGA_CLUSTER_SIZE value %d is too big. Limiting value to %d.", cgaClusterSizeEnv, NCCL_MAX_CGA_CLUSTER_SIZE);
@@ -2501,16 +2561,22 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
   if (minCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
     if (minCTAsEnv <= 0)
       INFO(NCCL_ENV, "NCCL_MIN_CTAS %d is too low, leaving it set at %d", minCTAsEnv, comm->config.minCTAs);
-    else
+    else {
+      if (comm->config.minCTAs != NCCL_CONFIG_UNDEF_INT)
+        INFO(NCCL_ENV, "Comm config minCTAs reset to NCCL_MIN_CTAS=%d", minCTAsEnv);
       comm->config.minCTAs = minCTAsEnv;
+    }
   }
 
   maxCTAsEnv = ncclParamMaxCTAs();
   if (maxCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
     if (maxCTAsEnv <= 0)
       INFO(NCCL_ENV, "NCCL_MAX_CTAS %d is too low, leaving it set at %d", maxCTAsEnv, comm->config.maxCTAs);
-    else
+    else {
+      if (comm->config.maxCTAs != NCCL_CONFIG_UNDEF_INT)
+        INFO(NCCL_ENV, "Comm config maxCTAs reset to NCCL_MAX_CTAS=%d", maxCTAsEnv);
       comm->config.maxCTAs = maxCTAsEnv;
+    }
   }
 
   /* override configuration with env variable. */
@@ -2518,22 +2584,30 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
   if (nChannelsPerNetPeerEnv != NCCL_CONFIG_UNDEF_INT) {
     if (nChannelsPerNetPeerEnv <= 0)
       INFO(NCCL_ENV, "NCCL_NCHANNELS_PER_NET_PEER %d is too low, leaving it set at %d", nChannelsPerNetPeerEnv, comm->config.nChannelsPerNetPeer);
-    else
+    else {
+      if (comm->config.nChannelsPerNetPeer != NCCL_CONFIG_UNDEF_INT)
+        INFO(NCCL_ENV, "Comm config nChannelsPerNetPeer reset to NCCL_NCHANNELS_PER_NET_PEER=%d", nChannelsPerNetPeerEnv);
       comm->config.nChannelsPerNetPeer = nChannelsPerNetPeerEnv;
+    }
   }
 
   nvlinkUtilCentricSchedEnableEnv = ncclParamNvlinkUtilCentricSchedEnable();
   if (nvlinkUtilCentricSchedEnableEnv != NCCL_CONFIG_UNDEF_INT) {
     if (nvlinkUtilCentricSchedEnableEnv != 0 && nvlinkUtilCentricSchedEnableEnv != 1)
       INFO(NCCL_ENV, "NCCL_NVLINK_UTIL_CENTRIC_SCHED_ENABLE %d is not valid, leaving it set at %d", nvlinkUtilCentricSchedEnableEnv, comm->config.nvlinkCentricSched);
-    else
+    else {
+      if (comm->config.nvlinkCentricSched != NCCL_CONFIG_UNDEF_INT)
+        INFO(NCCL_ENV, "Comm config nvlinkCentricSched reset to NCCL_NVLINK_UTIL_CENTRIC_SCHED_ENABLE=%d", nvlinkUtilCentricSchedEnableEnv);
       comm->config.nvlinkCentricSched = nvlinkUtilCentricSchedEnableEnv;
+    }
   }
 
   envNetName = ncclGetEnv("NCCL_NET");
   if (envNetName)
     tmpNetName = envNetName;
   if (tmpNetName != NULL) {
+    if (comm->config.netName != NCCL_CONFIG_UNDEF_PTR)
+      INFO(NCCL_ENV, "Comm config netName reset to NCCL_NET=%s", tmpNetName);
     int netNameLen = strlen(tmpNetName) + 1;
     comm->config.netName = (char*)malloc(netNameLen);
     if (comm->config.netName == nullptr) {
@@ -2547,10 +2621,14 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
 
   splitShareEnv = ncclParamCommSplitShareResources();
   if (splitShareEnv != NCCL_CONFIG_UNDEF_INT) {
+    if (comm->config.splitShare != NCCL_CONFIG_UNDEF_INT)
+      INFO(NCCL_ENV, "Comm config splitShare reset to NCCL_COMM_SPLIT_SHARE_RESOURCES=%d", splitShareEnv);
     comm->config.splitShare = splitShareEnv;
   }
   shrinkShareEnv = ncclParamCommShrinkShareResources();
   if (shrinkShareEnv != NCCL_CONFIG_UNDEF_INT) {
+    if (comm->config.shrinkShare != NCCL_CONFIG_UNDEF_INT)
+      INFO(NCCL_ENV, "Comm config shrinkShare reset to NCCL_COMM_SHRINK_SHARE_RESOURCES=%d", shrinkShareEnv);
     comm->config.shrinkShare = shrinkShareEnv;
   }
 
@@ -2560,6 +2638,8 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
   if (collnetEnableEnv != NULL) {
     int collnetEnableInt = (int)strtol(collnetEnableEnv, NULL, 0);
     if (collnetEnableInt != NCCL_CONFIG_UNDEF_INT) {
+      if (comm->config.collnetEnable != NCCL_CONFIG_UNDEF_INT)
+        INFO(NCCL_ENV, "Comm config collnetEnable reset to NCCL_COLLNET_ENABLE=%d", collnetEnableInt);
       comm->config.collnetEnable = collnetEnableInt;
       INFO(NCCL_ENV, "NCCL_COLLNET_ENABLE set by environment to %d.", collnetEnableInt);
     }
@@ -2567,11 +2647,15 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
 
   ctaPolicyEnv = ncclParamCtaPolicy();
   if (ctaPolicyEnv != NCCL_CONFIG_UNDEF_INT) {
+    if (comm->config.CTAPolicy != NCCL_CONFIG_UNDEF_INT)
+      INFO(NCCL_ENV, "Comm config CTAPolicy reset to NCCL_CTA_POLICY=%d", ctaPolicyEnv);
     comm->config.CTAPolicy = ctaPolicyEnv;
   }
 
   nvlsCTAsEnv = ncclParamNvlsChannels();
   if (nvlsCTAsEnv != NCCL_CONFIG_UNDEF_INT) {
+    if (comm->config.nvlsCTAs != NCCL_CONFIG_UNDEF_INT)
+      INFO(NCCL_ENV, "Comm config nvlsCTAs reset to NCCL_NVLS_NCHANNELS=%d", nvlsCTAsEnv);
     comm->config.nvlsCTAs = nvlsCTAsEnv;
   }
 
@@ -2866,6 +2950,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, int nId
 NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
 ncclResult_t ncclCommInitRank_impl(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
   NCCLCHECK(Recorder::instance().record(rrCommInitRank, nranks, myrank, &commId));
+  NCCLCHECK(ncclInitEnv());
   NVTX3_RANGE(NcclNvtxParamsCommInitRank)
   // Load the CUDA driver and dlsym hooks (can fail on old drivers)
   rocmLibraryInit();
@@ -2967,6 +3052,7 @@ ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUn
   ncclConfig_t internalConfig = NCCL_CONFIG_INITIALIZER;
   ncclConfig_t *internalConfigPtr = NULL;
 
+  NCCLCHECK(ncclInitEnv());
   NVTX3_RANGE(NcclNvtxParamsCommInitRankConfig);
 
   NCCLCHECK(ncclGroupStartInternal());
@@ -2998,6 +3084,7 @@ ncclResult_t ncclCommInitRankConfig_impl(ncclComm_t *newcomm, int nranks, ncclUn
 
 NCCL_API(ncclResult_t, ncclCommInitRankScalable, ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config);
 ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commId, ncclConfig_t* config) {
+  NCCLCHECK(ncclInitEnv());
   NVTX3_RANGE(NcclNvtxParamsCommInitRankScalable);
 
   int cudaDev;
@@ -3263,6 +3350,103 @@ static ncclResult_t setCommAbortFlags(ncclComm_t comm, int value) {
   return ncclSuccess;
 }
 
+NCCL_API(ncclResult_t, ncclCommRevoke, ncclComm_t comm, int revokeFlags);
+struct ncclCommRevokeAsyncJob {
+  struct ncclAsyncJob base;
+  ncclComm_t comm;
+};
+
+static ncclResult_t commRevokeAsync(struct ncclAsyncJob* job_) {  // Async body launched by ncclCommRevoke: sync the comm's streams, stop its proxy, clear the abort flags.
+  struct ncclCommRevokeAsyncJob* job = (struct ncclCommRevokeAsyncJob*)job_;
+  ncclComm_t comm = job->comm;
+  ncclResult_t res = ncclSuccess;
+  NCCLCHECKGOTO(PtrCheck(comm, "CommRevokeAsync", "comm"), res, exit);
+  INFO(NCCL_INIT, "CommRevokeAsync START comm %p rank %d nRanks %d nNodes %d localRank %d cudaDev %d",
+      comm, comm->rank, comm->nRanks, comm->nNodes, comm->localRank, comm->cudaDev);
+  CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), res, exit);
+  NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->hostStream), res, exit);  // synchronize both shared streams before polling callbacks
+  NCCLCHECKGOTO(ncclStrongStreamSynchronize(&comm->sharedRes->deviceStream), res, exit);
+  NCCLCHECKGOTO(ncclCommPollEventCallbacks(comm, /*waitSome=*/true), res, exit);
+  NCCLCHECKGOTO(ncclCommPollCallbacks(comm, /*waitSome=*/false), res, exit);
+  {
+    ncclResult_t _tmpret = ncclSuccess;
+    if ((_tmpret = ncclProxyStop(comm)) != ncclSuccess) {  // a failed proxy stop is only logged; res is left untouched
+      WARN("ncclProxyStop: comm %p (rank = %d) destroys proxy resource error %d", comm, comm->rank, _tmpret);
+    }
+    if (comm->proxyState && comm->proxyRefCountOld == 0 && comm->proxyState->thread) {  // join only when proxyRefCountOld is 0 and a thread handle is still set
+      PTHREADCHECK(pthread_join(comm->proxyState->thread, nullptr), "pthread_join");  // NOTE(review): PTHREADCHECK presumably returns on failure, skipping the exit: bookkeeping - confirm
+      if (comm->proxyState->threadUDS) {
+        // UDS support
+        PTHREADCHECK(pthread_join(comm->proxyState->threadUDS, nullptr), "pthread_join");
+      }
+      // Mark threads as joined so later cleanup (e.g., commFree) won't join again
+      comm->proxyState->thread = 0;
+      comm->proxyState->threadUDS = 0;
+    }
+  }
+  NCCLCHECKGOTO(setCommAbortFlags(comm, 0), res, exit);  // clear the abort flags that ncclCommRevoke set to 1
+exit:
+  (void)ncclCommSetAsyncError(comm, res);  // record the outcome on the comm, success included
+  INFO(NCCL_INIT, "CommRevokeAsync END comm %p result %d", comm, res);
+  return res;
+}
+
+ncclResult_t ncclCommRevoke(ncclComm_t comm, int revokeFlags) {  // Public entry point: validate, mark comm as revoked, then launch commRevokeAsync inside an internal group.
+  NVTX3_RANGE(NcclNvtxParamsCommRevoke);
+
+  if (comm == NULL) {  // a NULL comm is accepted as a no-op
+    return ncclSuccess;
+  }
+  // For now only NCCL_REVOKE_DEFAULT (0) is supported
+  if (revokeFlags != 0) { // NCCL_REVOKE_DEFAULT = 0
+    return ncclInvalidArgument;
+  }
+  // Disallow revoke if destroy/finalize in progress
+  if (comm->destroyFlag || comm->finalizeCalled) {
+    return ncclInvalidArgument;
+  }
+  // Disallow revoke if revoke in progress
+  if (comm->revokedFlag) {
+    return ncclInvalidArgument;
+  }
+  INFO(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx - Revoke START",
+      comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
+
+  NCCLCHECK(ncclGroupStartInternal());
+  (void)setCommAbortFlags(comm,1);  // result deliberately ignored; commRevokeAsync resets the flags to 0 at its end
+  comm->revokedFlag = 1;
+  (void)ncclCommEnsureReady(comm);  // result deliberately ignored
+  comm->finalizeCalled = true;  // together with revokedFlag, makes the checks above reject a second revoke
+
+  int rank = comm->rank, nranks = comm->nRanks, cudaDev = comm->cudaDev;
+  struct ncclCommRevokeAsyncJob *job = NULL;
+  ncclResult_t res = ncclSuccess;
+
+  NVTX3_RANGE_ADD_PAYLOAD(CommRevoke, NcclNvtxParamsCommInitRankSchema,
+    NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev));
+  TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, rank, nranks, cudaDev, comm->busId);
+
+  NCCLCHECKGOTO(ncclCalloc(&job, 1), res, fail);
+  job->comm = comm;
+  NCCLCHECKGOTO(ncclAsyncLaunch((struct ncclAsyncJob*)job, commRevokeAsync, NULL, free, comm), res, fail);  // presumably the async machinery releases job with free - confirm
+
+exit:
+  ncclGroupErrCheck(res);
+  NCCLCHECK(ncclGroupEndInternal());
+  if (comm) {  // always true here: a NULL comm returned early above
+    if (!comm->config.blocking) {
+      NCCLCHECK(ncclCommGetAsyncError(comm, &res));  // non-blocking comms return the comm's async state instead of the launch result
+    }
+    NVTX3_RANGE_ADD_PAYLOAD(CommRevoke, NcclNvtxParamsCommInitRankSchema,
+      NVTX3_PAYLOAD(comm->commHash, nranks, rank, cudaDev));
+  }
+  INFO(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx - Revoke COMPLETE, result %d", comm, rank, nranks, cudaDev, comm->busId, res);
+  return res;
+fail:
+  if (comm && !comm->config.blocking) (void) ncclCommSetAsyncError(comm, res);  // surface the launch failure through the comm's async error for non-blocking comms
+  goto exit;
+}
+
 NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm);
 ncclResult_t ncclCommAbort_impl(ncclComm_t comm) {
   NCCLCHECK(Recorder::instance().record(rrCommAbort, comm));
@@ -3342,8 +3526,9 @@ static ncclResult_t ncclCommInitChildComm(ncclComm_t comm, ncclComm_t* newcomm,
     childComm->startMagic = childComm->endMagic = NCCL_MAGIC;
 
     // Set the shareResource field, this is used throughout the init and must be reset every time.
-    // If we shrink, we only reuse resources if we shrink in the default mode
-    comm->shareResources = isShrink ? (!(flags & NCCL_SHRINK_ABORT) && comm->config.shrinkShare) : comm->config.splitShare;
+    // Never share resources if the parent communicator has been revoked.
+    // If we shrink, we only reuse resources in default mode.
+    comm->shareResources = !comm->revokedFlag && (isShrink ? (!(flags & NCCL_SHRINK_ABORT) && comm->config.shrinkShare) : comm->config.splitShare);
     if (comm->shareResources) {
       childComm->abortFlag = comm->abortFlag;
       childComm->abortFlagDev = comm->abortFlagDev;
@@ -3480,6 +3665,26 @@ ncclResult_t ncclCommGetAsyncError_impl(ncclComm_t comm, ncclResult_t *asyncErro
 
   *asyncError = __atomic_load_n(&comm->asyncResult, __ATOMIC_ACQUIRE);
   if (*asyncError == ncclSuccess && comm->proxyState) *asyncError = __atomic_load_n(&comm->proxyState->asyncResult, __ATOMIC_ACQUIRE);
+
+  /* Check gin status */
+  if (*asyncError == ncclSuccess && comm->sharedRes && comm->sharedRes->ginState.ncclGin) {
+    struct ncclGinState* ginState = &comm->sharedRes->ginState;
+    // Gin progress thread status
+    if (ginState->needsProxyProgress) *asyncError = __atomic_load_n(&comm->sharedRes->ginState.asyncResult, __ATOMIC_ACQUIRE);
+    // Gin side errors, also works when we have no GIN progress thread.
+    if (*asyncError == ncclSuccess) {
+      bool ginError;
+      for (int c=0; c<comm->sharedRes->ginState.ginCommCount; c++) {
+        NCCLCHECK(ncclGinQueryLastError(&comm->sharedRes->ginState, &ginError));
+        if (ginError) {
+          WARN("GIN Error on gin context %d\n", c);
+          *asyncError = ncclRemoteError;
+          break;
+        }
+      }
+    }
+  }
+
   /* if there is linked group job, we should complete it. */
   if (*asyncError == ncclSuccess && comm->groupJob) {
     NCCLCHECK(ncclGroupJobComplete(comm->groupJob));
diff --git a/projects/rccl/src/libnccl.map b/projects/rccl/src/libnccl.map
new file mode 100644
index 00000000000..1dbfcd50802
--- /dev/null
+++ b/projects/rccl/src/libnccl.map
@@ -0,0 +1,8 @@
+{
+    global:
+        nccl*;
+        pnccl*;
+
+    local:
+        *;
+};
diff --git a/projects/rccl/src/misc/ibvsymbols.cc b/projects/rccl/src/misc/ibvsymbols.cc
index bd5f33390ff..c70e73b545f 100644
--- a/projects/rccl/src/misc/ibvsymbols.cc
+++ b/projects/rccl/src/misc/ibvsymbols.cc
@@ -50,7 +50,7 @@ ncclResult_t buildIbvSymbols(struct ncclIbvSymbols* ibvSymbols) {
   ASSIGN_SYM(ibvSymbols, ibv_destroy_qp, ibv_internal_destroy_qp);
   ASSIGN_SYM(ibvSymbols, ibv_fork_init, ibv_internal_fork_init);
   ASSIGN_SYM(ibvSymbols, ibv_event_type_str, ibv_internal_event_type_str);
-  
+
   ASSIGN_SYM(ibvSymbols, ibv_query_ece, ibv_internal_query_ece);
   ASSIGN_SYM(ibvSymbols, ibv_set_ece, ibv_internal_set_ece);
 
diff --git a/projects/rccl/src/misc/ibvwrap.cc b/projects/rccl/src/misc/ibvwrap.cc
index 058eb8b0ad7..afa17b5ac9a 100644
--- a/projects/rccl/src/misc/ibvwrap.cc
+++ b/projects/rccl/src/misc/ibvwrap.cc
@@ -169,12 +169,17 @@ ncclResult_t wrap_ibv_query_device(struct ibv_context *context, struct ibv_devic
 }
 
 ncclResult_t wrap_ibv_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) {
+#ifndef NCCL_BUILD_RDMA_CORE
   // First try and query the extended port attributes (e.g. active_speed_ex)
   if (ibv_query_port_ex(context, port_num, port_attr) != 0) {
     // Fall back to the original attribute API call, but zero all members first
     memset(port_attr, 0, sizeof(*port_attr));
     IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
   }
+#else
+  // When using system rdma-core, use the regular ibv_query_port
+  IBV_INT_CHECK_RET_ERRNO(ibvSymbols, ibv_internal_query_port, ibv_internal_query_port(context, port_num, port_attr), 0, "ibv_query_port");
+#endif
   return ncclSuccess;
 }
 
diff --git a/projects/rccl/src/misc/param.cc b/projects/rccl/src/misc/param.cc
index e01932a26d7..642c8def677 100644
--- a/projects/rccl/src/misc/param.cc
+++ b/projects/rccl/src/misc/param.cc
@@ -6,6 +6,7 @@
 
 #include "param.h"
 #include "debug.h"
+#include "env.h"
 
 #include <algorithm>
 #include <errno.h>
@@ -93,6 +94,6 @@ void ncclLoadParam(char const* env, int64_t deftVal, int64_t uninitialized, int6
 }
 
 const char* ncclGetEnv(const char* name) {
-  initEnv();
-  return getenv(name);
+  ncclInitEnv();
+  return ncclEnvPluginGetEnv(name);
 }
diff --git a/projects/rccl/src/misc/rocm_smi_wrap.cc b/projects/rccl/src/misc/rocm_smi_wrap.cc
index 62e978e6d6a..3fc9e8a8912 100644
--- a/projects/rccl/src/misc/rocm_smi_wrap.cc
+++ b/projects/rccl/src/misc/rocm_smi_wrap.cc
@@ -189,7 +189,7 @@ ncclResult_t rocm_smi_getLinkInfo(int srcIndex, int dstIndex, RSMI_IO_LINK_TYPE*
       ARSMI_linkInfo tinfo;
       ARSMICHECK(ARSMI_topo_get_link_info(srcIndex, dstIndex, &tinfo));
 
-      *rsmi_type  = (ARSMI_IO_LINK_TYPE) tinfo.type;
+      *rsmi_type  = (RSMI_IO_LINK_TYPE) tinfo.type;
       if (*rsmi_type == RSMI_IOLINK_TYPE_XGMI && (tinfo.weight == 15 ||
         tinfo.weight == 41 || tinfo.weight == 13)) {
 	*hops = 1;
diff --git a/projects/rccl/src/misc/socket.cc b/projects/rccl/src/misc/socket.cc
index ce133f4fce3..56c000fac89 100644
--- a/projects/rccl/src/misc/socket.cc
+++ b/projects/rccl/src/misc/socket.cc
@@ -917,6 +917,29 @@ ncclResult_t ncclSocketSendRecv(struct ncclSocket* sendSock, void* sendPtr, int
 }
 
 
+// Progress all ops round-robin until every ops[i].offset has reached ops[i].size.
+ncclResult_t ncclSocketMultiOp(struct ncclSocketOp* ops, int numOps) {
+  if (ops == NULL || numOps <= 0) {
+    WARN("ncclSocketMultiOp: invalid arguments ops=%p numOps=%d", ops, numOps);
+    return ncclInvalidArgument;
+  }
+  int completedOps = 0;
+  for (int i = 0; i < numOps; i++) {
+    if (ops[i].sock == NULL) {
+      WARN("ncclSocketMultiOp: invalid socket at index %d", i);
+      return ncclInvalidArgument;
+    }
+    ops[i].offset = 0;
+    // Empty transfers are complete up front; the loop below never counts them and would spin forever.
+    if (ops[i].size <= 0) completedOps++;
+  }
+  for (int i = 0; completedOps < numOps; i = (i+1) % numOps) {
+    if (ops[i].offset >= ops[i].size) continue;  // this op has already finished
+    NCCLCHECK(socketProgress(ops[i].op, ops[i].sock, ops[i].ptr, ops[i].size, &ops[i].offset));
+    if (ops[i].offset >= ops[i].size) completedOps++;
+  }
+  return ncclSuccess;
+}
 // Receive or detect connection closed
 ncclResult_t ncclSocketTryRecv(struct ncclSocket* sock, void* ptr, int size, int* closed, bool blocking) {
   int offset = 0;
diff --git a/projects/rccl/src/nccl_device/CMakeLists.txt b/projects/rccl/src/nccl_device/CMakeLists.txt
index 9d0c3d10067..4b73ef1cc2d 100644
--- a/projects/rccl/src/nccl_device/CMakeLists.txt
+++ b/projects/rccl/src/nccl_device/CMakeLists.txt
@@ -2,7 +2,8 @@
 set(SYM_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/core.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/ll_a2a.cc
-    ${CMAKE_CURRENT_SOURCE_DIR}/mem_barrier.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/lsa_barrier.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/gin_barrier.cc
 )
 
 # Add register sources to parent scope
diff --git a/projects/rccl/src/nccl_device/gin_barrier.cc b/projects/rccl/src/nccl_device/gin_barrier.cc
new file mode 100644
index 00000000000..fef97b664be
--- /dev/null
+++ b/projects/rccl/src/nccl_device/gin_barrier.cc
@@ -0,0 +1,22 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "nccl_device/impl/gin_barrier__funcs.h"
+
+NCCL_API(ncclResult_t, ncclGinBarrierCreateRequirement, ncclComm_t comm, ncclTeam_t team, int nBarriers, ncclGinBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq);
+// Describes what nBarriers GIN barriers need: a uint32_t buffer and nBarriers GIN signals.
+ncclResult_t ncclGinBarrierCreateRequirement(ncclComm_t comm, ncclTeam_t team, int nBarriers,
+    ncclGinBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq) {
+  memset(outReq, 0, sizeof(*outReq));
+  // Point the requirement's output slots at the matching fields of the handle.
+  outReq->outBufferHandle = &outHandle->bufHandle;
+  outReq->outGinSignalStart = &outHandle->signal0;
+  outReq->ginSignalCount = nBarriers;
+  outReq->bufferAlign = alignof(uint32_t);
+  outReq->bufferSize = nBarriers*NCCL_GIN_MAX_CONTEXTS*sizeof(uint32_t);
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/nccl_device/lsa_barrier.cc b/projects/rccl/src/nccl_device/lsa_barrier.cc
new file mode 100644
index 00000000000..a2153caa991
--- /dev/null
+++ b/projects/rccl/src/nccl_device/lsa_barrier.cc
@@ -0,0 +1,21 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "nccl_device/impl/lsa_barrier__funcs.h"
+
+NCCL_API(ncclResult_t, ncclLsaBarrierCreateRequirement, ncclTeam_t team, int nBarriers, ncclLsaBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq);
+// Describes the uint32_t buffer needed by nBarriers LSA barriers over team and records nBarriers in the handle.
+ncclResult_t ncclLsaBarrierCreateRequirement(ncclTeam_t team, int nBarriers,
+    ncclLsaBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq) {
+  const int nWords = 3*nBarriers + nBarriers*team.nRanks;  // 3 words per barrier plus one per (barrier, rank) pair
+  memset(outReq, 0, sizeof(*outReq));
+  outReq->bufferSize = nWords*sizeof(uint32_t);
+  outReq->bufferAlign = alignof(uint32_t);
+  outReq->outBufferHandle = &outHandle->bufHandle;
+  outHandle->nBarriers = nBarriers;
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/nccl_device/mem_barrier.cc b/projects/rccl/src/nccl_device/mem_barrier.cc
index b6c400fa4e0..a2153caa991 100644
--- a/projects/rccl/src/nccl_device/mem_barrier.cc
+++ b/projects/rccl/src/nccl_device/mem_barrier.cc
@@ -5,7 +5,7 @@
  ************************************************************************/
 
 #include "core.h"
-#include "nccl_device/impl/mem_barrier__funcs.h"
+#include "nccl_device/impl/lsa_barrier__funcs.h"
 
 NCCL_API(ncclResult_t, ncclLsaBarrierCreateRequirement, ncclTeam_t team, int nBarriers, ncclLsaBarrierHandle_t* outHandle, ncclDevResourceRequirements_t* outReq);
 ncclResult_t ncclLsaBarrierCreateRequirement(
diff --git a/projects/rccl/src/plugin/CMakeLists.txt b/projects/rccl/src/plugin/CMakeLists.txt
index 2ef9282f6dd..bbbf7c0b4f3 100644
--- a/projects/rccl/src/plugin/CMakeLists.txt
+++ b/projects/rccl/src/plugin/CMakeLists.txt
@@ -2,6 +2,7 @@
 add_subdirectory(profiler)
 add_subdirectory(net)
 add_subdirectory(tuner)
+add_subdirectory(env)
 
 # Plugin sources
 set(PLUGIN_SOURCES
@@ -9,9 +10,11 @@ set(PLUGIN_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/profiler.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/plugin_open.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/tuner.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/env.cc
     ${PLUGIN_NET_SOURCES}
     ${PLUGIN_PROFILER_SOURCES}
     ${PLUGIN_TUNER_SOURCES}
+    ${PLUGIN_ENV_SOURCES}
 )
 
 # Add plugin sources to parent scope
diff --git a/projects/rccl/src/plugin/env.cc b/projects/rccl/src/plugin/env.cc
new file mode 100644
index 00000000000..2249bba470c
--- /dev/null
+++ b/projects/rccl/src/plugin/env.cc
@@ -0,0 +1,111 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <errno.h>
+#include <stdlib.h>
+#include <mutex>
+#include <atomic>
+
+#include "checks.h"
+#include "debug.h"
+#include "env.h"
+#include "param.h"
+#include "plugin.h"
+
+extern ncclEnv_t* getNcclEnv_v1(void* lib);
+
+static void* envPluginLib = nullptr;
+static ncclEnv_t* ncclEnvPlugin = nullptr;
+extern ncclEnv_v1_t ncclIntEnv_v1;
+
+#define EXT_ENV_PLUGIN 0
+#define INT_ENV_PLUGIN 1
+#define NUM_ENV_PLUGIN 2
+static ncclEnv_t *ncclEnvPlugins[NUM_ENV_PLUGIN] = { nullptr, &ncclIntEnv_v1 };
+
+enum {
+  envPluginLoadFailed  = -1,
+  envPluginLoadReady   =  0,
+  envPluginLoadSuccess =  1,
+};
+static int envPluginStatus = envPluginLoadReady;
+
+static ncclResult_t ncclEnvPluginLoad(void) {  // Try once to load an external env plugin; the outcome is recorded in envPluginStatus.
+  const char* envName;
+  if (envPluginStatus != envPluginLoadReady) goto exit;  // only a call that sees the initial "ready" status does any work
+
+  if ((envName = getenv("NCCL_ENV_PLUGIN")) != nullptr) {  // read straight from the process environment
+    INFO(NCCL_ENV, "NCCL_ENV_PLUGIN set by environment to %s", envName);
+    if (strcasecmp(envName, "none") == 0) {  // "none" (any letter case) disables the external plugin
+      goto fail;
+    }
+  }
+  envPluginLib = ncclOpenEnvPluginLib(envName);  // envName is nullptr here when NCCL_ENV_PLUGIN is not set
+  if (nullptr == envPluginLib) {
+    goto fail;
+  } else if (ncclPluginLibPaths[ncclPluginTypeEnv]) {
+    envName = ncclPluginLibPaths[ncclPluginTypeEnv];  // use the recorded library path in the log messages below
+  }
+
+  ncclEnvPlugins[EXT_ENV_PLUGIN] = getNcclEnv_v1(envPluginLib);
+  if (nullptr == ncclEnvPlugins[EXT_ENV_PLUGIN]) {
+    INFO(NCCL_INIT, "External env plugin %s is unsupported", envName);
+    goto fail;
+  }
+  INFO(NCCL_INIT, "Successfully loaded external env plugin %s", envName);
+
+  envPluginStatus = envPluginLoadSuccess;
+
+exit:
+  return ncclSuccess;
+fail:
+  // Fallback to internal/default plugin
+  if (envPluginLib) NCCLCHECK(ncclClosePluginLib(envPluginLib, ncclPluginTypeEnv));  // NOTE(review): NCCLCHECK presumably returns here on failure, before the status is marked failed - confirm
+  envPluginLib = nullptr;
+  envPluginStatus = envPluginLoadFailed;
+  goto exit;  // a failed load is not an error for the caller: exit still returns ncclSuccess
+}
+
+// Log the plugin being closed, switch back to the internal plugin, and close the plugin library.
+static ncclResult_t ncclEnvPluginUnload(void) {
+  if (ncclEnvPlugin != nullptr) INFO(NCCL_INIT, "ENV/Plugin: Closing env plugin %s", ncclEnvPlugin->name);
+  if (ncclEnvPlugins[EXT_ENV_PLUGIN] != nullptr) {
+    // An external plugin was registered: drop it and make the internal one active again.
+    ncclEnvPlugins[EXT_ENV_PLUGIN] = nullptr;
+    ncclEnvPlugin = ncclEnvPlugins[INT_ENV_PLUGIN];
+  }
+  NCCLCHECK(ncclClosePluginLib(envPluginLib, ncclPluginTypeEnv));
+  return ncclSuccess;
+}
+
+void ncclEnvPluginFinalize(void);
+
+static bool initialized;
+
+ncclResult_t ncclEnvPluginInit(void) {  // Select the env plugin (external if it loaded, otherwise internal) and initialize it.
+  initEnv();
+  NCCLCHECK(ncclEnvPluginLoad());
+  ncclEnvPlugin = ncclEnvPlugins[(envPluginStatus == envPluginLoadSuccess) ? EXT_ENV_PLUGIN : INT_ENV_PLUGIN];
+  NCCLCHECK(ncclEnvPlugin->init(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH, NCCL_SUFFIX));
+  atexit(ncclEnvPluginFinalize);  // finalize (and unload) the plugin at process exit
+  __atomic_store_n(&initialized, true, __ATOMIC_RELEASE);  // publish completion to ncclEnvPluginInitialized()
+  return ncclSuccess;
+}
+
+// Exit hook registered by ncclEnvPluginInit: finalize the active plugin, then unload it.
+void ncclEnvPluginFinalize(void) {
+  if (ncclEnvPlugin->finalize == nullptr) return;  // no finalize callback: nothing is finalized or unloaded
+  ncclEnvPlugin->finalize();
+  ncclEnvPluginUnload();
+}
+
+const char* ncclEnvPluginGetEnv(const char* name) {  // Look up name through the active env plugin; returns its value or a null pointer.
+  return ncclEnvPlugin ? ncclEnvPlugin->getEnv(name) : getenv(name);  // no plugin selected yet: read the process environment instead of dereferencing nullptr
+}
+
+bool ncclEnvPluginInitialized(void) {  // True once ncclEnvPluginInit has run to completion.
+  return __atomic_load_n(&initialized, __ATOMIC_ACQUIRE);  // acquire load of the flag that ncclEnvPluginInit sets with a release store
+}
diff --git a/projects/rccl/src/plugin/env/CMakeLists.txt b/projects/rccl/src/plugin/env/CMakeLists.txt
new file mode 100644
index 00000000000..07ca7e13d3d
--- /dev/null
+++ b/projects/rccl/src/plugin/env/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Env plugin sources
+set(PLUGIN_ENV_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/env_v1.cc
+)
+
+# Add env plugin sources to parent scope
+set(PLUGIN_ENV_SOURCES ${PLUGIN_ENV_SOURCES} PARENT_SCOPE)
diff --git a/projects/rccl/src/plugin/env/env_v1.cc b/projects/rccl/src/plugin/env/env_v1.cc
new file mode 100644
index 00000000000..fa2b6b1b294
--- /dev/null
+++ b/projects/rccl/src/plugin/env/env_v1.cc
@@ -0,0 +1,40 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <stdlib.h>
+#include <dlfcn.h>
+#include "debug.h"
+#include "nccl_env.h"
+
+static ncclEnv_v1_t* ncclEnv_v1;
+
+// Resolve the "ncclEnvPlugin_v1" symbol in lib; returns the plugin table, or nullptr if the symbol is absent.
+ncclEnv_t* getNcclEnv_v1(void* lib) {
+  ncclEnv_v1_t* found = (ncclEnv_v1_t*)dlsym(lib, "ncclEnvPlugin_v1");
+  ncclEnv_v1 = found;  // the file-level pointer is updated even when the lookup fails
+  if (found == nullptr) return nullptr;
+  INFO(NCCL_INIT|NCCL_ENV, "ENV/Plugin: Using %s (v1)", found->name);
+  return found;
+}
+
+static ncclResult_t ncclEnvInit(uint8_t ncclMajor, uint8_t ncclMinor, uint8_t ncclPatch, const char* suffix) {  // Built-in plugin init: nothing to set up, all arguments unused.
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclEnvFinalize(void) {  // Built-in plugin finalize: nothing to tear down.
+  return ncclSuccess;
+}
+
+static const char* ncclEnvGetEnv(const char* name) {  // Built-in plugin lookup: plain getenv on the process environment.
+  return getenv(name);
+}
+
+ncclEnv_v1_t ncclIntEnv_v1 = {  // v1 function table of the built-in env plugin, wired to the three callbacks above
+  .name = "ncclEnvDefault",
+  .init = ncclEnvInit,
+  .finalize = ncclEnvFinalize,
+  .getEnv = ncclEnvGetEnv,
+};
diff --git a/projects/rccl/src/plugin/net.cc b/projects/rccl/src/plugin/net.cc
index 81258ecada2..c131d94b5c9 100644
--- a/projects/rccl/src/plugin/net.cc
+++ b/projects/rccl/src/plugin/net.cc
@@ -19,6 +19,7 @@
 
 typedef ncclNet_t* getNcclNet_t(void* netPluginLib);
 typedef ncclCollNet_t* getNcclCollNet_t(void* netPluginLib);
+typedef ncclGin_t* getNcclGin_t(void* netPluginLib);
 
 extern getNcclNet_t getNcclNet_v6;
 extern getNcclNet_t getNcclNet_v7;
@@ -32,14 +33,15 @@ extern getNcclCollNet_t getNcclCollNet_v8;
 extern getNcclCollNet_t getNcclCollNet_v9;
 extern getNcclCollNet_t getNcclCollNet_v10;
 extern getNcclCollNet_t getNcclCollNet_v11;
-
 extern int64_t rcclParamAinicRoce();
-
+extern getNcclGin_t getNcclGin_v11;
 NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 0);
 #define NCCL_NET_VERSION_COUNT 6
 int ncclNetVersion[NCCL_NET_VERSION_COUNT] = {11, 10, 9, 8, 7, 6};
 getNcclNet_t* getNcclNet[NCCL_NET_VERSION_COUNT] = {getNcclNet_v11, getNcclNet_v10, getNcclNet_v9, getNcclNet_v8, getNcclNet_v7, getNcclNet_v6};
 getNcclCollNet_t* getNcclCollNet[NCCL_NET_VERSION_COUNT] = {getNcclCollNet_v11, getNcclCollNet_v10, getNcclCollNet_v9, getNcclCollNet_v8, getNcclCollNet_v7, getNcclCollNet_v6};
+#define NCCL_GIN_VERSION_COUNT 1
+getNcclGin_t* getNcclGin[NCCL_GIN_VERSION_COUNT] = {getNcclGin_v11};
 
 #define NCCL_NET_NUM_INTERNAL_PLUGINS 2
 
@@ -60,6 +62,8 @@ typedef struct netPluginLib {
   ncclCollNet_t* ncclCollNet;                   // Pointer to the ncclCollNet_t structure
   ncclNetPluginState_t ncclNetPluginState;      // State of the nccl net plugin
   ncclNetPluginState_t ncclCollNetPluginState;  // State of the nccl coll net plugin
+  ncclGin_t* ncclGin;                           // Pointer to the ncclGin_t structure
+  ncclNetPluginState_t ncclGinPluginState;      // State of the nccl gin plugin
   int ncclNetPluginRefCount;                    // Reference count for the nccl net plugin
   int netPhysDevs;                              // ncclNet - number of physical devices
   int netVirtDevs;                              // ncclNet - number of virtual devices
@@ -117,6 +121,17 @@ static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) {
   else
     pluginLib->ncclCollNetPluginState = ncclNetPluginStateInitReady;
 
+  // load gin
+  for (int i = 0; i < NCCL_GIN_VERSION_COUNT; i++) {
+    pluginLib->ncclGin = getNcclGin[i](pluginLib->dlHandle);
+    if (pluginLib->ncclGin) break;
+  }
+
+  if (pluginLib->ncclGin == nullptr)
+    pluginLib->ncclGinPluginState = ncclNetPluginStateLoadFailed;
+  else
+    pluginLib->ncclGinPluginState = ncclNetPluginStateInitReady;
+
   INFO(NCCL_INIT|NCCL_NET, "Successfully loaded external network plugin %s",
        (ncclPluginLibPaths[ncclPluginTypeNet] ? ncclPluginLibPaths[ncclPluginTypeNet] : pluginLib->name));
 exit:
@@ -157,10 +172,14 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in
 
 static ncclResult_t ncclNetPluginInit(struct ncclComm* comm, netPluginLib_t* pluginLib) {
   int ndev;
+  // Init must be called for each new comm to set the right context
   if (pluginLib->ncclNetPluginState >= ncclNetPluginStateInitReady && pluginLib->ncclNet) {
     ncclNetCommConfig_t commConfig = {};
     commConfig.trafficClass = comm->config.trafficClass == NCCL_CONFIG_UNDEF_INT ? NCCL_NET_TRAFFIC_CLASS_UNDEF : comm->config.trafficClass;
     if (pluginLib->ncclNet->init(&comm->netContext, comm->commHash, &commConfig, ncclDebugLog, ncclProfilerCallback) != ncclSuccess) goto fail;
+  }
+  // Detection of the devices is only done when the plugin is being initialized the first time
+  if (pluginLib->ncclNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclNet) {
     if (pluginLib->ncclNet->devices(&ndev) != ncclSuccess || ndev <= 0) goto fail;
     pluginLib->netPhysDevs = ndev;
     pluginLib->netVirtDevs = NCCL_UNDEF_DEV_COUNT;
@@ -168,15 +187,39 @@ static ncclResult_t ncclNetPluginInit(struct ncclComm* comm, netPluginLib_t* plu
   pluginLib->ncclNetPluginState = ncclNetPluginStateEnabled;
   INFO(NCCL_INIT|NCCL_NET, "Initialized NET plugin %s", pluginLib->ncclNet->name);
 
+  // Init must be called for each new comm to set the right context
   if (pluginLib->ncclCollNetPluginState >= ncclNetPluginStateInitReady && pluginLib->ncclCollNet) {
     if (pluginLib->ncclCollNet->init(&comm->collNetContext, comm->commHash, ncclDebugLog) != ncclSuccess) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled;
-    else if (pluginLib->ncclCollNet->devices(&ndev) != ncclSuccess || ndev <= 0) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled;
+  }
+  // Detection of the devices is only done when the plugin is being initialized the first time
+  if (pluginLib->ncclCollNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclCollNet) {
+    if (pluginLib->ncclCollNet->devices(&ndev) != ncclSuccess || ndev <= 0) pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled;
     else {
       pluginLib->collNetPhysDevs = ndev;
       pluginLib->collNetVirtDevs = NCCL_UNDEF_DEV_COUNT;
       pluginLib->ncclCollNetPluginState = ncclNetPluginStateEnabled;
     }
   }
+
+  if (pluginLib->ncclGinPluginState == ncclNetPluginStateInitReady && pluginLib->ncclGin) {
+    if ((ncclParamGinType() == -1) && (pluginLib->ncclGin == (ncclGin_t *)-1)) {
+      void* throwAwayContext = nullptr;
+      // Default to the proxy implementation so the (ncclGin_t *)-1 sentinel is never dereferenced below,
+      // including the case where GDAKI initializes but reports no usable device.
+      pluginLib->ncclGin = &ncclGinIbProxy;
+      if (ncclGinIbGdaki.init(&throwAwayContext, comm->commHash, ncclDebugLog) == ncclSuccess) {
+        if (ncclGinIbGdaki.devices(&ndev) == ncclSuccess && ndev > 0) {
+          pluginLib->ncclGin = &ncclGinIbGdaki;
+        }
+        ncclGinIbGdaki.finalize(throwAwayContext);
+      }
+    }
+    if (pluginLib->ncclGin->init(&comm->ginContext, comm->commHash, ncclDebugLog) != ncclSuccess) pluginLib->ncclGinPluginState = ncclNetPluginStateDisabled;
+    else if (pluginLib->ncclGin->devices(&ndev) != ncclSuccess || ndev <= 0) pluginLib->ncclGinPluginState = ncclNetPluginStateDisabled;
+    else {
+      pluginLib->ncclGinPluginState = ncclNetPluginStateEnabled;
+    }
+  }
 exit:
   return ncclSuccess;
 fail:
@@ -186,12 +229,11 @@ static ncclResult_t ncclNetPluginInit(struct ncclComm* comm, netPluginLib_t* plu
   pluginLib->collNetPhysDevs = pluginLib->collNetVirtDevs = NCCL_UNDEF_DEV_COUNT;
   pluginLib->ncclNetPluginState = ncclNetPluginStateDisabled;
   pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled;
+  pluginLib->ncclGinPluginState = ncclNetPluginStateDisabled;
   goto exit;
 }
 
 static ncclResult_t ncclNetPluginAssignToComm(struct ncclComm* comm, int pluginIndex, bool* isAssigned) {
-  const char* netName = comm->config.netName;
-  if (netName && strcasecmp(netName, netPluginLibs[pluginIndex].ncclNet->name) != 0) goto fail;
   if (ncclSuccess != ncclNetCheckDeviceVersion(comm, netPluginLibs[pluginIndex].ncclNet, 0)) goto fail;
 
   if (netPluginLibs[pluginIndex].ncclNetPluginState >= ncclNetPluginStateEnabled) {
@@ -204,6 +246,10 @@ static ncclResult_t ncclNetPluginAssignToComm(struct ncclComm* comm, int pluginI
     if (netPluginLibs[pluginIndex].ncclCollNetPluginState >= ncclNetPluginStateEnabled) {
       comm->ncclCollNet = netPluginLibs[pluginIndex].ncclCollNet;
     }
+    if (netPluginLibs[pluginIndex].ncclGinPluginState >= ncclNetPluginStateEnabled) {
+      INFO(NCCL_INIT|NCCL_NET, "Assigned GIN plugin %s to comm", netPluginLibs[pluginIndex].ncclGin->name);
+      comm->sharedRes->ginState.ncclGin = netPluginLibs[pluginIndex].ncclGin;
+    }
   }
 exit:
   return ncclSuccess;
@@ -211,6 +257,7 @@ static ncclResult_t ncclNetPluginAssignToComm(struct ncclComm* comm, int pluginI
   *isAssigned = false;
   netPluginLibs[pluginIndex].ncclNetPluginState = ncclNetPluginStateEnabled;
   netPluginLibs[pluginIndex].ncclCollNetPluginState = ncclNetPluginStateEnabled;
+  netPluginLibs[pluginIndex].ncclGinPluginState = ncclNetPluginStateEnabled;
   goto exit;
 }
 
@@ -297,6 +344,17 @@ static void initPluginLibsOnceFunc() {
   pluginCount = pluginCounter;
 }
 
+// Finalize comm's net/collNet/gin contexts on plugin pluginIndex, drop one reference, and unload it unless it is an internal plugin.
+static ncclResult_t ncclNetPluginFinalize(struct ncclComm* comm, int pluginIndex) {
+  netPluginLib_t* lib = &netPluginLibs[pluginIndex];
+  NCCLCHECK(lib->ncclNet->finalize(comm->netContext));
+  if (lib->ncclCollNet && lib->ncclCollNetPluginState == ncclNetPluginStateEnabled) NCCLCHECK(lib->ncclCollNet->finalize(comm->collNetContext));
+  if (lib->ncclGin && lib->ncclGinPluginState == ncclNetPluginStateEnabled) NCCLCHECK(lib->ncclGin->finalize(comm->ginContext));
+  lib->ncclNetPluginRefCount--;
+  if (pluginIndex < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) NCCLCHECK(ncclNetPluginUnload(lib));
+  return ncclSuccess;
+}
+
 ncclResult_t ncclNetInit(struct ncclComm* comm) {
   bool ncclNetPluginInitialized = false;
   std::call_once(initPluginLibsOnceFlag, initPluginLibsOnceFunc);
@@ -305,17 +363,22 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) {
     if ((pluginIndex < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) && (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateLoadReady)) {
       NCCLCHECK(ncclNetPluginLoad(&netPluginLibs[pluginIndex]));
     }
-    if (netPluginLibs[pluginIndex].ncclNetPluginState >= ncclNetPluginStateInitReady) {
+    if ((netPluginLibs[pluginIndex].ncclNetPluginState >= ncclNetPluginStateInitReady)
+        && (!comm->config.netName || (strcasecmp(comm->config.netName, netPluginLibs[pluginIndex].ncclNet->name) == 0))) {
+      // plugin init must be done by all comms to setup the context, therefore we use ">="
       NCCLCHECK(ncclNetPluginInit(comm, &netPluginLibs[pluginIndex]));
-    }
-    if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateEnabled) {
-      bool isAssigned = false;
-      NCCLCHECK(ncclNetPluginAssignToComm(comm, pluginIndex, &isAssigned));
-      if (isAssigned) {
-        // If one external plugin is assigned to a comm, then disable all other external plugins
-        ncclNetPluginDisableOtherExternal(pluginIndex);
-        ncclNetPluginInitialized = true;
-        break;
+      if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateEnabled) {
+        bool isAssigned = false;
+        NCCLCHECK(ncclNetPluginAssignToComm(comm, pluginIndex, &isAssigned));
+        if (isAssigned) {
+          // If one external plugin is assigned to a comm, then disable all other external plugins
+          ncclNetPluginDisableOtherExternal(pluginIndex);
+          ncclNetPluginInitialized = true;
+          break;
+        }
+        else {
+          ncclNetPluginFinalize(comm, pluginIndex);
+        }
       }
     }
   }
@@ -324,15 +387,28 @@ ncclResult_t ncclNetInit(struct ncclComm* comm) {
   return ncclInvalidUsage;
 }
 
+ncclResult_t ncclNetInitFromParent(struct ncclComm* comm, struct ncclComm* parent) { // Copies the parent's net state into comm; returns ncclInvalidUsage on a netName mismatch.
+  ncclResult_t ret = ncclSuccess;
+  comm->netContext = parent->netContext;
+  comm->collNetContext = parent->collNetContext;
+  comm->ginContext = parent->ginContext;
+  comm->ncclNet = parent->ncclNet;
+  comm->ncclCollNet = parent->ncclCollNet;
+  comm->netPluginIndex = parent->netPluginIndex;
+  if (comm->config.netName != NCCL_CONFIG_UNDEF_PTR && (parent->config.netName == NCCL_CONFIG_UNDEF_PTR || strcasecmp(comm->config.netName, parent->config.netName))) { // an undefined parent netName is a mismatch and must never reach strcasecmp
+    WARN("Comm config netName (%s) does not match the parent (%s)", comm->config.netName, parent->config.netName != NCCL_CONFIG_UNDEF_PTR ? parent->config.netName : "undefined");
+    ret = ncclInvalidUsage;
+  }
+  if (comm->config.trafficClass != NCCL_CONFIG_UNDEF_INT && comm->config.trafficClass != parent->config.trafficClass) { // a trafficClass mismatch is only logged, it does not change ret
+    INFO(NCCL_INIT, "Comm config trafficClass (%d) does not match the parent (%d)", comm->config.trafficClass, parent->config.trafficClass);
+  }
+  return ret;
+}
+
 ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
   int pluginIndex = comm->netPluginIndex;
   std::lock_guard<std::mutex> lock(netPluginMutex);
-  NCCLCHECK(comm->ncclNet->finalize(comm->netContext));
-  if (comm->collNetContext) NCCLCHECK(comm->ncclCollNet->finalize(comm->collNetContext));
-  netPluginLibs[pluginIndex].ncclNetPluginRefCount--;
-  for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) {
-    NCCLCHECK(ncclNetPluginUnload(&netPluginLibs[i]));
-  }
+  NCCLCHECK(ncclNetPluginFinalize(comm, pluginIndex)); // called with netPluginMutex held; presumably replaces the inline finalize/refcount/unload removed above -- confirm
   return ncclSuccess;
 }
 
@@ -417,8 +493,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
     char* gpuPtr = NULL;
     void* mHandle = NULL;
     ncclResult_t ret;
-    ncclDebugNoWarn = NCCL_NET;
-    NCCLCHECKGOTO(comm->ncclNet->listen(comm->netContext, dev, &handle, &lComm), ret, cleanup1);
+    NCCLCHECKGOTONOWARN(comm->ncclNet->listen(comm->netContext, dev, &handle, &lComm), ret, cleanup1, NCCL_NET);
 
     bool connected;
     connected = false;
@@ -430,22 +505,22 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
       }
 
       if (sComm == NULL)
-        NCCLCHECKGOTO(comm->ncclNet->connect(comm->netContext, dev, &handle, &sComm, NULL), ret, cleanup2);
+        NCCLCHECKGOTONOWARN(comm->ncclNet->connect(comm->netContext, dev, &handle, &sComm, NULL), ret, cleanup2, NCCL_NET);
 
       if (rComm == NULL)
-        NCCLCHECKGOTO(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2);
+        NCCLCHECKGOTONOWARN(comm->ncclNet->accept(lComm, &rComm, NULL), ret, cleanup2, NCCL_NET);
 
       connected = (rComm != NULL) && (sComm != NULL);
     }
 
-    NCCLCHECKGOTO(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2);
-    if (comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
-      NCCLCHECK(comm->ncclNet->deregMr(sComm, mHandle));
-      NCCLCHECK(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
-      NCCLCHECK(comm->ncclNet->deregMr(rComm, mHandle));
+    NCCLCHECKGOTONOWARN(ncclCudaMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup2, NCCL_NET);
+    NOWARN(ret = comm->ncclNet->regMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), NCCL_NET);
+    if (ret == ncclSuccess) {
+      NCCLCHECKNOWARN(comm->ncclNet->deregMr(sComm, mHandle), NCCL_NET);
+      NCCLCHECKNOWARN(comm->ncclNet->regMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle), NCCL_NET);
+      NCCLCHECKNOWARN(comm->ncclNet->deregMr(rComm, mHandle), NCCL_NET);
       gdrSupportMatrix[comm->cudaDev] = 1;
     }
-    ncclDebugNoWarn = 0;
     NCCLCHECK(ncclCudaFree(gpuPtr));
 cleanup2:
     if (rComm != NULL)
diff --git a/projects/rccl/src/plugin/net/net_v10.cc b/projects/rccl/src/plugin/net/net_v10.cc
index 591a57ac081..217d6407d8a 100644
--- a/projects/rccl/src/plugin/net/net_v10.cc
+++ b/projects/rccl/src/plugin/net/net_v10.cc
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "nccl_net.h"
-#include "net_device.h"
 #include "proxy.h"
 #include "checks.h"
 #include <dlfcn.h>
@@ -192,7 +191,7 @@ static ncclResult_t ncclCollNet_init(void** ctx __attribute__((unused)),
   ncclCollNet.test = ncclCollNet_v10->test;
   ncclCollNet.closeColl = ncclCollNet_v10->closeColl;
   ncclCollNet.closeListen = ncclCollNet_v10->closeListen;
-  ncclCollNet.makeVDevice = ncclCollNet_makeVDevice;
+  ncclCollNet.makeVDevice = (ncclCollNet_v10->makeVDevice) ? ncclCollNet_makeVDevice : nullptr;
   ncclCollNet.finalize = ncclCollNet_finalize;
   return ncclSuccess;
 }
diff --git a/projects/rccl/src/plugin/net/net_v11.cc b/projects/rccl/src/plugin/net/net_v11.cc
index b13a0efb9d4..a88db8bdeb4 100644
--- a/projects/rccl/src/plugin/net/net_v11.cc
+++ b/projects/rccl/src/plugin/net/net_v11.cc
@@ -5,12 +5,12 @@
  ************************************************************************/
 
 #include "nccl_net.h"
-#include "net_device.h"
 #include "proxy.h"
 #include <dlfcn.h>
 
 static ncclNet_v11_t* ncclNet_v11;
 static ncclCollNet_v11_t* ncclCollNet_v11;
+static ncclGin_v11_t* ncclGin_v11;
 
 ncclNet_t* getNcclNet_v11(void* lib) {
   ncclNet_v11 = (ncclNet_v11_t*)dlsym(lib, "ncclNetPlugin_v11");
@@ -29,3 +29,12 @@ ncclCollNet_t* getNcclCollNet_v11(void* lib) {
   }
   return nullptr;
 }
+
+ncclGin_t* getNcclGin_v11(void* lib) { // Looks up the "ncclGinPlugin_v11" symbol in lib; returns it, or nullptr when the symbol is absent.
+  ncclGin_v11 = (ncclGin_v11_t*)dlsym(lib, "ncclGinPlugin_v11"); // result is cached in the file-level static and overwritten on every call
+  if (ncclGin_v11) {
+    INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Loaded gin plugin %s (v11)", ncclGin_v11->name);
+    return ncclGin_v11; // the plugin's own struct is handed back directly, without a wrapper
+  }
+  return nullptr;
+}
diff --git a/projects/rccl/src/plugin/net/net_v6.cc b/projects/rccl/src/plugin/net/net_v6.cc
index 73eb8614d33..6cf40d4e510 100644
--- a/projects/rccl/src/plugin/net/net_v6.cc
+++ b/projects/rccl/src/plugin/net/net_v6.cc
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "nccl_net.h"
-#include "net_device.h"
 #include "proxy.h"
 #include "checks.h"
 #include <dlfcn.h>
diff --git a/projects/rccl/src/plugin/net/net_v7.cc b/projects/rccl/src/plugin/net/net_v7.cc
index a1371729435..8121273dcb8 100644
--- a/projects/rccl/src/plugin/net/net_v7.cc
+++ b/projects/rccl/src/plugin/net/net_v7.cc
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "nccl_net.h"
-#include "net_device.h"
 #include "proxy.h"
 #include "checks.h"
 #include <dlfcn.h>
diff --git a/projects/rccl/src/plugin/net/net_v8.cc b/projects/rccl/src/plugin/net/net_v8.cc
index d241d5dc5d7..3b1aaa58ea2 100644
--- a/projects/rccl/src/plugin/net/net_v8.cc
+++ b/projects/rccl/src/plugin/net/net_v8.cc
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "nccl_net.h"
-#include "net_device.h"
 #include "proxy.h"
 #include "checks.h"
 #include <dlfcn.h>
diff --git a/projects/rccl/src/plugin/net/net_v9.cc b/projects/rccl/src/plugin/net/net_v9.cc
index 12011aa8c2e..b32d6f4fdf5 100644
--- a/projects/rccl/src/plugin/net/net_v9.cc
+++ b/projects/rccl/src/plugin/net/net_v9.cc
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "nccl_net.h"
-#include "net_device.h"
 #include "proxy.h"
 #include "checks.h"
 #include <dlfcn.h>
diff --git a/projects/rccl/src/plugin/plugin_open.cc b/projects/rccl/src/plugin/plugin_open.cc
index 96960bb54a6..312be2fabcc 100644
--- a/projects/rccl/src/plugin/plugin_open.cc
+++ b/projects/rccl/src/plugin/plugin_open.cc
@@ -15,14 +15,14 @@
 
 #define MAX_STR_LEN 255
 
-#define NUM_LIBS 3
+#define NUM_LIBS 4
 static char* libNames[NUM_LIBS];
 char* ncclPluginLibPaths[NUM_LIBS];
 static void *libHandles[NUM_LIBS];
-static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" };
-static const char *pluginPrefix[NUM_LIBS] = { "librccl-net", "libnccl-tuner", "librccl-profiler" };
-static const char *pluginFallback[NUM_LIBS] = { "", "Using internal tuner plugin.", "" };
-static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT };
+static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER", "ENV" }; // one entry per plugin kind; the three tables below use the same order
+static const char *pluginPrefix[NUM_LIBS] = { "librccl-net", "libnccl-tuner", "librccl-profiler", "libnccl-env" }; // NET and PROFILER keep the librccl-* prefixes this file used before the sync
+static const char *pluginFallback[NUM_LIBS] = { "", "Using internal tuner plugin.", "", "" };
+static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT, NCCL_INIT|NCCL_ENV };
 
 static void* tryOpenLib(char* name, int* err, char* errStr) {
   *err = 0;
@@ -124,6 +124,10 @@ void* ncclOpenProfilerPluginLib(const char* name) {
   return openPluginLib(ncclPluginTypeProfiler, name);
 }
 
+void* ncclOpenEnvPluginLib(const char* name) { // ENV counterpart of ncclOpenProfilerPluginLib: forwards name to openPluginLib
+  return openPluginLib(ncclPluginTypeEnv, name);
+}
+
 void* ncclGetNetPluginLib(enum ncclPluginType type) {
   if (libNames[ncclPluginTypeNet]) {
     // increment the reference counter of the net library
diff --git a/projects/rccl/src/proxy.cc b/projects/rccl/src/proxy.cc
index 5d9fe0b3d45..755751102b9 100644
--- a/projects/rccl/src/proxy.cc
+++ b/projects/rccl/src/proxy.cc
@@ -521,10 +521,12 @@ static ncclResult_t ncclLocalOpAppend(struct ncclComm* comm, struct ncclProxyCon
     op = pool->ops+opIndex;
     proxyOps->freeOp = op->next;
   } else {
-    int freeOp;
-    while ((freeOp = pool->freeOps[tpLocalRank]) == -1) sched_yield();
-    int freeOpNew;
-    while ((freeOpNew = __sync_val_compare_and_swap(pool->freeOps+tpLocalRank, freeOp, -1)) != freeOp) freeOp = freeOpNew;
+    // Read the freeOps value and wait for a value different than -1. Once not -1, read the value with acquire and reset -1
+    int freeOp = -1;
+    while (freeOp == -1) {
+      freeOp = __atomic_exchange_n(&pool->freeOps[tpLocalRank], -1, __ATOMIC_ACQUIRE);
+      if (freeOp == -1) sched_yield();
+    }
     opIndex = freeOp;
     op = pool->ops+opIndex;
     proxyOps->freeOp = op->next;
@@ -887,26 +889,17 @@ static ncclResult_t ncclProxyGetPostedOps(struct ncclProxyState* proxyState, int
 
   for (int i = 0; i < proxyState->tpLocalnRanks; i++) {
     if (freeOp[i] == -1) continue;
-    int newFree = freeOp[i];
-    int oldFree = pool->freeOps[i];
-    // Coverity gets confused by the complex code structure here.  The previous "for" loop ensures that freeOpEnd[i]
-    // is initialized so long as freeOp[i] is initialized (is not -1).  In the current loop we filter out uninitialized
-    // freeOp[i], hence ensuring that freeOpEnd[i] is also initialized.
-    // coverity[uninit_use:FALSE]
-    pool->ops[freeOpEnd[i]].next = oldFree;
-    if (oldFree == -1) {
-      // Nothing for the main thread to consume, we can set it.
-      pool->freeOps[i] = newFree;
-    } else {
-      // The main thread may recycle free ops at any time, replace the freeOps value atomically and check it worked.
-      int swap = __sync_val_compare_and_swap(pool->freeOps+i, oldFree, newFree);
-      if (swap != oldFree) {
-        if (swap != -1) return ncclInternalError;
-        // Ops were recycled while we were trying to swap, just set the value directly now.
-        pool->ops[freeOpEnd[i]].next = -1;
-        pool->freeOps[i] = newFree;
-      }
-    }
+    int oldFree = -1, swap = -1, newFree = freeOp[i];
+    // Prepend the chain freeOp[i]..freeOpEnd[i] to the list headed by pool->freeOps[i], retrying whenever the head changes underneath us.
+    oldFree = __atomic_load_n(&pool->freeOps[i], __ATOMIC_ACQUIRE);
+    do {
+      // Coverity gets confused by the complex code structure here.  The previous "for" loop ensures that freeOpEnd[i]
+      // is initialized so long as freeOp[i] is initialized (is not -1).  In the current loop we filter out uninitialized
+      // freeOp[i], hence ensuring that freeOpEnd[i] is also initialized.
+      // coverity[uninit_use:FALSE]
+      pool->ops[freeOpEnd[i]].next = swap = oldFree;
+      __atomic_compare_exchange_n(&pool->freeOps[i], &oldFree, newFree, /*weak=*/false, /*success=*/__ATOMIC_RELEASE, /*failure=*/__ATOMIC_ACQUIRE);
+    } while (swap != oldFree); // needs a strong CAS: a spurious weak failure leaves oldFree == swap and would end the loop without publishing newFree
   }
   ncclProfilerRecordProxyCtrlEventState(eHandle, *added, ncclProfilerProxyCtrlAppendEnd);
   ncclProfilerStopProxyCtrlEvent(eHandle);
@@ -1967,6 +1960,7 @@ ncclResult_t ncclProxyCreate(struct ncclComm* comm) {
     proxyState->dmaBufSupport = comm->dmaBufSupport;
     proxyState->ncclNet = comm->ncclNet;
     proxyState->ncclCollNet = comm->ncclCollNet;
+    proxyState->ginState = &comm->sharedRes->ginState;
     proxyState->netContext = comm->netContext;
     proxyState->collNetContext = comm->collNetContext;
     proxyState->profilerContext = comm->profilerContext;
diff --git a/projects/rccl/src/ras/client.cc b/projects/rccl/src/ras/client.cc
index 56937b139b3..795ff041ed8 100644
--- a/projects/rccl/src/ras/client.cc
+++ b/projects/rccl/src/ras/client.cc
@@ -28,6 +28,7 @@ static const char* hostName = "localhost";
 static const char* port = STR(NCCL_RAS_CLIENT_PORT);
 static int timeout = -1;
 static bool verbose = false;
+static const char* format = "text";
 static int sock = -1;
 
 static void printUsage(const char* argv0) {
@@ -35,6 +36,7 @@ static void printUsage(const char* argv0) {
           "Usage: %s [OPTION]...\n"
           "Query the state of a running NCCL job.\n"
           "\nOptions:\n"
+          "  -f, --format=FMT    Output format: text or json (text by default)\n"
           "  -h, --host=HOST     Host name or IP address of the RAS client socket of the\n"
           "                      NCCL job to connect to (localhost by default)\n"
           "  -p, --port=PORT     TCP port of the RAS client socket of the NCCL job\n"
@@ -51,17 +53,25 @@ static void parseArgs(int argc, char** argv) {
   int c;
   int optIdx = 0;
   struct option longOpts[] = {
+    {"format",  required_argument, NULL, 'f'},
+    {"help",    no_argument,       NULL, 'e'},
     {"host",    required_argument, NULL, 'h'},
     {"port",    required_argument, NULL, 'p'},
     {"timeout", required_argument, NULL, 't'},
     {"verbose", no_argument,       NULL, 'v'},
-    {"help",    no_argument,       NULL, 'e'},
     {"version", no_argument,       NULL, 'r'},
     {0}
   };
 
-  while ((c = getopt_long(argc, argv, "h:p:t:v", longOpts, &optIdx)) != -1) {
+  while ((c = getopt_long(argc, argv, "f:h:p:t:v", longOpts, &optIdx)) != -1) {
     switch (c) {
+      case 'f':
+        format = optarg;
+        if (strcasecmp(format, "text") != 0 && strcasecmp(format, "json") != 0) {
+          fprintf(stderr, "Invalid format: %s (must be text or json)\n", format);
+          exit(1);
+        }
+        break;
       case 'h':
         hostName = optarg;
         break;
@@ -265,9 +275,51 @@ static int connectToNCCL() {
   goto retry;
 }
 
+static int setOutputFormat() { // Sends the selected output format to the server; returns 0 on success, 1 after printing an error.
+  char msgBuf[4096];
+  int bytes;
+
+  // Only set format if it's not the default; for "text" nothing is sent and 0 is returned.
+  if (strcasecmp(format, "text") != 0) {
+    snprintf(msgBuf, sizeof(msgBuf), "SET FORMAT %s\n", format);
+    if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) {
+      if (errno == EAGAIN || errno == EWOULDBLOCK)
+        fprintf(stderr, "Connection timed out\n");
+      else
+        perror("write to socket");
+      return 1;
+    }
+    // Read response; anything other than "OK\n" (compared case-insensitively) is reported as a failure.
+    bytes = rasRead(sock, msgBuf, sizeof(msgBuf));
+    if (bytes < 0) {
+      if (errno == EAGAIN || errno == EWOULDBLOCK)
+        fprintf(stderr, "Connection timed out\n");
+      else
+        perror("read socket");
+      return 1;
+    }
+    if (bytes == 0) {
+      fprintf(stderr, "NCCL unexpectedly closed the connection\n");
+      return 1;
+    }
+    if (strcasecmp(msgBuf, "OK\n")) { // NOTE(review): assumes rasRead NUL-terminates msgBuf -- confirm
+      fprintf(stderr, "Unexpected response from NCCL: %s\n", msgBuf);
+      return 1;
+    }
+  }
+  return 0;
+}
+
 int getNCCLStatus() {
   char msgBuf[4096];
   int bytes;
+
+  // Set the output format.
+  if (setOutputFormat() != 0) {
+    return 1;
+  }
+
+  // Send the status command.
   snprintf(msgBuf, sizeof(msgBuf), "%sSTATUS\n", (verbose ? "VERBOSE " : ""));
   if (socketWrite(sock, msgBuf, strlen(msgBuf)) != strlen(msgBuf)) {
     if (errno == EAGAIN || errno == EWOULDBLOCK)
diff --git a/projects/rccl/src/ras/client_support.cc b/projects/rccl/src/ras/client_support.cc
index 85d2230eb65..746a8f2a88c 100644
--- a/projects/rccl/src/ras/client_support.cc
+++ b/projects/rccl/src/ras/client_support.cc
@@ -106,6 +106,9 @@ static char lineBuf[1024]; // Temporary buffer used for printing at most 10 (RAS
                            // Still, 1024 should normally be plenty (verbose output may make things more difficult,
                            // but we do check for overflows, so it will just be trimmed).
 
+// CUDA version information - shared across functions.
+static int cudaDriverVersion = -1, cudaRuntimeVersion = -1;
+
 
 static ncclResult_t getNewClientEntry(struct rasClient** pClient);
 static void rasClientEnqueueMsg(struct rasClient* client, char* msg, size_t msgLen);
@@ -135,6 +138,22 @@ static const char* ncclErrorToString(ncclResult_t err);
 static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* buf, size_t size);
 static bool rasCountIsOutlier(int count, bool verbose, int totalCount = -1);
 
+// JSON output support functions.
+static void rasDumpCommsToJSON(struct rasClient* client, struct rasCollComms* commsData,
+                               struct rasCollective* coll, const int* peerIdxConv);
+static void jsonWriteHeader(const char* ncclVersion, int cudaRuntime, int cudaDriver,
+                            const char* timestamp, int commsCount);
+static void jsonStartCommunicator(unsigned long commHash, unsigned long hostHash, unsigned long pidHash,
+                                  int commSize, int ranksCount, int missingCount, bool firstComm);
+static void jsonWriteRankData(int rank, const char* host, int pid, int cudaDev, int nvmlDev,
+                              int initState, int asyncError, bool finalizeCalled, bool destroyFlag,
+                              bool abortFlag, const unsigned long* collCounts, bool firstRank);
+static void jsonStartMissingRanks();
+static void jsonWriteMissingRank(int rank, const char* host, int pid, int cudaDev, int nvmlDev,
+                                 bool unresponsive, bool dead, bool firstMissing);
+static void jsonEndCommunicator();
+static void jsonWriteFooter(double collectionTime, int timeoutsCount);
+
 
 ///////////////////////////////////
 // General rasClients functions. //
@@ -207,6 +226,7 @@ static ncclResult_t getNewClientEntry(struct rasClient** pClient) {
   client->sock = client->pfd = -1;
   ncclIntruQueueConstruct(&client->sendQ);
   client->timeout =  RAS_COLLECTIVE_LEG_TIMEOUT;
+  client->outputFormat = RAS_OUTPUT_TEXT;  // Initialize to default TEXT format.
 
   if (rasClientsHead) {
     rasClientsTail->next = client;
@@ -359,6 +379,24 @@ void rasClientEventLoop(struct rasClient* client, int pollIdx) {
         // We don't copy the terminating '\0', hence memcpy rather than strcpy.
         memcpy(msg, rasLine, msgLen);
         rasClientEnqueueMsg(client, msg, msgLen);
+      } else if (strncasecmp(cmd, "set format ", strlen("set format ")) == 0) {
+        char* format = cmd + strlen("set format ");
+        if (strcasecmp(format, "text") == 0) {
+          client->outputFormat = RAS_OUTPUT_TEXT;
+          strcpy(rasLine, "OK\n");
+        } else if (strcasecmp(format, "json") == 0) {
+          client->outputFormat = RAS_OUTPUT_JSON;
+          strcpy(rasLine, "OK\n");
+        } else {
+          snprintf(rasLine, sizeof(rasLine), "ERROR: Invalid format %s\n", format);
+        }
+        msgLen = strlen(rasLine);
+        if (rasClientAllocMsg(&msg, msgLen) != ncclSuccess) {
+          rasClientTerminate(client);
+          return;
+        }
+        memcpy(msg, rasLine, msgLen);
+        rasClientEnqueueMsg(client, msg, msgLen);
       } else if (strcasecmp(cmd, "status") == 0) {
         client->status = RAS_CLIENT_INIT;
         (void)rasClientRun(client);
@@ -552,15 +590,17 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) {
         consistentNPeersGlobal, consistentNGpusGlobal, consistentNGpusNode);
   TRACE(NCCL_RAS, "RAS: firstNPeersGlobal %d, firstNGpusGlobal %d", firstNPeersGlobal, firstNGpusGlobal);
 
-  rasOutAppend("Job summary\n"
-               "===========\n\n");
+  // Only output job summary for text format.
+  if (client->outputFormat == RAS_OUTPUT_TEXT) {
+    rasOutAppend("Job summary\n"
+                 "===========\n\n");
 
-  if (consistentNGpusNode && consistentNGpusGlobal && consistentNPeersGlobal) {
-    rasOutAppend("  Nodes  Processes         GPUs  Processes     GPUs\n"
-                 "(total)   per node  per process    (total)  (total)\n"
-                 "%7d"  "  %9d"    "  %11d"     "  %9d"    "  %7d\n",
-                 totalNodes, firstNPeersGlobal, firstNGpusGlobal, nRasPeers, totalGpus);
-  } else {
+    if (consistentNGpusNode && consistentNGpusGlobal && consistentNPeersGlobal) {
+      rasOutAppend("  Nodes  Processes         GPUs  Processes     GPUs\n"
+                   "(total)   per node  per process    (total)  (total)\n"
+                   "%7d"  "  %9d"    "  %11d"     "  %9d"    "  %7d\n",
+                   totalNodes, firstNPeersGlobal, firstNGpusGlobal, nRasPeers, totalGpus);
+    } else {
     // Gather the stats on the number of processes per node.  However, that number is not a property of a peer,
     // but of a group of peers, so calculating it is more involved.  We store the value in a temporary auxRasPeers
     // array.
@@ -706,8 +746,9 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) {
           } // for (peerIdx)
         } // if (rasCountIsOutlier(vc->count))
       } // for (i)
-    } // !consistentNPeersGlobal
-  } // !consistentNGpusNode || !consistentNGpusGlobal || !consistentNPeersGlobal
+      } // !consistentNPeersGlobal
+    } // !consistentNGpusNode || !consistentNGpusGlobal || !consistentNPeersGlobal
+  } // TEXT format only
 
 #if 0 // Commented out for now to focus the summary status report on the information most relevant to the users.
       // To be revisited with future extensions to RAS.
@@ -728,12 +769,15 @@ static ncclResult_t rasClientRunInit(struct rasClient* client) {
       ret = ncclInProgress; // We need to wait for async. responses.
   }
 #endif
-  rasOutAppend("\nCommunicators...");
-  msgLen = rasOutLength();
-  NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail);
-  rasOutExtract(msg);
-  rasClientEnqueueMsg(client, msg, msgLen);
-  msg = nullptr;
+  // Only send "Communicators..." message for text format.
+  if (client->outputFormat == RAS_OUTPUT_TEXT) {
+    rasOutAppend("\nCommunicators...");
+    msgLen = rasOutLength();
+    NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail);
+    rasOutExtract(msg);
+    rasClientEnqueueMsg(client, msg, msgLen);
+    msg = nullptr;
+  }
   {
     struct rasCollRequest collReq = {};
     bool allDone = false;
@@ -772,7 +816,7 @@ static ncclResult_t rasClientRunConns(struct rasClient* client) {
   client->coll = nullptr;
 
   rasOutReset();
-  rasOutAppend(" obtained a result in %.2fs\n", (clockNano()-coll->startTime)/1e9);
+  rasOutAppend(" obtained a result in %.3fs\n", (clockNano()-coll->startTime)/1e9);
   if (coll->nLegTimeouts > 0) {
     rasOutAppend(" Warning: encountered %d communication timeout%s while gathering data\n", coll->nLegTimeouts,
                  (coll->nLegTimeouts > 1 ? "s" : ""));
@@ -829,7 +873,7 @@ static ncclResult_t rasClientRunConns(struct rasClient* client) {
     rasOutAppend(" Collected data about %d unidirectional connection%s\n",
                  connsData->nConns, (connsData->nConns > 1 ? "s" : ""));
     rasOutAppend(" Travel times (valid only if system clocks are synchronized between nodes):\n"
-                 "  Minimum %fs, maximum %fs, average %fs\n",
+                 "  Minimum %.3fs, maximum %.3fs, average %.3fs\n",
                  connsData->travelTimeMin/1e9, connsData->travelTimeMax/1e9,
                  connsData->travelTimeSum/(1e9*connsData->travelTimeCount));
   } else {
@@ -847,7 +891,7 @@ static ncclResult_t rasClientRunConns(struct rasClient* client) {
         int sourcePeerIdx = rasPeerFind(&negativeMin->source);
         int destPeerIdx = rasPeerFind(&negativeMin->dest);
         if (sourcePeerIdx != -1 && destPeerIdx != -1)
-          rasOutAppend("  From node %s process %d to node %s process %d: observed travel time of %fs\n",
+          rasOutAppend("  From node %s process %d to node %s process %d: observed travel time of %.3fs\n",
                        ncclSocketToHost(&negativeMin->source, rasLine, sizeof(rasLine)), rasPeers[sourcePeerIdx].pid,
                        ncclSocketToHost(&negativeMin->dest, lineBuf, sizeof(lineBuf)), rasPeers[destPeerIdx].pid,
                        negativeMin->travelTimeMin/1e9);
@@ -925,7 +969,6 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) {
   client->coll = nullptr;
 
   rasOutReset();
-  rasOutAppend(" (%.2fs)\n=============\n\n", (clockNano()-coll->startTime)/1e9);
 
   // Calculate the number of missing peers early as we rely on it for other things.
   nPeersMissing = nRasPeers - nRasDeadPeers - coll->nPeers;
@@ -957,6 +1000,21 @@ static ncclResult_t rasClientRunComms(struct rasClient* client) {
   // Sort coll->peers to match the ordering of rasPeers -- we may need it later...
   qsort(coll->peers, coll->nPeers, sizeof(*coll->peers), &ncclSocketsCompare);
 
+  // Check output format and call appropriate dump function for JSON.
+  if (client->outputFormat == RAS_OUTPUT_JSON) {
+    rasDumpCommsToJSON(client, commsData, coll, peerIdxConv);
+    msgLen = rasOutLength();
+    NCCLCHECKGOTO(rasClientAllocMsg(&msg, msgLen), ret, fail);
+    rasOutExtract(msg);
+    rasClientEnqueueMsg(client, msg, msgLen);
+    msg = nullptr;
+    client->status = RAS_CLIENT_FINISHED;
+    goto exit;
+  }
+
+  // Default TEXT format continues below.
+  rasOutAppend(" (%.3fs)\n=============\n\n", (clockNano()-coll->startTime)/1e9);
+
   // Fill in the remaining fields of auxComm's.
   for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) {
     struct rasAuxComm* auxComm = auxComms+commIdx;
@@ -1820,6 +1878,111 @@ static int rasAuxCommRanksValueCompare(const void* p1, const void* p2) {
 }
 
 
+//////////////////////////////////////////////////////////
+// JSON utility functions for structured output format. //
+//////////////////////////////////////////////////////////
+
+// Opens the top-level JSON object, writes the version/timestamp/count metadata and leaves the "communicators" array open.
+static void jsonWriteHeader(const char* ncclVersion, int cudaRuntime, int cudaDriver,
+                            const char* timestamp, int commsCount) {
+  rasOutAppend("{\n");
+  rasOutAppend("  \"nccl_version\": \"%s\",\n", ncclVersion); // strings are emitted verbatim, without JSON escaping
+  rasOutAppend("  \"cuda_runtime_version\": %d,\n", cudaRuntime);
+  rasOutAppend("  \"cuda_driver_version\": %d,\n", cudaDriver);
+  rasOutAppend("  \"timestamp\": \"%s\",\n", timestamp);
+  rasOutAppend("  \"communicators_count\": %d,\n", commsCount);
+  rasOutAppend("  \"communicators\": [\n");
+}
+
+// Starts a communicator object (preceded by a "," separator unless firstComm) and leaves its "ranks" array open.
+static void jsonStartCommunicator(unsigned long commHash, unsigned long hostHash, unsigned long pidHash,
+                                  int commSize, int ranksCount, int missingCount, bool firstComm) {
+  if (!firstComm) rasOutAppend(",\n");
+  rasOutAppend("    {\n");
+  rasOutAppend("      \"hash\": \"0x%lx\",\n", commHash); // hashes are written as hexadecimal strings, not JSON numbers
+  rasOutAppend("      \"secondary_hash\": \"0x%lx:0x%lx\",\n", hostHash, pidHash);
+  rasOutAppend("      \"size\": %d,\n", commSize);
+  rasOutAppend("      \"ranks_count\": %d,\n", ranksCount);
+  rasOutAppend("      \"missing_ranks_count\": %d,\n", missingCount);
+  rasOutAppend("      \"ranks\": [\n");
+}
+
+// Writes one rank object: identification fields, a "status" object and per-function "collective_counts"; no trailing newline.
+static void jsonWriteRankData(int rank, const char* host, int pid, int cudaDev, int nvmlDev,
+                              int initState, int asyncError, bool finalizeCalled, bool destroyFlag,
+                              bool abortFlag, const unsigned long* collCounts, bool firstRank) {
+  if (!firstRank) rasOutAppend(",\n"); // separator is written before every entry except the first
+  rasOutAppend("        {\n");
+  rasOutAppend("          \"rank\": %d,\n", rank);
+  rasOutAppend("          \"host\": \"%s\",\n", host);
+  rasOutAppend("          \"pid\": %d,\n", pid);
+  rasOutAppend("          \"cuda_dev\": %d,\n", cudaDev);
+  rasOutAppend("          \"nvml_dev\": %d,\n", nvmlDev);
+
+  // Status object.
+  rasOutAppend("          \"status\": {\n");
+  rasOutAppend("            \"init_state\": %d,\n", initState);
+  rasOutAppend("            \"async_error\": %d,\n", asyncError);
+  rasOutAppend("            \"finalize_called\": %s,\n", finalizeCalled ? "true" : "false");
+  rasOutAppend("            \"destroy_flag\": %s,\n", destroyFlag ? "true" : "false");
+  rasOutAppend("            \"abort_flag\": %s\n", abortFlag ? "true" : "false");
+  rasOutAppend("          },\n");
+
+  // Collective counts object: one member per function, keyed by ncclFuncToString; collCounts is read at indices 0..NCCL_NUM_FUNCTIONS-1.
+  rasOutAppend("          \"collective_counts\": {\n");
+  for (int op = 0; op < NCCL_NUM_FUNCTIONS; op++) {
+    rasOutAppend("            \"%s\": %lu", ncclFuncToString((ncclFunc_t)op), collCounts[op]);
+    if (op < NCCL_NUM_FUNCTIONS - 1) rasOutAppend(","); // no comma after the last member
+    rasOutAppend("\n");
+  }
+  rasOutAppend("          }\n");
+  rasOutAppend("        }");
+}
+
+// Closes the open "ranks" array and opens the "missing_ranks" array of the current communicator.
+static void jsonStartMissingRanks() {
+  rasOutAppend("\n"
+               "      ],\n");
+
+  rasOutAppend("      \"missing_ranks\": [\n");
+}
+
+// Writes one missing-rank object: identification fields plus a "status" object (unresponsive / considered_dead); no trailing newline.
+static void jsonWriteMissingRank(int rank, const char* host, int pid, int cudaDev, int nvmlDev,
+                                 bool unresponsive, bool dead, bool firstMissing) {
+  if (!firstMissing) rasOutAppend(",\n"); // separator is written before every entry except the first
+  rasOutAppend("        {\n");
+  rasOutAppend("          \"rank\": %d,\n", rank);
+  rasOutAppend("          \"host\": \"%s\",\n", host);
+  rasOutAppend("          \"pid\": %d,\n", pid);
+  rasOutAppend("          \"cuda_dev\": %d,\n", cudaDev);
+  rasOutAppend("          \"nvml_dev\": %d,\n", nvmlDev); // comma required: the "status" member follows
+
+  // Status object.
+  rasOutAppend("          \"status\": {\n");
+  rasOutAppend("            \"unresponsive\": %s,\n", unresponsive ? "true" : "false");
+  rasOutAppend("            \"considered_dead\": %s\n", dead ? "true" : "false");
+  rasOutAppend("          }\n");
+  rasOutAppend("        }");
+}
+
+// Closes the currently open array and then the communicator object; writes no trailing newline so a separator can follow.
+static void jsonEndCommunicator() {
+  rasOutAppend("\n      ]\n");
+  rasOutAppend("    }");
+}
+
+// Closes the "communicators" array, writes the "ras" object (collection time, timeout count) and closes the top-level object.
+static void jsonWriteFooter(double collectionTime, int timeoutsCount) {
+  rasOutAppend("\n  ],\n");
+  rasOutAppend("  \"ras\": {\n");
+  rasOutAppend("    \"collection_time_sec\": %.3f,\n", collectionTime);
+  rasOutAppend("    \"timeouts_count\": %d\n", timeoutsCount);
+  rasOutAppend("  }\n");
+  rasOutAppend("}\n");
+}
+
+
 ////////////////////////////////////////////////////////////
 // String formatting functions for various types of data. //
 ////////////////////////////////////////////////////////////
@@ -1889,6 +2052,81 @@ static const char* ncclSocketToHost(const union ncclSocketAddress* addr, char* b
   }
 }
 
+// Renders the collected communicator data as one JSON document appended to the output buffer (client is not used here).
+static void rasDumpCommsToJSON(struct rasClient* client, struct rasCollComms* commsData,
+                               struct rasCollective* coll, const int* peerIdxConv) {
+  char hostBuf[256], timeBuf[64];
+  // Timestamp for the header: current local time formatted as "YYYY-MM-DD HH:MM:SS" (%F %T).
+  time_t timestampSec = time(NULL);
+  struct tm tmBuffer;
+  struct tm* tmInfo = localtime_r(&timestampSec, &tmBuffer);
+  strftime(timeBuf, sizeof(timeBuf), "%F %T", tmInfo);
+
+  // Write JSON header with metadata.
+  jsonWriteHeader(STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX,
+                  cudaRuntimeVersion, cudaDriverVersion, timeBuf, commsData->nComms);
+  // comm walks the variable-length communicator records; it is advanced manually at the end of each iteration.
+  struct rasCollComms::comm* comm = commsData->comms;
+
+  // Iterate through communicators.
+  for (int commIdx = 0; commIdx < commsData->nComms; commIdx++) {
+    jsonStartCommunicator(comm->commId.commHash, comm->commId.hostHash, comm->commId.pidHash,
+                          comm->commNRanks, comm->nRanks, comm->nMissingRanks, (commIdx == 0));
+
+    // Add each rank.
+    for (int rankIdx = 0; rankIdx < comm->nRanks; rankIdx++) {
+      struct rasCollComms::comm::rank* rank = comm->ranks + rankIdx;
+
+      // Get host and pid information; they stay "unknown"/-1 unless peerIdxConv maps the rank to a rasPeers entry.
+      const char* host = "unknown";
+      int pid = -1;
+      if (rank->peerIdx >= 0 && peerIdxConv && peerIdxConv[rank->peerIdx] >= 0) {
+        int rasPeerIdx = peerIdxConv[rank->peerIdx];
+        host = ncclSocketToHost(&rasPeers[rasPeerIdx].addr, hostBuf, sizeof(hostBuf));
+        pid = rasPeers[rasPeerIdx].pid;
+      }
+      // hostBuf is reused for every rank, so host must be consumed before the next iteration.
+      jsonWriteRankData(rank->commRank, host, pid, rank->cudaDev, rank->nvmlDev,
+                        rank->status.initState, rank->status.asyncError,
+                        rank->status.finalizeCalled, rank->status.destroyFlag, rank->status.abortFlag,
+                        rank->collOpCounts, (rankIdx == 0));
+    }
+
+    // Start missing ranks section.
+    jsonStartMissingRanks();
+
+    // Add missing ranks; their records are read from directly after this communicator's nRanks rank records.
+    struct rasCollCommsMissingRank* missingRanks = (struct rasCollCommsMissingRank*)(comm->ranks + comm->nRanks);
+    for (int missingIdx = 0; missingIdx < comm->nMissingRanks; missingIdx++) {
+      struct rasCollCommsMissingRank* missingRank = missingRanks + missingIdx;
+
+      // Get host and pid for missing rank; it is flagged unresponsive when its address is not found in coll->peers.
+      int rasPeerIdx = rasPeerFind(&missingRank->addr);
+      const char* host = "unknown";
+      int pid = -1;
+      bool unresponsive = (bsearch(&missingRank->addr, coll->peers, coll->nPeers, sizeof(*coll->peers),
+                                   ncclSocketsCompare) == nullptr);
+      bool dead = rasPeerIsDead(&missingRank->addr);
+      if (rasPeerIdx >= 0) {
+        host = ncclSocketToHost(&rasPeers[rasPeerIdx].addr, hostBuf, sizeof(hostBuf));
+        pid = rasPeers[rasPeerIdx].pid;
+      }
+
+      jsonWriteMissingRank(missingRank->commRank, host, pid, missingRank->cudaDev, missingRank->nvmlDev,
+                           unresponsive, dead, (missingIdx == 0));
+    }
+
+    jsonEndCommunicator();
+
+    // Move to the next communicator: skip this record's header, its rank records and its missing-rank records.
+    comm = (struct rasCollComms::comm*)(((char*)(comm+1)) + comm->nRanks * sizeof(*comm->ranks) +
+                                        comm->nMissingRanks * sizeof(struct rasCollCommsMissingRank));
+  }
+
+  // Write JSON footer with RAS metadata (time elapsed since coll->startTime, divided by 1e9 to report seconds).
+  jsonWriteFooter((clockNano()-coll->startTime)/1e9, coll->nLegTimeouts);
+}
+
 // Determines if the given count constitutes an outlier.
 static bool rasCountIsOutlier(int count, bool verbose, int totalCount) {
   if (count == 1)
diff --git a/projects/rccl/src/ras/ras_internal.h b/projects/rccl/src/ras/ras_internal.h
index b35305207af..d7fef1c17ed 100644
--- a/projects/rccl/src/ras/ras_internal.h
+++ b/projects/rccl/src/ras/ras_internal.h
@@ -437,6 +437,13 @@ typedef enum {
   RAS_CLIENT_FINISHED = 99
 } rasClientStatus;
 
+// Selects how results are rendered for a client (stored per client in rasClient::outputFormat).
+// This is shared between client and server.
+typedef enum {
+  RAS_OUTPUT_TEXT = 0,    // Default human-readable format.
+  RAS_OUTPUT_JSON = 1     // JSON format (always verbose).
+} rasOutputFormat;
+
 // Describes a RAS client.
 struct rasClient {
   struct rasClient* next;
@@ -457,6 +464,8 @@ struct rasClient {
   int verbose;
   int64_t timeout;
 
+  rasOutputFormat outputFormat;  // TEXT or JSON output format.
+
   // State stored during asynchronous operations such as collectives.
   struct rasCollective* coll;
 };
diff --git a/projects/rccl/src/register/register.cc b/projects/rccl/src/register/register.cc
index 5a7a6ac9089..8ed34db209f 100644
--- a/projects/rccl/src/register/register.cc
+++ b/projects/rccl/src/register/register.cc
@@ -121,6 +121,7 @@ ncclResult_t ncclRegCleanup(struct ncclComm* comm) {
 }
 
 NCCL_API(ncclResult_t, ncclCommRegister, const ncclComm_t comm, void* buff, size_t size, void** handle);
+
 ncclResult_t ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t size, void** handle) {
   ncclResult_t ret = ncclSuccess;
 
@@ -137,7 +138,13 @@ ncclResult_t ncclCommRegister_impl(const ncclComm_t comm, void* buff, size_t siz
 }
 
 ncclResult_t ncclCommGraphRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) {
-  NCCLCHECK(ncclRegister(comm, buff, size, true, handle));
+  if (ncclP2pUsesMemcpy()) {  // In this mode the buffer is not registered at all.
+    *handle = NULL;  // The caller gets a null handle but still a success return code.
+    INFO(NCCL_REG, "Skipping graph registration for buffer %p size %zi (P2pUsesMemcpy=%d)",
+         buff, size, ncclP2pUsesMemcpy());
+  } else {
+    NCCLCHECK(ncclRegister(comm, buff, size, true, handle));
+  }
   return ncclSuccess;
 }
 
diff --git a/projects/rccl/src/scheduler/symmetric_sched.cc b/projects/rccl/src/scheduler/symmetric_sched.cc
index 0642958ba93..f38fafd4615 100644
--- a/projects/rccl/src/scheduler/symmetric_sched.cc
+++ b/projects/rccl/src/scheduler/symmetric_sched.cc
@@ -16,6 +16,7 @@ ncclResult_t ncclMakeSymmetricTaskList(struct ncclComm* comm, struct ncclTaskCol
   int fnOpTySymIndices[ncclNumFuncs * ncclNumDevRedOps * ncclNumTypes];
   struct ncclKernelPlanner* planner = &comm->planner;
   struct ncclTaskColl* remainTasksTail = nullptr;
+  bool foundSymm = false;
 
   memset(tasksSymByFnOpTy, 0, sizeof(tasksSymByFnOpTy));
   *remainTasksHead = nullptr;
@@ -31,6 +32,7 @@ ncclResult_t ncclMakeSymmetricTaskList(struct ncclComm* comm, struct ncclTaskCol
       task->next = tasksSymByFnOpTy[index];
       tasksSymByFnOpTy[index] = task;
       planner->nTasksColl--;
+      foundSymm = true;
     } else {
       if (*remainTasksHead) {
         remainTasksTail->next = task;
@@ -43,6 +45,8 @@ ncclResult_t ncclMakeSymmetricTaskList(struct ncclComm* comm, struct ncclTaskCol
   }
   if (remainTasksTail) remainTasksTail->next = nullptr;
 
+  if (!foundSymm) goto exit;
+
   // make sure kernel args space can hold at least a single work
   assert(comm->workArgsBytes >= ncclSymkDevWorkArgs::calcArgsSize(MAXCHANNELS, 1));
 
diff --git a/projects/rccl/src/sym_kernels.cc b/projects/rccl/src/sym_kernels.cc
index 5b2502916a5..93766334acf 100644
--- a/projects/rccl/src/sym_kernels.cc
+++ b/projects/rccl/src/sym_kernels.cc
@@ -56,6 +56,18 @@ constexpr uint32_t kernelMask_RS = 1<<ncclSymkKernelId_ReduceScatter_LD |
                                    1<<ncclSymkKernelId_ReduceScatter_LDMC |
                                    1<<ncclSymkKernelId_ReduceScatter_LL;
 
+constexpr uint32_t kernelMask_LSA = 1<<ncclSymkKernelId_AllReduce_AGxLL_R |
+                                    1<<ncclSymkKernelId_AllReduce_AGxLLMC_R |
+                                    1<<ncclSymkKernelId_AllReduce_RSxLD_AGxST |
+                                    1<<ncclSymkKernelId_AllReduce_RSxLDMC_AGxSTMC |
+                                    1<<ncclSymkKernelId_AllGather_LL |
+                                    1<<ncclSymkKernelId_AllGather_LLMC |
+                                    1<<ncclSymkKernelId_AllGather_ST |
+                                    1<<ncclSymkKernelId_AllGather_STMC |
+                                    1<<ncclSymkKernelId_ReduceScatter_LL |
+                                    1<<ncclSymkKernelId_ReduceScatter_LD |
+                                    1<<ncclSymkKernelId_ReduceScatter_LDMC;  // These kernels are removed from the candidate mask in ncclSymkMask() when comm->nNodes > 1.
+
 static uint32_t kernelMask_coll(ncclFunc_t coll) {
   switch (coll) {
   case ncclFuncAllGather: return kernelMask_AG;
@@ -220,7 +232,9 @@ ncclResult_t ncclSymkInitOnce(struct ncclComm* comm) {
     lla2aReq.next = reqs.resourceRequirementsList;
     reqs.resourceRequirementsList = &lla2aReq;
 
-    NCCLCHECK(ncclDevrCommCreateInternal(comm, &reqs, &symk->kcomm.devComm));
+    if (comm->nNodes == 1) {
+      NCCLCHECK(ncclDevrCommCreateInternal(comm, &reqs, &symk->kcomm.devComm));
+    }
   }
   return ncclSuccess;
 }
@@ -301,6 +315,8 @@ static uint32_t ncclSymkMask(struct ncclComm* comm, ncclFunc_t coll, int/*ncclDe
   // to be at least 32 bytes per chunk)
   if (nBusBytes >= 32*(size_t(2)<<30)) kmask = 0;
 
+  if (comm->nNodes > 1) kmask &= ~kernelMask_LSA;
+
   return kmask;
 }
 
diff --git a/projects/rccl/src/transport.cc b/projects/rccl/src/transport.cc
index 68019ebb868..20c043e0a61 100644
--- a/projects/rccl/src/transport.cc
+++ b/projects/rccl/src/transport.cc
@@ -88,32 +88,46 @@ NCCL_PARAM(ConnectRoundMaxPeers, "CONNECT_ROUND_MAX_PEERS", 128);
 NCCL_PARAM(ReportConnectProgress, "REPORT_CONNECT_PROGRESS", 0);
 #include <sys/time.h>
 
-ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode) {
-  bool supportFlag = true;
+// Tests every pair of local ranks of the communicator for CUDA P2P connectivity.
+// *isAllDirectP2p is set to true if all local ranks have CUDA P2P connectivity with each other
+// and are no further than NCCL_P2P_LEVEL apart (every pair can connect without an intermediate rank).
+// *directMode is set to true if *any* two local ranks are managed by the same process (same hostHash and pidHash).
+// *isAllCudaP2p is set to true if all local ranks have CUDA P2P connectivity with each other, irrespective of the distance.
+ncclResult_t ncclTransportCheckP2pType(struct ncclComm* comm, bool* isAllDirectP2p, bool* directMode,
+                                       bool* isAllCudaP2p) {
+  bool ncclP2pFlag = true;  // Stays true when no pair is examined, e.g. with a single local rank.
   bool directFlag = false;
-  if (comm->localRanks == 1) {
-    supportFlag = false;
-  } else {
-    for (int i = 0; i < comm->localRanks; ++i) {
-      for (int j = i + 1; j < comm->localRanks; ++j) {
-        int ipeer = comm->localRankToRank[i];
-        int jpeer = comm->localRankToRank[j];
-        struct ncclPeerInfo* ipeerInfo = &comm->peerInfo[ipeer];
-        struct ncclPeerInfo* jpeerInfo = &comm->peerInfo[jpeer];
-        int canConnect = 0;
-        int intermediateRank = -1;
-        NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, ipeerInfo->rank, jpeerInfo->rank, &canConnect, NULL, &intermediateRank));
-        if (!canConnect || intermediateRank != -1) {
-          supportFlag = false;
-        }
-        if (ipeerInfo->hostHash == jpeerInfo->hostHash && ipeerInfo->pidHash == jpeerInfo->pidHash) directFlag = true;
-        if (!supportFlag && directFlag) break;
+  bool cudaP2pFlag = true;  // Likewise true by default; cleared by the first pair without CUDA P2P.
+  for (int i = 0; i < comm->localRanks; ++i) {
+    for (int j = i + 1; j < comm->localRanks; ++j) {
+      int ipeer = comm->localRankToRank[i];
+      int jpeer = comm->localRankToRank[j];
+      struct ncclPeerInfo* ipeerInfo = &comm->peerInfo[ipeer];
+      struct ncclPeerInfo* jpeerInfo = &comm->peerInfo[jpeer];
+      int canConnect = 0;
+      int intermediateRank = -1;
+      int cudaP2p = 0;
+      NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, ipeerInfo->rank, jpeerInfo->rank,
+                                 &canConnect, NULL, &intermediateRank, &cudaP2p));
+      if (!canConnect || intermediateRank != -1) {
+        ncclP2pFlag = false;
+      }
+      if (!cudaP2p) {
+        cudaP2pFlag = false;
+      }
+      if (ipeerInfo->hostHash == jpeerInfo->hostHash && ipeerInfo->pidHash == jpeerInfo->pidHash) {
+        directFlag = true;
+      }
+      if (!ncclP2pFlag && directFlag && !cudaP2pFlag) {  // All three results can no longer change.
+        break;  // Note: this leaves only the inner (j) loop; the outer loop keeps iterating.
       }
     }
   }
-  *isAllDirectP2p = supportFlag;
+  *isAllDirectP2p = ncclP2pFlag;
   *directMode = directFlag;
-  if (comm->rank == 0) INFO(NCCL_INIT, "Check P2P Type isAllDirectP2p %d directMode %d", supportFlag, directFlag);
+  *isAllCudaP2p = cudaP2pFlag;
+  INFO(NCCL_INIT, "Check P2P Type isAllDirectP2p %d directMode %d isAllCudaP2p %d",
+       *isAllDirectP2p, *directMode, *isAllCudaP2p);  // Logged by every rank (no rank filter).
   return ncclSuccess;
 }
 
diff --git a/projects/rccl/src/transport/CMakeLists.txt b/projects/rccl/src/transport/CMakeLists.txt
index 61da2032cb7..621f2b4fdd7 100644
--- a/projects/rccl/src/transport/CMakeLists.txt
+++ b/projects/rccl/src/transport/CMakeLists.txt
@@ -12,5 +12,16 @@ set(TRANSPORT_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/generic.cc
 )
 
+add_subdirectory(gdaki)
+
 # Add transport sources to parent scope
 set(TRANSPORT_SOURCES ${TRANSPORT_SOURCES} PARENT_SCOPE)
+
+# Re-export the DOCA device header list (set in this scope by gdaki/CMakeLists.txt) to the parent scope
+set(DEVICE_DOCA_HEADERS ${DEVICE_DOCA_HEADERS} PARENT_SCOPE)
+
+# Re-export the DOCA source list (set in this scope by gdaki/CMakeLists.txt) to the parent scope
+set(DOCA_SOURCES ${DOCA_SOURCES} PARENT_SCOPE)
+
+# Re-export DOCA_HOME (user-provided or defaulted in gdaki/CMakeLists.txt) to the parent scope
+set(DOCA_HOME ${DOCA_HOME} PARENT_SCOPE)
diff --git a/projects/rccl/src/transport/gdaki/CMakeLists.txt b/projects/rccl/src/transport/gdaki/CMakeLists.txt
new file mode 100644
index 00000000000..5f422dc6dc2
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/CMakeLists.txt
@@ -0,0 +1,65 @@
+# DOCA
+# Allow users to specify DOCA_HOME via cmake -DDOCA_HOME=<path>
+# Default to transport/gdaki/doca-gpunetio if not specified
+if(NOT DEFINED DOCA_HOME)
+    set(DOCA_HOME ${CMAKE_CURRENT_SOURCE_DIR}/doca-gpunetio)
+endif()
+
+# Copy DOCA GPUNetIO headers to build directory (done at configure time via configure_file COPYONLY)
+set(DOCA_INCLUDE_SOURCE_DIR ${DOCA_HOME}/include)
+set(DOCA_INCLUDE_DEST_DIR ${CMAKE_BINARY_DIR}/include/nccl_device/gin/gdaki/doca_gpunetio)
+
+# Collect the top-level header plus common/*.h and device/*.cuh (not recursive); CONFIGURE_DEPENDS re-runs CMake when the matches change
+file(GLOB DOCA_HEADER_TOP CONFIGURE_DEPENDS "${DOCA_INCLUDE_SOURCE_DIR}/doca_gpunetio_device.h")
+file(GLOB DOCA_HEADER_COMMON CONFIGURE_DEPENDS "${DOCA_INCLUDE_SOURCE_DIR}/common/*.h")
+file(GLOB DOCA_HEADER_DEVICE CONFIGURE_DEPENDS "${DOCA_INCLUDE_SOURCE_DIR}/device/*.cuh")
+
+# Copy top-level header (the list is empty, and nothing is copied, if the file does not exist)
+foreach(HEADER_FILE ${DOCA_HEADER_TOP})
+    get_filename_component(HEADER_NAME ${HEADER_FILE} NAME)
+    configure_file(${HEADER_FILE} ${DOCA_INCLUDE_DEST_DIR}/${HEADER_NAME} COPYONLY)
+    list(APPEND DEVICE_DOCA_HEADERS ${DOCA_INCLUDE_DEST_DIR}/${HEADER_NAME})
+endforeach()
+
+# Copy common/ headers
+foreach(HEADER_FILE ${DOCA_HEADER_COMMON})
+    get_filename_component(HEADER_NAME ${HEADER_FILE} NAME)
+    configure_file(${HEADER_FILE} ${DOCA_INCLUDE_DEST_DIR}/common/${HEADER_NAME} COPYONLY)
+    list(APPEND DEVICE_DOCA_HEADERS ${DOCA_INCLUDE_DEST_DIR}/common/${HEADER_NAME})
+endforeach()
+
+# Copy device/ headers
+foreach(HEADER_FILE ${DOCA_HEADER_DEVICE})
+    get_filename_component(HEADER_NAME ${HEADER_FILE} NAME)
+    configure_file(${HEADER_FILE} ${DOCA_INCLUDE_DEST_DIR}/device/${HEADER_NAME} COPYONLY)
+    list(APPEND DEVICE_DOCA_HEADERS ${DOCA_INCLUDE_DEST_DIR}/device/${HEADER_NAME})
+endforeach()
+
+# Add DOCA device headers (destination paths of the copies made above) to parent scope
+set(DEVICE_DOCA_HEADERS ${DEVICE_DOCA_HEADERS} PARENT_SCOPE)
+
+# DOCA sources
+set(DOCA_SOURCES
+    ${DOCA_HOME}/src/doca_verbs_qp.cpp
+    ${DOCA_HOME}/src/doca_verbs_cq.cpp
+    ${DOCA_HOME}/src/doca_verbs_device_attr.cpp
+    ${DOCA_HOME}/src/doca_verbs_umem.cpp
+    ${DOCA_HOME}/src/doca_verbs_srq.cpp
+    ${DOCA_HOME}/src/doca_verbs_uar.cpp
+    ${DOCA_HOME}/src/doca_gpunetio.cpp
+    ${DOCA_HOME}/src/doca_gpunetio_log.cpp
+    ${DOCA_HOME}/src/doca_gpunetio_high_level.cpp
+    ${DOCA_HOME}/src/doca_verbs_cuda_wrapper.cpp
+    ${DOCA_HOME}/src/doca_verbs_mlx5dv_wrapper.cpp
+    ${DOCA_HOME}/src/doca_verbs_ibv_wrapper.cpp
+    ${DOCA_HOME}/src/doca_gpunetio_gdrcopy.cpp
+)
+
+# Add DOCA sources to parent scope
+set(DOCA_SOURCES ${DOCA_SOURCES} PARENT_SCOPE)
+
+# Add DOCA_HOME to parent scope
+set(DOCA_HOME ${DOCA_HOME} PARENT_SCOPE)
+
+# Add gin_host_gdaki.cc to TRANSPORT_SOURCES in parent scope (TRANSPORT_SOURCES here is the value inherited from the parent)
+set(TRANSPORT_SOURCES ${TRANSPORT_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/gin_host_gdaki.cc PARENT_SCOPE)
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/common/doca_gpunetio_verbs_def.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/common/doca_gpunetio_verbs_def.h
new file mode 100644
index 00000000000..1de849c94d8
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/common/doca_gpunetio_verbs_def.h
@@ -0,0 +1,398 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_verbs_def.h
+ * @brief GDAKI common definitions
+ *
+ * @{
+ */
+#ifndef DOCA_GPUNETIO_VERBS_DEF_H
+#define DOCA_GPUNETIO_VERBS_DEF_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <limits.h>
+#include <linux/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Macro to temporarily cast a variable to volatile.
+ */
+#define DOCA_GPUNETIO_VOLATILE(x) (*(volatile typeof(x) *)&(x))
+
+/**
+ * Default warp size value of 32 threads
+ */
+#define DOCA_GPUNETIO_VERBS_WARP_SIZE 32
+
+/**
+ * Default warp full mask value
+ */
+#define DOCA_GPUNETIO_VERBS_WARP_FULL_MASK 0xffffffff
+
+/**
+ * Default page size alignment on GPU
+ */
+#define DOCA_GPUNETIO_VERBS_PAGE_SIZE 65536
+
+/**
+ * CQE Consumer Index Mask - 24bits counter
+ */
+#define DOCA_GPUNETIO_VERBS_CQE_CI_MASK 0xFFFFFF
+
+/**
+ * WQE Producer Index Mask - 16bits counter
+ */
+#define DOCA_GPUNETIO_VERBS_WQE_PI_MASK 0xFFFF
+
+#define DOCA_GPUNETIO_IB_MLX5_WQE_SQ_SHIFT 6
+
+/**
+ * Set to 1 if mkeys passed to the wqe functions
+ * are already swapped by application.
+ * Otherwise set it to 0.
+ */
+#define DOCA_GPUNETIO_VERBS_MKEY_SWAPPED 1
+
+/**
+ * Enable debug prints and DOCA_GPUNETIO_VERBS_ASSERT checks in this header file.
+ * Bad for performance, should be used only for debugging
+ */
+#ifndef DOCA_GPUNETIO_VERBS_ENABLE_DEBUG
+#define DOCA_GPUNETIO_VERBS_ENABLE_DEBUG 0
+#endif
+
+#if DOCA_GPUNETIO_VERBS_ENABLE_DEBUG == 1
+#include <assert.h>
+#define DOCA_GPUNETIO_VERBS_ASSERT(x) assert(x)
+#else
+#define DOCA_GPUNETIO_VERBS_ASSERT(x) \
+    do {                              \
+    } while (0)
+#endif
+
+/**
+ * WQE data segment inline data with byte count
+ */
+#define DOCA_GPUNETIO_VERBS_MAX_INLINE_SIZE 28
+
+/**
+ * CQE opcode shift, in bits
+ */
+#define DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT 4
+
+#define DOCA_GPUNETIO_VERBS_CQE_SIZE 64
+
+#define DOCA_GPUNETIO_VERBS_WQE_IDX_SHIFT 8
+/**
+ * Max RDMA transfer size
+ */
+#define DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE_SHIFT 30
+#define DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE \
+    (1ULL << DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE_SHIFT)  // 1GiB
+
+#ifndef ACCESS_ONCE
+#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+#endif
+
+#ifndef READ_ONCE
+#define READ_ONCE(x) ACCESS_ONCE(x)
+#endif
+
+#ifndef WRITE_ONCE
+#define WRITE_ONCE(x, v) (ACCESS_ONCE(x) = (v))
+#endif
+
+enum {
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_NOP = 0x00,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_SEND_INVAL = 0x01,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_WRITE = 0x08,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_WRITE_IMM = 0x09,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_SEND = 0x0a,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_SEND_IMM = 0x0b,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_TSO = 0x0e,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_READ = 0x10,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_CS = 0x11,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA = 0x12,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_MASKED_CS = 0x14,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_MASKED_FA = 0x15,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_FMR = 0x19,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_LOCAL_INVAL = 0x1b,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_WAIT = 0x0f,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_CONFIG_CMD = 0x1f,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_SET_PSV = 0x20,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_DUMP = 0x23,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_UMR = 0x25,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_TAG_MATCHING = 0x28,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_FLOW_TBL_ACCESS = 0x2c,
+    DOCA_GPUNETIO_IB_MLX5_OPCODE_MMO = 0x2F,
+};
+
+enum {
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CE_CQE_ON_CQE_ERROR = 0x0,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CE_CQE_ON_FIRST_CQE_ERROR = 0x1,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CE_CQE_ALWAYS = 0x2,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CE_CQE_AND_EQE = 0x3,
+};
+
+enum {
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_FM_NO_FENCE = 0x0,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_FM_INITIATOR_SMALL_FENCE = 0x1,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_FM_FENCE = 0x2,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_FM_STRONG_ORDERING = 0x3,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_FM_FENCE_AND_INITIATOR_SMALL_FENCE = 0x4,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_FM_CUSTOM = 0x100, /* None of the previous, use custom value */
+};
+
+/**
+ * GPUNetIO Verbs flags for WQE control segment
+ */
+enum doca_gpu_dev_verbs_wqe_ctrl_flags {
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE = DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CE_CQE_ALWAYS << 2,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_ERROR_UPDATE =
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CE_CQE_ON_CQE_ERROR << 2,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_FIRST_CQE_ERROR =
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CE_CQE_ON_FIRST_CQE_ERROR << 2,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_SOLICITED = 1 << 1,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_FENCE =
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_FM_FENCE_AND_INITIATOR_SMALL_FENCE << 5,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE =
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_FM_INITIATOR_SMALL_FENCE << 5,
+    DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_STRONG_ORDERING =
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_FM_STRONG_ORDERING << 5
+};
+
+enum {
+    DOCA_GPUNETIO_IB_MLX5_RCV_DBR = 0,
+    DOCA_GPUNETIO_IB_MLX5_SND_DBR = 1,
+};
+
+/**
+ * @enum doca_gpu_dev_verbs_mem_type
+ * @brief Memory type of the buffer.
+ */
+enum doca_gpu_dev_verbs_mem_type {
+    DOCA_GPUNETIO_VERBS_MEM_TYPE_AUTO =
+        0,  ///< Automatically select the most performant memory type
+    DOCA_GPUNETIO_VERBS_MEM_TYPE_HOST = 1,      ///< Allocate resource on host memory
+    DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU = 2,       ///< Allocate resource on GPU memory
+    DOCA_GPUNETIO_VERBS_MEM_TYPE_MAX = INT_MAX  ///< Sentinel value
+};
+
+/**
+ * @enum doca_gpu_dev_verbs_qp_type
+ * @brief QP type.
+ */
+enum doca_gpu_dev_verbs_qp_type {
+    DOCA_GPUNETIO_VERBS_QP_SQ = 0,  ///< Use QP SQ
+};
+
+enum doca_gpu_dev_verbs_exec_scope {
+    DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD = 0,
+    DOCA_GPUNETIO_VERBS_EXEC_SCOPE_WARP
+};
+
+/**
+ * @enum doca_gpu_dev_verbs_sync_scope
+ * @brief Synchronization scope.
+ */
+enum doca_gpu_dev_verbs_sync_scope {
+    DOCA_GPUNETIO_VERBS_SYNC_SCOPE_SYS = 0,       ///< System synchronization scope
+    DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU = 1,       ///< GPU synchronization scope
+    DOCA_GPUNETIO_VERBS_SYNC_SCOPE_CTA = 2,       ///< CTA synchronization scope
+    DOCA_GPUNETIO_VERBS_SYNC_SCOPE_MAX = INT_MAX  ///< Sentinel value
+};
+
+/**
+ * @enum doca_gpu_dev_verbs_resource_sharing_mode
+ * @brief Resource sharing mode.
+ */
+enum doca_gpu_dev_verbs_resource_sharing_mode {
+    DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_EXCLUSIVE =
+        0,  ///< The resource is exclusive to one CUDA thread
+    DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_CTA = 1,       ///< The resource is shared among CUDA
+                                                             ///< threads in the same CTA
+    DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU = 2,       ///< The resource is shared among CUDA
+                                                             ///< threads in the same GPU
+    DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_MAX = INT_MAX  ///< Sentinel value
+};
+
+/**
+ * @enum doca_gpu_dev_verbs_nic_handler
+ * @brief The processor that handles the NIC.
+ */
+enum doca_gpu_dev_verbs_nic_handler {
+    DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO = 0,  ///< Automatically select the most performant handler
+    DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY = 1,  ///< CPU Proxy
+    DOCA_GPUNETIO_VERBS_NIC_HANDLER_GPU_SM_DB = 2,  ///< GPU SM, regular DB
+    DOCA_GPUNETIO_VERBS_NIC_HANDLER_GPU_SM_BF = 3,  ///< GPU SM, BlueFlame DB
+    DOCA_GPUNETIO_VERBS_NIC_HANDLER_TYPE_MAX,       ///< Sentinel value
+};
+
+/**
+ * @enum doca_gpu_dev_verbs_gpu_code_opt
+ * @brief GPU code optimization for GDA-KI. They can be combined using bitwise or.
+ */
+enum doca_gpu_dev_verbs_gpu_code_opt {
+    DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_DEFAULT = 0,  ///< Use default code optimization
+    DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_ASYNC_STORE_RELEASE = (1 << 0),  ///< Use store.async.release
+                                                                      ///< code optimization
+    DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_MAX = INT_MAX                    ///< Sentinel value
+};
+
+enum doca_gpu_dev_verbs_signal_op {
+    DOCA_GPUNETIO_VERBS_SIGNAL_OP_ADD = 0,  ///< Signal operation - Add
+};
+
+enum {
+    DOCA_GPUNETIO_VERBS_WQE_SEG_CNT_RDMA_WRITE_INL_MIN = 3,
+    DOCA_GPUNETIO_VERBS_WQE_SEG_CNT_RDMA_WRITE_INL_MAX = 4,
+    DOCA_GPUNETIO_VERBS_WQE_SEG_CNT_ATOMIC_FA_CAS = 4,
+    DOCA_GPUNETIO_VERBS_WQE_SEG_CNT_WAIT = 2
+};
+
+enum {
+    DOCA_GPUNETIO_IB_MLX5_INLINE_SEG = 0x80000000,
+};
+
+enum {
+    DOCA_GPUNETIO_IB_MLX5_CQE_OWNER_MASK = 1,
+    DOCA_GPUNETIO_IB_MLX5_CQE_REQ = 0,
+    DOCA_GPUNETIO_IB_MLX5_CQE_RESP_WR_IMM = 1,
+    DOCA_GPUNETIO_IB_MLX5_CQE_RESP_SEND = 2,
+    DOCA_GPUNETIO_IB_MLX5_CQE_RESP_SEND_IMM = 3,
+    DOCA_GPUNETIO_IB_MLX5_CQE_RESP_SEND_INV = 4,
+    DOCA_GPUNETIO_IB_MLX5_CQE_RESIZE_CQ = 5,
+    DOCA_GPUNETIO_IB_MLX5_CQE_NO_PACKET = 6,
+    DOCA_GPUNETIO_IB_MLX5_CQE_SIG_ERR = 12,
+    DOCA_GPUNETIO_IB_MLX5_CQE_REQ_ERR = 13,
+    DOCA_GPUNETIO_IB_MLX5_CQE_RESP_ERR = 14,
+    DOCA_GPUNETIO_IB_MLX5_CQE_INVALID = 15,
+};
+
+struct doca_gpunetio_ib_mlx5_wqe_data_seg {
+    __be32 byte_count;
+    __be32 lkey;
+    __be64 addr;
+};
+
+struct doca_gpunetio_ib_mlx5_wqe_ctrl_seg {
+    __be32 opmod_idx_opcode;
+    __be32 qpn_ds;
+    uint8_t signature;
+    __be16 dci_stream_channel_id;
+    uint8_t fm_ce_se;
+    __be32 imm;
+} __attribute__((__packed__)) __attribute__((__aligned__(4)));
+
+struct doca_gpunetio_ib_mlx5_wqe_raddr_seg {
+    __be64 raddr;
+    __be32 rkey;
+    __be32 reserved;
+};
+
+struct doca_gpunetio_ib_mlx5_wqe_atomic_seg {
+    __be64 swap_add;
+    __be64 compare;
+};
+
+struct doca_gpunetio_ib_mlx5_wqe_inl_data_seg {
+    uint32_t byte_count;
+};
+
+struct doca_gpunetio_ib_mlx5_tm_cqe {
+    __be32 success;
+    __be16 hw_phase_cnt;
+    uint8_t rsvd0[12];
+};
+
+struct doca_gpunetio_ib_ibv_tmh {
+    uint8_t opcode;      /* from enum ibv_tmh_op */
+    uint8_t reserved[3]; /* must be zero */
+    __be32 app_ctx;      /* opaque user data */
+    __be64 tag;
+};
+
+struct doca_gpunetio_ib_mlx5_cqe64 {
+    union {
+        struct {
+            uint8_t rsvd0[2];
+            __be16 wqe_id;
+            uint8_t rsvd4[13];
+            uint8_t ml_path;
+            uint8_t rsvd20[4];
+            __be16 slid;
+            __be32 flags_rqpn;
+            uint8_t hds_ip_ext;
+            uint8_t l4_hdr_type_etc;
+            __be16 vlan_info;
+        };
+        struct doca_gpunetio_ib_mlx5_tm_cqe tm_cqe;
+        /* TMH is scattered to CQE upon match */
+        struct doca_gpunetio_ib_ibv_tmh tmh;
+    };
+    __be32 srqn_uidx;
+    __be32 imm_inval_pkey;
+    uint8_t app;
+    uint8_t app_op;
+    __be16 app_info;
+    __be32 byte_cnt;
+    __be64 timestamp;
+    __be32 sop_drop_qpn;
+    __be16 wqe_counter;
+    uint8_t signature;
+    uint8_t op_own;
+};
+
+struct doca_gpunetio_ib_mlx5_err_cqe_ex {
+    uint8_t rsvd0[32];
+    __be32 srqn;
+    uint8_t rsvd1[16];
+    uint8_t hw_err_synd;
+    uint8_t hw_synd_type;
+    uint8_t vendor_err_synd;
+    uint8_t syndrome;
+    __be32 s_wqe_opcode_qpn;
+    __be16 wqe_counter;
+    uint8_t signature;
+    uint8_t op_own;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DOCA_GPUNETIO_VERBS_DEF_H */
+
+/** @} */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/common/doca_gpunetio_verbs_dev.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/common/doca_gpunetio_verbs_dev.h
new file mode 100644
index 00000000000..1067833df65
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/common/doca_gpunetio_verbs_dev.h
@@ -0,0 +1,203 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_verbs_dev.h
+ * @brief GDAKI common definitions
+ *
+ * @{
+ */
+#ifndef DOCA_GPUNETIO_VERBS_DEV_H
+#define DOCA_GPUNETIO_VERBS_DEV_H
+
+#include "doca_gpunetio_verbs_def.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @typedef doca_gpu_dev_verbs_ticket_t
+ * @brief Ticket type used in one-sided APIs (an unsigned 64-bit integer).
+ */
+typedef uint64_t doca_gpu_dev_verbs_ticket_t;
+
+/**
+ * Describes GPUNetIO dev WQE ctrl (control) segment.
+ */
+struct doca_gpu_dev_verbs_wqe_ctrl_seg {
+    __be32 opmod_idx_opcode; /**< opcode + wqe idx */
+    __be32 qpn_ds;           /**< qp number (and "ds", per the field name) */
+    union {
+        struct {
+            uint8_t signature; /**< signature */
+            uint8_t rsvd[2];   /**< reserved */
+            uint8_t fm_ce_se;  /**< fm_ce_se */
+        };
+        struct {
+            __be32 signature_fm_ce_se; /**< the three fields above viewed as one 32-bit word */
+        };
+    };
+
+    __be32 imm; /**< immediate */
+} __attribute__((__aligned__(8)));
+
+/**
+ * Describes GPUNetIO dev WQE wait segment.
+ */
+struct doca_gpu_dev_verbs_wqe_wait_seg {
+    uint32_t resv[2]; /**< reserved */
+    __be32 max_index; /**< NOTE(review): presumably the index being waited for - confirm */
+    __be32 qpn_cqn;   /**< QP/CQ number, per the field name */
+} __attribute__((__packed__)) __attribute__((__aligned__(8)));
+
+/**
+ * @struct doca_gpu_dev_verbs_addr
+ * @brief This structure holds the address and key of a memory region.
+ */
+struct doca_gpu_dev_verbs_addr {
+    uint64_t addr; /**< address, stored as a plain uint64_t */
+    __be32 key;    /**< memory key, big-endian per its type name */
+};
+
+/**
+ * Describes GPUNetIO dev general WQE: a union of alternative layouts over the same storage.
+ */
+struct doca_gpu_dev_verbs_wqe {
+    union {
+        /* Generic inline Data: the WQE viewed as 64 raw bytes */
+        struct {
+            uint8_t inl_data[64];
+        };
+
+        /* Generic Data: four data segments and no control segment */
+        struct {
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg dseg0;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg dseg1;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg dseg2;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg dseg3;
+        };
+
+        /* Read/Write: control + remote address + two data segments */
+        struct {
+            struct doca_gpu_dev_verbs_wqe_ctrl_seg rw_cseg;
+            struct doca_gpunetio_ib_mlx5_wqe_raddr_seg rw_rseg;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg rw_dseg0;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg rw_dseg1;
+        };
+
+        /* Atomic: control + remote address + atomic segment + one data segment */
+        struct {
+            struct doca_gpu_dev_verbs_wqe_ctrl_seg at_cseg;
+            struct doca_gpunetio_ib_mlx5_wqe_raddr_seg at_rseg;
+            struct doca_gpunetio_ib_mlx5_wqe_atomic_seg at_seg;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg at_dseg;
+        };
+
+        /* Send: control + three data segments */
+        struct {
+            struct doca_gpu_dev_verbs_wqe_ctrl_seg snd_cseg;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg snd_dseg0;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg snd_dseg1;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg snd_dseg2;
+        };
+
+        /* Wait: control + wait segment; the remaining two members are named as padding */
+        struct {
+            struct doca_gpu_dev_verbs_wqe_ctrl_seg wait_cseg;
+            struct doca_gpu_dev_verbs_wqe_wait_seg wait_dseg;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg padding0;
+            struct doca_gpunetio_ib_mlx5_wqe_data_seg padding1;
+        };
+    };
+} __attribute__((__aligned__(8)));
+
+/**
+ * Describes GPUNetIO dev CQ (completion queue) state
+ */
+struct doca_gpu_dev_verbs_cq {
+    uint8_t *cqe_daddr;                         /**< CQE address */
+    uint32_t cq_num;                            /**< CQ number */
+    uint32_t cqe_num;                           /**< Total number of CQEs in CQ */
+    __be32 *dbrec;                              /**< CQE Doorbell Record */
+    uint64_t cqe_ci;                            /**< CQE Consumer Index */
+    uint32_t cqe_mask;                          /**< Mask of total number of CQEs in CQ */
+    uint8_t cqe_size;                           /**< Single CQE size (64B default) */
+    uint64_t cqe_rsvd;                          /**< All previous CQEs are polled */
+    enum doca_gpu_dev_verbs_mem_type mem_type;  ///< Memory type of the completion queue
+};
+
+/**
+ * Describes GPUNetIO dev QP (queue pair) state: send queue, receive queue and their two CQs
+ */
+struct doca_gpu_dev_verbs_qp {
+    uint64_t sq_rsvd_index;        ///< All WQE slots prior to this index are reserved
+    uint64_t sq_ready_index;       ///< All WQE slots prior to this index are ready
+    uint64_t sq_wqe_pi;            /**< NOTE(review): presumably SQ WQE producer index - confirm */
+    uint32_t sq_num;               /**< SQ num */
+    uint32_t sq_num_shift8;        /**< SQ num << 8 */
+    uint32_t sq_num_shift8_be;     /**< SQ num << 8 big endian */
+    uint32_t sq_num_shift8_be_1ds; /**< SQ num << 8 big endian; "1ds" variant (TODO confirm) */
+    uint32_t sq_num_shift8_be_2ds; /**< SQ num << 8 big endian; "2ds" variant (TODO confirm) */
+    uint32_t sq_num_shift8_be_3ds; /**< SQ num << 8 big endian; "3ds" variant (TODO confirm) */
+    uint32_t sq_num_shift8_be_4ds; /**< SQ num << 8 big endian; "4ds" variant (TODO confirm) */
+    int sq_lock;                   /**< SQ lock */
+    uint16_t sq_wqe_num;           /**< NOTE(review): presumably number of SQ WQEs - confirm */
+    uint16_t sq_wqe_mask;          /**< NOTE(review): presumably index mask for SQ WQEs - confirm */
+    uint8_t *sq_wqe_daddr;         /**< NOTE(review): presumably SQ WQE buffer address - confirm */
+    __be32 *sq_dbrec;              /**< NOTE(review): presumably SQ doorbell record - confirm */
+    uint64_t *sq_db;               /**< NOTE(review): presumably SQ doorbell register - confirm */
+
+    /* Compatibility with DOCA GPUNetIO full, not really used */
+    uint32_t rq_num;         /**< tbd */
+    uint64_t rq_wqe_pi;      /**< tbd */
+    uint32_t rq_wqe_num;     /**< tbd */
+    uint32_t rq_wqe_mask;    /**< tbd */
+    uint8_t *rq_wqe_daddr;   /**< tbd */
+    __be32 *rq_dbrec;        /**< tbd */
+    uint32_t rcv_wqe_size;   /**< tbd */
+    uint64_t rq_rsvd_index;  /**< All previous WQEs are reserved */
+    uint64_t rq_ready_index; /**< All previous WQEs are ready */
+    int rq_lock;             /**< RQ lock */
+
+    struct doca_gpu_dev_verbs_cq cq_sq; /**< SQ CQ connected to QP */
+    struct doca_gpu_dev_verbs_cq cq_rq; /**< RQ CQ connected to QP */
+
+    enum doca_gpu_dev_verbs_nic_handler nic_handler;  ///< NIC handler
+    enum doca_gpu_dev_verbs_mem_type mem_type;        ///< Memory type of the QP (TODO confirm)
+} __attribute__((__aligned__(8)));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DOCA_GPUNETIO_VERBS_DEV_H */
+
+/** @} */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_common.cuh b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_common.cuh
new file mode 100644
index 00000000000..293754d59c3
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_common.cuh
@@ -0,0 +1,422 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_dev_verbs_common.cuh
+ * @brief GDAKI common device structs and functions
+ *
+ * @{
+ */
+#ifndef DOCA_GPUNETIO_DEV_VERBS_COMMON_H
+#define DOCA_GPUNETIO_DEV_VERBS_COMMON_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <cuda/atomic>
+#include <math.h>
+
+#include "../common/doca_gpunetio_verbs_dev.h"
+
+#if __CUDA_ARCH__ >= 1000  // gates doca_gpu_dev_verbs_async_store_release() in this header
+#define DOCA_GPUNETIO_VERBS_HAS_ASYNC_STORE_RELEASE 1
+#endif
+
+#if __CUDA_ARCH__ >= 900  // not referenced again in the part of this header reviewed here
+#define DOCA_GPUNETIO_VERBS_HAS_TMA_COPY 1
+#endif
+
+#if CUDA_VERSION >= 12020  // gates doca_gpu_dev_verbs_store_relaxed_mmio() in this header
+#define DOCA_GPUNETIO_VERBS_HAS_STORE_RELAXED_MMIO 1
+#else
+#warning "warning: doca_gpunetio should be used with a CUDA version >= 12020."
+#endif
+
+#if CUDA_VERSION >= 12080 && __CUDA_ARCH__ >= 900  // selects the native fence.acquire/release path
+#define DOCA_GPUNETIO_VERBS_HAS_FENCE_ACQUIRE_RELEASE_PTX 1
+#endif
+
+/**
+ * @brief Queries the global timer (the PTX %globaltimer special register)
+ *
+ * @return The value of the global timer, as a 64-bit unsigned integer
+ */
+__device__ static __forceinline__ uint64_t doca_gpu_dev_verbs_query_globaltimer() {
+    uint64_t ret;
+    asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(ret)::"memory");  // "memory" clobber declared
+    return ret;
+}
+
+__device__ static __forceinline__ unsigned int doca_gpu_dev_verbs_get_lane_id() {
+    unsigned int ret;  // receives the PTX %laneid special register, which is what is returned
+    asm volatile("mov.u32 %0, %%laneid;" : "=r"(ret));
+    return ret;
+}
+
+__device__ static __forceinline__ uint64_t doca_gpu_dev_verbs_bswap64(uint64_t x) {
+    uint64_t ret;  // byte-reversed copy of x, assembled from its two 32-bit halves
+    asm volatile(
+        "{\n\t"
+        ".reg .b32 mask;\n\t"
+        ".reg .b32 ign;\n\t"                   // second prmt source; never written here
+        ".reg .b32 lo;\n\t"
+        ".reg .b32 hi;\n\t"
+        ".reg .b32 new_lo;\n\t"
+        ".reg .b32 new_hi;\n\t"
+        "mov.b32 mask, 0x0123;\n\t"            // selector picks source bytes 3,2,1,0 in that order
+        "mov.b64 {lo,hi}, %1;\n\t"             // split x into its low and high words
+        "prmt.b32 new_hi, lo, ign, mask;\n\t"  // reversed low word becomes the new high word
+        "prmt.b32 new_lo, hi, ign, mask;\n\t"  // reversed high word becomes the new low word
+        "mov.b64 %0, {new_lo,new_hi};\n\t"     // repack the two words into the result
+        "}"
+        : "=l"(ret)
+        : "l"(x));
+    return ret;
+}
+
+__device__ static __forceinline__ uint32_t doca_gpu_dev_verbs_bswap32(uint32_t x) {
+    uint32_t ret;  // byte-reversed copy of x
+    asm volatile(
+        "{\n\t"
+        ".reg .b32 mask;\n\t"
+        ".reg .b32 ign;\n\t"               // second prmt source; never written here
+        "mov.b32 mask, 0x0123;\n\t"        // selector picks source bytes 3,2,1,0 in that order
+        "prmt.b32 %0, %1, ign, mask;\n\t"  // every selected byte comes from %1 (x)
+        "}"
+        : "=r"(ret)
+        : "r"(x));
+    return ret;
+}
+
+__device__ static __forceinline__ uint16_t doca_gpu_dev_verbs_bswap16(uint16_t x) {
+    uint16_t ret;  // x with its two bytes exchanged
+    asm volatile(
+        "{\n\t"
+        ".reg .b8 hi;\n\t"
+        ".reg .b8 lo;\n\t"
+        "mov.b16 {hi, lo}, %1;\n\t"  // unpack x into two 8-bit registers
+        "mov.b16 %0, {lo, hi};\n\t"  // repack them in the opposite order
+        "}"
+        : "=h"(ret)
+        : "h"(x));
+    return ret;
+}
+
+#ifdef DOCA_GPUNETIO_VERBS_HAS_STORE_RELAXED_MMIO  // defined above only when CUDA_VERSION >= 12020
+__device__ static __forceinline__ void doca_gpu_dev_verbs_store_relaxed_mmio(uint64_t *ptr,
+                                                                             uint64_t val) {
+    asm volatile("st.mmio.relaxed.sys.global.b64 [%0], %1;" : : "l"(ptr), "l"(val));  // 64-bit store
+}
+#endif
+
+// Acquire fence at the scope selected by sync_scope (CTA, GPU, otherwise system).
+template <enum doca_gpu_dev_verbs_sync_scope sync_scope>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_fence_acquire() {
+#ifdef DOCA_GPUNETIO_VERBS_HAS_FENCE_ACQUIRE_RELEASE_PTX
+    if (sync_scope == DOCA_GPUNETIO_VERBS_SYNC_SCOPE_CTA)
+        asm volatile("fence.acquire.cta;");
+    else if (sync_scope == DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU)
+        asm volatile("fence.acquire.gpu;");
+    else
+        asm volatile("fence.acquire.sys;");
+#else
+    // fence.acquire is not available in PTX here. Emulate it with ld.acquire from a dummy local.
+    uint32_t dummy, loaded;  // `loaded` is written by the ld, so it is bound as an output ("=r")
+    if (sync_scope == DOCA_GPUNETIO_VERBS_SYNC_SCOPE_CTA)
+        asm volatile("ld.acquire.cta.b32 %0, [%1];" : "=r"(loaded) : "l"(&dummy));
+    else if (sync_scope == DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU)
+        asm volatile("ld.acquire.gpu.b32 %0, [%1];" : "=r"(loaded) : "l"(&dummy));
+    else
+        asm volatile("ld.acquire.sys.b32 %0, [%1];" : "=r"(loaded) : "l"(&dummy));
+#endif
+}
+
+template <enum doca_gpu_dev_verbs_sync_scope sync_scope>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_fence_release() {
+#ifdef DOCA_GPUNETIO_VERBS_HAS_FENCE_ACQUIRE_RELEASE_PTX
+    if (sync_scope == DOCA_GPUNETIO_VERBS_SYNC_SCOPE_CTA)  // native release fence per scope
+        asm volatile("fence.release.cta;");
+    else if (sync_scope == DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU)
+        asm volatile("fence.release.gpu;");
+    else  // any other scope value falls back to system scope
+        asm volatile("fence.release.sys;");
+#else
+    // fence.release is not available in PTX. Emulate that with st.release to a dummy local.
+    uint32_t dummy;          // store target only; never read afterwards
+    const uint32_t val = 0;  // the value that gets stored
+    if (sync_scope == DOCA_GPUNETIO_VERBS_SYNC_SCOPE_CTA)
+        asm volatile("st.release.cta.u32 [%0], %1;" : : "l"(&dummy), "r"(val));
+    else if (sync_scope == DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU)
+        asm volatile("st.release.gpu.u32 [%0], %1;" : : "l"(&dummy), "r"(val));
+    else
+        asm volatile("st.release.sys.u32 [%0], %1;" : : "l"(&dummy), "r"(val));
+#endif
+}
+
+#ifdef DOCA_GPUNETIO_VERBS_HAS_ASYNC_STORE_RELEASE
+template <enum doca_gpu_dev_verbs_sync_scope sync_scope>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_async_store_release(uint32_t *ptr,
+                                                                              uint32_t val) {
+    if (sync_scope == DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU)  // NOTE(review): no ".global" here only
+        asm volatile("st.async.mmio.release.gpu.b32 [%0], %1;" : : "l"(ptr), "r"(val));
+    else  // every non-GPU scope value, CTA included, is issued at system scope
+        asm volatile("st.async.mmio.release.sys.global.b32 [%0], %1;" : : "l"(ptr), "r"(val));
+}
+
+template <enum doca_gpu_dev_verbs_sync_scope sync_scope>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_async_store_release(uint64_t *ptr,
+                                                                              uint64_t val) {
+    if (sync_scope == DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU)  // 64-bit overload: GPU-scope store
+        asm volatile("st.async.mmio.release.gpu.global.b64 [%0], %1;" : : "l"(ptr), "l"(val));
+    else  // every non-GPU scope value, CTA included, is issued at system scope
+        asm volatile("st.async.mmio.release.sys.global.b64 [%0], %1;" : : "l"(ptr), "l"(val));
+}
+#endif
+
+__device__ static __forceinline__ bool doca_gpu_dev_verbs_isaligned(void *ptr, size_t alignment) {
+    // True when no address bit below `alignment` is set; the mask assumes a power-of-two alignment.
+    const uintptr_t low_bits = (uintptr_t)ptr & (alignment - 1);
+    return low_bits == 0;
+}
+
+/**
+ * @brief Copy bytes from src to dst in 4-, 2- or 1-byte pieces (the widest that still fits).
+ *
+ * @param dst - Destination pointer; must be naturally aligned for each piece written
+ * @param src - Source pointer; must be naturally aligned for each piece read
+ * @param bytes - Number of bytes to copy
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_memcpy_aligned_data(void *dst, void *src,
+                                                                              size_t bytes) {
+    uint8_t *out = (uint8_t *)dst;
+    uint8_t *in = (uint8_t *)src;
+    for (size_t left = bytes, step = 0; left != 0;) {
+        if (left >= sizeof(uint32_t)) {
+            step = sizeof(uint32_t);
+            *(uint32_t *)out = *(uint32_t *)in;
+        } else if (left >= sizeof(uint16_t)) {
+            step = sizeof(uint16_t);
+            *(uint16_t *)out = *(uint16_t *)in;
+        } else {
+            step = sizeof(uint8_t);
+            *out = *in;
+        }
+        left -= step;
+        out += step;
+        in += step;
+    }
+}
+
+/**
+ * @brief Copy bytes from src to dst when neither pointer is known to be naturally aligned.
+ *
+ * Each iteration moves 8, 4, 2 or 1 bytes: the widest width for which both current addresses
+ * are multiples of that width and at least that many bytes remain. A single byte is always
+ * possible, so the loop makes progress for any pointers and any length.
+ *
+ * @param dst - Destination pointer
+ * @param src - Source pointer
+ * @param bytes - Number of bytes to copy
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_memcpy_data(void *dst, void *src,
+                                                                      size_t bytes) {
+    uint8_t *out = (uint8_t *)dst;
+    uint8_t *in = (uint8_t *)src;
+    for (size_t left = bytes, step = 0; left != 0;) {
+        // A low bit set in either address rules out every access width that needs it clear.
+        const uintptr_t addr_bits = (uintptr_t)out | (uintptr_t)in;
+        if (left >= sizeof(uint64_t) && (addr_bits & (sizeof(uint64_t) - 1)) == 0) {
+            step = sizeof(uint64_t);
+            *(uint64_t *)out = *(uint64_t *)in;
+        } else if (left >= sizeof(uint32_t) && (addr_bits & (sizeof(uint32_t) - 1)) == 0) {
+            step = sizeof(uint32_t);
+            *(uint32_t *)out = *(uint32_t *)in;
+        } else if (left >= sizeof(uint16_t) && (addr_bits & (sizeof(uint16_t) - 1)) == 0) {
+            step = sizeof(uint16_t);
+            *(uint16_t *)out = *(uint16_t *)in;
+        } else {
+            step = sizeof(uint8_t);
+            *out = *in;
+        }
+        left -= step;
+        out += step;
+        in += step;
+    }
+}
+
+// Copy `bytes` bytes from src to dst one T at a time; a trailing partial T is not copied.
+template <typename T>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_memcpy_inl_aligned_data(T *dst, T *src,
+                                                                                  size_t bytes) {
+    // Stop once less than one whole T is left: testing `> 0` would wrap the unsigned count on a
+    // length that is not a multiple of sizeof(T) and never terminate.
+    for (size_t left = bytes; left >= sizeof(T); left -= sizeof(T)) {
+        // Typed pointer arithmetic: assigning a void * back to a T * is ill-formed in C++.
+        *dst++ = *src++;
+    }
+}
+
+template <typename T, enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode,
+          bool need_fence_acquire = false>
+__device__ static __forceinline__ T doca_gpu_dev_verbs_atomic_max(T *ptr, T val) {
+    const auto order =  // acquire only when the caller asked for it, relaxed otherwise
+        need_fence_acquire ? cuda::memory_order_acquire : cuda::memory_order_relaxed;
+    if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_CTA) {
+        cuda::atomic_ref<T, cuda::thread_scope_block> block_ref(*ptr);
+        return block_ref.fetch_max(val, order);
+    }
+    if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU) {
+        cuda::atomic_ref<T, cuda::thread_scope_device> device_ref(*ptr);
+        return device_ref.fetch_max(val, order);
+    }
+    if (resource_sharing_mode != DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_EXCLUSIVE) return 0;
+    T previous = *ptr;  // exclusive mode: plain read-modify-write, the old value is returned
+    *ptr = max(previous, val);
+    return previous;
+}
+
+template <typename T, enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode>
+__device__ static __forceinline__ T doca_gpu_dev_verbs_atomic_add(T *ptr, T val) {
+    if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_CTA) {
+        cuda::atomic_ref<T, cuda::thread_scope_block> block_ref(*ptr);
+        return block_ref.fetch_add(val, cuda::memory_order_relaxed);
+    }
+    if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU) {
+        cuda::atomic_ref<T, cuda::thread_scope_device> device_ref(*ptr);
+        return device_ref.fetch_add(val, cuda::memory_order_relaxed);
+    }
+    if (resource_sharing_mode != DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_EXCLUSIVE) return 0;
+    T previous = *ptr;  // exclusive mode: plain read-modify-write, the old value is returned
+    *ptr = previous + val;
+    return previous;
+}
+
+template <typename T, enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode>
+__device__ static __forceinline__ T doca_gpu_dev_verbs_atomic_read(T *ptr) {
+    // Every mode except exclusive reads through READ_ONCE; exclusive mode uses a plain load.
+    if (resource_sharing_mode != DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_EXCLUSIVE)
+        return READ_ONCE(*ptr);
+    return *ptr;
+}
+
+/**
+ * @brief Take a lock, honoring the resource sharing mode (shared modes spin until it is free)
+ *
+ * @param lock - Pointer to the lock word (0 when free, 1 when held)
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_lock(int *lock) {
+    if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU) {
+        while (atomicCAS(lock, 0, 1) != 0) { /* spin until the 0 -> 1 swap succeeds */ }
+        doca_gpu_dev_verbs_fence_acquire<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>();
+    } else if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_CTA) {
+        while (atomicCAS_block(lock, 0, 1) != 0) { /* spin until the 0 -> 1 swap succeeds */ }
+        doca_gpu_dev_verbs_fence_acquire<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_CTA>();
+    } else if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_EXCLUSIVE) {
+        *lock = 1;  // exclusive mode: plain store, no atomics and no fence
+    }
+}
+
+/**
+ * @brief Release a lock by storing 0 to it, honoring the resource sharing mode
+ *
+ * @param lock - Pointer to the lock word (0 when free, 1 when held)
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_unlock(int *lock) {
+    if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU) {
+        cuda::atomic_ref<int, cuda::thread_scope_device> device_ref(*lock);
+        device_ref.store(0, cuda::memory_order_release);
+    } else if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_CTA) {
+        cuda::atomic_ref<int, cuda::thread_scope_block> block_ref(*lock);
+        block_ref.store(0, cuda::memory_order_release);
+    } else if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_EXCLUSIVE) {
+        *lock = 0;  // exclusive mode: plain store, no atomics
+    }
+}
+
+__device__ static __forceinline__ uint8_t doca_gpu_dev_verbs_load_relaxed_sys_global(uint8_t *ptr) {
+    uint16_t ret;  // the 8-bit load lands in a 16-bit ("h") register and is narrowed on return
+    asm volatile("ld.relaxed.sys.global.L1::no_allocate.b8 %0, [%1];" : "=h"(ret) : "l"(ptr));
+    return (uint8_t)ret;
+}
+
+__device__ static __forceinline__ uint32_t
+doca_gpu_dev_verbs_load_relaxed_sys_global(uint32_t *ptr) {
+    uint32_t ret;  // 32-bit overload: relaxed, system-scope load with the L1::no_allocate hint
+    asm volatile("ld.relaxed.sys.global.L1::no_allocate.b32 %0, [%1];" : "=r"(ret) : "l"(ptr));
+    return ret;
+}
+
+__device__ static __forceinline__ uint64_t
+doca_gpu_dev_verbs_load_relaxed_sys_global(uint64_t *ptr) {
+    uint64_t ret;  // 64-bit overload: relaxed, system-scope load with the L1::no_allocate hint
+    asm volatile("ld.relaxed.sys.global.L1::no_allocate.b64 %0, [%1];" : "=l"(ret) : "l"(ptr));
+    return ret;
+}
+
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode>
+__device__ static __forceinline__ uint64_t doca_gpu_dev_verbs_load_relaxed(uint64_t *ptr) {
+    uint64_t ret = 0;  // stays 0 when the mode matches none of the three branches below
+    if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_EXCLUSIVE)
+        ret = *ptr;  // plain load
+    else if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_CTA)
+        asm volatile("ld.relaxed.cta.b64 %0, [%1];" : "=l"(ret) : "l"(ptr));  // CTA-scope relaxed load
+    else if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU)
+        asm volatile("ld.relaxed.gpu.b64 %0, [%1];" : "=l"(ret) : "l"(ptr));  // GPU-scope relaxed load
+    return ret;
+}
+
+/**
+ * @brief Divide x by a power of two, rounding up: ceil(x / y) with y = 2^denominator_shift
+ *
+ * @param x - Numerator
+ * @param denominator_shift - Denominator shift (y = 2^denominator_shift)
+ * @return The ceiling of x / y
+ */
+__device__ static __forceinline__ uint64_t
+doca_gpu_dev_verbs_div_ceil_aligned_pow2(uint64_t x, unsigned int denominator_shift) {
+    const uint64_t remainder_mask = (1ULL << denominator_shift) - 1;
+    return (x >> denominator_shift) + (((x & remainder_mask) != 0) ? 1 : 0);
+}
+
+/**
+ * @brief 32-bit-result variant of doca_gpu_dev_verbs_div_ceil_aligned_pow2 (described as faster).
+ *
+ * @param x - Numerator; ceil(x / 2^denominator_shift) must fit in 32 bits
+ * @param denominator_shift - Denominator shift (y = 2^denominator_shift)
+ * @return The ceiling of x / y
+ */
+__device__ static __forceinline__ uint32_t
+doca_gpu_dev_verbs_div_ceil_aligned_pow2_32bits(uint64_t x, int denominator_shift) {
+    const uint32_t has_remainder = !!__funnelshift_r(0, uint32_t(x), denominator_shift);
+    return uint32_t(x >> denominator_shift) + has_remainder;
+}
+
+#endif /* DOCA_GPUNETIO_DEV_VERBS_COMMON_H */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_counter.cuh b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_counter.cuh
new file mode 100644
index 00000000000..f178866435e
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_counter.cuh
@@ -0,0 +1,421 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_dev_verbs_counter.cuh
+ * @brief GDAKI CUDA device functions for one-sided ops paired with a counter update on a companion QP
+ *
+ * @{
+ */
+
+#ifndef DOCA_GPUNETIO_DEV_VERBS_COUNTER_CUH
+#define DOCA_GPUNETIO_DEV_VERBS_COUNTER_CUH
+
+#include "doca_gpunetio_dev_verbs_qp.cuh"
+#include "doca_gpunetio_dev_verbs_cq.cuh"
+
+/**
+ * @brief Submit work requests on multiple QPs to the NIC using the DB protocol.
+ *
+ * Runs two passes over the QPs. Pass 1 takes each QP's sq_lock, raises sq_wqe_pi to the
+ * requested producer index with an atomic max and, if the index advanced, writes the
+ * doorbell value to the address stored in sq_db. Pass 2 calls
+ * doca_priv_gpu_dev_verbs_update_dbr() for every QP whose index advanced, writes the same
+ * doorbell value a second time, and releases the lock. Each sq_lock is therefore held from
+ * pass 1 until that QP is visited in pass 2.
+ *
+ * @tparam num_qps - Number of Queue Pairs (QPs); asserted to be >= 2
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the lock, unlock and atomic-max helpers
+ * @tparam sync_scope - Scope used by the release fence / async release store
+ * @tparam code_opt - Bit flags; DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_ASYNC_STORE_RELEASE selects the
+ * async store path when DOCA_GPUNETIO_VERBS_HAS_ASYNC_STORE_RELEASE is defined
+ * @param qps - Array of num_qps Queue Pair (QP) pointers
+ * @param prod_indices - Array of num_qps producer indices, one per QP
+ */
+template <unsigned int num_qps,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+          enum doca_gpu_dev_verbs_gpu_code_opt code_opt = DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_DEFAULT>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_submit_db_multi_qps(
+    struct doca_gpu_dev_verbs_qp **qps, uint64_t *prod_indices) {
+    DOCA_GPUNETIO_VERBS_ASSERT(num_qps >= 2);
+    // Result of the atomic max per QP (presumably the previous sq_wqe_pi); compared with
+    // prod_indices[i] in both passes to decide whether this call advanced the index.
+    uint64_t old_prod_indices[num_qps];
+    // Doorbell values; entry i is written in pass 1 and read in pass 2 only when QP i advanced.
+    __be64 db_vals[num_qps];
+
+    // Pass 1: lock every QP, publish the new producer index and ring the doorbell early.
+#pragma unroll 2
+    for (unsigned int i = 0; i < num_qps; i++) {
+        doca_gpu_dev_verbs_lock<resource_sharing_mode>(&qps[i]->sq_lock);
+        old_prod_indices[i] = doca_gpu_dev_verbs_atomic_max<uint64_t, resource_sharing_mode, true>(
+            &qps[i]->sq_wqe_pi, prod_indices[i]);
+        if (old_prod_indices[i] < prod_indices[i]) {
+            // Early ringing of the DB to push WQEs to the NIC ASAP.
+            __be64 *db_ptr = (__be64 *)__ldg((uintptr_t *)&qps[i]->sq_db);
+            db_vals[i] = doca_gpu_dev_verbs_prepare_db(qps[i], prod_indices[i]);
+
+#ifdef DOCA_GPUNETIO_VERBS_HAS_ASYNC_STORE_RELEASE
+            if (code_opt & DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_ASYNC_STORE_RELEASE) {
+                doca_gpu_dev_verbs_async_store_release<sync_scope>((uint64_t *)db_ptr,
+                                                                   (uint64_t)db_vals[i]);
+            } else
+#endif
+            {
+                // Release fence first, then a relaxed store of the doorbell value.
+                doca_gpu_dev_verbs_fence_release<sync_scope>();
+#ifdef DOCA_GPUNETIO_VERBS_HAS_STORE_RELAXED_MMIO
+                { doca_gpu_dev_verbs_store_relaxed_mmio((uint64_t *)db_ptr, (uint64_t)db_vals[i]); }
+#else
+                {
+                    // No dedicated MMIO store helper: use a system-scope atomic_ref relaxed store.
+                    cuda::atomic_ref<uint64_t, cuda::thread_scope_system> db_ptr_aref(
+                        *((uint64_t *)db_ptr));
+                    db_ptr_aref.store(db_vals[i], cuda::memory_order_relaxed);
+                }
+#endif
+            }
+        }
+    }
+
+    // Pass 2: update the doorbell record, ring the doorbell again, then unlock each QP.
+#pragma unroll 2
+    for (unsigned int i = 0; i < num_qps; i++) {
+        if (old_prod_indices[i] < prod_indices[i]) {
+            // In case the recovery path is triggered, the later DB ringing will cover for
+            // correctness.
+            doca_priv_gpu_dev_verbs_update_dbr(qps[i], prod_indices[i]);
+            __be64 *db_ptr = (__be64 *)__ldg((uintptr_t *)&qps[i]->sq_db);
+#ifdef DOCA_GPUNETIO_VERBS_HAS_ASYNC_STORE_RELEASE
+            if (code_opt & DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_ASYNC_STORE_RELEASE) {
+                doca_gpu_dev_verbs_async_store_release<sync_scope>((uint64_t *)db_ptr,
+                                                                   (uint64_t)db_vals[i]);
+            } else
+#endif
+            {
+                doca_gpu_dev_verbs_fence_release<sync_scope>();
+#ifdef DOCA_GPUNETIO_VERBS_HAS_STORE_RELAXED_MMIO
+                { doca_gpu_dev_verbs_store_relaxed_mmio((uint64_t *)db_ptr, (uint64_t)db_vals[i]); }
+#else
+                {
+                    cuda::atomic_ref<uint64_t, cuda::thread_scope_system> db_ptr_aref(
+                        *((uint64_t *)db_ptr));
+                    db_ptr_aref.store(db_vals[i], cuda::memory_order_relaxed);
+                }
+#endif
+            }
+        }
+        // The lock is released for every QP, whether or not its index advanced.
+        doca_gpu_dev_verbs_unlock<resource_sharing_mode>(&qps[i]->sq_lock);
+    }
+}
+
+/**
+ * @brief Submit work requests on multiple QPs through the proxy path.
+ *
+ * Issues a single release fence at sync_scope and then calls
+ * doca_gpu_dev_verbs_ring_proxy() once per QP with that QP's producer index.
+ *
+ * @tparam num_qps - Number of Queue Pairs (QPs); asserted to be >= 2
+ * @tparam resource_sharing_mode - Sharing mode forwarded to doca_gpu_dev_verbs_ring_proxy()
+ * @tparam sync_scope - Scope of the release fence
+ * @param qps - Array of num_qps Queue Pair (QP) pointers
+ * @param prod_indices - Array of num_qps producer indices, one per QP
+ */
+template <unsigned int num_qps,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_submit_proxy_multi_qps(
+    struct doca_gpu_dev_verbs_qp **qps, uint64_t *prod_indices) {
+    DOCA_GPUNETIO_VERBS_ASSERT(num_qps >= 2);
+
+    // One fence up front; the per-QP proxy rings below follow it.
+    doca_gpu_dev_verbs_fence_release<sync_scope>();
+
+#pragma unroll 2
+    for (unsigned int qp_idx = 0; qp_idx != num_qps; ++qp_idx)
+        doca_gpu_dev_verbs_ring_proxy<resource_sharing_mode>(qps[qp_idx], prod_indices[qp_idx]);
+}
+
+/**
+ * @brief Submit work requests on multiple QPs, dispatching to the DB or the proxy path.
+ *
+ * When nic_handler is DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO the handler is read from
+ * qps[0]->nic_handler; otherwise the template argument is used as is. A handler equal to
+ * DOCA_GPUNETIO_VERBS_NIC_HANDLER_GPU_SM_DB selects doca_gpu_dev_verbs_submit_db_multi_qps(),
+ * every other value selects doca_gpu_dev_verbs_submit_proxy_multi_qps().
+ *
+ * @tparam num_qps - Number of Queue Pairs (QPs); asserted to be >= 2
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the selected submit function
+ * @tparam sync_scope - Sync scope forwarded to the selected submit function
+ * @tparam nic_handler - Requested NIC handler, or AUTO to take it from the first QP
+ * @param qps - Array of num_qps Queue Pair (QP) pointers
+ * @param prod_indices - Array of num_qps producer indices, one per QP
+ */
+template <unsigned int num_qps,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_submit_multi_qps(
+    struct doca_gpu_dev_verbs_qp **qps, uint64_t *prod_indices) {
+    DOCA_GPUNETIO_VERBS_ASSERT(num_qps >= 2);
+
+    // Resolve the effective handler; only the AUTO case touches the QP structure.
+    enum doca_gpu_dev_verbs_nic_handler effective_handler = nic_handler;
+    if (nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO)
+        effective_handler =
+            (enum doca_gpu_dev_verbs_nic_handler)__ldg((int *)&qps[0]->nic_handler);
+
+    if (effective_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_GPU_SM_DB) {
+        doca_gpu_dev_verbs_submit_db_multi_qps<num_qps, resource_sharing_mode, sync_scope>(
+            qps, prod_indices);
+        return;
+    }
+
+    // Everything that is not GPU_SM_DB goes through the proxy.
+    doca_gpu_dev_verbs_submit_proxy_multi_qps<num_qps, resource_sharing_mode, sync_scope>(
+        qps, prod_indices);
+}
+
+/**
+ * @brief Post an RDMA write on qp and a dependent counter update on companion_qp.
+ *
+ * The transfer is split into chunks of at most DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE bytes,
+ * one WQE per chunk. At least one WQE is always reserved; a zero-sized chunk is posted as a
+ * NOP. On companion_qp two WQEs are posted: a WAIT WQE that references the index of the last
+ * WQE posted on qp together with qp->cq_sq.cq_num, followed by an ATOMIC_FA WQE targeting
+ * counter_raddr with counter_val. Both QPs are then submitted together.
+ *
+ * Only qp is asserted to be non-NULL; companion_qp is dereferenced without a check.
+ *
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the reserve / mark-ready / submit helpers
+ * @tparam nic_handler - NIC handler forwarded to doca_gpu_dev_verbs_submit_multi_qps()
+ * @param qp - QP carrying the RDMA write
+ * @param raddr - Remote address and key of the write destination
+ * @param laddr - Local address and key of the write source
+ * @param size - Number of bytes to write
+ * @param companion_qp - QP carrying the WAIT and counter WQEs
+ * @param counter_raddr - Remote address and key of the counter
+ * @param counter_laddr - Local address and key passed to the atomic WQE
+ * @param counter_val - Value passed to the ATOMIC_FA WQE
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_put_counter(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr,
+    struct doca_gpu_dev_verbs_addr laddr, size_t size, struct doca_gpu_dev_verbs_qp *companion_qp,
+    struct doca_gpu_dev_verbs_addr counter_raddr, struct doca_gpu_dev_verbs_addr counter_laddr,
+    uint64_t counter_val) {
+    constexpr unsigned int num_qps = 2;
+    struct doca_gpu_dev_verbs_wqe *wqe_ptr;
+    uint64_t base_wqe_idx;
+    uint64_t wqe_idx;
+    size_t remaining_size = size;
+    size_t size_;
+    // Number of chunks = ceil(size / 2^MAX_TRANSFER_SIZE_SHIFT), clamped to at least one so
+    // that a zero-sized put still posts a WQE (and wqe_idx is always assigned by the loop).
+    uint64_t num_chunks =
+        doca_gpu_dev_verbs_div_ceil_aligned_pow2(size, DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE_SHIFT);
+    num_chunks = num_chunks > 1 ? num_chunks : 1;
+
+    // DOCA_GPUNETIO_VERBS_ASSERT(out_ticket != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp != NULL);
+    // DOCA_GPUNETIO_VERBS_ASSERT(qp->mem_type == DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU);
+
+    base_wqe_idx = doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(qp, num_chunks);
+#pragma unroll 1
+    for (uint64_t i = 0; i < num_chunks; i++) {
+        wqe_idx = base_wqe_idx + i;
+        // Chunk size: a full MAX_TRANSFER_SIZE unless fewer bytes remain.
+        size_ = remaining_size > DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE
+                    ? DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE
+                    : remaining_size;
+        wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
+
+        [[likely]] if (size_ > 0) {
+            // Chunk i starts i * MAX_TRANSFER_SIZE bytes into both the remote and local buffers.
+            doca_gpu_dev_verbs_wqe_prepare_write(
+                qp, wqe_ptr, wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_WRITE,
+                DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, 0,
+                raddr.addr + (i * DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE), raddr.key,
+                laddr.addr + (i * DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE), laddr.key, size_);
+        } else {
+            // Nothing to transfer in this slot: fill it with a NOP.
+            doca_gpu_dev_verbs_wqe_prepare_nop(qp, wqe_ptr, wqe_idx,
+                                               DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE);
+        }
+        remaining_size -= size_;
+    }
+
+    // wqe_idx now holds the index of the last WQE written on qp.
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(qp, base_wqe_idx, wqe_idx);
+
+    // Companion QP: slot 0 is the WAIT WQE, slot 1 the counter atomic.
+    uint64_t companion_base_wqe_idx =
+        doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(companion_qp, 2);
+    uint64_t companion_wqe_idx = companion_base_wqe_idx;
+
+    // The WAIT WQE is given the last WQE index of qp and the number of qp's send CQ
+    // (presumably so the counter update is held back until that WQE completes - confirm
+    // against doca_gpu_dev_verbs_wqe_prepare_wait).
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(companion_qp, companion_wqe_idx);
+    doca_gpu_dev_verbs_wqe_prepare_wait(companion_qp, wqe_ptr, companion_wqe_idx,
+                                        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, wqe_idx,
+                                        qp->cq_sq.cq_num);
+
+    ++companion_wqe_idx;
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(companion_qp, companion_wqe_idx);
+    doca_gpu_dev_verbs_wqe_prepare_atomic(
+        companion_qp, wqe_ptr, companion_wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA,
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, counter_raddr.addr, counter_raddr.key,
+        counter_laddr.addr, counter_laddr.key, sizeof(uint64_t), counter_val, 0);
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(companion_qp, companion_base_wqe_idx,
+                                                              companion_wqe_idx);
+
+    // Submit both QPs; each producer index is one past the last WQE written on that QP.
+    doca_gpu_dev_verbs_qp *qps[num_qps] = {qp, companion_qp};
+    uint64_t prod_indices[num_qps] = {wqe_idx + 1, companion_wqe_idx + 1};
+    doca_gpu_dev_verbs_submit_multi_qps<num_qps, resource_sharing_mode,
+                                        DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU, nic_handler>(
+        qps, prod_indices);
+}
+
+/**
+ * @brief Post an inline RDMA write of one value on qp and a dependent counter update on
+ * companion_qp.
+ *
+ * One WQE is reserved on qp and filled with an inline RDMA write of sizeof(T) bytes carrying
+ * value. On companion_qp two WQEs are posted: a WAIT WQE that references the index of that
+ * write WQE together with qp->cq_sq.cq_num, followed by an ATOMIC_FA WQE targeting
+ * counter_raddr with counter_val. Both QPs are then submitted together.
+ *
+ * Only qp is asserted to be non-NULL; companion_qp is dereferenced without a check.
+ *
+ * @tparam T - Type of the value written inline
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the reserve / mark-ready / submit helpers
+ * @tparam nic_handler - NIC handler forwarded to doca_gpu_dev_verbs_submit_multi_qps()
+ * @param qp - QP carrying the inline write
+ * @param raddr - Remote address and key of the write destination
+ * @param value - Value to write
+ * @param companion_qp - QP carrying the WAIT and counter WQEs
+ * @param counter_raddr - Remote address and key of the counter
+ * @param counter_laddr - Local address and key passed to the atomic WQE
+ * @param counter_val - Value passed to the ATOMIC_FA WQE
+ */
+template <typename T,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_p_counter(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr, T value,
+    struct doca_gpu_dev_verbs_qp *companion_qp, struct doca_gpu_dev_verbs_addr counter_raddr,
+    struct doca_gpu_dev_verbs_addr counter_laddr, uint64_t counter_val) {
+    constexpr unsigned int num_qps = 2;
+    uint64_t wqe_idx;
+    struct doca_gpu_dev_verbs_wqe *wqe_ptr;
+
+    // DOCA_GPUNETIO_VERBS_ASSERT(out_ticket != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp != NULL);
+    // DOCA_GPUNETIO_VERBS_ASSERT(qp->mem_type == DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU);
+
+    // Single WQE on qp: header first, then the inline payload.
+    wqe_idx = doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(qp, 1);
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
+
+    doca_gpu_dev_verbs_prepare_inl_rdma_write_wqe_header(qp, wqe_ptr, wqe_idx,
+                                                         DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE,
+                                                         raddr.addr, raddr.key, sizeof(T));
+    doca_gpu_dev_verbs_prepare_inl_rdma_write_wqe_data<T>(qp, wqe_ptr, value);
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(qp, wqe_idx, wqe_idx);
+
+    // Companion QP: slot 0 is the WAIT WQE, slot 1 the counter atomic.
+    uint64_t companion_base_wqe_idx =
+        doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(companion_qp, 2);
+    uint64_t companion_wqe_idx = companion_base_wqe_idx;
+
+    // The WAIT WQE is given the write's WQE index and the number of qp's send CQ.
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(companion_qp, companion_wqe_idx);
+    doca_gpu_dev_verbs_wqe_prepare_wait(companion_qp, wqe_ptr, companion_wqe_idx,
+                                        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, wqe_idx,
+                                        qp->cq_sq.cq_num);
+
+    ++companion_wqe_idx;
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(companion_qp, companion_wqe_idx);
+    doca_gpu_dev_verbs_wqe_prepare_atomic(
+        companion_qp, wqe_ptr, companion_wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA,
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, counter_raddr.addr, counter_raddr.key,
+        counter_laddr.addr, counter_laddr.key, sizeof(uint64_t), counter_val, 0);
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(companion_qp, companion_base_wqe_idx,
+                                                              companion_wqe_idx);
+
+    // Submit both QPs; each producer index is one past the last WQE written on that QP.
+    doca_gpu_dev_verbs_qp *qps[num_qps] = {qp, companion_qp};
+    uint64_t prod_indices[num_qps] = {wqe_idx + 1, companion_wqe_idx + 1};
+    doca_gpu_dev_verbs_submit_multi_qps<num_qps, resource_sharing_mode,
+                                        DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU, nic_handler>(
+        qps, prod_indices);
+}
+
+/**
+ * @brief Post an RDMA write plus a signal atomic on qp and a dependent counter update on
+ * companion_qp.
+ *
+ * The transfer is split into chunks of at most DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE bytes,
+ * one WQE per chunk (at least one; a zero-sized chunk is posted as a NOP). One extra WQE on
+ * qp carries an ATOMIC_FA to sig_raddr with sig_val. On companion_qp two WQEs are posted: a
+ * WAIT WQE that references the index of the signal WQE together with qp->cq_sq.cq_num,
+ * followed by an ATOMIC_FA WQE targeting counter_raddr with counter_val. Both QPs are then
+ * submitted together.
+ *
+ * Only qp is asserted to be non-NULL; companion_qp is dereferenced without a check.
+ * NOTE(review): sig_op is not referenced in the body; the signal always uses the
+ * DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA opcode - confirm this is intended.
+ *
+ * @tparam sig_op - Signal operation (currently unused by the body)
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the reserve / mark-ready / submit helpers
+ * @tparam nic_handler - NIC handler forwarded to doca_gpu_dev_verbs_submit_multi_qps()
+ * @param qp - QP carrying the RDMA write and the signal
+ * @param raddr - Remote address and key of the write destination
+ * @param laddr - Local address and key of the write source
+ * @param size - Number of bytes to write
+ * @param sig_raddr - Remote address and key of the signal
+ * @param sig_laddr - Local address and key passed to the signal atomic WQE
+ * @param sig_val - Value passed to the signal ATOMIC_FA WQE
+ * @param companion_qp - QP carrying the WAIT and counter WQEs
+ * @param counter_raddr - Remote address and key of the counter
+ * @param counter_laddr - Local address and key passed to the counter atomic WQE
+ * @param counter_val - Value passed to the counter ATOMIC_FA WQE
+ */
+template <enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_put_signal_counter(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr,
+    struct doca_gpu_dev_verbs_addr laddr, size_t size, struct doca_gpu_dev_verbs_addr sig_raddr,
+    struct doca_gpu_dev_verbs_addr sig_laddr, uint64_t sig_val,
+    struct doca_gpu_dev_verbs_qp *companion_qp, struct doca_gpu_dev_verbs_addr counter_raddr,
+    struct doca_gpu_dev_verbs_addr counter_laddr, uint64_t counter_val) {
+    constexpr unsigned int num_qps = 2;
+    struct doca_gpu_dev_verbs_wqe *wqe_ptr;
+    uint64_t base_wqe_idx;
+    uint64_t wqe_idx;
+    size_t remaining_size = size;
+    size_t size_;
+    // Number of chunks = ceil(size / 2^MAX_TRANSFER_SIZE_SHIFT), clamped to at least one so
+    // that wqe_idx is always assigned by the loop below.
+    uint64_t num_chunks =
+        doca_gpu_dev_verbs_div_ceil_aligned_pow2(size, DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE_SHIFT);
+    num_chunks = num_chunks > 1 ? num_chunks : 1;
+
+    // DOCA_GPUNETIO_VERBS_ASSERT(out_ticket != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp != NULL);
+    // DOCA_GPUNETIO_VERBS_ASSERT(qp->mem_type == DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU);
+
+    // Put
+    // num_chunks data WQEs plus one slot for the signal atomic.
+    base_wqe_idx = doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(qp, num_chunks + 1);
+#pragma unroll 1
+    for (uint64_t i = 0; i < num_chunks; i++) {
+        wqe_idx = base_wqe_idx + i;
+        // Chunk size: a full MAX_TRANSFER_SIZE unless fewer bytes remain.
+        size_ = remaining_size > DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE
+                    ? DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE
+                    : remaining_size;
+        wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
+
+        [[likely]] if (size_ > 0) {
+            // Chunk i starts i * MAX_TRANSFER_SIZE bytes into both the remote and local buffers.
+            doca_gpu_dev_verbs_wqe_prepare_write(
+                qp, wqe_ptr, wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_WRITE,
+                DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, 0,
+                raddr.addr + (i * DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE), raddr.key,
+                laddr.addr + (i * DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE), laddr.key, size_);
+        } else {
+            // Nothing to transfer in this slot: fill it with a NOP.
+            doca_gpu_dev_verbs_wqe_prepare_nop(qp, wqe_ptr, wqe_idx,
+                                               DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE);
+        }
+        remaining_size -= size_;
+    }
+
+    // Signal
+    // Uses the extra slot reserved above, right after the last data WQE.
+    ++wqe_idx;
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
+    doca_gpu_dev_verbs_wqe_prepare_atomic(
+        qp, wqe_ptr, wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA,
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, sig_raddr.addr, sig_raddr.key, sig_laddr.addr,
+        sig_laddr.key, sizeof(uint64_t), sig_val, 0);
+
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(qp, base_wqe_idx, wqe_idx);
+
+    // Counter
+    // Companion QP: slot 0 is the WAIT WQE, slot 1 the counter atomic.
+    uint64_t companion_base_wqe_idx =
+        doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(companion_qp, 2);
+    uint64_t companion_wqe_idx = companion_base_wqe_idx;
+
+    // The WAIT WQE is given the signal WQE's index and the number of qp's send CQ.
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(companion_qp, companion_wqe_idx);
+    doca_gpu_dev_verbs_wqe_prepare_wait(companion_qp, wqe_ptr, companion_wqe_idx,
+                                        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, wqe_idx,
+                                        qp->cq_sq.cq_num);
+
+    ++companion_wqe_idx;
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(companion_qp, companion_wqe_idx);
+    doca_gpu_dev_verbs_wqe_prepare_atomic(
+        companion_qp, wqe_ptr, companion_wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA,
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, counter_raddr.addr, counter_raddr.key,
+        counter_laddr.addr, counter_laddr.key, sizeof(uint64_t), counter_val, 0);
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(companion_qp, companion_base_wqe_idx,
+                                                              companion_wqe_idx);
+
+    // Submit both QPs; each producer index is one past the last WQE written on that QP.
+    doca_gpu_dev_verbs_qp *qps[num_qps] = {qp, companion_qp};
+    uint64_t prod_indices[num_qps] = {wqe_idx + 1, companion_wqe_idx + 1};
+    doca_gpu_dev_verbs_submit_multi_qps<num_qps, resource_sharing_mode,
+                                        DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU, nic_handler>(
+        qps, prod_indices);
+}
+
+/**
+ * @brief Post a signal atomic on qp and a dependent counter update on companion_qp.
+ *
+ * One WQE is reserved on qp and filled with an ATOMIC_FA to sig_raddr with sig_val. On
+ * companion_qp two WQEs are posted: a WAIT WQE that references the index of the signal WQE
+ * together with qp->cq_sq.cq_num, followed by an ATOMIC_FA WQE targeting counter_raddr with
+ * counter_val. Both QPs are then submitted together.
+ *
+ * Only qp is asserted to be non-NULL; companion_qp is dereferenced without a check.
+ * NOTE(review): sig_op is not referenced in the body; the signal always uses the
+ * DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA opcode - confirm this is intended.
+ *
+ * @tparam sig_op - Signal operation (currently unused by the body)
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the reserve / mark-ready / submit helpers
+ * @tparam nic_handler - NIC handler forwarded to doca_gpu_dev_verbs_submit_multi_qps()
+ * @param qp - QP carrying the signal
+ * @param sig_raddr - Remote address and key of the signal
+ * @param sig_laddr - Local address and key passed to the signal atomic WQE
+ * @param sig_val - Value passed to the signal ATOMIC_FA WQE
+ * @param companion_qp - QP carrying the WAIT and counter WQEs
+ * @param counter_raddr - Remote address and key of the counter
+ * @param counter_laddr - Local address and key passed to the counter atomic WQE
+ * @param counter_val - Value passed to the counter ATOMIC_FA WQE
+ */
+template <enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_signal_counter(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr sig_raddr,
+    struct doca_gpu_dev_verbs_addr sig_laddr, uint64_t sig_val,
+    struct doca_gpu_dev_verbs_qp *companion_qp, struct doca_gpu_dev_verbs_addr counter_raddr,
+    struct doca_gpu_dev_verbs_addr counter_laddr, uint64_t counter_val) {
+    constexpr unsigned int num_qps = 2;
+    uint64_t wqe_idx;
+    struct doca_gpu_dev_verbs_wqe *wqe_ptr;
+
+    // DOCA_GPUNETIO_VERBS_ASSERT(out_ticket != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp != NULL);
+    // DOCA_GPUNETIO_VERBS_ASSERT(qp->mem_type == DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU);
+
+    // Signal
+    wqe_idx = doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(qp, 1);
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
+    doca_gpu_dev_verbs_wqe_prepare_atomic(
+        qp, wqe_ptr, wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA,
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, sig_raddr.addr, sig_raddr.key, sig_laddr.addr,
+        sig_laddr.key, sizeof(uint64_t), sig_val, 0);
+
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(qp, wqe_idx, wqe_idx);
+
+    // Counter
+    // Companion QP: slot 0 is the WAIT WQE, slot 1 the counter atomic.
+    uint64_t companion_base_wqe_idx =
+        doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(companion_qp, 2);
+    uint64_t companion_wqe_idx = companion_base_wqe_idx;
+
+    // The WAIT WQE is given the signal WQE's index and the number of qp's send CQ.
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(companion_qp, companion_wqe_idx);
+    doca_gpu_dev_verbs_wqe_prepare_wait(companion_qp, wqe_ptr, companion_wqe_idx,
+                                        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, wqe_idx,
+                                        qp->cq_sq.cq_num);
+
+    ++companion_wqe_idx;
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(companion_qp, companion_wqe_idx);
+    doca_gpu_dev_verbs_wqe_prepare_atomic(
+        companion_qp, wqe_ptr, companion_wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA,
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, counter_raddr.addr, counter_raddr.key,
+        counter_laddr.addr, counter_laddr.key, sizeof(uint64_t), counter_val, 0);
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(companion_qp, companion_base_wqe_idx,
+                                                              companion_wqe_idx);
+
+    // Submit both QPs; each producer index is one past the last WQE written on that QP.
+    doca_gpu_dev_verbs_qp *qps[num_qps] = {qp, companion_qp};
+    uint64_t prod_indices[num_qps] = {wqe_idx + 1, companion_wqe_idx + 1};
+    doca_gpu_dev_verbs_submit_multi_qps<num_qps, resource_sharing_mode,
+                                        DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU, nic_handler>(
+        qps, prod_indices);
+}
+
+#endif /* DOCA_GPUNETIO_DEV_VERBS_COUNTER_CUH */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_cq.cuh b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_cq.cuh
new file mode 100644
index 00000000000..ae834a7ba4f
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_cq.cuh
@@ -0,0 +1,295 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_dev_verbs_cq.cuh
+ * @brief GDAKI CUDA device functions for CQ management
+ *
+ * @{
+ */
+#ifndef DOCA_GPUNETIO_DEV_VERBS_CQ_H
+#define DOCA_GPUNETIO_DEV_VERBS_CQ_H
+
+#include <errno.h>
+
+#include "doca_gpunetio_dev_verbs_common.cuh"
+
+/**
+ * @brief Get the device CQ embedded in a device QP as its cq_sq member
+ *
+ * @param[in] qp - Dev QP pointer
+ *
+ * @return Pointer to qp->cq_sq
+ */
+__device__ static __forceinline__ struct doca_gpu_dev_verbs_cq *doca_gpu_dev_verbs_qp_get_cq_sq(
+    struct doca_gpu_dev_verbs_qp *qp) {
+    struct doca_gpu_dev_verbs_cq *sq_cq = &qp->cq_sq;
+    return sq_cq;
+}
+
+/**
+ * @brief Increment a CQE index and wrap it with DOCA_GPUNETIO_VERBS_CQE_CI_MASK
+ *
+ * @param[in] cqe_idx - cqe idx
+ * @param[in] increment - cqe idx increment
+ *
+ * @return (cqe_idx + increment) masked with DOCA_GPUNETIO_VERBS_CQE_CI_MASK
+ */
+__device__ static __forceinline__ uint32_t doca_gpu_dev_verbs_cqe_idx_inc_mask(uint32_t cqe_idx,
+                                                                               uint32_t increment) {
+    const uint32_t advanced_idx = cqe_idx + increment;
+    return advanced_idx & DOCA_GPUNETIO_VERBS_CQE_CI_MASK;
+}
+
+#if DOCA_GPUNETIO_VERBS_ENABLE_DEBUG == 1
+/**
+ * @brief Print the error fields of a CQE (debug builds only)
+ *
+ * The CQE is reinterpreted as struct doca_gpunetio_ib_mlx5_err_cqe_ex and its syndrome,
+ * vendor_err_synd, hw_err_synd, hw_synd_type and byte-swapped wqe_counter are printed.
+ *
+ * @param[in] cqe64 - erroneous cqe
+ *
+ * @return
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_cq_print_cqe_err(
+    struct doca_gpunetio_ib_mlx5_cqe64 *cqe64) {
+    struct doca_gpunetio_ib_mlx5_err_cqe_ex *err_view =
+        reinterpret_cast<struct doca_gpunetio_ib_mlx5_err_cqe_ex *>(cqe64);
+
+    printf(
+        "got completion with err: "
+        "syndrome=%#x, vendor_err_synd=%#x, "
+        "hw_err_synd=%#x, hw_synd_type=%#x, wqe_counter=%u\n",
+        err_view->syndrome, err_view->vendor_err_synd, err_view->hw_err_synd,
+        err_view->hw_synd_type, doca_gpu_dev_verbs_bswap16(err_view->wqe_counter));
+}
+#endif
+
+/**
+ * @brief [Internal] Poll the Completion Queue (CQ) at a specific index.
+ * This function does not update the SW consumer index nor guarantees the ordering.
+ * It also does not wait for the completion to arrive.
+ *
+ * The CQE slot is cons_index & (cqe_num - 1) (assumes cqe_num is a power of two - TODO
+ * confirm). A completion counts as observed when the owner bit of the CQE matches
+ * !!(cons_index & cqe_num); on architectures below sm_90 the CQE's wqe_counter must also
+ * equal the low 16 bits of cons_index.
+ *
+ * NOTE(review): the qp_type template parameter is not referenced in the body.
+ *
+ * @param cq - Completion Queue (CQ)
+ * @param cons_index - Index of the Completion Queue (CQ) to be polled
+ * @return 0 if cons_index is below cq->cqe_ci or its completion was observed without error,
+ * EBUSY if cons_index is at or beyond cqe_ci + cqe_num or the completion was not observed,
+ * -EIO if the observed CQE opcode is DOCA_GPUNETIO_IB_MLX5_CQE_REQ_ERR.
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ int doca_priv_gpu_dev_verbs_poll_one_cq_at(
+    struct doca_gpu_dev_verbs_cq *cq, uint64_t cons_index) {
+    uint8_t *cqe = (uint8_t *)__ldg((uintptr_t *)&cq->cqe_daddr);
+    const uint32_t cqe_num = __ldg(&cq->cqe_num);
+    uint32_t idx = cons_index & (cqe_num - 1);
+    // Byte-offset addressing: each slot is DOCA_GPUNETIO_VERBS_CQE_SIZE bytes.
+    struct doca_gpunetio_ib_mlx5_cqe64 *cqe64 =
+        (struct doca_gpunetio_ib_mlx5_cqe64 *)(cqe + (idx * DOCA_GPUNETIO_VERBS_CQE_SIZE));
+
+    uint64_t cqe_ci = doca_gpu_dev_verbs_load_relaxed<resource_sharing_mode>(&cq->cqe_ci);
+
+    // Below the software consumer index: reported as complete without reading the CQE.
+    if (cons_index < cqe_ci) return 0;
+    // At least cqe_num entries ahead of the consumer index: reported as not available.
+    if (cons_index >= cqe_ci + cqe_num) return EBUSY;
+
+    uint8_t opown;
+    uint8_t opcode;
+    bool observed_completion;
+
+#if __CUDA_ARCH__ >= 900
+    // sm_90 and newer: a single-byte load of op_own; only the owner bit is checked.
+    opown = doca_gpu_dev_verbs_load_relaxed_sys_global((uint8_t *)&cqe64->op_own);
+
+    observed_completion =
+        !((opown & DOCA_GPUNETIO_IB_MLX5_CQE_OWNER_MASK) ^ !!(cons_index & cqe_num));
+#else
+    uint32_t cqe_chunk;
+    uint16_t wqe_counter;
+
+    // Older architectures: one 32-bit load starting at wqe_counter. After the byte swap the
+    // upper 16 bits are taken as wqe_counter and the low 8 bits as op_own.
+    cqe_chunk = doca_gpu_dev_verbs_load_relaxed_sys_global((uint32_t *)&cqe64->wqe_counter);
+    cqe_chunk = doca_gpu_dev_verbs_bswap32(cqe_chunk);
+    wqe_counter = cqe_chunk >> 16;
+    opown = cqe_chunk & 0xff;
+
+    observed_completion =
+        !((opown & DOCA_GPUNETIO_IB_MLX5_CQE_OWNER_MASK) ^ !!(cons_index & cqe_num)) &&
+        (wqe_counter == ((uint32_t)cons_index & 0xffff));
+#endif
+
+    if (!observed_completion) return EBUSY;
+
+    opcode = opown >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT;
+
+#if DOCA_GPUNETIO_VERBS_ENABLE_DEBUG == 1
+    if (opcode == DOCA_GPUNETIO_IB_MLX5_CQE_REQ_ERR) doca_gpu_dev_verbs_cq_print_cqe_err(cqe64);
+#endif
+    // The comparison yields 0 or 1, so this returns 0 on success and -EIO on a request error.
+    return (opcode == DOCA_GPUNETIO_IB_MLX5_CQE_REQ_ERR) * -EIO;
+}
+
+/**
+ * @brief Poll the Completion Queue (CQ) at a specific index without waiting for the
+ * completion to arrive.
+ *
+ * Delegates to doca_priv_gpu_dev_verbs_poll_one_cq_at(). When that reports 0, a system-scope
+ * acquire fence is issued and cq->cqe_ci is raised to at least cons_index + 1 with an
+ * atomic max.
+ *
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the private poll and the atomic max
+ * @tparam qp_type - QP type forwarded to the private poll
+ * @param cq - Completion Queue (CQ)
+ * @param cons_index - Index of the Completion Queue (CQ) to be polled
+ * @return On success, doca_gpu_dev_verbs_poll_one_cq_at() returns 0. If the completion is
+ * not available, returns EBUSY. If it is a completion with error, returns a
+ * negative value.
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ int doca_gpu_dev_verbs_poll_one_cq_at(
+    struct doca_gpu_dev_verbs_cq *cq, uint64_t cons_index) {
+    const int poll_result =
+        doca_priv_gpu_dev_verbs_poll_one_cq_at<resource_sharing_mode, qp_type>(cq, cons_index);
+
+    // Not available or completed with error: hand the code back untouched.
+    if (poll_result != 0) return poll_result;
+
+    // Completion observed: fence first, then advance the software consumer index.
+    doca_gpu_dev_verbs_fence_acquire<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_SYS>();
+    doca_gpu_dev_verbs_atomic_max<uint64_t, resource_sharing_mode>(&cq->cqe_ci, cons_index + 1);
+    return 0;
+}
+
+/**
+ * @brief [Internal] Poll the Completion Queue (CQ) at a specific index.
+ * This function does not update the SW consumer index nor guarantees the ordering.
+ *
+ * Spins, re-reading cq->cqe_ci and the CQE on every iteration, until either cons_index falls
+ * below cqe_ci (returns 0 immediately) or cons_index is inside the window
+ * [cqe_ci, cqe_ci + cqe_num) and the CQE's owner bit matches !!(cons_index & cqe_num). On
+ * architectures below sm_90 the CQE's wqe_counter must also equal the low 16 bits of
+ * cons_index.
+ *
+ * NOTE(review): the qp_type template parameter is not referenced in the body.
+ *
+ * @param cq - Completion Queue (CQ)
+ * @param cons_index - Index of the Completion Queue (CQ) to be polled
+ * @return 0 on success or when cons_index is already below cq->cqe_ci, -EIO if the observed
+ * CQE opcode is DOCA_GPUNETIO_IB_MLX5_CQE_REQ_ERR.
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ int doca_priv_gpu_dev_verbs_poll_cq_at(
+    struct doca_gpu_dev_verbs_cq *cq, uint64_t cons_index) {
+    struct doca_gpunetio_ib_mlx5_cqe64 *cqe =
+        (struct doca_gpunetio_ib_mlx5_cqe64 *)__ldg((uintptr_t *)&cq->cqe_daddr);
+    const uint32_t cqe_num = __ldg(&cq->cqe_num);
+    uint32_t idx = cons_index & (cqe_num - 1);
+    // Struct-array indexing here, unlike the byte-offset form in
+    // doca_priv_gpu_dev_verbs_poll_one_cq_at (presumably sizeof(cqe64) equals
+    // DOCA_GPUNETIO_VERBS_CQE_SIZE - TODO confirm).
+    struct doca_gpunetio_ib_mlx5_cqe64 *cqe64 = &cqe[idx];
+    uint8_t opown;
+    uint8_t opcode;
+    uint64_t cqe_ci;
+#if __CUDA_ARCH__ >= 900
+    // sm_90 and newer: a single-byte load of op_own; only the owner bit is checked.
+    do {
+        cqe_ci = doca_gpu_dev_verbs_load_relaxed<resource_sharing_mode>(&cq->cqe_ci);
+        [[unlikely]] if (cons_index < cqe_ci)
+            return 0;
+        opown = doca_gpu_dev_verbs_load_relaxed_sys_global((uint8_t *)&cqe64->op_own);
+    } while ((cons_index >= cqe_ci + cqe_num) ||
+             ((cqe_ci <= cons_index) &&
+              ((opown & DOCA_GPUNETIO_IB_MLX5_CQE_OWNER_MASK) ^ !!(cons_index & cqe_num))));
+#else
+    uint32_t cqe_chunk;
+    uint16_t wqe_counter;
+
+    // Older architectures: one 32-bit load starting at wqe_counter. After the byte swap the
+    // upper 16 bits are taken as wqe_counter and the low 8 bits as op_own.
+    do {
+        cqe_ci = doca_gpu_dev_verbs_load_relaxed<resource_sharing_mode>(&cq->cqe_ci);
+        [[unlikely]] if (cons_index < cqe_ci)
+            return 0;
+        cqe_chunk = doca_gpu_dev_verbs_load_relaxed_sys_global((uint32_t *)&cqe64->wqe_counter);
+        cqe_chunk = doca_gpu_dev_verbs_bswap32(cqe_chunk);
+        wqe_counter = cqe_chunk >> 16;
+        opown = cqe_chunk & 0xff;
+    } while ((cons_index >= cqe_ci + cqe_num) ||
+             ((cqe_ci <= cons_index) &&
+              (((opown & DOCA_GPUNETIO_IB_MLX5_CQE_OWNER_MASK) ^ !!(cons_index & cqe_num)) ||
+               (wqe_counter != ((uint32_t)cons_index & 0xffff)))));
+#endif
+
+    opcode = opown >> DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT;
+
+#if DOCA_GPUNETIO_VERBS_ENABLE_DEBUG == 1
+    if (opcode == DOCA_GPUNETIO_IB_MLX5_CQE_REQ_ERR) doca_gpu_dev_verbs_cq_print_cqe_err(cqe64);
+#endif
+    // The comparison yields 0 or 1, so this returns 0 on success and -EIO on a request error.
+    return (opcode == DOCA_GPUNETIO_IB_MLX5_CQE_REQ_ERR) * -EIO;
+}
+
+/**
+ * @brief Poll the Completion Queue (CQ) at a specific index, waiting for the completion
+ * to arrive.
+ *
+ * Delegates to doca_priv_gpu_dev_verbs_poll_cq_at(). When that reports 0, a system-scope
+ * acquire fence is issued and cq->cqe_ci is raised to at least cons_index + 1 with an
+ * atomic max.
+ *
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the private poll and the atomic max
+ * @tparam qp_type - QP type forwarded to the private poll
+ * @param cq - Completion Queue (CQ)
+ * @param cons_index - Index of the Completion Queue (CQ) to be polled
+ * @return On success, doca_gpu_dev_verbs_poll_cq_at() returns 0. If it is a completion with
+ * error, returns a negative value.
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ int doca_gpu_dev_verbs_poll_cq_at(
+    struct doca_gpu_dev_verbs_cq *cq, uint64_t cons_index) {
+    const int wait_result =
+        doca_priv_gpu_dev_verbs_poll_cq_at<resource_sharing_mode, qp_type>(cq, cons_index);
+
+    // Completion with error: leave the consumer index alone.
+    if (wait_result != 0) return wait_result;
+
+    // Completion observed: fence first, then advance the software consumer index.
+    doca_gpu_dev_verbs_fence_acquire<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_SYS>();
+    doca_gpu_dev_verbs_atomic_max<uint64_t, resource_sharing_mode>(&cq->cqe_ci, cons_index + 1);
+    return 0;
+}
+
+/**
+ * @brief Poll the Completion Queue (CQ). This function waits for the completion to arrive.
+ *
+ * Adds count to cq->cqe_rsvd with an atomic add and waits, through
+ * doca_gpu_dev_verbs_poll_cq_at(), on the index (atomic-add result + count - 1).
+ * NOTE(review): count == 0 yields (result - 1); presumably callers always pass count >= 1.
+ *
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the atomic add and the poll
+ * @tparam qp_type - QP type forwarded to the poll
+ * @param cq - Completion Queue (CQ)
+ * @param count - Number of completions to poll
+ * @return On success, doca_gpu_dev_verbs_poll_cq() returns 0. If it is a completion with
+ * error, returns a negative value.
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ int doca_gpu_dev_verbs_poll_cq(struct doca_gpu_dev_verbs_cq *cq,
+                                                                 uint32_t count) {
+    // Reserve `count` indices on the CQ's reservation counter.
+    const uint64_t reserved_base =
+        doca_gpu_dev_verbs_atomic_add<uint64_t, resource_sharing_mode>(&cq->cqe_rsvd, count);
+    // Wait on the last index of the reserved range.
+    const uint64_t last_reserved = reserved_base + count - 1;
+    return doca_gpu_dev_verbs_poll_cq_at<resource_sharing_mode, qp_type>(cq, last_reserved);
+}
+
+/**
+ * @brief Increment CQ DBREC
+ *
+ * Advances the consumer index by cqe_num, wraps it with DOCA_GPUNETIO_VERBS_CQE_CI_MASK and
+ * writes the result back to cq->cqe_ci. Unless is_overrun is true, the byte-swapped new
+ * index is first stored to cq->dbrec with a GPU-scope release store.
+ *
+ * @tparam is_overrun - When true the store to cq->dbrec is skipped
+ * @param[in] cq - GPU Completion Queue
+ * @param[in] cqe_num - CQE num to increment
+ *
+ * @return new CQE consumer index
+ */
+template <bool is_overrun>
+__device__ static __forceinline__ uint32_t
+doca_gpu_dev_verbs_cq_update_dbrec(struct doca_gpu_dev_verbs_cq *cq, uint32_t cqe_num) {
+    // The consumer index is handled as a 32-bit value in this function.
+    uint32_t cqe_ci = DOCA_GPUNETIO_VOLATILE(cq->cqe_ci);
+
+    cqe_ci = (cqe_ci + cqe_num) & DOCA_GPUNETIO_VERBS_CQE_CI_MASK;
+    if (is_overrun == false) {
+        // 32-bit release store (GPU scope, global space) of the byte-swapped index.
+        asm volatile("st.release.gpu.global.L1::no_allocate.b32 [%0], %1;"
+                     :
+                     : "l"(cq->dbrec), "r"(doca_gpu_dev_verbs_bswap32(cqe_ci)));
+    }
+
+    // The software copy is updated after the doorbell record store.
+    DOCA_GPUNETIO_VOLATILE(cq->cqe_ci) = cqe_ci;
+
+    return cqe_ci;
+}
+
+#endif /* DOCA_GPUNETIO_DEV_VERBS_CQ_H */
+
+/** @} */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_onesided.cuh b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_onesided.cuh
new file mode 100644
index 00000000000..57c65bec1c8
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_onesided.cuh
@@ -0,0 +1,508 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_dev_verbs_onesided.cuh
+ * @brief GDAKI CUDA device functions for One-sided Shared QP ops
+ *
+ * @{
+ */
+
+#ifndef DOCA_GPUNETIO_DEV_VERBS_ONESIDED_CUH
+#define DOCA_GPUNETIO_DEV_VERBS_ONESIDED_CUH
+
+#include "doca_gpunetio_dev_verbs_qp.cuh"
+#include "doca_gpunetio_dev_verbs_cq.cuh"
+
+/* **************************************** PUT **************************************** */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_put_thread(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr,
+    struct doca_gpu_dev_verbs_addr laddr, size_t size, doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    struct doca_gpu_dev_verbs_wqe *wqe_ptr;
+    uint64_t base_wqe_idx;
+    uint64_t wqe_idx;
+    size_t remaining_size = size; /* bytes not yet assigned to a WQE */
+    size_t size_;                 /* byte count of the current chunk */
+    uint32_t num_chunks = doca_gpu_dev_verbs_div_ceil_aligned_pow2_32bits(
+        size, DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE_SHIFT);
+    num_chunks = num_chunks > 1 ? num_chunks : 1; /* size == 0 still takes one slot (a NOP) */
+    /* Sends size bytes from laddr to raddr as RDMA WRITE WQEs of up to MAX_TRANSFER_SIZE each. */
+    DOCA_GPUNETIO_VERBS_ASSERT(out_ticket != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp != NULL);
+    // DOCA_GPUNETIO_VERBS_ASSERT(qp->mem_type == DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU);
+    /* Reserve num_chunks slots in one call; chunk i is written at slot base_wqe_idx + i. */
+    base_wqe_idx = doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(qp, num_chunks);
+#pragma unroll 1
+    for (uint64_t i = 0; i < num_chunks; i++) {
+        wqe_idx = base_wqe_idx + i;
+        size_ = remaining_size > DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE
+                    ? DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE
+                    : remaining_size;
+        wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
+        /* Non-empty chunk: RDMA WRITE at byte offset i * MAX_TRANSFER_SIZE; empty chunk: NOP. */
+        [[likely]] if (size_ > 0) {
+            doca_gpu_dev_verbs_wqe_prepare_write(
+                qp, wqe_ptr, wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_WRITE,
+                DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, 0,
+                raddr.addr + (i * DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE), raddr.key,
+                laddr.addr + (i * DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE), laddr.key, size_);
+        } else {
+            doca_gpu_dev_verbs_wqe_prepare_nop(qp, wqe_ptr, wqe_idx,
+                                               DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE);
+        }
+        remaining_size -= size_;
+    }
+    /* wqe_idx now holds the last slot written: mark the range ready, then submit wqe_idx + 1. */
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(qp, base_wqe_idx, wqe_idx);
+    doca_gpu_dev_verbs_submit<resource_sharing_mode, DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+                              nic_handler>(qp, wqe_idx + 1);
+    /* The ticket handed back is the index of the last WQE posted by this call. */
+    *out_ticket = wqe_idx;
+}
+
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_put_warp(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr,
+    struct doca_gpu_dev_verbs_addr laddr, size_t size, doca_gpu_dev_verbs_ticket_t *out_ticket) {
+#if __CUDA_ARCH__ >= 800
+    struct doca_gpu_dev_verbs_wqe *wqe_ptr;
+    uint64_t base_wqe_idx = 0, wqe_idx;
+    uint32_t base_wqe_idx_0 = 0, base_wqe_idx_1 = 0;
+    uint32_t lane_idx = doca_gpu_dev_verbs_get_lane_id();
+    /* Warp-wide put: each lane prepares one RDMA WRITE WQE; lane 0 reserves and submits. */
+    DOCA_GPUNETIO_VERBS_ASSERT(size <= DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE); /* no chunking */
+    DOCA_GPUNETIO_VERBS_ASSERT(out_ticket != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp != NULL);
+    /* Lane 0 reserves DOCA_GPUNETIO_VERBS_WARP_SIZE slots and splits the base index in halves. */
+    if (lane_idx == 0) {
+        base_wqe_idx = doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(
+            qp, DOCA_GPUNETIO_VERBS_WARP_SIZE);
+        base_wqe_idx_0 = (uint32_t)base_wqe_idx;
+        base_wqe_idx_1 = (uint32_t)(base_wqe_idx >> 32);
+    }
+    __syncwarp();
+    /* Other lanes still hold 0 in both halves, so each max-reduction returns lane 0's value. */
+    base_wqe_idx_0 = __reduce_max_sync(DOCA_GPUNETIO_VERBS_WARP_FULL_MASK, base_wqe_idx_0);
+    base_wqe_idx_1 = __reduce_max_sync(DOCA_GPUNETIO_VERBS_WARP_FULL_MASK, base_wqe_idx_1);
+    base_wqe_idx = ((uint64_t)base_wqe_idx_1) << 32 | base_wqe_idx_0;
+    /* Lane i owns slot base_wqe_idx + i. */
+    wqe_idx = base_wqe_idx + lane_idx;
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
+
+    doca_gpu_dev_verbs_wqe_prepare_write(qp, wqe_ptr, wqe_idx,
+                                         DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_WRITE,
+                                         DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, 0, raddr.addr,
+                                         raddr.key, laddr.addr, laddr.key, size);
+    /* Lane 0 marks all WARP_SIZE slots ready and submits once for the whole warp. */
+    __syncwarp();
+    if (lane_idx == 0) {
+        doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(
+            qp, base_wqe_idx, base_wqe_idx + DOCA_GPUNETIO_VERBS_WARP_SIZE - 1);
+        doca_gpu_dev_verbs_submit<resource_sharing_mode, DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+                                  nic_handler>(qp, base_wqe_idx + DOCA_GPUNETIO_VERBS_WARP_SIZE);
+    }
+    __syncwarp();
+    /* Each lane gets the index of its own WQE as its ticket. */
+    *out_ticket = wqe_idx;
+#else /* no WQE is posted on this path: each calling thread prints and gets ticket 0 */
+    printf("__CUDA_ARCH__ < 800, WARP mode not enabled\n");
+    *out_ticket = 0;
+#endif
+}
+
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO,
+          enum doca_gpu_dev_verbs_exec_scope exec_scope = DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_put(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr,
+    struct doca_gpu_dev_verbs_addr laddr, size_t size, doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    if (exec_scope == DOCA_GPUNETIO_VERBS_EXEC_SCOPE_WARP) /* resolved at compile time */
+        doca_gpu_dev_verbs_put_warp<resource_sharing_mode, nic_handler>(
+            qp, raddr, laddr, size, out_ticket);
+    else if (exec_scope == DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD)
+        doca_gpu_dev_verbs_put_thread<resource_sharing_mode, nic_handler>(
+            qp, raddr, laddr, size, out_ticket);
+}
+
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO,
+          enum doca_gpu_dev_verbs_exec_scope exec_scope = DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_put(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr,
+    struct doca_gpu_dev_verbs_addr laddr, size_t size) {
+    /* Ticket-less form: forwards to the ticketed overload and drops the ticket it writes. */
+    uint64_t unused_ticket;
+    doca_gpu_dev_verbs_put<resource_sharing_mode, nic_handler, exec_scope>(
+        qp, raddr, laddr, size, &unused_ticket);
+}
+
+/* **************************************** PUT INLINE **************************************** */
+
+template <typename T,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_p_thread(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr, T value,
+    doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    /* Posts a single inline RDMA WRITE WQE carrying `value` (sizeof(T) bytes) to raddr. */
+    DOCA_GPUNETIO_VERBS_ASSERT(out_ticket != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp != NULL);
+    // DOCA_GPUNETIO_VERBS_ASSERT(qp->mem_type == DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU);
+
+    /* One slot is reserved for the inline write. */
+    uint64_t slot = doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(qp, 1);
+    struct doca_gpu_dev_verbs_wqe *wqe = doca_gpu_dev_verbs_get_wqe_ptr(qp, slot);
+
+    /* Header first, then the inline payload. */
+    doca_gpu_dev_verbs_prepare_inl_rdma_write_wqe_header(
+        qp, wqe, slot, DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, raddr.addr, raddr.key, sizeof(T));
+    doca_gpu_dev_verbs_prepare_inl_rdma_write_wqe_data<T>(qp, wqe, value);
+
+    /* Publish the slot and submit with the index one past it. */
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(qp, slot, slot);
+    doca_gpu_dev_verbs_submit<resource_sharing_mode, DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+                              nic_handler>(qp, slot + 1);
+    *out_ticket = slot; /* ticket = index of the WQE posted */
+}
+
+template <typename T,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_p_warp(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr, T value,
+    doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    *out_ticket = 0; /* warp scope is a stub here: no WQE is posted, only the ticket is zeroed */
+}
+
+template <typename T,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO,
+          enum doca_gpu_dev_verbs_exec_scope exec_scope = DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_p(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr, T value,
+    doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    if (exec_scope == DOCA_GPUNETIO_VERBS_EXEC_SCOPE_WARP) /* resolved at compile time */
+        doca_gpu_dev_verbs_p_warp<T, resource_sharing_mode, nic_handler>(
+            qp, raddr, value, out_ticket);
+    else if (exec_scope == DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD)
+        doca_gpu_dev_verbs_p_thread<T, resource_sharing_mode, nic_handler>(
+            qp, raddr, value, out_ticket);
+}
+
+template <typename T,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO,
+          enum doca_gpu_dev_verbs_exec_scope exec_scope = DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_p(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr, T value) {
+    /* Ticket-less form: forwards to the ticketed overload and drops the ticket it writes. */
+    uint64_t unused_ticket;
+    doca_gpu_dev_verbs_p<T, resource_sharing_mode, nic_handler, exec_scope>(
+        qp, raddr, value, &unused_ticket);
+}
+
+/* **************************************** PUT SIGNAL **************************************** */
+
+template <enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_put_signal_thread(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr,
+    struct doca_gpu_dev_verbs_addr laddr, size_t size, struct doca_gpu_dev_verbs_addr sig_raddr,
+    struct doca_gpu_dev_verbs_addr sig_laddr, uint64_t sig_val,
+    doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    struct doca_gpu_dev_verbs_wqe *wqe_ptr;
+    uint64_t base_wqe_idx;
+    uint64_t wqe_idx;
+    size_t remaining_size = size; /* bytes not yet assigned to a WQE */
+    size_t size_;                 /* byte count of the current chunk */
+    /* Posts the data as chunked RDMA WRITEs followed by one atomic WQE on the signal address. */
+    uint32_t num_chunks = doca_gpu_dev_verbs_div_ceil_aligned_pow2_32bits(
+        size, DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE_SHIFT);
+    num_chunks = num_chunks > 1 ? num_chunks : 1; /* size == 0 still takes one data slot (a NOP) */
+
+    DOCA_GPUNETIO_VERBS_ASSERT(out_ticket != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp->mem_type == DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU);
+    /* One slot per data chunk plus one trailing slot for the signal WQE. */
+    base_wqe_idx = doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(qp, num_chunks + 1);
+
+#pragma unroll 1
+    for (uint64_t i = 0; i < num_chunks; i++) {
+        wqe_idx = base_wqe_idx + i;
+        size_ = remaining_size > DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE
+                    ? DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE
+                    : remaining_size;
+
+        wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
+        /* Non-empty chunk: RDMA WRITE at byte offset i * MAX_TRANSFER_SIZE; empty chunk: NOP. */
+        [[likely]] if (size_ > 0) {
+            doca_gpu_dev_verbs_wqe_prepare_write(
+                qp, wqe_ptr, wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_WRITE,
+                DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, 0,
+                raddr.addr + (i * DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE), raddr.key,
+                laddr.addr + (i * DOCA_GPUNETIO_VERBS_MAX_TRANSFER_SIZE), laddr.key, size_);
+        } else {
+            doca_gpu_dev_verbs_wqe_prepare_nop(qp, wqe_ptr, wqe_idx,
+                                               DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE);
+        }
+        remaining_size -= size_;
+    }
+    /* Step past the last data slot to the slot reserved for the signal. */
+    ++wqe_idx;
+
+    wqe_ptr = doca_gpu_dev_verbs_get_wqe_ptr(qp, wqe_idx);
+    /* NOTE(review): always uses the ATOMIC_FA opcode; the sig_op template argument is unused. */
+    doca_gpu_dev_verbs_wqe_prepare_atomic(
+        qp, wqe_ptr, wqe_idx, DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA,
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, sig_raddr.addr, sig_raddr.key, sig_laddr.addr,
+        sig_laddr.key, sizeof(uint64_t), sig_val, 0);
+    /* Data and signal slots are marked ready as one range, then submitted together. */
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(qp, base_wqe_idx, wqe_idx);
+
+    doca_gpu_dev_verbs_submit<resource_sharing_mode, DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+                              nic_handler>(qp, wqe_idx + 1);
+    /* The ticket handed back is the index of the signal WQE (the last one posted). */
+    *out_ticket = wqe_idx;
+}
+
+template <enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_put_signal_warp(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr,
+    struct doca_gpu_dev_verbs_addr laddr, size_t size, struct doca_gpu_dev_verbs_addr sig_raddr,
+    struct doca_gpu_dev_verbs_addr sig_laddr, uint64_t sig_val,
+    doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    *out_ticket = 0; /* warp scope is a stub here: no WQE is posted, only the ticket is zeroed */
+}
+
+template <enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO,
+          enum doca_gpu_dev_verbs_exec_scope exec_scope = DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_put_signal(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr,
+    struct doca_gpu_dev_verbs_addr laddr, size_t size, struct doca_gpu_dev_verbs_addr sig_raddr,
+    struct doca_gpu_dev_verbs_addr sig_laddr, uint64_t sig_val,
+    doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    if (exec_scope == DOCA_GPUNETIO_VERBS_EXEC_SCOPE_WARP) /* resolved at compile time */
+        doca_gpu_dev_verbs_put_signal_warp<sig_op, resource_sharing_mode, nic_handler>(
+            qp, raddr, laddr, size, sig_raddr, sig_laddr, sig_val, out_ticket);
+    else if (exec_scope == DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD)
+        doca_gpu_dev_verbs_put_signal_thread<sig_op, resource_sharing_mode, nic_handler>(
+            qp, raddr, laddr, size, sig_raddr, sig_laddr, sig_val, out_ticket);
+}
+
+template <enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO,
+          enum doca_gpu_dev_verbs_exec_scope exec_scope = DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_put_signal(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr,
+    struct doca_gpu_dev_verbs_addr laddr, size_t size, struct doca_gpu_dev_verbs_addr sig_raddr,
+    struct doca_gpu_dev_verbs_addr sig_laddr, uint64_t sig_val) {
+    uint64_t unused_ticket; /* ticket-less form: written by the ticketed overload, never read */
+    doca_gpu_dev_verbs_put_signal<sig_op, resource_sharing_mode, nic_handler, exec_scope>(
+        qp, raddr, laddr, size, sig_raddr, sig_laddr, sig_val, &unused_ticket);
+}
+
+template <typename T, enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_p_signal(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr, T value,
+    struct doca_gpu_dev_verbs_addr sig_raddr, struct doca_gpu_dev_verbs_addr sig_laddr,
+    uint64_t sig_val, doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    /* Posts an inline RDMA WRITE of `value` followed by an atomic WQE on the signal address. */
+    DOCA_GPUNETIO_VERBS_ASSERT(out_ticket != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp != NULL);
+    // DOCA_GPUNETIO_VERBS_ASSERT(qp->mem_type == DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU);
+
+    /* Two consecutive slots: data_slot for the inline write, sig_slot for the signal. */
+    uint64_t data_slot = doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(qp, 2);
+    uint64_t sig_slot = data_slot + 1;
+
+    struct doca_gpu_dev_verbs_wqe *data_wqe = doca_gpu_dev_verbs_get_wqe_ptr(qp, data_slot);
+    doca_gpu_dev_verbs_prepare_inl_rdma_write_wqe_header(
+        qp, data_wqe, data_slot, DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, raddr.addr, raddr.key,
+        sizeof(T));
+    doca_gpu_dev_verbs_prepare_inl_rdma_write_wqe_data<T>(qp, data_wqe, value);
+
+    /* NOTE(review): always ATOMIC_FA; the sig_op template argument is not consulted. */
+    struct doca_gpu_dev_verbs_wqe *sig_wqe = doca_gpu_dev_verbs_get_wqe_ptr(qp, sig_slot);
+    doca_gpu_dev_verbs_wqe_prepare_atomic(
+        qp, sig_wqe, sig_slot, DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA,
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, sig_raddr.addr, sig_raddr.key, sig_laddr.addr,
+        sig_laddr.key, sizeof(uint64_t), sig_val, 0);
+
+    /* Publish both slots together and submit with the index one past the signal slot. */
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(qp, data_slot, sig_slot);
+    doca_gpu_dev_verbs_submit<resource_sharing_mode, DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+                              nic_handler>(qp, sig_slot + 1);
+
+    /* The ticket is the signal WQE's index, i.e. the last WQE of this call. */
+    *out_ticket = sig_slot;
+}
+
+template <typename T, enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_p_signal(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr raddr, T value,
+    struct doca_gpu_dev_verbs_addr sig_raddr, struct doca_gpu_dev_verbs_addr sig_laddr,
+    uint64_t sig_val) {
+    uint64_t unused_ticket; /* ticket-less form: written by the ticketed overload, never read */
+    doca_gpu_dev_verbs_p_signal<T, sig_op, resource_sharing_mode, nic_handler>(
+        qp, raddr, value, sig_raddr, sig_laddr, sig_val, &unused_ticket);
+}
+
+/* **************************************** SIGNAL **************************************** */
+
+template <enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_signal_thread(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr sig_raddr,
+    struct doca_gpu_dev_verbs_addr sig_laddr, uint64_t sig_val,
+    doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    /* Posts a single atomic WQE on the signal address. */
+    DOCA_GPUNETIO_VERBS_ASSERT(out_ticket != NULL);
+    DOCA_GPUNETIO_VERBS_ASSERT(qp != NULL);
+    // DOCA_GPUNETIO_VERBS_ASSERT(qp->mem_type == DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU);
+
+    uint64_t slot = doca_gpu_dev_verbs_reserve_wq_slots<resource_sharing_mode>(qp, 1);
+    struct doca_gpu_dev_verbs_wqe *wqe = doca_gpu_dev_verbs_get_wqe_ptr(qp, slot);
+
+    /* NOTE(review): always ATOMIC_FA; the sig_op template argument is not consulted. */
+    doca_gpu_dev_verbs_wqe_prepare_atomic(
+        qp, wqe, slot, DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA,
+        DOCA_GPUNETIO_IB_MLX5_WQE_CTRL_CQ_UPDATE, sig_raddr.addr, sig_raddr.key, sig_laddr.addr,
+        sig_laddr.key, sizeof(uint64_t), sig_val, 0);
+
+    /* Publish the slot and submit with the index one past it. */
+    doca_gpu_dev_verbs_mark_wqes_ready<resource_sharing_mode>(qp, slot, slot);
+    doca_gpu_dev_verbs_submit<resource_sharing_mode, DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+                              nic_handler>(qp, slot + 1);
+
+    *out_ticket = slot; /* ticket = index of the WQE posted */
+}
+
+template <enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_signal_warp(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr sig_raddr,
+    struct doca_gpu_dev_verbs_addr sig_laddr, uint64_t sig_val,
+    doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    *out_ticket = 0; /* warp scope is a stub here: no WQE is posted, only the ticket is zeroed */
+}
+
+template <enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO,
+          enum doca_gpu_dev_verbs_exec_scope exec_scope = DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_signal(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr sig_raddr,
+    struct doca_gpu_dev_verbs_addr sig_laddr, uint64_t sig_val,
+    doca_gpu_dev_verbs_ticket_t *out_ticket) {
+    if (exec_scope == DOCA_GPUNETIO_VERBS_EXEC_SCOPE_WARP) /* resolved at compile time */
+        doca_gpu_dev_verbs_signal_warp<sig_op, resource_sharing_mode, nic_handler>(
+            qp, sig_raddr, sig_laddr, sig_val, out_ticket);
+    else if (exec_scope == DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD)
+        doca_gpu_dev_verbs_signal_thread<sig_op, resource_sharing_mode, nic_handler>(
+            qp, sig_raddr, sig_laddr, sig_val, out_ticket);
+}
+
+template <enum doca_gpu_dev_verbs_signal_op sig_op,
+          enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO,
+          enum doca_gpu_dev_verbs_exec_scope exec_scope = DOCA_GPUNETIO_VERBS_EXEC_SCOPE_THREAD>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_signal(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_addr sig_raddr,
+    struct doca_gpu_dev_verbs_addr sig_laddr, uint64_t sig_val) {
+    doca_gpu_dev_verbs_ticket_t ticket; /* ticket-less form: written by the callee, never read */
+    doca_gpu_dev_verbs_signal<sig_op, resource_sharing_mode, nic_handler, exec_scope>(
+        qp, sig_raddr, sig_laddr, sig_val, &ticket); /* exec_scope is forwarded, default THREAD */
+}
+/* **************************************** OTHERS **************************************** */
+
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wait(struct doca_gpu_dev_verbs_qp *qp,
+                                                               doca_gpu_dev_verbs_ticket_t ticket) {
+    auto *sq_cq = doca_gpu_dev_verbs_qp_get_cq_sq(qp); /* CQ obtained from the QP's SQ accessor */
+    doca_gpu_dev_verbs_poll_cq_at<resource_sharing_mode>(sq_cq, ticket); /* result is dropped */
+}
+
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wait(struct doca_gpu_dev_verbs_qp *qp) {
+    /* Polls the SQ CQ at (qp->sq_rsvd_index - 1); returns immediately when that index is 0. */
+    uint64_t last =
+        doca_gpu_dev_verbs_atomic_read<uint64_t, resource_sharing_mode>(&qp->sq_rsvd_index);
+    [[unlikely]] if (last == 0) return; /* zero read: return without polling */
+    last -= 1;
+    auto *sq_cq = doca_gpu_dev_verbs_qp_get_cq_sq(qp);
+    doca_gpu_dev_verbs_poll_cq_at<resource_sharing_mode>(sq_cq, last);
+}
+
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_fence(struct doca_gpu_dev_verbs_qp *qp) {
+    /* Intentionally empty: the current implementation performs no work for a fence. */
+    (void)qp;
+}
+
+#endif /* DOCA_GPUNETIO_DEV_VERBS_ONESIDED_CUH */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_qp.cuh b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_qp.cuh
new file mode 100644
index 00000000000..64019b00cbf
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/device/doca_gpunetio_dev_verbs_qp.cuh
@@ -0,0 +1,824 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_dev_verbs_qp.cuh
+ * @brief GDAKI CUDA device functions for QP management
+ *
+ * @{
+ */
+#ifndef DOCA_GPUNETIO_DEV_VERBS_QP_H
+#define DOCA_GPUNETIO_DEV_VERBS_QP_H
+
+#include <cuda/atomic>
+#include "doca_gpunetio_dev_verbs_cq.cuh"
+
+/* *********** WQE UTILS *********** */
+/**
+ * @brief Store one two-word (2 x 64-bit) WQE segment with a single vector store.
+ *
+ * Emits the PTX instruction `st.weak.cs.v2.b64`, writing val[0] and val[1] to the two
+ * consecutive 64-bit words starting at ptr.
+ * NOTE(review): a v2.b64 vector store presumably requires ptr to be 16-byte aligned — confirm
+ * against the PTX ISA and the WQE buffer layout.
+ *
+ * @param ptr - Destination of the segment
+ * @param val - Source holding the two 64-bit words to store
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_store_wqe_seg(uint64_t *ptr,
+                                                                        uint64_t *val) {
+    asm volatile("st.weak.cs.v2.b64 [%0], {%1, %2};" : : "l"(ptr), "l"(val[0]), "l"(val[1]));
+}
+
+/**
+ * @brief Translate a WQE index into the address of its slot in the send-queue WQE buffer.
+ *
+ * The index is wrapped with the queue mask (sq_wqe_mask) and scaled by the per-WQE shift
+ * (DOCA_GPUNETIO_IB_MLX5_WQE_SQ_SHIFT) before being added to the buffer base (sq_wqe_daddr).
+ *
+ * @param qp - Queue Pair (QP)
+ * @param wqe_idx - Index of the WQE to locate
+ * @return Pointer to the WQE slot for the given index
+ */
+__device__ static __forceinline__ struct doca_gpu_dev_verbs_wqe *doca_gpu_dev_verbs_get_wqe_ptr(
+    struct doca_gpu_dev_verbs_qp *qp, uint16_t wqe_idx) {
+    const uintptr_t ring_base = __ldg((uintptr_t *)&qp->sq_wqe_daddr);
+    const uint16_t slot = wqe_idx & __ldg(&qp->sq_wqe_mask);
+    const uintptr_t slot_addr = ring_base + (slot << DOCA_GPUNETIO_IB_MLX5_WQE_SQ_SHIFT);
+
+    return (struct doca_gpu_dev_verbs_wqe *)slot_addr;
+}
+
+/* *********** WQE SHARING *********** */
+
+/**
+ * @brief Wait until the given WQE slot is available.
+ * All prior WQE slots are also guaranteed to be available.
+ *
+ * Indices below the queue depth (sq_wqe_num) need no wait. For any other index the send CQ is
+ * polled at (wqe_idx - sq_wqe_num), i.e. the WQE that previously occupied the same ring slot.
+ *
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the CQ poll
+ * @tparam qp_type - QP type forwarded to the CQ poll
+ * @param qp - Queue Pair (QP)
+ * @param wqe_idx - WQE slot index
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wait_until_slot_available(
+    struct doca_gpu_dev_verbs_qp *qp, uint64_t wqe_idx) {
+    const uint16_t nwqes = __ldg(&qp->sq_wqe_num);
+    // The likelihood attribute must label the branch body (not the whole `if` statement)
+    // to mark the polling path as the likely one.
+    if (wqe_idx >= nwqes) [[likely]]
+        doca_gpu_dev_verbs_poll_cq_at<resource_sharing_mode, qp_type>(&(qp->cq_sq),
+                                                                      wqe_idx - nwqes);
+}
+
+/**
+ * @brief Reserve a number of WQE slots.
+ *
+ * Atomically advances the reservation counter (sq_rsvd_index) by count. When
+ * wait_for_availability is set, the call additionally waits until the last reserved slot
+ * (first index + count - 1) is available.
+ * NOTE(review): count is presumably expected to be at least 1; with count == 0 the awaited
+ * index is (first index - 1) — confirm against callers.
+ *
+ * @tparam resource_sharing_mode - Sharing mode used for the atomic add and the availability wait
+ * @tparam qp_type - QP type forwarded to the availability wait
+ * @tparam wait_for_availability - Whether to wait for the reserved slots to become available
+ * @param qp - Queue Pair (QP)
+ * @param count - Number of WQE slots to reserve
+ * @return The index of the first reserved WQE slot
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ,
+          bool wait_for_availability = true>
+__device__ static __forceinline__ uint64_t
+doca_gpu_dev_verbs_reserve_wq_slots(struct doca_gpu_dev_verbs_qp *qp, uint32_t count) {
+    uint64_t wqe_idx =
+        doca_gpu_dev_verbs_atomic_add<uint64_t, resource_sharing_mode>(&qp->sq_rsvd_index, count);
+    // Forward qp_type as well, so the wait runs with the QP type the caller asked for
+    // instead of silently falling back to the callee's default.
+    if (wait_for_availability)
+        doca_gpu_dev_verbs_wait_until_slot_available<resource_sharing_mode, qp_type>(
+            qp, wqe_idx + count - 1);
+    return wqe_idx;
+}
+
+/**
+ * @brief Mark the WQEs in the range [from_wqe_idx, to_wqe_idx] as ready.
+ *
+ * Publishes (to_wqe_idx + 1) into sq_ready_index. In the CTA and GPU sharing modes the caller
+ * first spins until sq_ready_index equals from_wqe_idx, so ranges are published in index order.
+ * Any other sharing mode value leaves sq_ready_index untouched.
+ *
+ * @tparam resource_sharing_mode - Selects the plain store (EXCLUSIVE) or the ordered hand-off
+ *                                 at block scope (CTA) or device scope (GPU)
+ * @tparam qp_type - Not referenced by this function
+ * @param qp - Queue Pair (QP)
+ * @param from_wqe_idx - Starting WQE index
+ * @param to_wqe_idx - Ending WQE index
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_mark_wqes_ready(
+    struct doca_gpu_dev_verbs_qp *qp, uint64_t from_wqe_idx, uint64_t to_wqe_idx) {
+    // EXCLUSIVE: no hand-off with other writers is performed, a plain store is used.
+    if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_EXCLUSIVE)
+        qp->sq_ready_index = to_wqe_idx + 1;
+    else if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_CTA) {
+        // Release fence first, then wait for our turn (ready index reaches from_wqe_idx).
+        doca_gpu_dev_verbs_fence_release<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_CTA>();
+        cuda::atomic_ref<uint64_t, cuda::thread_scope_block> ready_index_aref(qp->sq_ready_index);
+        while (ready_index_aref.load(cuda::memory_order_relaxed) != from_wqe_idx) continue;
+        // Acquire fence after the spin, then advance the ready index past our range.
+        doca_gpu_dev_verbs_fence_acquire<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_CTA>();
+        ready_index_aref.store(to_wqe_idx + 1, cuda::memory_order_relaxed);
+    } else if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU) {
+        // Same protocol as the CTA case, with device-scope fences and atomics.
+        doca_gpu_dev_verbs_fence_release<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>();
+        cuda::atomic_ref<uint64_t, cuda::thread_scope_device> ready_index_aref(qp->sq_ready_index);
+        while (ready_index_aref.load(cuda::memory_order_relaxed) != from_wqe_idx) continue;
+        doca_gpu_dev_verbs_fence_acquire<DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>();
+        ready_index_aref.store(to_wqe_idx + 1, cuda::memory_order_relaxed);
+    }
+}
+
+/* *********** QP DBR/DB *********** */
+
+/**
+ * @brief Prepare the DBR (Doorbell Record)
+ *
+ * Keeps the low 16 bits of the producer index and converts the result to big-endian,
+ * using inline PTX instead of a generic byte-swap helper.
+ *
+ * @param prod_index - Producer index
+ * @return DBR value
+ */
+__device__ static __forceinline__ __be32 doca_gpu_dev_verbs_prepare_dbr(uint32_t prod_index) {
+    __be32 dbrec_val;
+
+    // This is equivalent to
+    // HTOBE32(dbrec_head & 0xffff);
+    //
+    // mask1 (0xffff) keeps the low 16 bits; mask2 (0x123) is the prmt byte selector.
+    // NOTE(review): per the PTX `prmt` selector encoding, 0x0123 picks bytes 3,2,1,0 of the
+    // first source only, so the never-written `ign` register should not affect the result —
+    // confirm against the PTX ISA.
+    asm volatile(
+        "{\n\t"
+        ".reg .b32 mask1;\n\t"
+        ".reg .b32 dbrec_head_16b;\n\t"
+        ".reg .b32 ign;\n\t"
+        ".reg .b32 mask2;\n\t"
+        "mov.b32 mask1, 0xffff;\n\t"
+        "mov.b32 mask2, 0x123;\n\t"
+        "and.b32 dbrec_head_16b, %1, mask1;\n\t"
+        "prmt.b32 %0, dbrec_head_16b, ign, mask2;\n\t"
+        "}"
+        : "=r"(dbrec_val)
+        : "r"(prod_index));
+
+    return dbrec_val;
+}
+
+/**
+ * @brief [Internal] Write the producer index into the NIC DBR (Doorbell Record).
+ * No fence is issued here: ordering against earlier writes is the caller's responsibility.
+ *
+ * @tparam qp_type - Not referenced by this function
+ * @param qp - Queue Pair (QP)
+ * @param prod_index - Producer index
+ */
+template <enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ void doca_priv_gpu_dev_verbs_update_dbr(
+    struct doca_gpu_dev_verbs_qp *qp, uint32_t prod_index) {
+    // Big-endian record value derived from the producer index.
+    const __be32 be_record = doca_gpu_dev_verbs_prepare_dbr(prod_index);
+    // Location of the doorbell record, read from the QP descriptor.
+    __be32 *const record_addr = (__be32 *)__ldg((uintptr_t *)&qp->sq_dbrec);
+
+    // Relaxed, system-scope store of the record.
+    cuda::atomic_ref<__be32, cuda::thread_scope_system> record_ref(*record_addr);
+    record_ref.store(be_record, cuda::memory_order_relaxed);
+}
+
+/**
+ * @brief Update the NIC DBR (Doorbell Record)
+ *
+ * When DOCA_GPUNETIO_VERBS_HAS_ASYNC_STORE_RELEASE is defined and code_opt requests it, the
+ * record is written with an async store-release. Otherwise a release fence at sync_scope is
+ * issued and the record is written through doca_priv_gpu_dev_verbs_update_dbr().
+ *
+ * @tparam sync_scope - Scope of the release ordering
+ * @tparam code_opt - Code-generation options (selects the async store-release path)
+ * @tparam qp_type - QP type forwarded to the internal DBR update
+ * @param qp - Queue Pair (QP)
+ * @param prod_index - Producer index
+ */
+template <enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+          enum doca_gpu_dev_verbs_gpu_code_opt code_opt = DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_DEFAULT,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_update_dbr(
+    struct doca_gpu_dev_verbs_qp *qp, uint32_t prod_index) {
+#ifdef DOCA_GPUNETIO_VERBS_HAS_ASYNC_STORE_RELEASE
+    if (code_opt & DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_ASYNC_STORE_RELEASE) {
+        // The record value and address are only needed on this path; the default path
+        // below recomputes both inside the internal helper.
+        __be32 dbrec_val = doca_gpu_dev_verbs_prepare_dbr(prod_index);
+        __be32 *dbrec_ptr = (__be32 *)__ldg((uintptr_t *)&qp->sq_dbrec);
+
+        doca_gpu_dev_verbs_async_store_release<sync_scope>(dbrec_ptr, dbrec_val);
+        return;
+    }
+#endif
+    doca_gpu_dev_verbs_fence_release<sync_scope>();
+    doca_priv_gpu_dev_verbs_update_dbr<qp_type>(qp, prod_index);
+}
+
+/**
+ * @brief Build the 64-bit value written to the DB (Doorbell).
+ *
+ * The value is the leading 8 bytes of an otherwise zeroed control segment that carries the
+ * QP number (sq_num_shift8_be) and the byte-swapped, shifted producer index.
+ *
+ * @param qp - Queue Pair (QP)
+ * @param prod_index - Producer index
+ * @return DB value
+ */
+__device__ static __forceinline__ __be64
+doca_gpu_dev_verbs_prepare_db(struct doca_gpu_dev_verbs_qp *qp, uint64_t prod_index) {
+    struct doca_gpu_dev_verbs_wqe_ctrl_seg db_seg = {0};
+
+    // Ringing the DB only inspects two control-segment fields:
+    // the WQE index ...
+    db_seg.opmod_idx_opcode =
+        doca_gpu_dev_verbs_bswap32((prod_index << DOCA_GPUNETIO_VERBS_WQE_IDX_SHIFT));
+    // ... and the QP number.
+    db_seg.qpn_ds = __ldg(&qp->sq_num_shift8_be);
+
+    uint64_t *const first_qword = (uint64_t *)&db_seg;
+    return *first_qword;
+}
+
+/* *************************** Ring Doorbell *************************** */
+
+/**
+ * @brief Ring the DB (Doorbell)
+ *
+ * Writes the doorbell value for prod_index to the address stored in qp->sq_db. One of three
+ * store flavours is compiled in:
+ *  - async store-release, when DOCA_GPUNETIO_VERBS_HAS_ASYNC_STORE_RELEASE is defined and
+ *    code_opt requests it;
+ *  - release fence + relaxed MMIO store, when DOCA_GPUNETIO_VERBS_HAS_STORE_RELAXED_MMIO is
+ *    defined;
+ *  - release fence + relaxed system-scope atomic store otherwise.
+ *
+ * @tparam sync_scope - Scope of the release ordering
+ * @tparam code_opt - Code-generation options (selects the async store-release path)
+ * @param qp - Queue Pair (QP)
+ * @param prod_index - Producer index
+ */
+template <enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+          enum doca_gpu_dev_verbs_gpu_code_opt code_opt = DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_DEFAULT>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_ring_db(struct doca_gpu_dev_verbs_qp *qp,
+                                                                  uint64_t prod_index) {
+    __be64 *db_ptr = (__be64 *)__ldg((uintptr_t *)&qp->sq_db);
+    __be64 db_val = doca_gpu_dev_verbs_prepare_db(qp, prod_index);
+
+    // The dangling `else` below binds to whichever braced block the second
+    // preprocessor conditional selects.
+#ifdef DOCA_GPUNETIO_VERBS_HAS_ASYNC_STORE_RELEASE
+    if (code_opt & DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_ASYNC_STORE_RELEASE) {
+        doca_gpu_dev_verbs_async_store_release<sync_scope>((uint64_t *)db_ptr, (uint64_t)db_val);
+    } else
+#endif
+#ifdef DOCA_GPUNETIO_VERBS_HAS_STORE_RELAXED_MMIO
+    {
+        doca_gpu_dev_verbs_fence_release<sync_scope>();
+        doca_gpu_dev_verbs_store_relaxed_mmio((uint64_t *)db_ptr, (uint64_t)db_val);
+    }
+#else
+    {
+        cuda::atomic_ref<uint64_t, cuda::thread_scope_system> db_ptr_aref(*((uint64_t *)db_ptr));
+        doca_gpu_dev_verbs_fence_release<sync_scope>();
+        db_ptr_aref.store(db_val, cuda::memory_order_relaxed);
+    }
+#endif
+}
+
+#ifdef DOCA_GPUNETIO_VERBS_HAS_TMA_COPY
+/**
+ * @brief Ring the BF (BlueFlame). Requires shared memory.
+ *
+ * Issues a 64-byte bulk asynchronous copy from the WQE held in shared memory to the address
+ * stored in qp->sq_db, after a release fence at sync_scope.
+ * NOTE(review): the bulk copy is only issued here; nothing in this function commits or waits
+ * on the bulk group — confirm the intended completion handling.
+ *
+ * @tparam sync_scope - Scope of the release fence issued before the copy
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - WQE to be ringed. This buffer must be in shared memory.
+ */
+template <enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_ring_bf(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr) {
+    void *bf_ptr = (void *)__ldg((uintptr_t *)&qp->sq_db);
+    // cp.async.bulk takes the *address* of its shared-memory source, so hand it the
+    // shared-window address of the WQE rather than the WQE's first 8 bytes.
+    const uint32_t smem_wqe_addr = (uint32_t)__cvta_generic_to_shared(wqe_ptr);
+
+    doca_gpu_dev_verbs_fence_release<sync_scope>();
+    // "memory" clobber: the instruction reads the WQE bytes from shared memory.
+    asm volatile("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], 64;"
+                 :
+                 : "l"(bf_ptr), "r"(smem_wqe_addr)
+                 : "memory");
+}
+#endif
+
+/**
+ * @brief Ring the BF (BlueFlame) cooperatively. Requires at least 8 threads in the warp.
+ *
+ * Lane 0 issues a release fence at sync_scope, the warp synchronizes, and then lanes 0..7
+ * each copy one 64-bit word of the WQE to the address stored in qp->sq_db.
+ *
+ * @tparam sync_scope - Scope of the release fence issued by lane 0
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - WQE to be ringed
+ */
+template <enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_ring_bf_warp(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr) {
+    const unsigned int lane = doca_gpu_dev_verbs_get_lane_id();
+    uint64_t *const bf_words = (uint64_t *)qp->sq_db;
+    uint64_t *const wqe_words = (uint64_t *)wqe_ptr;
+
+    // One lane fences on behalf of the warp; everyone waits for it.
+    if (lane == 0) {
+        doca_gpu_dev_verbs_fence_release<sync_scope>();
+    }
+    __syncwarp();
+
+    // Only the first eight lanes carry a word of the WQE.
+    if (lane >= 8) return;
+    bf_words[lane] = wqe_words[lane];
+}
+
+/**
+ * @brief Ring the proxy.
+ *
+ * Publishes the producer index to the location stored in qp->sq_db with relaxed,
+ * system-scope atomics. No fence is issued here; see doca_gpu_dev_verbs_submit_proxy().
+ *
+ * @tparam resource_sharing_mode - EXCLUSIVE overwrites the published index; every other mode
+ *                                 uses fetch_max so the published index never moves backwards
+ * @param qp - Queue Pair (QP)
+ * @param prod_idx - Producer index
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_ring_proxy(
+    struct doca_gpu_dev_verbs_qp *qp, uint64_t prod_idx) {
+    uint64_t *proxy_ptr = (uint64_t *)__ldg((uintptr_t *)&qp->sq_db);
+    cuda::atomic_ref<uint64_t, cuda::thread_scope_system> proxy_ptr_aref(*proxy_ptr);
+
+    if (resource_sharing_mode == DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_EXCLUSIVE) {
+        // A single relaxed store publishes the index; repeating the same value with a
+        // second store to the same location would only add traffic.
+        proxy_ptr_aref.store(prod_idx, cuda::memory_order_relaxed);
+    } else {
+        proxy_ptr_aref.fetch_max(prod_idx, cuda::memory_order_relaxed);
+    }
+}
+
+/**
+ * @brief Submit a work request to the NIC using the DB protocol.
+ *
+ * Under the QP send-queue lock, raises sq_wqe_pi to prod_index with an atomic max. Only the
+ * caller that actually advanced the index rings the doorbell; callers that find an equal or
+ * newer index already recorded do nothing besides taking and releasing the lock.
+ *
+ * @tparam resource_sharing_mode - Sharing mode used for the lock and the atomic max
+ * @tparam sync_scope - Scope forwarded to the doorbell ring
+ * @tparam code_opt - Code-generation options forwarded to the doorbell ring
+ * @tparam qp_type - QP type forwarded to the DBR update
+ * @param qp - Queue Pair (QP)
+ * @param prod_index - Producer index
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+          enum doca_gpu_dev_verbs_gpu_code_opt code_opt = DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_DEFAULT,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_submit_db(
+    struct doca_gpu_dev_verbs_qp *qp, uint64_t prod_index) {
+    doca_gpu_dev_verbs_lock<resource_sharing_mode>(&qp->sq_lock);
+
+    // atomic_max returns the previous producer index; a smaller value means this call
+    // is the one that moved it forward.
+    uint64_t old_prod_index = doca_gpu_dev_verbs_atomic_max<uint64_t, resource_sharing_mode, true>(
+        &qp->sq_wqe_pi, prod_index);
+    if (old_prod_index < prod_index) {
+        // Early ringing of the DB to push WQEs to the NIC ASAP.
+        doca_gpu_dev_verbs_ring_db<sync_scope, code_opt>(qp, prod_index);
+
+        // In case the recovery path is triggered, the later DB ringing will cover for correctness.
+        doca_priv_gpu_dev_verbs_update_dbr<qp_type>(qp, prod_index);
+        doca_gpu_dev_verbs_ring_db<sync_scope, code_opt>(qp, prod_index);
+    }
+
+    doca_gpu_dev_verbs_unlock<resource_sharing_mode>(&qp->sq_lock);
+}
+
+/**
+ * @brief Submit a work request to the NIC using the BlueFlame protocol.
+ * This function requires a single thread. Users must pass a pointer to a WQE stored in shared
+ * memory. Hopper or a newer generation is required to leverage the BlueFlame protocol.
+ *
+ * When DOCA_GPUNETIO_VERBS_HAS_TMA_COPY is not defined, the call falls back to
+ * doca_gpu_dev_verbs_submit_db() and smem_wqe is not used.
+ *
+ * @tparam resource_sharing_mode - Sharing mode used for the lock and the atomic max
+ * @tparam sync_scope - Scope forwarded to the BlueFlame and doorbell rings
+ * @tparam code_opt - Code-generation options forwarded to the doorbell ring
+ * @param qp - Queue Pair (QP)
+ * @param prod_index - Producer index
+ * @param smem_wqe - WQE to be submitted directly to the NIC. The buffer must be in shared memory.
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+          enum doca_gpu_dev_verbs_gpu_code_opt code_opt = DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_DEFAULT>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_submit_bf(
+    struct doca_gpu_dev_verbs_qp *qp, uint64_t prod_index,
+    struct doca_gpu_dev_verbs_wqe *smem_wqe) {
+#ifdef DOCA_GPUNETIO_VERBS_HAS_TMA_COPY
+    doca_gpu_dev_verbs_lock<resource_sharing_mode>(&qp->sq_lock);
+    // Only the caller that advances sq_wqe_pi rings; others just release the lock.
+    unsigned long long int old_prod_index =
+        doca_gpu_dev_verbs_atomic_max<unsigned long long int, resource_sharing_mode, true>(
+            (unsigned long long int *)&qp->sq_wqe_pi, (unsigned long long int)prod_index);
+    if (old_prod_index < prod_index) {
+        // BlueFlame write first, then the DBR update and a regular doorbell ring.
+        doca_gpu_dev_verbs_ring_bf<sync_scope>(qp, smem_wqe);
+        doca_priv_gpu_dev_verbs_update_dbr<DOCA_GPUNETIO_VERBS_QP_SQ>(qp, prod_index);
+        doca_gpu_dev_verbs_ring_db<sync_scope, code_opt>(qp, prod_index);
+    }
+    doca_gpu_dev_verbs_unlock<resource_sharing_mode>(&qp->sq_lock);
+#else
+    doca_gpu_dev_verbs_submit_db<resource_sharing_mode, sync_scope, code_opt,
+                                 DOCA_GPUNETIO_VERBS_QP_SQ>(qp, prod_index);
+#endif
+}
+
+/**
+ * @brief Submit all the WQEs up to the given producer index to the NIC using the BlueFlame
+ * protocol. This function must be called by all threads in the warp. At least 8 threads are
+ * required.
+ *
+ * Lane 0 takes the send-queue lock and raises sq_wqe_pi with an atomic max; the previous index
+ * is broadcast to the warp. If this call advanced the index, the warp performs the BlueFlame
+ * write together and lane 0 then updates the DBR and rings the doorbell. Lane 0 releases the
+ * lock in every case.
+ *
+ * @tparam resource_sharing_mode - Sharing mode used for the lock and the atomic max
+ * @tparam sync_scope - Scope forwarded to the BlueFlame and doorbell rings
+ * @tparam code_opt - Code-generation options forwarded to the doorbell ring
+ * @param qp - Queue Pair (QP)
+ * @param prod_index - Producer index
+ * @param wqe - WQE to be submitted directly to the NIC
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+          enum doca_gpu_dev_verbs_gpu_code_opt code_opt = DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_DEFAULT>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_submit_bf_warp(
+    struct doca_gpu_dev_verbs_qp *qp, uint64_t prod_index, struct doca_gpu_dev_verbs_wqe *wqe) {
+    unsigned int lane_id = doca_gpu_dev_verbs_get_lane_id();
+    // Initialized so that lanes other than 0 do not pass an indeterminate value to the
+    // shuffle below; every lane ends up with lane 0's value anyway.
+    unsigned long long int old_prod_index = 0;
+    if (lane_id == 0) {
+        doca_gpu_dev_verbs_lock<resource_sharing_mode>(&qp->sq_lock);
+        old_prod_index =
+            doca_gpu_dev_verbs_atomic_max<unsigned long long int, resource_sharing_mode, true>(
+                (unsigned long long int *)&qp->sq_wqe_pi, (unsigned long long int)prod_index);
+    }
+    __syncwarp();
+    old_prod_index = __shfl_sync(0xFFFFFFFF, old_prod_index, 0);
+    if (old_prod_index < prod_index) {
+        // Forward the caller's sync_scope, matching the single-thread BlueFlame path.
+        doca_gpu_dev_verbs_ring_bf_warp<sync_scope>(qp, wqe);
+        __syncwarp();
+        if (lane_id == 0) {
+            doca_priv_gpu_dev_verbs_update_dbr<DOCA_GPUNETIO_VERBS_QP_SQ>(qp, prod_index);
+            doca_gpu_dev_verbs_ring_db<sync_scope, code_opt>(qp, prod_index);
+        }
+    }
+    if (lane_id == 0) doca_gpu_dev_verbs_unlock<resource_sharing_mode>(&qp->sq_lock);
+    __syncwarp();
+}
+
+/**
+ * @brief Hand all the WQEs up to the given producer index to the CPU proxy for submission.
+ *
+ * A release fence at sync_scope is issued first, then the producer index is published
+ * through doca_gpu_dev_verbs_ring_proxy().
+ *
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the proxy ring
+ * @tparam sync_scope - Scope of the release fence
+ * @param qp - Queue Pair (QP)
+ * @param prod_index - Producer index
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_submit_proxy(
+    struct doca_gpu_dev_verbs_qp *qp, uint64_t prod_index) {
+    // Fence before the index becomes visible to the proxy.
+    doca_gpu_dev_verbs_fence_release<sync_scope>();
+
+    // Publish the new producer index.
+    doca_gpu_dev_verbs_ring_proxy<resource_sharing_mode>(qp, prod_index);
+}
+
+/**
+ * @brief Submit all the WQEs up to the given producer index, choosing the submission path.
+ *
+ * With nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO the handler stored in the QP
+ * (qp->nic_handler) decides; otherwise the template argument decides. The GPU_SM_DB handler
+ * selects the doorbell path, every other handler selects the CPU proxy path.
+ *
+ * @tparam resource_sharing_mode - Sharing mode forwarded to the selected path
+ * @tparam sync_scope - Synchronization scope forwarded to the selected path
+ * @tparam nic_handler - Requested NIC handler, or AUTO to use the QP's own
+ * @tparam qp_type - QP type forwarded to the doorbell path
+ * @param qp - Queue Pair (QP)
+ * @param prod_index - Producer index
+ */
+template <enum doca_gpu_dev_verbs_resource_sharing_mode resource_sharing_mode =
+              DOCA_GPUNETIO_VERBS_RESOURCE_SHARING_MODE_GPU,
+          enum doca_gpu_dev_verbs_sync_scope sync_scope = DOCA_GPUNETIO_VERBS_SYNC_SCOPE_GPU,
+          enum doca_gpu_dev_verbs_nic_handler nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO,
+          enum doca_gpu_dev_verbs_qp_type qp_type = DOCA_GPUNETIO_VERBS_QP_SQ>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_submit(struct doca_gpu_dev_verbs_qp *qp,
+                                                                 uint64_t prod_index) {
+    // Handler recorded in the QP; only consulted when the template argument is AUTO.
+    const enum doca_gpu_dev_verbs_nic_handler handler_from_qp =
+        (enum doca_gpu_dev_verbs_nic_handler)__ldg((int *)&qp->nic_handler);
+    const enum doca_gpu_dev_verbs_nic_handler effective_handler =
+        (nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO) ? handler_from_qp : nic_handler;
+
+    if (effective_handler != DOCA_GPUNETIO_VERBS_NIC_HANDLER_GPU_SM_DB) {
+        doca_gpu_dev_verbs_submit_proxy<resource_sharing_mode, sync_scope>(qp, prod_index);
+        return;
+    }
+
+    doca_gpu_dev_verbs_submit_db<resource_sharing_mode, sync_scope,
+                                 DOCA_GPUNETIO_VERBS_GPU_CODE_OPT_DEFAULT, qp_type>(qp,
+                                                                                    prod_index);
+}
+
+/* *********** WQE PREPARATION *********** */
+/**
+ * @brief Prepare a NOP WQE.
+ *
+ * Builds a control segment carrying the NOP opcode, the WQE index, the QP number
+ * (sq_num_shift8_be_1ds) and the control flags, and stores it as the first segment of the WQE.
+ *
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - WQE buffer to write to
+ * @param wqe_idx - Index of the WQE being prepared
+ * @param ctrl_flags - Control flags stored in the control segment
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wqe_prepare_nop(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr,
+    const uint16_t wqe_idx, enum doca_gpu_dev_verbs_wqe_ctrl_flags ctrl_flags) {
+    // Zero-initialized: the whole segment is copied into the WQE below, so fields that
+    // are not set explicitly (e.g. imm) must not be left indeterminate.
+    struct doca_gpu_dev_verbs_wqe_ctrl_seg cseg = {0};
+
+    cseg.opmod_idx_opcode =
+        doca_gpu_dev_verbs_bswap32(((uint32_t)wqe_idx << DOCA_GPUNETIO_VERBS_WQE_IDX_SHIFT) |
+                                   DOCA_GPUNETIO_IB_MLX5_OPCODE_NOP);
+    cseg.qpn_ds = __ldg(&qp->sq_num_shift8_be_1ds);
+    cseg.fm_ce_se = ctrl_flags;
+
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg0), (uint64_t *)&(cseg));
+}
+
+/**
+ * @brief Prepare an RDMA Write-class WQE with one local data segment.
+ *
+ * Fills three segments of the WQE: control (opcode, WQE index, QP number, flags, immediate),
+ * remote address (raddr, rkey) and one local data segment (laddr0, lkey0, bytes0).
+ * Keys are byte-swapped unless DOCA_GPUNETIO_VERBS_MKEY_SWAPPED is 1.
+ *
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - WQE buffer to write to
+ * @param wqe_idx - Index of the WQE being prepared
+ * @param opcode - Opcode OR-ed into the control segment
+ * @param ctrl_flags - Control flags stored in the control segment
+ * @param immediate - Value stored in the control segment's imm field
+ * @param raddr - Remote address
+ * @param rkey - Remote key
+ * @param laddr0 - Local address
+ * @param lkey0 - Local key
+ * @param bytes0 - Byte count (the DOCA_GPUNETIO_IB_MLX5_INLINE_SEG bit is masked off)
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wqe_prepare_write(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr,
+    const uint16_t wqe_idx, const uint32_t opcode,
+    enum doca_gpu_dev_verbs_wqe_ctrl_flags ctrl_flags, const uint32_t immediate,
+    const uint64_t raddr, const uint32_t rkey, const uint64_t laddr0, const uint32_t lkey0,
+    const uint32_t bytes0) {
+    struct doca_gpunetio_ib_mlx5_wqe_data_seg local_seg;
+    struct doca_gpunetio_ib_mlx5_wqe_raddr_seg remote_seg;
+    struct doca_gpu_dev_verbs_wqe_ctrl_seg ctrl;
+
+    // Local data segment.
+    local_seg.addr = doca_gpu_dev_verbs_bswap64(laddr0);
+    local_seg.byte_count =
+        doca_gpu_dev_verbs_bswap32(bytes0 & uint32_t(DOCA_GPUNETIO_IB_MLX5_INLINE_SEG - 1));
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED != 1
+    local_seg.lkey = doca_gpu_dev_verbs_bswap32(lkey0);
+#else
+    local_seg.lkey = lkey0;
+#endif
+
+    // Remote address segment.
+    remote_seg.raddr = doca_gpu_dev_verbs_bswap64(raddr);
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED != 1
+    remote_seg.rkey = doca_gpu_dev_verbs_bswap32(rkey);
+#else
+    remote_seg.rkey = rkey;
+#endif
+
+    // Control segment.
+    ctrl.imm = immediate;
+    ctrl.fm_ce_se = ctrl_flags;
+    ctrl.qpn_ds = __ldg(&qp->sq_num_shift8_be_3ds);
+    ctrl.opmod_idx_opcode = doca_gpu_dev_verbs_bswap32(
+        ((uint32_t)wqe_idx << DOCA_GPUNETIO_VERBS_WQE_IDX_SHIFT) | opcode);
+
+    // Segments are written to the WQE in slot order.
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg0), (uint64_t *)&ctrl);
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg1), (uint64_t *)&remote_seg);
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg2), (uint64_t *)&local_seg);
+}
+
+/**
+ * @brief Prepare an RDMA Write-class WQE with two local data segments.
+ *
+ * Fills four segments of the WQE: control (opcode, WQE index, QP number, flags, immediate),
+ * remote address (raddr, rkey) and two local data segments.
+ * Keys are byte-swapped unless DOCA_GPUNETIO_VERBS_MKEY_SWAPPED is 1.
+ *
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - WQE buffer to write to
+ * @param wqe_idx - Index of the WQE being prepared
+ * @param opcode - Opcode OR-ed into the control segment
+ * @param ctrl_flags - Control flags stored in the control segment
+ * @param immediate - Value stored in the control segment's imm field
+ * @param raddr - Remote address
+ * @param rkey - Remote key
+ * @param laddr0 - First local address
+ * @param lkey0 - First local key
+ * @param bytes0 - First byte count (the DOCA_GPUNETIO_IB_MLX5_INLINE_SEG bit is masked off)
+ * @param laddr1 - Second local address
+ * @param lkey1 - Second local key
+ * @param bytes1 - Second byte count (the DOCA_GPUNETIO_IB_MLX5_INLINE_SEG bit is masked off)
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wqe_prepare_write(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr,
+    const uint16_t wqe_idx, const uint32_t opcode,
+    enum doca_gpu_dev_verbs_wqe_ctrl_flags ctrl_flags, const uint32_t immediate,
+    const uint64_t raddr, const uint32_t rkey, const uint64_t laddr0, const uint32_t lkey0,
+    const uint32_t bytes0, const uint64_t laddr1, const uint32_t lkey1, const uint32_t bytes1) {
+    struct doca_gpunetio_ib_mlx5_wqe_data_seg first_sge;
+    struct doca_gpunetio_ib_mlx5_wqe_data_seg second_sge;
+    struct doca_gpunetio_ib_mlx5_wqe_raddr_seg remote_seg;
+    struct doca_gpu_dev_verbs_wqe_ctrl_seg ctrl;
+
+    // First local data segment.
+    first_sge.addr = doca_gpu_dev_verbs_bswap64(laddr0);
+    first_sge.byte_count =
+        doca_gpu_dev_verbs_bswap32(bytes0 & uint32_t(DOCA_GPUNETIO_IB_MLX5_INLINE_SEG - 1));
+
+    // Second local data segment.
+    second_sge.addr = doca_gpu_dev_verbs_bswap64(laddr1);
+    second_sge.byte_count =
+        doca_gpu_dev_verbs_bswap32(bytes1 & uint32_t(DOCA_GPUNETIO_IB_MLX5_INLINE_SEG - 1));
+
+    // Remote address segment.
+    remote_seg.raddr = doca_gpu_dev_verbs_bswap64(raddr);
+
+    // Memory keys: swapped here unless they are already provided swapped.
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED != 1
+    first_sge.lkey = doca_gpu_dev_verbs_bswap32(lkey0);
+    second_sge.lkey = doca_gpu_dev_verbs_bswap32(lkey1);
+    remote_seg.rkey = doca_gpu_dev_verbs_bswap32(rkey);
+#else
+    first_sge.lkey = lkey0;
+    second_sge.lkey = lkey1;
+    remote_seg.rkey = rkey;
+#endif
+
+    // Control segment.
+    ctrl.imm = immediate;
+    ctrl.fm_ce_se = ctrl_flags;
+    ctrl.qpn_ds = __ldg(&qp->sq_num_shift8_be_4ds);
+    ctrl.opmod_idx_opcode = doca_gpu_dev_verbs_bswap32(
+        ((uint32_t)wqe_idx << DOCA_GPUNETIO_VERBS_WQE_IDX_SHIFT) | opcode);
+
+    // Segments are written to the WQE in slot order.
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg0), (uint64_t *)&ctrl);
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg1), (uint64_t *)&remote_seg);
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg2), (uint64_t *)&first_sge);
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg3), (uint64_t *)&second_sge);
+}
+
+/**
+ * @brief Prepare the header segment of an inline RDMA Write WQE.
+ * The data segment is prepared separately.
+ *
+ * Writes the control segment and the remote address segment. The segment count stored in the
+ * control segment depends on whether the inline payload exceeds the room left in one data
+ * segment after the inline header.
+ *
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - WQE buffer to write to
+ * @param wqe_idx - Index of the WQE to be prepared
+ * @param ctrl_flags - Control flags stored in the control segment
+ * @param raddr - Remote address
+ * @param rkey - Remote key (byte-swapped unless DOCA_GPUNETIO_VERBS_MKEY_SWAPPED is 1)
+ * @param bytes - Inline payload size; must not exceed DOCA_GPUNETIO_VERBS_MAX_INLINE_SIZE
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_prepare_inl_rdma_write_wqe_header(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr,
+    const uint16_t wqe_idx, enum doca_gpu_dev_verbs_wqe_ctrl_flags ctrl_flags, const uint64_t raddr,
+    const uint32_t rkey, const uint32_t bytes) {
+    int ds;
+    // Zero-initialized: the whole segment is copied into the WQE below, so fields that
+    // are not set explicitly (e.g. imm) must not be left indeterminate.
+    struct doca_gpu_dev_verbs_wqe_ctrl_seg cseg = {0};
+    struct doca_gpunetio_ib_mlx5_wqe_raddr_seg rseg;
+
+    assert(bytes <= DOCA_GPUNETIO_VERBS_MAX_INLINE_SIZE);
+
+    // A payload that does not fit next to the inline header needs the larger segment count.
+    if (bytes > sizeof(struct doca_gpunetio_ib_mlx5_wqe_data_seg) -
+                    sizeof(struct doca_gpunetio_ib_mlx5_wqe_inl_data_seg))
+        ds = DOCA_GPUNETIO_VERBS_WQE_SEG_CNT_RDMA_WRITE_INL_MAX;
+    else
+        ds = DOCA_GPUNETIO_VERBS_WQE_SEG_CNT_RDMA_WRITE_INL_MIN;
+
+    cseg.opmod_idx_opcode =
+        doca_gpu_dev_verbs_bswap32(((uint32_t)wqe_idx << DOCA_GPUNETIO_VERBS_WQE_IDX_SHIFT) |
+                                   DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_WRITE);
+    cseg.qpn_ds = doca_gpu_dev_verbs_bswap32(__ldg(&qp->sq_num_shift8) | ds);
+    cseg.fm_ce_se = ctrl_flags;
+
+    rseg.raddr = doca_gpu_dev_verbs_bswap64(raddr);
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED == 1
+    rseg.rkey = rkey;
+#else
+    rseg.rkey = doca_gpu_dev_verbs_bswap32(rkey);
+#endif
+
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg0), (uint64_t *)&(cseg));
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg1), (uint64_t *)&(rseg));
+}
+
+/**
+ * @brief Prepare the data segment of an inline RDMA Write WQE.
+ *
+ * Writes the inline header (sizeof(T) with the inline flag set, byte-swapped) right after the
+ * control and remote-address segments, followed by the payload itself. A payload of up to
+ * 4 bytes is stored as a single T; anything larger is stored as two 32-bit words.
+ * NOTE(review): the two-word path copies exactly 8 bytes, so T is presumably expected to be
+ * at most 8 bytes wide (and 8 bytes when wider than 4) — confirm against callers.
+ *
+ * @tparam T - Type of the inline payload
+ * @param qp - Queue Pair (QP); not referenced by this function
+ * @param wqe_ptr - WQE buffer to write to
+ * @param data - Data to be written
+ */
+template <typename T>
+__device__ static __forceinline__ void doca_gpu_dev_verbs_prepare_inl_rdma_write_wqe_data(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr, T data) {
+    struct doca_gpunetio_ib_mlx5_wqe_inl_data_seg inl_hdr;
+    const uint32_t payload_size = sizeof(T);
+
+    // The inline header sits right behind the control and remote-address segments.
+    const uintptr_t hdr_addr = (uintptr_t)wqe_ptr +
+                               sizeof(struct doca_gpu_dev_verbs_wqe_ctrl_seg) +
+                               sizeof(struct doca_gpunetio_ib_mlx5_wqe_raddr_seg);
+    // The payload follows the inline header.
+    const uintptr_t payload_addr = hdr_addr + sizeof(inl_hdr);
+
+    inl_hdr.byte_count =
+        doca_gpu_dev_verbs_bswap32(payload_size | DOCA_GPUNETIO_IB_MLX5_INLINE_SEG);
+    *(uint32_t *)hdr_addr = inl_hdr.byte_count;
+
+    if (payload_size > sizeof(uint32_t)) {
+        // Wide payload: copy it as two 32-bit words.
+        uint32_t *const out_words = (uint32_t *)payload_addr;
+        uint32_t *const in_words = (uint32_t *)&data;
+        out_words[0] = in_words[0];
+        out_words[1] = in_words[1];
+    } else {
+        // Narrow payload: a single store of T.
+        *(T *)payload_addr = data;
+    }
+}
+
+/**
+ * @brief Prepare an RDMA Write WQE with inline data.
+ *
+ * Writes the header (control and remote-address segments), then the inline header with the
+ * byte count and inline flag, and finally copies `bytes` bytes from laddr into the WQE right
+ * after the inline header.
+ *
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - WQE buffer to write to
+ * @param wqe_idx - Index of the WQE to be prepared
+ * @param ctrl_flags - Control flags stored in the control segment
+ * @param raddr - Remote address
+ * @param rkey - Remote key
+ * @param laddr - Address of the data to inline
+ * @param bytes - Number of bytes to inline
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wqe_prepare_write_inl(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr,
+    const uint16_t wqe_idx, enum doca_gpu_dev_verbs_wqe_ctrl_flags ctrl_flags, const uint64_t raddr,
+    const uint32_t rkey, const uint64_t laddr, const uint32_t bytes) {
+    struct doca_gpunetio_ib_mlx5_wqe_inl_data_seg inl_hdr;
+
+    // The inline header sits right behind the control and remote-address segments.
+    const uintptr_t hdr_addr = (uintptr_t)wqe_ptr +
+                               sizeof(struct doca_gpu_dev_verbs_wqe_ctrl_seg) +
+                               sizeof(struct doca_gpunetio_ib_mlx5_wqe_raddr_seg);
+
+    // Control + remote-address segments.
+    doca_gpu_dev_verbs_prepare_inl_rdma_write_wqe_header(qp, wqe_ptr, wqe_idx, ctrl_flags, raddr,
+                                                         rkey, bytes);
+
+    // Inline header: payload size tagged with the inline flag.
+    inl_hdr.byte_count = doca_gpu_dev_verbs_bswap32(bytes | DOCA_GPUNETIO_IB_MLX5_INLINE_SEG);
+    *(uint32_t *)hdr_addr = inl_hdr.byte_count;
+
+    // Payload, copied in right after the inline header.
+    void *const payload_dst = (void *)(hdr_addr + sizeof(inl_hdr));
+    void *const payload_src = (void *)(uintptr_t)laddr;
+    doca_gpu_dev_verbs_memcpy_data(payload_dst, payload_src, bytes);
+}
+
+/**
+ * @brief Prepare an RDMA Read WQE with one local data segment.
+ *
+ * Fills three segments of the WQE: control (RDMA Read opcode, WQE index, QP number, flags),
+ * remote address (raddr, rkey) and one local data segment (laddr0, lkey0, bytes0).
+ * Keys are byte-swapped unless DOCA_GPUNETIO_VERBS_MKEY_SWAPPED is 1.
+ *
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - WQE buffer to write to
+ * @param wqe_idx - Index of the WQE being prepared
+ * @param ctrl_flags - Control flags stored in the control segment
+ * @param raddr - Remote address
+ * @param rkey - Remote key
+ * @param laddr0 - Local address
+ * @param lkey0 - Local key
+ * @param bytes0 - Byte count (the DOCA_GPUNETIO_IB_MLX5_INLINE_SEG bit is masked off)
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wqe_prepare_read(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr,
+    const uint16_t wqe_idx, enum doca_gpu_dev_verbs_wqe_ctrl_flags ctrl_flags, const uint64_t raddr,
+    const uint32_t rkey, const uint64_t laddr0, const uint32_t lkey0, const uint32_t bytes0) {
+    // Zero-initialized: the whole segment is copied into the WQE below, so fields that
+    // are not set explicitly (e.g. imm) must not be left indeterminate.
+    struct doca_gpu_dev_verbs_wqe_ctrl_seg cseg = {0};
+    struct doca_gpunetio_ib_mlx5_wqe_raddr_seg rseg;
+    struct doca_gpunetio_ib_mlx5_wqe_data_seg dseg0;
+
+    cseg.opmod_idx_opcode =
+        doca_gpu_dev_verbs_bswap32(((uint32_t)wqe_idx << DOCA_GPUNETIO_VERBS_WQE_IDX_SHIFT) |
+                                   DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_READ);
+    cseg.qpn_ds = __ldg(&qp->sq_num_shift8_be_3ds);
+    cseg.fm_ce_se = ctrl_flags;
+
+    rseg.raddr = doca_gpu_dev_verbs_bswap64(raddr);
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED == 1
+    rseg.rkey = rkey;
+#else
+    rseg.rkey = doca_gpu_dev_verbs_bswap32(rkey);
+#endif
+
+    // Mask off the inline-segment flag bit, as the write builders and the two-segment
+    // read builder do, so a large byte count is never mistaken for an inline segment.
+    dseg0.byte_count =
+        doca_gpu_dev_verbs_bswap32(bytes0 & uint32_t(DOCA_GPUNETIO_IB_MLX5_INLINE_SEG - 1));
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED == 1
+    dseg0.lkey = lkey0;
+#else
+    dseg0.lkey = doca_gpu_dev_verbs_bswap32(lkey0);
+#endif
+    dseg0.addr = doca_gpu_dev_verbs_bswap64(laddr0);
+
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg0), (uint64_t *)&(cseg));
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg1), (uint64_t *)&(rseg));
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg2), (uint64_t *)&(dseg0));
+}
+
+/**
+ * @brief Prepare an RDMA Read WQE with two local data segments.
+ *
+ * Fills four segments of the WQE: control (RDMA Read opcode, WQE index, QP number with a
+ * segment count of 4, flags), remote address (raddr, rkey) and two local data segments.
+ * Keys are byte-swapped unless DOCA_GPUNETIO_VERBS_MKEY_SWAPPED is 1.
+ * NOTE(review): the two-segment write builder reads the precomputed qp->sq_num_shift8_be_4ds
+ * where this function byte-swaps (sq_num_shift8 | 4) itself — presumably equivalent; confirm.
+ *
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - WQE buffer to write to
+ * @param wqe_idx - Index of the WQE being prepared
+ * @param ctrl_flags - Control flags stored in the control segment
+ * @param raddr - Remote address
+ * @param rkey - Remote key
+ * @param laddr0 - First local address
+ * @param lkey0 - First local key
+ * @param bytes0 - First byte count (the DOCA_GPUNETIO_IB_MLX5_INLINE_SEG bit is masked off)
+ * @param laddr1 - Second local address
+ * @param lkey1 - Second local key
+ * @param bytes1 - Second byte count (the DOCA_GPUNETIO_IB_MLX5_INLINE_SEG bit is masked off)
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wqe_prepare_read(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr,
+    const uint16_t wqe_idx, enum doca_gpu_dev_verbs_wqe_ctrl_flags ctrl_flags, const uint64_t raddr,
+    const uint32_t rkey, const uint64_t laddr0, const uint32_t lkey0, const uint32_t bytes0,
+    const uint64_t laddr1, const uint32_t lkey1, const uint32_t bytes1) {
+    // Zero-initialized: the whole segment is copied into the WQE below, so fields that
+    // are not set explicitly (e.g. imm) must not be left indeterminate.
+    struct doca_gpu_dev_verbs_wqe_ctrl_seg cseg = {0};
+    struct doca_gpunetio_ib_mlx5_wqe_raddr_seg rseg;
+    struct doca_gpunetio_ib_mlx5_wqe_data_seg dseg0;
+    struct doca_gpunetio_ib_mlx5_wqe_data_seg dseg1;
+
+    cseg.opmod_idx_opcode =
+        doca_gpu_dev_verbs_bswap32(((uint32_t)wqe_idx << DOCA_GPUNETIO_VERBS_WQE_IDX_SHIFT) |
+                                   DOCA_GPUNETIO_IB_MLX5_OPCODE_RDMA_READ);
+    cseg.qpn_ds = doca_gpu_dev_verbs_bswap32(__ldg(&qp->sq_num_shift8) | 4);
+    cseg.fm_ce_se = ctrl_flags;
+
+    rseg.raddr = doca_gpu_dev_verbs_bswap64(raddr);
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED == 1
+    rseg.rkey = rkey;
+#else
+    rseg.rkey = doca_gpu_dev_verbs_bswap32(rkey);
+#endif
+
+    dseg0.byte_count =
+        doca_gpu_dev_verbs_bswap32(bytes0 & uint32_t(DOCA_GPUNETIO_IB_MLX5_INLINE_SEG - 1));
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED == 1
+    dseg0.lkey = lkey0;
+#else
+    dseg0.lkey = doca_gpu_dev_verbs_bswap32(lkey0);
+#endif
+    dseg0.addr = doca_gpu_dev_verbs_bswap64(laddr0);
+
+    dseg1.byte_count =
+        doca_gpu_dev_verbs_bswap32(bytes1 & uint32_t(DOCA_GPUNETIO_IB_MLX5_INLINE_SEG - 1));
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED == 1
+    dseg1.lkey = lkey1;
+#else
+    dseg1.lkey = doca_gpu_dev_verbs_bswap32(lkey1);
+#endif
+    dseg1.addr = doca_gpu_dev_verbs_bswap64(laddr1);
+
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg0), (uint64_t *)&(cseg));
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg1), (uint64_t *)&(rseg));
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg2), (uint64_t *)&(dseg0));
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg3), (uint64_t *)&(dseg1));
+}
+
+/**
+ * @brief Prepare an Atomic WQE (control, remote address, atomic and data segments)
+ *
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - Pointer to the WQE buffer to write the prepared WQE to
+ * @param wqe_idx - Index of the WQE to be prepared
+ * @param opcode,ctrl_flags,raddr,rkey,laddr,lkey,bytes,compare_add,swap_add - WQE field values
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wqe_prepare_atomic(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr,
+    const uint16_t wqe_idx, const uint32_t opcode,
+    enum doca_gpu_dev_verbs_wqe_ctrl_flags ctrl_flags, const uint64_t raddr, const uint32_t rkey,
+    const uint64_t laddr, const uint32_t lkey, const uint32_t bytes, const uint64_t compare_add,
+    const uint64_t swap_add) {
+    struct doca_gpu_dev_verbs_wqe_ctrl_seg cseg;
+    struct doca_gpunetio_ib_mlx5_wqe_raddr_seg rseg;
+    struct doca_gpunetio_ib_mlx5_wqe_atomic_seg atseg;
+    struct doca_gpunetio_ib_mlx5_wqe_data_seg dseg;
+    // Control segment: the caller-supplied opcode is combined with the WQE index, then byte-swapped.
+    cseg.opmod_idx_opcode = doca_gpu_dev_verbs_bswap32(
+        ((uint32_t)wqe_idx << DOCA_GPUNETIO_VERBS_WQE_IDX_SHIFT) | opcode);
+    cseg.qpn_ds = __ldg(&qp->sq_num_shift8_be_4ds);
+    cseg.fm_ce_se = ctrl_flags;
+    // Remote side: raddr is always byte-swapped; rkey only if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED != 1.
+    rseg.raddr = doca_gpu_dev_verbs_bswap64(raddr);
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED == 1
+    rseg.rkey = rkey;
+#else
+    rseg.rkey = doca_gpu_dev_verbs_bswap32(rkey);
+#endif
+    // ATOMIC_FA: atseg.swap_add carries compare_add (swap_add arg unused); else it carries swap_add.
+    atseg.swap_add = doca_gpu_dev_verbs_bswap64(
+        opcode == DOCA_GPUNETIO_IB_MLX5_OPCODE_ATOMIC_FA ? compare_add : swap_add);
+    atseg.compare = doca_gpu_dev_verbs_bswap64(compare_add);
+    // Local side: byte count and address are byte-swapped; lkey follows the same rule as rkey.
+    dseg.byte_count = doca_gpu_dev_verbs_bswap32(bytes);
+#if DOCA_GPUNETIO_VERBS_MKEY_SWAPPED == 1
+    dseg.lkey = lkey;
+#else
+    dseg.lkey = doca_gpu_dev_verbs_bswap32(lkey);
+#endif
+    dseg.addr = doca_gpu_dev_verbs_bswap64(laddr);
+    // Store order: control -> dseg0, remote address -> dseg1, atomic -> dseg2, data -> dseg3.
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg0), (uint64_t *)&(cseg));
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg1), (uint64_t *)&(rseg));
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg2), (uint64_t *)&(atseg));
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg3), (uint64_t *)&(dseg));
+}
+
+/**
+ * @brief Prepare a Wait WQE (control segment followed by a wait segment)
+ *
+ * @param qp - Queue Pair (QP)
+ * @param wqe_ptr - Pointer to the WQE buffer to write the prepared WQE to
+ * @param wqe_idx - Index of the WQE to be prepared
+ * @param ctrl_flags,max_index,qpn_cqn - Control flags and the wait segment's two field values
+ */
+__device__ static __forceinline__ void doca_gpu_dev_verbs_wqe_prepare_wait(
+    struct doca_gpu_dev_verbs_qp *qp, struct doca_gpu_dev_verbs_wqe *wqe_ptr, uint16_t wqe_idx,
+    enum doca_gpu_dev_verbs_wqe_ctrl_flags ctrl_flags, const uint32_t max_index,
+    const uint32_t qpn_cqn) {
+    struct doca_gpu_dev_verbs_wqe_ctrl_seg cseg;
+    struct doca_gpu_dev_verbs_wqe_wait_seg wseg;
+    // Control segment: WAIT opcode plus WQE index; qpn_ds ORs in DOCA_GPUNETIO_VERBS_WQE_SEG_CNT_WAIT.
+    cseg.opmod_idx_opcode =
+        doca_gpu_dev_verbs_bswap32(((uint32_t)wqe_idx << DOCA_GPUNETIO_VERBS_WQE_IDX_SHIFT) |
+                                   DOCA_GPUNETIO_IB_MLX5_OPCODE_WAIT);
+    cseg.qpn_ds = doca_gpu_dev_verbs_bswap32(__ldg(&qp->sq_num_shift8) |
+                                             DOCA_GPUNETIO_VERBS_WQE_SEG_CNT_WAIT);
+    cseg.fm_ce_se = ctrl_flags;
+    // cseg.imm = 0;
+    // Wait segment: both caller-supplied values are byte-swapped before being stored.
+    wseg.max_index = doca_gpu_dev_verbs_bswap32(max_index);
+    wseg.qpn_cqn = doca_gpu_dev_verbs_bswap32(qpn_cqn);
+    // Store order: control -> wqe_ptr->dseg0, wait -> wqe_ptr->dseg1.
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg0), (uint64_t *)&(cseg));
+    doca_gpu_dev_verbs_store_wqe_seg((uint64_t *)&(wqe_ptr->dseg1), (uint64_t *)&(wseg));
+}
+
+#endif /* DOCA_GPUNETIO_DEV_VERBS_QP_H */
+
+/** @} */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_config.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_config.h
new file mode 100644
index 00000000000..f0bd0d46b13
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_config.h
@@ -0,0 +1,45 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_config.h
+ * @brief A header file for the DOCA GPUNetIO build-time configuration. This header
+ * file may be generated by calling scripts/configure.
+ */
+
+#ifndef DOCA_GPUNETIO_CONFIG_H
+#define DOCA_GPUNETIO_CONFIG_H
+/* DOCA_GPUNETIO_HAVE_MLX5DV_UMEM_DMABUF support */
+#define DOCA_GPUNETIO_HAVE_MLX5DV_UMEM_DMABUF 1
+/* DOCA_GPUNETIO_HAVE_DEDICATED_NC_UAR support */
+#define DOCA_GPUNETIO_HAVE_DEDICATED_NC_UAR 1
+/* DOCA_GPUNETIO_HAVE_CUDA_DMABUF support */
+#define DOCA_GPUNETIO_HAVE_CUDA_DMABUF 1
+#endif  // DOCA_GPUNETIO_CONFIG_H
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_device.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_device.h
new file mode 100644
index 00000000000..39483f5addf
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_device.h
@@ -0,0 +1,47 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_device.h
+ * @brief A header file that includes all necessary device APIs
+ */
+
+#ifndef DOCA_GPUNETIO_DEVICE_H
+#define DOCA_GPUNETIO_DEVICE_H
+
+#include "common/doca_gpunetio_verbs_def.h"
+#include "common/doca_gpunetio_verbs_dev.h"
+#include "device/doca_gpunetio_dev_verbs_common.cuh"
+#include "device/doca_gpunetio_dev_verbs_cq.cuh"
+#include "device/doca_gpunetio_dev_verbs_qp.cuh"
+#include "device/doca_gpunetio_dev_verbs_onesided.cuh"
+#include "device/doca_gpunetio_dev_verbs_counter.cuh"
+
+#endif /* DOCA_GPUNETIO_DEVICE_H */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_host.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_host.h
new file mode 100644
index 00000000000..283dd14822c
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/doca_gpunetio_host.h
@@ -0,0 +1,49 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_host.h
+ * @brief A header file that includes all necessary host APIs
+ */
+
+#ifndef DOCA_GPUNETIO_HOST_H
+#define DOCA_GPUNETIO_HOST_H
+
+#include "doca_gpunetio_config.h"
+#include "common/doca_gpunetio_verbs_def.h"
+#include "common/doca_gpunetio_verbs_dev.h"
+#include "host/mlx5_ifc.h"
+#include "host/mlx5_prm.h"
+#include "host/doca_error.h"
+#include "host/doca_verbs.h"
+#include "host/doca_gpunetio.h"
+#include "host/doca_gpunetio_high_level.h"
+
+#endif /* DOCA_GPUNETIO_HOST_H */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_error.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_error.h
new file mode 100644
index 00000000000..7cb101b3ff2
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_error.h
@@ -0,0 +1,89 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_error.h
+ * @brief A header file for the doca_error APIs
+ */
+
+#ifndef DOCA_ERROR_H
+#define DOCA_ERROR_H
+
+#include <errno.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief DOCA API return codes
+ */
+typedef enum doca_error {
+    DOCA_SUCCESS = 0,                      /**< Success */
+    DOCA_ERROR_UNKNOWN = 1,                /**< Unknown error */
+    DOCA_ERROR_NOT_PERMITTED = 2,          /**< Operation not permitted */
+    DOCA_ERROR_IN_USE = 3,                 /**< Resource already in use */
+    DOCA_ERROR_NOT_SUPPORTED = 4,          /**< Operation not supported */
+    DOCA_ERROR_AGAIN = 5,                  /**< Resource temporarily unavailable, try again */
+    DOCA_ERROR_INVALID_VALUE = 6,          /**< Invalid input */
+    DOCA_ERROR_NO_MEMORY = 7,              /**< Memory allocation failure */
+    DOCA_ERROR_INITIALIZATION = 8,         /**< Resource initialization failure */
+    DOCA_ERROR_TIME_OUT = 9,               /**< Timer expired waiting for resource */
+    DOCA_ERROR_SHUTDOWN = 10,              /**< Shutdown in progress or completed */
+    DOCA_ERROR_CONNECTION_RESET = 11,      /**< Connection reset by peer */
+    DOCA_ERROR_CONNECTION_ABORTED = 12,    /**< Connection aborted */
+    DOCA_ERROR_CONNECTION_INPROGRESS = 13, /**< Connection in progress */
+    DOCA_ERROR_NOT_CONNECTED = 14,         /**< Not Connected */
+    DOCA_ERROR_NO_LOCK = 15,               /**< Unable to acquire required lock */
+    DOCA_ERROR_NOT_FOUND = 16,             /**< Resource Not Found */
+    DOCA_ERROR_IO_FAILED = 17,             /**< Input/Output Operation Failed */
+    DOCA_ERROR_BAD_STATE = 18,             /**< Bad State */
+    DOCA_ERROR_UNSUPPORTED_VERSION = 19,   /**< Unsupported version */
+    DOCA_ERROR_OPERATING_SYSTEM = 20,      /**< Operating system call failure */
+    DOCA_ERROR_DRIVER = 21,                /**< DOCA Driver call failure */
+    DOCA_ERROR_UNEXPECTED = 22,            /**< An unexpected scenario was detected */
+    DOCA_ERROR_ALREADY_EXIST = 23,         /**< Resource already exists */
+    DOCA_ERROR_FULL = 24,                  /**< No more space in resource */
+    DOCA_ERROR_EMPTY = 25,                 /**< No entry is available in resource */
+    DOCA_ERROR_IN_PROGRESS = 26,           /**< Operation is in progress */
+    DOCA_ERROR_TOO_BIG = 27,               /**< Requested operation too big to be contained */
+    DOCA_ERROR_AUTHENTICATION = 28,        /**< Authentication failure */
+    DOCA_ERROR_BAD_CONFIG = 29,            /**< Configuration is not valid */
+    DOCA_ERROR_SKIPPED = 30, /**< Result is valid, but some previous output data was dropped */
+    DOCA_ERROR_DEVICE_FATAL_ERROR = 31 /**< Device experienced a fatal error */
+} doca_error_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+#endif /* DOCA_ERROR_H */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_gpunetio.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_gpunetio.h
new file mode 100644
index 00000000000..7d0f46ac643
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_gpunetio.h
@@ -0,0 +1,387 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio.h
+ * @brief A header file for the doca_gpunetio APIs
+ */
+
+#ifndef DOCA_GPUNETIO_H
+#define DOCA_GPUNETIO_H
+
+#include "host/doca_error.h"
+#include "doca_gpunetio_config.h"
+#include "common/doca_gpunetio_verbs_def.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**********************************************************************************************************************
+ * DOCA GPU Lightweight opaque types
+ *********************************************************************************************************************/
+/**
+ * Opaque structure representing a DOCA GPU device handler.
+ */
+struct doca_gpu;
+
+/**
+ * @brief Type of memory the GPUNetIO library can allocate
+ * Selected through the mtype argument of doca_gpu_mem_alloc().
+ */
+enum doca_gpu_mem_type {
+    /** GPU memory not accessible from CPU (doca_gpu_mem_alloc() leaves memptr_cpu NULL). */
+    DOCA_GPU_MEM_TYPE_GPU = 0,
+    /** GPU memory with direct access from CPU (both memptr_gpu and memptr_cpu are returned). */
+    DOCA_GPU_MEM_TYPE_GPU_CPU = 1,
+    /** CPU memory with direct access from GPU (both memptr_gpu and memptr_cpu are returned). */
+    DOCA_GPU_MEM_TYPE_CPU_GPU = 2,
+};
+
+/**
+ * @brief Forward declaration
+ *
+ */
+struct doca_gpu_dev_verbs_qp;
+struct doca_gpu_dev_verbs_cq;
+
+/**
+ * @brief GPUNetIO QP handler accessible from CPU
+ * Returned by doca_gpu_verbs_export_qp() and released with doca_gpu_verbs_unexport_qp().
+ */
+struct doca_gpu_verbs_qp {
+    struct doca_gpu *gpu_dev; /* NOTE(review): presumably the gpu_dev given to export_qp */
+    struct doca_verbs_qp *qp; /* NOTE(review): presumably the Verbs QP given to export_qp */
+    uint64_t *cpu_db;
+    uint64_t sq_wqe_pi_last;
+    uint64_t *sq_db;
+    __be32 *sq_dbrec;
+    bool cpu_proxy; /* NOTE(review): presumably enables doca_gpu_verbs_cpu_proxy_progress() */
+    uint32_t sq_num_shift8_be;
+    /* CPU handler */
+    struct doca_gpu_dev_verbs_qp *qp_cpu;
+    /* GPU handler (same type that doca_gpu_verbs_get_qp_dev() returns through qp_gpu) */
+    struct doca_gpu_dev_verbs_qp *qp_gpu;
+};
+
+/**
+ * @brief GPUNetIO QP Error info, the output type of doca_gpu_verbs_query_last_error().
+ */
+struct doca_gpu_verbs_qp_error_info {
+    bool has_error; /* NOTE(review): presumably false when no error is recorded; confirm */
+    int syndrome;
+    int vendor_err_synd;
+    int hw_err_synd;
+    int hw_synd_type;
+    int wqe_counter;
+};
+
+typedef void *doca_gpu_verbs_service_t;
+
+/**
+ * @brief Create a DOCA GPUNETIO handler.
+ *
+ * @param [in] gpu_bus_id
+ * GPU PCIe address.
+ * @param [out] gpu_dev
+ * Pointer to the newly created gpu device handler.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - gpu_dev argument is a NULL pointer.
+ * - DOCA_ERROR_NOT_FOUND - GPU not found at the input PCIe address
+ * - DOCA_ERROR_NO_MEMORY - failed to alloc doca_gpu.
+ *
+ */
+doca_error_t doca_gpu_create(const char *gpu_bus_id, struct doca_gpu **gpu_dev);
+
+/**
+ * @brief Destroy a DOCA GPUNETIO handler.
+ *
+ * @param [in] gpu_dev
+ * Pointer to handler to be destroyed.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ */
+doca_error_t doca_gpu_destroy(struct doca_gpu *gpu_dev);
+
+/**
+ * Allocate a GPU accessible memory buffer. Assumes DPDK has been already attached with
+ * doca_gpu_to_dpdk(). According to the memory type specified, the buffer can be allocated in:
+ * - DOCA_GPU_MEM_TYPE_GPU memptr_gpu is not NULL while memptr_cpu is NULL.
+ * - DOCA_GPU_MEM_TYPE_GPU_CPU both memptr_gpu and memptr_cpu are not NULL.
+ * - DOCA_GPU_MEM_TYPE_CPU_GPU both memptr_gpu and memptr_cpu are not NULL.
+ *
+ * @param [in] gpu_dev
+ * DOCA GPUNetIO handler.
+ * @param [in] size
+ * Buffer size in bytes.
+ * @param [in] alignment
+ * Buffer memory alignment.
+ * If 0, the return is a pointer that is suitably aligned
+ * for any kind of variable (in the same manner as malloc()).
+ * Otherwise, the return is a pointer that is a multiple of *alignment*.
+ * Alignment value must be a power of two.
+ * @param [in] mtype
+ * Type of memory buffer. See enum doca_gpu_mem_type for reference.
+ * @param [out] memptr_gpu
+ * GPU memory pointer. Must be used with CUDA API and within CUDA kernels.
+ * @param [out] memptr_cpu
+ * CPU memory pointer. Must be used for CPU direct access to the memory.
+ *
+ * @return
+ * Non NULL memptr_gpu pointer on success, NULL otherwise. Non NULL memptr_cpu pointer on success
+ * in case of DOCA_GPU_MEM_TYPE_CPU_GPU and DOCA_GPU_MEM_TYPE_GPU_CPU, NULL otherwise.
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ * - DOCA_ERROR_NO_MEMORY - if an error occurred dealing with GPU memory.
+ */
+doca_error_t doca_gpu_mem_alloc(struct doca_gpu *gpu_dev, size_t size, size_t alignment,
+                                enum doca_gpu_mem_type mtype, void **memptr_gpu, void **memptr_cpu);
+
+/**
+ * Free a GPU memory buffer.
+ * Only memory allocated with doca_gpu_mem_alloc() can be freed with this function.
+ *
+ * @param [in] gpu
+ * DOCA GPUNetIO handler.
+ * @param [in] memptr_gpu
+ * GPU memory pointer to be freed.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_mem_free(struct doca_gpu *gpu, void *memptr_gpu);
+
+/**
+ * Create a GPU handler for a Verbs QP object
+ *
+ * @param [in] gpu_dev
+ * DOCA GPUNetIO handler.
+ * @param [in] qp
+ * DOCA Verbs QP object to export.
+ * @param [in] nic_handler
+ * Type of NIC handler for this QP.
+ * @param [in] gpu_qp_umem_dev_ptr
+ * GPU external UMEM.
+ * @param [in] cq_sq
+ * DOCA Verbs CQ SQ CPU object connected to the QP.
+ * @param [out] qp_out
+ * DOCA GPUNetIO Verbs QP object.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_export_qp(struct doca_gpu *gpu_dev, struct doca_verbs_qp *qp,
+                                      enum doca_gpu_dev_verbs_nic_handler nic_handler,
+                                      void *gpu_qp_umem_dev_ptr, struct doca_verbs_cq *cq_sq,
+                                      struct doca_gpu_verbs_qp **qp_out);
+
+/**
+ * Destroy a GPU handler for a Verbs QP object
+ *
+ * @param [in] gpu_dev
+ * DOCA GPUNetIO handler.
+ * @param [in] qp
+ * DOCA GPUNetIO Verbs QP object.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_unexport_qp(struct doca_gpu *gpu_dev, struct doca_gpu_verbs_qp *qp);
+
+/**
+ * Get a GPUNetIO GPU device handler from a GPUNetIO Verbs QP object.
+ *
+ * @param [in] qp
+ * DOCA GPUNetIO Verbs QP object.
+ * @param [out] qp_gpu
+ * DOCA GPUNetIO Verbs QP GPU object.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_get_qp_dev(struct doca_gpu_verbs_qp *qp,
+                                       struct doca_gpu_dev_verbs_qp **qp_gpu);
+
+/**
+ * Return a DMABuf file descriptor from a GPU memory address if the GPU device and CUDA installation
+ * support DMABuf.
+ *
+ * @param [in] gpu_dev
+ * DOCA GPUNetIO handler.
+ * @param [in] memptr_gpu
+ * GPU memory pointer to map.
+ * @param [in] size
+ * Size in bytes to map.
+ * @param [out] dmabuf_fd
+ * DMABuf file descriptor
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ * - DOCA_ERROR_NOT_SUPPORTED - DMABuf not supported
+ */
+doca_error_t doca_gpu_dmabuf_fd(struct doca_gpu *gpu_dev, void *memptr_gpu, size_t size,
+                                int *dmabuf_fd);
+
+/**
+ * Check if UAR can be registered on GPU
+ *
+ * @param [in] db
+ * UAR address
+ * @param [out] out_can_register
+ * Can register on GPU
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_can_gpu_register_uar(void *db, bool *out_can_register);
+
+/**
+ * Export UAR to GPU
+ *
+ * @param [in] sq_db
+ * SQ UAR address
+ * @param [out] uar_addr_gpu
+ * SQ UAR GPU address
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ * - DOCA_ERROR_DRIVER - if UAR mapping failed
+ */
+doca_error_t doca_gpu_verbs_export_uar(uint64_t *sq_db, uint64_t **uar_addr_gpu);
+
+/**
+ * Unexport UAR from GPU
+ *
+ * @param [in] uar_addr_gpu
+ * SQ UAR GPU address
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_unexport_uar(uint64_t *uar_addr_gpu);
+
+/**
+ * Progress QP (ring db) in case of CPU proxy mode
+ *
+ * @param [in] qp_cpu
+ * QP to progress
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_cpu_proxy_progress(struct doca_gpu_verbs_qp *qp_cpu);
+
+/**
+ * Create a service object.
+ *
+ * @param [out] out_service
+ * Service handle
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_create_service(doca_gpu_verbs_service_t *out_service);
+
+/**
+ * Monitor a QP and make forward progress.
+ *
+ * @param [in] service
+ * Service object
+ * @param [in] qp
+ * QP to monitor
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_service_monitor_qp(doca_gpu_verbs_service_t service,
+                                               struct doca_gpu_verbs_qp *qp);
+
+/**
+ * Destroy a service object.
+ *
+ * @param [in] service
+ * Service object to destroy
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_destroy_service(doca_gpu_verbs_service_t service);
+
+/**
+ * Query the last error of a GPUNetIO QP
+ *
+ * @param [in] qp
+ * QP to query
+ * @param [out] error_info
+ * Error info
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_query_last_error(struct doca_gpu_verbs_qp *qp,
+                                             struct doca_gpu_verbs_qp_error_info *error_info);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DOCA_GPUNETIO_H */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_gpunetio_high_level.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_gpunetio_high_level.h
new file mode 100644
index 00000000000..8d71f8da2c0
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_gpunetio_high_level.h
@@ -0,0 +1,191 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_high_level.h
+ * @brief A header file for the doca_gpunetio High-level APIs
+ */
+
+#ifndef DOCA_GPUNETIO_HIGH_LEVEL_H
+#define DOCA_GPUNETIO_HIGH_LEVEL_H
+
+#include "doca_gpunetio.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum doca_gpu_verbs_mem_reg_type {
+    DOCA_GPUNETIO_VERBS_MEM_REG_TYPE_DEFAULT =
+        0,  ///< Automatically select the most appropriate method
+    DOCA_GPUNETIO_VERBS_MEM_REG_TYPE_CUDA_DMABUF = 1,   ///< Use CUDA DMABUF to register memory
+    DOCA_GPUNETIO_VERBS_MEM_REG_TYPE_CUDA_PEERMEM = 2,  ///< Use CUDA PeerMem to register memory
+    DOCA_GPUNETIO_VERBS_MEM_REG_TYPE_MAX,               ///< Sentinel value
+};
+
+struct doca_gpu_verbs_qp_init_attr_hl { /* Input attributes of the high-level create calls */
+    struct doca_gpu *gpu_dev; /* DOCA GPU device to use */
+    struct ibv_pd *ibpd; /* ibverbs protection domain */
+    uint16_t sq_nwqe; /* presumably the number of send-queue WQEs -- TODO confirm */
+    enum doca_gpu_dev_verbs_nic_handler nic_handler; /* NIC handler type (see enum) */
+    enum doca_gpu_verbs_mem_reg_type mreg_type; /* Memory registration method (see enum) */
+};
+
+struct doca_gpu_verbs_qp_hl { /* One high-level QP, as produced by doca_gpu_verbs_create_qp_hl() */
+    struct doca_gpu *gpu_dev; /* DOCA GPU device to use */
+
+    // CQ (the cq_sq prefix suggests it serves the send queue -- TODO confirm)
+    struct doca_verbs_cq *cq_sq; /* DOCA Verbs CQ handle */
+    void *cq_sq_umem_gpu_ptr; /* presumably GPU memory for cq_sq_umem -- confirm */
+    struct doca_verbs_umem *cq_sq_umem; /* UMEM handle for the CQ */
+    void *cq_sq_umem_dbr_gpu_ptr; /* presumably GPU memory for cq_sq_umem_dbr -- confirm */
+    struct doca_verbs_umem *cq_sq_umem_dbr; /* UMEM handle for the CQ DBR */
+
+    // QP
+    struct doca_verbs_qp *qp; /* DOCA Verbs QP handle */
+    void *qp_umem_gpu_ptr; /* presumably GPU memory for qp_umem -- confirm */
+    struct doca_verbs_umem *qp_umem; /* UMEM handle for the QP */
+    void *qp_umem_dbr_gpu_ptr; /* presumably GPU memory for qp_umem_dbr -- confirm */
+    struct doca_verbs_umem *qp_umem_dbr; /* UMEM handle for the QP DBR */
+    struct doca_verbs_uar *external_uar; /* DOCA Verbs UAR handle */
+
+    enum doca_gpu_dev_verbs_nic_handler nic_handler; /* NIC handler type (see enum) */
+
+    // QP GPUNetIO Object
+    struct doca_gpu_verbs_qp *qp_gverbs;
+};
+
+struct doca_gpu_verbs_qp_group_hl { /* QP pair created by doca_gpu_verbs_create_qp_group_hl() */
+    struct doca_gpu_verbs_qp_hl qp_main; /* Main QP */
+    struct doca_gpu_verbs_qp_hl qp_companion; /* Companion QP, used for core direct operations */
+};
+
+/**
+ * Create a high-level GPUNetIO QP.
+ * This function encapsulates all required steps using doca verbs and doca gpunetio to
+ * create a GDAKI QP.
+ *
+ * @param [in] qp_init_attr
+ * High-level QP init attributes.
+ * @param [out] qp
+ * Created GPUNetIO high-level QP.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_create_qp_hl(struct doca_gpu_verbs_qp_init_attr_hl *qp_init_attr,
+                                         struct doca_gpu_verbs_qp_hl **qp);
+
+/**
+ * Destroy a high-level GPUNetIO QP.
+ *
+ * @param [in] qp
+ * GPUNetIO high-level QP to destroy
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_destroy_qp_hl(struct doca_gpu_verbs_qp_hl *qp);
+
+/**
+ * Create a high-level GPUNetIO QP group (main and companion).
+ * This function encapsulates all required steps using doca verbs and doca gpunetio to
+ * create two GDAKI QPs: the main one and the one used for core direct operations.
+ * The two QPs share the same UAR.
+ *
+ * @param [in] qp_init_attr
+ * High-level QP init attributes.
+ * @param [out] qpg
+ * Created GPUNetIO high-level QP group.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_create_qp_group_hl(struct doca_gpu_verbs_qp_init_attr_hl *qp_init_attr,
+                                               struct doca_gpu_verbs_qp_group_hl **qpg);
+
+/**
+ * Destroy a high-level GPUNetIO QP group.
+ *
+ * @param [in] qpg
+ * GPUNetIO high-level QP group to destroy
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_destroy_qp_group_hl(struct doca_gpu_verbs_qp_group_hl *qpg);
+
+/**
+ * Create a flat list of GPU QPs.
+ * Copies each struct doca_gpu_dev_verbs_qp inside the struct doca_gpu_verbs_qp_hl into
+ * a GPU array to avoid pointer dereferencing.
+ *
+ * @param [in] qp_list
+ * GPUNetIO high-level QP array
+ * @param [in] num_elems
+ * Number of QPs in qp_list
+ * @param [out] qp_gpu
+ * Array of GPU QP structures.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_qp_flat_list_create_hl(struct doca_gpu_verbs_qp_hl **qp_list,
+                                                   uint32_t num_elems,
+                                                   struct doca_gpu_dev_verbs_qp **qp_gpu);
+
+/**
+ * Destroy a flat list of GPU QPs.
+ *
+ * @param [in] qp_gpu
+ * Array of GPU QP structures.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_gpu_verbs_qp_flat_list_destroy_hl(struct doca_gpu_dev_verbs_qp *qp_gpu);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DOCA_GPUNETIO_HIGH_LEVEL_H */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_verbs.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_verbs.h
new file mode 100644
index 00000000000..a760cb3af26
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/doca_verbs.h
@@ -0,0 +1,2467 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_verbs.h
+ * @brief A header file for the doca_verbs APIs
+ */
+
+#ifndef DOCA_VERBS_H
+#define DOCA_VERBS_H
+
+#include <errno.h>
+
+#include "doca_error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**********************************************************************************************************************
+ * DOCA Verbs opaque types
+ *********************************************************************************************************************/
+/**
+ * Opaque structure representing a DOCA Verbs QP Init Attributes instance.
+ */
+struct doca_verbs_qp_init_attr;
+/**
+ * Opaque structure representing a DOCA Verbs QP Attributes instance.
+ */
+struct doca_verbs_qp_attr;
+/**
+ * Opaque structure representing a DOCA Verbs Queue Pair instance.
+ */
+struct doca_verbs_qp;
+/**
+ * Opaque structure representing a DOCA Verbs CQ Attributes instance.
+ */
+struct doca_verbs_cq_attr;
+/**
+ * Opaque structure representing a DOCA Verbs Completion Queue instance.
+ */
+struct doca_verbs_cq;
+/**
+ * Opaque structure representing a DOCA Verbs Shared Receive Queue instance.
+ */
+struct doca_verbs_srq;
+/**
+ * Opaque structure representing a DOCA Verbs SRQ Init Attributes instance.
+ */
+struct doca_verbs_srq_init_attr;
+/**
+ * Opaque structure representing a DOCA Verbs AH Attributes instance.
+ */
+struct doca_verbs_ah_attr;
+/**
+ * Opaque structure representing a DOCA UMEM instance.
+ */
+struct doca_verbs_umem;
+/**
+ * Opaque structure representing a DOCA UAR instance.
+ */
+struct doca_verbs_uar;
+/**
+ * Opaque structure representing a DOCA Device Attributes instance.
+ */
+struct doca_verbs_device_attr;
+
+/**
+ * @brief Verbs RC QP type define.
+ */
+#define DOCA_VERBS_QP_TYPE_RC 0x0
+
+/**
+ * @brief Verbs QP state.
+ */
+enum doca_verbs_qp_state {
+    DOCA_VERBS_QP_STATE_RST = 0x0,
+    DOCA_VERBS_QP_STATE_INIT = 0x1,
+    DOCA_VERBS_QP_STATE_RTR = 0x2,
+    DOCA_VERBS_QP_STATE_RTS = 0x3,
+    DOCA_VERBS_QP_STATE_ERR = 0x4,
+};
+
+/**
+ * @brief Verbs address type.
+ */
+enum doca_verbs_addr_type {
+    DOCA_VERBS_ADDR_TYPE_IPv4,      /**< IPv4 type */
+    DOCA_VERBS_ADDR_TYPE_IPv6,      /**< IPv6 type */
+    DOCA_VERBS_ADDR_TYPE_IB_GRH,    /**< IB with GRH type */
+    DOCA_VERBS_ADDR_TYPE_IB_NO_GRH, /**< IB without GRH type */
+};
+
+/**
+ * @brief MTU size in bytes.
+ */
+enum doca_verbs_mtu_size {
+    DOCA_VERBS_MTU_SIZE_256_BYTES = 0x0,
+    DOCA_VERBS_MTU_SIZE_512_BYTES = 0x1,
+    DOCA_VERBS_MTU_SIZE_1K_BYTES = 0x2,
+    DOCA_VERBS_MTU_SIZE_2K_BYTES = 0x3,
+    DOCA_VERBS_MTU_SIZE_4K_BYTES = 0x4,
+    DOCA_VERBS_MTU_SIZE_RAW_ETHERNET = 0x5, /* Reserved */
+};
+
+/**
+ * @brief DOCA Verbs UAR allocation type.
+ */
+enum doca_verbs_uar_allocation_type {
+    DOCA_VERBS_UAR_ALLOCATION_TYPE_BLUEFLAME = 0,
+    DOCA_VERBS_UAR_ALLOCATION_TYPE_NONCACHE = 1,
+    DOCA_VERBS_UAR_ALLOCATION_TYPE_NONCACHE_DEDICATED = 2,
+};
+
+/**
+ * @brief CQ overrun
+ */
+enum doca_verbs_cq_overrun {
+    DOCA_VERBS_CQ_DISABLE_OVERRUN = 0, /**< Disable overrun by default. */
+    DOCA_VERBS_CQ_ENABLE_OVERRUN = 1,  /**< Enable overrun. */
+};
+
+/**
+ * @brief DOCA Verbs SRQ type.
+ */
+enum doca_verbs_srq_type {
+    DOCA_VERBS_SRQ_TYPE_LINKED_LIST,
+    DOCA_VERBS_SRQ_TYPE_CONTIGUOUS,
+};
+
+/**
+ * @brief DOCA Verbs Atomic Type.
+ */
+enum doca_verbs_qp_atomic_type {
+    DOCA_VERBS_QP_ATOMIC_MODE_NONE = 0x0,
+    DOCA_VERBS_QP_ATOMIC_MODE_IB_SPEC = 0x1,
+    DOCA_VERBS_QP_ATOMIC_MODE_UP_TO_8BYTES = 0x3
+};
+
+/**
+ * @brief Verbs QP attributes
+ *
+ * @details These defines can be used with doca_verbs_qp_modify() to set QP attributes.
+ * These attributes are used in several QP state transition commands.
+ *
+ * For each command below there are optional and required attributes depending on QP type:
+ * - *->rst:
+ *		QP type RC:
+ *			required: next_state
+ *			optional: NONE
+ *		QP type UC:
+ *			required: next_state
+ *			optional: NONE
+ * - *->err:
+ *		QP type RC:
+ *			required: next_state
+ *			optional: NONE
+ *		QP type UC:
+ *			required: next_state
+ *			optional: NONE
+ * - rst->init:
+ *		QP type RC: required: next_state, allow_remote_write, allow_remote_read,
+ *			allow_atomic, pkey_index, port_num; optional: NONE
+ *		QP type UC: required: next_state, allow_remote_write, pkey_index, port_num;
+ *			optional: NONE
+ * - init->init:
+ *		QP type RC: required: NONE; optional: allow_remote_write, allow_remote_read,
+ *			allow_atomic, pkey_index, port_num
+ *		QP type UC: required: NONE; optional: allow_remote_write, pkey_index,
+ *			port_num
+ * - init->rtr:
+ *		QP type RC:
+ *			required: next_state, rq_psn, dest_qp_num, path_mtu, ah_attr, min_rnr_timer
+ *			optional: allow_remote_write, allow_remote_read, allow_atomic, pkey_index
+ *		QP type UC:
+ *			required: next_state, rq_psn, dest_qp_num, path_mtu, ah_attr
+ *			optional: allow_remote_write, pkey_index
+ * - rtr->rts:
+ *		QP type RC:
+ *			required: next_state, sq_psn, ack_timeout, retry_cnt, rnr_retry
+ *			optional: allow_remote_write, min_rnr_timer
+ *		QP type UC:
+ *			required: next_state, sq_psn
+ *			optional: allow_remote_write
+ * - rts->rts:
+ *		QP type RC:
+ *			required: NONE; optional: allow_remote_write, allow_remote_read,
+ *			allow_atomic, min_rnr_timer, ah_attr
+ *		QP type UC: required: NONE; optional: allow_remote_write, ah_attr
+ *
+ */
+/**
+ * @brief Allow Remote Write attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_WRITE (1 << 0)
+/**
+ * @brief Allow Remote Read attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ (1 << 1)
+/**
+ * @brief PKEY Index attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_PKEY_INDEX (1 << 2)
+/**
+ * @brief Minimum RNR Timer attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_MIN_RNR_TIMER (1 << 3)
+/**
+ * @brief Port Number attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_PORT_NUM (1 << 4)
+/**
+ * @brief Next State attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_NEXT_STATE (1 << 5)
+/**
+ * @brief Current State attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_CURRENT_STATE (1 << 6)
+/**
+ * @brief Path MTU attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_PATH_MTU (1 << 7)
+/**
+ * @brief RQ PSN attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_RQ_PSN (1 << 8)
+/**
+ * @brief SQ PSN attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_SQ_PSN (1 << 9)
+/**
+ * @brief Destination QP attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_DEST_QP_NUM (1 << 10)
+/**
+ * @brief ACK Timeout attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_ACK_TIMEOUT (1 << 11)
+/**
+ * @brief Retry Counter attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_RETRY_CNT (1 << 12)
+/**
+ * @brief RNR Retry attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_RNR_RETRY (1 << 13)
+/**
+ * @brief AH attribute.
+ */
+#define DOCA_VERBS_QP_ATTR_AH_ATTR (1 << 14)
+
+/**
+ * @brief Specifies the length of a GID (Global ID) in bytes.
+ */
+#define DOCA_VERBS_GID_BYTE_LENGTH 16
+
+/**
+ * @brief Invalid dmabuf_fd value. Indicates that the umem must be registered without dmabuf.
+ */
+#define DOCA_VERBS_DMABUF_INVALID_FD 0xFFFFFFFF
+/**
+ * @brief GID struct.
+ */
+struct doca_verbs_gid {
+    uint8_t raw[DOCA_VERBS_GID_BYTE_LENGTH]; /**< The raw value of the GID */
+};
+
+/**********************************************************************************************************************
+ * DOCA Verbs functions
+ *********************************************************************************************************************/
+
+/**
+ * @brief Create a DOCA Verbs QP Init Attributes instance.
+ *
+ * @param [out] verbs_qp_init_attr
+ * Pointer to pointer to be set to point to the created verbs_qp_init_attr instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ */
+doca_error_t doca_verbs_qp_init_attr_create(struct doca_verbs_qp_init_attr **verbs_qp_init_attr);
+
+/**
+ * @brief Destroy a DOCA Verbs QP Init Attributes instance.
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_destroy(struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set pd attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] pd
+ * pd attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_pd(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                            struct ibv_pd *pd);
+
+/**
+ * @brief Get pd attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * pd attribute.
+ */
+struct ibv_pd *doca_verbs_qp_init_attr_get_pd(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set send_cq attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] send_cq
+ * send_cq attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_send_cq(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                                 struct doca_verbs_cq *send_cq);
+
+/**
+ * @brief Get send_cq attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * send_cq attribute.
+ */
+struct doca_verbs_cq *doca_verbs_qp_init_attr_get_send_cq(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set receive_cq attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] receive_cq
+ * receive_cq attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_receive_cq(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, struct doca_verbs_cq *receive_cq);
+
+/**
+ * @brief Get receive_cq attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * receive_cq attribute.
+ */
+struct doca_verbs_cq *doca_verbs_qp_init_attr_get_receive_cq(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set sq_sig_all attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] sq_sig_all
+ * sq_sig_all attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_sq_sig_all(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, int sq_sig_all);
+
+/**
+ * @brief Get sq_sig_all attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * sq_sig_all attribute.
+ */
+int doca_verbs_qp_init_attr_get_sq_sig_all(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set sq_wr attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] sq_wr
+ * sq_wr attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_sq_wr(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                               uint32_t sq_wr);
+
+/**
+ * @brief Get sq_wr attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * sq_wr attribute.
+ */
+uint32_t doca_verbs_qp_init_attr_get_sq_wr(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set rq_wr attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] rq_wr
+ * rq_wr attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_rq_wr(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                               uint32_t rq_wr);
+
+/**
+ * @brief Get rq_wr attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * rq_wr attribute.
+ */
+uint32_t doca_verbs_qp_init_attr_get_rq_wr(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set send_max_sges attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] send_max_sges
+ * send_max_sges attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_send_max_sges(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, uint32_t send_max_sges);
+
+/**
+ * @brief Get send_max_sges attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * send_max_sges attribute.
+ */
+uint32_t doca_verbs_qp_init_attr_get_send_max_sges(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set receive_max_sges attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] receive_max_sges
+ * receive_max_sges attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_receive_max_sges(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, uint32_t receive_max_sges);
+
+/**
+ * @brief Get receive_max_sges attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * receive_max_sges attribute.
+ */
+uint32_t doca_verbs_qp_init_attr_get_receive_max_sges(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set max_inline_data attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] max_inline_data
+ * max_inline_data attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_max_inline_data(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, uint32_t max_inline_data);
+
+/**
+ * @brief Get max_inline_data attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * max_inline_data attribute.
+ */
+uint32_t doca_verbs_qp_init_attr_get_max_inline_data(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set user_index attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] user_index
+ * user_index attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_user_index(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, uint32_t user_index);
+
+/**
+ * @brief Get user_index attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * user_index attribute.
+ */
+uint32_t doca_verbs_qp_init_attr_get_user_index(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set qp_type attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] qp_type
+ * qp_type attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_qp_type(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                                 uint32_t qp_type);
+
+/**
+ * @brief Get qp_type attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * qp_type attribute.
+ */
+uint32_t doca_verbs_qp_init_attr_get_qp_type(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set external umem attributes for verbs_qp_init_attr.
+ *
+ * Setting these attributes means that the user wants to create and provide the umem themselves,
+ * in contrast to the default mode where the umem is created internally.
+ * In that case the user is responsible for allocating enough memory for the umem and freeing it.
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] external_umem
+ * External umem instance.
+ * @param [in] external_umem_offset
+ * The offset in the external umem buffer to set the Work Queue
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_external_umem(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, struct doca_verbs_umem *external_umem,
+    uint64_t external_umem_offset);
+
+/**
+ * @brief Set external DBR umem attributes for verbs_qp_init_attr.
+ *
+ * Setting these attributes means that the user wants to create and provide the dbr umem themselves,
+ * in contrast to the default mode where the dbr umem is created internally.
+ * In that case the user is responsible for allocating enough memory for the umem and freeing it.
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] external_umem
+ * External dbr umem instance.
+ * @param [in] external_umem_offset
+ * The offset in the external dbr umem buffer to set the DBR
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_external_dbr_umem(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, struct doca_verbs_umem *external_umem,
+    uint64_t external_umem_offset);
+
+/**
+ * @brief Get external umem attributes from verbs_qp_init_attr.
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [out] external_umem
+ * External umem instance.
+ * @param [out] external_umem_offset
+ * The offset in the external umem buffer to set the Work Queue
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_get_external_umem(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+    struct doca_verbs_umem **external_umem, uint64_t *external_umem_offset);
+
+/**
+ * @brief Set external uar attribute for verbs_qp_init_attr.
+ *
+ * Setting this attribute means that the user wants to create and provide the uar themselves,
+ * in contrast to the default mode where the uar is created internally.
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] external_uar
+ * External uar instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_external_uar(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, struct doca_verbs_uar *external_uar);
+
+/**
+ * @brief Get external uar attribute from verbs_qp_init_attr.
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [out] external_uar
+ * External uar instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_get_external_uar(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr, struct doca_verbs_uar **external_uar);
+
+/**
+ * @brief Set qp_context attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] qp_context
+ * qp_context attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_qp_context(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, void *qp_context);
+
+/**
+ * @brief Get qp_context attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * qp_context attribute.
+ */
+void *doca_verbs_qp_init_attr_get_qp_context(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set srq attribute for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] srq
+ * srq attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_srq(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                             struct doca_verbs_srq *srq);
+
+/**
+ * @brief Get srq attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * srq attribute.
+ */
+struct doca_verbs_srq *doca_verbs_qp_init_attr_get_srq(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Set CORE direct for verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ * @param [in] core_direct_master
+ * Set core direct attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_core_direct_master(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, uint8_t core_direct_master);
+
+/**
+ * @brief Get core_direct_master attribute from verbs_qp_init_attr
+ *
+ * @param [in] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * core_direct_master attribute.
+ */
+uint8_t doca_verbs_qp_init_attr_get_core_direct_master(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Create a DOCA Verbs QP Attributes instance.
+ *
+ * @param [out] verbs_qp_attr
+ * Pointer to pointer to be set to point to the created verbs_qp_attr instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ */
+doca_error_t doca_verbs_qp_attr_create(struct doca_verbs_qp_attr **verbs_qp_attr);
+
+/**
+ * @brief Destroy a DOCA Verbs QP Attributes instance.
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_destroy(struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set next_state attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] next_state
+ * next_state attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_next_state(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                               enum doca_verbs_qp_state next_state);
+
+/**
+ * @brief Get next_state attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * next_state attribute.
+ */
+enum doca_verbs_qp_state doca_verbs_qp_attr_get_next_state(
+    const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set current_state attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] current_state
+ * current_state attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_current_state(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                  enum doca_verbs_qp_state current_state);
+
+/**
+ * @brief Get current_state attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * current_state attribute.
+ */
+enum doca_verbs_qp_state doca_verbs_qp_attr_get_current_state(
+    const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set path_mtu attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] path_mtu
+ * path_mtu attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_path_mtu(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                             enum doca_verbs_mtu_size path_mtu);
+
+/**
+ * @brief Get path_mtu attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * path_mtu attribute.
+ */
+enum doca_verbs_mtu_size doca_verbs_qp_attr_get_path_mtu(
+    const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set rq_psn attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] rq_psn
+ * rq_psn attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_rq_psn(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                           uint32_t rq_psn);
+
+/**
+ * @brief Get rq_psn attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * rq_psn attribute.
+ */
+uint32_t doca_verbs_qp_attr_get_rq_psn(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set sq_psn attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] sq_psn
+ * sq_psn attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_sq_psn(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                           uint32_t sq_psn);
+
+/**
+ * @brief Get sq_psn attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * sq_psn attribute.
+ */
+uint32_t doca_verbs_qp_attr_get_sq_psn(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set dest_qp_num attribute for verbs_qp_attr
+ * @note The destination QP number is used to establish a connection with the destination QP
+ * during the QP state modification.
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] dest_qp_num
+ * dest_qp_num attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_dest_qp_num(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                uint32_t dest_qp_num);
+
+/**
+ * @brief Get dest_qp_num attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * dest_qp_num attribute.
+ */
+uint32_t doca_verbs_qp_attr_get_dest_qp_num(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set allow_remote_write attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] allow_remote_write
+ * allow_remote_write attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_allow_remote_write(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                       int allow_remote_write);
+
+/**
+ * @brief Get allow_remote_write attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * allow_remote_write attribute.
+ */
+int doca_verbs_qp_attr_get_allow_remote_write(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set allow_remote_read attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] allow_remote_read
+ * allow_remote_read attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_allow_remote_read(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                      int allow_remote_read);
+
+/**
+ * @brief Get allow_remote_read attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * allow_remote_read attribute.
+ */
+int doca_verbs_qp_attr_get_allow_remote_read(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set allow_remote_atomic attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] allow_atomic_type
+ * allow_remote_atomic attribute (see enum doca_verbs_qp_atomic_type).
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_allow_remote_atomic(
+    struct doca_verbs_qp_attr *verbs_qp_attr, enum doca_verbs_qp_atomic_type allow_atomic_type);
+
+/**
+ * @brief Get allow_remote_atomic attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * allow_remote_atomic attribute.
+ */
+enum doca_verbs_qp_atomic_type doca_verbs_qp_attr_get_allow_remote_atomic(
+    const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set ah_attr attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] ah_attr
+ * ah_attr attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_ah_attr(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                            struct doca_verbs_ah_attr *ah_attr);
+
+/**
+ * @brief Get ah_attr attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * ah_attr attribute.
+ */
+struct doca_verbs_ah_attr *doca_verbs_qp_attr_get_ah_attr(
+    const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set pkey_index attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] pkey_index
+ * pkey_index attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_pkey_index(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                               uint16_t pkey_index);
+
+/**
+ * @brief Get pkey_index attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * pkey_index attribute.
+ */
+uint16_t doca_verbs_qp_attr_get_pkey_index(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set port_num attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] port_num
+ * port_num attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_port_num(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                             uint16_t port_num);
+
+/**
+ * @brief Get port_num attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * port_num attribute.
+ */
+uint16_t doca_verbs_qp_attr_get_port_num(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set ack_timeout attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] ack_timeout
+ * ack_timeout attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_ack_timeout(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                uint16_t ack_timeout);
+
+/**
+ * @brief Get ack_timeout attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * ack_timeout attribute.
+ */
+uint16_t doca_verbs_qp_attr_get_ack_timeout(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set retry_cnt attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] retry_cnt
+ * retry_cnt attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_retry_cnt(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                              uint16_t retry_cnt);
+
+/**
+ * @brief Get retry_cnt attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * retry_cnt attribute.
+ */
+uint16_t doca_verbs_qp_attr_get_retry_cnt(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set rnr_retry attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] rnr_retry
+ * rnr_retry attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_rnr_retry(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                              uint16_t rnr_retry);
+
+/**
+ * @brief Get rnr_retry attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * rnr_retry attribute.
+ */
+uint16_t doca_verbs_qp_attr_get_rnr_retry(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Set min_rnr_timer attribute for verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] min_rnr_timer
+ * min_rnr_timer attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_attr_set_min_rnr_timer(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                  uint16_t min_rnr_timer);
+
+/**
+ * @brief Get min_rnr_timer attribute from verbs_qp_attr
+ *
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ *
+ * @return
+ * min_rnr_timer attribute.
+ */
+uint16_t doca_verbs_qp_attr_get_min_rnr_timer(const struct doca_verbs_qp_attr *verbs_qp_attr);
+
+/**
+ * @brief Create a DOCA Verbs AH Attributes instance.
+ *
+ * @param [in] context
+ * Pointer to ibv_context instance.
+ * @param [out] verbs_ah
+ * Pointer to pointer to be set to point to the created verbs_ah instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ */
+doca_error_t doca_verbs_ah_attr_create(struct ibv_context *context,
+                                       struct doca_verbs_ah_attr **verbs_ah);
+
+/**
+ * @brief Destroy a DOCA Verbs AH Attributes instance.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_ah_attr_destroy(struct doca_verbs_ah_attr *verbs_ah);
+
+/**
+ * @brief Set gid attribute for verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ * @param [in] gid
+ * gid attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_ah_attr_set_gid(struct doca_verbs_ah_attr *verbs_ah,
+                                        struct doca_verbs_gid gid);
+
+/**
+ * @brief Get gid attribute from verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ *
+ * @return
+ * gid attribute.
+ */
+struct doca_verbs_gid doca_verbs_ah_get_gid(const struct doca_verbs_ah_attr *verbs_ah);
+
+/**
+ * @brief Set addr_type attribute for verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ * @param [in] addr_type
+ * addr_type attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_ah_attr_set_addr_type(struct doca_verbs_ah_attr *verbs_ah,
+                                              enum doca_verbs_addr_type addr_type);
+
+/**
+ * @brief Get addr_type attribute from verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ *
+ * @return
+ * addr_type attribute.
+ */
+enum doca_verbs_addr_type doca_verbs_ah_get_addr_type(const struct doca_verbs_ah_attr *verbs_ah);
+
+/**
+ * @brief Set dlid attribute for verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ * @param [in] dlid
+ * dlid attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_ah_attr_set_dlid(struct doca_verbs_ah_attr *verbs_ah, uint32_t dlid);
+
+/**
+ * @brief Get dlid attribute from verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ *
+ * @return
+ * dlid attribute.
+ */
+uint32_t doca_verbs_ah_get_dlid(const struct doca_verbs_ah_attr *verbs_ah);
+
+/**
+ * @brief Set sl attribute for verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ * @param [in] sl
+ * sl attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_ah_attr_set_sl(struct doca_verbs_ah_attr *verbs_ah, uint8_t sl);
+
+/**
+ * @brief Get sl attribute from verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ *
+ * @return
+ * sl attribute.
+ */
+uint8_t doca_verbs_ah_get_sl(const struct doca_verbs_ah_attr *verbs_ah);
+
+/**
+ * @brief Set sgid_index attribute for verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ * @param [in] sgid_index
+ * sgid_index attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_ah_attr_set_sgid_index(struct doca_verbs_ah_attr *verbs_ah,
+                                               uint8_t sgid_index);
+
+/**
+ * @brief Get sgid_index attribute from verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ *
+ * @return
+ * sgid_index attribute.
+ */
+uint8_t doca_verbs_ah_get_sgid_index(const struct doca_verbs_ah_attr *verbs_ah);
+
+/**
+ * @brief Set static_rate attribute for verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ * @param [in] static_rate
+ * static_rate attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_ah_attr_set_static_rate(struct doca_verbs_ah_attr *verbs_ah,
+                                                uint8_t static_rate);
+
+/**
+ * @brief Get static_rate attribute from verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ *
+ * @return
+ * static_rate attribute.
+ */
+uint8_t doca_verbs_ah_get_static_rate(const struct doca_verbs_ah_attr *verbs_ah);
+
+/**
+ * @brief Set hop_limit attribute for verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ * @param [in] hop_limit
+ * hop_limit attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_ah_attr_set_hop_limit(struct doca_verbs_ah_attr *verbs_ah,
+                                              uint8_t hop_limit);
+
+/**
+ * @brief Get hop_limit attribute from verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ *
+ * @return
+ * hop_limit attribute.
+ */
+uint8_t doca_verbs_ah_get_hop_limit(const struct doca_verbs_ah_attr *verbs_ah);
+
+/**
+ * @brief Set traffic_class attribute for verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ * @param [in] traffic_class
+ * traffic_class attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_ah_attr_set_traffic_class(struct doca_verbs_ah_attr *verbs_ah,
+                                                  uint8_t traffic_class);
+
+/**
+ * @brief Get traffic_class attribute from verbs_ah.
+ *
+ * @param [in] verbs_ah
+ * Pointer to verbs_ah instance.
+ *
+ * @return
+ * traffic_class attribute.
+ */
+uint8_t doca_verbs_ah_get_traffic_class(const struct doca_verbs_ah_attr *verbs_ah);
+
+/**
+ * @brief Create a DOCA Verbs Queue Pair instance.
+ *
+ * @param [in] context
+ * Pointer to ibv_context instance.
+ * @param [in] verbs_qp_init_attr
+ * Pointer to qp_init_attr instance.
+ * @param [out] verbs_qp
+ * Pointer to pointer to be set to point to the created verbs_qp instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ */
+doca_error_t doca_verbs_qp_create(struct ibv_context *context,
+                                  struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                  struct doca_verbs_qp **verbs_qp);
+
+/**
+ * @brief Destroy a DOCA Verbs Queue Pair instance.
+ *
+ * @param [in] verbs_qp
+ * Pointer to verbs_qp instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_qp_destroy(struct doca_verbs_qp *verbs_qp);
+
+/**
+ * @brief Modify a DOCA Verbs Queue Pair instance.
+ *
+ * @param [in] verbs_qp
+ * Pointer to verbs_qp instance.
+ * @param [in] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [in] attr_mask
+ * Mask for QP attributes. See the DOCA_VERBS_QP_ATTR_* definitions.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ */
+doca_error_t doca_verbs_qp_modify(struct doca_verbs_qp *verbs_qp,
+                                  struct doca_verbs_qp_attr *verbs_qp_attr, int attr_mask);
+
+/**
+ * @brief Query the attributes of a DOCA Verbs Queue Pair instance.
+ *
+ * @param [in] verbs_qp
+ * Pointer to verbs_qp instance.
+ * @param [out] verbs_qp_attr
+ * Pointer to verbs_qp_attr instance.
+ * @param [out] verbs_qp_init_attr
+ * Pointer to verbs_qp_init_attr instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ */
+doca_error_t doca_verbs_qp_query(struct doca_verbs_qp *verbs_qp,
+                                 struct doca_verbs_qp_attr *verbs_qp_attr,
+                                 struct doca_verbs_qp_init_attr *verbs_qp_init_attr);
+
+/**
+ * @brief Get the Work Queue attributes of a DOCA Verbs Queue Pair instance.
+ *
+ * @param [in] verbs_qp
+ * Pointer to verbs_qp instance.
+ * @param [out] sq_buf
+ * Pointer to Send Queue buffer.
+ * @param [out] sq_num_entries
+ * The number of entries in Send Queue buffer.
+ * @param [out] rq_buf
+ * Pointer to Receive Queue buffer.
+ * @param [out] rq_num_entries
+ * The number of entries in Receive Queue buffer.
+ * @param [out] rwqe_size_bytes
+ * Receive WQE size in bytes.
+ *
+ */
+void doca_verbs_qp_get_wq(const struct doca_verbs_qp *verbs_qp, void **sq_buf,
+                          uint32_t *sq_num_entries, void **rq_buf, uint32_t *rq_num_entries,
+                          uint32_t *rwqe_size_bytes);
+
+/**
+ * @brief Get the DBR address of a DOCA Verbs Queue Pair instance.
+ *
+ * @param [in] verbs_qp
+ * Pointer to verbs_qp instance.
+ *
+ * @return
+ * The DBR address.
+ */
+void *doca_verbs_qp_get_dbr_addr(const struct doca_verbs_qp *verbs_qp);
+
+/**
+ * @brief Get the UAR address of a DOCA Verbs Queue Pair instance.
+ *
+ * @param [in] verbs_qp
+ * Pointer to verbs_qp instance.
+ *
+ * @return
+ * The UAR register address.
+ */
+void *doca_verbs_qp_get_uar_addr(const struct doca_verbs_qp *verbs_qp);
+
+/**
+ * @brief Get the QP number of a DOCA Verbs Queue Pair instance.
+ *
+ * @param [in] verbs_qp
+ * Pointer to verbs_qp instance.
+ *
+ * @return
+ * The QP number.
+ */
+uint32_t doca_verbs_qp_get_qpn(const struct doca_verbs_qp *verbs_qp);
+
+/**
+ * @brief Create a DOCA Verbs CQ Attributes instance.
+ *
+ * @param [out] verbs_cq_attr
+ * Pointer to pointer to be set to point to the created verbs_cq_attr instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ */
+doca_error_t doca_verbs_cq_attr_create(struct doca_verbs_cq_attr **verbs_cq_attr);
+
+/**
+ * @brief Destroy a DOCA Verbs CQ Attributes instance.
+ *
+ * @param [in] verbs_cq_attr
+ * Pointer to verbs_cq_attr instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_cq_attr_destroy(struct doca_verbs_cq_attr *verbs_cq_attr);
+
+/**
+ * @brief Set cq_size attribute for doca_verbs_cq_attr.
+ *
+ * @param [in] cq_attr
+ * Pointer to doca_verbs_cq_attr instance.
+ * @param [in] cq_size
+ * cq size (num entries).
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_cq_attr_set_cq_size(struct doca_verbs_cq_attr *cq_attr, uint32_t cq_size);
+
+/**
+ * @brief Set cq_context attribute for doca_verbs_cq_attr.
+ *
+ * @param [in] cq_attr
+ * Pointer to doca_verbs_cq_attr instance.
+ * @param [in] cq_context
+ * User data. cq_context may be NULL if the application does not wish to set user data.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_cq_attr_set_cq_context(struct doca_verbs_cq_attr *cq_attr,
+                                               void *cq_context);
+
+/**
+ * @brief Set external umem attribute for doca_verbs_cq_attr.
+ *
+ * Setting this attribute means that the user creates and provides the umem themselves,
+ * as opposed to the default mode in which the umem is created internally. In that case it is
+ * the user's responsibility to allocate enough memory for the umem and to free it.
+ *
+ * @param [in] cq_attr
+ * Pointer to doca_verbs_cq_attr instance.
+ * @param [in] external_umem
+ * External umem instance.
+ * @param [in] external_umem_offset
+ * The offset in the external umem buffer to set the Completion Queue.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_cq_attr_set_external_umem(struct doca_verbs_cq_attr *cq_attr,
+                                                  struct doca_verbs_umem *external_umem,
+                                                  uint64_t external_umem_offset);
+
+/**
+ * @brief Set external dbr umem attribute for doca_verbs_cq_attr.
+ *
+ * Setting this attribute means that the user creates and provides the dbr umem themselves,
+ * as opposed to the default mode in which the umem is created internally. In that case it is
+ * the user's responsibility to allocate enough memory for the umem and to free it.
+ *
+ * @param [in] cq_attr
+ * Pointer to doca_verbs_cq_attr instance.
+ * @param [in] external_umem
+ * External umem instance.
+ * @param [in] external_umem_offset
+ * The offset in the external umem buffer to set the Completion Queue doorbell record (DBR).
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_cq_attr_set_external_dbr_umem(struct doca_verbs_cq_attr *cq_attr,
+                                                      struct doca_verbs_umem *external_umem,
+                                                      uint64_t external_umem_offset);
+
+/**
+ * @brief Set external uar attribute for doca_verbs_cq_attr.
+ *
+ * Setting this attribute means that the user provides an external uar themselves,
+ * as opposed to the default mode in which the uar is created internally.
+ *
+ * @param [in] cq_attr
+ * Pointer to doca_verbs_cq_attr instance.
+ * @param [in] external_uar
+ * External uar.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_cq_attr_set_external_uar(struct doca_verbs_cq_attr *cq_attr,
+                                                 struct doca_verbs_uar *external_uar);
+
+/**
+ * @brief Set cq_overrun attribute for doca_verbs_cq_attr.
+ *
+ * @param [in] cq_attr
+ * Pointer to doca_verbs_cq_attr instance.
+ * @param [in] overrun
+ * enable or disable overrun (@see doca_verbs_cq_overrun).
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_cq_attr_set_cq_overrun(struct doca_verbs_cq_attr *cq_attr,
+                                               enum doca_verbs_cq_overrun overrun);
+/**
+ * @brief Create a DOCA Verbs Completion Queue instance.
+ *
+ * @param [in] context
+ * Pointer to ibv_context instance.
+ * @param [in] verbs_cq_attr
+ * Pointer to verbs_cq_attr instance.
+ * @param [out] verbs_cq
+ * Pointer to pointer to be set to point to the created doca_verbs_cq instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ */
+doca_error_t doca_verbs_cq_create(struct ibv_context *context,
+                                  struct doca_verbs_cq_attr *verbs_cq_attr,
+                                  struct doca_verbs_cq **verbs_cq);
+
+/**
+ * @brief Destroy a DOCA Verbs Completion Queue instance.
+ *
+ * @param [in] verbs_cq
+ * Pointer to verbs_cq instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_cq_destroy(struct doca_verbs_cq *verbs_cq);
+
+/**
+ * @brief Get the Completion Queue attributes of a DOCA Verbs Completion Queue instance.
+ *
+ * @param [in] verbs_cq
+ * Pointer to verbs_cq instance.
+ * @param [out] cq_buf
+ * Pointer to Completion Queue buffer.
+ * @param [out] cq_num_entries
+ * The number of entries in Completion Queue buffer.
+ * @param [out] cq_entry_size
+ * The size of each entry in Completion Queue buffer.
+ *
+ */
+void doca_verbs_cq_get_wq(struct doca_verbs_cq *verbs_cq, void **cq_buf, uint32_t *cq_num_entries,
+                          uint8_t *cq_entry_size);
+
+/**
+ * @brief Get the DBR address of a DOCA Verbs Completion Queue instance.
+ *
+ * @param [in] verbs_cq
+ * Pointer to verbs_cq instance.
+ * @param [out] uar_db_reg
+ * Pointer to the UAR doorbell record
+ * @param [out] ci_dbr
+ * Pointer to the CI doorbell record
+ * @param [out] arm_dbr
+ * Pointer to the arm doorbell record
+ */
+void doca_verbs_cq_get_dbr_addr(struct doca_verbs_cq *verbs_cq, uint64_t **uar_db_reg,
+                                uint32_t **ci_dbr, uint32_t **arm_dbr);
+
+/**
+ * @brief Get the CQ number of a DOCA Verbs CQ instance.
+ *
+ * @param [in] verbs_cq
+ * Pointer to verbs_cq instance.
+ *
+ * @return
+ * The CQ number.
+ */
+uint32_t doca_verbs_cq_get_cqn(const struct doca_verbs_cq *verbs_cq);
+
+/**
+ * @brief Create a DOCA Verbs SRQ Init Attributes instance.
+ *
+ * @param [out] verbs_srq_init_attr
+ * Pointer to pointer to be set to point to the created verbs_srq_init_attr instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ */
+doca_error_t doca_verbs_srq_init_attr_create(struct doca_verbs_srq_init_attr **verbs_srq_init_attr);
+
+/**
+ * @brief Destroy a DOCA Verbs SRQ Init Attributes instance.
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_srq_init_attr_destroy(struct doca_verbs_srq_init_attr *verbs_srq_init_attr);
+
+/**
+ * @brief Set srq_wr attribute for verbs_srq_init_attr
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ * @param [in] srq_wr
+ * srq_wr attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_srq_init_attr_set_srq_wr(
+    struct doca_verbs_srq_init_attr *verbs_srq_init_attr, uint32_t srq_wr);
+
+/**
+ * @brief Get srq_wr attribute from verbs_srq_init_attr
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ *
+ * @return
+ * srq_wr attribute.
+ */
+uint32_t doca_verbs_srq_init_attr_get_srq_wr(
+    const struct doca_verbs_srq_init_attr *verbs_srq_init_attr);
+/**
+ * @brief Set receive_max_sges attribute for verbs_srq_init_attr
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ * @param [in] receive_max_sges
+ * receive_max_sges attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_srq_init_attr_set_receive_max_sges(
+    struct doca_verbs_srq_init_attr *verbs_srq_init_attr, uint32_t receive_max_sges);
+
+/**
+ * @brief Get receive_max_sges attribute from verbs_srq_init_attr
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ *
+ * @return
+ * receive_max_sges attribute.
+ */
+uint32_t doca_verbs_srq_init_attr_get_receive_max_sges(
+    const struct doca_verbs_srq_init_attr *verbs_srq_init_attr);
+
+/**
+ * @brief Set srq_type attribute for verbs_srq_init_attr
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ * @param [in] srq_type
+ * srq_type attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_srq_init_attr_set_type(struct doca_verbs_srq_init_attr *verbs_srq_init_attr,
+                                               enum doca_verbs_srq_type srq_type);
+
+/**
+ * @brief Get srq_type attribute from verbs_srq_init_attr
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ *
+ * @return
+ * srq_type attribute.
+ */
+enum doca_verbs_srq_type doca_verbs_srq_init_attr_get_type(
+    const struct doca_verbs_srq_init_attr *verbs_srq_init_attr);
+
+/**
+ * @brief Set pd attribute for verbs_srq_init_attr
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ * @param [in] pd
+ * pd attribute.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_srq_init_attr_set_pd(struct doca_verbs_srq_init_attr *verbs_srq_init_attr,
+                                             struct ibv_pd *pd);
+
+/**
+ * @brief Get pd attribute from verbs_srq_init_attr
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ *
+ * @return
+ * pd attribute.
+ */
+struct ibv_pd *doca_verbs_srq_init_attr_get_pd(
+    const struct doca_verbs_srq_init_attr *verbs_srq_init_attr);
+
+/**
+ * @brief Set external umem attributes for verbs_srq_init_attr.
+ *
+ * Setting these attributes means that the user creates and provides the umem, in contrast to
+ * the default mode where the umem is created internally. In that case it is the user's
+ * responsibility to allocate enough memory for the umem and to free it.
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ * @param [in] external_umem
+ * External umem instance.
+ * @param [in] external_umem_offset
+ * The offset in the external umem buffer to set the Work Queue
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_srq_init_attr_set_external_umem(
+    struct doca_verbs_srq_init_attr *verbs_srq_init_attr, struct doca_verbs_umem *external_umem,
+    uint64_t external_umem_offset);
+
+/**
+ * @brief Get external umem attributes from verbs_srq_init_attr.
+ *
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ * @param [out] external_umem
+ * External umem instance.
+ * @param [out] external_umem_offset
+ * The offset in the external umem buffer to set the Work Queue
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_srq_init_attr_get_external_umem(
+    const struct doca_verbs_srq_init_attr *verbs_srq_init_attr,
+    struct doca_verbs_umem **external_umem, uint64_t *external_umem_offset);
+
+/**
+ * @brief Create a DOCA Verbs Shared Receive Queue instance.
+ *
+ * @param [in] verbs_context
+ * Pointer to ibv_context instance.
+ * @param [in] verbs_srq_init_attr
+ * Pointer to verbs_srq_init_attr instance.
+ * @param [out] verbs_srq
+ * Pointer to pointer to be set to point to the created verbs_srq instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ */
+doca_error_t doca_verbs_srq_create(struct ibv_context *verbs_context,
+                                   struct doca_verbs_srq_init_attr *verbs_srq_init_attr,
+                                   struct doca_verbs_srq **verbs_srq);
+
+/**
+ * @brief Destroy a DOCA Verbs Shared Receive Queue instance.
+ *
+ * @param [in] verbs_srq
+ * Pointer to verbs_srq instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_srq_destroy(struct doca_verbs_srq *verbs_srq);
+
+/**
+ * @brief Get the SRQ number of a DOCA Verbs Shared Receive Queue instance.
+ *
+ * @param [in] verbs_srq
+ * Pointer to verbs_srq instance.
+ *
+ * @return
+ * The SRQ number.
+ */
+uint32_t doca_verbs_srq_get_srqn(const struct doca_verbs_srq *verbs_srq);
+
+/**
+ * @brief Get the Work Queue attributes of a DOCA Verbs Shared Receive Queue instance.
+ *
+ * @param [in] verbs_srq
+ * Pointer to verbs_srq instance.
+ * @param [out] srq_buf
+ * Pointer to Shared Receive Queue buffer.
+ * @param [out] srq_num_entries
+ * The number of entries in Shared Receive Queue buffer.
+ * @param [out] rwqe_size_bytes
+ * Receive WQE size in bytes.
+ *
+ */
+void doca_verbs_srq_get_wq(const struct doca_verbs_srq *verbs_srq, void **srq_buf,
+                           uint32_t *srq_num_entries, uint32_t *rwqe_size_bytes);
+
+/**
+ * @brief Get the DBR address of a DOCA Verbs Shared Receive Queue instance.
+ *
+ * @param [in] verbs_srq
+ * Pointer to verbs_srq instance.
+ *
+ * @return
+ * The DBR address.
+ */
+void *doca_verbs_srq_get_dbr_addr(const struct doca_verbs_srq *verbs_srq);
+
+/**********************************************************************************************************************
+ * Capabilities functions
+ *********************************************************************************************************************/
+
+/**
+ * @brief Query DOCA Verbs device attributes.
+ *
+ * @param [in] context
+ * Pointer to ibv_context instance.
+ * @param [out] verbs_device_attr
+ * Pointer to pointer to be set to point to the created verbs_device_attr instance.
+ * User is expected to free this object with "doca_verbs_device_attr_free()".
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_NO_MEMORY - failed to allocate resources.
+ * - DOCA_ERROR_DRIVER - low level layer failure.
+ */
+doca_error_t doca_verbs_query_device(struct ibv_context *context,
+                                     struct doca_verbs_device_attr **verbs_device_attr);
+
+/**
+ * @brief Free a DOCA Verbs Device Attributes instance.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_device_attr_free(struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of QPs supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of QPs supported by the device.
+ */
+uint32_t doca_verbs_device_attr_get_max_qp(const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of work requests on send/receive queue supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of work requests on send/receive queue supported by the device.
+ */
+uint32_t doca_verbs_device_attr_get_max_qp_wr(
+    const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of scatter/gather entries per send/receive work request in a QP
+ * other than RD supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of scatter/gather entries per send/receive work request in a QP other than RD
+ * supported by the device.
+ *
+ */
+uint32_t doca_verbs_device_attr_get_max_sge(const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of CQs supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of CQs supported by the device.
+ */
+uint32_t doca_verbs_device_attr_get_max_cq(const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of entries on CQ supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of entries on CQ supported by the device.
+ */
+uint32_t doca_verbs_device_attr_get_max_cqe(const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of MRs supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of MRs supported by the device.
+ */
+uint32_t doca_verbs_device_attr_get_max_mr(const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of PDs supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of PDs supported by the device.
+ */
+uint32_t doca_verbs_device_attr_get_max_pd(const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of AHs supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of AHs supported by the device.
+ */
+uint32_t doca_verbs_device_attr_get_max_ah(const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of SRQs supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of SRQs supported by the device.
+ */
+uint32_t doca_verbs_device_attr_get_max_srq(const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of work requests on SRQ supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of work requests on SRQ supported by the device.
+ */
+uint32_t doca_verbs_device_attr_get_max_srq_wr(
+    const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of scatter entries per receive work request in a SRQ supported by
+ * the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of scatter entries per receive work request in a SRQ supported by the device.
+ */
+uint32_t doca_verbs_device_attr_get_max_srq_sge(
+    const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Get the maximum number of partitions supported by the device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ *
+ * @return
+ * The max number of partitions supported by the device.
+ */
+uint16_t doca_verbs_device_attr_get_max_pkeys(
+    const struct doca_verbs_device_attr *verbs_device_attr);
+
+/**
+ * @brief Check if a given QP type is supported on this device.
+ *
+ * @param [in] verbs_device_attr
+ * Pointer to doca_verbs_device_attr instance.
+ * @param [in] qp_type
+ * The QP type to check its support.
+ *
+ * @return
+ * DOCA_SUCCESS - in case QP type is supported.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid parameter was given.
+ * - DOCA_ERROR_NOT_SUPPORTED - if QP type is not supported.
+ */
+doca_error_t doca_verbs_device_attr_get_is_qp_type_supported(
+    const struct doca_verbs_device_attr *verbs_device_attr, uint32_t qp_type);
+
+/**
+ * @brief Create an instance of DOCA Verbs UMEM.
+ *
+ * @param [in] context
+ * Pointer to ibv_context instance.
+ * @param [in] address
+ * The umem address.
+ * @param [in] size
+ * The umem size.
+ * @param [in] access_flags
+ * The umem access flags.
+ * @param [in] dmabuf_id
+ * The umem dmabuf file descriptor id.
+ * @param [in] dmabuf_offset
+ * The umem dmabuf offset.
+ * @param [out] umem_obj
+ * The umem object
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ * - DOCA_ERROR_DRIVER - in case of error in a DOCA driver call.
+ */
+doca_error_t doca_verbs_umem_create(struct ibv_context *context, void *address, size_t size,
+                                    uint32_t access_flags, int dmabuf_id, size_t dmabuf_offset,
+                                    struct doca_verbs_umem **umem_obj);
+
+/**
+ * @brief Destroy an instance of DOCA Verbs UMEM.
+ *
+ * @param [in] umem_obj
+ * Pointer to the umem instance.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ * - DOCA_ERROR_DRIVER - in case of error in a DOCA driver call.
+ */
+doca_error_t doca_verbs_umem_destroy(struct doca_verbs_umem *umem_obj);
+
+/**
+ * @brief This method retrieves the umem id
+ *
+ * @param [in] umem_obj
+ * Pointer to the umem instance.
+ * @param [out] umem_id
+ * the umem id.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_verbs_umem_get_id(const struct doca_verbs_umem *umem_obj, uint32_t *umem_id);
+
+/**
+ * @brief This method retrieves the umem size
+ *
+ * @param [in] umem_obj
+ * Pointer to the umem instance.
+ * @param [out] umem_size
+ * the umem size.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_verbs_umem_get_size(const struct doca_verbs_umem *umem_obj, size_t *umem_size);
+
+/**
+ * @brief This method retrieves the umem address
+ *
+ * @param [in] umem_obj
+ * Pointer to the umem instance.
+ * @param [out] umem_address
+ * the umem address.
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - if an invalid input had been received.
+ */
+doca_error_t doca_verbs_umem_get_address(const struct doca_verbs_umem *umem_obj,
+                                         void **umem_address);
+
+/**
+ * @brief Create a UAR object
+ *
+ * @param [in] context
+ * Pointer to ibv_context
+ * @param [in] allocation_type
+ * The UAR allocation type (enum doca_verbs_uar_allocation_type)
+ * @param [out] uar_obj
+ * UAR object
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ * - DOCA_ERROR_DRIVER - in case of error in a DOCA driver call.
+ */
+doca_error_t doca_verbs_uar_create(struct ibv_context *context,
+                                   enum doca_verbs_uar_allocation_type allocation_type,
+                                   struct doca_verbs_uar **uar_obj);
+
+/**
+ * @brief Destroy a UAR object
+ *
+ * @param [in] uar_obj
+ * UAR object
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_uar_destroy(struct doca_verbs_uar *uar_obj);
+
+/**
+ * @brief This method retrieves the UAR ID
+ *
+ * @param [in] uar
+ * UAR object
+ * @param [out] id
+ * The UAR ID
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_uar_id_get(const struct doca_verbs_uar *uar, uint32_t *id);
+
+/**
+ * @brief This method retrieves the uar register address
+ *
+ * @param [in] uar_obj
+ * UAR object
+ * @param [out] reg_addr
+ * UAR register address
+ *
+ * @return
+ * DOCA_SUCCESS - in case of success.
+ * doca_error code - in case of failure:
+ * - DOCA_ERROR_INVALID_VALUE - received invalid input.
+ */
+doca_error_t doca_verbs_uar_reg_addr_get(const struct doca_verbs_uar *uar_obj, void **reg_addr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DOCA_VERBS_H */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/mlx5_ifc.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/mlx5_ifc.h
new file mode 100644
index 00000000000..112f7809aab
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/mlx5_ifc.h
@@ -0,0 +1,5693 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef MLX5_IFC_H
+#define MLX5_IFC_H
+
+#define u8 uint8_t /* one u8 array element stands for one bit in the mlx5_ifc_*_bits structs */
+/* Internal helpers: sizeof()/offsetof() on a *_bits struct therefore yield sizes/offsets in bits. */
+#define __mlx5_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)NULL)
+#define __mlx5_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits)
+#define __mlx5_bit_sz(typ, fld) sizeof(__mlx5_nullp(typ)->fld)
+#define __mlx5_bit_off(typ, fld) offsetof(struct mlx5_ifc_##typ##_bits, fld)
+#define __mlx5_dw_off(bit_off) ((bit_off) / 32)
+#define __mlx5_64_off(bit_off) ((bit_off) / 64)
+#define __mlx5_dw_bit_off(bit_sz, bit_off) (32 - (bit_sz) - ((bit_off) & 0x1f))
+#define __mlx5_mask(bit_sz) ((uint32_t)((1ull << (bit_sz)) - 1))
+#define __mlx5_dw_mask(bit_sz, bit_off) (__mlx5_mask(bit_sz) << __mlx5_dw_bit_off(bit_sz, bit_off))
+/* Size helpers divide the bit count by 8/32/64 to get bytes/dwords/qwords; offsets are in bytes. */
+#define MLX5_FLD_SZ_BITS(typ, fld) (__mlx5_bit_sz(typ, fld))
+#define MLX5_FLD_SZ_BYTES(typ, fld) (__mlx5_bit_sz(typ, fld) / 8)
+#define MLX5_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8)
+#define MLX5_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32)
+#define MLX5_ST_SZ_QW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 64)
+#define MLX5_UN_SZ_BYTES(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 8)
+#define MLX5_UN_SZ_DW(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 32)
+#define MLX5_BYTE_OFF(typ, fld) (__mlx5_bit_off(typ, fld) / 8)
+#define MLX5_ADDR_OF(typ, p, fld) ((unsigned char *)(p) + MLX5_BYTE_OFF(typ, fld))
+
+enum mlx5_cap_mode {
+    HCA_CAP_OPMOD_GET_MAX = 0,
+    HCA_CAP_OPMOD_GET_CUR = 1,
+};
+
+enum {
+    MLX5_CMD_OP_QUERY_HCA_CAP = 0x100,
+    MLX5_CMD_OP_INIT_HCA = 0x102,
+    MLX5_CMD_OP_TEARDOWN_HCA = 0x103,
+    MLX5_CMD_OP_ENABLE_HCA = 0x104,
+    MLX5_CMD_OP_QUERY_PAGES = 0x107,
+    MLX5_CMD_OP_MANAGE_PAGES = 0x108,
+    MLX5_CMD_OP_SET_HCA_CAP = 0x109,
+    MLX5_CMD_OP_QUERY_ISSI = 0x10a,
+    MLX5_CMD_OP_SET_ISSI = 0x10b,
+    MLX5_CMD_OP_CREATE_MKEY = 0x200,
+    MLX5_CMD_OP_DESTROY_MKEY = 0x202,
+    MLX5_CMD_OP_CREATE_EQ = 0x301,
+    MLX5_CMD_OP_DESTROY_EQ = 0x302,
+    MLX5_CMD_OP_CREATE_CQ = 0x400,
+    MLX5_CMD_OP_DESTROY_CQ = 0x401,
+    MLX5_CMD_OP_CREATE_QP = 0x500,
+    MLX5_CMD_OP_DESTROY_QP = 0x501,
+    MLX5_CMD_OP_RST2INIT_QP = 0x502,
+    MLX5_CMD_OP_INIT2RTR_QP = 0x503,
+    MLX5_CMD_OP_RTR2RTS_QP = 0x504,
+    MLX5_CMD_OP_RTS2RTS_QP = 0x505,
+    MLX5_CMD_OP_QP_2ERR = 0x507,
+    MLX5_CMD_OP_QP_2RST = 0x50a,
+    MLX5_CMD_OP_QUERY_QP = 0x50b,
+    MLX5_CMD_OP_INIT2INIT_QP = 0x50e,
+    MLX5_CMD_OP_CREATE_PSV = 0x600,
+    MLX5_CMD_OP_DESTROY_PSV = 0x601,
+    MLX5_CMD_OP_CREATE_SRQ = 0x700,
+    MLX5_CMD_OP_DESTROY_SRQ = 0x701,
+    MLX5_CMD_OP_CREATE_XRC_SRQ = 0x705,
+    MLX5_CMD_OP_DESTROY_XRC_SRQ = 0x706,
+    MLX5_CMD_OP_CREATE_DCT = 0x710,
+    MLX5_CMD_OP_DESTROY_DCT = 0x711,
+    MLX5_CMD_OP_QUERY_DCT = 0x713,
+    MLX5_CMD_OP_CREATE_XRQ = 0x717,
+    MLX5_CMD_OP_DESTROY_XRQ = 0x718,
+    MLX5_CMD_OP_QUERY_ESW_FUNCTIONS = 0x740,
+    MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT = 0x752,
+    MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754,
+    MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT = 0x755,
+    MLX5_CMD_OP_QUERY_ROCE_ADDRESS = 0x760,
+    MLX5_CMD_OP_ALLOC_Q_COUNTER = 0x771,
+    MLX5_CMD_OP_DEALLOC_Q_COUNTER = 0x772,
+    MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT = 0x782,
+    MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT = 0x783,
+    MLX5_CMD_OP_ALLOC_PD = 0x800,
+    MLX5_CMD_OP_DEALLOC_PD = 0x801,
+    MLX5_CMD_OP_ALLOC_UAR = 0x802,
+    MLX5_CMD_OP_DEALLOC_UAR = 0x803,
+    MLX5_CMD_OP_ACCESS_REG = 0x805,
+    MLX5_CMD_OP_ATTACH_TO_MCG = 0x806,
+    MLX5_CMD_OP_DETACH_FROM_MCG = 0x807,
+    MLX5_CMD_OP_ALLOC_XRCD = 0x80e,
+    MLX5_CMD_OP_DEALLOC_XRCD = 0x80f,
+    MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN = 0x816,
+    MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN = 0x817,
+    MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT = 0x827,
+    MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT = 0x828,
+    MLX5_CMD_OP_SET_L2_TABLE_ENTRY = 0x829,
+    MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY = 0x82b,
+    MLX5_CMD_OP_QUERY_LAG = 0x842,
+    MLX5_CMD_OP_CREATE_TIR = 0x900,
+    MLX5_CMD_OP_DESTROY_TIR = 0x902,
+    MLX5_CMD_OP_CREATE_SQ = 0x904,
+    MLX5_CMD_OP_MODIFY_SQ = 0x905,
+    MLX5_CMD_OP_DESTROY_SQ = 0x906,
+    MLX5_CMD_OP_CREATE_RQ = 0x908,
+    MLX5_CMD_OP_DESTROY_RQ = 0x90a,
+    MLX5_CMD_OP_CREATE_RMP = 0x90c,
+    MLX5_CMD_OP_DESTROY_RMP = 0x90e,
+    MLX5_CMD_OP_CREATE_TIS = 0x912,
+    MLX5_CMD_OP_MODIFY_TIS = 0x913,
+    MLX5_CMD_OP_DESTROY_TIS = 0x914,
+    MLX5_CMD_OP_QUERY_TIS = 0x915,
+    MLX5_CMD_OP_CREATE_RQT = 0x916,
+    MLX5_CMD_OP_DESTROY_RQT = 0x918,
+    MLX5_CMD_OP_CREATE_FLOW_TABLE = 0x930,
+    MLX5_CMD_OP_DESTROY_FLOW_TABLE = 0x931,
+    MLX5_CMD_OP_QUERY_FLOW_TABLE = 0x932,
+    MLX5_CMD_OP_CREATE_FLOW_GROUP = 0x933,
+    MLX5_CMD_OP_DESTROY_FLOW_GROUP = 0x934,
+    MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x936,
+    MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY = 0x938,
+    MLX5_CMD_OP_CREATE_FLOW_COUNTER = 0x939,
+    MLX5_CMD_OP_DEALLOC_FLOW_COUNTER = 0x93a,
+    MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT = 0x93d,
+    MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT = 0x93e,
+    MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT = 0x940,
+    MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT = 0x941,
+    MLX5_CMD_OP_CREATE_GENERAL_OBJECT = 0xa00,
+    MLX5_CMD_OP_MODIFY_GENERAL_OBJECT = 0xa01,
+    MLX5_CMD_OP_QUERY_GENERAL_OBJECT = 0xa02,
+    MLX5_CMD_OP_DESTROY_GENERAL_OBJECT = 0xa03,
+    MLX5_CMD_OP_CREATE_UMEM = 0xa08,
+    MLX5_CMD_OP_DESTROY_UMEM = 0xa0a,
+    MLX5_CMD_OP_SYNC_STEERING = 0xb00,
+};
+
+enum {
+    MLX5_CMD_STAT_OK = 0x0,
+    MLX5_CMD_STAT_INT_ERR = 0x1,
+    MLX5_CMD_STAT_BAD_OP_ERR = 0x2,
+    MLX5_CMD_STAT_BAD_PARAM_ERR = 0x3,
+    MLX5_CMD_STAT_BAD_SYS_STATE_ERR = 0x4,
+    MLX5_CMD_STAT_BAD_RES_ERR = 0x5,
+    MLX5_CMD_STAT_RES_BUSY = 0x6,
+    MLX5_CMD_STAT_LIM_ERR = 0x8,
+    MLX5_CMD_STAT_BAD_RES_STATE_ERR = 0x9,
+    MLX5_CMD_STAT_IX_ERR = 0xa,
+    MLX5_CMD_STAT_NO_RES_ERR = 0xf,
+    MLX5_CMD_STAT_BAD_INP_LEN_ERR = 0x50,
+    MLX5_CMD_STAT_BAD_OUTP_LEN_ERR = 0x51,
+    MLX5_CMD_STAT_BAD_QP_STATE_ERR = 0x10,
+    MLX5_CMD_STAT_BAD_PKT_ERR = 0x30,
+    MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR = 0x40,
+};
+
+enum {
+    MLX5_PAGES_CANT_GIVE = 0,
+    MLX5_PAGES_GIVE = 1,
+    MLX5_PAGES_TAKE = 2,
+};
+
+enum {
+    MLX5_REG_HOST_ENDIANNESS = 0x7004,
+};
+
+enum {
+    MLX5_CAP_PORT_TYPE_IB = 0x0,
+    MLX5_CAP_PORT_TYPE_ETH = 0x1,
+};
+
+enum mlx5_event {
+    MLX5_EVENT_TYPE_CMD = 0x0a,
+    MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb,
+};
+
+enum {
+    MLX5_EQ_DOORBEL_OFFSET = 0x40,
+};
+
+struct mlx5_ifc_atomic_caps_bits {
+    u8 reserved_at_0[0x40];
+
+    u8 atomic_req_8B_endianness_mode[0x2];
+    u8 reserved_at_42[0x4];
+    u8 supported_atomic_req_8B_endianness_mode_1[0x1];
+
+    u8 reserved_at_47[0x19];
+
+    u8 reserved_at_60[0x20];
+
+    u8 reserved_at_80[0x10];
+    u8 atomic_operations[0x10];
+
+    u8 reserved_at_a0[0x10];
+    u8 atomic_size_qp[0x10];
+
+    u8 reserved_at_c0[0x10];
+    u8 atomic_size_dc[0x10];
+
+    u8 reserved_at_e0[0x1a0];
+
+    u8 fetch_add_pci_atomic[0x10];
+    u8 swap_pci_atomic[0x10];
+    u8 compare_swap_pci_atomic[0x10];
+
+    u8 reserved_at_2b0[0x550];
+};
+
+struct mlx5_ifc_roce_cap_bits { /* 0x800 bits in total; reserved_at_N starts at bit offset N */
+    u8 reserved_0[0x4];
+    u8 sw_r_roce_src_udp_port[0x1];
+    u8 fl_rc_qp_when_roce_disabled[0x1];
+    u8 fl_rc_qp_when_roce_enabled[0x1];
+    u8 reserved_at_7[0x17];
+    u8 qp_ts_format[0x2];
+
+    u8 reserved_at_20[0x60];
+
+    u8 reserved_at_80[0xc];
+    u8 l3_type[0x4];
+    u8 reserved_at_90[0x8];
+    u8 roce_version[0x8];
+
+    u8 reserved_at_a0[0x10];
+    u8 r_roce_dest_udp_port[0x10];
+
+    u8 r_roce_max_src_udp_port[0x10];
+    u8 r_roce_min_src_udp_port[0x10];
+
+    u8 reserved_at_e0[0x10];
+    u8 roce_address_table_size[0x10];
+
+    u8 reserved_at_100[0x700];
+};
+
+enum {
+    MLX5_MULTI_PATH_FT_MAX_LEVEL = 64,
+};
+
+struct mlx5_ifc_flow_table_context_bits {
+    u8 reformat_en[0x1];
+    u8 decap_en[0x1];
+    u8 sw_owner[0x1];
+    u8 termination_table[0x1];
+    u8 table_miss_action[0x4];
+    u8 level[0x8];
+    u8 reserved_at_10[0x8];
+    u8 log_size[0x8];
+
+    u8 reserved_at_20[0x8];
+    u8 table_miss_id[0x18];
+
+    u8 reserved_at_40[0x8];
+    u8 lag_master_next_table_id[0x18];
+
+    u8 reserved_at_60[0x60];
+
+    u8 sw_owner_icm_root_1[0x40];
+
+    u8 sw_owner_icm_root_0[0x40];
+};
+
+struct mlx5_ifc_create_flow_table_in_bits {
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 other_vport[0x1];
+    u8 reserved_at_41[0xf];
+    u8 vport_number[0x10];
+
+    u8 reserved_at_60[0x20];
+
+    u8 table_type[0x8];
+    u8 reserved_at_88[0x18];
+
+    u8 reserved_at_a0[0x20];
+
+    struct mlx5_ifc_flow_table_context_bits flow_table_context;
+};
+
+struct mlx5_ifc_create_flow_table_out_bits {
+    u8 status[0x8];
+    u8 icm_address_63_40[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 icm_address_39_32[0x8];
+    u8 table_id[0x18];
+
+    u8 icm_address_31_0[0x20];
+};
+
+struct mlx5_ifc_destroy_flow_table_in_bits {
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x20];
+
+    u8 other_vport[0x1];
+    u8 reserved_at_41[0xf];
+    u8 vport_number[0x10];
+
+    u8 reserved_at_60[0x20];
+
+    u8 table_type[0x8];
+    u8 reserved_at_88[0x18];
+
+    u8 reserved_at_a0[0x8];
+    u8 table_id[0x18];
+
+    u8 reserved_at_c0[0x140];
+};
+
+struct mlx5_ifc_query_flow_table_in_bits {
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+
+    u8 table_type[0x8];
+    u8 reserved_at_88[0x18];
+
+    u8 reserved_at_a0[0x8];
+    u8 table_id[0x18];
+
+    u8 reserved_at_c0[0x140];
+};
+
+struct mlx5_ifc_query_flow_table_out_bits {
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x80];
+
+    struct mlx5_ifc_flow_table_context_bits flow_table_context;
+};
+
+struct mlx5_ifc_sync_steering_in_bits {
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0xc0];
+};
+
+struct mlx5_ifc_sync_steering_out_bits {
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_device_mem_cap_bits { /* Device-memory capability layout (memic and steering/header-modify SW ICM fields); widths sum to 0x180. */
+    u8 memic[0x1];
+    u8 reserved_at_1[0x1f];
+
+    u8 reserved_at_20[0xb];
+    u8 log_min_memic_alloc_size[0x5];
+    u8 reserved_at_30[0x8];
+    u8 log_max_memic_addr_alignment[0x8];
+
+    u8 memic_bar_start_addr[0x40];
+
+    u8 memic_bar_size[0x20];
+
+    u8 max_memic_size[0x20];
+
+    u8 steering_sw_icm_start_address[0x40];
+
+    u8 reserved_at_100[0x8];
+    u8 log_header_modify_sw_icm_size[0x8];
+    u8 reserved_at_110[0x2];
+    u8 log_sw_icm_alloc_granularity[0x6];
+    u8 log_steering_sw_icm_size[0x8];
+
+    u8 reserved_at_120[0x20];
+
+    u8 header_modify_sw_icm_start_address[0x40];
+};
+
+struct mlx5_ifc_flow_table_fields_supported_bits { /* One 1-wide flag per packet field, grouped as outer, inner and MPLS/misc; widths sum to 0x80. */
+    u8 outer_dmac[0x1];
+    u8 outer_smac[0x1];
+    u8 outer_ether_type[0x1];
+    u8 outer_ip_version[0x1];
+    u8 outer_first_prio[0x1];
+    u8 outer_first_cfi[0x1];
+    u8 outer_first_vid[0x1];
+    u8 outer_ipv4_ttl[0x1];
+    u8 outer_second_prio[0x1];
+    u8 outer_second_cfi[0x1];
+    u8 outer_second_vid[0x1];
+    u8 outer_ipv6_flow_label[0x1];
+    u8 outer_sip[0x1];
+    u8 outer_dip[0x1];
+    u8 outer_frag[0x1];
+    u8 outer_ip_protocol[0x1];
+    u8 outer_ip_ecn[0x1];
+    u8 outer_ip_dscp[0x1];
+    u8 outer_udp_sport[0x1];
+    u8 outer_udp_dport[0x1];
+    u8 outer_tcp_sport[0x1];
+    u8 outer_tcp_dport[0x1];
+    u8 outer_tcp_flags[0x1];
+    u8 outer_gre_protocol[0x1];
+    u8 outer_gre_key[0x1];
+    u8 outer_vxlan_vni[0x1];
+    u8 outer_geneve_vni[0x1];
+    u8 outer_geneve_oam[0x1];
+    u8 outer_geneve_protocol_type[0x1];
+    u8 outer_geneve_opt_len[0x1];
+    u8 source_vhca_port[0x1];
+    u8 source_eswitch_port[0x1];
+
+    u8 inner_dmac[0x1]; /* offset 0x20: inner-header flags begin */
+    u8 inner_smac[0x1];
+    u8 inner_ether_type[0x1];
+    u8 inner_ip_version[0x1];
+    u8 inner_first_prio[0x1];
+    u8 inner_first_cfi[0x1];
+    u8 inner_first_vid[0x1];
+    u8 inner_ipv4_ttl[0x1];
+    u8 inner_second_prio[0x1];
+    u8 inner_second_cfi[0x1];
+    u8 inner_second_vid[0x1];
+    u8 inner_ipv6_flow_label[0x1];
+    u8 inner_sip[0x1];
+    u8 inner_dip[0x1];
+    u8 inner_frag[0x1];
+    u8 inner_ip_protocol[0x1];
+    u8 inner_ip_ecn[0x1];
+    u8 inner_ip_dscp[0x1];
+    u8 inner_udp_sport[0x1];
+    u8 inner_udp_dport[0x1];
+    u8 inner_tcp_sport[0x1];
+    u8 inner_tcp_dport[0x1];
+    u8 inner_tcp_flags[0x1];
+    u8 reserved_at_37[0x7];
+    u8 metadata_reg_b[0x1];
+    u8 metadata_reg_a[0x1];
+
+    u8 reserved_at_40[0x5];
+    u8 outer_first_mpls_over_udp_ttl[0x1];
+    u8 outer_first_mpls_over_udp_s_bos[0x1];
+    u8 outer_first_mpls_over_udp_exp[0x1];
+    u8 outer_first_mpls_over_udp_label[0x1];
+    u8 outer_first_mpls_over_gre_ttl[0x1];
+    u8 outer_first_mpls_over_gre_s_bos[0x1];
+    u8 outer_first_mpls_over_gre_exp[0x1];
+    u8 outer_first_mpls_over_gre_label[0x1];
+    u8 inner_first_mpls_ttl[0x1];
+    u8 inner_first_mpls_s_bos[0x1];
+    u8 inner_first_mpls_exp[0x1];
+    u8 inner_first_mpls_label[0x1];
+    u8 outer_first_mpls_ttl[0x1];
+    u8 outer_first_mpls_s_bos[0x1];
+    u8 outer_first_mpls_exp[0x1];
+    u8 outer_first_mpls_label[0x1];
+    u8 outer_emd_tag[0x1];
+    u8 inner_esp_spi[0x1];
+    u8 outer_esp_spi[0x1];
+    u8 inner_ipv6_hop_limit[0x1];
+    u8 outer_ipv6_hop_limit[0x1];
+    u8 bth_dst_qp[0x1];
+    u8 inner_first_svlan[0x1];
+    u8 inner_second_svlan[0x1];
+    u8 outer_first_svlan[0x1];
+    u8 outer_second_svlan[0x1];
+    u8 source_sqn[0x1];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_dr_match_spec_bits { /* L2/L3/L4 header match fields (MACs, VLAN, IP, TCP/UDP ports, 128-wide src/dst IP); widths sum to 0x200. */
+    u8 smac_47_16[0x20];
+
+    u8 smac_15_0[0x10];
+    u8 ethertype[0x10];
+
+    u8 dmac_47_16[0x20];
+
+    u8 dmac_15_0[0x10];
+    u8 first_prio[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vid[0xc];
+
+    u8 ip_protocol[0x8];
+    u8 ip_dscp[0x6];
+    u8 ip_ecn[0x2];
+    u8 cvlan_tag[0x1];
+    u8 svlan_tag[0x1];
+    u8 frag[0x1];
+    u8 ip_version[0x4];
+    u8 tcp_flags[0x9];
+
+    u8 tcp_sport[0x10];
+    u8 tcp_dport[0x10];
+
+    u8 reserved_at_c0[0x10];
+    u8 ipv4_ihl[0x4];
+    u8 l3_ok[0x1];
+    u8 l4_ok[0x1];
+    u8 ipv4_checksum_ok[0x1];
+    u8 l4_checksum_ok[0x1];
+    u8 ip_ttl_hoplimit[0x8];
+
+    u8 udp_sport[0x10];
+    u8 udp_dport[0x10];
+
+    u8 src_ip_127_96[0x20]; /* source IP split into four 0x20-wide words, most significant first */
+
+    u8 src_ip_95_64[0x20];
+
+    u8 src_ip_63_32[0x20];
+
+    u8 src_ip_31_0[0x20];
+
+    u8 dst_ip_127_96[0x20]; /* destination IP, same split as the source IP */
+
+    u8 dst_ip_95_64[0x20];
+
+    u8 dst_ip_63_32[0x20];
+
+    u8 dst_ip_31_0[0x20];
+};
+
+struct mlx5_ifc_dr_match_set_misc_bits { /* Miscellaneous match fields (GRE, source port/SQN, second VLAN, VXLAN/GENEVE, flow labels, ESP SPI); widths sum to 0x200. */
+    u8 gre_c_present[0x1];
+    u8 reserved_at_1[0x1]; /* renamed from reserved_auto1 to follow the reserved_at_<offset> convention */
+    u8 gre_k_present[0x1];
+    u8 gre_s_present[0x1];
+    u8 source_vhca_port[0x4];
+    u8 source_sqn[0x18];
+
+    u8 source_eswitch_owner_vhca_id[0x10];
+    u8 source_port[0x10];
+
+    u8 outer_second_prio[0x3];
+    u8 outer_second_cfi[0x1];
+    u8 outer_second_vid[0xc];
+    u8 inner_second_prio[0x3];
+    u8 inner_second_cfi[0x1];
+    u8 inner_second_vid[0xc];
+
+    u8 outer_second_cvlan_tag[0x1];
+    u8 inner_second_cvlan_tag[0x1];
+    u8 outer_second_svlan_tag[0x1];
+    u8 inner_second_svlan_tag[0x1];
+    u8 outer_emd_tag[0x1];
+    u8 reserved_at_65[0xb];
+    u8 gre_protocol[0x10];
+
+    u8 gre_key_h[0x18];
+    u8 gre_key_l[0x8];
+
+    u8 vxlan_vni[0x18];
+    u8 reserved_at_b8[0x8];
+
+    u8 geneve_vni[0x18];
+    u8 reserved_at_d8[0x7]; /* was reserved_at_e4; geneve_vni ends at offset 0xd8 */
+    u8 geneve_oam[0x1];
+
+    u8 reserved_at_e0[0xc]; /* was reserved_at_ec; this word starts at offset 0xe0 */
+    u8 outer_ipv6_flow_label[0x14];
+
+    u8 reserved_at_100[0xc];
+    u8 inner_ipv6_flow_label[0x14];
+
+    u8 reserved_at_120[0xa];
+    u8 geneve_opt_len[0x6];
+    u8 geneve_protocol_type[0x10];
+
+    u8 reserved_at_140[0x8];
+    u8 bth_dst_qp[0x18];
+
+    u8 inner_esp_spi[0x20];
+
+    u8 outer_esp_spi[0x20];
+
+    u8 reserved_at_1a0[0x60];
+};
+
+struct mlx5_ifc_dr_match_set_misc2_bits { /* MPLS label stacks (plain, over GRE, over UDP) and metadata registers; widths sum to 0x200. */
+    u8 outer_first_mpls_label[0x14];
+    u8 outer_first_mpls_exp[0x3];
+    u8 outer_first_mpls_s_bos[0x1];
+    u8 outer_first_mpls_ttl[0x8];
+
+    u8 inner_first_mpls_label[0x14];
+    u8 inner_first_mpls_exp[0x3];
+    u8 inner_first_mpls_s_bos[0x1];
+    u8 inner_first_mpls_ttl[0x8];
+
+    u8 outer_first_mpls_over_gre_label[0x14];
+    u8 outer_first_mpls_over_gre_exp[0x3];
+    u8 outer_first_mpls_over_gre_s_bos[0x1];
+    u8 outer_first_mpls_over_gre_ttl[0x8];
+
+    u8 outer_first_mpls_over_udp_label[0x14];
+    u8 outer_first_mpls_over_udp_exp[0x3];
+    u8 outer_first_mpls_over_udp_s_bos[0x1];
+    u8 outer_first_mpls_over_udp_ttl[0x8];
+
+    u8 metadata_reg_c_7[0x20]; /* registers are laid out in descending index order, c_7 first */
+    u8 metadata_reg_c_6[0x20];
+    u8 metadata_reg_c_5[0x20];
+    u8 metadata_reg_c_4[0x20];
+    u8 metadata_reg_c_3[0x20];
+    u8 metadata_reg_c_2[0x20];
+    u8 metadata_reg_c_1[0x20];
+    u8 metadata_reg_c_0[0x20];
+
+    u8 metadata_reg_a[0x20];
+    u8 metadata_reg_b[0x20];
+
+    u8 reserved_at_1c0[0x40]; /* was reserved_at_260; the 14 preceding 0x20-wide words end at offset 0x1c0 */
+};
+
+struct mlx5_ifc_dr_match_set_misc3_bits { /* TCP seq/ack numbers, VXLAN-GPE, ICMP/ICMPv6, GENEVE TLV option and GTP-U match fields; widths sum to 0x200. */
+    u8 inner_tcp_seq_num[0x20];
+
+    u8 outer_tcp_seq_num[0x20];
+
+    u8 inner_tcp_ack_num[0x20];
+
+    u8 outer_tcp_ack_num[0x20];
+
+    u8 reserved_at_80[0x8];
+    u8 outer_vxlan_gpe_vni[0x18];
+
+    u8 outer_vxlan_gpe_next_protocol[0x8];
+    u8 outer_vxlan_gpe_flags[0x8];
+    u8 reserved_at_b0[0x10];
+
+    u8 icmp_header_data[0x20];
+
+    u8 icmpv6_header_data[0x20];
+
+    u8 icmp_type[0x8];
+    u8 icmp_code[0x8];
+    u8 icmpv6_type[0x8];
+    u8 icmpv6_code[0x8];
+
+    u8 geneve_tlv_option_0_data[0x20];
+
+    u8 gtpu_teid[0x20];
+
+    u8 gtpu_msg_type[0x8];
+    u8 gtpu_msg_flags[0x8];
+    u8 reserved_at_170[0x10]; /* was reserved_at_150; this word starts at 0x160, so the padding sits at 0x170 */
+
+    u8 gtpu_dw_2[0x20];
+
+    u8 gtpu_first_ext_dw_0[0x20];
+
+    u8 gtpu_dw_0[0x20];
+
+    u8 reserved_at_1e0[0x20]; /* was reserved_at_1c0; gtpu_dw_0 ends at offset 0x1e0 */
+};
+
+struct mlx5_ifc_dr_match_set_misc4_bits { /* Eight (value, id) pairs of programmable sample fields, each member 0x20 wide; widths sum to 0x200. */
+    u8 prog_sample_field_value_0[0x20];
+
+    u8 prog_sample_field_id_0[0x20];
+
+    u8 prog_sample_field_value_1[0x20];
+
+    u8 prog_sample_field_id_1[0x20];
+
+    u8 prog_sample_field_value_2[0x20];
+
+    u8 prog_sample_field_id_2[0x20];
+
+    u8 prog_sample_field_value_3[0x20];
+
+    u8 prog_sample_field_id_3[0x20];
+
+    u8 prog_sample_field_value_4[0x20];
+
+    u8 prog_sample_field_id_4[0x20];
+
+    u8 prog_sample_field_value_5[0x20];
+
+    u8 prog_sample_field_id_5[0x20];
+
+    u8 prog_sample_field_value_6[0x20];
+
+    u8 prog_sample_field_id_6[0x20];
+
+    u8 prog_sample_field_value_7[0x20];
+
+    u8 prog_sample_field_id_7[0x20];
+};
+
+struct mlx5_ifc_dr_match_set_misc5_bits { /* Four MACsec tag words and four tunnel header words, then padding; widths sum to 0x200. */
+    u8 macsec_tag_0[0x20];
+
+    u8 macsec_tag_1[0x20];
+
+    u8 macsec_tag_2[0x20];
+
+    u8 macsec_tag_3[0x20];
+
+    u8 tunnel_header_0[0x20];
+
+    u8 tunnel_header_1[0x20];
+
+    u8 tunnel_header_2[0x20];
+
+    u8 tunnel_header_3[0x20];
+
+    u8 reserved[0x100]; /* starts at offset 0x100; unlike sibling structs the name carries no offset */
+};
+
+struct mlx5_ifc_dr_match_param_bits { /* Full match parameter: the seven match sections (each 0x200 wide as declared above) concatenated in this order. */
+    struct mlx5_ifc_dr_match_spec_bits outer;
+    struct mlx5_ifc_dr_match_set_misc_bits misc;
+    struct mlx5_ifc_dr_match_spec_bits inner; /* same layout as 'outer' */
+    struct mlx5_ifc_dr_match_set_misc2_bits misc2;
+    struct mlx5_ifc_dr_match_set_misc3_bits misc3;
+    struct mlx5_ifc_dr_match_set_misc4_bits misc4;
+    struct mlx5_ifc_dr_match_set_misc5_bits misc5;
+};
+
+struct mlx5_ifc_flow_table_prop_layout_bits { /* Per-table-type properties: feature flags, log2 limits, then two field-support bitmaps; widths sum to 0x200. */
+    u8 ft_support[0x1];
+    u8 flow_tag[0x1];
+    u8 flow_counter[0x1];
+    u8 flow_modify_en[0x1];
+    u8 modify_root[0x1];
+    u8 identified_miss_table[0x1];
+    u8 flow_table_modify[0x1];
+    u8 reformat[0x1];
+    u8 decap[0x1];
+    u8 reset_root_to_default[0x1];
+    u8 pop_vlan[0x1];
+    u8 push_vlan[0x1];
+    u8 fpga_vendor_acceleration[0x1];
+    u8 pop_vlan_2[0x1];
+    u8 push_vlan_2[0x1];
+    u8 reformat_and_vlan_action[0x1];
+    u8 modify_and_vlan_action[0x1];
+    u8 sw_owner[0x1];
+    u8 reformat_l3_tunnel_to_l2[0x1];
+    u8 reformat_l2_to_l3_tunnel[0x1];
+    u8 reformat_and_modify_action[0x1];
+    u8 reserved_at_15[0x9];
+    u8 sw_owner_v2[0x1];
+    u8 reserved_at_1f[0x1];
+
+    u8 reserved_at_20[0x2];
+    u8 log_max_ft_size[0x6];
+    u8 log_max_modify_header_context[0x8];
+    u8 max_modify_header_actions[0x8];
+    u8 max_ft_level[0x8];
+
+    u8 reserved_at_40[0x10];
+    u8 metadata_reg_b_width[0x8];
+    u8 metadata_reg_a_width[0x8];
+
+    u8 reserved_at_60[0x18];
+    u8 log_max_ft_num[0x8];
+
+    u8 reserved_at_80[0x10];
+    u8 log_max_flow_counter[0x8];
+    u8 log_max_destination[0x8];
+
+    u8 reserved_at_a0[0x18];
+    u8 log_max_flow[0x8];
+
+    u8 reserved_at_c0[0x40];
+
+    struct mlx5_ifc_flow_table_fields_supported_bits ft_field_support; /* at offset 0x100, 0x80 wide */
+
+    struct mlx5_ifc_flow_table_fields_supported_bits ft_field_bitmask_support; /* at offset 0x180, 0x80 wide */
+};
+
+enum { /* Single-bit flags, one per flex-parser protocol; presumably tested against the flex_parser_protocols capability field - confirm with callers. */
+    MLX5_FLEX_PARSER_GENEVE_ENABLED = 1 << 3,
+    MLX5_FLEX_PARSER_MPLS_OVER_GRE_ENABLED = 1 << 4,
+    mlx5_FLEX_PARSER_MPLS_OVER_UDP_ENABLED = 1 << 5, /* NOTE(review): lowercase "mlx5_" prefix differs from every sibling; left unchanged because renaming would break existing users of this spelling */
+    MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED = 1 << 7,
+    MLX5_FLEX_PARSER_ICMP_V4_ENABLED = 1 << 8,
+    MLX5_FLEX_PARSER_ICMP_V6_ENABLED = 1 << 9,
+    MLX5_FLEX_PARSER_GENEVE_OPT_0_ENABLED = 1 << 10,
+    MLX5_FLEX_PARSER_GTPU_ENABLED = 1 << 11,
+    MLX5_FLEX_PARSER_GTPU_DW_2_ENABLED = 1 << 16,
+    MLX5_FLEX_PARSER_GTPU_FIRST_EXT_DW_0_ENABLED = 1 << 17,
+    MLX5_FLEX_PARSER_GTPU_DW_0_ENABLED = 1 << 18,
+    MLX5_FLEX_PARSER_GTPU_TEID_ENABLED = 1 << 19,
+};
+
+enum mlx5_ifc_steering_format_version { /* Steering format identifiers; presumably the values of the steering_format_version capability field - confirm. */
+    MLX5_HW_CONNECTX_5 = 0x0,
+    MLX5_HW_CONNECTX_6DX = 0x1,
+};
+
+enum mlx5_ifc_ste_v1_modify_hdr_offset { /* Single constant for the STE v1 modify-header offset; units are not visible here ("QW" suggests quad-words - confirm). */
+    MLX5_MODIFY_HEADER_V1_QW_OFFSET = 0x20,
+};
+
+struct mlx5_ifc_cmd_hca_cap_bits { /* General HCA capability layout: feature flags and log2 resource limits; widths sum to 0x800, matching the reserved_at_<offset> names. */
+    u8 access_other_hca_roce[0x1];
+    u8 reserved_at_1[0x1e];
+    u8 vhca_resource_manager[0x1];
+
+    u8 hca_cap_2[0x1];
+    u8 reserved_at_21[0xf];
+    u8 vhca_id[0x10];
+
+    u8 reserved_at_40[0x20];
+
+    u8 reserved_at_60[0x2];
+    u8 qp_data_in_order[0x1];
+    u8 reserved_at_63[0x8];
+    u8 log_dma_mmo_max_size[0x5];
+    u8 reserved_at_70[0x10];
+
+    u8 log_max_srq_sz[0x8];
+    u8 log_max_qp_sz[0x8];
+    u8 reserved_at_90[0x3];
+    u8 isolate_vl_tc_new[0x1];
+    u8 reserved_at_94[0x4];
+    u8 prio_tag_required[0x1];
+    u8 reserved_at_99[0x2];
+    u8 log_max_qp[0x5];
+
+    u8 reserved_at_a0[0xb];
+    u8 log_max_srq[0x5];
+    u8 reserved_at_b0[0x10];
+
+    u8 reserved_at_c0[0x8];
+    u8 log_max_cq_sz[0x8];
+    u8 reserved_at_d0[0xb];
+    u8 log_max_cq[0x5];
+
+    u8 log_max_eq_sz[0x8];
+    u8 relaxed_ordering_write[0x1];
+    u8 reserved_at_e9[0x1];
+    u8 log_max_mkey[0x6];
+    u8 tunneled_atomic[0x1];
+    u8 as_notify[0x1];
+    u8 m_pci_port[0x1];
+    u8 m_vhca_mk[0x1];
+    u8 cmd_on_behalf[0x1];
+    u8 device_emulation_manager[0x1];
+    u8 terminate_scatter_list_mkey[0x1];
+    u8 repeated_mkey[0x1];
+    u8 dump_fill_mkey[0x1];
+    u8 reserved_at_f9[0x3];
+    u8 log_max_eq[0x4];
+
+    u8 max_indirection[0x8];
+    u8 fixed_buffer_size[0x1];
+    u8 log_max_mrw_sz[0x7];
+    u8 force_teardown[0x1];
+    u8 fast_teardown[0x1];
+    u8 log_max_bsf_list_size[0x6];
+    u8 umr_extended_translation_offset[0x1];
+    u8 null_mkey[0x1];
+    u8 log_max_klm_list_size[0x6];
+
+    u8 reserved_at_120[0x2];
+    u8 qpc_extension[0x1];
+    u8 reserved_at_123[0x7];
+    u8 log_max_ra_req_dc[0x6];
+    u8 reserved_at_130[0xa];
+    u8 log_max_ra_res_dc[0x6];
+
+    u8 reserved_at_140[0x7];
+    u8 sig_crc64_xp10[0x1];
+    u8 sig_crc32c[0x1];
+    u8 reserved_at_149[0x1];
+    u8 log_max_ra_req_qp[0x6];
+    u8 reserved_at_150[0x1];
+    u8 rts2rts_qp_udp_sport[0x1];
+    u8 rts2rts_lag_tx_port_affinity[0x1];
+    u8 dma_mmo_sq[0x1];
+    u8 reserved_at_154[0x6];
+    u8 log_max_ra_res_qp[0x6];
+
+    u8 end_pad[0x1];
+    u8 cc_query_allowed[0x1];
+    u8 cc_modify_allowed[0x1];
+    u8 start_pad[0x1];
+    u8 cache_line_128byte[0x1];
+    u8 gid_table_size_ro[0x1];
+    u8 pkey_table_size_ro[0x1];
+    u8 reserved_at_167[0x1];
+    u8 rnr_nak_q_counters[0x1];
+    u8 rts2rts_qp_counters_set_id[0x1];
+    u8 rts2rts_qp_dscp[0x1];
+    u8 reserved_at_16b[0x4];
+    u8 qcam_reg[0x1];
+    u8 gid_table_size[0x10];
+
+    u8 out_of_seq_cnt[0x1];
+    u8 vport_counters[0x1];
+    u8 retransmission_q_counters[0x1];
+    u8 debug[0x1];
+    u8 modify_rq_counters_set_id[0x1];
+    u8 rq_delay_drop[0x1];
+    u8 max_qp_cnt[0xa];
+    u8 pkey_table_size[0x10];
+
+    u8 vport_group_manager[0x1];
+    u8 vhca_group_manager[0x1];
+    u8 ib_virt[0x1];
+    u8 eth_virt[0x1];
+    u8 vnic_env_queue_counters[0x1];
+    u8 ets[0x1];
+    u8 nic_flow_table[0x1];
+    u8 eswitch_manager[0x1];
+    u8 device_memory[0x1];
+    u8 mcam_reg[0x1];
+    u8 pcam_reg[0x1];
+    u8 local_ca_ack_delay[0x5];
+    u8 port_module_event[0x1];
+    u8 enhanced_retransmission_q_counters[0x1];
+    u8 port_checks[0x1];
+    u8 pulse_gen_control[0x1];
+    u8 disable_link_up_by_init_hca[0x1];
+    u8 beacon_led[0x1];
+    u8 port_type[0x2];
+    u8 num_ports[0x8];
+
+    u8 reserved_at_1c0[0x1];
+    u8 pps[0x1];
+    u8 pps_modify[0x1];
+    u8 log_max_msg[0x5];
+    u8 multi_path_xrc_rdma[0x1];
+    u8 multi_path_dc_rdma[0x1];
+    u8 multi_path_rc_rdma[0x1];
+    u8 traffic_fast_control[0x1];
+    u8 max_tc[0x4];
+    u8 temp_warn_event[0x1];
+    u8 dcbx[0x1];
+    u8 general_notification_event[0x1];
+    u8 multi_prio_sq[0x1];
+    u8 afu_owner[0x1];
+    u8 fpga[0x1];
+    u8 rol_s[0x1];
+    u8 rol_g[0x1];
+    u8 ib_port_sniffer[0x1];
+    u8 wol_s[0x1];
+    u8 wol_g[0x1];
+    u8 wol_a[0x1];
+    u8 wol_b[0x1];
+    u8 wol_m[0x1];
+    u8 wol_u[0x1];
+    u8 wol_p[0x1];
+
+    u8 stat_rate_support[0x10];
+    u8 sig_block_4048[0x1];
+    u8 reserved_at_1f1[0xb];
+    u8 cqe_version[0x4];
+
+    u8 compact_address_vector[0x1];
+    u8 eth_striding_wq[0x1];
+    u8 reserved_at_202[0x1];
+    u8 ipoib_enhanced_offloads[0x1];
+    u8 ipoib_basic_offloads[0x1];
+    u8 ib_striding_wq[0x1];
+    u8 repeated_block_disabled[0x1];
+    u8 umr_modify_entity_size_disabled[0x1];
+    u8 umr_modify_atomic_disabled[0x1];
+    u8 umr_indirect_mkey_disabled[0x1];
+    u8 umr_fence[0x2];
+    u8 dc_req_sctr_data_cqe[0x1];
+    u8 dc_connect_qp[0x1];
+    u8 dc_cnak_trace[0x1];
+    u8 drain_sigerr[0x1];
+    u8 cmdif_checksum[0x2];
+    u8 sigerr_cqe[0x1];
+    u8 reserved_at_213[0x1];
+    u8 wq_signature[0x1];
+    u8 sctr_data_cqe[0x1];
+    u8 reserved_at_216[0x1];
+    u8 sho[0x1];
+    u8 tph[0x1];
+    u8 rf[0x1];
+    u8 dct[0x1];
+    u8 qos[0x1];
+    u8 eth_net_offloads[0x1];
+    u8 roce[0x1];
+    u8 atomic[0x1];
+    u8 extended_retry_count[0x1];
+
+    u8 cq_oi[0x1];
+    u8 cq_resize[0x1];
+    u8 cq_moderation[0x1];
+    u8 cq_period_mode_modify[0x1];
+    u8 cq_invalidate[0x1];
+    u8 reserved_at_225[0x1];
+    u8 cq_eq_remap[0x1];
+    u8 pg[0x1];
+    u8 block_lb_mc[0x1];
+    u8 exponential_backoff[0x1];
+    u8 scqe_break_moderation[0x1];
+    u8 cq_period_start_from_cqe[0x1];
+    u8 cd[0x1];
+    u8 atm[0x1];
+    u8 apm[0x1];
+    u8 vector_calc[0x1];
+    u8 umr_ptr_rlkey[0x1];
+    u8 imaicl[0x1];
+    u8 qp_packet_based[0x1];
+    u8 reserved_at_233[0x1];
+    u8 ipoib_enhanced_pkey_change[0x1];
+    u8 initiator_src_dct_in_cqe[0x1];
+    u8 qkv[0x1];
+    u8 pkv[0x1];
+    u8 set_deth_sqpn[0x1];
+    u8 rts2rts_primary_sl[0x1];
+    u8 initiator_src_dct[0x1];
+    u8 dc_v2[0x1];
+    u8 xrc[0x1];
+    u8 ud[0x1];
+    u8 uc[0x1];
+    u8 rc[0x1];
+
+    u8 uar_4k[0x1];
+    u8 reserved_at_241[0x9];
+    u8 uar_sz[0x6];
+    u8 reserved_at_250[0x2];
+    u8 umem_uid_0[0x1];
+    u8 log_max_dc_cnak_qps[0x5];
+    u8 log_pg_sz[0x8];
+
+    u8 bf[0x1];
+    u8 driver_version[0x1];
+    u8 pad_tx_eth_packet[0x1];
+    u8 query_driver_version[0x1];
+    u8 max_qp_retry_freq[0x1];
+    u8 qp_by_name[0x1];
+    u8 mkey_by_name[0x1];
+    u8 reserved_at_267[0x1];
+    u8 suspend_qp_uc[0x1];
+    u8 suspend_qp_ud[0x1];
+    u8 suspend_qp_rc[0x1];
+    u8 log_bf_reg_size[0x5];
+    u8 reserved_at_270[0x6];
+    u8 lag_dct[0x2];
+    u8 lag_tx_port_affinity[0x1];
+    u8 reserved_at_279[0x2];
+    u8 lag_master[0x1];
+    u8 num_lag_ports[0x4];
+
+    u8 num_of_diagnostic_counters[0x10];
+    u8 max_wqe_sz_sq[0x10];
+
+    u8 reserved_at_2a0[0x10];
+    u8 max_wqe_sz_rq[0x10];
+
+    u8 max_flow_counter_31_16[0x10];
+    u8 max_wqe_sz_sq_dc[0x10];
+
+    u8 reserved_at_2e0[0x7];
+    u8 max_qp_mcg[0x19];
+
+    u8 mlnx_tag_ethertype[0x10];
+    u8 reserved_at_310[0x8];
+    u8 log_max_mcg[0x8];
+
+    u8 reserved_at_320[0x3];
+    u8 log_max_transport_domain[0x5];
+    u8 reserved_at_328[0x3];
+    u8 log_max_pd[0x5];
+    u8 reserved_at_330[0xb];
+    u8 log_max_xrcd[0x5];
+
+    u8 nic_receive_steering_discard[0x1];
+    u8 receive_discard_vport_down[0x1];
+    u8 transmit_discard_vport_down[0x1];
+    u8 eq_overrun_count[0x1];
+    u8 nic_receive_steering_depth[0x1];
+    u8 invalid_command_count[0x1];
+    u8 quota_exceeded_count[0x1];
+    u8 reserved_at_347[0x1];
+    u8 log_max_flow_counter_bulk[0x8];
+    u8 max_flow_counter_15_0[0x10];
+
+    u8 modify_tis[0x1];
+    u8 reserved_at_361[0x2];
+    u8 log_max_rq[0x5];
+    u8 reserved_at_368[0x3];
+    u8 log_max_sq[0x5];
+    u8 reserved_at_370[0x3];
+    u8 log_max_tir[0x5];
+    u8 reserved_at_378[0x3];
+    u8 log_max_tis[0x5];
+
+    u8 basic_cyclic_rcv_wqe[0x1];
+    u8 reserved_at_381[0x2];
+    u8 log_max_rmp[0x5];
+    u8 reserved_at_388[0x3];
+    u8 log_max_rqt[0x5];
+    u8 reserved_at_390[0x3];
+    u8 log_max_rqt_size[0x5];
+    u8 reserved_at_398[0x3];
+    u8 log_max_tis_per_sq[0x5];
+
+    u8 ext_stride_num_range[0x1];
+    u8 reserved_at_3a1[0x2];
+    u8 log_max_stride_sz_rq[0x5];
+    u8 reserved_at_3a8[0x3];
+    u8 log_min_stride_sz_rq[0x5];
+    u8 reserved_at_3b0[0x3];
+    u8 log_max_stride_sz_sq[0x5];
+    u8 reserved_at_3b8[0x3];
+    u8 log_min_stride_sz_sq[0x5];
+
+    u8 hairpin[0x1];
+    u8 reserved_at_3c1[0x2];
+    u8 log_max_hairpin_queues[0x5];
+    u8 reserved_at_3c8[0x3];
+    u8 log_max_hairpin_wq_data_sz[0x5];
+    u8 reserved_at_3d0[0x3];
+    u8 log_max_hairpin_num_packets[0x5];
+    u8 reserved_at_3d8[0x3];
+    u8 log_max_wq_sz[0x5];
+
+    u8 nic_vport_change_event[0x1];
+    u8 disable_local_lb_uc[0x1];
+    u8 disable_local_lb_mc[0x1];
+    u8 log_min_hairpin_wq_data_sz[0x5];
+    u8 reserved_at_3e8[0x3];
+    u8 log_max_vlan_list[0x5];
+    u8 reserved_at_3f0[0x3];
+    u8 log_max_current_mc_list[0x5];
+    u8 reserved_at_3f8[0x3];
+    u8 log_max_current_uc_list[0x5];
+
+    u8 general_obj_types[0x40];
+
+    u8 reserved_at_440[0x4];
+    u8 steering_format_version[0x4];
+    u8 create_qp_start_hint[0x18];
+
+    u8 reserved_at_460[0x8];
+    u8 aes_xts[0x1];
+    u8 crypto[0x1];
+    u8 reserved_at_46a[0x6];
+    u8 max_num_eqs[0x10];
+
+    u8 sigerr_domain_and_sig_type[0x1];
+    u8 reserved_at_481[0x2];
+    u8 log_max_l2_table[0x5];
+    u8 reserved_at_488[0x8];
+    u8 log_uar_page_sz[0x10];
+
+    u8 reserved_at_4a0[0x20];
+
+    u8 device_frequency_mhz[0x20];
+
+    u8 device_frequency_khz[0x20];
+
+    u8 capi[0x1];
+    u8 create_pec[0x1];
+    u8 nvmf_target_offload[0x1];
+    u8 capi_invalidate[0x1];
+    u8 reserved_at_504[0x17];
+    u8 log_max_pasid[0x5];
+
+    u8 num_of_uars_per_page[0x20];
+
+    u8 flex_parser_protocols[0x20];
+
+    u8 reserved_at_560[0x10];
+    u8 flex_parser_header_modify[0x1];
+    u8 reserved_at_571[0x2];
+    u8 log_max_guaranteed_connections[0x5];
+    u8 reserved_at_578[0x3];
+    u8 log_max_dct_connections[0x5];
+
+    u8 log_max_atomic_size_qp[0x8];
+    u8 reserved_at_588[0x10];
+    u8 log_max_atomic_size_dc[0x8];
+
+    u8 reserved_at_5a0[0x1c];
+    u8 mini_cqe_resp_stride_index[0x1];
+    u8 cqe_128_always[0x1];
+    u8 cqe_compression_128b[0x1];
+    u8 cqe_compression[0x1];
+
+    u8 cqe_compression_timeout[0x10];
+    u8 cqe_compression_max_num[0x10];
+
+    u8 reserved_at_5e0[0x8];
+    u8 flex_parser_id_gtpu_dw_0[0x4];
+    u8 log_max_tm_offloaded_op_size[0x4];
+    u8 tag_matching[0x1];
+    u8 rndv_offload_rc[0x1];
+    u8 rndv_offload_dc[0x1];
+    u8 log_tag_matching_list_sz[0x5];
+    u8 reserved_at_5f8[0x3];
+    u8 log_max_xrq[0x5];
+
+    u8 affiliate_nic_vport_criteria[0x8];
+    u8 native_port_num[0x8];
+    u8 num_vhca_ports[0x8];
+    u8 flex_parser_id_gtpu_teid[0x4];
+    u8 reserved_at_61c[0x1];
+    u8 trusted_vnic_vhca[0x1];
+    u8 sw_owner_id[0x1];
+    u8 reserve_not_to_use[0x1];
+    u8 reserved_at_620[0x60];
+    u8 sf[0x1]; /* occupies offset 0x680 */
+    u8 reserved_at_681[0x43]; /* was reserved_at_682; the padding starts right after sf, at 0x681 */
+    u8 flex_parser_id_geneve_opt_0[0x4];
+    u8 flex_parser_id_icmp_dw1[0x4];
+    u8 flex_parser_id_icmp_dw0[0x4];
+    u8 flex_parser_id_icmpv6_dw1[0x4];
+    u8 flex_parser_id_icmpv6_dw0[0x4];
+    u8 flex_parser_id_outer_first_mpls_over_gre[0x4];
+    u8 flex_parser_id_outer_first_mpls_over_udp_label[0x4];
+
+    u8 reserved_at_6e0[0x20];
+
+    u8 flex_parser_id_gtpu_dw_2[0x4];
+    u8 flex_parser_id_gtpu_first_ext_dw_0[0x4];
+    u8 reserved_at_708[0x18];
+
+    u8 reserved_at_720[0x20];
+
+    u8 reserved_at_740[0x8];
+    u8 dma_mmo_qp[0x1];
+    u8 reserved_at_749[0x17];
+
+    u8 reserved_at_760[0x60];
+
+    u8 match_definer_format_supported[0x40];
+};
+
+struct mlx5_ifc_header_modify_cap_properties_bits { /* Header-modify capabilities: field-support bitmaps for set, add and copy actions; widths sum to 0x400. */
+    struct mlx5_ifc_flow_table_fields_supported_bits set_action_field_support;
+
+    u8 reserved_at_80[0x80];
+
+    struct mlx5_ifc_flow_table_fields_supported_bits add_action_field_support;
+
+    u8 reserved_at_180[0x80];
+
+    u8 copy_action_field_support[8][0x20]; /* 8 entries of 0x20 each, 0x100 in total */
+
+    u8 reserved_at_300[0x100];
+};
+
+struct mlx5_ifc_flow_table_fields_supported_2_bits { /* Second field-support bitmap: l3/l4 ok and checksum-ok flags plus psp_header; widths sum to 0x80. */
+    u8 reserved_at_0[0x17];
+    u8 inner_l3_ok[0x1];
+    u8 inner_l4_ok[0x1];
+    u8 outer_l3_ok[0x1];
+    u8 outer_l4_ok[0x1];
+    u8 psp_header[0x1];
+    u8 inner_ipv4_checksum_ok[0x1];
+    u8 inner_l4_checksum_ok[0x1];
+    u8 outer_ipv4_checksum_ok[0x1];
+    u8 outer_l4_checksum_ok[0x1];
+
+    u8 reserved_at_20[0x60];
+};
+
+struct mlx5_ifc_flow_table_nic_cap_bits { /* NIC flow-table capabilities: per-direction table properties, header-modify caps, field-support bitmaps and SW-steering ICM addresses; widths sum to 0x8000. */
+    u8 nic_rx_multi_path_tirs[0x1];
+    u8 nic_rx_multi_path_tirs_fts[0x1];
+    u8 allow_sniffer_and_nic_rx_shared_tir[0x1];
+    u8 reserved_at_3[0x1];
+    u8 nic_rx_flow_tag_multipath_en[0x1];
+    u8 reserved_at_5[0x13];
+    u8 nic_receive_max_steering_depth[0x8];
+
+    u8 encap_general_header[0x1];
+    u8 reserved_at_21[0xa];
+    u8 log_max_packet_reformat_context[0x5];
+    u8 reserved_at_30[0x6];
+    u8 max_encap_header_size[0xa];
+
+    u8 reserved_at_40[0x1c0];
+
+    struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive; /* offset 0x200; each prop layout is 0x200 wide */
+
+    struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_rdma;
+
+    struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_receive_sniffer;
+
+    struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit;
+
+    struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_rdma;
+
+    struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_transmit_sniffer;
+
+    u8 reserved_at_e00[0x200];
+
+    struct mlx5_ifc_header_modify_cap_properties_bits header_modify_nic_receive; /* offset 0x1000, 0x400 wide */
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_receive; /* offset 0x1400; twelve 0x80-wide bitmaps follow */
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_nic_receive;
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_receive_rdma;
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_nic_receive_rdma;
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_receive_sniffer;
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits
+        ft_field_bitmask_support_2_nic_receive_sniffer;
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_transmit;
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_nic_transmit;
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_transmit_rdma;
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_bitmask_support_2_nic_transmit_rdma;
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits ft_field_support_2_nic_transmit_sniffer;
+
+    struct mlx5_ifc_flow_table_fields_supported_2_bits
+        ft_field_bitmask_support_2_nic_transmit_sniffer;
+
+    u8 reserved_at_1a00[0x200]; /* was reserved_at_1400; the twelve bitmaps above end at offset 0x1a00 */
+
+    struct mlx5_ifc_header_modify_cap_properties_bits header_modify_nic_transmit; /* offset 0x1c00, 0x400 wide */
+
+    u8 sw_steering_nic_rx_action_drop_icm_address[0x40];
+
+    u8 sw_steering_nic_tx_action_drop_icm_address[0x40];
+
+    u8 sw_steering_nic_tx_action_allow_icm_address[0x40];
+
+    u8 reserved_at_20c0[0x5f40];
+};
+
+struct mlx5_ifc_flow_table_eswitch_cap_bits { /* E-switch flow-table capabilities: FDB/ACL table properties and SW-steering ICM addresses; widths sum to 0x8000. */
+    u8 reserved_at_0[0x1c];
+    u8 fdb_multi_path_to_table[0x1];
+    u8 reserved_at_1d[0x1e3];
+
+    struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_nic_esw_fdb; /* offset 0x200; each prop layout is 0x200 wide */
+
+    struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_ingress;
+
+    struct mlx5_ifc_flow_table_prop_layout_bits flow_table_properties_esw_acl_egress;
+
+    u8 reserved_at_800[0x1000];
+
+    u8 sw_steering_fdb_action_drop_icm_address_rx[0x40];
+    u8 sw_steering_fdb_action_drop_icm_address_tx[0x40];
+    u8 sw_steering_uplink_icm_address_rx[0x40];
+    u8 sw_steering_uplink_icm_address_tx[0x40];
+
+    u8 reserved_at_1900[0x6700];
+};
+
+struct mlx5_ifc_odp_per_transport_service_cap_bits { /* Per-transport ODP capability flags, one 1-wide flag per operation; widths sum to 0x20. */
+    u8 send[0x1];
+    u8 receive[0x1];
+    u8 write[0x1];
+    u8 read[0x1];
+    u8 atomic[0x1];
+    u8 srq_receive[0x1];
+    u8 reserved_at_6[0x1a];
+};
+
+struct mlx5_ifc_odp_cap_bits { /* ODP capability layout: a 'sig' flag plus one per-transport capability word for RC, UC, UD, XRC and DC; widths sum to 0x800. */
+    u8 reserved_at_0[0x40];
+
+    u8 sig[0x1];
+    u8 reserved_at_41[0x1f];
+
+    u8 reserved_at_60[0x20];
+
+    struct mlx5_ifc_odp_per_transport_service_cap_bits rc_odp_caps; /* offset 0x80; each entry is 0x20 wide */
+
+    struct mlx5_ifc_odp_per_transport_service_cap_bits uc_odp_caps;
+
+    struct mlx5_ifc_odp_per_transport_service_cap_bits ud_odp_caps;
+
+    struct mlx5_ifc_odp_per_transport_service_cap_bits xrc_odp_caps;
+
+    struct mlx5_ifc_odp_per_transport_service_cap_bits dc_odp_caps;
+
+    u8 reserved_at_120[0x6e0];
+};
+
+struct mlx5_ifc_e_switch_cap_bits { /* E-switch capability layout; only the two esw_sf fields are named here, the rest is reserved; widths sum to 0x800. */
+    u8 reserved_at_0[0x4b];
+    u8 log_max_esw_sf[0x5];
+    u8 esw_sf_base_id[0x10];
+    u8 reserved_at_60[0x7a0];
+};
+
+enum { /* Single-bit masks; presumably tested against the nic_element_type QoS capability field - confirm with callers. */
+    ELEMENT_TYPE_CAP_MASK_TASR = 1 << 0, /* NOTE(review): "TASR" looks like a transposition of "TSAR" (cf. TSAR_TYPE_CAP_MASK_DWRR); left unchanged because it is a public identifier */
+    ELEMENT_TYPE_CAP_MASK_QUEUE_GROUP = 1 << 4,
+};
+
+enum { /* Single-bit mask; presumably tested against the nic_tsar_type QoS capability field - confirm with callers. */
+    TSAR_TYPE_CAP_MASK_DWRR = 1 << 0,
+};
+
+struct mlx5_ifc_qos_cap_bits { /* QoS capability layout: NIC scheduling flags and element/TSAR type masks; widths sum to 0x800. */
+    u8 reserved_at_0[0x8];
+    u8 nic_sq_scheduling[0x1];
+    u8 nic_bw_share[0x1];
+    u8 nic_rate_limit[0x1];
+    u8 reserved_at_b[0x15];
+
+    u8 reserved_at_20[0x1];
+    u8 nic_qp_scheduling[0x1];
+    u8 reserved_at_22[0x1e];
+
+    u8 reserved_at_40[0xc0];
+
+    u8 nic_element_type[0x10];
+    u8 nic_tsar_type[0x10];
+
+    u8 reserved_at_120[0x6e0];
+};
+
+struct mlx5_ifc_cmd_hca_cap_2_bits { /* Second general capability layout; only log_reserved_qpn_granularity is named here; widths sum to 0x800. */
+    u8 reserved_at_0[0x80];
+
+    u8 reserved_at_80[0x13];
+    u8 log_reserved_qpn_granularity[0x5];
+    u8 reserved_at_98[0x8];
+
+    u8 reserved_at_a0[0x760];
+};
+
+enum { /* Presumably a mask for the wrapped_import_method crypto capability field - confirm with callers. */
+    MLX5_CRYPTO_CAPS_WRAPPED_IMPORT_METHOD_AES = 0x4,
+};
+
+struct mlx5_ifc_crypto_caps_bits { /* Crypto capability layout: wrapped-crypto state flags, log2 object limits and self-test/NV counters; widths sum to 0x800. */
+    u8 wrapped_crypto_operational[0x1];
+    u8 wrapped_crypto_going_to_commissioning[0x1];
+    u8 reserved_at_2[0x16];
+    u8 wrapped_import_method[0x8];
+
+    u8 reserved_at_20[0xb];
+    u8 log_max_num_deks[0x5];
+    u8 reserved_at_30[0x3];
+    u8 log_max_num_import_keks[0x5];
+    u8 reserved_at_38[0x3];
+    u8 log_max_num_creds[0x5];
+
+    u8 failed_selftests[0x10];
+    u8 num_nv_import_keks[0x8];
+    u8 num_nv_credentials[0x8];
+
+    u8 reserved_at_60[0x7a0];
+};
+
+union mlx5_ifc_hca_cap_union_bits { /* Overlay of every capability layout that a capability query can return. */
+    struct mlx5_ifc_atomic_caps_bits atomic_caps; /* type is declared outside this section */
+    struct mlx5_ifc_cmd_hca_cap_bits cmd_hca_cap;
+    struct mlx5_ifc_flow_table_nic_cap_bits flow_table_nic_cap;
+    struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap;
+    struct mlx5_ifc_e_switch_cap_bits e_switch_cap;
+    struct mlx5_ifc_device_mem_cap_bits device_mem_cap;
+    struct mlx5_ifc_odp_cap_bits odp_cap;
+    struct mlx5_ifc_roce_cap_bits roce_caps; /* type is declared outside this section */
+    struct mlx5_ifc_qos_cap_bits qos_caps;
+    struct mlx5_ifc_cmd_hca_cap_2_bits cmd_hca_cap_2;
+    struct mlx5_ifc_crypto_caps_bits crypto_caps;
+    u8 reserved_at_0[0x8000]; /* makes the union at least 0x8000 wide, the size of the largest layouts visible here */
+};
+
+struct mlx5_ifc_query_hca_cap_out_bits { /* Output layout for a capability query: 0x80 of status/syndrome/padding, then the capability union. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+
+    union mlx5_ifc_hca_cap_union_bits capability;
+};
+
+struct mlx5_ifc_query_hca_cap_in_bits { /* Input layout for a capability query; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 other_function[0x1];
+    u8 reserved_at_41[0xf];
+    u8 function_id[0x10];
+
+    u8 reserved_at_60[0x20];
+};
+
+enum mlx5_cap_type { /* Capability type identifiers; value 1 is intentionally or otherwise unassigned in this list. */
+    MLX5_CAP_GENERAL = 0,
+    MLX5_CAP_ODP = 2,
+    MLX5_CAP_ATOMIC = 3,
+    MLX5_CAP_ROCE, /* implicit value 4 */
+    MLX5_CAP_NUM, /* implicit value 5; by its name a count sentinel, though values are not contiguous */
+};
+
+enum { /* op_mod selectors, each a capability id shifted left by one (bit 0 is left clear by every entry). */
+    MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1,
+    MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4 << 1,
+    MLX5_SET_HCA_CAP_OP_MOD_NIC_FLOW_TABLE = 0x7 << 1,
+    MLX5_SET_HCA_CAP_OP_MOD_ESW_FLOW_TABLE = 0x8 << 1,
+    MLX5_SET_HCA_CAP_OP_MOD_QOS = 0xc << 1,
+    MLX5_SET_HCA_CAP_OP_MOD_ESW = 0x9 << 1, /* listed out of numeric order (0x9 after 0xc) */
+    MLX5_SET_HCA_CAP_OP_MOD_DEVICE_MEMORY = 0xf << 1,
+    MLX5_SET_HCA_CAP_OP_MOD_CRYPTO = 0x1a << 1,
+    MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE_CAP_2 = 0x20 << 1,
+};
+
+enum { /* Mkey access-mode values; presumably written to the access_mode_* fields of the mkey context - confirm with callers. */
+    MLX5_MKC_ACCESS_MODE_MTT = 0x1,
+    MLX5_MKC_ACCESS_MODE_KLMS = 0x2,
+};
+
+struct mlx5_ifc_mkc_bits { /* Memory key context layout: access flags, QPN, PD, start address, length and translation sizes; widths sum to 0x200. */
+    u8 reserved_at_0[0x1];
+    u8 free[0x1];
+    u8 reserved_at_2[0x1];
+    u8 access_mode_4_2[0x3]; /* access mode is split in two fields; the low part is access_mode_1_0 below */
+    u8 reserved_at_6[0x7];
+    u8 relaxed_ordering_write[0x1];
+    u8 reserved_at_e[0x1];
+    u8 small_fence_on_rdma_read_response[0x1];
+    u8 umr_en[0x1];
+    u8 a[0x1];
+    u8 rw[0x1];
+    u8 rr[0x1];
+    u8 lw[0x1];
+    u8 lr[0x1];
+    u8 access_mode_1_0[0x2];
+    u8 reserved_at_18[0x8];
+
+    u8 qpn[0x18];
+    u8 mkey_7_0[0x8];
+
+    u8 reserved_at_40[0x20];
+
+    u8 length64[0x1];
+    u8 bsf_en[0x1];
+    u8 sync_umr[0x1];
+    u8 reserved_at_63[0x2];
+    u8 expected_sigerr_count[0x1];
+    u8 reserved_at_66[0x1];
+    u8 en_rinval[0x1];
+    u8 pd[0x18];
+
+    u8 start_addr[0x40];
+
+    u8 len[0x40];
+
+    u8 bsf_octword_size[0x20];
+
+    u8 reserved_at_120[0x80];
+
+    u8 translations_octword_size[0x20];
+
+    u8 reserved_at_1c0[0x19];
+    u8 relaxed_ordering_read[0x1]; /* occupies offset 0x1d9 */
+    u8 reserved_at_1da[0x1]; /* was reserved_at_1d9; relaxed_ordering_read already occupies 0x1d9 */
+    u8 log_page_size[0x5];
+
+    u8 reserved_at_1e0[0x3];
+    u8 crypto_en[0x2];
+    u8 reserved_at_1e5[0x1b];
+};
+
+struct mlx5_ifc_create_mkey_out_bits { /* Output layout for a create-mkey command: status, syndrome and the resulting mkey_index; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x8];
+    u8 mkey_index[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_mkey_in_bits { /* Input layout for a create-mkey command: 0x80 header, the mkey context, then a variable-length translation array; fixed part is 0x880 wide. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x20];
+
+    u8 pg_access[0x1];
+    u8 mkey_umem_valid[0x1];
+    u8 reserved_at_62[0x1e];
+
+    struct mlx5_ifc_mkc_bits memory_key_mkey_entry; /* offset 0x80, 0x200 wide */
+
+    u8 reserved_at_280[0x80];
+
+    u8 translations_octword_actual_size[0x20];
+
+    u8 reserved_at_320[0x560];
+
+    u8 klm_pas_mtt[0][0x20]; /* zero-length trailing array of 0x20-wide entries; actual count is supplied by the caller */
+};
+
+struct mlx5_ifc_destroy_mkey_out_bits { /* Output layout of the destroy-mkey command; widths (in bits) sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_destroy_mkey_in_bits { /* Input layout of the destroy-mkey command; widths (in bits) sum to 0x80. */
+    u8 opcode[0x10];
+    u8 uid[0x10]; /* unlike create_mkey_in, bits 0x10..0x1f are a named uid field here rather than reserved */
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 mkey_index[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_l2_hdr_bits { /* L2 header layout: destination MAC, source MAC, ethertype and VLAN fields; widths (in bits) sum to 0x90. */
+    u8 dmac_47_16[0x20];
+    u8 dmac_15_0[0x10];
+    u8 smac_47_32[0x10]; /* source MAC is split 16/32 (47:32, 31:0) while the destination MAC is split 32/16 */
+    u8 smac_31_0[0x20];
+    u8 ethertype[0x10];
+    u8 vlan_type[0x10];
+    u8 vlan[0x10];
+};
+
+enum { /* Flow-table type codes, numbered consecutively 0..6. */
+    FS_FT_NIC_RX = 0x0,
+    FS_FT_NIC_TX = 0x1,
+    FS_FT_ESW_EGRESS_ACL = 0x2,
+    FS_FT_ESW_INGRESS_ACL = 0x3,
+    FS_FT_FDB = 0X4, /* upper-case 0X prefix: same hex literal as 0x4, only the spelling differs from the entries above */
+    FS_FT_SNIFFER_RX = 0X5,
+    FS_FT_SNIFFER_TX = 0X6,
+};
+
+struct mlx5_ifc_ste_general_bits { /* Generic STE layout: a 0x60-bit control prefix (shared with the sx_transmit / rx_steering_mult / modify_packet layouts), then reserved space, tag and mask. Widths (in bits) sum to 0x1c0. */
+    u8 entry_type[0x4];
+    u8 reserved_at_4[0x4];
+    u8 entry_sub_type[0x8];
+    u8 byte_mask[0x10];
+    u8 next_table_base_63_48[0x10];
+    u8 next_lu_type[0x8];
+    u8 next_table_base_39_32_size[0x8];
+    u8 next_table_base_31_5_size[0x1b];
+    u8 linear_hash_enable[0x1];
+    u8 reserved_at_5c[0x2];
+    u8 next_table_rank[0x2];
+    u8 reserved_at_60[0xa0]; /* covers the 0xa0 bits that the type-specific STE layouts fill with their own fields */
+    u8 tag_value[0x60];
+    u8 bit_mask[0x60];
+};
+
+struct mlx5_ifc_ste_sx_transmit_bits { /* STE layout for the "sx transmit" entry: common 0x60-bit prefix followed by transmit-specific control fields. Widths (in bits) sum to 0x100. */
+    u8 entry_type[0x4];
+    u8 reserved_at_4[0x4];
+    u8 entry_sub_type[0x8];
+    u8 byte_mask[0x10];
+
+    u8 next_table_base_63_48[0x10];
+    u8 next_lu_type[0x8];
+    u8 next_table_base_39_32_size[0x8];
+
+    u8 next_table_base_31_5_size[0x1b];
+    u8 linear_hash_enable[0x1];
+    u8 reserved_at_5c[0x2];
+    u8 next_table_rank[0x2];
+
+    u8 sx_wire[0x1];
+    u8 sx_func_lb[0x1];
+    u8 sx_sniffer[0x1];
+    u8 sx_wire_enable[0x1];
+    u8 sx_func_lb_enable[0x1];
+    u8 sx_sniffer_enable[0x1];
+    u8 action_type[0x3];
+    u8 reserved_at_69[0x1];
+    u8 action_description[0x6];
+    u8 gvmi[0x10];
+
+    u8 encap_pointer_vlan_data[0x20];
+
+    u8 loopback_syndome_en[0x8]; /* "syndome" spelling is part of the identifier; left unchanged so existing users keep compiling */
+    u8 loopback_syndome[0x8];
+    u8 counter_trigger[0x10]; /* the rx_steering_mult / modify_packet layouts name the field at this offset counter_trigger_15_0 */
+
+    u8 miss_address_63_48[0x10];
+    u8 counter_trigger_23_16[0x8];
+    u8 miss_address_39_32[0x8];
+
+    u8 miss_address_31_6[0x1a];
+    u8 learning_point[0x1];
+    u8 go_back[0x1];
+    u8 match_polarity[0x1];
+    u8 mask_mode[0x1];
+    u8 miss_rank[0x2];
+};
+
+struct mlx5_ifc_ste_rx_steering_mult_bits { /* STE layout for the "rx steering mult" entry: common 0x60-bit prefix, then member count / QP list pointer and miss handling. Widths (in bits) sum to 0x100. */
+    u8 entry_type[0x4];
+    u8 reserved_at_4[0x4];
+    u8 entry_sub_type[0x8];
+    u8 byte_mask[0x10];
+
+    u8 next_table_base_63_48[0x10];
+    u8 next_lu_type[0x8];
+    u8 next_table_base_39_32_size[0x8];
+
+    u8 next_table_base_31_5_size[0x1b];
+    u8 linear_hash_enable[0x1];
+    u8 reserved_at_5c[0x2];
+    u8 next_table_rank[0x2];
+
+    u8 member_count[0x10];
+    u8 gvmi[0x10];
+
+    u8 qp_list_pointer[0x20];
+
+    u8 reserved_at_a0[0x1];
+    u8 tunneling_action[0x3];
+    u8 action_description[0x4];
+    u8 reserved_at_a8[0x8];
+    u8 counter_trigger_15_0[0x10];
+
+    u8 miss_address_63_48[0x10];
+    u8 counter_trigger_23_16[0x08];
+    u8 miss_address_39_32[0x8];
+
+    u8 miss_address_31_6[0x1a];
+    u8 learning_point[0x1];
+    u8 fail_on_error[0x1]; /* the sx_transmit layout has go_back at this bit position */
+    u8 match_polarity[0x1];
+    u8 mask_mode[0x1];
+    u8 miss_rank[0x2];
+};
+
+struct mlx5_ifc_ste_modify_packet_bits { /* STE layout for the "modify packet" entry; field-for-field the same shape as rx_steering_mult except for the two header-rewrite fields. Widths (in bits) sum to 0x100. */
+    u8 entry_type[0x4];
+    u8 reserved_at_4[0x4];
+    u8 entry_sub_type[0x8];
+    u8 byte_mask[0x10];
+
+    u8 next_table_base_63_48[0x10];
+    u8 next_lu_type[0x8];
+    u8 next_table_base_39_32_size[0x8];
+
+    u8 next_table_base_31_5_size[0x1b];
+    u8 linear_hash_enable[0x1];
+    u8 reserved_at_5c[0x2];
+    u8 next_table_rank[0x2];
+
+    u8 number_of_re_write_actions[0x10]; /* takes the slot that member_count has in rx_steering_mult */
+    u8 gvmi[0x10];
+
+    u8 header_re_write_actions_pointer[0x20]; /* takes the slot that qp_list_pointer has in rx_steering_mult */
+
+    u8 reserved_at_a0[0x1];
+    u8 tunneling_action[0x3];
+    u8 action_description[0x4];
+    u8 reserved_at_a8[0x8];
+    u8 counter_trigger_15_0[0x10];
+
+    u8 miss_address_63_48[0x10];
+    u8 counter_trigger_23_16[0x08];
+    u8 miss_address_39_32[0x8];
+
+    u8 miss_address_31_6[0x1a];
+    u8 learning_point[0x1];
+    u8 fail_on_error[0x1];
+    u8 match_polarity[0x1];
+    u8 mask_mode[0x1];
+    u8 miss_rank[0x2];
+};
+
+struct mlx5_ifc_ste_single_action_flow_tag_v1_bits { /* Single (0x20-bit) v1 action: 8-bit action id followed by a 0x18-bit flow tag. */
+    u8 action_id[0x8];
+    u8 flow_tag[0x18];
+};
+
+struct mlx5_ifc_ste_single_action_modify_list_v1_bits { /* Single (0x20-bit) v1 action: action id, count of modify actions, and a 0x10-bit pointer to them. */
+    u8 action_id[0x8];
+    u8 num_of_modify_actions[0x8];
+    u8 modify_actions_ptr[0x10];
+};
+
+struct mlx5_ifc_ste_single_action_remove_header_v1_bits { /* Single (0x20-bit) v1 action describing a header removal between a start anchor and an end anchor. */
+    u8 action_id[0x8];
+    u8 reserved_at_8[0x2];
+    u8 start_anchor[0x6];
+    u8 reserved_at_10[0x2];
+    u8 end_anchor[0x6];
+    u8 reserved_at_18[0x4];
+    u8 decap[0x1];
+    u8 vni_to_cqe[0x1];
+    u8 qos_profile[0x2];
+};
+
+struct mlx5_ifc_ste_single_action_remove_header_size_v1_bits { /* Single (0x20-bit) v1 action describing a header removal by start anchor, offset and size. */
+    u8 action_id[0x8];
+    u8 reserved_at_8[0x2];
+    u8 start_anchor[0x6];
+    u8 outer_l4_remove[0x1];
+    u8 reserved_at_11[0x1];
+    u8 start_offset[0x7]; /* occupies bits 0x12..0x18 */
+    u8 reserved_at_18[0x1]; /* NOTE: actually sits at bit 0x19; the name is off by one but is kept as-is */
+    u8 remove_size[0x6];
+};
+
+struct mlx5_ifc_ste_double_action_copy_v1_bits { /* Double (0x40-bit) v1 copy action: first dword describes the destination, second dword the source. */
+    u8 action_id[0x8];
+    u8 destination_dw_offset[0x8];
+    u8 reserved_at_10[0x2];
+    u8 destination_left_shifter[0x6];
+    u8 reserved_at_18[0x2];
+    u8 destination_length[0x6];
+
+    u8 reserved_at_20[0x8];
+    u8 source_dw_offset[0x8];
+    u8 reserved_at_30[0x2];
+    u8 source_right_shifter[0x6];
+    u8 reserved_at_38[0x8];
+};
+
+struct mlx5_ifc_ste_double_action_set_v1_bits { /* Double (0x40-bit) v1 set action: destination descriptor dword followed by 0x20 bits of inline data. */
+    u8 action_id[0x8];
+    u8 destination_dw_offset[0x8];
+    u8 reserved_at_10[0x2];
+    u8 destination_left_shifter[0x6];
+    u8 reserved_at_18[0x2];
+    u8 destination_length[0x6];
+
+    u8 inline_data[0x20];
+};
+
+struct mlx5_ifc_ste_double_action_add_v1_bits { /* Double (0x40-bit) v1 add action: same first dword as the set action, followed by a 0x20-bit add value. */
+    u8 action_id[0x8];
+    u8 destination_dw_offset[0x8];
+    u8 reserved_at_10[0x2];
+    u8 destination_left_shifter[0x6];
+    u8 reserved_at_18[0x2];
+    u8 destination_length[0x6];
+
+    u8 add_value[0x20];
+};
+
+struct mlx5_ifc_ste_double_action_insert_with_inline_v1_bits { /* Double (0x40-bit) v1 insert action whose payload is carried inline in the second dword. */
+    u8 action_id[0x8];
+    u8 reserved_at_8[0x2];
+    u8 start_anchor[0x6];
+    u8 start_offset[0x7];
+    u8 reserved_at_17[0x9];
+
+    u8 inline_data[0x20];
+};
+
+struct mlx5_ifc_ste_double_action_insert_with_ptr_v1_bits { /* Double (0x40-bit) v1 insert action whose payload is referenced through a 0x20-bit pointer. */
+    u8 action_id[0x8];
+    u8 reserved_at_8[0x2];
+    u8 start_anchor[0x6];
+    u8 start_offset[0x7];
+    u8 size[0x6]; /* size and attributes fill the 9 bits that are reserved in the insert_with_inline layout */
+    u8 attributes[0x3];
+
+    u8 pointer[0x20];
+};
+
+struct mlx5_ifc_ste_double_action_modify_action_list_v1_bits { /* Double (0x40-bit) v1 action referencing a modify-action list through separate pattern and argument pointers. */
+    u8 action_id[0x8];
+    u8 modify_actions_pattern_pointer[0x18];
+
+    u8 number_of_modify_actions[0x8];
+    u8 modify_actions_argument_pointer[0x18];
+};
+
+enum { /* ASO flow-meter initial colors; the four values fit in 2 bits, matching the width of initial_color in ste_aso_flow_meter_action_v1 (presumably their consumer -- TODO confirm). */
+    MLX5_IFC_ASO_FLOW_METER_INITIAL_COLOR_RED = 0x0,
+    MLX5_IFC_ASO_FLOW_METER_INITIAL_COLOR_YELLOW = 0x1,
+    MLX5_IFC_ASO_FLOW_METER_INITIAL_COLOR_GREEN = 0x2,
+    MLX5_IFC_ASO_FLOW_METER_INITIAL_COLOR_UNDEFINED = 0x3,
+};
+
+enum { /* ASO connection-tracking directions; both values fit the 1-bit direction field of ste_aso_ct_action_v1 (presumably their consumer -- TODO confirm). */
+    MLX5_IFC_ASO_CT_DIRECTION_INITIATOR = 0x0,
+    MLX5_IFC_ASO_CT_DIRECTION_RESPONDER = 0x1,
+};
+
+struct mlx5_ifc_ste_aso_first_hit_action_v1_bits { /* First-hit variant of the 0x10-bit ASO-specific field group (a member of the union in ste_double_action_aso_v1). */
+    u8 reserved_at_0[0x6];
+    u8 set[0x1];
+    u8 line_id[0x9];
+};
+
+struct mlx5_ifc_ste_aso_flow_meter_action_v1_bits { /* Flow-meter variant of the 0x10-bit ASO-specific field group (a member of the union in ste_double_action_aso_v1). */
+    u8 reserved_at_0[0xc];
+    u8 action[0x1];
+    u8 initial_color[0x2];
+    u8 line_id[0x1]; /* only 1 bit wide here, versus 9 bits in the first-hit variant */
+};
+
+struct mlx5_ifc_ste_aso_ct_action_v1_bits { /* Connection-tracking variant of the 0x10-bit ASO-specific field group: only the last bit (direction) is defined. */
+    u8 reserved_at_0[0xf];
+    u8 direction[0x1];
+};
+
+struct mlx5_ifc_ste_double_action_aso_v1_bits { /* Double (0x40-bit) v1 ASO action: common header in the first 0x30 bits, then a 0x10-bit type-specific tail. */
+    u8 action_id[0x8];
+    u8 aso_context_number[0x18];
+
+    u8 dest_reg_id[0x2];
+    u8 change_ordering_tag[0x1];
+    u8 aso_check_ordering[0x1];
+    u8 aso_context_type[0x4];
+    u8 reserved_at_28[0x8];
+    union { /* anonymous union: all four members are 0x10 bits wide and overlay the same trailing bits */
+        u8 aso_fields[0x10]; /* untyped view of the tail */
+        struct mlx5_ifc_ste_aso_first_hit_action_v1_bits first_hit;
+        struct mlx5_ifc_ste_aso_flow_meter_action_v1_bits flow_meter;
+        struct mlx5_ifc_ste_aso_ct_action_v1_bits ct;
+    };
+};
+
+struct mlx5_ifc_ste_match_bwc_v1_bits { /* v1 STE control layout ("match bwc" format): miss address, next-table base, byte mask and a 0x40-bit action area. Widths (in bits) sum to 0x100. */
+    u8 entry_format[0x8];
+    u8 counter_id[0x18];
+
+    u8 miss_address_63_48[0x10];
+    u8 match_definer_ctx_idx[0x8];
+    u8 miss_address_39_32[0x8];
+
+    u8 miss_address_31_6[0x1a];
+    u8 reserved_at_5a[0x1];
+    u8 match_polarity[0x1];
+    u8 reparse[0x1];
+    u8 reserved_at_5d[0x3];
+
+    u8 next_table_base_63_48[0x10];
+    u8 hash_definer_ctx_idx[0x8];
+    u8 next_table_base_39_32_size[0x8];
+
+    u8 next_table_base_31_5_size[0x1b];
+    u8 hash_type[0x2];
+    u8 hash_after_actions[0x1];
+    u8 reserved_at_9e[0x2];
+
+    u8 byte_mask[0x10]; /* this dword (byte_mask .. gvmi) is what distinguishes this layout from mask_and_match_v1 */
+    u8 next_entry_format[0x1];
+    u8 mask_mode[0x1];
+    u8 gvmi[0xe];
+
+    u8 action[0x40];
+};
+
+struct mlx5_ifc_ste_mask_and_match_v1_bits { /* v1 STE control layout ("mask and match" format): identical to match_bwc_v1 for the first 0xa0 bits, then a 0x60-bit action area. Widths (in bits) sum to 0x100. */
+    u8 entry_format[0x8];
+    u8 counter_id[0x18];
+
+    u8 miss_address_63_48[0x10];
+    u8 match_definer_ctx_idx[0x8];
+    u8 miss_address_39_32[0x8];
+
+    u8 miss_address_31_6[0x1a];
+    u8 reserved_at_5a[0x1];
+    u8 match_polarity[0x1];
+    u8 reparse[0x1];
+    u8 reserved_at_5d[0x3];
+
+    u8 next_table_base_63_48[0x10];
+    u8 hash_definer_ctx_idx[0x8];
+    u8 next_table_base_39_32_size[0x8];
+
+    u8 next_table_base_31_5_size[0x1b];
+    u8 hash_type[0x2];
+    u8 hash_after_actions[0x1];
+    u8 reserved_at_9e[0x2];
+
+    u8 action[0x60]; /* 0x20 bits larger than in match_bwc_v1, which spends that dword on byte_mask/gvmi */
+};
+
+struct mlx5_ifc_ste_eth_l2_src_bits { /* STE lookup tag keyed on source MAC plus VLAN / L3 / L4 qualifiers; widths (in bits) sum to 0x80. */
+    u8 smac_47_16[0x20];
+
+    u8 smac_15_0[0x10];
+    u8 l3_ethertype[0x10];
+
+    u8 qp_type[0x2];
+    u8 ethertype_filter[0x1];
+    u8 reserved_at_43[0x1];
+    u8 sx_sniffer[0x1];
+    u8 force_lb[0x1];
+    u8 functional_lb[0x1];
+    u8 port[0x1];
+    u8 reserved_at_48[0x4];
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vlan_qualifier[0x2];
+    u8 reserved_at_52[0x2];
+    u8 first_vlan_id[0xc];
+
+    u8 ip_fragmented[0x1];
+    u8 tcp_syn[0x1];
+    u8 encp_type[0x2];
+    u8 l3_type[0x2];
+    u8 l4_type[0x2];
+    u8 reserved_at_68[0x4];
+    u8 second_priority[0x3];
+    u8 second_cfi[0x1];
+    u8 second_vlan_qualifier[0x2];
+    u8 reserved_at_72[0x2];
+    u8 second_vlan_id[0xc];
+};
+
+struct mlx5_ifc_ste_eth_l2_src_v1_bits { /* v1 variant of the source-MAC lookup tag: qualifier dword comes first, MAC second; widths (in bits) sum to 0x80. */
+    u8 reserved_at_0[0x1];
+    u8 sx_sniffer[0x1];
+    u8 functional_loopback[0x1]; /* the parallel eth_l2_dst_v1 layout names this bit functional_lb */
+    u8 ip_fragmented[0x1];
+    u8 qp_type[0x2];
+    u8 encapsulation_type[0x2];
+    u8 port[0x2]; /* 2 bits here, versus 1 bit in the non-v1 layout */
+    u8 l3_type[0x2];
+    u8 l4_type[0x2];
+    u8 first_vlan_qualifier[0x2];
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vlan_id[0xc];
+
+    u8 smac_47_16[0x20];
+
+    u8 smac_15_0[0x10];
+    u8 l3_ethertype[0x10];
+
+    u8 reserved_at_60[0x6];
+    u8 tcp_syn[0x1];
+    u8 reserved_at_67[0x3];
+    u8 force_loopback[0x1]; /* the parallel eth_l2_dst_v1 layout names this bit force_lb */
+    u8 l2_ok[0x1];
+    u8 l3_ok[0x1];
+    u8 l4_ok[0x1];
+    u8 second_vlan_qualifier[0x2];
+    u8 second_priority[0x3];
+    u8 second_cfi[0x1];
+    u8 second_vlan_id[0xc];
+};
+
+struct mlx5_ifc_ste_eth_l2_dst_bits { /* STE lookup tag keyed on destination MAC; same shape as eth_l2_src with dmac_* in place of smac_*. Widths (in bits) sum to 0x80. */
+    u8 dmac_47_16[0x20];
+
+    u8 dmac_15_0[0x10];
+    u8 l3_ethertype[0x10];
+
+    u8 qp_type[0x2];
+    u8 ethertype_filter[0x1];
+    u8 reserved_at_43[0x1];
+    u8 sx_sniffer[0x1];
+    u8 force_lb[0x1];
+    u8 functional_lb[0x1];
+    u8 port[0x1];
+    u8 reserved_at_48[0x4];
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vlan_qualifier[0x2];
+    u8 reserved_at_52[0x2];
+    u8 first_vlan_id[0xc];
+
+    u8 ip_fragmented[0x1];
+    u8 tcp_syn[0x1];
+    u8 encp_type[0x2];
+    u8 l3_type[0x2];
+    u8 l4_type[0x2];
+    u8 reserved_at_68[0x4];
+    u8 second_priority[0x3];
+    u8 second_cfi[0x1];
+    u8 second_vlan_qualifier[0x2];
+    u8 reserved_at_72[0x2];
+    u8 second_vlan_id[0xc];
+};
+
+struct mlx5_ifc_ste_eth_l2_dst_v1_bits { /* v1 variant of the destination-MAC lookup tag; same bit positions as eth_l2_src_v1 with dmac_* in place of smac_*. Widths (in bits) sum to 0x80. */
+    u8 reserved_at_0[0x1];
+    u8 sx_sniffer[0x1];
+    u8 functional_lb[0x1]; /* eth_l2_src_v1 spells the bit at this position functional_loopback */
+    u8 ip_fragmented[0x1];
+    u8 qp_type[0x2];
+    u8 encapsulation_type[0x2];
+    u8 port[0x2];
+    u8 l3_type[0x2];
+    u8 l4_type[0x2];
+    u8 first_vlan_qualifier[0x2];
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vlan_id[0xc];
+
+    u8 dmac_47_16[0x20];
+
+    u8 dmac_15_0[0x10];
+    u8 l3_ethertype[0x10];
+
+    u8 reserved_at_60[0x6];
+    u8 tcp_syn[0x1];
+    u8 reserved_at_67[0x3];
+    u8 force_lb[0x1]; /* eth_l2_src_v1 spells the bit at this position force_loopback */
+    u8 l2_ok[0x1];
+    u8 l3_ok[0x1];
+    u8 l4_ok[0x1];
+    u8 second_vlan_qualifier[0x2];
+    u8 second_priority[0x3];
+    u8 second_cfi[0x1];
+    u8 second_vlan_id[0xc];
+};
+
+struct mlx5_ifc_ste_eth_l2_src_dst_bits { /* STE lookup tag carrying both MAC addresses plus first-VLAN qualifiers; widths (in bits) sum to 0x80. */
+    u8 dmac_47_16[0x20];
+
+    u8 dmac_15_0[0x10];
+    u8 smac_47_32[0x10]; /* source MAC is split 16/32 here so that both addresses pack into three dwords */
+
+    u8 smac_31_0[0x20];
+
+    u8 sx_sniffer[0x1];
+    u8 force_lb[0x1];
+    u8 functional_lb[0x1];
+    u8 port[0x1];
+    u8 l3_type[0x2];
+    u8 reserved_at_66[0x6];
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vlan_qualifier[0x2];
+    u8 reserved_at_72[0x2];
+    u8 first_vlan_id[0xc];
+};
+
+struct mlx5_ifc_ste_eth_l2_src_dst_v1_bits { /* v1 variant of the combined source/destination MAC lookup tag; widths (in bits) sum to 0x80. */
+    u8 dmac_47_16[0x20];
+
+    u8 smac_47_16[0x20];
+
+    u8 dmac_15_0[0x10];
+    u8 reserved_at_50[0x2];
+    u8 functional_lb[0x1];
+    u8 reserved_at_53[0x5];
+    u8 port[0x2];
+    u8 l3_type[0x2];
+    u8 reserved_at_5c[0x2];
+    u8 first_vlan_qualifier[0x2];
+
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vlan_id[0xc];
+    u8 smac_15_0[0x10]; /* low source-MAC bits sit in the last half-dword, away from smac_47_16 */
+};
+
+struct mlx5_ifc_ste_eth_l3_ipv4_5_tuple_bits { /* STE lookup tag for the IPv4 5-tuple (addresses, ports, protocol) plus TCP flag bits; widths (in bits) sum to 0x80. */
+    u8 destination_address[0x20];
+
+    u8 source_address[0x20];
+
+    u8 source_port[0x10];
+    u8 destination_port[0x10];
+
+    u8 fragmented[0x1];
+    u8 first_fragment[0x1];
+    u8 reserved_at_62[0x2];
+    u8 reserved_at_64[0x1]; /* two adjacent reserved fields (0x62, 0x64) are declared separately in the original layout */
+    u8 ecn[0x2];
+    u8 tcp_ns[0x1];
+    u8 tcp_cwr[0x1];
+    u8 tcp_ece[0x1];
+    u8 tcp_urg[0x1];
+    u8 tcp_ack[0x1];
+    u8 tcp_psh[0x1];
+    u8 tcp_rst[0x1];
+    u8 tcp_syn[0x1];
+    u8 tcp_fin[0x1];
+    u8 dscp[0x6];
+    u8 reserved_at_76[0x2];
+    u8 protocol[0x8];
+};
+
+struct mlx5_ifc_ste_eth_l3_ipv4_5_tuple_v1_bits { /* v1 variant of the IPv4 5-tuple lookup tag; widths (in bits) sum to 0x80. */
+    u8 source_address[0x20]; /* source comes before destination here, the reverse of the non-v1 layout */
+
+    u8 destination_address[0x20];
+
+    u8 source_port[0x10];
+    u8 destination_port[0x10];
+
+    u8 reserved_at_60[0x4];
+    u8 l4_ok[0x1];
+    u8 l3_ok[0x1];
+    u8 fragmented[0x1];
+    u8 tcp_ns[0x1];
+    u8 tcp_cwr[0x1];
+    u8 tcp_ece[0x1];
+    u8 tcp_urg[0x1];
+    u8 tcp_ack[0x1];
+    u8 tcp_psh[0x1];
+    u8 tcp_rst[0x1];
+    u8 tcp_syn[0x1];
+    u8 tcp_fin[0x1];
+    u8 dscp[0x6];
+    u8 ecn[0x2]; /* ecn follows dscp here; in the non-v1 layout it precedes the TCP flags */
+    u8 protocol[0x8];
+};
+
+struct mlx5_ifc_ste_eth_l3_ipv6_dst_bits { /* STE lookup tag holding a 128-bit IPv6 destination address as four 0x20-bit pieces, most-significant first. */
+    u8 dst_ip_127_96[0x20];
+
+    u8 dst_ip_95_64[0x20];
+
+    u8 dst_ip_63_32[0x20];
+
+    u8 dst_ip_31_0[0x20];
+};
+
+struct mlx5_ifc_ste_eth_l2_tnl_bits { /* STE lookup tag for tunneled L2: destination MAC, tunneling network id and first-VLAN qualifiers; widths (in bits) sum to 0x80. */
+    u8 dmac_47_16[0x20];
+
+    u8 dmac_15_0[0x10];
+    u8 l3_ethertype[0x10];
+
+    u8 l2_tunneling_network_id[0x20];
+
+    u8 ip_fragmented[0x1];
+    u8 tcp_syn[0x1];
+    u8 encp_type[0x2];
+    u8 l3_type[0x2];
+    u8 l4_type[0x2];
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 reserved_at_6c[0x3];
+    u8 gre_key_flag[0x1];
+    u8 first_vlan_qualifier[0x2];
+    u8 reserved_at_72[0x2];
+    u8 first_vlan_id[0xc];
+};
+
+struct mlx5_ifc_ste_eth_l2_tnl_v1_bits { /* v1 variant of the tunneled-L2 lookup tag: the network id moves to the first dword; widths (in bits) sum to 0x80. */
+    u8 l2_tunneling_network_id[0x20];
+
+    u8 dmac_47_16[0x20];
+
+    u8 dmac_15_0[0x10];
+    u8 l3_ethertype[0x10];
+
+    u8 reserved_at_60[0x3];
+    u8 ip_fragmented[0x1];
+    u8 reserved_at_64[0x2];
+    u8 encp_type[0x2];
+    u8 reserved_at_68[0x2];
+    u8 l3_type[0x2];
+    u8 l4_type[0x2];
+    u8 first_vlan_qualifier[0x2];
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vlan_id[0xc];
+};
+
+struct mlx5_ifc_ste_eth_l3_ipv6_src_bits { /* STE lookup tag holding a 128-bit IPv6 source address as four 0x20-bit pieces, most-significant first. */
+    u8 src_ip_127_96[0x20];
+
+    u8 src_ip_95_64[0x20];
+
+    u8 src_ip_63_32[0x20];
+
+    u8 src_ip_31_0[0x20];
+};
+
+struct mlx5_ifc_ste_eth_l3_ipv4_misc_bits { /* STE lookup tag for the remaining IPv4 header fields (version, IHL, lengths, id, flags, TTL, checksum); widths (in bits) sum to 0x80. */
+    u8 version[0x4];
+    u8 ihl[0x4];
+    u8 reserved_at_8[0x8];
+    u8 total_length[0x10];
+
+    u8 identification[0x10];
+    u8 flags[0x3];
+    u8 fragment_offset[0xd];
+
+    u8 time_to_live[0x8];
+    u8 reserved_at_48[0x8];
+    u8 checksum[0x10];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_ste_eth_l3_ipv4_misc_v1_bits { /* v1 variant of the IPv4 misc lookup tag: same header fields in a different dword order; widths (in bits) sum to 0x80. */
+    u8 identification[0x10];
+    u8 flags[0x3];
+    u8 fragment_offset[0xd];
+
+    u8 total_length[0x10];
+    u8 checksum[0x10];
+
+    u8 version[0x4];
+    u8 ihl[0x4];
+    u8 time_to_live[0x8];
+    u8 reserved_at_50[0x10];
+
+    u8 reserved_at_60[0x1c];
+    u8 voq_internal_prio[0x4]; /* field with no counterpart in the non-v1 layout, whose last dword is fully reserved */
+};
+
+struct mlx5_ifc_ste_eth_l4_bits { /* STE lookup tag for L4 ports, TCP flags and IPv6 header fields; widths (in bits) sum to 0x80. */
+    u8 fragmented[0x1];
+    u8 first_fragment[0x1];
+    u8 reserved_at_2[0x6];
+    u8 protocol[0x8];
+    u8 dst_port[0x10];
+
+    u8 ipv6_version[0x4];
+    u8 reserved_at_24[0x1];
+    u8 ecn[0x2];
+    u8 tcp_ns[0x1];
+    u8 tcp_cwr[0x1];
+    u8 tcp_ece[0x1];
+    u8 tcp_urg[0x1];
+    u8 tcp_ack[0x1];
+    u8 tcp_psh[0x1];
+    u8 tcp_rst[0x1];
+    u8 tcp_syn[0x1];
+    u8 tcp_fin[0x1];
+    u8 src_port[0x10];
+
+    u8 ipv6_payload_length[0x10];
+    u8 ipv6_hop_limit[0x8];
+    u8 dscp[0x6];
+    u8 reserved_at_5e[0x2];
+
+    u8 tcp_data_offset[0x4];
+    u8 reserved_at_64[0x8];
+    u8 flow_label[0x14];
+};
+
+struct mlx5_ifc_ste_eth_l4_v1_bits { /* v1 variant of the L4 lookup tag; widths (in bits) sum to 0x80. */
+    u8 ipv6_version[0x4];
+    u8 reserved_at_4[0x4];
+    u8 dscp[0x6];
+    u8 ecn[0x2];
+    u8 ipv6_hop_limit[0x8];
+    u8 protocol[0x8];
+
+    u8 src_port[0x10];
+    u8 dst_port[0x10];
+
+    u8 first_fragment[0x1];
+    u8 reserved_at_41[0xb];
+    u8 flow_label[0x14];
+
+    u8 tcp_data_offset[0x4];
+    u8 l4_ok[0x1];
+    u8 l3_ok[0x1];
+    u8 fragmented[0x1];
+    u8 tcp_ns[0x1];
+    u8 tcp_cwr[0x1];
+    u8 tcp_ece[0x1];
+    u8 tcp_urg[0x1];
+    u8 tcp_ack[0x1];
+    u8 tcp_psh[0x1];
+    u8 tcp_rst[0x1];
+    u8 tcp_syn[0x1];
+    u8 tcp_fin[0x1];
+    u8 ipv6_paylen[0x10]; /* the non-v1 layout spells the corresponding field ipv6_payload_length */
+};
+
+struct mlx5_ifc_ste_eth_l4_misc_bits { /* STE lookup tag for additional L4 header fields (checksum, length, sequence/ack numbers, urgent pointer, window); widths (in bits) sum to 0x80. */
+    u8 checksum[0x10];
+    u8 length[0x10];
+
+    u8 seq_num[0x20];
+
+    u8 ack_num[0x20];
+
+    u8 urgent_pointer[0x10];
+    u8 window_size[0x10];
+};
+
+struct mlx5_ifc_ste_eth_l4_misc_v1_bits { /* v1 variant of the L4 misc lookup tag: the same fields as the non-v1 layout, declared in exactly the reverse order. Widths (in bits) sum to 0x80. */
+    u8 window_size[0x10];
+    u8 urgent_pointer[0x10];
+
+    u8 ack_num[0x20];
+
+    u8 seq_num[0x20];
+
+    u8 length[0x10];
+    u8 checksum[0x10];
+};
+
+struct mlx5_ifc_ste_mpls_bits { /* STE lookup tag for MPLS labels plus per-label s_bit/qualifier flags; widths (in bits) sum to 0x80. */
+    u8 mpls0_label[0x14];
+    u8 mpls0_exp[0x3];
+    u8 mpls0_s_bos[0x1];
+    u8 mpls0_ttl[0x8];
+
+    u8 mpls1_label[0x20]; /* labels 1 and 2 are declared as whole 0x20-bit words, not split into label/exp/s/ttl like label 0 */
+
+    u8 mpls2_label[0x20];
+
+    u8 reserved_at_60[0x16];
+    u8 mpls4_s_bit[0x1]; /* flag pairs are listed from label 4 down to label 0 */
+    u8 mpls4_qualifier[0x1];
+    u8 mpls3_s_bit[0x1];
+    u8 mpls3_qualifier[0x1];
+    u8 mpls2_s_bit[0x1];
+    u8 mpls2_qualifier[0x1];
+    u8 mpls1_s_bit[0x1];
+    u8 mpls1_qualifier[0x1];
+    u8 mpls0_s_bit[0x1];
+    u8 mpls0_qualifier[0x1];
+};
+
+struct mlx5_ifc_ste_mpls_v1_bits { /* v1 variant of the MPLS lookup tag: the flag dword moves to the front and gains mpls_ok; widths (in bits) sum to 0x80. */
+    u8 reserved_at_0[0x15];
+    u8 mpls_ok[0x1]; /* takes one bit from the reserved run, which is 0x16 bits in the non-v1 layout */
+    u8 mpls4_s_bit[0x1];
+    u8 mpls4_qualifier[0x1];
+    u8 mpls3_s_bit[0x1];
+    u8 mpls3_qualifier[0x1];
+    u8 mpls2_s_bit[0x1];
+    u8 mpls2_qualifier[0x1];
+    u8 mpls1_s_bit[0x1];
+    u8 mpls1_qualifier[0x1];
+    u8 mpls0_s_bit[0x1];
+    u8 mpls0_qualifier[0x1];
+
+    u8 mpls0_label[0x14];
+    u8 mpls0_exp[0x3];
+    u8 mpls0_s_bos[0x1];
+    u8 mpls0_ttl[0x8];
+
+    u8 mpls1_label[0x20];
+
+    u8 mpls2_label[0x20];
+};
+
+struct mlx5_ifc_ste_register_0_bits { /* STE lookup tag exposing registers 0 and 1, each as a high and a low 0x20-bit half; widths (in bits) sum to 0x80. */
+    u8 register_0_h[0x20];
+
+    u8 register_0_l[0x20];
+
+    u8 register_1_h[0x20];
+
+    u8 register_1_l[0x20];
+};
+
+struct mlx5_ifc_ste_register_1_bits { /* STE lookup tag exposing registers 2 and 3 (note: struct suffix is _1 but the fields are register_2/3); widths (in bits) sum to 0x80. */
+    u8 register_2_h[0x20];
+
+    u8 register_2_l[0x20];
+
+    u8 register_3_h[0x20];
+
+    u8 register_3_l[0x20];
+};
+
+struct mlx5_ifc_ste_gre_bits { /* STE lookup tag for GRE header fields (presence flags, protocol, checksum/offset, key, sequence number); widths (in bits) sum to 0x80. */
+    u8 gre_c_present[0x1];
+    u8 reserved_at_1[0x1];
+    u8 gre_k_present[0x1];
+    u8 gre_s_present[0x1];
+    u8 strict_src_route[0x1];
+    u8 recur[0x3];
+    u8 flags[0x5];
+    u8 version[0x3];
+    u8 gre_protocol[0x10];
+
+    u8 checksum[0x10];
+    u8 offset[0x10];
+
+    u8 gre_key_h[0x18]; /* the 0x20-bit key is split into a 0x18-bit high part and an 8-bit low part */
+    u8 gre_key_l[0x8];
+
+    u8 seq_num[0x20];
+};
+
+struct mlx5_ifc_ste_gre_v1_bits { /* v1 variant of the GRE lookup tag: same first and third dwords as the non-v1 layout; widths (in bits) sum to 0x80. */
+    u8 gre_c_present[0x1];
+    u8 reserved_at_1[0x1];
+    u8 gre_k_present[0x1];
+    u8 gre_s_present[0x1];
+    u8 strict_src_route[0x1];
+    u8 recur[0x3];
+    u8 flags[0x5];
+    u8 version[0x3];
+    u8 gre_protocol[0x10];
+
+    u8 reserved_at_20[0x20]; /* reserved here; the non-v1 layout has checksum/offset in this dword */
+
+    u8 gre_key_h[0x18];
+    u8 gre_key_l[0x8];
+
+    u8 reserved_at_60[0x20]; /* reserved here; the non-v1 layout has seq_num in this dword */
+};
+
+struct mlx5_ifc_ste_flex_parser_0_bits { /* STE lookup tag exposing flex parsers 0..3, one 0x20-bit word each, declared in descending index order. */
+    u8 flex_parser_3[0x20];
+
+    u8 flex_parser_2[0x20];
+
+    u8 flex_parser_1[0x20];
+
+    u8 flex_parser_0[0x20];
+};
+
+struct mlx5_ifc_ste_flex_parser_1_bits { /* STE lookup tag exposing flex parsers 4..7, one 0x20-bit word each, declared in descending index order. */
+    u8 flex_parser_7[0x20];
+
+    u8 flex_parser_6[0x20];
+
+    u8 flex_parser_5[0x20];
+
+    u8 flex_parser_4[0x20];
+};
+
+struct mlx5_ifc_ste_tunnel_header_bits { /* STE lookup tag carrying two tunnel-header dwords followed by 0x40 reserved bits; widths (in bits) sum to 0x80. */
+    u8 tunnel_header_dw0[0x20];
+
+    u8 tunnel_header_dw1[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_ste_tunnel_header_v1_bits { /* v1 variant of the tunnel-header lookup tag: same widths and offsets as the non-v1 layout; only the field names differ (tunnel_header_N vs tunnel_header_dwN). */
+    u8 tunnel_header_0[0x20];
+
+    u8 tunnel_header_1[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_ste_flex_parser_tnl_vxlan_gpe_bits { /* STE lookup tag for outer VXLAN-GPE fields (flags, next protocol, VNI); widths (in bits) sum to 0x80. */
+    u8 outer_vxlan_gpe_flags[0x8];
+    u8 reserved_at_8[0x10];
+    u8 outer_vxlan_gpe_next_protocol[0x8];
+
+    u8 outer_vxlan_gpe_vni[0x18];
+    u8 reserved_at_38[0x8];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_ste_flex_parser_tnl_geneve_bits { /* STE lookup tag for GENEVE fields (option length, OAM bit, protocol type, VNI); widths (in bits) sum to 0x80. */
+    u8 reserved_at_0[0x2];
+    u8 geneve_opt_len[0x6];
+    u8 geneve_oam[0x1];
+    u8 reserved_at_9[0x7];
+    u8 geneve_protocol_type[0x10];
+
+    u8 geneve_vni[0x18];
+    u8 reserved_at_38[0x8];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_ste_flex_parser_tnl_gtpu_bits { /* STE lookup tag for GTP-U fields (message flags, message type, TEID); widths (in bits) sum to 0x80. */
+    u8 gtpu_msg_flags[0x8];
+    u8 gtpu_msg_type[0x8];
+    u8 reserved_at_10[0x10];
+
+    u8 gtpu_teid[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_ste_general_purpose_bits { /* STE lookup tag with a single 0x20-bit general-purpose lookup field; the remaining three dwords are reserved. */
+    u8 general_purpose_lookup_field[0x20];
+
+    u8 reserved_at_20[0x20];
+
+    u8 reserved_at_40[0x20];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_ste_src_gvmi_qp_bits { /* STE lookup tag keyed on source GVMI and source QP, plus loopback indicators; widths (in bits) sum to 0x80. */
+    u8 loopback_syndrome[0x8];
+    u8 reserved_at_8[0x8];
+    u8 source_gvmi[0x10];
+
+    u8 reserved_at_20[0x5];
+    u8 force_lb[0x1];
+    u8 functional_lb[0x1];
+    u8 source_is_requestor[0x1];
+    u8 source_qp[0x18];
+
+    u8 reserved_at_40[0x20];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_ste_src_gvmi_qp_v1_bits { /* v1 variant of the source GVMI/QP lookup tag; widths (in bits) sum to 0x80. */
+    u8 loopback_synd[0x8]; /* the non-v1 layout spells this field loopback_syndrome */
+    u8 reserved_at_8[0x7];
+    u8 functional_lb[0x1]; /* lives in the first dword here; in the non-v1 layout it is in the second dword */
+    u8 source_gvmi[0x10];
+
+    u8 force_lb[0x1];
+    u8 reserved_at_21[0x1];
+    u8 source_is_requestor[0x1];
+    u8 reserved_at_23[0x5];
+    u8 source_qp[0x18];
+
+    u8 reserved_at_40[0x20];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_ste_icmp_v1_bits { /* v1 STE lookup tag for ICMP: payload data, header data, type and code; widths (in bits) sum to 0x80. */
+    u8 icmp_payload_data[0x20];
+
+    u8 icmp_header_data[0x20];
+
+    u8 icmp_type[0x8];
+    u8 icmp_code[0x8];
+    u8 reserved_at_50[0x10];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_ste_def0_v1_bits { /* v1 "def0" match layout: metadata registers c_0/c_1, both MACs, VLAN qualifiers, checksum/ok bits and TCP flags. Widths (in bits) sum to 0x100. */
+    u8 metadata_reg_c_0[0x20];
+
+    u8 metadata_reg_c_1[0x20];
+
+    u8 dmac_47_16[0x20];
+
+    u8 dmac_15_0[0x10];
+    u8 ethertype[0x10];
+
+    u8 reserved_at_60[0x1]; /* NOTE: four full dwords precede this field, so it actually starts at bit 0x80; reserved_at_* names in this struct read 0x20 low */
+    u8 sx_sniffer[0x1];
+    u8 functional_loopback[0x1];
+    u8 ip_frag[0x1];
+    u8 qp_type[0x2];
+    u8 encapsulation_type[0x2];
+    u8 port[0x2];
+    u8 outer_l3_type[0x2];
+    u8 outer_l4_type[0x2];
+    u8 first_vlan_qualifier[0x2];
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vlan_id[0xc];
+
+    u8 reserved_at_80[0xa]; /* actual start is bit 0xa0 */
+    u8 force_loopback[0x1];
+    u8 reserved_at_8b[0x3]; /* actual start is bit 0xab */
+    u8 second_vlan_qualifier[0x2];
+    u8 second_priority[0x3];
+    u8 second_cfi[0x1];
+    u8 second_vlan_id[0xc];
+
+    u8 smac_47_16[0x20];
+
+    u8 smac_15_0[0x10];
+    u8 inner_ipv4_checksum_ok[0x1];
+    u8 inner_l4_checksum_ok[0x1];
+    u8 outer_ipv4_checksum_ok[0x1];
+    u8 outer_l4_checksum_ok[0x1];
+    u8 inner_l3_ok[0x1];
+    u8 inner_l4_ok[0x1];
+    u8 outer_l3_ok[0x1];
+    u8 outer_l4_ok[0x1];
+    u8 tcp_cwr[0x1];
+    u8 tcp_ece[0x1];
+    u8 tcp_urg[0x1];
+    u8 tcp_ack[0x1];
+    u8 tcp_psh[0x1];
+    u8 tcp_rst[0x1];
+    u8 tcp_syn[0x1];
+    u8 tcp_fin[0x1];
+};
+
+struct mlx5_ifc_ste_def2_v1_bits { /* v1 "def2" match layout: metadata register a plus outer IP / L4 header fields and checksum/ok bits. Widths (in bits) sum to 0x100. */
+    u8 metadata_reg_a[0x20];
+
+    u8 outer_ip_version[0x4];
+    u8 outer_ip_ihl[0x4];
+    u8 outer_ip_dscp[0x6];
+    u8 outer_ip_ecn[0x2];
+    u8 outer_ip_ttl[0x8];
+    u8 outer_ip_protocol[0x8];
+
+    u8 outer_ip_identification[0x10];
+    u8 outer_ip_flags[0x3];
+    u8 outer_ip_fragment_offset[0xd];
+
+    u8 outer_ip_total_length[0x10];
+    u8 outer_ip_checksum[0x10];
+
+    u8 reserved_180[0xc]; /* NOTE: starts at bit 0x80 of this struct; the reserved_* names here carry an extra 0x100 and omit the usual "_at_" */
+    u8 outer_ip_flow_label[0x14];
+
+    u8 outer_eth_packet_length[0x10];
+    u8 outer_ip_payload_length[0x10];
+
+    u8 outer_l4_sport[0x10];
+    u8 outer_l4_dport[0x10];
+
+    u8 outer_data_offset[0x4];
+    u8 reserved_1e4[0x2]; /* starts at bit 0xe4 of this struct */
+    u8 outer_ip_frag[0x1];
+    u8 tcp_ns[0x1];
+    u8 tcp_cwr[0x1];
+    u8 tcp_ece[0x1];
+    u8 tcp_urg[0x1];
+    u8 tcp_ack[0x1];
+    u8 tcp_psh[0x1];
+    u8 tcp_rst[0x1];
+    u8 tcp_syn[0x1];
+    u8 tcp_fin[0x1];
+    u8 outer_ip_frag_first[0x1]; /* occupies bit 0xf0 */
+    u8 reserved_1f0[0x7]; /* starts at bit 0xf1 of this struct, one past what the name (minus 0x100) suggests */
+    u8 inner_ipv4_checksum_ok[0x1];
+    u8 inner_l4_checksum_ok[0x1];
+    u8 outer_ipv4_checksum_ok[0x1];
+    u8 outer_l4_checksum_ok[0x1];
+    u8 inner_l3_ok[0x1];
+    u8 inner_l4_ok[0x1];
+    u8 outer_l3_ok[0x1];
+    u8 outer_l4_ok[0x1];
+};
+
+struct mlx5_ifc_ste_def6_v1_bits { /* v1 "def6" match layout: 128-bit destination IPv6 address, outer L4 ports and TCP flags. Widths (in bits) sum to 0x100. */
+    u8 dst_ipv6_127_96[0x20];
+
+    u8 dst_ipv6_95_64[0x20];
+
+    u8 dst_ipv6_63_32[0x20];
+
+    u8 dst_ipv6_31_0[0x20];
+
+    u8 reserved_at_80[0x40];
+
+    u8 outer_l4_sport[0x10];
+    u8 outer_l4_dport[0x10];
+
+    u8 reserved_e0[0x4]; /* name lacks the "_at_" infix used by reserved_at_80 above, but 0xe0 is the true offset */
+    u8 l4_ok[0x1];
+    u8 l3_ok[0x1];
+    u8 ip_frag[0x1];
+    u8 tcp_ns[0x1];
+    u8 tcp_cwr[0x1];
+    u8 tcp_ece[0x1];
+    u8 tcp_urg[0x1];
+    u8 tcp_ack[0x1];
+    u8 tcp_psh[0x1];
+    u8 tcp_rst[0x1];
+    u8 tcp_syn[0x1];
+    u8 tcp_fin[0x1];
+    u8 reserved_f0[0x10];
+};
+
+struct mlx5_ifc_ste_def16_v1_bits { /* v1 "def16" match layout: four tunnel-header dwords, random number, metadata register a, outer type bits and source GVMI/SQN. Widths (in bits) sum to 0x100. */
+    u8 tunnel_header_0[0x20];
+
+    u8 tunnel_header_1[0x20];
+
+    u8 tunnel_header_2[0x20];
+
+    u8 tunnel_header_3[0x20];
+
+    u8 random_number[0x10];
+    u8 reserved_90[0x10]; /* reserved_* names in this struct omit "_at_" but do match the running bit offset */
+
+    u8 metadata_reg_a[0x20];
+
+    u8 reserved_c0[0x8];
+    u8 outer_l3_type[0x2];
+    u8 outer_l4_type[0x2];
+    u8 outer_first_vlan_type[0x2];
+    u8 reserved_ce[0x1];
+    u8 functional_lb[0x1];
+    u8 source_gvmi[0x10];
+
+    u8 force_lb[0x1];
+    u8 outer_ip_frag[0x1];
+    u8 source_is_requester[0x1]; /* spelled "requester" here; the src_gvmi_qp layouts spell it source_is_requestor */
+    u8 reserved_e3[0x5];
+    u8 source_sqn[0x18];
+};
+
+struct mlx5_ifc_ste_def22_v1_bits { /* v1 "def22" match layout: outer IP addresses and L4 ports, VLAN qualifiers, metadata register c_0 and outer MACs. Widths (in bits) sum to 0x100. */
+    u8 outer_ip_src_addr[0x20];
+
+    u8 outer_ip_dst_addr[0x20];
+
+    u8 outer_l4_sport[0x10];
+    u8 outer_l4_dport[0x10];
+
+    u8 reserved_at_40[0x1]; /* NOTE: three full dwords precede this field, so it actually starts at bit 0x60 */
+    u8 sx_sniffer[0x1];
+    u8 functional_loopback[0x1];
+    u8 outer_ip_frag[0x1];
+    u8 qp_type[0x2];
+    u8 encapsulation_type[0x2];
+    u8 port[0x2];
+    u8 outer_l3_type[0x2];
+    u8 outer_l4_type[0x2];
+    u8 first_vlan_qualifier[0x2];
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vlan_id[0xc];
+
+    u8 metadata_reg_c_0[0x20];
+
+    u8 outer_dmac_47_16[0x20];
+
+    u8 outer_smac_47_16[0x20];
+
+    u8 outer_smac_15_0[0x10];
+    u8 outer_dmac_15_0[0x10];
+};
+
+struct mlx5_ifc_ste_def24_v1_bits { /* v1 "def24" match layout: metadata registers c_0..c_3, outer IP addresses and L4 ports, and inner/outer protocol and type bits. Widths (in bits) sum to 0x100. */
+    u8 metadata_reg_c_2[0x20]; /* registers are laid out c_2, c_3, c_0, c_1 -- not in index order */
+
+    u8 metadata_reg_c_3[0x20];
+
+    u8 metadata_reg_c_0[0x20];
+
+    u8 metadata_reg_c_1[0x20];
+
+    u8 outer_ip_src_addr[0x20];
+
+    u8 outer_ip_dst_addr[0x20];
+
+    u8 outer_l4_sport[0x10];
+    u8 outer_l4_dport[0x10];
+
+    u8 inner_ip_protocol[0x8];
+    u8 inner_l3_type[0x2];
+    u8 inner_l4_type[0x2];
+    u8 inner_first_vlan_type[0x2];
+    u8 inner_ip_frag[0x1];
+    u8 functional_lb[0x1];
+    u8 outer_ip_protocol[0x8];
+    u8 outer_l3_type[0x2];
+    u8 outer_l4_type[0x2];
+    u8 outer_first_vlan_type[0x2];
+    u8 outer_ip_frag[0x1];
+    u8 functional_lb_dup[0x1]; /* "_dup" suffix suggests a second copy of functional_lb in the outer half-dword -- TODO confirm */
+};
+
+struct mlx5_ifc_ste_def25_v1_bits { /* v1 "def25" match layout: inner IP addresses and L4 ports, two tunnel-header dwords, and inner/outer type bits. Widths (in bits) sum to 0x100. */
+    u8 inner_ip_src_addr[0x20];
+
+    u8 inner_ip_dst_addr[0x20];
+
+    u8 inner_l4_sport[0x10];
+    u8 inner_l4_dport[0x10];
+
+    u8 tunnel_header_0[0x20];
+
+    u8 tunnel_header_1[0x20];
+
+    u8 reserved_at_a0[0x20];
+
+    u8 port_number_dup[0x2]; /* "_dup" suffix suggests a second copy of port_number below -- TODO confirm */
+    u8 inner_l3_type[0x2];
+    u8 inner_l4_type[0x2];
+    u8 inner_first_vlan_type[0x2];
+    u8 port_number[0x2];
+    u8 outer_l3_type[0x2];
+    u8 outer_l4_type[0x2];
+    u8 outer_first_vlan_type[0x2];
+    u8 outer_l4_dport[0x10]; /* only the outer destination port appears in this layout; there is no outer_l4_sport */
+
+    u8 reserved_at_e0[0x20];
+};
+
+struct mlx5_ifc_ste_def26_v1_bits { /* v1 "def26" match layout: 128-bit source IPv6 address, VLAN fields, source MAC, IP protocol and TCP flags. Widths (in bits) sum to 0x100. */
+    u8 src_ipv6_127_96[0x20];
+
+    u8 src_ipv6_95_64[0x20];
+
+    u8 src_ipv6_63_32[0x20];
+
+    u8 src_ipv6_31_0[0x20];
+
+    u8 reserved_at_80[0x3];
+    u8 ip_frag[0x1];
+    u8 reserved_at_84[0x6];
+    u8 l3_type[0x2];
+    u8 l4_type[0x2];
+    u8 first_vlan_type[0x2];
+    u8 first_priority[0x3];
+    u8 first_cfi[0x1];
+    u8 first_vlan_id[0xc];
+
+    u8 reserved_at_a0[0xb];
+    u8 l2_ok[0x1];
+    u8 l3_ok[0x1];
+    u8 l4_ok[0x1];
+    u8 second_vlan_type[0x2];
+    u8 second_priority[0x3];
+    u8 second_cfi[0x1];
+    u8 second_vlan_id[0xc];
+
+    u8 smac_47_16[0x20];
+
+    u8 smac_15_0[0x10];
+    u8 ip_porotcol[0x8]; /* "porotcol" spelling is part of the identifier; left unchanged so existing users keep compiling */
+    u8 tcp_cwr[0x1];
+    u8 tcp_ece[0x1];
+    u8 tcp_urg[0x1];
+    u8 tcp_ack[0x1];
+    u8 tcp_psh[0x1];
+    u8 tcp_rst[0x1];
+    u8 tcp_syn[0x1];
+    u8 tcp_fin[0x1];
+};
+
+struct mlx5_ifc_ste_def28_v1_bits { /* v1 "def28" match layout: inner and outer IP addresses and L4 ports, GTP-U TEID, and inner/outer protocol and type bits. Widths (in bits) sum to 0x100. */
+    u8 inner_l4_sport[0x10];
+    u8 inner_l4_dport[0x10];
+
+    u8 flex_gtpu_teid[0x20];
+
+    u8 inner_ip_src_addr[0x20];
+
+    u8 inner_ip_dst_addr[0x20];
+
+    u8 outer_ip_src_addr[0x20];
+
+    u8 outer_ip_dst_addr[0x20];
+
+    u8 outer_l4_sport[0x10];
+    u8 outer_l4_dport[0x10];
+
+    u8 inner_ip_protocol[0x8]; /* this final dword has the same field sequence as the final dword of ste_def24_v1 */
+    u8 inner_l3_type[0x2];
+    u8 inner_l4_type[0x2];
+    u8 inner_first_vlan_type[0x2];
+    u8 inner_ip_frag[0x1];
+    u8 functional_lb[0x1];
+    u8 outer_ip_protocol[0x8];
+    u8 outer_l3_type[0x2];
+    u8 outer_l4_type[0x2];
+    u8 outer_first_vlan_type[0x2];
+    u8 outer_ip_frag[0x1];
+    u8 functional_lb_dup[0x1];
+};
+
+struct mlx5_ifc_set_action_in_bits { /* Modify-header "set" action: target field, bit offset and length within it, and 0x20 bits of data. Widths (in bits) sum to 0x40. */
+    u8 action_type[0x4];
+    u8 field[0xc];
+    u8 reserved_at_10[0x3];
+    u8 offset[0x5];
+    u8 reserved_at_18[0x3];
+    u8 length[0x5];
+
+    u8 data[0x20];
+};
+
+struct mlx5_ifc_add_action_in_bits { /* Modify-header "add" action: target field and 0x20 bits of data; unlike the set action there are no offset/length fields. Widths (in bits) sum to 0x40. */
+    u8 action_type[0x4];
+    u8 field[0xc];
+    u8 reserved_at_10[0x10];
+
+    u8 data[0x20];
+};
+
+struct mlx5_ifc_copy_action_in_bits { /* Modify-header "copy" action: first dword describes the source (field, offset, length), second dword the destination (field, offset). Widths (in bits) sum to 0x40. */
+    u8 action_type[0x4];
+    u8 src_field[0xc];
+    u8 reserved_at_10[0x3];
+    u8 src_offset[0x5];
+    u8 reserved_at_18[0x3];
+    u8 length[0x5]; /* a single length field; the destination dword has no length of its own */
+
+    u8 reserved_at_20[0x4];
+    u8 dst_field[0xc];
+    u8 reserved_at_30[0x3];
+    u8 dst_offset[0x5];
+    u8 reserved_at_38[0x8];
+};
+
+enum { /* Modify-header action type codes; presumably the values for the 4-bit action_type field of the set/add/copy *_action_in layouts -- TODO confirm. */
+    MLX5_ACTION_TYPE_SET = 0x1,
+    MLX5_ACTION_TYPE_ADD = 0x2,
+    MLX5_ACTION_TYPE_COPY = 0x3,
+};
+
+enum { /* Modify-header field identifiers; presumably the values for the 0xc-bit field / src_field / dst_field members of the *_action_in layouts -- TODO confirm. Values are not contiguous. */
+    MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16 = 0x1,
+    MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0 = 0x2,
+    MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE = 0x3,
+    MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16 = 0x4,
+    MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0 = 0x5,
+    MLX5_ACTION_IN_FIELD_OUT_IP_DSCP = 0x6,
+    MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS = 0x7,
+    MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT = 0x8,
+    MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT = 0x9,
+    MLX5_ACTION_IN_FIELD_OUT_IP_TTL = 0xa,
+    MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT = 0xb,
+    MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT = 0xc,
+    MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96 = 0xd,
+    MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64 = 0xe,
+    MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32 = 0xf,
+    MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0 = 0x10,
+    MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96 = 0x11,
+    MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64 = 0x12,
+    MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32 = 0x13,
+    MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0 = 0x14,
+    MLX5_ACTION_IN_FIELD_OUT_SIPV4 = 0x15,
+    MLX5_ACTION_IN_FIELD_OUT_DIPV4 = 0x16,
+    MLX5_ACTION_IN_FIELD_OUT_FIRST_VID = 0x17,
+    MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT = 0x47, /* gap: values 0x18..0x46 are not defined in this enum */
+    MLX5_ACTION_IN_FIELD_OUT_METADATA_REGA = 0x49,
+    MLX5_ACTION_IN_FIELD_OUT_METADATA_REGB = 0x50, /* hexadecimal 0x50, i.e. not adjacent to REGA's 0x49 */
+    MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_0 = 0x51,
+    MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_1 = 0x52,
+    MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_2 = 0x53,
+    MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_3 = 0x54,
+    MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_4 = 0x55,
+    MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_5 = 0x56,
+    MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM = 0x59,
+    MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM = 0x5B, /* upper-case hex digits here and below; same values as their lower-case spelling */
+    MLX5_ACTION_IN_FIELD_OUT_GTPU_TEID = 0x6E,
+};
+
+struct mlx5_ifc_dctc_bits { /* Only data_in_order (offset 0x1d) is named; widths sum to 0x380. */
+    u8 reserved_at_0[0x1d];
+    u8 data_in_order[0x1];
+    u8 reserved_at_1e[0x362];
+};
+
+struct mlx5_ifc_packet_reformat_context_in_bits { /* 0x40 of fixed fields, then a zero-length more_reformat_data tail. */
+    u8 reserved_at_0[0x5];
+    u8 reformat_type[0x3];
+    u8 reserved_at_8[0xe];
+    u8 reformat_data_size[0xa];
+
+    u8 reserved_at_20[0x10];
+    u8 reformat_data[2][0x8]; /* 2 entries of width 0x8 inside the fixed part */
+
+    u8 more_reformat_data[0][0x8]; /* zero-length array: contributes nothing to sizeof */
+};
+
+struct mlx5_ifc_alloc_packet_reformat_context_in_bits { /* 0xe0 of opcode/op_mod/reserved fields, then the embedded reformat context. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0xa0];
+
+    struct mlx5_ifc_packet_reformat_context_in_bits packet_reformat_context;
+};
+
+struct mlx5_ifc_alloc_packet_reformat_context_out_bits { /* status/syndrome plus packet_reformat_id; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 packet_reformat_id[0x20];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_dealloc_packet_reformat_context_in_bits { /* opcode/op_mod plus packet_reformat_id; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_20[0x10]; /* NOTE(review): named reserved_20, not reserved_at_20 as in sibling layouts */
+    u8 op_mod[0x10];
+
+    u8 packet_reformat_id[0x20];
+
+    u8 reserved_60[0x20]; /* NOTE(review): named reserved_60, not reserved_at_60 as in sibling layouts */
+};
+
+struct mlx5_ifc_dealloc_packet_reformat_context_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+enum reformat_type { /* Named codes 0x0-0x4; presumably values for the 3-wide reformat_type field - confirm. */
+    MLX5_REFORMAT_TYPE_L2_TO_VXLAN = 0x0,
+    MLX5_REFORMAT_TYPE_L2_TO_NVGRE = 0x1,
+    MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x2,
+    MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3,
+    MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4,
+};
+
+struct mlx5_ifc_alloc_flow_counter_in_bits { /* opcode/uid/op_mod only; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_alloc_flow_counter_out_bits { /* status/syndrome plus flow_counter_id; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 flow_counter_id[0x20];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_dealloc_flow_counter_in_bits { /* opcode plus flow_counter_id (no uid or op_mod member here); widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x20];
+
+    u8 flow_counter_id[0x20];
+
+    u8 reserved_at_60[0x20];
+};
+
+enum { /* Sparse object-type codes; presumably values for general_obj_in_cmd_hdr.obj_type - confirm. */
+    MLX5_OBJ_TYPE_FLOW_METER = 0x000a,
+    MLX5_OBJ_TYPE_DEK = 0x000C,
+    MLX5_OBJ_TYPE_MATCH_DEFINER = 0x0018,
+    MLX5_OBJ_TYPE_CRYPTO_LOGIN = 0x001F,
+    MLX5_OBJ_TYPE_FLOW_SAMPLER = 0x0020,
+    MLX5_OBJ_TYPE_ASO_FLOW_METER = 0x0024,
+    MLX5_OBJ_TYPE_ASO_FIRST_HIT = 0x0025,
+    MLX5_OBJ_TYPE_SCHEDULING_ELEMENT = 0x0026,
+    MLX5_OBJ_TYPE_RESERVED_QPN = 0x002C,
+    MLX5_OBJ_TYPE_ASO_CT = 0x0031,
+    MLX5_OBJ_TYPE_AV_QP_MAPPING = 0x003A,
+};
+
+struct mlx5_ifc_general_obj_in_cmd_hdr_bits { /* Embedded as the leading hdr member of the create_*_in object layouts in this file; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 obj_type[0x10];
+
+    u8 obj_id[0x20];
+
+    u8 reserved_at_60[0x3];
+    u8 log_obj_range[0x5];
+    u8 reserved_at_68[0x18];
+};
+
+struct mlx5_ifc_general_obj_out_cmd_hdr_bits { /* Embedded as the leading hdr member of the query_*_out object layouts in this file; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 obj_id[0x20];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_flow_meter_bits { /* Object body used by create_flow_meter_in / query_flow_meter_out; widths sum to 0x400. */
+    u8 modify_field_select[0x40];
+
+    u8 active[0x1];
+    u8 reserved_at_41[0x3];
+    u8 return_reg_id[0x4];
+    u8 table_type[0x8];
+    u8 reserved_at_50[0x10];
+
+    u8 reserved_at_60[0x8];
+    u8 destination_table_id[0x18];
+
+    u8 reserved_at_80[0x80];
+
+    u8 flow_meter_params[0x100];
+
+    u8 reserved_at_180[0x180]; /* NOTE(review): preceding widths sum to 0x200, so the name's 0x180 does not match its position */
+
+    u8 sw_steering_icm_address_rx[0x40];
+    u8 sw_steering_icm_address_tx[0x40];
+};
+
+struct mlx5_ifc_create_flow_meter_in_bits { /* General-object input header followed by the flow_meter body. */
+    struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+    struct mlx5_ifc_flow_meter_bits meter;
+};
+
+struct mlx5_ifc_query_flow_meter_out_bits { /* General-object output header followed by the flow_meter body. */
+    struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr;
+    struct mlx5_ifc_flow_meter_bits obj;
+};
+
+struct mlx5_ifc_flow_sampler_bits { /* Object body used by create_flow_sampler_in / query_flow_sampler_out; widths sum to 0x140. */
+    u8 modify_field_select[0x40];
+
+    u8 table_type[0x8];
+    u8 level[0x8];
+    u8 reserved_at_50[0xf];
+    u8 ignore_flow_level[0x1];
+
+    u8 sample_ratio[0x20];
+
+    u8 reserved_at_80[0x8];
+    u8 sample_table_id[0x18];
+
+    u8 reserved_at_a0[0x8];
+    u8 default_table_id[0x18];
+
+    u8 sw_steering_icm_address_rx[0x40];
+    u8 sw_steering_icm_address_tx[0x40];
+};
+
+struct mlx5_ifc_create_flow_sampler_in_bits { /* General-object input header followed by the flow_sampler body. */
+    struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+    struct mlx5_ifc_flow_sampler_bits sampler;
+};
+
+struct mlx5_ifc_query_flow_sampler_out_bits { /* General-object output header followed by the flow_sampler body. */
+    struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr;
+    struct mlx5_ifc_flow_sampler_bits obj;
+};
+
+struct mlx5_ifc_definer_bits { /* Object body used by create_definer_in; widths sum to 0x400. */
+    u8 modify_field_select[0x40];
+
+    u8 reserved_at_40[0x40];
+
+    u8 reserved_at_80[0x10];
+    u8 format_id[0x10];
+
+    u8 reserved_at_60[0x160]; /* NOTE(review): preceding widths sum to 0xa0, so the name's 0x60 does not match its position */
+
+    u8 ctrl[0xA0];
+    u8 match_mask_dw_11_8[0x60];
+    u8 match_mask_dw_7_0[0x100];
+};
+
+struct mlx5_ifc_create_definer_in_bits { /* General-object input header followed by the definer body. */
+    struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+    struct mlx5_ifc_definer_bits definer;
+};
+
+struct mlx5_ifc_esw_vport_context_bits { /* Context embedded in query_esw_vport_context_out; widths sum to 0x800. */
+    u8 reserved_at_0[0x3];
+    u8 vport_svlan_strip[0x1];
+    u8 vport_cvlan_strip[0x1];
+    u8 vport_svlan_insert[0x1];
+    u8 vport_cvlan_insert[0x2];
+    u8 reserved_at_8[0x18];
+
+    u8 reserved_at_20[0x20];
+
+    u8 svlan_cfi[0x1];
+    u8 svlan_pcp[0x3];
+    u8 svlan_id[0xc];
+    u8 cvlan_cfi[0x1];
+    u8 cvlan_pcp[0x3];
+    u8 cvlan_id[0xc];
+
+    u8 reserved_at_40[0x720]; /* NOTE(review): preceding widths sum to 0x60, so the name's 0x40 does not match its position */
+    u8 sw_steering_vport_icm_address_rx[0x40];
+    u8 sw_steering_vport_icm_address_tx[0x40];
+};
+
+struct mlx5_ifc_query_esw_vport_context_out_bits { /* 0x80 of status/syndrome/reserved, then the embedded esw_vport_context. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+
+    struct mlx5_ifc_esw_vport_context_bits esw_vport_context;
+};
+
+struct mlx5_ifc_query_esw_vport_context_in_bits { /* opcode/op_mod plus other_vport flag and vport_number; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 other_vport[0x1];
+    u8 reserved_at_41[0xf];
+    u8 vport_number[0x10];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_nic_vport_context_bits { /* Only roce_en (offset 0x1f) is named; widths sum to 0x800. */
+    u8 reserved_at_0[0x1f];
+    u8 roce_en[0x1];
+
+    u8 reserved_at_20[0x7e0];
+};
+
+struct mlx5_ifc_query_nic_vport_context_out_bits { /* 0x80 of status/syndrome/reserved, then the embedded nic_vport_context. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+
+    struct mlx5_ifc_nic_vport_context_bits nic_vport_context;
+};
+
+struct mlx5_ifc_query_nic_vport_context_in_bits { /* opcode/op_mod only (no vport selector members); widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+};
+
+enum { /* Single named code; presumably a value for qpc.st - confirm. */
+    MLX5_QPC_ST_RC = 0x0,
+};
+
+enum { /* Single named code; presumably a value for qpc.pm_state - confirm. */
+    MLX5_QPC_PM_STATE_MIGRATED = 0x3,
+};
+
+struct mlx5_ifc_ud_av_bits { /* Embedded in av_qp_mapping as remote_address_vector; widths sum to 0x180. */
+    u8 reserved_at_0[0x60];
+
+    u8 reserved_at_60[0x4];
+    u8 sl_or_eth_prio[0x4];
+    u8 reserved_at_68[0x18];
+
+    u8 reserved_at_80[0x60];
+
+    u8 reserved_at_e0[0x4];
+    u8 src_addr_index[0x8];
+    u8 reserved_at_ec[0x14];
+
+    u8 rgid_or_rip[16][0x8]; /* 16 entries of width 0x8 */
+};
+
+struct mlx5_ifc_ads_bits { /* Embedded twice in qpc (primary/secondary address path); widths sum to 0x160. */
+    u8 fl[0x1];
+    u8 free_ar[0x1];
+    u8 reserved_at_2[0xe];
+    u8 pkey_index[0x10];
+
+    u8 reserved_at_20[0x8];
+    u8 grh[0x1];
+    u8 mlid[0x7];
+    u8 rlid[0x10];
+
+    u8 ack_timeout[0x5];
+    u8 reserved_at_45[0x3];
+    u8 src_addr_index[0x8];
+    u8 reserved_at_50[0x4];
+    u8 stat_rate[0x4];
+    u8 hop_limit[0x8];
+
+    u8 reserved_at_60[0x4];
+    u8 tclass[0x8];
+    u8 flow_label[0x14];
+
+    u8 rgid_rip[16][0x8]; /* 16 entries of width 0x8 */
+
+    u8 reserved_at_100[0x4];
+    u8 f_dscp[0x1];
+    u8 f_ecn[0x1];
+    u8 reserved_at_106[0x1];
+    u8 f_eth_prio[0x1];
+    u8 ecn[0x2];
+    u8 dscp[0x6];
+    u8 udp_sport[0x10];
+
+    u8 dei_cfi[0x1];
+    u8 eth_prio[0x3];
+    u8 sl[0x4];
+    u8 vhca_port_num[0x8];
+    u8 rmac_47_32[0x10];
+
+    u8 rmac_31_0[0x20];
+};
+
+enum { /* Three named codes 0x0-0x2; presumably values for the 2-wide qpc.ts_format field - confirm. */
+    MLX5_QPC_TIMESTAMP_FORMAT_FREE_RUNNING = 0x0,
+    MLX5_QPC_TIMESTAMP_FORMAT_DEFAULT = 0x1,
+    MLX5_QPC_TIMESTAMP_FORMAT_REAL_TIME = 0x2,
+};
+
+struct mlx5_ifc_qpc_bits { /* Context embedded as qpc in the *_qp_in / query_qp_out layouts; widths sum to 0x740 including both ads members. */
+    u8 state[0x4];
+    u8 lag_tx_port_affinity[0x4];
+    u8 st[0x8];
+    u8 reserved_at_10[0x2];
+    u8 isolate_vl_tc[0x1];
+    u8 pm_state[0x2];
+    u8 reserved_at_15[0x1];
+    u8 req_e2e_credit_mode[0x2];
+    u8 offload_type[0x4];
+    u8 end_padding_mode[0x2];
+    u8 reserved_at_1e[0x2];
+
+    u8 wq_signature[0x1];
+    u8 block_lb_mc[0x1];
+    u8 atomic_like_write_en[0x1];
+    u8 latency_sensitive[0x1];
+    u8 reserved_at_24[0x1];
+    u8 drain_sigerr[0x1];
+    u8 reserved_at_26[0x2];
+    u8 pd[0x18];
+
+    u8 mtu[0x3];
+    u8 log_msg_max[0x5];
+    u8 reserved_at_48[0x1];
+    u8 log_rq_size[0x4];
+    u8 log_rq_stride[0x3];
+    u8 no_sq[0x1];
+    u8 log_sq_size[0x4];
+    u8 reserved_at_55[0x3];
+    u8 ts_format[0x2];
+    u8 data_in_order[0x1];
+    u8 rlky[0x1];
+    u8 ulp_stateless_offload_mode[0x4];
+
+    u8 counter_set_id[0x8];
+    u8 uar_page[0x18];
+
+    u8 reserved_at_80[0x8];
+    u8 user_index[0x18];
+
+    u8 reserved_at_a0[0x3];
+    u8 log_page_size[0x5];
+    u8 remote_qpn[0x18];
+
+    struct mlx5_ifc_ads_bits primary_address_path; /* starts at 0xc0; each ads layout is 0x160 wide */
+
+    struct mlx5_ifc_ads_bits secondary_address_path; /* ends at 0x380, matching the reserved_at_384 name below */
+
+    u8 log_ack_req_freq[0x4];
+    u8 reserved_at_384[0x4];
+    u8 log_sra_max[0x3];
+    u8 reserved_at_38b[0x2];
+    u8 retry_count[0x3];
+    u8 rnr_retry[0x3];
+    u8 reserved_at_393[0x1];
+    u8 fre[0x1];
+    u8 cur_rnr_retry[0x3];
+    u8 cur_retry_count[0x3];
+    u8 reserved_at_39b[0x5];
+
+    u8 reserved_at_3a0[0x20];
+
+    u8 reserved_at_3c0[0x8];
+    u8 next_send_psn[0x18];
+
+    u8 reserved_at_3e0[0x8];
+    u8 cqn_snd[0x18];
+
+    u8 reserved_at_400[0x8];
+    u8 deth_sqpn[0x18];
+
+    u8 reserved_at_420[0x20];
+
+    u8 reserved_at_440[0x8];
+    u8 last_acked_psn[0x18];
+
+    u8 reserved_at_460[0x8];
+    u8 ssn[0x18];
+
+    u8 reserved_at_480[0x8];
+    u8 log_rra_max[0x3];
+    u8 reserved_at_48b[0x1];
+    u8 atomic_mode[0x4];
+    u8 rre[0x1];
+    u8 rwe[0x1];
+    u8 rae[0x1];
+    u8 reserved_at_493[0x1];
+    u8 page_offset[0x6];
+    u8 reserved_at_49a[0x3];
+    u8 cd_slave_receive[0x1];
+    u8 cd_slave_send[0x1];
+    u8 cd_master[0x1];
+
+    u8 reserved_at_4a0[0x3];
+    u8 min_rnr_nak[0x5];
+    u8 next_rcv_psn[0x18];
+
+    u8 reserved_at_4c0[0x8];
+    u8 xrcd[0x18];
+
+    u8 reserved_at_4e0[0x8];
+    u8 cqn_rcv[0x18];
+
+    u8 dbr_addr[0x40];
+
+    u8 q_key[0x20];
+
+    u8 reserved_at_560[0x5];
+    u8 rq_type[0x3];
+    u8 srqn_rmpn_xrqn[0x18];
+
+    u8 reserved_at_580[0x8];
+    u8 rmsn[0x18];
+
+    u8 hw_sq_wqebb_counter[0x10];
+    u8 sw_sq_wqebb_counter[0x10];
+
+    u8 hw_rq_counter[0x20];
+
+    u8 sw_rq_counter[0x20];
+
+    u8 reserved_at_600[0x20];
+
+    u8 reserved_at_620[0xf];
+    u8 cgs[0x1];
+    u8 cs_req[0x8];
+    u8 cs_res[0x8];
+
+    u8 dc_access_key[0x40];
+
+    u8 reserved_at_680[0x3];
+    u8 dbr_umem_valid[0x1];
+
+    u8 reserved_at_684[0x9c];
+
+    u8 dbr_umem_id[0x20];
+};
+
+struct mlx5_ifc_qpc_ext_bits { /* Embedded as qpc_data_ext in init2init_qp_in / rts2rts_qp_in; widths sum to 0x600. */
+    u8 reserved_at_0[0x2];
+    u8 mmo[0x1];
+    u8 reserved_at_3[0xd];
+    u8 dci_stream_channel_id[0x10];
+
+    u8 qos_queue_group_id_requester[0x20];
+
+    u8 qos_queue_group_id_responder[0x20];
+
+    u8 reserved_at_60[0x5a0];
+};
+
+struct mlx5_ifc_create_tir_out_bits { /* status/syndrome/tirn, with icm_address split over three members (63_40, 39_32, 31_0); widths sum to 0x80. */
+    u8 status[0x8];
+    u8 icm_address_63_40[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 icm_address_39_32[0x8];
+    u8 tirn[0x18];
+
+    u8 icm_address_31_0[0x20];
+};
+
+struct mlx5_ifc_destroy_tir_in_bits { /* opcode/uid plus tirn; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20];
+
+    u8 reserved_at_40[0x8];
+    u8 tirn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_qp_out_bits { /* status/syndrome plus qpn; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x8];
+    u8 qpn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_qp_in_bits { /* 0xc0 of header fields, embedded qpc (to 0x800), umem fields (to 0x880), then a zero-length pas tail. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+
+    u8 opt_param_mask[0x20];
+
+    u8 reserved_at_a0[0x20];
+
+    struct mlx5_ifc_qpc_bits qpc;
+
+    u8 reserved_at_800[0x40];
+
+    u8 wq_umem_id[0x20];
+
+    u8 wq_umem_valid[0x1];
+    u8 reserved_at_861[0x1f];
+
+    u8 pas[0][0x40]; /* zero-length array: contributes nothing to sizeof */
+};
+
+struct mlx5_ifc_destroy_qp_in_bits { /* opcode/uid plus qpn; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20];
+
+    u8 reserved_at_40[0x8];
+    u8 qpn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+enum mlx5_qpc_opt_mask_32 { /* Single-bit flags (bits 0-2); presumably for opt_param_mask_95_32 - confirm. */
+    MLX5_QPC_OPT_MASK_32_DCI_STREAM_CHANNEL_ID = 1 << 0,
+    MLX5_QPC_OPT_MASK_32_QOS_QUEUE_GROUP_ID = 1 << 1,
+    MLX5_QPC_OPT_MASK_32_UDP_SPORT = 1 << 2,
+};
+
+enum mlx5_qpc_opt_mask { /* Single-bit flags (bits 11, 15, 25); presumably for the 0x20-wide opt_param_mask - confirm. */
+    MLX5_QPC_OPT_MASK_INIT2INIT_DRAIN_SIGERR = 1 << 11,
+    MLX5_QPC_OPT_MASK_RTS2RTS_LAG_TX_PORT_AFFINITY = 1 << 15,
+    MLX5_QPC_OPT_MASK_INIT2INIT_MMO = 1 << 25,
+};
+
+struct mlx5_ifc_init2init_qp_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_init2init_qp_in_bits { /* 0xc0 header (with a 1-wide qpc_ext flag before qpn), embedded qpc, then opt_param_mask_95_32 and an embedded qpc_ext layout. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 qpc_ext[0x1];
+    u8 reserved_at_41[0x7];
+    u8 qpn[0x18];
+
+    u8 reserved_at_60[0x20];
+
+    u8 opt_param_mask[0x20];
+
+    u8 reserved_at_a0[0x20];
+
+    struct mlx5_ifc_qpc_bits qpc;
+
+    u8 reserved_at_800[0x40];
+
+    u8 opt_param_mask_95_32[0x40];
+
+    struct mlx5_ifc_qpc_ext_bits qpc_data_ext;
+};
+
+struct mlx5_ifc_init2rtr_qp_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_init2rtr_qp_in_bits { /* 0xc0 header (opcode/uid/op_mod/qpn/opt_param_mask), embedded qpc, then 0x80 reserved; 0x880 in total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 qpn[0x18];
+
+    u8 reserved_at_60[0x20];
+
+    u8 opt_param_mask[0x20];
+
+    u8 reserved_at_a0[0x20];
+
+    struct mlx5_ifc_qpc_bits qpc;
+
+    u8 reserved_at_800[0x80];
+};
+
+struct mlx5_ifc_rtr2rts_qp_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_rtr2rts_qp_in_bits { /* Member-for-member the same shape as init2rtr_qp_in: 0xc0 header, embedded qpc, 0x80 reserved. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 qpn[0x18];
+
+    u8 reserved_at_60[0x20];
+
+    u8 opt_param_mask[0x20];
+
+    u8 reserved_at_a0[0x20];
+
+    struct mlx5_ifc_qpc_bits qpc;
+
+    u8 reserved_at_800[0x80];
+};
+
+struct mlx5_ifc_rst2init_qp_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_rst2init_qp_in_bits { /* Member-for-member the same shape as init2rtr_qp_in: 0xc0 header, embedded qpc, 0x80 reserved. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 qpn[0x18];
+
+    u8 reserved_at_60[0x20];
+
+    u8 opt_param_mask[0x20];
+
+    u8 reserved_at_a0[0x20];
+
+    struct mlx5_ifc_qpc_bits qpc;
+
+    u8 reserved_at_800[0x80];
+};
+
+struct mlx5_ifc_rts2rts_qp_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_rts2rts_qp_in_bits { /* Member-for-member the same shape as init2init_qp_in, including the qpc_ext flag and trailing qpc_data_ext. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 qpc_ext[0x1];
+    u8 reserved_at_41[0x7];
+    u8 qpn[0x18];
+
+    u8 reserved_at_60[0x20];
+
+    u8 opt_param_mask[0x20];
+
+    u8 reserved_at_a0[0x20];
+
+    struct mlx5_ifc_qpc_bits qpc;
+
+    u8 reserved_at_800[0x40];
+
+    u8 opt_param_mask_95_32[0x40];
+
+    struct mlx5_ifc_qpc_ext_bits qpc_data_ext;
+};
+
+struct mlx5_ifc_qp_2rst_in_bits { /* opcode/uid/vhca_tunnel_id/op_mod plus qpn; widths sum to 0x80. Declared with uint8_t where neighbouring layouts use u8. */
+    uint8_t opcode[0x10];
+    uint8_t uid[0x10];
+
+    uint8_t vhca_tunnel_id[0x10];
+    uint8_t op_mod[0x10];
+
+    uint8_t reserved_at_40[0x8];
+    uint8_t qpn[0x18];
+
+    uint8_t reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_qp_2rst_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    uint8_t status[0x8];
+    uint8_t reserved_at_8[0x18];
+
+    uint8_t syndrome[0x20];
+
+    uint8_t reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_qp_2err_in_bits { /* Member-for-member the same shape as qp_2rst_in; widths sum to 0x80. */
+    uint8_t opcode[0x10];
+    uint8_t uid[0x10];
+
+    uint8_t vhca_tunnel_id[0x10];
+    uint8_t op_mod[0x10];
+
+    uint8_t reserved_at_40[0x8];
+    uint8_t qpn[0x18];
+
+    uint8_t reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_qp_2err_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    uint8_t status[0x8];
+    uint8_t reserved_at_8[0x18];
+
+    uint8_t syndrome[0x20];
+
+    uint8_t reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_qpc_extension_and_pas_list_in_bits { /* 48 x 0x20 = 0x600 of extension data (same total as qpc_ext_bits), then a zero-length pas tail. */
+    uint8_t qpc_data_extension[48][0x20];
+
+    uint8_t pas[0][0x40]; /* zero-length array: contributes nothing to sizeof */
+};
+
+struct mlx5_ifc_query_qp_out_bits { /* 0xc0 of status/syndrome/opt_param_mask, embedded qpc, 0x80 reserved, then a zero-length pas tail. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+
+    u8 opt_param_mask[0x20];
+
+    u8 reserved_at_a0[0x20];
+
+    struct mlx5_ifc_qpc_bits qpc;
+
+    u8 reserved_at_800[0x80];
+
+    u8 pas[0][0x40]; /* zero-length array: contributes nothing to sizeof */
+};
+
+struct mlx5_ifc_query_qp_in_bits { /* opcode/op_mod plus qpn (no uid member); widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 qpn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_query_dct_out_bits { /* 0x80 of status/syndrome/reserved, then the embedded dctc context. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+
+    struct mlx5_ifc_dctc_bits dctc;
+};
+
+struct mlx5_ifc_query_dct_in_bits { /* opcode/op_mod plus dctn; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 dctn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_tisc_bits { /* Context embedded in query_tis_out and modify_tis_in; widths sum to 0x500. */
+    u8 strict_lag_tx_port_affinity[0x1];
+    u8 tls_en[0x1];
+    u8 reserved_at_2[0x2];
+    u8 lag_tx_port_affinity[0x04];
+
+    u8 reserved_at_8[0x4];
+    u8 prio[0x4];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x100];
+
+    u8 reserved_at_120[0x8];
+    u8 transport_domain[0x18];
+
+    u8 reserved_at_140[0x8];
+    u8 underlay_qpn[0x18];
+
+    u8 reserved_at_160[0x8];
+    u8 pd[0x18];
+
+    u8 reserved_at_180[0x380];
+};
+
+struct mlx5_ifc_query_tis_out_bits { /* 0x80 of status/syndrome/reserved, then the embedded tisc context. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+
+    struct mlx5_ifc_tisc_bits tis_context;
+};
+
+struct mlx5_ifc_query_tis_in_bits { /* opcode/op_mod plus tisn; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 tisn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_lagc_bits { /* lag_state plus two tx_remap_affinity members (affinity_2 is laid out before affinity_1); widths sum to 0x40. */
+    u8 reserved_at_0[0x1d];
+    u8 lag_state[0x3];
+
+    u8 reserved_at_20[0x14];
+    u8 tx_remap_affinity_2[0x4];
+    u8 reserved_at_38[0x4];
+    u8 tx_remap_affinity_1[0x4];
+};
+
+struct mlx5_ifc_query_lag_out_bits { /* 0x40 of status/syndrome followed directly by the lagc context (no reserved gap in between). */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    struct mlx5_ifc_lagc_bits ctx;
+};
+
+struct mlx5_ifc_query_lag_in_bits { /* opcode/op_mod only; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_av_qp_mapping_bits { /* 0x80 of modify_field_select/reserved/qpn, then an embedded ud_av layout. */
+    u8 modify_field_select[0x40];
+
+    u8 reserved_at_40[0x20];
+
+    u8 qpn[0x20];
+
+    struct mlx5_ifc_ud_av_bits remote_address_vector;
+};
+
+struct mlx5_ifc_create_av_qp_mapping_in_bits { /* General-object input header followed by the av_qp_mapping body. */
+    struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+    struct mlx5_ifc_av_qp_mapping_bits mapping;
+};
+
+struct mlx5_ifc_query_av_qp_mapping_out_bits { /* General-object output header followed by the av_qp_mapping body. */
+    struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr;
+    struct mlx5_ifc_av_qp_mapping_bits obj;
+};
+
+struct mlx5_ifc_modify_tis_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_modify_tis_bitmask_bits { /* Three 1-wide flags in the last positions, named after tisc members; widths sum to 0x40. */
+    u8 reserved_at_0[0x20];
+
+    u8 reserved_at_20[0x1d];
+    u8 lag_tx_port_affinity[0x1];
+    u8 strict_lag_tx_port_affinity[0x1];
+    u8 prio[0x1];
+};
+
+struct mlx5_ifc_modify_tis_in_bits { /* 0x80 header with tisn, the 0x40 bitmask, 0x40 reserved, then the embedded tisc context. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 tisn[0x18];
+
+    u8 reserved_at_60[0x20];
+
+    struct mlx5_ifc_modify_tis_bitmask_bits bitmask;
+
+    u8 reserved_at_c0[0x40];
+
+    struct mlx5_ifc_tisc_bits ctx;
+};
+
+enum roce_version { /* Two named codes, 0 and 2 (1 is not defined here); presumably values for roce_addr_layout.roce_version - confirm. */
+    MLX5_ROCE_VERSION_1 = 0,
+    MLX5_ROCE_VERSION_2 = 2,
+};
+
+struct mlx5_ifc_roce_addr_layout_bits { /* Element type of query_roce_address_out.roce_address; widths sum to 0x100. */
+    u8 source_l3_address[4][0x20]; /* 4 entries of width 0x20 */
+
+    u8 reserved_at_80[0x2];
+    u8 rx_allow_untagged[0x1];
+    u8 vlan_valid[0x1];
+    u8 vlan_id[0xc];
+    u8 source_mac_47_32[0x10];
+
+    u8 source_mac_31_0[0x20];
+
+    u8 reserved_at_c0[0x14];
+    u8 roce_l3_type[0x4];
+    u8 roce_version[0x8];
+
+    u8 reserved_at_e0[0x20];
+};
+
+struct mlx5_ifc_query_roce_address_out_bits { /* 0x80 of status/syndrome/roce_address_num, then a zero-length array of roce_addr_layout entries. */
+    uint8_t status[0x8];
+    uint8_t reserved_at_8[0x18];
+
+    uint8_t syndrome[0x20];
+
+    uint8_t reserved_at_40[0x20];
+
+    uint8_t roce_address_num[0x10];
+    uint8_t reserved_at_70[0x10];
+
+    struct mlx5_ifc_roce_addr_layout_bits roce_address[0]; /* zero-length array: contributes nothing to sizeof */
+};
+
+struct mlx5_ifc_query_roce_address_in_bits { /* opcode/op_mod plus roce_address_index and vhca_port_num; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 roce_address_index[0x10];
+    u8 reserved_at_50[0xc];
+    u8 vhca_port_num[0x4];
+
+    u8 reserved_at_60[0x20];
+};
+
+/* Both HW set and HW add share the same HW format with different opcodes */
+struct mlx5_ifc_dr_action_hw_set_bits { /* opcode/destination_* row followed by 0x20 of inline_data; widths sum to 0x40. */
+    u8 opcode[0x8];
+    u8 destination_field_code[0x8];
+    u8 reserved_at_10[0x2];
+    u8 destination_left_shifter[0x6];
+    u8 reserved_at_18[0x3];
+    u8 destination_length[0x5];
+
+    u8 inline_data[0x20];
+};
+
+struct mlx5_ifc_dr_action_hw_copy_bits { /* destination_* row then source_* row; widths sum to 0x40. destination_length is 0x6 wide here versus 0x5 in dr_action_hw_set. */
+    u8 opcode[0x8];
+    u8 destination_field_code[0x8];
+    u8 reserved_at_10[0x2];
+    u8 destination_left_shifter[0x6];
+    u8 reserved_at_18[0x2];
+    u8 destination_length[0x6];
+
+    u8 reserved_at_20[0x8];
+    u8 source_field_code[0x8];
+    u8 reserved_at_30[0x2];
+    u8 source_left_shifter[0x6];
+    u8 reserved_at_38[0x8];
+};
+
+struct mlx5_ifc_host_params_context_bits { /* Context embedded in query_esw_functions_out; widths sum to 0x200. */
+    u8 host_number[0x8];
+    u8 reserved_at_8[0x6];
+    u8 host_pf_vhca_id_valid[0x1];
+    u8 host_pf_disabled[0x1];
+    u8 host_num_of_vfs[0x10];
+
+    u8 host_total_vfs[0x10];
+    u8 host_pci_bus[0x10];
+
+    u8 host_pf_vhca_id[0x10];
+    u8 host_pci_device[0x10];
+
+    u8 reserved_at_60[0x10];
+    u8 host_pci_function[0x10];
+
+    u8 reserved_at_80[0x180];
+};
+
+struct mlx5_ifc_query_esw_functions_in_bits { /* opcode/op_mod only; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_query_esw_functions_out_bits { /* 0x80 header, host_params_context (to 0x280), 0x180 reserved, then a zero-length host_sf_enable tail. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+
+    struct mlx5_ifc_host_params_context_bits host_params_context;
+
+    u8 reserved_at_280[0x180];
+    u8 host_sf_enable[0][0x40]; /* zero-length array: contributes nothing to sizeof */
+};
+
+struct mlx5_ifc_create_flow_group_in_bits { /* opcode, vport selector, table_type and table_id, then 0x1f40 reserved; widths sum to 0x2000. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x20];
+
+    u8 other_vport[0x1];
+    u8 reserved_at_41[0xf];
+    u8 vport_number[0x10];
+
+    u8 reserved_at_60[0x20];
+
+    u8 table_type[0x8];
+    u8 reserved_at_88[0x18];
+
+    u8 reserved_at_a0[0x8];
+    u8 table_id[0x18];
+
+    u8 reserved_at_c0[0x1f40];
+};
+
+struct mlx5_ifc_create_flow_group_out_bits { /* status/syndrome plus group_id; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x8];
+    u8 group_id[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_flow_group_in_bits { /* Same leading 0xc0 as create_flow_group_in, then group_id (0x20 wide here, 0x18 in the create output); widths sum to 0x200. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x20];
+
+    u8 other_vport[0x1];
+    u8 reserved_at_41[0xf];
+    u8 vport_number[0x10];
+
+    u8 reserved_at_60[0x20];
+
+    u8 table_type[0x8];
+    u8 reserved_at_88[0x18];
+
+    u8 reserved_at_a0[0x8];
+    u8 table_id[0x18];
+
+    u8 group_id[0x20];
+
+    u8 reserved_at_e0[0x120];
+};
+
+struct mlx5_ifc_dest_format_bits { /* destination_type/destination_id row, then a row holding the packet_reformat flag; widths sum to 0x40. */
+    u8 destination_type[0x8];
+    u8 destination_id[0x18];
+
+    u8 reserved_at_20[0x1];
+    u8 packet_reformat[0x1];
+    u8 reserved_at_22[0x1e];
+};
+
+struct mlx5_ifc_extended_dest_format_bits { /* A dest_format entry (0x40) extended with packet_reformat_id; widths sum to 0x80. */
+    struct mlx5_ifc_dest_format_bits destination_entry;
+
+    u8 packet_reformat_id[0x20];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_flow_counter_list_bits { /* flow_counter_id plus 0x20 reserved; widths sum to 0x40. */
+    u8 flow_counter_id[0x20];
+
+    u8 reserved_at_20[0x20];
+};
+
+union mlx5_ifc_dest_format_flow_counter_list_auto_bits { /* Overlay of two 0x40-wide entry formats; the reserved member is also 0x40 wide. */
+    struct mlx5_ifc_dest_format_bits dest_format;
+    struct mlx5_ifc_flow_counter_list_bits flow_counter_list;
+    u8 reserved_at_0[0x40];
+};
+
+struct mlx5_ifc_flow_context_bits { /* 0x1800 of fixed fields (mostly reserved), then a zero-length destination array. */
+    u8 reserved_at_00[0x20];
+
+    u8 group_id[0x20];
+
+    u8 reserved_at_40[0x8];
+    u8 flow_tag[0x18];
+
+    u8 reserved_at_60[0x10];
+    u8 action[0x10];
+
+    u8 extended_destination[0x1];
+    u8 reserved_at_81[0x7];
+    u8 destination_list_size[0x18];
+
+    u8 reserved_at_a0[0x8];
+    u8 flow_counter_list_size[0x18];
+
+    u8 reserved_at_c0[0x1740];
+
+    union mlx5_ifc_dest_format_flow_counter_list_auto_bits destination[0]; /* zero-length array: contributes nothing to sizeof */
+};
+
+struct mlx5_ifc_set_fte_in_bits { /* 0x200 of opcode/vport/table/flow_index fields, then the embedded flow_context. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 other_vport[0x1];
+    u8 reserved_at_41[0xf];
+    u8 vport_number[0x10];
+
+    u8 reserved_at_60[0x20];
+
+    u8 table_type[0x8];
+    u8 reserved_at_88[0x18];
+
+    u8 reserved_at_a0[0x8];
+    u8 table_id[0x18];
+
+    u8 reserved_at_c0[0x40];
+    u8 flow_index[0x20];
+
+    u8 reserved_at_120[0xe0];
+    struct mlx5_ifc_flow_context_bits flow_context;
+};
+
+struct mlx5_ifc_set_fte_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+enum dr_devx_flow_dest_type { /* Codes 0x0-0x2 plus COUNTER = 0x100; NOTE(review): 0x100 does not fit the 0x8-wide dest_format.destination_type, so it is presumably software-only - confirm. */
+    MLX5_FLOW_DEST_TYPE_VPORT = 0x0,
+    MLX5_FLOW_DEST_TYPE_FT = 0x1,
+    MLX5_FLOW_DEST_TYPE_TIR = 0x2,
+
+    MLX5_FLOW_DEST_TYPE_COUNTER = 0x100,
+};
+
+enum { /* Single-bit values 0x4 and 0x8; presumably flags for flow_context.action - confirm. */
+    MLX5_FLOW_CONTEXT_ACTION_FWD_DEST = 0x4,
+    MLX5_FLOW_CONTEXT_ACTION_COUNT = 0x8,
+};
+
+enum { /* Constant 64; presumably the unit of qpc.page_offset - confirm. */
+    MLX5_QPC_PAGE_OFFSET_QUANTA = 64,
+};
+
+enum { /* Per-object counts for three ASO object kinds (512, 2, 1). */
+    MLX5_ASO_FIRST_HIT_NUM_PER_OBJ = 512,
+    MLX5_ASO_FLOW_METER_NUM_PER_OBJ = 2,
+    MLX5_ASO_CT_NUM_PER_OBJ = 1,
+};
+
+enum mlx5_sched_hierarchy_type { /* Single named code; presumably a value for sched_elem.scheduling_hierarchy - confirm. */
+    MLX5_SCHED_HIERARCHY_NIC = 3,
+};
+
+enum mlx5_sched_elem_type { /* Codes 0x0-0x4; presumably values for sched_context.element_type - confirm. */
+    MLX5_SCHED_ELEM_TYPE_TSAR = 0x0,
+    MLX5_SCHED_ELEM_TYPE_VPORT = 0x1,
+    MLX5_SCHED_ELEM_TYPE_VPORT_TC = 0x2,
+    MLX5_SCHED_ELEM_TYPE_PARA_VPORT_TC = 0x3,
+    MLX5_SCHED_ELEM_TYPE_QUEUE_GROUP = 0x4,
+};
+
+enum mlx5_sched_tsar_type { /* Codes 0x0-0x2; presumably values for sched_elem_attr_tsar.tsar_type - confirm. */
+    MLX5_SCHED_TSAR_TYPE_DWRR = 0x0,
+    MLX5_SCHED_TSAR_TYPE_ROUND_ROBIN = 0x1,
+    MLX5_SCHED_TSAR_TYPE_ETS = 0x2,
+};
+
+struct mlx5_ifc_sched_elem_attr_tsar_bits { /* tsar_type between two reserved runs; widths sum to 0x20. */
+    u8 reserved_at_0[0x8];
+    u8 tsar_type[0x8];
+    u8 reserved_at_10[0x10];
+};
+
+union mlx5_ifc_sched_elem_attr_bits { /* Union with a single member (tsar), so it is 0x20 wide. */
+    struct mlx5_ifc_sched_elem_attr_tsar_bits tsar;
+};
+
+struct mlx5_ifc_sched_context_bits { /* Context embedded in sched_elem; widths sum to 0x200 including the 0x20-wide sched_elem_attr union. */
+    u8 element_type[0x8];
+    u8 reserved_at_8[0x18];
+
+    union mlx5_ifc_sched_elem_attr_bits sched_elem_attr;
+
+    u8 parent_element_id[0x20];
+
+    u8 reserved_at_60[0x40];
+
+    u8 bw_share[0x20];
+
+    u8 max_average_bw[0x20];
+
+    u8 reserved_at_e0[0x120];
+};
+
+struct mlx5_ifc_sched_elem_bits { /* 0x100 of leading fields, the sched_context (to 0x300), then 0x100 reserved; 0x400 in total. */
+    u8 modify_field_select[0x40];
+
+    u8 scheduling_hierarchy[0x8];
+    u8 reserved_at_48[0x18];
+
+    u8 reserved_at_60[0xa0];
+
+    struct mlx5_ifc_sched_context_bits sched_context;
+
+    u8 reserved_at_300[0x100];
+};
+
+struct mlx5_ifc_create_sched_elem_in_bits { /* General-object input header followed by the sched_elem body. */
+    struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+    struct mlx5_ifc_sched_elem_bits sched_elem;
+};
+
+struct mlx5_ifc_create_modify_elem_in_bits { /* Same two members as create_sched_elem_in; only the struct name differs. */
+    struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+    struct mlx5_ifc_sched_elem_bits sched_elem;
+};
+
+enum { /* Single named code; presumably a value for sqc.state / modify_sq_in.sq_state - confirm. */
+    MLX5_SQC_STATE_RDY = 0x1,
+};
+
+struct mlx5_ifc_sqc_bits { /* Only state and qos_queue_group_id are named; widths sum to 0x780. */
+    u8 reserved_at_0[0x8];
+    u8 state[0x4];
+    u8 reserved_at_c[0x14];
+
+    u8 reserved_at_20[0xe0];
+
+    u8 reserved_at_100[0x10];
+    u8 qos_queue_group_id[0x10];
+
+    u8 reserved_at_120[0x660];
+};
+
+enum { /* Single flag at bit 2; presumably for modify_sq_in.modify_bitmask - confirm. */
+    MLX5_MODIFY_SQ_BITMASK_QOS_QUEUE_GROUP_ID = 1 << 2,
+};
+
+struct mlx5_ifc_modify_sq_out_bits { /* status/syndrome only; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_modify_sq_in_bits { /* 0x80 header with sq_state/sqn, 0x40 modify_bitmask, 0x40 reserved, then the embedded sqc context. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 sq_state[0x4];
+    u8 reserved_at_44[0x4];
+    u8 sqn[0x18];
+
+    u8 reserved_at_60[0x20];
+
+    u8 modify_bitmask[0x40];
+
+    u8 reserved_at_c0[0x40];
+
+    struct mlx5_ifc_sqc_bits sq_context;
+};
+
+struct mlx5_ifc_reserved_qpn_bits { /* Entirely reserved; 0x80 wide. */
+    u8 reserved_at_0[0x80];
+};
+
+struct mlx5_ifc_create_reserved_qpn_in_bits { /* General-object input header followed by the all-reserved reserved_qpn body. */
+    struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr;
+    struct mlx5_ifc_reserved_qpn_bits rqpns;
+};
+
+struct mlx5_ifc_create_psv_out_bits { /* 0x80 of status/syndrome/reserved, then four psvN_index rows; widths sum to 0x100. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+
+    u8 reserved_at_80[0x8];
+    u8 psv0_index[0x18];
+
+    u8 reserved_at_a0[0x8];
+    u8 psv1_index[0x18];
+
+    u8 reserved_at_c0[0x8];
+    u8 psv2_index[0x18];
+
+    u8 reserved_at_e0[0x8];
+    u8 psv3_index[0x18];
+};
+
+struct mlx5_ifc_create_psv_in_bits { /* opcode/op_mod plus num_psv and pd; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 num_psv[0x4];
+    u8 reserved_at_44[0x4];
+    u8 pd[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_psv_in_bits { /* opcode plus psvn (no op_mod member); widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x20];
+
+    u8 reserved_at_40[0x8];
+    u8 psvn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_mbox_out_bits { /* Bare status/syndrome output shape shared by many *_out layouts in this file; widths sum to 0x80. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_mbox_in_bits { /* Bare opcode/uid/op_mod input shape; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_enable_hca_in_bits { /* opcode/op_mod plus function_id; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x10];
+    u8 function_id[0x10];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_enable_hca_out_bits { /* status/syndrome plus 0x20 reserved; widths sum to 0x60. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x20]; /* NOTE(review): the other plain *_out layouts here reserve 0x40 and total 0x80 - confirm 0x20 is intended */
+};
+
+struct mlx5_ifc_query_issi_out_bits { /* status/syndrome/current_issi, a long reserved run, then supported_issi_dw0 as the last 0x20; widths sum to 0x380. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x10];
+    u8 current_issi[0x10];
+
+    u8 reserved_at_60[0xa0];
+
+    u8 reserved_at_100[76][0x8]; /* 76 entries of width 0x8 (0x260 in total) */
+    u8 supported_issi_dw0[0x20];
+};
+
+struct mlx5_ifc_query_issi_in_bits { /* opcode/op_mod only; widths sum to 0x80. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_set_issi_out_bits { /* set_issi output layout (status and syndrome only); array lengths are bit widths, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_set_issi_in_bits { /* set_issi input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x10];
+    u8 current_issi[0x10]; /* same offset and width as current_issi in query_issi_out_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_query_pages_out_bits { /* query_pages output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 embedded_cpu_function[0x01];
+    u8 reserved_bits[0x0f]; /* NOTE(review): sits at offset 0x41; manage_pages_in_bits names the same span reserved_at_41 */
+    u8 function_id[0x10];
+
+    u8 num_pages[0x20];
+};
+
+struct mlx5_ifc_query_pages_in_bits { /* query_pages input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x10]; /* no embedded_cpu_function bit is named here, unlike query_pages_out_bits */
+    u8 function_id[0x10];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_manage_pages_out_bits { /* manage_pages output layout; 0x80 bits of fixed fields followed by a variable-length pas[] tail. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 output_num_entries[0x20];
+
+    u8 reserved_at_60[0x20];
+
+    u8 pas[][0x40]; /* flexible array of 0x40-bit entries; presumably output_num_entries of them - confirm */
+};
+
+struct mlx5_ifc_manage_pages_in_bits { /* manage_pages input layout; 0x80 bits of fixed fields followed by a variable-length pas[] tail. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 embedded_cpu_function[0x1];
+    u8 reserved_at_41[0xf];
+    u8 function_id[0x10];
+
+    u8 input_num_entries[0x20];
+
+    u8 pas[][0x40]; /* flexible array of 0x40-bit entries; presumably input_num_entries of them - confirm */
+};
+
+enum { /* presumably a value of the 1-bit `state` field in mlx5_ifc_teardown_hca_out_bits - confirm */
+    MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL = 0x1,
+};
+
+struct mlx5_ifc_teardown_hca_out_bits { /* teardown_hca output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x3f];
+
+    u8 state[0x1]; /* single bit at offset 0x7f, the last bit of the layout */
+};
+
+enum { /* presumably values of the `profile` field in mlx5_ifc_teardown_hca_in_bits - confirm; 0x1 is not defined here */
+    MLX5_TEARDOWN_HCA_IN_PROFILE_GRACEFUL_CLOSE = 0x0,
+    MLX5_TEARDOWN_HCA_IN_PROFILE_PREPARE_FAST_TEARDOWN = 0x2,
+};
+
+struct mlx5_ifc_teardown_hca_in_bits { /* teardown_hca input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x10];
+    u8 profile[0x10];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_init_hca_out_bits { /* init_hca output layout (status and syndrome only); array lengths are bit widths, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_init_hca_in_bits { /* init_hca input layout (opcode and op_mod only); array lengths are bit widths, 0x80 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_access_register_out_bits { /* access_register output layout; 0x80 bits of fixed fields followed by a variable-length register_data[] tail. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+
+    u8 register_data[][0x20]; /* flexible array of 0x20-bit words */
+};
+
+struct mlx5_ifc_access_register_in_bits { /* access_register input layout; 0x80 bits of fixed fields followed by a variable-length register_data[] tail. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x10];
+    u8 register_id[0x10];
+
+    u8 argument[0x20];
+
+    u8 register_data[][0x20]; /* flexible array of 0x20-bit words */
+};
+
+struct mlx5_ifc_modify_nic_vport_context_out_bits { /* modify_nic_vport_context output layout (status and syndrome only); 0x80 bits total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_modify_nic_vport_field_select_bits { /* One 0x20-bit word of 1-bit selectors; embedded as field_select in modify_nic_vport_context_in_bits. */
+    u8 reserved_at_0[0x12];
+    u8 affiliation[0x1];
+    u8 reserved_at_13[0x1];
+    u8 disable_uc_local_lb[0x1];
+    u8 disable_mc_local_lb[0x1];
+    u8 node_guid[0x1];
+    u8 port_guid[0x1];
+    u8 min_inline[0x1];
+    u8 mtu[0x1];
+    u8 change_event[0x1];
+    u8 promisc[0x1];
+    u8 permanent_address[0x1];
+    u8 addresses_list[0x1];
+    u8 roce_en[0x1];
+    u8 reserved_at_1f[0x1];
+};
+
+struct mlx5_ifc_modify_nic_vport_context_in_bits { /* modify_nic_vport_context input layout; 0x800 bits precede the embedded nic_vport_context. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 other_vport[0x1];
+    u8 reserved_at_41[0xf];
+    u8 vport_number[0x10];
+
+    struct mlx5_ifc_modify_nic_vport_field_select_bits field_select; /* 0x20-bit selector word at offset 0x60 */
+
+    u8 reserved_at_80[0x780];
+
+    struct mlx5_ifc_nic_vport_context_bits nic_vport_context; /* starts at offset 0x800; type is declared elsewhere */
+};
+
+struct mlx5_ifc_set_hca_cap_out_bits { /* set_hca_cap output layout (status and syndrome only); array lengths are bit widths, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_set_hca_cap_in_bits { /* set_hca_cap input layout; 0x80 bits of fixed fields followed by the capability union. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 other_function[0x1];
+    u8 reserved_at_41[0xf];
+    u8 function_id[0x10];
+
+    u8 reserved_at_60[0x20];
+
+    union mlx5_ifc_hca_cap_union_bits capability; /* starts at offset 0x80; union is declared elsewhere */
+};
+
+struct mlx5_ifc_alloc_uar_out_bits { /* alloc_uar output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x8];
+    u8 uar[0x18]; /* same offset and width as uar in dealloc_uar_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_uar_in_bits { /* alloc_uar input layout (opcode and op_mod only); array lengths are bit widths, 0x80 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_uar_out_bits { /* dealloc_uar output layout (status and syndrome only); array lengths are bit widths, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_uar_in_bits { /* dealloc_uar input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 uar[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_eqc_bits { /* EQ context layout, embedded in create_eq_in_bits; array lengths are field widths in bits, 0x200 total. */
+    u8 status[0x4];
+    u8 reserved_at_4[0x9];
+    u8 ec[0x1];
+    u8 oi[0x1];
+    u8 reserved_at_f[0x5];
+    u8 st[0x4];
+    u8 reserved_at_18[0x8];
+
+    u8 reserved_at_20[0x20];
+
+    u8 reserved_at_40[0x14];
+    u8 page_offset[0x6];
+    u8 reserved_at_5a[0x6];
+
+    u8 reserved_at_60[0x3];
+    u8 log_eq_size[0x5]; /* presumably log2 of the EQ size, per the log_ prefix - confirm */
+    u8 uar_page[0x18];
+
+    u8 reserved_at_80[0x20];
+
+    u8 reserved_at_a0[0x18];
+    u8 intr[0x8];
+
+    u8 reserved_at_c0[0x3];
+    u8 log_page_size[0x5];
+    u8 reserved_at_c8[0x18];
+
+    u8 reserved_at_e0[0x60];
+
+    u8 reserved_at_140[0x8];
+    u8 consumer_counter[0x18];
+
+    u8 reserved_at_160[0x8];
+    u8 producer_counter[0x18];
+
+    u8 reserved_at_180[0x80];
+};
+
+struct mlx5_ifc_create_eq_out_bits { /* create_eq output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x18];
+    u8 eq_number[0x8]; /* same offset and width as eq_number in destroy_eq_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_eq_in_bits { /* create_eq input layout; 0x880 bits of fixed fields followed by a variable-length pas[] tail. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+
+    struct mlx5_ifc_eqc_bits eq_context_entry; /* 0x200-bit EQ context occupying offsets 0x80..0x280 */
+
+    u8 reserved_at_280[0x40];
+
+    u8 event_bitmask[4][0x40]; /* four 0x40-bit words, 0x100 bits in all */
+
+    u8 reserved_at_3c0[0x4c0];
+
+    u8 pas[][0x40]; /* flexible array of 0x40-bit entries starting at offset 0x880 */
+};
+
+struct mlx5_ifc_destroy_eq_out_bits { /* destroy_eq output layout (status and syndrome only); array lengths are bit widths, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_destroy_eq_in_bits { /* destroy_eq input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x18];
+    u8 eq_number[0x8];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_pd_out_bits { /* alloc_pd output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x8];
+    u8 pd[0x18]; /* same offset and width as pd in dealloc_pd_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_pd_in_bits { /* alloc_pd input layout (opcode, uid and op_mod only); array lengths are bit widths, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_pd_out_bits { /* dealloc_pd output layout (status and syndrome only); array lengths are bit widths, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_pd_in_bits { /* dealloc_pd input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 pd[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_mtt_bits { /* One MTT entry, used as the element type of umem_bits.mtt[]; array lengths are bit widths, 0x40 total. */
+    u8 ptag_63_32[0x20]; /* 32-bit part; the name suggests bits 63..32 of the tag */
+
+    u8 ptag_31_8[0x18]; /* 24-bit part; the name suggests bits 31..8 of the tag */
+    u8 reserved_at_38[0x6];
+    u8 wr_en[0x1];
+    u8 rd_en[0x1];
+};
+
+struct mlx5_ifc_umem_bits { /* umem object layout; 0x100 bits of fixed fields followed by a variable-length mtt[] tail. */
+    u8 reserved_at_0[0x80];
+
+    u8 reserved_at_80[0x1b];
+    u8 log_page_size[0x5];
+
+    u8 page_offset[0x20];
+
+    u8 num_of_mtt[0x40];
+
+    struct mlx5_ifc_mtt_bits mtt[]; /* flexible array of 0x40-bit entries; presumably num_of_mtt of them - confirm */
+};
+
+struct mlx5_ifc_create_umem_in_bits { /* create_umem input layout; 0x80 bits of header followed by the embedded umem object. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x40];
+
+    struct mlx5_ifc_umem_bits umem; /* starts at offset 0x80; ends in a flexible mtt[] array, so it must stay the last member */
+};
+
+struct mlx5_ifc_create_umem_out_bits { /* create_umem output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x8];
+    u8 umem_id[0x18]; /* same offset and width as umem_id in destroy_umem_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_umem_in_bits { /* destroy_umem input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x10];
+    u8 op_mod[0x10];
+
+    u8 reserved_at_40[0x8];
+    u8 umem_id[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_umem_out_bits { /* destroy_umem output layout (status and syndrome only); array lengths are bit widths, 0x80 total. */
+    u8 status[0x8];
+    u8 reserved_at_8[0x18];
+
+    u8 syndrome[0x20];
+
+    u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_delete_fte_in_bits { /* delete_fte input layout; array lengths are field widths in bits, 0x200 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 other_vport[0x1];
+    u8 reserved_at_41[0xf];
+    u8 vport_number[0x10];
+
+    u8 reserved_at_60[0x20];
+
+    u8 table_type[0x8];
+    u8 reserved_at_88[0x18];
+
+    u8 reserved_at_a0[0x8];
+    u8 table_id[0x18];
+
+    u8 reserved_at_c0[0x40];
+
+    u8 flow_index[0x20];
+
+    u8 reserved_at_120[0xe0];
+};
+
+struct mlx5_ifc_create_cq_out_bits { /* create_cq output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 cqn[0x18]; /* same offset and width as cqn in destroy_cq_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_cq_in_bits { /* destroy_cq input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 cqn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_transport_domain_out_bits { /* alloc_transport_domain output layout; array lengths are bit widths, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 transport_domain[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_dealloc_transport_domain_in_bits { /* dealloc_transport_domain input layout; array lengths are bit widths, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 transport_domain[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_wq_bits { /* Work-queue context layout, embedded in rmpc_bits; 0x600 bits of fixed fields followed by a zero-length pas tail. */
+    uint8_t wq_type[0x4];
+    uint8_t wq_signature[0x1];
+    uint8_t end_padding_mode[0x2];
+    uint8_t cd_slave[0x1];
+    uint8_t reserved_at_8[0x18];
+
+    uint8_t reserved_at_20[0x1];
+    uint8_t reserved_at_21[0x3];
+    uint8_t reserved_at_24[0x7];
+    uint8_t page_offset[0x5];
+    uint8_t reserved_at_30[0x10];
+
+    uint8_t reserved_at_40[0x8];
+    uint8_t pd[0x18];
+
+    uint8_t reserved_at_60[0x8];
+    uint8_t uar_page[0x18];
+
+    uint8_t dbr_addr[0x40];
+
+    uint8_t reserved_at_c0[0x20];
+
+    uint8_t reserved_at_e0[0x20];
+
+    uint8_t reserved_at_100[0xc];
+    uint8_t log_wq_stride[0x4];
+    uint8_t reserved_at_110[0x3];
+    uint8_t log_wq_pg_sz[0x5];
+    uint8_t reserved_at_118[0x3];
+    uint8_t log_wq_sz[0x5];
+
+    uint8_t dbr_umem_valid[0x1];
+    uint8_t wq_umem_valid[0x1];
+    uint8_t reserved_at_122[0x1];
+    uint8_t reserved_at_123[0x5];
+    uint8_t reserved_at_128[0x3];
+    uint8_t reserved_at_12b[0x5];
+    uint8_t reserved_at_130[0x4];
+    uint8_t reserved_at_134[0x4];
+    uint8_t reserved_at_138[0x1];
+    uint8_t reserved_at_139[0x4];
+    uint8_t reserved_at_13d[0x3];
+
+    uint8_t dbr_umem_id[0x20];
+
+    uint8_t wq_umem_id[0x20];
+
+    uint8_t wq_umem_offset[0x40];
+
+    uint8_t reserved_at_1bc[0x20]; /* NOTE(review): the preceding widths sum to 0x1c0, so this field starts at 0x1c0, not 0x1bc */
+
+    uint8_t reserved_at_1dd[0x1]; /* NOTE(review): actually starts at offset 0x1e0; the name is off */
+    uint8_t reserved_at_1e1[0x1];
+    uint8_t reserved_at_1e2[0x2];
+    uint8_t reserved_at_1e4[0x1];
+    uint8_t reserved_at_1e5[0x3];
+    uint8_t reserved_at_1e8[0x5];
+    uint8_t reserved_at_1ed[0x3];
+    uint8_t reserved_at_1f0[0x6];
+    uint8_t reserved_at_1f6[0x2];
+    uint8_t reserved_at_1fa[0x4]; /* NOTE(review): actually starts at offset 0x1f8; the name is off */
+    uint8_t reserved_at_1fc[0x4];
+
+    uint8_t reserved_at_200[0xb];
+    uint8_t reserved_at_20b[0x5];
+    uint8_t reserved_at_210[0x10];
+
+    uint8_t reserved_at_220[0x3e0];
+
+    u8 pas[0][0x40]; /* NOTE(review): zero-length array spelled with u8, while the rest of this struct uses uint8_t and other layouts in this file use pas[][0x40] */
+};
+
+struct mlx5_ifc_rmpc_bits { /* RMP context layout, embedded as ctx in create_rmp_in_bits; 0x180 bits precede the embedded wq. */
+    uint8_t reserved_at_0[0x8];
+    uint8_t state[0x4];
+    uint8_t reserved_at_c[0x14];
+
+    uint8_t basic_cyclic_rcv_wqe[0x1];
+    uint8_t reserved_at_21[0x1f];
+
+    uint8_t reserved_at_40[0x140];
+
+    struct mlx5_ifc_wq_bits wq; /* starts at offset 0x180; ends in a zero-length pas array, so it must stay the last member */
+};
+
+struct mlx5_ifc_create_rmp_in_bits { /* create_rmp input layout; 0x100 bits of header followed by the embedded RMP context. */
+    uint8_t opcode[0x10];
+    uint8_t uid[0x10];
+
+    uint8_t reserved_at_20[0x10];
+    uint8_t op_mod[0x10];
+
+    uint8_t reserved_at_40[0xc0];
+
+    struct mlx5_ifc_rmpc_bits ctx; /* starts at offset 0x100 */
+};
+
+struct mlx5_ifc_create_rmp_out_bits { /* create_rmp output layout; array lengths are field widths in bits, 0x80 total. */
+    uint8_t status[0x8];
+    uint8_t reserved_at_8[0x18];
+
+    uint8_t syndrome[0x20];
+
+    uint8_t reserved_at_40[0x8];
+    uint8_t rmpn[0x18];
+
+    uint8_t reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_sq_out_bits { /* create_sq output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 sqn[0x18]; /* same offset and width as sqn in destroy_sq_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_sq_in_bits { /* destroy_sq input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 sqn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_rq_out_bits { /* create_rq output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 rqn[0x18]; /* same offset and width as rqn in destroy_rq_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_rq_in_bits { /* destroy_rq input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 rqn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_rqt_out_bits { /* create_rqt output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 rqtn[0x18]; /* same offset and width as rqtn in destroy_rqt_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_rqt_in_bits { /* destroy_rqt input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 rqtn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_tis_out_bits { /* create_tis output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 tisn[0x18]; /* same offset and width as tisn in destroy_tis_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_tis_in_bits { /* destroy_tis input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 tisn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_q_counter_out_bits { /* alloc_q_counter output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x18];
+    u8 counter_set_id[0x8]; /* 8-bit id in the low byte of the word; same position as in dealloc_q_counter_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_dealloc_q_counter_in_bits { /* dealloc_q_counter input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x18];
+    u8 counter_set_id[0x8];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_modify_header_context_out_bits { /* alloc_modify_header_context output layout; array lengths are bit widths, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 modify_header_id[0x20]; /* full 32-bit id at offset 0x40 */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_dealloc_modify_header_context_in_bits { /* dealloc_modify_header_context input layout; array lengths are bit widths, 0x80 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 modify_header_id[0x20];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_scheduling_element_out_bits { /* create_scheduling_element output layout; array lengths are bit widths, 0x200 total. */
+    u8 reserved_at_0[0x80];
+
+    u8 scheduling_element_id[0x20]; /* 32-bit id at offset 0x80 */
+
+    u8 reserved_at_a0[0x160];
+};
+
+struct mlx5_ifc_create_scheduling_element_in_bits { /* create_scheduling_element input layout; array lengths are bit widths, 0x400 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 scheduling_hierarchy[0x8];
+    u8 reserved_at_48[0x18];
+
+    u8 reserved_at_60[0x3a0]; /* everything after the hierarchy word is left as one reserved span here */
+};
+
+struct mlx5_ifc_destroy_scheduling_element_in_bits { /* destroy_scheduling_element input layout; array lengths are bit widths, 0x200 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 scheduling_hierarchy[0x8];
+    u8 reserved_at_48[0x18];
+
+    u8 scheduling_element_id[0x20]; /* at offset 0x60 here, versus 0x80 in create_scheduling_element_out_bits */
+
+    u8 reserved_at_80[0x180];
+};
+
+struct mlx5_ifc_add_vxlan_udp_dport_in_bits { /* add_vxlan_udp_dport input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x60]; /* covers the span where delete_vxlan_udp_dport_in_bits names opcode */
+
+    u8 reserved_at_60[0x10];
+    u8 vxlan_udp_port[0x10];
+};
+
+struct mlx5_ifc_delete_vxlan_udp_dport_in_bits { /* delete_vxlan_udp_dport input layout; array lengths are bit widths, 0x80 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x40];
+
+    u8 reserved_at_60[0x10];
+    u8 vxlan_udp_port[0x10]; /* 16-bit port in the last half-word; same position as in add_vxlan_udp_dport_in_bits */
+};
+
+struct mlx5_ifc_set_l2_table_entry_in_bits { /* set_l2_table_entry input layout; array lengths are field widths in bits, 0x200 total. */
+    u8 reserved_at_0[0xa0]; /* covers the span where delete_l2_table_entry_in_bits names opcode */
+
+    u8 reserved_at_a0[0x8];
+    u8 table_index[0x18];
+
+    u8 reserved_at_c0[0x140];
+};
+
+struct mlx5_ifc_delete_l2_table_entry_in_bits { /* delete_l2_table_entry input layout; array lengths are field widths in bits, 0x200 total. */
+    u8 opcode[0x10];
+    u8 reserved_at_10[0x10];
+
+    u8 reserved_at_20[0x80];
+
+    u8 reserved_at_a0[0x8];
+    u8 table_index[0x18]; /* same offset and width as table_index in set_l2_table_entry_in_bits */
+
+    u8 reserved_at_c0[0x140];
+};
+
+struct mlx5_ifc_create_srq_out_bits { /* create_srq output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 srqn[0x18]; /* same offset and width as srqn in destroy_srq_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_srq_in_bits { /* destroy_srq input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 srqn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_xrc_srq_out_bits { /* create_xrc_srq output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 xrc_srqn[0x18]; /* same offset and width as xrc_srqn in destroy_xrc_srq_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_xrc_srq_in_bits { /* destroy_xrc_srq input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 xrc_srqn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_dct_out_bits { /* create_dct output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 dctn[0x18]; /* same offset and width as dctn in destroy_dct_in_bits */
+
+    u8 ece[0x20]; /* the last word is a named field here, where the other create_*_out layouts leave it reserved */
+};
+
+struct mlx5_ifc_destroy_dct_in_bits { /* destroy_dct input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 dctn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_xrq_out_bits { /* create_xrq output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 xrqn[0x18]; /* same offset and width as xrqn in destroy_xrq_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_destroy_xrq_in_bits { /* destroy_xrq input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 xrqn[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_attach_to_mcg_in_bits { /* attach_to_mcg input layout; array lengths are field widths in bits, 0x100 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where detach_from_mcg_in_bits names opcode and uid */
+
+    u8 reserved_at_40[0x8];
+    u8 qpn[0x18];
+
+    u8 reserved_at_60[0x20];
+
+    u8 multicast_gid[16][0x8]; /* 16 entries of 8 bits each, 0x80 bits in all */
+};
+
+struct mlx5_ifc_detach_from_mcg_in_bits { /* detach_from_mcg input layout; array lengths are field widths in bits, 0x100 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 qpn[0x18];
+
+    u8 reserved_at_60[0x20];
+
+    u8 multicast_gid[16][0x8]; /* 16 entries of 8 bits each, 0x80 bits in all */
+};
+
+struct mlx5_ifc_alloc_xrcd_out_bits { /* alloc_xrcd output layout; array lengths are field widths in bits, 0x80 total. */
+    u8 reserved_at_0[0x40]; /* covers the span where other *_out layouts in this file name status/syndrome */
+
+    u8 reserved_at_40[0x8];
+    u8 xrcd[0x18]; /* same offset and width as xrcd in dealloc_xrcd_in_bits */
+
+    u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_dealloc_xrcd_in_bits { /* dealloc_xrcd input layout; array lengths are field widths in bits, 0x80 total. */
+    u8 opcode[0x10];
+    u8 uid[0x10];
+
+    u8 reserved_at_20[0x20]; /* no op_mod field is named in this layout */
+
+    u8 reserved_at_40[0x8];
+    u8 xrcd[0x18];
+
+    u8 reserved_at_60[0x20];
+};
+
+enum { /* presumably values of the 4-bit `state` field in mlx5_ifc_crypto_login_obj_bits - confirm */
+    MLX5_CRYPTO_LOGIN_OBJ_STATE_VALID = 0x0,
+    MLX5_CRYPTO_LOGIN_OBJ_STATE_INVALID = 0x1,
+};
+
+struct mlx5_ifc_crypto_login_obj_bits { /* Crypto login object layout; array lengths are field widths in bits, 0x800 total. */
+    u8 modify_field_select[0x40];
+
+    u8 reserved_at_40[0x40];
+
+    u8 reserved_at_80[0x4];
+    u8 state[0x4];
+    u8 credential_pointer[0x18];
+
+    u8 reserved_at_a0[0x8];
+    u8 session_import_kek_ptr[0x18];
+
+    u8 reserved_at_c0[0x140];
+
+    u8 credential[12][0x20]; /* twelve 0x20-bit words (0x180 bits) occupying offsets 0x200..0x380 */
+
+    u8 reserved_at_380[0x480];
+};
+
+struct mlx5_ifc_create_crypto_login_obj_in_bits { /* Input layout: general-object input header followed by the crypto login object. */
+    struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; /* header type is declared elsewhere */
+    struct mlx5_ifc_crypto_login_obj_bits login_obj;
+};
+
+struct mlx5_ifc_query_crypto_login_obj_out_bits { /* Output layout: general-object output header followed by the crypto login object. */
+    struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; /* header type is declared elsewhere */
+    struct mlx5_ifc_crypto_login_obj_bits obj; /* note: member is named obj here but login_obj in the create input */
+};
+
+enum { /* presumably values of the 8-bit `state` field in mlx5_ifc_encryption_key_obj_bits - confirm */
+    MLX5_ENCRYPTION_KEY_OBJ_STATE_READY = 0x0,
+    MLX5_ENCRYPTION_KEY_OBJ_STATE_ERROR = 0x1,
+};
+
+enum { /* presumably values of the 4-bit `key_size` field in mlx5_ifc_encryption_key_obj_bits - confirm */
+    MLX5_ENCRYPTION_KEY_OBJ_KEY_SIZE_SIZE_128 = 0x0,
+    MLX5_ENCRYPTION_KEY_OBJ_KEY_SIZE_SIZE_256 = 0x1,
+};
+
+enum { /* presumably a value of the 4-bit `key_purpose` field in mlx5_ifc_encryption_key_obj_bits - confirm; only 0x3 is defined here */
+    MLX5_ENCRYPTION_KEY_OBJ_KEY_PURPOSE_AES_XTS = 0x3,
+};
+
+struct mlx5_ifc_encryption_key_obj_bits { /* Encryption key object layout; array lengths are field widths in bits, 0x800 total. */
+    u8 modify_field_select[0x40];
+
+    u8 state[0x8];
+    u8 reserved_at_48[0xc];
+    u8 key_size[0x4];
+    u8 has_keytag[0x1];
+    u8 reserved_at_59[0x3];
+    u8 key_purpose[0x4];
+
+    u8 reserved_at_60[0x8];
+    u8 pd[0x18];
+
+    u8 reserved_at_80[0x100];
+
+    u8 opaque[0x40];
+
+    u8 reserved_at_1c0[0x40];
+
+    u8 key[32][0x20]; /* thirty-two 0x20-bit words (0x400 bits) occupying offsets 0x200..0x600 */
+
+    u8 reserved_at_600[0x200];
+};
+
+struct mlx5_ifc_create_encryption_key_obj_in_bits { /* Input layout: general-object input header followed by the encryption key object. */
+    struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; /* header type is declared elsewhere */
+    struct mlx5_ifc_encryption_key_obj_bits key_obj;
+};
+
+struct mlx5_ifc_query_encryption_key_obj_out_bits { /* Output layout: general-object output header followed by the encryption key object. */
+    struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; /* header type is declared elsewhere */
+    struct mlx5_ifc_encryption_key_obj_bits obj; /* note: member is named obj here but key_obj in the create input */
+};
+
+enum { /* Encryption-order encodings 0x0..0x3; the field that consumes them is not visible in this part of the file. */
+    MLX5_ENCRYPTION_ORDER_ENCRYPTED_WIRE_SIGNATURE = 0x0,
+    MLX5_ENCRYPTION_ORDER_ENCRYPTED_MEMORY_SIGNATURE = 0x1,
+    MLX5_ENCRYPTION_ORDER_ENCRYPTED_RAW_WIRE = 0x2,
+    MLX5_ENCRYPTION_ORDER_ENCRYPTED_RAW_MEMORY = 0x3,
+};
+
+enum { /* Encryption-standard encoding; only AES-XTS (0x0) is defined, and the consuming field is not visible here. */
+    MLX5_ENCRYPTION_STANDARD_AES_XTS = 0x0,
+};
+
+struct mlx5_ifc_nop_in_bits { /* nop input layout (opcode, uid and op_mod only); array lengths are bit widths, 0x80 total. */
+    uint8_t opcode[0x10];
+    uint8_t uid[0x10];
+
+    uint8_t reserved_at_20[0x10];
+    uint8_t op_mod[0x10];
+
+    uint8_t reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_nop_out_bits { /* nop output layout (status and syndrome only); array lengths are bit widths, 0x80 total. */
+    uint8_t status[0x8];
+    uint8_t reserved_at_8[0x18];
+
+    uint8_t syndrome[0x20];
+
+    uint8_t reserved_at_40[0x40];
+};
+
+#endif /* MLX5_IFC_H */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/mlx5_prm.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/mlx5_prm.h
new file mode 100644
index 00000000000..3c5a182a141
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/include/host/mlx5_prm.h
@@ -0,0 +1,170 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef RTE_PMD_MLX5_PRM_H_
+#define RTE_PMD_MLX5_PRM_H_
+
+#include <unistd.h>
+#include <linux/types.h>
+
+#define MLX5_ADAPTER_PAGE_SHIFT 12 /* presumably log2 of a 4096-byte adapter page (1 << 12) - TODO confirm against users */
+
+enum { /* CQE size encodings; the names indicate 64-byte and 128-byte entries. */
+    MLX5_CQE_SIZE_64B = 0x0,
+    MLX5_CQE_SIZE_128B = 0x1,
+};
+
+enum { /* RQ type encodings, presumably for a QP context field - confirm; 0x2 is not defined here. */
+    MLX5_QPC_RQ_TYPE_REGULAR = 0x0,
+    MLX5_QPC_RQ_TYPE_SRQ_RMP_XRC_SRQ_XRQ = 0x1,
+    MLX5_QPC_RQ_TYPE_ZERO_SIZE_RQ = 0x3,
+};
+
+enum { /* RoCE version encodings (1.0, 1.5, 2.0 per the names); the consuming field is not visible in this header. */
+    MLX5_ROCE_ADDR_LAYOUT_ROCE_VERSION_VERSION_1_0 = 0x0,
+    MLX5_ROCE_ADDR_LAYOUT_ROCE_VERSION_VERSION_1_5 = 0x1,
+    MLX5_ROCE_ADDR_LAYOUT_ROCE_VERSION_VERSION_2_0 = 0x2,
+};
+
+enum { /* MTU encodings, presumably for a QP context field - confirm; values start at 0x1, so 0x0 is not defined here. */
+    MLX5_QPC_MTU_256_BYTES = 0x1,
+    MLX5_QPC_MTU_512_BYTES = 0x2,
+    MLX5_QPC_MTU_1K_BYTES = 0x3,
+    MLX5_QPC_MTU_2K_BYTES = 0x4,
+    MLX5_QPC_MTU_4K_BYTES = 0x5,
+    MLX5_QPC_MTU_8K_BYTES = 0x6,
+    MLX5_QPC_MTU_RAW_ETHERNET_QP = 0x7,
+};
+
+enum { /* QP state encodings 0x0..0x6, presumably for a QP context state field - confirm. */
+    MLX5_QPC_STATE_RST = 0x0,
+    MLX5_QPC_STATE_INIT = 0x1,
+    MLX5_QPC_STATE_RTR = 0x2,
+    MLX5_QPC_STATE_RTS = 0x3,
+    MLX5_QPC_STATE_SQER = 0x4,
+    MLX5_QPC_STATE_SQDRAINED = 0x5,
+    MLX5_QPC_STATE_ERR = 0x6,
+};
+
+enum { /* presumably a value of the 3-bit `cqe_sz` field in mlx5_ifc_cqc_bits - confirm; same numeric value as MLX5_CQE_SIZE_64B */
+    MLX5_CQC_CQE_SZ_BYTES_64 = 0x0,
+};
+
+enum { /* presumably indexes of the two CQ doorbell-record words (consumer index, arm) - TODO confirm against users */
+    MLX5_CQ_SET_CI = 0,
+    MLX5_CQ_ARM_DB = 1,
+};
+
+struct mlx5_ifc_cqc_bits { /* CQ context layout, embedded as cq_context in create_cq_in_bits; array lengths are bit widths, 0x200 total. */
+    uint8_t status[0x4];
+    uint8_t as_notify[0x1];
+    uint8_t initiator_src_dct[0x1];
+    uint8_t dbr_umem_valid[0x1];
+    uint8_t reserved_at_7[0x1];
+    uint8_t cqe_sz[0x3];
+    uint8_t cc[0x1];
+    uint8_t reserved_at_c[0x1];
+    uint8_t scqe_break_moderation_en[0x1];
+    uint8_t oi[0x1];
+    uint8_t cq_period_mode[0x2];
+    uint8_t cqe_comp_en[0x1];
+    uint8_t mini_cqe_res_format[0x2];
+    uint8_t st[0x4];
+    uint8_t reserved_at_18[0x1];
+    uint8_t cqe_comp_layout[0x7]; /* ends the first 32-bit word (offset 0x20) */
+    uint8_t dbr_umem_id[0x20];
+    uint8_t reserved_at_40[0x14];
+    uint8_t page_offset[0x6];
+    uint8_t reserved_at_5a[0x2];
+    uint8_t mini_cqe_res_format_ext[0x2];
+    uint8_t cq_timestamp_format[0x2];
+    uint8_t reserved_at_60[0x3];
+    uint8_t log_cq_size[0x5];
+    uint8_t uar_page[0x18];
+    uint8_t reserved_at_80[0x4];
+    uint8_t cq_period[0xc];
+    uint8_t cq_max_count[0x10];
+    uint8_t reserved_at_a0[0x18];
+    uint8_t c_eqn[0x8];
+    uint8_t reserved_at_c0[0x3];
+    uint8_t log_page_size[0x5];
+    uint8_t reserved_at_c8[0x18];
+    uint8_t reserved_at_e0[0x20];
+    uint8_t reserved_at_100[0x8];
+    uint8_t last_notified_index[0x18];
+    uint8_t reserved_at_120[0x8];
+    uint8_t last_solicit_index[0x18];
+    uint8_t reserved_at_140[0x8];
+    uint8_t consumer_counter[0x18];
+    uint8_t reserved_at_160[0x8];
+    uint8_t producer_counter[0x18];
+    uint8_t local_partition_id[0xc]; /* starts at offset 0x180 */
+    uint8_t process_id[0x14];
+    uint8_t reserved_at_1A0[0x20];
+    uint8_t dbr_addr[0x40]; /* final 64-bit field, occupying offsets 0x1c0..0x200 */
+};
+
+struct mlx5_ifc_create_cq_in_bits { /* create_cq input layout; 0x880 bits of fixed fields followed by a variable-length pas[] tail. */
+    uint8_t opcode[0x10];
+    uint8_t uid[0x10];
+    uint8_t reserved_at_20[0x10];
+    uint8_t op_mod[0x10];
+    uint8_t reserved_at_40[0x40];
+    struct mlx5_ifc_cqc_bits cq_context; /* 0x200-bit CQ context occupying offsets 0x80..0x280 */
+    uint8_t cq_umem_offset[0x40];
+    uint8_t cq_umem_id[0x20];
+    uint8_t cq_umem_valid[0x1];
+    uint8_t reserved_at_2e1[0x1f];
+    uint8_t reserved_at_300[0x580];
+    uint8_t pas[]; /* flexible array starting at offset 0x880; one-dimensional here, unlike the pas[][0x40] tails in mlx5_ifc.h */
+};
+
+struct mlx5_err_cqe_ex { /* Error CQE laid out with real byte-sized members (not a bit-width descriptor); members sum to 64 bytes assuming 4-byte __be32 and 2-byte __be16. */
+    uint8_t rsvd0[32];
+    __be32 srqn; /* big-endian per the __be32 type; byte offset 32 */
+    uint8_t rsvd1[16];
+    uint8_t hw_err_synd;
+    uint8_t hw_synd_type;
+    uint8_t vendor_err_synd;
+    uint8_t syndrome;
+    __be32 s_wqe_opcode_qpn; /* presumably packs the send WQE opcode and QP number into one big-endian word - confirm bit split */
+    __be16 wqe_counter;
+    uint8_t signature;
+    uint8_t op_own; /* last byte of the entry (byte offset 63) */
+};
+
+/* If the included headers do not define this flag, define it here (bit 31) so the code
+ * still compiles; the flag itself will not work in that case, and a fallback UAR mechanism is in place.
+ */
+#ifndef MLX5DV_UAR_ALLOC_TYPE_NC_DEDICATED
+#define MLX5DV_UAR_ALLOC_TYPE_NC_DEDICATED (1U << 31)
+#endif
+
+#endif /* RTE_PMD_MLX5_PRM_H_ */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio.cpp
new file mode 100644
index 00000000000..930a19b71d5
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio.cpp
@@ -0,0 +1,942 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <pthread.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <time.h>
+#include <cuda_runtime.h>
+#include <string.h>
+
+#include <atomic>
+#include <set>
+#include <unordered_map>
+#include <mutex>
+
+#include "host/mlx5_prm.h"
+#include "host/mlx5_ifc.h"
+
+#include "doca_verbs_net_wrapper.h"
+#include "doca_internal.hpp"
+#include "host/doca_gpunetio.h"
+#include "doca_gpunetio_gdrcopy.h"
+#include "common/doca_gpunetio_verbs_dev.h"
+#include "host/doca_verbs.h"
+#include "doca_verbs_qp.hpp"
+#include "doca_verbs_cuda_wrapper.h"
+
+/* log2 of the alignment used for GPU memory that is also mapped on the CPU. */
+#define GPU_PAGE_SHIFT 16
+/* 1 << 16 = 64 KiB; doca_gpu_mem_alloc() forces this alignment for DOCA_GPU_MEM_TYPE_GPU_CPU. */
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+/* Lowest compute-capability major for which async store release is reported as supported. */
+#define GPU_FULL_ASYNC_STORE_RELEASE_SUPPORT_COMPUTE_CAP_MAJOR 10
+
+/*
+ * Bookkeeping record for one buffer handed out by doca_gpu_mem_alloc().
+ * Records are stored in doca_gpu::mtable keyed by align_addr_gpu and are
+ * consumed by doca_gpu_mem_free().
+ */
+struct doca_gpu_mtable {
+    uintptr_t base_addr;          /* address returned by the underlying allocator; this is what gets freed */
+    size_t size_orig;             /* bytes actually allocated (size + alignment for cudaMalloc'ed buffers) */
+    uintptr_t align_addr_gpu;     /* address given to the caller for GPU use; key in the mtable map */
+    uintptr_t align_addr_cpu;     /* address given to the caller for CPU use; 0 for GPU-only buffers */
+    size_t size;                  /* size requested by the caller */
+    enum doca_gpu_mem_type mtype; /* memory type requested by the caller */
+    void *gdr_mh;                 /* handle filled in by doca_gpu_gdrcopy_create_mapping(), if used */
+};
+
+/*
+ * Per-GPU handle created by doca_gpu_create() and released by doca_gpu_destroy().
+ * The handle itself is calloc()'ed, which is why the C++ map is held by pointer
+ * and allocated/deleted explicitly.
+ */
+struct doca_gpu {
+    CUdevice cuda_dev; /* CUDA device handler */
+    std::unordered_map<uintptr_t, struct doca_gpu_mtable *>
+        *mtable;                       /* Table of GPU/CPU memory allocated addresses,
+                                          keyed by the GPU address returned to the caller */
+    bool support_gdrcopy;              ///< Boolean value that indicates if gdrcopy is
+                                       ///< supported
+    bool support_dmabuf;               ///< Boolean value that indicates if dmabuf is
+                                       ///< supported by the gpu
+    bool support_wq_gpumem;            ///< Boolean value that indicates if work queues can be
+                                       ///< placed in gpumem (nic-gpu mapping supported)
+    bool support_cq_gpumem;            ///< Boolean value that indicates if completion queues can
+                                       ///< be placed in gpumem (nic-gpu mapping supported)
+    bool support_uar_gpumem;           ///< Boolean value that indicates if gpumem is
+                                       ///< available and gpu-nic mapping is supported
+    bool support_async_store_release;  ///< Boolean value that indicates if
+                                       ///< async store release is supported
+    bool support_bf_uar;               ///< Boolean value that indicates if BlueFlame
+                                       ///< is supported
+};
+
+/*
+ * State of the background thread that drives CPU-proxy QPs
+ * (see priv_service_mainloop()).
+ */
+struct doca_gpu_verbs_service {
+    pthread_t service_thread;      /* thread running priv_service_mainloop() */
+    pthread_rwlock_t service_lock; /* held for reading by the service thread while it walks qps */
+    /* Loop condition of the service thread.
+     * NOTE(review): plain bool read by the service thread without synchronization -
+     * confirm whether the writer side needs an atomic here. */
+    bool running;
+    std::set<struct doca_gpu_verbs_qp *> *qps; /* QPs progressed by the service thread */
+};
+
+/*
+ * Report whether the current CUDA device supports async store release.
+ *
+ * The decision is based on the major compute capability of the device returned
+ * by cudaGetDevice(). Any CUDA query failure is reported as "not supported".
+ */
+static inline bool priv_query_async_store_release_support(void) {
+    int dev_id;
+    int cc_major;
+
+    if (cudaGetDevice(&dev_id) != cudaSuccess) return false;
+
+    if (cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, dev_id) !=
+        cudaSuccess)
+        return false;
+
+    return cc_major >= GPU_FULL_ASYNC_STORE_RELEASE_SUPPORT_COMPUTE_CAP_MAJOR;
+}
+
+/* Return true when exactly one bit of x is set; zero is not a power of two. */
+bool priv_is_power_of_two(uint64_t x) {
+    if (x == 0) return false;
+
+    /* Clearing the lowest set bit leaves nothing only for powers of two. */
+    return (x & (x - 1)) == 0;
+}
+
+/*
+ * Return the system page size reported by sysconf(_SC_PAGESIZE), or 4096
+ * (4KB, default Linux page size) when the query fails.
+ */
+static size_t priv_get_page_size() {
+    const long page_sz = sysconf(_SC_PAGESIZE);
+
+    return (page_sz == -1) ? (size_t)4096 : (size_t)page_sz;
+}
+
+/*
+ * Create a DOCA GPUNetIO handle for the GPU identified by a PCI bus id.
+ *
+ * @param gpu_bus_id  PCI bus id string understood by cudaDeviceGetByPCIBusId().
+ * @param gpu_dev     Output: newly allocated handle; written only on success.
+ *                    Release it with doca_gpu_destroy().
+ *
+ * @return DOCA_SUCCESS on success,
+ *         DOCA_ERROR_INVALID_VALUE if an argument is NULL,
+ *         DOCA_ERROR_NO_MEMORY if the handle cannot be allocated,
+ *         DOCA_ERROR_INITIALIZATION on any CUDA or map-allocation failure.
+ */
+doca_error_t doca_gpu_create(const char *gpu_bus_id, struct doca_gpu **gpu_dev) {
+    struct doca_gpu *gpu_dev_;
+    int dmabuf_supported = 0;
+    CUresult res_drv = CUDA_SUCCESS;
+    cudaError_t res_cuda = cudaSuccess;
+
+    if (gpu_bus_id == nullptr || gpu_dev == nullptr) {
+        DOCA_LOG(LOG_ERR, "Invalid input parameters.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    /* calloc() leaves every capability flag false and mtable NULL until set below. */
+    gpu_dev_ = (struct doca_gpu *)calloc(1, sizeof(struct doca_gpu));
+    if (gpu_dev_ == nullptr) {
+        DOCA_LOG(LOG_ERR, "error in %s: failed to allocate memory for doca_gpu", __func__);
+        return DOCA_ERROR_NO_MEMORY;
+    }
+
+    res_cuda = cudaDeviceGetByPCIBusId(&gpu_dev_->cuda_dev, gpu_bus_id);
+    if (res_cuda != cudaSuccess) {
+        /* Report the runtime API result; the driver result is still untouched here. */
+        DOCA_LOG(LOG_ERR, "Invalid GPU bus id provided (ret %d).", res_cuda);
+        goto exit_error;
+    }
+
+    res_drv = doca_verbs_wrapper_cuDeviceGetAttribute(
+        &(dmabuf_supported), CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, gpu_dev_->cuda_dev);
+    if (res_drv != CUDA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED returned %d.",
+                 res_drv);
+        goto exit_error;
+    }
+
+    gpu_dev_->support_dmabuf = (dmabuf_supported == 1);
+
+    // TBD: these capabilities are currently assumed rather than probed.
+    gpu_dev_->support_wq_gpumem = true;
+    gpu_dev_->support_cq_gpumem = true;
+    gpu_dev_->support_uar_gpumem = true;
+    gpu_dev_->support_bf_uar = true;
+    gpu_dev_->support_async_store_release = priv_query_async_store_release_support();
+    gpu_dev_->support_gdrcopy = doca_gpu_gdrcopy_is_supported();
+
+    try {
+        gpu_dev_->mtable = new std::unordered_map<uintptr_t, struct doca_gpu_mtable *>();
+    } catch (...) {
+        DOCA_LOG(LOG_ERR, "mtable map allocation failed");
+        goto exit_error;
+    }
+
+    (*gpu_dev) = gpu_dev_;
+
+    return DOCA_SUCCESS;
+
+exit_error:
+    free(gpu_dev_);
+
+    return DOCA_ERROR_INITIALIZATION;
+}
+
+/*
+ * Release a handle created by doca_gpu_create().
+ *
+ * The handle is left untouched and DOCA_ERROR_INVALID_VALUE is returned when
+ * gpu_dev is NULL or when buffers allocated through it have not been freed yet.
+ */
+doca_error_t doca_gpu_destroy(struct doca_gpu *gpu_dev) {
+    if (!gpu_dev) {
+        DOCA_LOG(LOG_ERR, "Invalid input parameters.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    auto *table = gpu_dev->mtable;
+    if (table) {
+        /* Outstanding allocations still reference this device. */
+        if (!table->empty()) {
+            DOCA_LOG(LOG_ERR, "mtable map is not empty.");
+            return DOCA_ERROR_INVALID_VALUE;
+        }
+        delete table;
+    }
+
+    free(gpu_dev);
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Allocate `size` bytes of GPU memory with `alignment` bytes of slack, align the
+ * address inside that slack and set the SYNC_MEMOPS pointer attribute on it.
+ *
+ * On success *base_out is the pointer to pass to cudaFree() and *aligned_out is
+ * the aligned address. On failure nothing stays allocated.
+ */
+static doca_error_t priv_alloc_gpu_aligned(size_t size, size_t alignment, void **base_out,
+                                           void **aligned_out) {
+    void *base = nullptr;
+    void *aligned = nullptr;
+    unsigned int flag = 1;
+    cudaError_t res;
+    CUresult res_drv;
+
+    res = cudaMalloc(&base, size);
+    if (res != cudaSuccess) {
+        DOCA_LOG(LOG_ERR, "cudaMalloc current failed with %s size %zd", cudaGetErrorString(res),
+                 size);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    /* Align memory address */
+    aligned = base;
+    if (alignment && ((uintptr_t)aligned) % alignment)
+        aligned = (void *)((uintptr_t)aligned + (alignment - (((uintptr_t)aligned) % alignment)));
+
+    /* GPUDirect RDMA attribute required */
+    res_drv = doca_verbs_wrapper_cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
+                                                       (CUdeviceptr)aligned);
+    if (res_drv != CUDA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Could not set SYNC MEMOP attribute for GPU memory at %lx, err %d",
+                 (uintptr_t)aligned, res_drv);
+        cudaFree(base);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    *base_out = base;
+    *aligned_out = aligned;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Allocate `size` zeroed bytes of host memory aligned to `alignment`, register
+ * them with CUDA as mapped memory and fetch the matching device pointer.
+ *
+ * On success *cpu_out must later be released with cudaHostUnregister() + free().
+ * On failure nothing stays allocated or registered.
+ */
+static doca_error_t priv_alloc_host_mapped(size_t size, size_t alignment, void **cpu_out,
+                                           void **gpu_out) {
+    void *host_mem = nullptr;
+    void *dev_mem = nullptr;
+    cudaError_t res;
+
+    /* posix_memalign() needs a power of two that is a multiple of sizeof(void *). */
+    if (alignment < sizeof(void *)) alignment = sizeof(void *);
+
+    if (posix_memalign(&host_mem, alignment, size) != 0) {
+        DOCA_LOG(LOG_ERR, "Failed to allocate CPU memory.");
+        return DOCA_ERROR_DRIVER;
+    }
+    memset(host_mem, 0, size);
+
+    res = cudaHostRegister(host_mem, size, cudaHostRegisterPortable | cudaHostRegisterMapped);
+    if (res != cudaSuccess) {
+        DOCA_LOG(LOG_ERR, "Could not register CPU memory to CUDA %lx, err %d",
+                 (uintptr_t)host_mem, res);
+        free(host_mem);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    res = cudaHostGetDevicePointer(&dev_mem, host_mem, 0);
+    if (res != cudaSuccess) {
+        DOCA_LOG(LOG_ERR, "Could not get GPU device ptr for CPU memory %lx, err %d",
+                 (uintptr_t)host_mem, res);
+        /* Undo the registration before releasing the pages. */
+        cudaHostUnregister(host_mem);
+        free(host_mem);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    *cpu_out = host_mem;
+    *gpu_out = dev_mem;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Allocate a buffer usable by the GPU and, depending on mtype, by the CPU too.
+ *
+ * @param gpu_dev     Handle from doca_gpu_create().
+ * @param size        Requested size in bytes; must be non-zero.
+ * @param alignment   Power-of-two alignment; 0 selects the system page size.
+ *                    DOCA_GPU_MEM_TYPE_GPU_CPU always uses GPU_PAGE_SIZE.
+ * @param mtype       DOCA_GPU_MEM_TYPE_GPU:     GPU memory only.
+ *                    DOCA_GPU_MEM_TYPE_GPU_CPU: GPU memory mapped on the CPU through
+ *                    GDRCopy; falls back to DOCA_GPU_MEM_TYPE_CPU_GPU when GDRCopy
+ *                    is not supported.
+ *                    DOCA_GPU_MEM_TYPE_CPU_GPU: host memory registered with CUDA.
+ * @param memptr_gpu  Output: GPU address. Pass it to doca_gpu_mem_free().
+ * @param memptr_cpu  Output: CPU address (NULL value for GPU-only memory). May be a
+ *                    NULL pointer only for DOCA_GPU_MEM_TYPE_GPU.
+ *
+ * Outputs are written only on success; on failure every intermediate resource
+ * is released.
+ *
+ * @return DOCA_SUCCESS, DOCA_ERROR_INVALID_VALUE for bad arguments,
+ *         DOCA_ERROR_NO_MEMORY if the bookkeeping record cannot be allocated,
+ *         DOCA_ERROR_DRIVER for allocation, CUDA, GDRCopy or map failures.
+ */
+doca_error_t doca_gpu_mem_alloc(struct doca_gpu *gpu_dev, size_t size, size_t alignment,
+                                enum doca_gpu_mem_type mtype, void **memptr_gpu,
+                                void **memptr_cpu) {
+    int ret;
+    void *gpu_base = nullptr;    /* cudaMalloc() result, owned until inserted in mtable */
+    void *gpu_aligned = nullptr; /* address handed to the caller for GPU use */
+    void *host_mem = nullptr;    /* registered host buffer, owned until inserted in mtable */
+    bool gdr_mapped = false;
+    bool use_gpu_mem = false;
+    struct doca_gpu_mtable *mentry = nullptr;
+    doca_error_t status = DOCA_SUCCESS;
+
+    if (gpu_dev == nullptr) {
+        DOCA_LOG(LOG_ERR, "Invalid DOCA GPUNetIO instance provided.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (memptr_gpu == nullptr) {
+        DOCA_LOG(LOG_ERR, "Invalid memptr_gpu provided.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (mtype != DOCA_GPU_MEM_TYPE_GPU && memptr_cpu == nullptr) {
+        DOCA_LOG(LOG_ERR, "Invalid memptr_cpu provided.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (size == 0) {
+        DOCA_LOG(LOG_ERR, "Invalid size provided.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (mtype != DOCA_GPU_MEM_TYPE_GPU && mtype != DOCA_GPU_MEM_TYPE_GPU_CPU &&
+        mtype != DOCA_GPU_MEM_TYPE_CPU_GPU) {
+        DOCA_LOG(LOG_ERR, "Invalid memory type provided.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (alignment == 0) alignment = priv_get_page_size();
+
+    if (priv_is_power_of_two(alignment) == false) {
+        DOCA_LOG(LOG_ERR, "alignment %zd has to be power of 2.", alignment);
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    mentry = (struct doca_gpu_mtable *)calloc(1, sizeof(struct doca_gpu_mtable));
+    if (mentry == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to allocate CPU memory.");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+    mentry->mtype = mtype;
+    mentry->size = size;
+
+    if (mtype == DOCA_GPU_MEM_TYPE_GPU_CPU) alignment = GPU_PAGE_SIZE;
+
+    use_gpu_mem = (mtype == DOCA_GPU_MEM_TYPE_GPU) ||
+                  (mtype == DOCA_GPU_MEM_TYPE_GPU_CPU && gpu_dev->support_gdrcopy == true);
+
+    if (use_gpu_mem) {
+        /* Over-allocate so the aligned address always fits in the buffer. */
+        mentry->size_orig = mentry->size + alignment;
+
+        status = priv_alloc_gpu_aligned(mentry->size_orig, alignment, &gpu_base, &gpu_aligned);
+        if (status != DOCA_SUCCESS) goto error;
+
+        mentry->base_addr = (uintptr_t)gpu_base;
+        mentry->align_addr_gpu = (uintptr_t)gpu_aligned;
+        mentry->align_addr_cpu = 0;
+
+        if (mtype == DOCA_GPU_MEM_TYPE_GPU_CPU) {
+            ret =
+                doca_gpu_gdrcopy_create_mapping((void *)mentry->align_addr_gpu, mentry->size,
+                                                &mentry->gdr_mh, (void **)&mentry->align_addr_cpu);
+            if (ret) {
+                DOCA_LOG(LOG_ERR, "Error mapping GPU memory at %lx to CPU", mentry->align_addr_gpu);
+                status = DOCA_ERROR_DRIVER;
+                goto error;
+            }
+            gdr_mapped = true;
+        }
+    } else {
+        if (mtype == DOCA_GPU_MEM_TYPE_GPU_CPU) {
+            DOCA_LOG(LOG_WARNING,
+                     "GDRCopy not enabled, can't allocate memory type DOCA_GPU_MEM_TYPE_GPU_CPU. "
+                     "Using DOCA_GPU_MEM_TYPE_CPU_GPU mode instead");
+            /* Record what was really allocated so doca_gpu_mem_free() releases host memory. */
+            mentry->mtype = DOCA_GPU_MEM_TYPE_CPU_GPU;
+        }
+
+        mentry->size_orig = mentry->size;
+
+        status = priv_alloc_host_mapped(mentry->size_orig, alignment, &host_mem, &gpu_aligned);
+        if (status != DOCA_SUCCESS) goto error;
+
+        mentry->base_addr = (uintptr_t)host_mem;
+        mentry->align_addr_gpu = (uintptr_t)gpu_aligned;
+        mentry->align_addr_cpu = (uintptr_t)host_mem;
+    }
+
+    try {
+        gpu_dev->mtable->insert({mentry->align_addr_gpu, mentry});
+    } catch (...) {
+        DOCA_LOG(LOG_ERR, "mtable map insert failed");
+        status = DOCA_ERROR_DRIVER;
+        goto error;
+    }
+
+    /* Publish the result only once nothing can fail anymore. The GPU pointer is
+     * written first, so a caller passing the same location for both outputs ends
+     * up with the CPU address. */
+    *memptr_gpu = (void *)mentry->align_addr_gpu;
+    if (memptr_cpu) *memptr_cpu = (void *)mentry->align_addr_cpu;
+
+    return DOCA_SUCCESS;
+
+error:
+    if (gdr_mapped)
+        doca_gpu_gdrcopy_destroy_mapping(mentry->gdr_mh, (void *)mentry->align_addr_cpu,
+                                         mentry->size);
+    if (gpu_base != nullptr) cudaFree(gpu_base);
+    if (host_mem != nullptr) {
+        cudaHostUnregister(host_mem);
+        free(host_mem);
+    }
+    free(mentry);
+    return status;
+}
+
+/*
+ * Release a buffer obtained from doca_gpu_mem_alloc().
+ *
+ * @param gpu_dev     Handle the buffer was allocated from.
+ * @param memptr_gpu  GPU address returned by doca_gpu_mem_alloc().
+ *
+ * @return DOCA_SUCCESS, or DOCA_ERROR_INVALID_VALUE when an argument is NULL or
+ *         the address is not tracked by this handle.
+ */
+doca_error_t doca_gpu_mem_free(struct doca_gpu *gpu_dev, void *memptr_gpu) {
+    struct doca_gpu_mtable *mentry;
+    cudaError_t res_cuda;
+
+    if (gpu_dev == nullptr) {
+        DOCA_LOG(LOG_ERR, "Invalid DOCA GPUNetIO instance provided.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (memptr_gpu == nullptr) {
+        DOCA_LOG(LOG_ERR, "Invalid memptr_gpu provided.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    /* auto keeps the iterator type in sync with the map declaration (uintptr_t key). */
+    auto it = gpu_dev->mtable->find((uintptr_t)memptr_gpu);
+    if (it == gpu_dev->mtable->end()) {
+        DOCA_LOG(LOG_ERR, "memptr_gpu = %p was not allocated by DOCA GPUNetIO.", memptr_gpu);
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    mentry = it->second;
+
+    if (mentry->mtype == DOCA_GPU_MEM_TYPE_GPU) {
+        cudaFree((void *)mentry->base_addr);
+    } else if (mentry->mtype == DOCA_GPU_MEM_TYPE_GPU_CPU && gpu_dev->support_gdrcopy) {
+        doca_gpu_gdrcopy_destroy_mapping(mentry->gdr_mh, (void *)mentry->align_addr_cpu,
+                                         mentry->size);
+        cudaFree((void *)mentry->base_addr);
+    } else {
+        /* Host-backed buffer: DOCA_GPU_MEM_TYPE_CPU_GPU, or DOCA_GPU_MEM_TYPE_GPU_CPU
+         * allocated without GDRCopy, which doca_gpu_mem_alloc() serves from registered
+         * host memory. It must not be handed to cudaFree(). */
+        res_cuda = cudaHostUnregister((void *)mentry->base_addr);
+        if (res_cuda != cudaSuccess)
+            DOCA_LOG(LOG_ERR, "Error unregistering GPU memory at %p", (void *)mentry->base_addr);
+        free((void *)mentry->base_addr);
+    }
+
+    gpu_dev->mtable->erase(it);
+    free(mentry);
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Obtain a DMABuf file descriptor for a range of GPU memory.
+ *
+ * Always returns DOCA_ERROR_NOT_SUPPORTED when the build does not define
+ * DOCA_GPUNETIO_HAVE_CUDA_DMABUF as 1. Otherwise returns
+ * DOCA_ERROR_INVALID_VALUE for a NULL gpu_dev/dmabuf_fd and
+ * DOCA_ERROR_NOT_SUPPORTED when the device lacks DMABuf support or the CUDA
+ * driver call fails.
+ */
+doca_error_t doca_gpu_dmabuf_fd(struct doca_gpu *gpu_dev, void *memptr_gpu, size_t size,
+                                int *dmabuf_fd) {
+#if DOCA_GPUNETIO_HAVE_CUDA_DMABUF == 1
+    if (!gpu_dev) {
+        DOCA_LOG(LOG_ERR, "Invalid DOCA GPUNetIO instance provided.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (!gpu_dev->support_dmabuf) {
+        DOCA_LOG(LOG_ERR, "DMABuf not supported on this system by this CUDA installation.");
+        return DOCA_ERROR_NOT_SUPPORTED;
+    }
+
+    if (!dmabuf_fd) {
+        DOCA_LOG(LOG_ERR, "Invalid DMABuf fd pointer provided.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    const CUresult handle_res = doca_verbs_wrapper_cuMemGetHandleForAddressRange(
+        dmabuf_fd, (CUdeviceptr)memptr_gpu, size, CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0);
+    if (handle_res == CUDA_SUCCESS) return DOCA_SUCCESS;
+
+    DOCA_LOG(LOG_ERR, "cuMemGetHandleForAddressRange returned %d.", handle_res);
+    return DOCA_ERROR_NOT_SUPPORTED;
+#else
+    return DOCA_ERROR_NOT_SUPPORTED;
+#endif
+}
+
+/*
+ * Probe whether the doorbell page `db` can be registered with CUDA as IO memory.
+ *
+ * The probe performs a trial cudaHostRegister(); a registration made by the
+ * probe is undone before returning. An already-registered page counts as
+ * registrable and is left registered.
+ *
+ * @return DOCA_ERROR_INVALID_VALUE for NULL arguments, DOCA_SUCCESS otherwise
+ *         (the probe outcome is reported through *out_can_register).
+ */
+doca_error_t doca_gpu_verbs_can_gpu_register_uar(void *db, bool *out_can_register) {
+    if (!db || !out_can_register) return DOCA_ERROR_INVALID_VALUE;
+
+    const unsigned int reg_flags =
+        cudaHostRegisterPortable | cudaHostRegisterMapped | cudaHostRegisterIoMemory;
+    const cudaError_t probe_res = cudaHostRegister(db, DOCA_VERBS_DB_UAR_SIZE, reg_flags);
+
+    if (probe_res == cudaSuccess) {
+        *out_can_register = true;
+        cudaHostUnregister(db);
+    } else {
+        *out_can_register = (probe_res == cudaErrorHostMemoryAlreadyRegistered);
+    }
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Hash functor that lets a (pointer, CUcontext) pair be used as an
+ * unordered_map key.
+ */
+struct pair_ptr_cucontext_hash {
+    std::size_t operator()(const std::pair<void *, CUcontext> &p) const noexcept {
+        // Hash the pointer and the CUcontext (which is also a pointer type)
+        std::size_t h1 = std::hash<void *>{}(p.first);
+        std::size_t h2 = std::hash<CUcontext>{}(p.second);
+        // Combine the two hashes; the shift keeps (a, b) and (b, a) from colliding trivially
+        return h1 ^ (h2 << 1);
+    }
+};
+/*
+ * Number of outstanding doca_gpu_verbs_export_uar() calls per (UAR address,
+ * CUDA context). doca_gpu_verbs_unexport_uar() unregisters the UAR when the
+ * count drops to zero.
+ */
+static std::unordered_map<std::pair<void *, CUcontext>, unsigned int, pair_ptr_cucontext_hash>
+    registered_uar_refcount;
+/* Guards registered_uar_refcount; taken by the export/unexport functions. */
+static std::mutex registered_uar_mutex;
+
+/*
+ * Make the doorbell page `sq_db` accessible from the GPU.
+ *
+ * Registers the page with CUDA (an existing registration is accepted), looks up
+ * its device pointer and bumps the reference count kept for
+ * (sq_db, current CUDA context).
+ *
+ * @param sq_db         Host address of the doorbell page.
+ * @param uar_addr_gpu  Output: device pointer for the page; written on success.
+ *
+ * @return DOCA_SUCCESS, DOCA_ERROR_INVALID_VALUE for NULL arguments,
+ *         DOCA_ERROR_DRIVER when a CUDA call fails.
+ */
+doca_error_t doca_gpu_verbs_export_uar(uint64_t *sq_db, uint64_t **uar_addr_gpu) {
+    std::lock_guard<std::mutex> guard(registered_uar_mutex);
+
+    if (!sq_db || !uar_addr_gpu) return DOCA_ERROR_INVALID_VALUE;
+
+    // Get current CUDA context
+    CUcontext ctx = nullptr;
+    const CUresult ctx_res = doca_verbs_wrapper_cuCtxGetCurrent(&ctx);
+    if (ctx_res != CUDA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to get current CUDA context (err %d)", ctx_res);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    const cudaError_t reg_res = cudaHostRegister(
+        sq_db, DOCA_VERBS_DB_UAR_SIZE,
+        cudaHostRegisterPortable | cudaHostRegisterMapped | cudaHostRegisterIoMemory);
+    // Only a registration made here may be rolled back on a later failure.
+    const bool newly_registered = (reg_res == cudaSuccess);
+    if (!newly_registered && reg_res != cudaErrorHostMemoryAlreadyRegistered) {
+        DOCA_LOG(LOG_ERR,
+                 "Function cudaHostRegister (err %d) "
+                 "failed on addr %p size %d",
+                 reg_res, (void *)sq_db, DOCA_VERBS_DB_UAR_SIZE);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    void *dev_ptr = nullptr;
+    const cudaError_t map_res = cudaHostGetDevicePointer(&dev_ptr, sq_db, 0);
+    if (map_res != cudaSuccess) {
+        DOCA_LOG(LOG_ERR,
+                 "Function cudaHostGetDevicePointer (err %d) "
+                 "failed on addr %p size %d",
+                 map_res, (void *)sq_db, DOCA_VERBS_DB_UAR_SIZE);
+        if (newly_registered) cudaHostUnregister(sq_db);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    // operator[] value-initializes a missing counter to 0 before the increment.
+    ++registered_uar_refcount[std::make_pair((void *)sq_db, ctx)];
+
+    *uar_addr_gpu = (uint64_t *)dev_ptr;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Drop one reference on an exported doorbell page and unregister it from CUDA
+ * when the last reference goes away.
+ *
+ * The lookup key is (uar_addr_gpu, current CUDA context).
+ * NOTE(review): doca_gpu_verbs_export_uar() keys the table by the host address,
+ * so this lookup presumably relies on the device pointer being equal to the
+ * host pointer - confirm.
+ *
+ * @return DOCA_SUCCESS, DOCA_ERROR_INVALID_VALUE for a NULL or unknown address,
+ *         DOCA_ERROR_DRIVER when a CUDA call fails.
+ */
+doca_error_t doca_gpu_verbs_unexport_uar(uint64_t *uar_addr_gpu) {
+    std::lock_guard<std::mutex> guard(registered_uar_mutex);
+
+    if (!uar_addr_gpu) return DOCA_ERROR_INVALID_VALUE;
+
+    // Get current CUDA context
+    CUcontext ctx = nullptr;
+    const CUresult ctx_res = doca_verbs_wrapper_cuCtxGetCurrent(&ctx);
+    if (ctx_res != CUDA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to get current CUDA context (err %d)", ctx_res);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    auto entry = registered_uar_refcount.find(std::make_pair((void *)uar_addr_gpu, ctx));
+    if (entry == registered_uar_refcount.end()) {
+        DOCA_LOG(LOG_ERR, "UAR address %p with context %p not found in registered_uar_refcount",
+                 uar_addr_gpu, ctx);
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    unsigned int &refs = entry->second;
+    --refs;
+    assert(refs >= 0);
+    if (refs != 0) return DOCA_SUCCESS;
+
+    // Last user is gone: forget the entry, then release the CUDA registration.
+    registered_uar_refcount.erase(entry);
+    if (cudaHostUnregister(uar_addr_gpu) != cudaSuccess) {
+        DOCA_LOG(LOG_ERR, "Failed to unregister UAR address %p", uar_addr_gpu);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Build the host-side description of a verbs QP so it can be driven from the GPU.
+ *
+ * @param gpu_dev              Handle from doca_gpu_create().
+ * @param qp                   Verbs QP to export.
+ * @param nic_handler          How doorbells are rung. CPU_PROXY never exports the UAR
+ *                             to the GPU; AUTO falls back to CPU proxy when the UAR
+ *                             export fails; any other value fails if the export fails.
+ * @param gpu_qp_umem_dev_ptr  Not used by this function.
+ * @param cq_sq                Completion queue attached to the send queue.
+ * @param qp_out               Output: newly allocated wrapper; set to NULL on failure
+ *                             once allocation has been attempted. Release it with
+ *                             doca_gpu_verbs_unexport_qp().
+ *
+ * @return DOCA_SUCCESS, DOCA_ERROR_INVALID_VALUE for NULL arguments,
+ *         DOCA_ERROR_NO_MEMORY on host allocation failure, or the status of the
+ *         failing UAR export / doorbell allocation.
+ */
+doca_error_t doca_gpu_verbs_export_qp(struct doca_gpu *gpu_dev, struct doca_verbs_qp *qp,
+                                      enum doca_gpu_dev_verbs_nic_handler nic_handler,
+                                      void *gpu_qp_umem_dev_ptr, struct doca_verbs_cq *cq_sq,
+                                      struct doca_gpu_verbs_qp **qp_out) {
+    doca_error_t status = DOCA_SUCCESS, tmp_status = DOCA_SUCCESS;
+    struct doca_gpu_dev_verbs_qp *qp_cpu_ = nullptr;
+    void *rq_wqe_daddr;
+    uint32_t rq_wqe_num;
+    uint32_t rcv_wqe_size;
+    uint64_t *sq_db;
+    uint32_t sq_wqe_num;
+    uint64_t *uar_db_reg = NULL;
+    uint32_t *arm_dbr = NULL;
+    uint32_t *cq_dbrec;
+    uint32_t *dbrec;
+    bool uar_exported = false;
+
+    /* qp_out is dereferenced right below, so it has to be validated as well. */
+    if (gpu_dev == nullptr || qp == nullptr || cq_sq == nullptr || qp_out == nullptr)
+        return DOCA_ERROR_INVALID_VALUE;
+
+    *qp_out = (struct doca_gpu_verbs_qp *)calloc(1, sizeof(struct doca_gpu_verbs_qp));
+    if (*qp_out == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to allocate CPU memory");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+
+    (*qp_out)->qp_cpu =
+        (struct doca_gpu_dev_verbs_qp *)calloc(1, sizeof(struct doca_gpu_dev_verbs_qp));
+    if ((*qp_out)->qp_cpu == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to allocate CPU memory");
+        free(*qp_out);
+        *qp_out = nullptr;
+        return DOCA_ERROR_NO_MEMORY;
+    }
+
+    qp_cpu_ = (*qp_out)->qp_cpu;
+
+    // Should this be propagated to GPU?
+    if (qp->get_uar_mtype() == DOCA_VERBS_UAR_ALLOCATION_TYPE_BLUEFLAME)
+        gpu_dev->support_bf_uar = true;
+
+    // Check QP and CQ same size!!!!
+
+    doca_verbs_qp_get_wq(qp,
+                         (void **)&(qp_cpu_->sq_wqe_daddr),  // broken for external umem
+                         &sq_wqe_num,
+                         (void **)&(rq_wqe_daddr),  // broken for external umem
+                         &rq_wqe_num, &rcv_wqe_size);
+
+    dbrec = reinterpret_cast<uint32_t *>(doca_verbs_qp_get_dbr_addr(qp));
+
+    qp_cpu_->sq_wqe_num = (uint16_t)sq_wqe_num;
+    qp_cpu_->sq_wqe_mask = qp_cpu_->sq_wqe_num - 1;
+    qp_cpu_->sq_num = doca_verbs_qp_get_qpn(qp);
+    /* Pre-compute the big-endian "QPN << 8 | ds" words for 0..4 data segments. */
+    qp_cpu_->sq_num_shift8 = qp_cpu_->sq_num << 8;
+    qp_cpu_->sq_num_shift8_be = htobe32(qp_cpu_->sq_num_shift8);
+    qp_cpu_->sq_num_shift8_be_1ds = htobe32(qp_cpu_->sq_num_shift8 | 1);
+    qp_cpu_->sq_num_shift8_be_2ds = htobe32(qp_cpu_->sq_num_shift8 | 2);
+    qp_cpu_->sq_num_shift8_be_3ds = htobe32(qp_cpu_->sq_num_shift8 | 3);
+    qp_cpu_->sq_num_shift8_be_4ds = htobe32(qp_cpu_->sq_num_shift8 | 4);
+    qp_cpu_->sq_wqe_pi = 0;
+    qp_cpu_->sq_rsvd_index = 0;
+    qp_cpu_->sq_ready_index = 0;
+    qp_cpu_->sq_lock = 0;
+    qp_cpu_->sq_dbrec = (__be32 *)(dbrec + DOCA_GPUNETIO_IB_MLX5_SND_DBR);
+    qp_cpu_->mem_type = DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU;
+    (*qp_out)->cpu_db = nullptr;
+    (*qp_out)->sq_db = nullptr;
+    (*qp_out)->sq_wqe_pi_last = 0;
+    (*qp_out)->cpu_proxy = false;
+    (*qp_out)->qp_gpu = nullptr;
+    (*qp_out)->qp = qp;
+
+    sq_db = reinterpret_cast<uint64_t *>(doca_verbs_qp_get_uar_addr(qp));
+
+    if (nic_handler != DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY) {
+        status = doca_gpu_verbs_export_uar(sq_db, (uint64_t **)&(qp_cpu_->sq_db));
+        uar_exported = (status == DOCA_SUCCESS);
+        if (status != DOCA_SUCCESS && nic_handler != DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO) {
+            DOCA_LOG(LOG_ERR, "Can't export UAR to GPU.");
+            goto destroy_uar;
+        }
+    }
+
+    if ((status != DOCA_SUCCESS && nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO) ||
+        nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY) {
+        DOCA_LOG(LOG_WARNING, "Enabling CPU proxy mode");
+
+        /* Both outputs target cpu_db; it ends up holding the CPU address, which is
+         * written last by doca_gpu_mem_alloc(). */
+        status = doca_gpu_mem_alloc(gpu_dev, sizeof(uint64_t), priv_get_page_size(),
+                                    DOCA_GPU_MEM_TYPE_CPU_GPU, (void **)&((*qp_out)->cpu_db),
+                                    (void **)&((*qp_out)->cpu_db));
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to alloc GPU memory for CPU proxy DB");
+            goto destroy_uar;
+        }
+
+        *((*qp_out)->cpu_db) = 0;
+        /* The GPU writes its producer index here; the CPU rings the real doorbell. */
+        qp_cpu_->sq_db = (*qp_out)->cpu_db;
+        (*qp_out)->sq_dbrec = qp_cpu_->sq_dbrec;
+        (*qp_out)->sq_db = sq_db;
+        (*qp_out)->cpu_proxy = true;
+        (*qp_out)->sq_num_shift8_be = qp_cpu_->sq_num_shift8_be;
+        qp_cpu_->nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY;
+    } else {
+        qp_cpu_->nic_handler = DOCA_GPUNETIO_VERBS_NIC_HANDLER_GPU_SM_DB;
+    }
+
+    doca_verbs_cq_get_wq(cq_sq, (void **)&(qp_cpu_->cq_sq.cqe_daddr), &(qp_cpu_->cq_sq.cqe_num),
+                         &(qp_cpu_->cq_sq.cqe_size));
+
+    doca_verbs_cq_get_dbr_addr(cq_sq, &uar_db_reg, (uint32_t **)&(cq_dbrec), &arm_dbr);
+
+    qp_cpu_->cq_sq.dbrec = (__be32 *)cq_dbrec;
+    qp_cpu_->cq_sq.cq_num = doca_verbs_cq_get_cqn(cq_sq);
+    qp_cpu_->cq_sq.cqe_mask = (qp_cpu_->cq_sq.cqe_num - 1);
+    qp_cpu_->cq_sq.cqe_ci = 0;
+    qp_cpu_->cq_sq.cqe_rsvd = 0;
+    qp_cpu_->cq_sq.mem_type = DOCA_GPUNETIO_VERBS_MEM_TYPE_GPU;
+
+    (*qp_out)->gpu_dev = gpu_dev;
+
+    return DOCA_SUCCESS;
+
+destroy_uar:
+    /* Only undo an export that actually succeeded; otherwise there is nothing to
+     * unexport and the attempt would just log a misleading error. */
+    if (uar_exported) {
+        tmp_status = doca_gpu_verbs_unexport_uar((*qp_out)->qp_cpu->sq_db);
+        if (tmp_status != DOCA_SUCCESS)
+            DOCA_LOG(LOG_ERR, "Failed to destroy GPU descriptor memory");
+    }
+
+    free((*qp_out)->qp_cpu);
+    free(*qp_out);
+    /* Do not hand a dangling pointer back to the caller. */
+    *qp_out = nullptr;
+
+    return status;
+}
+
+/*
+ * Return the GPU-resident copy of an exported QP descriptor.
+ *
+ * The copy is created on first use: GPU memory is allocated and the host
+ * descriptor (qp->qp_cpu) is copied into it. Later calls return the same pointer.
+ *
+ * @param qp      Wrapper produced by doca_gpu_verbs_export_qp().
+ * @param qp_gpu  Output: device pointer to the descriptor; written on success.
+ *
+ * @return DOCA_SUCCESS, DOCA_ERROR_INVALID_VALUE for NULL arguments, the
+ *         doca_gpu_mem_alloc() status if the allocation fails, or
+ *         DOCA_ERROR_DRIVER if the copy to the GPU fails.
+ */
+doca_error_t doca_gpu_verbs_get_qp_dev(struct doca_gpu_verbs_qp *qp,
+                                       struct doca_gpu_dev_verbs_qp **qp_gpu) {
+    doca_error_t status = DOCA_SUCCESS;
+    cudaError_t custatus = cudaSuccess;
+
+    if (qp == nullptr || qp_gpu == nullptr) return DOCA_ERROR_INVALID_VALUE;
+
+    if (qp->qp_gpu == nullptr) {
+        status = doca_gpu_mem_alloc(qp->gpu_dev, sizeof(struct doca_gpu_dev_verbs_qp),
+                                    priv_get_page_size(), DOCA_GPU_MEM_TYPE_GPU,
+                                    (void **)&qp->qp_gpu, nullptr);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to alloc gpu memory for qp_gpu");
+            return status;
+        }
+
+        custatus = cudaMemcpy(qp->qp_gpu, qp->qp_cpu, sizeof(struct doca_gpu_dev_verbs_qp),
+                              cudaMemcpyHostToDevice);
+        if (custatus != cudaSuccess) {
+            DOCA_LOG(LOG_ERR, "cuMemcpyHtoD failed");
+            doca_gpu_mem_free(qp->gpu_dev, qp->qp_gpu);
+            /* Reset the cache so a later call retries instead of returning freed memory. */
+            qp->qp_gpu = nullptr;
+            return DOCA_ERROR_DRIVER;
+        }
+    }
+
+    *qp_gpu = qp->qp_gpu;
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Tear down a wrapper created by doca_gpu_verbs_export_qp().
+ *
+ * Frees the CPU-proxy doorbell (if any), drops the UAR export unless the QP
+ * runs in CPU-proxy mode, frees the host and GPU descriptors and finally the
+ * wrapper itself. Failures of the individual release steps are not reported.
+ *
+ * @return DOCA_ERROR_INVALID_VALUE for NULL arguments, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_gpu_verbs_unexport_qp(struct doca_gpu *gpu_dev,
+                                        struct doca_gpu_verbs_qp *qp_gverbs) {
+    if (!gpu_dev || !qp_gverbs) return DOCA_ERROR_INVALID_VALUE;
+
+    if (qp_gverbs->cpu_db != nullptr) doca_gpu_mem_free(gpu_dev, qp_gverbs->cpu_db);
+
+    struct doca_gpu_dev_verbs_qp *host_qp = qp_gverbs->qp_cpu;
+    if (host_qp != nullptr) {
+        /* In CPU-proxy mode sq_db points at cpu_db, not at an exported UAR. */
+        const bool proxy_mode = (host_qp->nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY);
+        if (!proxy_mode) doca_gpu_verbs_unexport_uar(host_qp->sq_db);
+        free(host_qp);
+    }
+
+    if (qp_gverbs->qp_gpu != nullptr) {
+        doca_gpu_mem_free(gpu_dev, qp_gverbs->qp_gpu);
+        qp_gverbs->qp_gpu = nullptr;
+    }
+
+    free(qp_gverbs);
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Ring the NIC doorbell on behalf of the GPU for a QP in CPU-proxy mode.
+ *
+ * Reads the value stored in qp_cpu->cpu_db; when it differs from the value
+ * handled last time, writes the doorbell record and then the doorbell register,
+ * and remembers the new value.
+ *
+ * @return DOCA_ERROR_INVALID_VALUE if qp_cpu is NULL,
+ *         DOCA_ERROR_NOT_SUPPORTED if the QP is not in CPU-proxy mode,
+ *         DOCA_SUCCESS otherwise (whether or not a doorbell was rung).
+ */
+doca_error_t doca_gpu_verbs_cpu_proxy_progress(struct doca_gpu_verbs_qp *qp_cpu) {
+    uint32_t tmp_db = 0;
+    __be32 dbr_val;
+
+    if (qp_cpu == nullptr) return DOCA_ERROR_INVALID_VALUE;
+
+    if (qp_cpu->cpu_proxy != true) return DOCA_ERROR_NOT_SUPPORTED;
+
+    /* volatile read of the 64-bit proxy doorbell; only the low 32 bits are kept */
+    tmp_db = (uint32_t) * ((volatile uint64_t *)qp_cpu->cpu_db);
+    if (tmp_db != qp_cpu->sq_wqe_pi_last) {
+        /* Control segment image that is written to the doorbell register below. */
+        struct doca_gpu_dev_verbs_wqe_ctrl_seg ctrl_seg = {.opmod_idx_opcode = htobe32(tmp_db << 8),
+                                                           .qpn_ds = qp_cpu->sq_num_shift8_be};
+
+        /* The doorbell record carries only the low 16 bits of the index. */
+        dbr_val = htobe32(tmp_db & 0xffff);
+
+        // Update the doorbell record first, then ring the doorbell register after
+        // the release fence so the record store is ordered before it.
+        // The record is what the NIC reads when it enters a recovery state.
+        *((volatile uint32_t *)qp_cpu->sq_dbrec) = dbr_val;
+        std::atomic_thread_fence(std::memory_order_release);
+        /* Single 64-bit store of the first 8 bytes of the control segment. */
+        *((volatile uint64_t *)qp_cpu->sq_db) = *((volatile uint64_t *)&ctrl_seg);
+
+        // DOCA_LOG(LOG_DEBUG, "CPU proxy ring wqe %d\n", tmp_db);
+        qp_cpu->sq_wqe_pi_last = tmp_db;
+    }
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Thread entry point of the CPU-proxy service.
+ *
+ * While the service is flagged as running, takes the service lock for reading,
+ * sweeps every registered QP a fixed number of times calling the proxy progress
+ * routine, releases the lock and yields the CPU.
+ *
+ * @param args Pointer to the owning struct doca_gpu_verbs_service.
+ * @return Always nullptr.
+ */
+static void *priv_service_mainloop(void *args) {
+    auto *svc = static_cast<struct doca_gpu_verbs_service *>(args);
+    // Number of full sweeps over the QP set per lock acquisition.
+    const unsigned int sweeps_per_lock = 1000;
+
+    while (svc->running) {
+        pthread_rwlock_rdlock(&svc->service_lock);
+
+        for (unsigned int left = sweeps_per_lock; left != 0; --left) {
+            for (struct doca_gpu_verbs_qp *monitored : *svc->qps) {
+                doca_gpu_verbs_cpu_proxy_progress(monitored);
+            }
+        }
+
+        pthread_rwlock_unlock(&svc->service_lock);
+        // Yield between batches after the read lock has been released.
+        sched_yield();
+    }
+
+    return nullptr;
+}
+
+/**
+ * Create a CPU-proxy service: allocates the service object, initialises its
+ * read/write lock and QP set, and starts the background progress thread.
+ *
+ * Every failure path releases exactly what was acquired before it, including
+ * the rwlock when the thread cannot be started.
+ *
+ * @param out_service Receives the new service handle on success; untouched on failure.
+ * @return DOCA_ERROR_INVALID_VALUE if out_service is null,
+ *         DOCA_ERROR_NO_MEMORY if the service object cannot be allocated,
+ *         DOCA_ERROR_DRIVER if the lock or the thread cannot be created,
+ *         DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_gpu_verbs_create_service(doca_gpu_verbs_service_t *out_service) {
+    int status = 0;
+    struct doca_gpu_verbs_service *service = nullptr;
+
+    if (out_service == nullptr) return DOCA_ERROR_INVALID_VALUE;
+
+    service = (struct doca_gpu_verbs_service *)calloc(1, sizeof(struct doca_gpu_verbs_service));
+    if (service == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to allocate memory for service");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+
+    status = pthread_rwlock_init(&service->service_lock, nullptr);
+    if (status != 0) {
+        DOCA_LOG(LOG_ERR, "Failed to initialize service lock");
+        free(service);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    // The thread reads `running` and `qps` immediately, so both are set before it starts.
+    service->running = true;
+    service->qps = new std::set<struct doca_gpu_verbs_qp *>();
+    status = pthread_create(&service->service_thread, nullptr, priv_service_mainloop, service);
+    if (status != 0) {
+        DOCA_LOG(LOG_ERR, "Failed to create service thread");
+        delete service->qps;
+        // The lock was successfully initialised above and must be released too.
+        pthread_rwlock_destroy(&service->service_lock);
+        free(service);
+        return DOCA_ERROR_DRIVER;
+    }
+
+    *out_service = service;
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Register a QP with the CPU-proxy service so its background thread progresses it.
+ *
+ * The QP is inserted into the service's set while holding the service lock for writing.
+ *
+ * @param service Service handle.
+ * @param qp      QP to add to the monitored set.
+ * @return DOCA_ERROR_INVALID_VALUE if either argument is null, else DOCA_SUCCESS.
+ */
+doca_error_t doca_gpu_verbs_service_monitor_qp(doca_gpu_verbs_service_t service,
+                                               struct doca_gpu_verbs_qp *qp) {
+    if (!service || !qp) {
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    auto *svc = (struct doca_gpu_verbs_service *)service;
+
+    // Writer lock: excludes the progress thread while the set is modified.
+    pthread_rwlock_wrlock(&svc->service_lock);
+    svc->qps->insert(qp);
+    pthread_rwlock_unlock(&svc->service_lock);
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Stop and destroy a CPU-proxy service.
+ *
+ * Clears the running flag, joins the progress thread, then destroys the lock,
+ * the QP set and the service object. The monitored QPs themselves are not freed.
+ *
+ * @param service Service handle to destroy.
+ * @return DOCA_ERROR_INVALID_VALUE if service is null, else DOCA_SUCCESS.
+ */
+doca_error_t doca_gpu_verbs_destroy_service(doca_gpu_verbs_service_t service) {
+    if (!service) {
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    auto *svc = (struct doca_gpu_verbs_service *)service;
+
+    // Ask the main loop to exit and wait for it before tearing anything down.
+    svc->running = false;
+    pthread_join(svc->service_thread, nullptr);
+
+    pthread_rwlock_destroy(&svc->service_lock);
+    delete svc->qps;
+    free(svc);
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Report whether a QP is currently in the error state.
+ *
+ * Zeroes *error_info, queries the underlying verbs QP and sets
+ * error_info->has_error when the queried state is DOCA_VERBS_QP_STATE_ERR.
+ *
+ * @param qp         GPU verbs QP wrapper; its inner verbs QP must be set.
+ * @param error_info Output structure, fully overwritten.
+ * @return DOCA_ERROR_INVALID_VALUE on null arguments, the query's error code if
+ *         the query fails, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_gpu_verbs_query_last_error(struct doca_gpu_verbs_qp *qp,
+                                             struct doca_gpu_verbs_qp_error_info *error_info) {
+    if (!qp || !qp->qp || !error_info) {
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    memset(error_info, 0, sizeof(*error_info));
+
+    struct doca_verbs_qp_attr attr;
+    struct doca_verbs_qp_init_attr init_attr;
+    const doca_error_t rc = doca_verbs_qp_query(qp->qp, &attr, &init_attr);
+    if (rc != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to query QP");
+        return rc;
+    }
+
+    error_info->has_error = (attr.current_state == DOCA_VERBS_QP_STATE_ERR);
+    return DOCA_SUCCESS;
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_gdrcopy.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_gdrcopy.cpp
new file mode 100644
index 00000000000..ecf1f13c673
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_gdrcopy.cpp
@@ -0,0 +1,261 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_gdrcopy.cpp
+ * @brief Implementation of the GDRCopy APIs used in doca_gpunetio
+ */
+
+#include <dlfcn.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include "doca_gpunetio_gdrcopy.h"
+#include "doca_gpunetio_log.hpp"
+
+// Local re-declarations of the GDRCopy handle types, so this file does not need
+// the GDRCopy headers (the library is loaded at run time with dlopen below).
+// NOTE(review): these must stay layout-compatible with the installed gdrapi.h - confirm.
+struct gdr;
+typedef struct gdr *gdr_t;
+typedef struct gdr_mh_s {
+    unsigned long h;
+} gdr_mh_t;
+
+// GPU page geometry used for alignment checks: 1 << 16 = 64 KiB pages.
+#define GPU_PAGE_SHIFT 16
+#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
+// Mask of the in-page offset bits.
+#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE - 1)
+// Mask that clears the in-page offset bits.
+#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
+
+// TYPEOF(x): type of expression x, used to cast dlsym() results to the
+// destination function-pointer type.
+#ifdef __GNUC__
+#define TYPEOF(x) __typeof__(x)
+#else
+#define TYPEOF(x) decltype(x)
+#endif
+
+// Shared-object name passed to dlopen() to locate GDRCopy.
+#define DOCA_GPUNETIO_GDRCOPY_LIB_NAME "libgdrapi.so.2"
+// Resolve `symbol` from `handle` with dlsym() and store it in `funcptr`.
+// If the symbol is missing: logs an error, assigns ENOENT to `on_error_status`
+// and jumps to the label `on_error_out` in the enclosing function.
+#define DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(handle, symbol, funcptr, on_error_status, on_error_out) \
+    do {                                                                                       \
+        funcptr = (TYPEOF(funcptr))dlsym(handle, symbol);                                      \
+        if (!funcptr) {                                                                        \
+            DOCA_LOG(LOG_ERR, "Failed to load symbol %s", symbol);                             \
+            on_error_status = ENOENT;                                                          \
+            goto on_error_out;                                                                 \
+        }                                                                                      \
+    } while (0)
+
+// Table of GDRCopy entry points resolved at run time. Each function pointer is
+// filled from the library symbol "gdr_<member name>" by
+// doca_gpu_gdrcopy_ftable_init().
+struct doca_gpu_gdrcopy_function_table {
+    void *handle;  // dlopen() handle of the GDRCopy shared library
+    gdr_t (*open)();
+    int (*close)(gdr_t g);
+    int (*pin_buffer)(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token,
+                      uint32_t va_space, gdr_mh_t *handle);
+    int (*unpin_buffer)(gdr_t g, gdr_mh_t handle);
+    int (*map)(gdr_t g, gdr_mh_t handle, void **va, size_t size);
+    int (*unmap)(gdr_t g, gdr_mh_t handle, void *va, size_t size);
+    int (*copy_from_mapping)(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
+    int (*copy_to_mapping)(gdr_mh_t handle, const void *map_d_ptr, void *h_ptr, size_t size);
+    void (*runtime_get_version)(int *major, int *minor);
+    int (*driver_get_version)(gdr_t g, int *major, int *minor);
+};
+
+// File-wide GDRCopy state, created lazily by doca_gpu_init_gdrcopy(): the
+// resolved function table and the handle returned by gdr_open().
+// Nothing in this file releases them or guards them with a lock.
+static struct doca_gpu_gdrcopy_function_table *doca_gpu_gdrcopy_ftable = NULL;
+static gdr_t doca_gpu_gdr = NULL;
+
+/**
+ * Load the GDRCopy shared library and resolve every entry point this file uses.
+ *
+ * @param ftable Receives a heap-allocated, fully populated function table on
+ *               success; left untouched on failure.
+ * @return 0 on success; ENOENT if the library or any symbol cannot be found;
+ *         ENOMEM if the table cannot be allocated. On failure the library is
+ *         closed again and the table is freed.
+ */
+static int doca_gpu_gdrcopy_ftable_init(struct doca_gpu_gdrcopy_function_table **ftable) {
+    int rc = 0;
+    void *lib = NULL;
+    struct doca_gpu_gdrcopy_function_table *fns = NULL;
+
+    lib = dlopen(DOCA_GPUNETIO_GDRCOPY_LIB_NAME, RTLD_LAZY);
+    if (lib == NULL) {
+        DOCA_LOG(LOG_ERR, "Failed to open libgdrapi.so.2");
+        rc = ENOENT;
+        goto done;
+    }
+
+    fns = (struct doca_gpu_gdrcopy_function_table *)malloc(sizeof(*fns));
+    if (fns == NULL) {
+        DOCA_LOG(LOG_ERR, "Failed to allocate memory for gdrcopy function table");
+        rc = ENOMEM;
+        goto done;
+    }
+
+    // Each lookup jumps to `done` with rc = ENOENT when the symbol is missing.
+    DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(lib, "gdr_open", fns->open, rc, done);
+    DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(lib, "gdr_close", fns->close, rc, done);
+    DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(lib, "gdr_pin_buffer", fns->pin_buffer, rc, done);
+    DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(lib, "gdr_unpin_buffer", fns->unpin_buffer, rc, done);
+    DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(lib, "gdr_map", fns->map, rc, done);
+    DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(lib, "gdr_unmap", fns->unmap, rc, done);
+    DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(lib, "gdr_copy_from_mapping", fns->copy_from_mapping, rc, done);
+    DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(lib, "gdr_copy_to_mapping", fns->copy_to_mapping, rc, done);
+    DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(lib, "gdr_runtime_get_version", fns->runtime_get_version, rc,
+                                   done);
+    DOCA_GPUNETIO_GDRCOPY_LOAD_SYM(lib, "gdr_driver_get_version", fns->driver_get_version, rc,
+                                   done);
+
+    fns->handle = lib;
+    *ftable = fns;
+
+done:
+    if (rc == 0) {
+        return 0;
+    }
+
+    // Failure: undo whatever was acquired.
+    if (lib != NULL) {
+        dlclose(lib);
+    }
+    free(fns);
+    return rc;
+}
+
+/**
+ * Lazily initialise the file-wide GDRCopy state.
+ *
+ * Does nothing if a GDRCopy handle already exists. Otherwise loads the
+ * function table (if not yet loaded) and opens a GDRCopy handle.
+ *
+ * @return 0 on success or if already initialised; the function-table error
+ *         code if loading fails; EIO if gdr_open returns NULL.
+ */
+static int doca_gpu_init_gdrcopy() {
+    if (doca_gpu_gdr) {
+        return 0;
+    }
+
+    if (!doca_gpu_gdrcopy_ftable) {
+        const int rc = doca_gpu_gdrcopy_ftable_init(&doca_gpu_gdrcopy_ftable);
+        if (rc) {
+            DOCA_LOG(LOG_ERR, "Error in doca_gpu_gdrcopy_ftable_init");
+            return rc;
+        }
+    }
+
+    doca_gpu_gdr = doca_gpu_gdrcopy_ftable->open();
+    if (!doca_gpu_gdr) {
+        DOCA_LOG(LOG_ERR, "Error in gdr_open");
+        return EIO;
+    }
+
+    return 0;
+}
+
+/**
+ * Check the DOCA_GPUNETIO_DISABLE_GDRCOPY environment variable.
+ *
+ * @return false (and logs) when the variable is set to a value atoi() parses
+ *         as non-zero; true when it is unset or parses as zero.
+ */
+static bool doca_gpu_enable_gdrcopy() {
+    const char *value = getenv("DOCA_GPUNETIO_DISABLE_GDRCOPY");
+    const bool disabled = (value != NULL) && (atoi(value) != 0);
+
+    if (!disabled) {
+        return true;
+    }
+
+    DOCA_LOG(LOG_INFO, "DOCA_GPUNETIO_DISABLE_GDRCOPY is set, disabling GDRCopy");
+    return false;
+}
+
+/**
+ * Tell whether GDRCopy can be used in this process.
+ *
+ * The first call checks the environment opt-out and tries to initialise
+ * GDRCopy; the outcome is cached in function-local statics and returned by
+ * every later call without probing again.
+ *
+ * @return true if GDRCopy is enabled and initialised successfully.
+ */
+bool doca_gpu_gdrcopy_is_supported() {
+    static bool probed = false;
+    static bool usable = false;
+
+    if (probed) {
+        return usable;
+    }
+
+    // Initialisation is attempted only when the environment does not disable GDRCopy.
+    usable = doca_gpu_enable_gdrcopy() && (doca_gpu_init_gdrcopy() == 0);
+    DOCA_LOG(LOG_INFO, "GDRCopy usage is %s", usable ? "enabled" : "disabled");
+    probed = true;
+
+    return usable;
+}
+
+/**
+ * Pin a GPU buffer with GDRCopy and map it into the host address space.
+ *
+ * @param dev_aligned_ptr GPU address; asserted to be GPU_PAGE_SIZE aligned.
+ * @param size            Size passed to both the pin and the map call.
+ * @param out_mh          Receives the GDRCopy memory handle on success.
+ * @param out_host_ptr    Receives the host mapping address on success.
+ * @return 0 on success, otherwise the failing call's non-zero status. If the
+ *         map step fails the buffer is unpinned again before returning.
+ */
+static int priv_doca_gpu_gdrcopy_create_mapping(void *dev_aligned_ptr, size_t size,
+                                                gdr_mh_t *out_mh, void **out_host_ptr) {
+    gdr_mh_t pinned;
+    void *mapped;
+
+    int rc = doca_gpu_init_gdrcopy();
+    if (rc) {
+        DOCA_LOG(LOG_ERR, "Error in doca_gpu_init_gdrcopy");
+        return rc;
+    }
+
+    assert(((uintptr_t)dev_aligned_ptr & GPU_PAGE_OFFSET) == 0);
+
+    rc = doca_gpu_gdrcopy_ftable->pin_buffer(doca_gpu_gdr, (unsigned long)dev_aligned_ptr, size,
+                                             0, 0, &pinned);
+    if (rc) {
+        DOCA_LOG(LOG_ERR, "Error in gdr_pin_buffer");
+        return rc;
+    }
+
+    rc = doca_gpu_gdrcopy_ftable->map(doca_gpu_gdr, pinned, &mapped, size);
+    if (rc) {
+        DOCA_LOG(LOG_ERR, "Error in gdr_map");
+        // Roll back the pin so a failed mapping leaves nothing behind.
+        doca_gpu_gdrcopy_ftable->unpin_buffer(doca_gpu_gdr, pinned);
+        return rc;
+    }
+
+    *out_mh = pinned;
+    *out_host_ptr = mapped;
+    return 0;
+}
+
+/**
+ * Public wrapper: create a GDRCopy host mapping for a GPU buffer.
+ *
+ * Allocates a heap copy of the GDRCopy memory handle and returns it as an
+ * opaque pointer, to be released with doca_gpu_gdrcopy_destroy_mapping().
+ *
+ * @param dev_aligned_ptr GPU address to map.
+ * @param size            Size of the region.
+ * @param out_mh          Receives the opaque handle on success.
+ * @param out_host_ptr    Receives the host mapping address on success.
+ * @return 0 on success, ENOMEM if the handle cannot be allocated, otherwise
+ *         the error from the pin/map step.
+ */
+int doca_gpu_gdrcopy_create_mapping(void *dev_aligned_ptr, size_t size, void **out_mh,
+                                    void **out_host_ptr) {
+    gdr_mh_t *boxed = (gdr_mh_t *)malloc(sizeof(gdr_mh_t));
+    if (boxed == NULL) {
+        DOCA_LOG(LOG_ERR, "Error in malloc for mh");
+        return ENOMEM;
+    }
+
+    const int rc = priv_doca_gpu_gdrcopy_create_mapping(dev_aligned_ptr, size, boxed, out_host_ptr);
+    if (rc != 0) {
+        DOCA_LOG(LOG_ERR, "Error in priv_doca_gpu_gdrcopy_create_mapping");
+        free(boxed);
+        return rc;
+    }
+
+    *out_mh = boxed;
+    return 0;
+}
+
+/**
+ * Undo a mapping created by the private create routine: unmap the host view,
+ * then unpin the GPU buffer. Return codes of both calls are ignored.
+ * Asserts that GDRCopy has been opened.
+ */
+static void _doca_gpu_gdrcopy_destroy_mapping(gdr_mh_t *mh, void *host_ptr, size_t size) {
+    assert(doca_gpu_gdr);
+
+    const gdr_mh_t pinned = *mh;
+    doca_gpu_gdrcopy_ftable->unmap(doca_gpu_gdr, pinned, host_ptr, size);
+    doca_gpu_gdrcopy_ftable->unpin_buffer(doca_gpu_gdr, pinned);
+}
+
+/**
+ * Public wrapper: destroy a mapping and free its opaque handle.
+ * A null handle is a no-op.
+ *
+ * @param mh       Opaque handle returned by doca_gpu_gdrcopy_create_mapping().
+ * @param host_ptr Host mapping address returned at creation.
+ * @param size     Size used at creation.
+ */
+void doca_gpu_gdrcopy_destroy_mapping(void *mh, void *host_ptr, size_t size) {
+    if (mh == NULL) {
+        return;
+    }
+
+    gdr_mh_t *boxed = (gdr_mh_t *)mh;
+    _doca_gpu_gdrcopy_destroy_mapping(boxed, host_ptr, size);
+    free(boxed);
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_gdrcopy.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_gdrcopy.h
new file mode 100644
index 00000000000..dcc8a1eb9d2
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_gdrcopy.h
@@ -0,0 +1,55 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_gpunetio_gdrcopy.h
+ * @brief A header file for the GDRCopy APIs used in GDAKI
+ */
+
+#ifndef DOCA_GPUNETIO_GDRCOPY_H
+#define DOCA_GPUNETIO_GDRCOPY_H
+
+#include <stddef.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Returns true when GDRCopy is usable in this process. */
+bool doca_gpu_gdrcopy_is_supported();
+/*
+ * Creates a host mapping of a GPU buffer. On success (return value 0) *out_mh
+ * holds an opaque handle and *out_host_ptr the host address of the mapping;
+ * a non-zero return is an errno-style error code.
+ */
+int doca_gpu_gdrcopy_create_mapping(void *dev_aligned_ptr, size_t size, void **out_mh,
+                                    void **out_host_ptr);
+/* Destroys a mapping made by doca_gpu_gdrcopy_create_mapping(); a NULL mh is ignored. */
+void doca_gpu_gdrcopy_destroy_mapping(void *mh, void *host_ptr, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // DOCA_GPUNETIO_GDRCOPY_H
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_high_level.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_high_level.cpp
new file mode 100644
index 00000000000..b97ff5f4ac5
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_high_level.cpp
@@ -0,0 +1,903 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <mutex>
+#include <atomic>
+#include <time.h>
+#include <unordered_map>
+#include <cuda_runtime.h>
+
+#include "host/mlx5_prm.h"
+#include "host/mlx5_ifc.h"
+
+#include "doca_verbs_net_wrapper.h"
+#include "doca_internal.hpp"
+#include "host/doca_gpunetio_high_level.h"
+#include "doca_gpunetio_gdrcopy.h"
+#include "host/doca_verbs.h"
+#include "doca_verbs_qp.hpp"
+#include "common/doca_gpunetio_verbs_dev.h"
+
+// Size requested for the doorbell-record (DBR) buffer before it is rounded up
+// to a page. NOTE(review): presumably bytes - confirm.
+#define DBR_SIZE (8)
+// NOTE(review): presumably the per-WQE scatter/gather limits for send and
+// receive; not referenced in the visible part of this file - confirm.
+#define MAX_SEND_SEGS (1)
+#define MAX_RECEIVE_SEGS (1)
+
+/**
+ * Host page size as reported by sysconf(_SC_PAGESIZE), or 4096 (4KB, the
+ * default Linux page size) when sysconf fails.
+ */
+static size_t priv_get_page_size() {
+    const long reported = sysconf(_SC_PAGESIZE);
+
+    return (reported == -1) ? (size_t)4096 : (size_t)reported;
+}
+
+/**
+ * Round `value` up to the next multiple of `alignment`.
+ *
+ * Values that are already a multiple are returned unchanged. The result is
+ * truncated to 32 bits. `alignment` must be non-zero (it is used as a divisor).
+ */
+static uint32_t align_up_uint32(uint32_t value, uint32_t alignment) {
+    const uint32_t excess = value % alignment;
+
+    return (excess != 0) ? (uint32_t)(value + (alignment - excess)) : value;
+}
+
+/**
+ * Allocate the UAR used for the QP/CQ doorbells.
+ *
+ * - Any handler other than GPU_SM_BF: try a dedicated non-cached UAR first and
+ *   fall back to a regular non-cached UAR. If both attempts fail the error of
+ *   the last attempt is returned (previously DOCA_SUCCESS was returned with
+ *   *external_uar left unset).
+ * - GPU_SM_BF: allocate a BlueFlame UAR, which requires bf_supported.
+ *
+ * @param ibctx        Device context to allocate the UAR on.
+ * @param nic_handler  Requested NIC handler mode.
+ * @param external_uar Receives the created UAR on success.
+ * @param bf_supported Whether BlueFlame UARs may be used.
+ * @return DOCA_SUCCESS on success; the allocation error on failure;
+ *         DOCA_ERROR_DRIVER if BlueFlame is requested but not supported.
+ */
+static doca_error_t create_uar(struct ibv_context *ibctx,
+                               enum doca_gpu_dev_verbs_nic_handler nic_handler,
+                               struct doca_verbs_uar **external_uar, bool bf_supported) {
+    doca_error_t status = DOCA_SUCCESS;
+
+    if (nic_handler != DOCA_GPUNETIO_VERBS_NIC_HANDLER_GPU_SM_BF) {
+        status = doca_verbs_uar_create(ibctx, DOCA_VERBS_UAR_ALLOCATION_TYPE_NONCACHE_DEDICATED,
+                                       external_uar);
+        if (status == DOCA_SUCCESS) return DOCA_SUCCESS;
+
+        DOCA_LOG(LOG_ERR, "Failed to doca_verbs_uar_create NC DEDICATED");
+        status =
+            doca_verbs_uar_create(ibctx, DOCA_VERBS_UAR_ALLOCATION_TYPE_NONCACHE, external_uar);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to doca_verbs_uar_create NC");
+            // Propagate the failure: no UAR was created.
+            return status;
+        }
+
+        DOCA_LOG(LOG_INFO, "UAR created with DOCA_UAR_ALLOCATION_TYPE_NONCACHE");
+        return DOCA_SUCCESS;
+    }
+
+    // Only the explicit BlueFlame handler reaches this point.
+    if (!bf_supported) return DOCA_ERROR_DRIVER;
+
+    status = doca_verbs_uar_create(ibctx, DOCA_VERBS_UAR_ALLOCATION_TYPE_BLUEFLAME, external_uar);
+    if (status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to doca_verbs_uar_create BLUEFLAME");
+
+    return status;
+}
+
+/**
+ * Register a GPU buffer as a verbs umem.
+ *
+ * Registration strategy by mreg_type:
+ * - DEFAULT: try dmabuf first; if no dmabuf fd can be obtained, or the dmabuf
+ *   registration fails, fall back to the legacy nvidia-peermem path.
+ * - CUDA_DMABUF: dmabuf only.
+ * - CUDA_PEERMEM: nvidia-peermem only.
+ * Any other value creates nothing and returns DOCA_SUCCESS (unchanged legacy
+ * behaviour - NOTE(review): confirm this is intended).
+ *
+ * The dmabuf fd is closed before returning on both the success and the error
+ * path. It starts out as the invalid fd, so modes that never obtain one do not
+ * close anything.
+ *
+ * @param gpu_dev   GPU device owning umem_ptr.
+ * @param ibpd      Protection domain; its context is used for registration.
+ * @param mreg_type Registration mode.
+ * @param umem_sz   Size of the buffer in bytes.
+ * @param umem_ptr  GPU buffer address.
+ * @param umem      Receives the umem on success. On failure a non-null *umem
+ *                  is destroyed and reset to NULL.
+ * @return DOCA_SUCCESS or the error of the failing step.
+ */
+static doca_error_t create_gpu_umem(struct doca_gpu *gpu_dev, struct ibv_pd *ibpd,
+                                    enum doca_gpu_verbs_mem_reg_type mreg_type, uint32_t umem_sz,
+                                    void *umem_ptr, struct doca_verbs_umem **umem) {
+    doca_error_t status = DOCA_SUCCESS;
+    // Start from the invalid fd so paths that never obtain one see a defined value.
+    int dmabuf_fd = (int)DOCA_VERBS_DMABUF_INVALID_FD;
+    struct ibv_context *ibctx = ibpd->context;
+
+    if (mreg_type == DOCA_GPUNETIO_VERBS_MEM_REG_TYPE_DEFAULT) {
+        status = doca_gpu_dmabuf_fd(gpu_dev, umem_ptr, umem_sz, &dmabuf_fd);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_WARNING,
+                     "GPU doesn't support dmabuf, fallback to legacy nvidia-peermem mode");
+            dmabuf_fd = DOCA_VERBS_DMABUF_INVALID_FD;
+        }
+
+        status = doca_verbs_umem_create(ibctx, umem_ptr, umem_sz, IBV_ACCESS_LOCAL_WRITE, dmabuf_fd,
+                                        0, umem);
+        if (status != DOCA_SUCCESS) {
+            if (dmabuf_fd > 0) {
+                DOCA_LOG(LOG_WARNING,
+                         "Failed to create gpu umem with dmabuf. Fallback to legacy nvidia-peermem "
+                         "mode");
+                status = doca_verbs_umem_create(ibctx, umem_ptr, umem_sz, IBV_ACCESS_LOCAL_WRITE,
+                                                DOCA_VERBS_DMABUF_INVALID_FD, 0, umem);
+                if (status != DOCA_SUCCESS) {
+                    DOCA_LOG(LOG_ERR, "Failed to create gpu umem with nvidia-peermem mode");
+                    goto destroy_resources;
+                }
+            } else {
+                DOCA_LOG(LOG_ERR, "Failed to create gpu umem");
+                goto destroy_resources;
+            }
+        }
+    } else if (mreg_type == DOCA_GPUNETIO_VERBS_MEM_REG_TYPE_CUDA_DMABUF) {
+        status = doca_gpu_dmabuf_fd(gpu_dev, umem_ptr, umem_sz, &dmabuf_fd);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_WARNING, "GPU doesn't support dmabuf.");
+            goto destroy_resources;
+        }
+
+        status = doca_verbs_umem_create(ibctx, umem_ptr, umem_sz, IBV_ACCESS_LOCAL_WRITE, dmabuf_fd,
+                                        0, umem);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_WARNING, "GPU doesn't support dmabuf.");
+            goto destroy_resources;
+        }
+    } else if (mreg_type == DOCA_GPUNETIO_VERBS_MEM_REG_TYPE_CUDA_PEERMEM) {
+        status = doca_verbs_umem_create(ibctx, umem_ptr, umem_sz, IBV_ACCESS_LOCAL_WRITE,
+                                        DOCA_VERBS_DMABUF_INVALID_FD, 0, umem);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to create gpu umem with nvidia-peermem mode");
+            goto destroy_resources;
+        }
+    }
+
+    // Immediately close dmabuf_fd after registration.
+    if (dmabuf_fd > 0 && dmabuf_fd != (int)DOCA_VERBS_DMABUF_INVALID_FD) close(dmabuf_fd);
+
+    return DOCA_SUCCESS;
+
+destroy_resources:
+    // Do not leak the dmabuf fd when registration fails.
+    if (dmabuf_fd > 0 && dmabuf_fd != (int)DOCA_VERBS_DMABUF_INVALID_FD) close(dmabuf_fd);
+
+    if (*umem) {
+        doca_verbs_umem_destroy(*umem);
+        // Reset so the caller's own cleanup cannot destroy it a second time.
+        *umem = NULL;
+    }
+
+    return status;
+}
+
+/**
+ * Size in bytes of the CQ ring buffer for `queue_size` 64-byte-style CQE
+ * entries (sizeof(struct doca_gpunetio_ib_mlx5_cqe64) each), rounded up to a
+ * multiple of the host page size. A zero queue size yields zero.
+ */
+static uint32_t calc_cq_external_umem_size(uint32_t queue_size) {
+    const uint32_t ring_bytes =
+        (queue_size == 0) ? 0u
+                          : (uint32_t)(queue_size * sizeof(struct doca_gpunetio_ib_mlx5_cqe64));
+
+    return align_up_uint32(ring_bytes, priv_get_page_size());
+}
+
+/**
+ * Initialise the op_own field of the first `nb_cqes` entries of a CQE array:
+ * opcode set to the "invalid" value with the owner bit set.
+ */
+static void mlx5_init_cqes(struct doca_gpunetio_ib_mlx5_cqe64 *cqes, uint32_t nb_cqes) {
+    struct doca_gpunetio_ib_mlx5_cqe64 *const end = cqes + nb_cqes;
+
+    for (struct doca_gpunetio_ib_mlx5_cqe64 *cqe = cqes; cqe != end; ++cqe) {
+        cqe->op_own =
+            (DOCA_GPUNETIO_IB_MLX5_CQE_INVALID << DOCA_GPUNETIO_VERBS_MLX5_CQE_OPCODE_SHIFT) |
+            DOCA_GPUNETIO_IB_MLX5_CQE_OWNER_MASK;
+    }
+}
+
+/**
+ * Create a verbs CQ whose ring buffer and doorbell record live in GPU memory.
+ *
+ * Steps: allocate the GPU ring buffer, initialise its CQEs through a temporary
+ * host buffer copied with cudaMemcpy, register ring and doorbell record as
+ * umems, configure the CQ attributes (size, overrun enabled, optional external
+ * UAR) and create the CQ.
+ *
+ * @param gpu_dev              GPU device used for allocations.
+ * @param ibpd                 Protection domain; its context owns the CQ.
+ * @param mreg_type            How GPU memory is registered (see create_gpu_umem).
+ * @param ncqes                Number of CQ entries.
+ * @param gpu_umem_dev_ptr     Receives the GPU address of the CQ ring.
+ * @param gpu_umem             Receives the umem of the CQ ring.
+ * @param gpu_umem_dbr_dev_ptr Receives the GPU address of the doorbell record.
+ * @param gpu_umem_dbr         Receives the umem of the doorbell record.
+ * @param external_uar         Optional UAR to attach; may be NULL.
+ * @param verbs_cq             Receives the created CQ on success.
+ * @return DOCA_SUCCESS, or the error of the failing step. A cudaMemcpy failure
+ *         is reported as DOCA_ERROR_DRIVER.
+ *
+ * On failure everything created so far is released and the four output
+ * pointers that were released are reset to NULL. The cleanup inspects the
+ * output pointers, so the caller is expected to pass them NULL-initialised.
+ */
+static doca_error_t create_cq(struct doca_gpu *gpu_dev, struct ibv_pd *ibpd,
+                              enum doca_gpu_verbs_mem_reg_type mreg_type, uint32_t ncqes,
+                              void **gpu_umem_dev_ptr, struct doca_verbs_umem **gpu_umem,
+                              void **gpu_umem_dbr_dev_ptr, struct doca_verbs_umem **gpu_umem_dbr,
+                              struct doca_verbs_uar *external_uar,
+                              struct doca_verbs_cq **verbs_cq) {
+    doca_error_t status = DOCA_SUCCESS, tmp_status = DOCA_SUCCESS;
+    cudaError_t status_cuda = cudaSuccess;
+    struct doca_verbs_cq_attr *verbs_cq_attr = NULL;
+    struct doca_verbs_cq *new_cq = NULL;
+    struct doca_gpunetio_ib_mlx5_cqe64 *cq_ring_haddr = NULL;
+    uint32_t external_umem_size = 0;
+    size_t dbr_umem_align_sz;
+    struct ibv_context *ibctx = ibpd->context;
+
+    status = doca_verbs_cq_attr_create(&verbs_cq_attr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create doca verbs cq attributes");
+        return status;
+    }
+
+    // Pure size computation; it cannot fail.
+    external_umem_size = calc_cq_external_umem_size(ncqes);
+
+    status = doca_gpu_mem_alloc(gpu_dev, external_umem_size, priv_get_page_size(),
+                                DOCA_GPU_MEM_TYPE_GPU, (void **)gpu_umem_dev_ptr, NULL);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to alloc gpu memory for external umem cq");
+        goto destroy_resources;
+    }
+
+    // Temporary host image of the ring, used only to initialise the GPU copy.
+    cq_ring_haddr =
+        (struct doca_gpunetio_ib_mlx5_cqe64 *)(calloc(external_umem_size, sizeof(uint8_t)));
+    if (cq_ring_haddr == NULL) {
+        DOCA_LOG(LOG_ERR, "Failed to allocate cq host ring buffer memory for initialization");
+        status = DOCA_ERROR_NO_MEMORY;
+        goto destroy_resources;
+    }
+
+    mlx5_init_cqes(cq_ring_haddr, ncqes);
+
+    DOCA_LOG(LOG_DEBUG, "Create CQ memcpy cq_ring_haddr %p into gpu_umem_dev_ptr %p size %d\n",
+             (void *)(cq_ring_haddr), (*gpu_umem_dev_ptr), external_umem_size);
+
+    status_cuda = cudaMemcpy((*gpu_umem_dev_ptr), (void *)(cq_ring_haddr), external_umem_size,
+                             cudaMemcpyDefault);
+    if (status_cuda != cudaSuccess) {
+        DOCA_LOG(LOG_ERR, "Failed to cudaMempy gpu cq cq ring buffer ret %d", status_cuda);
+        // `status` is still DOCA_SUCCESS here; set an error so the caller is not
+        // told the CQ was created after everything has been torn down.
+        status = DOCA_ERROR_DRIVER;
+        goto destroy_resources;
+    }
+
+    free(cq_ring_haddr);
+    cq_ring_haddr = nullptr;
+
+    status =
+        create_gpu_umem(gpu_dev, ibpd, mreg_type, external_umem_size, *gpu_umem_dev_ptr, gpu_umem);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "create_gpu_umem failed with %d", status);
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_cq_attr_set_external_umem(verbs_cq_attr, *gpu_umem, 0);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set doca verbs cq external umem");
+        goto destroy_resources;
+    }
+
+    dbr_umem_align_sz = CUDA_ROUND_UP(DBR_SIZE, priv_get_page_size());
+    status = doca_gpu_mem_alloc(gpu_dev, dbr_umem_align_sz, priv_get_page_size(),
+                                DOCA_GPU_MEM_TYPE_GPU, (void **)gpu_umem_dbr_dev_ptr, nullptr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to alloc gpu memory for external umem qp");
+        goto destroy_resources;
+    }
+
+    status = create_gpu_umem(gpu_dev, ibpd, mreg_type, dbr_umem_align_sz, *gpu_umem_dbr_dev_ptr,
+                             gpu_umem_dbr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "create_gpu_umem failed with %d", status);
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_cq_attr_set_external_dbr_umem(verbs_cq_attr, *gpu_umem_dbr, 0);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set doca verbs cq external dbr umem");
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_cq_attr_set_cq_size(verbs_cq_attr, ncqes);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set doca verbs cq size");
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_cq_attr_set_cq_overrun(verbs_cq_attr, DOCA_VERBS_CQ_ENABLE_OVERRUN);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set doca verbs cq overrun");
+        goto destroy_resources;
+    }
+
+    if (external_uar != NULL) {
+        status = doca_verbs_cq_attr_set_external_uar(verbs_cq_attr, external_uar);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to set doca verbs cq external uar");
+            goto destroy_resources;
+        }
+    }
+
+    status = doca_verbs_cq_create(ibctx, verbs_cq_attr, &new_cq);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create doca verbs cq");
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_cq_attr_destroy(verbs_cq_attr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs cq attributes");
+        goto destroy_resources;
+    }
+
+    *verbs_cq = new_cq;
+
+    return DOCA_SUCCESS;
+
+destroy_resources:
+    if (new_cq != NULL) {
+        tmp_status = doca_verbs_cq_destroy(new_cq);
+        if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs cq");
+    }
+
+    if (verbs_cq_attr != NULL) {
+        tmp_status = doca_verbs_cq_attr_destroy(verbs_cq_attr);
+        if (tmp_status != DOCA_SUCCESS)
+            DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs cq attributes");
+    }
+
+    // Each released output is reset to NULL so the caller cannot free it again.
+    if (*gpu_umem != NULL) {
+        tmp_status = doca_verbs_umem_destroy(*gpu_umem);
+        if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy gpu ring buffer umem");
+        *gpu_umem = NULL;
+    }
+
+    if (*gpu_umem_dbr != NULL) {
+        tmp_status = doca_verbs_umem_destroy(*gpu_umem_dbr);
+        if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy gpu ring buffer umem");
+        *gpu_umem_dbr = NULL;
+    }
+
+    if (cq_ring_haddr) {
+        free(cq_ring_haddr);
+    }
+
+    if ((*gpu_umem_dev_ptr) != 0) {
+        tmp_status = doca_gpu_mem_free(gpu_dev, (*gpu_umem_dev_ptr));
+        if (tmp_status != DOCA_SUCCESS)
+            DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of cq umem buffer");
+        *gpu_umem_dev_ptr = NULL;
+    }
+
+    if ((*gpu_umem_dbr_dev_ptr) != 0) {
+        tmp_status = doca_gpu_mem_free(gpu_dev, (*gpu_umem_dbr_dev_ptr));
+        if (tmp_status != DOCA_SUCCESS)
+            DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of cq umem dbr buffer");
+        *gpu_umem_dbr_dev_ptr = NULL;
+    }
+
+    return status;
+}
+
+/**
+ * Size in bytes of the send-queue ring for `sq_nwqes` WQEs
+ * (sizeof(struct doca_gpu_dev_verbs_wqe) each), rounded up to a multiple of
+ * the host page size. A zero WQE count yields zero.
+ */
+static uint32_t calc_qp_external_umem_size(uint32_t sq_nwqes) {
+    const uint32_t ring_bytes =
+        (sq_nwqes == 0) ? 0u : (uint32_t)(sq_nwqes * sizeof(struct doca_gpu_dev_verbs_wqe));
+
+    return align_up_uint32(ring_bytes, priv_get_page_size());
+}
+
+/*
+ * Create a DOCA verbs RC QP whose send-queue ring and DBR live in external umems.
+ *
+ * Steps: allocate GPU memory for the SQ ring and register it as a umem; allocate
+ * the DBR buffer (host memory via calloc() when the resolved NIC handler is
+ * CPU_PROXY, GPU memory otherwise) and register it as a second umem; fill the QP
+ * init attributes; create the QP.
+ *
+ * @param gpu_dev              GPU device used for GPU memory allocation/free.
+ * @param ibpd                 Protection domain; ibpd->context is used to create the QP.
+ * @param mreg_type            Memory registration type forwarded to create_gpu_umem().
+ * @param cq_sq                CQ set as the QP send CQ.
+ * @param sq_nwqe              Number of send WQEs (also sizes the SQ ring umem).
+ * @param gpu_umem_dev_ptr     [out] SQ ring buffer.
+ * @param gpu_umem             [out] umem registered over the SQ ring buffer.
+ * @param gpu_umem_dbr_dev_ptr [out] DBR buffer.
+ * @param gpu_umem_dbr         [out] umem registered over the DBR buffer.
+ * @param external_uar         UAR attached to the QP.
+ * @param req_nic_handler      Requested NIC handler. AUTO resolves to GPU_SM_DB when
+ *                             the UAR can be registered on the GPU, else to CPU_PROXY.
+ * @param set_core_direct      When true the QP is flagged as core-direct master.
+ * @param verbs_qp             [out] created QP; written only on success.
+ * @param out_nic_handler      [out] resolved NIC handler; written only on success.
+ *
+ * @return DOCA_SUCCESS on success, otherwise the error of the step that failed.
+ *         On failure every resource created here is released and the four
+ *         umem/buffer out-parameters are reset to NULL.
+ */
+static doca_error_t create_qp(struct doca_gpu *gpu_dev, struct ibv_pd *ibpd,
+                              enum doca_gpu_verbs_mem_reg_type mreg_type,
+                              struct doca_verbs_cq *cq_sq, uint32_t sq_nwqe,
+                              void **gpu_umem_dev_ptr, struct doca_verbs_umem **gpu_umem,
+                              void **gpu_umem_dbr_dev_ptr, struct doca_verbs_umem **gpu_umem_dbr,
+                              struct doca_verbs_uar *external_uar,
+                              enum doca_gpu_dev_verbs_nic_handler req_nic_handler,
+                              bool set_core_direct, struct doca_verbs_qp **verbs_qp,
+                              enum doca_gpu_dev_verbs_nic_handler *out_nic_handler) {
+    doca_error_t status = DOCA_SUCCESS, tmp_status = DOCA_SUCCESS;
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr = NULL;
+    struct doca_verbs_qp *new_qp = NULL;
+    uint32_t external_umem_size = 0;
+    size_t dbr_umem_align_sz = align_up_uint32(DBR_SIZE, priv_get_page_size());
+    struct ibv_context *ibctx = ibpd->context;
+    enum doca_gpu_dev_verbs_nic_handler nic_handler = req_nic_handler;
+
+    status = doca_verbs_qp_init_attr_create(&verbs_qp_init_attr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create doca verbs qp attributes");
+        return status;
+    }
+
+    status = doca_verbs_qp_init_attr_set_external_uar(verbs_qp_init_attr, external_uar);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set doca verbs qp external uar");
+        goto destroy_resources;
+    }
+
+    /* Resolve AUTO now: the DBR allocation and the cleanup path both depend on it. */
+    if (nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_AUTO) {
+        bool can_register = false;
+        status = doca_gpu_verbs_can_gpu_register_uar(external_uar->get_reg_addr(), &can_register);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to check if UAR can be registered on GPU");
+            goto destroy_resources;
+        }
+
+        nic_handler = can_register ? DOCA_GPUNETIO_VERBS_NIC_HANDLER_GPU_SM_DB
+                                   : DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY;
+    }
+
+    external_umem_size = calc_qp_external_umem_size(sq_nwqe);
+
+    status = doca_gpu_mem_alloc(gpu_dev, external_umem_size, priv_get_page_size(),
+                                DOCA_GPU_MEM_TYPE_GPU, gpu_umem_dev_ptr, NULL);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to alloc gpu memory for external umem qp");
+        goto destroy_resources;
+    }
+
+    status =
+        create_gpu_umem(gpu_dev, ibpd, mreg_type, external_umem_size, *gpu_umem_dev_ptr, gpu_umem);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "create_gpu_umem failed with %d", status);
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_qp_init_attr_set_external_umem(verbs_qp_init_attr, *gpu_umem, 0);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set doca verbs qp external umem");
+        goto destroy_resources;
+    }
+
+    if (nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY) {
+        *gpu_umem_dbr_dev_ptr = calloc(dbr_umem_align_sz, sizeof(uint8_t));
+        if (*gpu_umem_dbr_dev_ptr == nullptr) {
+            DOCA_LOG(LOG_ERR, "Failed to alloc host memory for external dbr umem qp");
+            /* status still holds DOCA_SUCCESS from the previous call; report the failure. */
+            status = DOCA_ERROR_NO_MEMORY;
+            goto destroy_resources;
+        }
+    } else {
+        status = doca_gpu_mem_alloc(gpu_dev, dbr_umem_align_sz, priv_get_page_size(),
+                                    DOCA_GPU_MEM_TYPE_GPU, gpu_umem_dbr_dev_ptr, NULL);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to alloc gpu memory for external dbr umem qp");
+            goto destroy_resources;
+        }
+    }
+
+    status = create_gpu_umem(gpu_dev, ibpd, mreg_type, dbr_umem_align_sz, *gpu_umem_dbr_dev_ptr,
+                             gpu_umem_dbr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "create_gpu_umem failed with %d", status);
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_qp_init_attr_set_external_dbr_umem(verbs_qp_init_attr, *gpu_umem_dbr, 0);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set doca verbs qp external dbr umem");
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_qp_init_attr_set_pd(verbs_qp_init_attr, ibpd);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set doca verbs PD");
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_qp_init_attr_set_sq_wr(verbs_qp_init_attr, sq_nwqe);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set SQ size");
+        goto destroy_resources;
+    }
+
+    /* The QP is send-only: no receive WQEs are requested. */
+    status = doca_verbs_qp_init_attr_set_rq_wr(verbs_qp_init_attr, 0);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set RQ size");
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_qp_init_attr_set_qp_type(verbs_qp_init_attr, DOCA_VERBS_QP_TYPE_RC);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set QP type");
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_qp_init_attr_set_send_cq(verbs_qp_init_attr, cq_sq);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set doca verbs CQ");
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_qp_init_attr_set_send_max_sges(verbs_qp_init_attr, MAX_SEND_SEGS);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set send_max_sges");
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_qp_init_attr_set_receive_max_sges(verbs_qp_init_attr, MAX_RECEIVE_SEGS);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to set receive_max_sges");
+        goto destroy_resources;
+    }
+
+    if (set_core_direct) {
+        status = doca_verbs_qp_init_attr_set_core_direct_master(verbs_qp_init_attr, 1);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to set core_direct");
+            goto destroy_resources;
+        }
+    }
+
+    status = doca_verbs_qp_create(ibctx, verbs_qp_init_attr, &new_qp);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create doca verbs QP");
+        goto destroy_resources;
+    }
+
+    status = doca_verbs_qp_init_attr_destroy(verbs_qp_init_attr);
+    /* Drop our reference whatever the outcome so the cleanup path below never
+     * destroys the same attribute object a second time. */
+    verbs_qp_init_attr = NULL;
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs QP attributes");
+        goto destroy_resources;
+    }
+
+    *verbs_qp = new_qp;
+    *out_nic_handler = nic_handler;
+
+    return DOCA_SUCCESS;
+
+destroy_resources:
+    if (new_qp != NULL) {
+        tmp_status = doca_verbs_qp_destroy(new_qp);
+        if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs QP");
+    }
+
+    if (verbs_qp_init_attr != NULL) {
+        tmp_status = doca_verbs_qp_init_attr_destroy(verbs_qp_init_attr);
+        if (tmp_status != DOCA_SUCCESS)
+            DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs QP attributes");
+    }
+
+    /* Each out-parameter is reset after release so the caller is not left
+     * holding a dangling handle it might release again. */
+    if (*gpu_umem != NULL) {
+        tmp_status = doca_verbs_umem_destroy(*gpu_umem);
+        if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy gpu umem");
+        *gpu_umem = NULL;
+    }
+
+    if ((*gpu_umem_dev_ptr) != 0) {
+        tmp_status = doca_gpu_mem_free(gpu_dev, (*gpu_umem_dev_ptr));
+        if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of umem");
+        *gpu_umem_dev_ptr = NULL;
+    }
+
+    if (*gpu_umem_dbr != NULL) {
+        tmp_status = doca_verbs_umem_destroy(*gpu_umem_dbr);
+        if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy gpu umem");
+        *gpu_umem_dbr = NULL;
+    }
+
+    if ((*gpu_umem_dbr_dev_ptr) != 0) {
+        /* The DBR buffer came from calloc() on the CPU_PROXY path, from the GPU otherwise. */
+        if (nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY) {
+            free(*gpu_umem_dbr_dev_ptr);
+        } else {
+            tmp_status = doca_gpu_mem_free(gpu_dev, (*gpu_umem_dbr_dev_ptr));
+            if (tmp_status != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of umem");
+        }
+        *gpu_umem_dbr_dev_ptr = NULL;
+    }
+
+    return status;
+}
+
+/*
+ * Create a high-level GPU verbs QP: send CQ, UAR, QP, and the GPU export of the QP.
+ *
+ * @param qp_init_attr  Creation attributes. gpu_dev and ibpd must be non-null and
+ *                      sq_nwqe non-zero. sq_nwqe is rounded up to a power of two
+ *                      in place, so the caller's structure is modified.
+ * @param qp            [out] Receives the newly allocated high-level QP on success.
+ *
+ * @return DOCA_SUCCESS on success; DOCA_ERROR_INVALID_VALUE on bad arguments;
+ *         DOCA_ERROR_NO_MEMORY if the handle cannot be allocated; otherwise the
+ *         error of the step that failed. On failure nothing is left allocated and
+ *         *qp is not written.
+ */
+doca_error_t doca_gpu_verbs_create_qp_hl(struct doca_gpu_verbs_qp_init_attr_hl *qp_init_attr,
+                                         struct doca_gpu_verbs_qp_hl **qp) {
+    doca_error_t status = DOCA_SUCCESS, tmp_status = DOCA_SUCCESS;
+
+    if (qp_init_attr == nullptr || qp == nullptr) {
+        /* Print the pointer itself: dereferencing qp here would crash when it is null. */
+        DOCA_LOG(LOG_ERR, "Invalid input value: qp_init_attr %p qp %p", (void *)qp_init_attr,
+                 (void *)qp);
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (qp_init_attr->gpu_dev == nullptr || qp_init_attr->ibpd == nullptr ||
+        qp_init_attr->sq_nwqe == 0) {
+        DOCA_LOG(LOG_ERR, "Invalid input value: gpu_dev %p ibpd %p sq_nwqe %d",
+                 (void *)qp_init_attr->gpu_dev, (void *)qp_init_attr->ibpd, qp_init_attr->sq_nwqe);
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    struct doca_gpu_verbs_qp_hl *qp_ =
+        (struct doca_gpu_verbs_qp_hl *)calloc(1, sizeof(struct doca_gpu_verbs_qp_hl));
+    if (qp_ == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed alloc memory for high-level qp");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+
+    qp_->gpu_dev = qp_init_attr->gpu_dev;
+
+    if (qp_init_attr->sq_nwqe > 0) {
+        qp_init_attr->sq_nwqe =
+            (uint32_t)doca_internal_utils_next_power_of_two(qp_init_attr->sq_nwqe);
+
+        status = create_cq(qp_->gpu_dev, qp_init_attr->ibpd, qp_init_attr->mreg_type,
+                           qp_init_attr->sq_nwqe, &qp_->cq_sq_umem_gpu_ptr, &qp_->cq_sq_umem,
+                           &qp_->cq_sq_umem_dbr_gpu_ptr, &qp_->cq_sq_umem_dbr, NULL, &qp_->cq_sq);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to create doca verbs cq");
+            goto exit_error;
+        }
+    }
+
+    qp_->nic_handler = qp_init_attr->nic_handler;
+
+    status = create_uar(qp_init_attr->ibpd->context, qp_->nic_handler, &qp_->external_uar, true);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create doca verbs uar");
+        goto exit_error;
+    }
+
+    status = create_qp(qp_->gpu_dev, qp_init_attr->ibpd, qp_init_attr->mreg_type, qp_->cq_sq,
+                       qp_init_attr->sq_nwqe, &qp_->qp_umem_gpu_ptr, &qp_->qp_umem,
+                       &qp_->qp_umem_dbr_gpu_ptr, &qp_->qp_umem_dbr, qp_->external_uar,
+                       qp_init_attr->nic_handler, false, &qp_->qp, &qp_->nic_handler);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create doca verbs qp");
+        goto exit_error;
+    }
+
+    status = doca_gpu_verbs_export_qp(qp_->gpu_dev, qp_->qp, qp_->nic_handler, qp_->qp_umem_gpu_ptr,
+                                      qp_->cq_sq, &qp_->qp_gverbs);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create GPU verbs QP");
+        /* Go through the common cleanup instead of returning: everything built so far is ours. */
+        goto exit_error;
+    }
+
+    *qp = qp_;
+
+    return DOCA_SUCCESS;
+
+exit_error:
+    /*
+     * create_qp() releases its own partial allocations when it fails and only
+     * publishes the QP handle on success, so a non-null handle is what marks the
+     * QP buffers as owned here. create_cq() is assumed to follow the same
+     * contract (its success path is not visible here -- TODO confirm).
+     * Teardown order mirrors doca_gpu_verbs_destroy_qp_hl_internal().
+     */
+    if (qp_->qp != NULL) {
+        tmp_status = doca_verbs_qp_destroy(qp_->qp);
+        if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs QP");
+
+        if (qp_->qp_umem != NULL) {
+            tmp_status = doca_verbs_umem_destroy(qp_->qp_umem);
+            if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy gpu qp umem");
+        }
+
+        if (qp_->qp_umem_gpu_ptr != NULL) {
+            tmp_status = doca_gpu_mem_free(qp_->gpu_dev, qp_->qp_umem_gpu_ptr);
+            if (tmp_status != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of qp ring buffer");
+        }
+
+        if (qp_->qp_umem_dbr != NULL) {
+            tmp_status = doca_verbs_umem_destroy(qp_->qp_umem_dbr);
+            if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy gpu qp umem dbr");
+        }
+
+        if (qp_->qp_umem_dbr_gpu_ptr != NULL) {
+            /* create_qp() used calloc() for the DBR buffer on the CPU_PROXY path. */
+            if (qp_->nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY) {
+                free(qp_->qp_umem_dbr_gpu_ptr);
+            } else {
+                tmp_status = doca_gpu_mem_free(qp_->gpu_dev, qp_->qp_umem_dbr_gpu_ptr);
+                if (tmp_status != DOCA_SUCCESS)
+                    DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of qp ring buffer dbr");
+            }
+        }
+    }
+
+    if (qp_->external_uar != NULL) {
+        tmp_status = doca_verbs_uar_destroy(qp_->external_uar);
+        if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs UAR");
+    }
+
+    if (qp_->cq_sq != NULL) {
+        tmp_status = doca_verbs_cq_destroy(qp_->cq_sq);
+        if (tmp_status != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs CQ");
+
+        if (qp_->cq_sq_umem != NULL) {
+            tmp_status = doca_verbs_umem_destroy(qp_->cq_sq_umem);
+            if (tmp_status != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu sq cq ring buffer umem");
+        }
+
+        if (qp_->cq_sq_umem_gpu_ptr != NULL) {
+            tmp_status = doca_gpu_mem_free(qp_->gpu_dev, qp_->cq_sq_umem_gpu_ptr);
+            if (tmp_status != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of sq cq ring buffer");
+        }
+
+        if (qp_->cq_sq_umem_dbr != NULL) {
+            tmp_status = doca_verbs_umem_destroy(qp_->cq_sq_umem_dbr);
+            if (tmp_status != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu sq cq ring buffer umem dbr");
+        }
+
+        if (qp_->cq_sq_umem_dbr_gpu_ptr != NULL) {
+            tmp_status = doca_gpu_mem_free(qp_->gpu_dev, qp_->cq_sq_umem_dbr_gpu_ptr);
+            if (tmp_status != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of sq cq umem dbr buffer");
+        }
+    }
+
+    free(qp_);
+    return status;
+}
+
+/*
+ * Tear down everything a high-level QP owns, without freeing the handle itself.
+ *
+ * Order: GPU export, QP, QP ring umem and buffer, QP DBR umem and buffer, UAR,
+ * then (only when a send CQ exists) the CQ with its umems and buffers. Individual
+ * failures are logged and teardown continues. The structure is zeroed at the end.
+ *
+ * @return DOCA_ERROR_INVALID_VALUE when qp is null, DOCA_SUCCESS otherwise
+ *         (even if some destroy call failed).
+ */
+static doca_error_t doca_gpu_verbs_destroy_qp_hl_internal(struct doca_gpu_verbs_qp_hl *qp) {
+    doca_error_t rc;
+
+    if (qp == nullptr) {
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    rc = doca_gpu_verbs_unexport_qp(qp->gpu_dev, qp->qp_gverbs);
+    if (rc != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy doca gpu thread argument cq memory");
+    }
+
+    rc = doca_verbs_qp_destroy(qp->qp);
+    if (rc != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs QP");
+    }
+
+    if (qp->qp_umem != nullptr) {
+        rc = doca_verbs_umem_destroy(qp->qp_umem);
+        if (rc != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy gpu qp umem");
+        }
+    }
+
+    if (qp->qp_umem_gpu_ptr != nullptr) {
+        rc = doca_gpu_mem_free(qp->gpu_dev, qp->qp_umem_gpu_ptr);
+        if (rc != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of qp ring buffer");
+        }
+    }
+
+    if (qp->qp_umem_dbr != nullptr) {
+        rc = doca_verbs_umem_destroy(qp->qp_umem_dbr);
+        if (rc != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy gpu qp umem dbr");
+        }
+    }
+
+    if (qp->qp_umem_dbr_gpu_ptr != nullptr) {
+        /* With the CPU_PROXY handler the DBR buffer is plain heap memory. */
+        const bool dbr_on_host = (qp->nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY);
+
+        if (dbr_on_host) {
+            free(qp->qp_umem_dbr_gpu_ptr);
+        } else {
+            rc = doca_gpu_mem_free(qp->gpu_dev, qp->qp_umem_dbr_gpu_ptr);
+            if (rc != DOCA_SUCCESS) {
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of qp ring buffer dbr");
+            }
+        }
+    }
+
+    if (qp->external_uar != nullptr) {
+        rc = doca_verbs_uar_destroy(qp->external_uar);
+        if (rc != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs UAR");
+        }
+    }
+
+    /* The CQ-side umems and buffers are only touched when a send CQ exists. */
+    if (qp->cq_sq != nullptr) {
+        rc = doca_verbs_cq_destroy(qp->cq_sq);
+        if (rc != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs CQ");
+        }
+
+        if (qp->cq_sq_umem != nullptr) {
+            rc = doca_verbs_umem_destroy(qp->cq_sq_umem);
+            if (rc != DOCA_SUCCESS) {
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu sq cq ring buffer umem");
+            }
+        }
+
+        if (qp->cq_sq_umem_gpu_ptr != nullptr) {
+            rc = doca_gpu_mem_free(qp->gpu_dev, qp->cq_sq_umem_gpu_ptr);
+            if (rc != DOCA_SUCCESS) {
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of sq cq ring buffer");
+            }
+        }
+
+        if (qp->cq_sq_umem_dbr != nullptr) {
+            rc = doca_verbs_umem_destroy(qp->cq_sq_umem_dbr);
+            if (rc != DOCA_SUCCESS) {
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu sq cq ring buffer umem");
+            }
+        }
+
+        if (qp->cq_sq_umem_dbr_gpu_ptr != nullptr) {
+            rc = doca_gpu_mem_free(qp->gpu_dev, qp->cq_sq_umem_dbr_gpu_ptr);
+            if (rc != DOCA_SUCCESS) {
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of sq cq umem dbr buffer");
+            }
+        }
+    }
+
+    memset(qp, 0, sizeof(*qp));
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Destroy a high-level QP: release everything it owns, then free the handle.
+ *
+ * @return DOCA_ERROR_INVALID_VALUE when qp is null, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_gpu_verbs_destroy_qp_hl(struct doca_gpu_verbs_qp_hl *qp) {
+    if (qp != nullptr) {
+        /* Teardown failures are logged inside; the result is intentionally not propagated. */
+        (void)doca_gpu_verbs_destroy_qp_hl_internal(qp);
+        free(qp);
+        return DOCA_SUCCESS;
+    }
+
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+/*
+ * Release what one member of a partially built QP group owns.
+ *
+ * @param member    Group member (main or companion QP) to release.
+ * @param exported  True when doca_gpu_verbs_export_qp() succeeded for this member.
+ * @param owns_uar  True when this member owns external_uar; the companion only
+ *                  borrows the main QP's UAR and must not destroy it.
+ *
+ * create_qp() releases its own partial allocations on failure and only publishes
+ * the QP handle on success, so a non-null handle marks the QP buffers as owned.
+ * create_cq() is assumed to follow the same contract (its success path is not
+ * visible here -- TODO confirm).
+ */
+static void qp_group_hl_release_member(struct doca_gpu_verbs_qp_hl *member, bool exported,
+                                       bool owns_uar) {
+    doca_error_t rc;
+
+    if (exported) {
+        rc = doca_gpu_verbs_unexport_qp(member->gpu_dev, member->qp_gverbs);
+        if (rc != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to unexport GPU verbs QP");
+    }
+
+    if (member->qp != NULL) {
+        rc = doca_verbs_qp_destroy(member->qp);
+        if (rc != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs QP");
+
+        if (member->qp_umem != NULL) {
+            rc = doca_verbs_umem_destroy(member->qp_umem);
+            if (rc != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy gpu qp umem");
+        }
+
+        if (member->qp_umem_gpu_ptr != NULL) {
+            rc = doca_gpu_mem_free(member->gpu_dev, member->qp_umem_gpu_ptr);
+            if (rc != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of qp ring buffer");
+        }
+
+        if (member->qp_umem_dbr != NULL) {
+            rc = doca_verbs_umem_destroy(member->qp_umem_dbr);
+            if (rc != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy gpu qp umem dbr");
+        }
+
+        if (member->qp_umem_dbr_gpu_ptr != NULL) {
+            /* create_qp() used calloc() for the DBR buffer on the CPU_PROXY path. */
+            if (member->nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY) {
+                free(member->qp_umem_dbr_gpu_ptr);
+            } else {
+                rc = doca_gpu_mem_free(member->gpu_dev, member->qp_umem_dbr_gpu_ptr);
+                if (rc != DOCA_SUCCESS)
+                    DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of qp ring buffer dbr");
+            }
+        }
+    }
+
+    if (owns_uar && member->external_uar != NULL) {
+        rc = doca_verbs_uar_destroy(member->external_uar);
+        if (rc != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs UAR");
+    }
+
+    if (member->cq_sq != NULL) {
+        rc = doca_verbs_cq_destroy(member->cq_sq);
+        if (rc != DOCA_SUCCESS) DOCA_LOG(LOG_ERR, "Failed to destroy doca verbs CQ");
+
+        if (member->cq_sq_umem != NULL) {
+            rc = doca_verbs_umem_destroy(member->cq_sq_umem);
+            if (rc != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu sq cq ring buffer umem");
+        }
+
+        if (member->cq_sq_umem_gpu_ptr != NULL) {
+            rc = doca_gpu_mem_free(member->gpu_dev, member->cq_sq_umem_gpu_ptr);
+            if (rc != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of sq cq ring buffer");
+        }
+
+        if (member->cq_sq_umem_dbr != NULL) {
+            rc = doca_verbs_umem_destroy(member->cq_sq_umem_dbr);
+            if (rc != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu sq cq ring buffer umem dbr");
+        }
+
+        if (member->cq_sq_umem_dbr_gpu_ptr != NULL) {
+            rc = doca_gpu_mem_free(member->gpu_dev, member->cq_sq_umem_dbr_gpu_ptr);
+            if (rc != DOCA_SUCCESS)
+                DOCA_LOG(LOG_ERR, "Failed to destroy gpu memory of sq cq umem dbr buffer");
+        }
+    }
+}
+
+/*
+ * Create a QP group: a main QP and a companion QP that share one UAR.
+ *
+ * The companion QP is created with the core-direct flag set. BlueFlame
+ * (GPU_SM_BF) is rejected for QP groups.
+ *
+ * @param qp_init_attr  Creation attributes. gpu_dev and ibpd must be non-null and
+ *                      sq_nwqe non-zero. sq_nwqe is rounded up to a power of two
+ *                      in place, so the caller's structure is modified.
+ * @param qpg           [out] Receives the newly allocated group on success.
+ *
+ * @return DOCA_SUCCESS on success; DOCA_ERROR_INVALID_VALUE on bad arguments;
+ *         DOCA_ERROR_NO_MEMORY if the group cannot be allocated; otherwise the
+ *         error of the step that failed. On failure nothing is left allocated and
+ *         *qpg is not written.
+ */
+doca_error_t doca_gpu_verbs_create_qp_group_hl(struct doca_gpu_verbs_qp_init_attr_hl *qp_init_attr,
+                                               struct doca_gpu_verbs_qp_group_hl **qpg) {
+    doca_error_t status = DOCA_SUCCESS;
+    /* Tracks whether the main QP must be unexported if a later step fails. */
+    bool main_exported = false;
+
+    if (qp_init_attr == nullptr || qpg == nullptr) {
+        /* Print the pointer itself: dereferencing qpg here would crash when it is null. */
+        DOCA_LOG(LOG_ERR, "Invalid input value: qp_init_attr %p qp %p", (void *)qp_init_attr,
+                 (void *)qpg);
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (qp_init_attr->gpu_dev == nullptr || qp_init_attr->ibpd == nullptr ||
+        qp_init_attr->sq_nwqe == 0) {
+        DOCA_LOG(LOG_ERR, "Invalid input value: gpu_dev %p ibpd %p sq_nwqe %d",
+                 (void *)qp_init_attr->gpu_dev, (void *)qp_init_attr->ibpd, qp_init_attr->sq_nwqe);
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (qp_init_attr->nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_GPU_SM_BF) {
+        DOCA_LOG(LOG_ERR, "BlueFlame not supported with QP group");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    struct doca_gpu_verbs_qp_group_hl *qpg_ =
+        (struct doca_gpu_verbs_qp_group_hl *)calloc(1, sizeof(struct doca_gpu_verbs_qp_group_hl));
+    if (qpg_ == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed alloc memory for high-level qp");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+
+    /********** Create main QP **********/
+
+    qpg_->qp_main.gpu_dev = qp_init_attr->gpu_dev;
+    /* Seed the handler from the request before create_uar() reads it, as the
+     * single-QP path does; otherwise the UAR is created for the calloc-zero value. */
+    qpg_->qp_main.nic_handler = qp_init_attr->nic_handler;
+
+    status = create_uar(qp_init_attr->ibpd->context, qpg_->qp_main.nic_handler,
+                        &qpg_->qp_main.external_uar, true);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create doca verbs uar");
+        goto exit_error;
+    }
+
+    if (qp_init_attr->sq_nwqe > 0) {
+        qp_init_attr->sq_nwqe =
+            (uint32_t)doca_internal_utils_next_power_of_two(qp_init_attr->sq_nwqe);
+
+        status = create_cq(qpg_->qp_main.gpu_dev, qp_init_attr->ibpd, qp_init_attr->mreg_type,
+                           qp_init_attr->sq_nwqe, &qpg_->qp_main.cq_sq_umem_gpu_ptr,
+                           &qpg_->qp_main.cq_sq_umem, &qpg_->qp_main.cq_sq_umem_dbr_gpu_ptr,
+                           &qpg_->qp_main.cq_sq_umem_dbr, qpg_->qp_main.external_uar,
+                           &qpg_->qp_main.cq_sq);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to create doca verbs cq");
+            goto exit_error;
+        }
+    }
+
+    status = create_qp(
+        qpg_->qp_main.gpu_dev, qp_init_attr->ibpd, qp_init_attr->mreg_type, qpg_->qp_main.cq_sq,
+        qp_init_attr->sq_nwqe, &qpg_->qp_main.qp_umem_gpu_ptr, &qpg_->qp_main.qp_umem,
+        &qpg_->qp_main.qp_umem_dbr_gpu_ptr, &qpg_->qp_main.qp_umem_dbr, qpg_->qp_main.external_uar,
+        qp_init_attr->nic_handler, false, &qpg_->qp_main.qp, &qpg_->qp_main.nic_handler);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create doca verbs qp");
+        goto exit_error;
+    }
+
+    status = doca_gpu_verbs_export_qp(qpg_->qp_main.gpu_dev, qpg_->qp_main.qp,
+                                      qpg_->qp_main.nic_handler, qpg_->qp_main.qp_umem_gpu_ptr,
+                                      qpg_->qp_main.cq_sq, &qpg_->qp_main.qp_gverbs);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create GPU verbs QP");
+        goto exit_error;
+    }
+    main_exported = true;
+
+    /********** Create companion QP **********/
+
+    qpg_->qp_companion.gpu_dev = qp_init_attr->gpu_dev;
+    /* The companion borrows the main QP's UAR; it never owns it. */
+    qpg_->qp_companion.external_uar = qpg_->qp_main.external_uar;
+
+    if (qp_init_attr->sq_nwqe > 0) {
+        qp_init_attr->sq_nwqe =
+            (uint32_t)doca_internal_utils_next_power_of_two(qp_init_attr->sq_nwqe);
+
+        status =
+            create_cq(qpg_->qp_companion.gpu_dev, qp_init_attr->ibpd, qp_init_attr->mreg_type,
+                      qp_init_attr->sq_nwqe, &qpg_->qp_companion.cq_sq_umem_gpu_ptr,
+                      &qpg_->qp_companion.cq_sq_umem, &qpg_->qp_companion.cq_sq_umem_dbr_gpu_ptr,
+                      &qpg_->qp_companion.cq_sq_umem_dbr, qpg_->qp_companion.external_uar,
+                      &qpg_->qp_companion.cq_sq);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to create doca verbs cq");
+            goto exit_error;
+        }
+    }
+
+    status = create_qp(qpg_->qp_companion.gpu_dev, qp_init_attr->ibpd, qp_init_attr->mreg_type,
+                       qpg_->qp_companion.cq_sq, qp_init_attr->sq_nwqe,
+                       &qpg_->qp_companion.qp_umem_gpu_ptr, &qpg_->qp_companion.qp_umem,
+                       &qpg_->qp_companion.qp_umem_dbr_gpu_ptr, &qpg_->qp_companion.qp_umem_dbr,
+                       qpg_->qp_companion.external_uar, qp_init_attr->nic_handler, true,
+                       &qpg_->qp_companion.qp, &qpg_->qp_companion.nic_handler);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create doca verbs qp");
+        goto exit_error;
+    }
+
+    status =
+        doca_gpu_verbs_export_qp(qpg_->qp_companion.gpu_dev, qpg_->qp_companion.qp,
+                                 qpg_->qp_companion.nic_handler, qpg_->qp_companion.qp_umem_gpu_ptr,
+                                 qpg_->qp_companion.cq_sq, &qpg_->qp_companion.qp_gverbs);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create GPU verbs QP");
+        goto exit_error;
+    }
+
+    *qpg = qpg_;
+
+    return DOCA_SUCCESS;
+
+exit_error:
+    /* Companion first: it uses the main QP's UAR, which is destroyed with the main QP. */
+    qp_group_hl_release_member(&qpg_->qp_companion, false, false);
+    qp_group_hl_release_member(&qpg_->qp_main, main_exported, true);
+
+    free(qpg_);
+    return status;
+}
+
+/*
+ * Destroy a QP group: tear down the main QP, then the companion, then free the group.
+ *
+ * @return DOCA_ERROR_INVALID_VALUE when qpg is null, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_gpu_verbs_destroy_qp_group_hl(struct doca_gpu_verbs_qp_group_hl *qpg) {
+    if (qpg == nullptr) {
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    struct doca_gpu_verbs_qp_hl *const main_qp = &qpg->qp_main;
+    struct doca_gpu_verbs_qp_hl *const companion_qp = &qpg->qp_companion;
+
+    (void)doca_gpu_verbs_destroy_qp_hl_internal(main_qp);
+
+    /* Clear the companion's UAR pointer so its teardown skips the UAR destroy step. */
+    companion_qp->external_uar = nullptr;
+    (void)doca_gpu_verbs_destroy_qp_hl_internal(companion_qp);
+
+    memset(qpg, 0, sizeof(*qpg));
+    free(qpg);
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Build a contiguous device array holding a copy of each QP's GPU descriptor.
+ *
+ * For every entry of qp_list, qp_gverbs->qp_cpu is copied with cudaMemcpy()
+ * into slot i of a buffer obtained from cudaMalloc().
+ *
+ * @param qp_list    Array of num_elems high-level QPs; every entry and its
+ *                   qp_gverbs must be non-null.
+ * @param num_elems  Number of entries in qp_list; must be non-zero.
+ * @param qp_gpu     [out] Receives the device array on success; untouched on failure.
+ *
+ * @return DOCA_SUCCESS on success; DOCA_ERROR_INVALID_VALUE on bad arguments;
+ *         DOCA_ERROR_NO_MEMORY when cudaMalloc() fails; DOCA_ERROR_DRIVER when a
+ *         copy fails (the device array is released in that case).
+ */
+doca_error_t doca_gpu_verbs_qp_flat_list_create_hl(struct doca_gpu_verbs_qp_hl **qp_list,
+                                                   uint32_t num_elems,
+                                                   struct doca_gpu_dev_verbs_qp **qp_gpu) {
+    doca_error_t status = DOCA_SUCCESS;
+    cudaError_t error;
+    struct doca_gpu_dev_verbs_qp *qp_gpu_ = nullptr;
+
+    if (num_elems == 0 || qp_list == nullptr || qp_gpu == nullptr) return DOCA_ERROR_INVALID_VALUE;
+
+    /* Validate every entry up front so nothing is allocated for a bad list. */
+    for (uint32_t i = 0; i < num_elems; i++) {
+        if (qp_list[i] == nullptr || qp_list[i]->qp_gverbs == nullptr)
+            return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    error = cudaMalloc((void **)&qp_gpu_, sizeof(struct doca_gpu_dev_verbs_qp) * num_elems);
+    if (error != cudaSuccess) return DOCA_ERROR_NO_MEMORY;
+
+    for (uint32_t i = 0; i < num_elems; i++) {
+        error = cudaMemcpy(qp_gpu_ + i, qp_list[i]->qp_gverbs->qp_cpu,
+                           sizeof(struct doca_gpu_dev_verbs_qp), cudaMemcpyDefault);
+        if (error != cudaSuccess) {
+            /* Without this the function would report success with *qp_gpu unset. */
+            status = DOCA_ERROR_DRIVER;
+            goto exit_error;
+        }
+    }
+
+    *qp_gpu = qp_gpu_;
+
+    return status;
+
+exit_error:
+    /* Free the device buffer allocated above, not the caller's out-parameter. */
+    cudaFree(qp_gpu_);
+    return status;
+}
+
+/*
+ * Release a device array with cudaFree().
+ *
+ * @return DOCA_ERROR_INVALID_VALUE when qp_gpu is null, DOCA_SUCCESS otherwise
+ *         (the cudaFree() result is not checked).
+ */
+doca_error_t doca_gpu_verbs_qp_flat_list_destroy_hl(struct doca_gpu_dev_verbs_qp *qp_gpu) {
+    if (qp_gpu != nullptr) {
+        (void)cudaFree(qp_gpu);
+        return DOCA_SUCCESS;
+    }
+
+    return DOCA_ERROR_INVALID_VALUE;
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_log.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_log.cpp
new file mode 100644
index 00000000000..05c71997f8b
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_log.cpp
@@ -0,0 +1,77 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <time.h>
+#include <stdarg.h>
+
+#include "doca_gpunetio_log.hpp"
+
+/* Printable names for log levels 0..7, indexed by level (same numbering as <syslog.h>). */
+static const char *doca_gpu_log_level_strings[] = {"EMERG",   "ALERT",  "CRIT", "ERR",
+                                                   "WARNING", "NOTICE", "INFO", "DEBUG"};
+
+/*
+ * Print one log line to stderr when log_level is at or below the active threshold.
+ *
+ * The threshold is read once, on first use, from the DOCA_GPUNETIO_LOG environment
+ * variable (parsed with atoi()). Values above the highest known level are clamped
+ * to it; a missing or negative value yields threshold 0.
+ *
+ * @param log_level  Severity of this message (lower is more severe).
+ * @param file       Source file of the call site.
+ * @param line       Source line of the call site.
+ * @param func       Function name of the call site.
+ * @param fmt        printf-style format string followed by its arguments.
+ */
+void doca_gpu_log_print(int log_level, const char *file, int line, const char *func,
+                        const char *fmt, ...) {
+    static const int num_levels =
+        (int)(sizeof(doca_gpu_log_level_strings) / sizeof(doca_gpu_log_level_strings[0]));
+    static int cur_log_level = -1;
+
+    if (cur_log_level < 0) {
+        const char *debug_env = getenv("DOCA_GPUNETIO_LOG");
+        if (debug_env != NULL) {
+            int env_log_level = atoi(debug_env);
+            if (env_log_level >= 0) {
+                /* The highest valid level is num_levels - 1, not num_levels. */
+                cur_log_level = (env_log_level < num_levels) ? env_log_level : (num_levels - 1);
+            }
+        }
+        if (cur_log_level < 0) {
+            cur_log_level = 0;
+        }
+    }
+
+    if (log_level <= cur_log_level) {
+        /* ctime_r() writes into our own buffer; ctime() shares one static buffer
+         * between all callers. POSIX requires room for at least 26 bytes. */
+        char timestamp[32] = "";
+        time_t now = time(NULL);
+        if (ctime_r(&now, timestamp) == NULL) {
+            timestamp[0] = '\0';
+        }
+        size_t ts_len = strlen(timestamp);
+        if (ts_len > 0 && timestamp[ts_len - 1] == '\n') {
+            timestamp[ts_len - 1] = '\0';
+        }
+
+        /* Never index the name table with a level outside 0..num_levels-1. */
+        const char *level_name = (log_level >= 0 && log_level < num_levels)
+                                     ? doca_gpu_log_level_strings[log_level]
+                                     : "UNKNOWN";
+
+        va_list args;
+        va_start(args, fmt);
+        fprintf(stderr, "%s [%s] [%s]: %d: %s(): ", timestamp, level_name, file, line, func);
+        vfprintf(stderr, fmt, args);
+        fprintf(stderr, "\n");
+        va_end(args);
+    }
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_log.hpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_log.hpp
new file mode 100644
index 00000000000..84e3a7ec75d
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_gpunetio_log.hpp
@@ -0,0 +1,43 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <syslog.h>
+
+/*
+ * Print one log message; implemented in doca_gpunetio_log.cpp.
+ *
+ * log_level is the message severity, file/line/func identify the call site, and
+ * fmt is a printf-style format string followed by its arguments.
+ */
+void doca_gpu_log_print(int log_level, const char *file, int line, const char *func,
+                        const char *fmt, ...);
+
+/*
+ * Convenience wrapper around doca_gpu_log_print() that fills in the call site
+ * (__FILE__, __LINE__, __func__) automatically. The do/while(0) wrapper makes the
+ * macro usable as a single statement, e.g. in an unbraced if.
+ */
+#define DOCA_LOG(LOG_LEVEL, fmt, ...)                                                    \
+    do {                                                                                 \
+        doca_gpu_log_print(LOG_LEVEL, __FILE__, __LINE__, __func__, fmt, ##__VA_ARGS__); \
+    } while (0)
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_internal.hpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_internal.hpp
new file mode 100644
index 00000000000..e4058b23765
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_internal.hpp
@@ -0,0 +1,118 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <vector>
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string>
+#include <cmath>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <linux/types.h>
+
+#include "host/doca_error.h"
+#include "doca_gpunetio_config.h"
+#include "doca_gpunetio_log.hpp"
+
+#ifndef CUDA_ROUND_UP
+/*
+ * Round unaligned_mapping_size up to the next multiple of align_val.
+ * The mask arithmetic is only correct when align_val is a power of two.
+ * The whole expansion is parenthesized so the macro can be used inside larger
+ * expressions (comparison, multiplication, ...) without precedence surprises.
+ */
+#define CUDA_ROUND_UP(unaligned_mapping_size, align_val) \
+    ((((unaligned_mapping_size) + (align_val) - 1)) & (~((align_val) - 1)))
+#endif
+
+#ifndef CUDA_ROUND_DOWN
+/*
+ * Round unaligned_mapping_size down to a multiple of align_val.
+ * The mask arithmetic is only correct when align_val is a power of two.
+ */
+#define CUDA_ROUND_DOWN(unaligned_mapping_size, align_val) \
+    ((unaligned_mapping_size) & ~((align_val) - 1))
+#endif
+
+/* Fixed size constants; presumably expressed in bytes -- TODO confirm against their users. */
+#define DOCA_VERBS_PAGE_SIZE 4096
+#define DOCA_VERBS_CACHELINE_SIZE (64)
+
+/* NOTE(review): looks like the size of a doorbell write to the UAR -- confirm. */
+#define DOCA_VERBS_DB_UAR_SIZE 8
+
+/**
+ * @brief Tell whether a value has exactly one bit set
+ *
+ * @param [in] x
+ * The value to test
+ * @return true if x is a non-zero power of 2, false otherwise (including x == 0).
+ */
+inline bool doca_internal_utils_is_power_of_two(uint64_t x) {
+    if (x == 0) return false;
+
+    /* Clearing the lowest set bit leaves zero only when a single bit was set. */
+    return (x & (x - 1)) == 0;
+}
+
+/**
+ * @brief Round a value up to the nearest power of 2
+ *
+ * @param [in] x
+ * The value to round up
+ * @return the smallest power of 2 that is >= x for 1 <= x <= 2^63. The unsigned
+ * arithmetic wraps, so the result is 0 for x == 0 and for x > 2^63.
+ */
+inline uint64_t doca_internal_utils_next_power_of_two(uint64_t x) {
+    uint64_t filled = x - 1;
+
+    /* Smear the highest set bit into every lower position (shifts 1, 2, 4, ..., 32). */
+    for (unsigned int shift = 1; shift <= 32; shift <<= 1) {
+        filled |= filled >> shift;
+    }
+
+    return filled + 1;
+}
+
+/*
+ * 16-byte data segment: byte count, lkey and address, all stored big-endian
+ * (__be32/__be64). NOTE(review): presumably mirrors the mlx5 WQE data segment
+ * layout -- confirm against the device PRM.
+ */
+struct doca_internal_mlx5_wqe_data_seg {
+    __be32 byte_count;
+    __be32 lkey;
+    __be64 addr;
+};
+
+/*
+ * 16-byte "next" segment carrying a big-endian next-WQE index and a signature
+ * byte, padded with reserved bytes. NOTE(review): the name suggests it is used
+ * for multi-packet receive queues -- confirm.
+ */
+struct doca_internal_mlx5_wqe_mprq_next_seg {
+    uint8_t rsvd0[2];
+    __be16 next_wqe_index;
+    uint8_t signature;
+    uint8_t rsvd1[11];
+};
+
+/**
+ * @brief Base-2 logarithm converted back to the argument's type
+ *
+ * @param [in] x
+ * The value whose logarithm is taken
+ * @return std::log2(x) converted to T (for integer T the fraction is dropped);
+ * 0 when x == 0, because log(0) is undefined.
+ */
+template <typename T>
+T doca_internal_utils_log2(T x) {
+    return (x == 0) ? static_cast<T>(0) : static_cast<T>(std::log2(x));
+}
+
+/**
+ * @brief Round a value up to the next multiple of an alignment
+ *
+ * @param [in] value
+ * The value to align
+ * @param [in] alignment
+ * The alignment; any non-zero value works (it is used as a modulo divisor, so it
+ * must not be 0)
+ * @return value itself when already a multiple of alignment, otherwise the next
+ * larger multiple.
+ */
+inline uint64_t doca_internal_utils_align_up_uint64(uint64_t value, uint64_t alignment) {
+    const uint64_t excess = value % alignment;
+
+    return (excess != 0) ? value + (alignment - excess) : value;
+}
+
+/**
+ * @brief 32-bit variant of the align-up helper
+ *
+ * The rounding is carried out in 64-bit arithmetic and the result is then
+ * truncated to 32 bits. alignment is used as a modulo divisor and must not be 0.
+ */
+inline uint32_t doca_internal_utils_align_up_uint32(uint32_t value, uint32_t alignment) {
+    const uint64_t wide_value = value;
+    const uint64_t wide_alignment = alignment;
+    const uint64_t excess = wide_value % wide_alignment;
+    const uint64_t aligned = (excess == 0) ? wide_value : wide_value + (wide_alignment - excess);
+
+    return (uint32_t)aligned;
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cq.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cq.cpp
new file mode 100644
index 00000000000..4009d2eea48
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cq.cpp
@@ -0,0 +1,472 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <malloc.h>
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <mutex>
+#include <time.h>
+#include <string.h>
+
+#include "host/mlx5_prm.h"
+#include "host/mlx5_ifc.h"
+
+#include "doca_internal.hpp"
+#include "doca_verbs_device_attr.hpp"
+#include "doca_verbs_cq.hpp"
+#include "doca_verbs_net_wrapper.h"
+
+#define DOCA_VERBS_CQE_SIZE 64 /* bytes per CQE; create_cq_obj programs MLX5_CQC_CQE_SZ_BYTES_64 to match */
+
+/*********************************************************************************************************************
+ * Helper functions
+ *********************************************************************************************************************/
+
+namespace {
+static constexpr uint32_t sc_cq_doorbell_size = 64; /* bytes reserved after the CQEs for the doorbell record */
+/* Command mailboxes: uint32_t arrays sized with MLX5_ST_SZ_DW for the CREATE_CQ input/output layouts */
+using create_cq_in = uint32_t[MLX5_ST_SZ_DW(create_cq_in)];
+using create_cq_out = uint32_t[MLX5_ST_SZ_DW(create_cq_out)];
+
+} /* namespace */
+
+/**********************************************************************************************************************
+ * doca_verbs_cq Member Functions
+ *********************************************************************************************************************/
+
+doca_verbs_cq::doca_verbs_cq(struct ibv_context *ibv_ctx, struct doca_verbs_cq_attr &cq_attr)
+    : m_ibv_ctx(ibv_ctx), m_cq_attr(cq_attr) { /* keeps a copy of the attributes */
+    try {
+        create(cq_attr);
+    } catch (...) {
+        (void)destroy(); /* release whatever create() acquired before it threw */
+        DOCA_LOG(LOG_ERR, "Failed to create CQ");
+        throw; /* hand the original exception to the caller */
+    }
+}
+
+doca_verbs_cq::~doca_verbs_cq() { static_cast<void>(destroy()); } /* status dropped: a destructor cannot report it */
+
+/* Issues the CREATE_CQ DevX command; fills m_cq_obj and m_cqn on success. */
+doca_error_t doca_verbs_cq::create_cq_obj(uint32_t uar_id, uint32_t log_nb_cqes,
+                                          uint64_t db_umem_offset, uint32_t db_umem_id,
+                                          uint32_t wq_umem_id, bool cq_overrun) noexcept {
+    create_cq_in create_in{0};   /* zero-filled command input */
+    create_cq_out create_out{0}; /* zero-filled command output */
+
+    DEVX_SET(create_cq_in, create_in, opcode, MLX5_CMD_OP_CREATE_CQ);
+    DEVX_SET(create_cq_in, create_in, cq_context.cqe_sz, MLX5_CQC_CQE_SZ_BYTES_64);
+    DEVX_SET(create_cq_in, create_in, cq_context.cc, 0x0);  // Disable collapsed CQ
+    DEVX_SET(create_cq_in, create_in, cq_context.oi,
+             static_cast<uint8_t>(cq_overrun));                              // Overrun flag as requested
+    DEVX_SET(create_cq_in, create_in, cq_context.log_cq_size, log_nb_cqes);  // log2 of the CQE count
+    DEVX_SET(create_cq_in, create_in, cq_context.uar_page, uar_id);
+    DEVX_SET(create_cq_in, create_in, cq_umem_id, wq_umem_id);  // umem holding the CQE buffer
+    DEVX_SET(create_cq_in, create_in, cq_umem_valid, 1);
+    DEVX_SET64(create_cq_in, create_in, cq_umem_offset, 0x0);  // CQEs always start at offset 0
+    DEVX_SET(create_cq_in, create_in, cq_context.dbr_umem_id, db_umem_id);
+    DEVX_SET(create_cq_in, create_in, cq_context.dbr_umem_valid, 1);
+    DEVX_SET64(create_cq_in, create_in, cq_context.dbr_addr, db_umem_offset);
+
+    uint32_t element_id; /* receives the EQ number queried below */
+    auto ret = doca_verbs_wrapper_mlx5dv_devx_query_eqn(m_ibv_ctx, 0, &element_id);
+    if (ret != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to query devx eqn");
+        return DOCA_ERROR_OPERATING_SYSTEM; /* the wrapper's own status is not propagated here */
+    }
+
+    DEVX_SET(create_cq_in, create_in, cq_context.c_eqn, element_id);
+
+    /* Since cq_umem_valid == 1, FW deduces page size from umem and this field is reserved */
+    DEVX_SET(create_cq_in, create_in, cq_context.log_page_size,
+             0);  // GPU_PAGE_SHIFT - MLX5_ADAPTER_PAGE_SHIFT
+
+    /* Create DevX object */
+    auto status = doca_verbs_wrapper_mlx5dv_devx_obj_create(
+        m_ibv_ctx, create_in, sizeof(create_in), create_out, sizeof(create_out), &m_cq_obj);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create CQ. DevX error, syndrome=0x%x",
+                 DEVX_GET(nop_out, create_out, syndrome)); /* syndrome is read through the nop_out layout */
+        return status;
+    }
+
+    m_cqn = DEVX_GET(create_cq_out, create_out, cqn); /* CQ number reported by the command output */
+    return DOCA_SUCCESS;
+}
+
+/* Builds the CQ: validates cq_attr, sets up the CQE buffer, doorbell record and UAR (internal or
+ * caller-provided) and creates the DevX CQ object. Throws a doca_error_t on any failure; whatever
+ * was acquired up to that point stays in the members so that destroy() can release it. */
+void doca_verbs_cq::create(struct doca_verbs_cq_attr &cq_attr) {
+    auto status{DOCA_SUCCESS};
+
+    /* The CQE buffer and the doorbell record must both be external or both be internal */
+    if ((cq_attr.external_umem != nullptr) != (cq_attr.external_umem_dbr != nullptr)) {
+        DOCA_LOG(LOG_ERR, "Both UMEM should be either external or internal");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+
+    /* Query device attr */
+    status = doca_verbs_query_device(m_ibv_ctx, &m_verbs_device_attr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to query device attr");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (doca_internal_utils_is_power_of_two(cq_attr.cq_size) == false) {
+        DOCA_LOG(LOG_ERR, "Number of CQE is not a power of 2");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+
+    m_num_cqes = static_cast<uint32_t>(cq_attr.cq_size);
+    uint32_t log_nb_cqes = doca_internal_utils_log2(m_num_cqes);
+    if (m_num_cqes > m_verbs_device_attr->m_max_cqe) {
+        DOCA_LOG(LOG_ERR, "CQ cq_size is invalid");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+
+    uint32_t umem_id{0};
+    uint32_t dbr_umem_id{0};
+    /* Internal layout: CQE ring first, doorbell record at the next cacheline boundary */
+    uint64_t dbr_umem_offset = doca_internal_utils_align_up_uint32(
+        m_num_cqes * DOCA_VERBS_CQE_SIZE, DOCA_VERBS_CACHELINE_SIZE);
+
+    if (cq_attr.external_umem == nullptr) {
+        /* Case of internal umem */
+        uint32_t total_umem_size = doca_internal_utils_align_up_uint32(
+            dbr_umem_offset + sc_cq_doorbell_size, DOCA_VERBS_PAGE_SIZE);
+
+        m_umem_buf = (uint8_t *)memalign(DOCA_VERBS_PAGE_SIZE, total_umem_size);
+        if (m_umem_buf == nullptr) { /* memalign reports allocation failure with NULL */
+            DOCA_LOG(LOG_ERR, "Failed to allocate CQ UMEM buffer");
+            throw DOCA_ERROR_NO_MEMORY;
+        }
+        memset(m_umem_buf, 0, total_umem_size);
+
+        auto umem_status = doca_verbs_wrapper_mlx5dv_devx_umem_reg(m_ibv_ctx, m_umem_buf,
+                                                                   total_umem_size, 0, &m_umem_obj);
+        if (umem_status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to create CQ UMEM");
+            throw umem_status;
+        }
+
+        /* CQEs and doorbell share this umem, so the doorbell must be addressed through its id */
+        m_cq_buf = m_umem_buf;
+        umem_id = m_umem_obj->umem_id;
+        dbr_umem_id = umem_id;
+        m_db_buffer = reinterpret_cast<uint32_t *>(m_cq_buf + dbr_umem_offset);
+    } else {
+        uint8_t *tmp_db_buffer;
+        /* Case of external umem: address and id of the caller's CQE buffer */
+        status = doca_verbs_umem_get_address(cq_attr.external_umem,
+                                             reinterpret_cast<void **>(&m_cq_buf));
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external umem address");
+            throw status;
+        }
+        status = doca_verbs_umem_get_id(cq_attr.external_umem, &umem_id);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external umem id");
+            throw status;
+        }
+
+        /* Same two lookups for the caller's doorbell-record umem */
+        status = doca_verbs_umem_get_address(cq_attr.external_umem_dbr,
+                                             reinterpret_cast<void **>(&tmp_db_buffer));
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external umem address");
+            throw status;
+        }
+        status = doca_verbs_umem_get_id(cq_attr.external_umem_dbr, &dbr_umem_id);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external umem id");
+            throw status;
+        }
+
+        dbr_umem_offset = cq_attr.external_umem_dbr_offset;
+        m_db_buffer = reinterpret_cast<uint32_t *>(tmp_db_buffer + dbr_umem_offset);
+    }
+
+    m_ci_dbr = &m_db_buffer[MLX5_CQ_SET_CI];
+    m_arm_dbr = &m_db_buffer[MLX5_CQ_ARM_DB];
+
+    uint32_t uar_id{};
+    if (cq_attr.external_uar == nullptr) {
+        auto uar_status = doca_verbs_wrapper_mlx5dv_devx_alloc_uar(
+            m_ibv_ctx, MLX5DV_UAR_ALLOC_TYPE_NC, &m_uar_obj);
+        if (uar_status != DOCA_SUCCESS) { /* NC allocation failed: retry with the BF type */
+            uar_status = doca_verbs_wrapper_mlx5dv_devx_alloc_uar(
+                m_ibv_ctx, MLX5DV_UAR_ALLOC_TYPE_BF, &m_uar_obj);
+            if (uar_status != DOCA_SUCCESS) {
+                DOCA_LOG(LOG_ERR, "Failed to create UAR");
+                throw uar_status;
+            }
+        }
+        m_uar_db_reg = reinterpret_cast<uint64_t *>(m_uar_obj->reg_addr);
+        uar_id = m_uar_obj->page_id;
+    } else {
+        /* Case of external UAR */
+        status = doca_verbs_uar_id_get(cq_attr.external_uar, &uar_id);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external UAR ID");
+            throw status;
+        }
+        void *reg_addr{};
+        status = doca_verbs_uar_reg_addr_get(cq_attr.external_uar, &reg_addr);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external UAR reg_addr");
+            throw status;
+        }
+        m_uar_db_reg = reinterpret_cast<uint64_t *>(reg_addr);
+    }
+
+    /* Create CQ object */
+    status = create_cq_obj(uar_id, log_nb_cqes, dbr_umem_offset, dbr_umem_id, umem_id,
+                           cq_attr.cq_overrun);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create CQ object");
+        throw DOCA_ERROR_DRIVER;
+    }
+    DOCA_LOG(LOG_INFO, "DOCA IB Verbs CQ %p: has been successfully created", this);
+}
+
+/* Releases everything create() acquired, in a fixed order: device attributes, CQ object, UAR, umem
+ * registration, umem buffer. Stops at the first step that fails and returns its error; released
+ * members are reset to nullptr, so calling this again is harmless. */
+doca_error_t doca_verbs_cq::destroy() noexcept {
+    if (m_verbs_device_attr != nullptr) {
+        if (doca_verbs_device_attr_free(m_verbs_device_attr) != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to free device attr");
+            return DOCA_ERROR_INVALID_VALUE;
+        }
+        m_verbs_device_attr = nullptr;
+    }
+
+    if (m_cq_obj != nullptr) {
+        const auto cq_status = doca_verbs_wrapper_mlx5dv_devx_obj_destroy(m_cq_obj);
+        if (cq_status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy CQ object");
+            return cq_status;
+        }
+        m_cq_obj = nullptr;
+    }
+
+    if (m_uar_obj != nullptr) {
+        const auto uar_status = doca_verbs_wrapper_mlx5dv_devx_free_uar(m_uar_obj);
+        if (uar_status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to free UAR");
+            return uar_status;
+        }
+        m_uar_obj = nullptr;
+    }
+
+    if (m_umem_obj != nullptr) {
+        const auto umem_status = doca_verbs_wrapper_mlx5dv_devx_umem_dereg(m_umem_obj);
+        if (umem_status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy UMEM object");
+            return umem_status;
+        }
+        m_umem_obj = nullptr;
+    }
+
+    free(m_umem_buf); /* free(nullptr) is a no-op, so no guard is needed */
+    m_umem_buf = nullptr;
+
+    return DOCA_SUCCESS;
+}
+
+/**********************************************************************************************************************
+ * Public API functions
+ *********************************************************************************************************************/
+
+/* Allocates a zero-filled doca_verbs_cq_attr and hands it back through verbs_cq_attr. */
+doca_error_t doca_verbs_cq_attr_create(struct doca_verbs_cq_attr **verbs_cq_attr) {
+    if (!verbs_cq_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to create cq_attr: parameter verbs_cq_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    void *zeroed_mem = calloc(1, sizeof(struct doca_verbs_cq_attr));
+    *verbs_cq_attr = static_cast<struct doca_verbs_cq_attr *>(zeroed_mem);
+    if (!zeroed_mem) {
+        DOCA_LOG(LOG_ERR, "Failed to create cq_attr: failed to allocate memory");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+    return DOCA_SUCCESS;
+}
+
+doca_error_t doca_verbs_cq_attr_destroy(struct doca_verbs_cq_attr *cq_attr) {
+    if (cq_attr != nullptr) {
+        free(cq_attr); /* plain free(): the object comes from calloc() in doca_verbs_cq_attr_create() */
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to destroy doca_verbs_cq_attr. parameter cq_attr=NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+doca_error_t doca_verbs_cq_attr_set_cq_size(struct doca_verbs_cq_attr *cq_attr, uint32_t cq_size) {
+    if (cq_attr != nullptr) {
+        cq_attr->cq_size = cq_size; /* only recorded here; it is validated when the CQ is created */
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set cq_size: parameter cq_attr is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+doca_error_t doca_verbs_cq_attr_set_cq_context(struct doca_verbs_cq_attr *cq_attr,
+                                               void *cq_context) {
+    if (cq_attr != nullptr) {
+        cq_attr->cq_context = cq_context; /* stored as-is; a NULL context is accepted */
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set cq_context: parameter cq_attr is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+/* Records the caller-provided CQE-buffer umem and its offset in cq_attr; both pointers are required. */
+doca_error_t doca_verbs_cq_attr_set_external_umem(struct doca_verbs_cq_attr *cq_attr,
+                                                  struct doca_verbs_umem *external_umem,
+                                                  uint64_t external_umem_offset) {
+    if (!cq_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_umem: parameter cq_attr is NULL.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!external_umem) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_umem: parameter external_umem is NULL.");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    cq_attr->external_umem_offset = external_umem_offset;
+    cq_attr->external_umem = external_umem;
+    return DOCA_SUCCESS;
+}
+
+/* Records the caller-provided doorbell-record umem and the doorbell offset inside it. */
+doca_error_t doca_verbs_cq_attr_set_external_dbr_umem(struct doca_verbs_cq_attr *cq_attr,
+                                                      struct doca_verbs_umem *external_umem,
+                                                      uint64_t external_umem_offset) {
+    if (cq_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_dbr_umem: parameter cq_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (external_umem == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_dbr_umem: parameter external_umem is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    cq_attr->external_umem_dbr = external_umem;
+    cq_attr->external_umem_dbr_offset = external_umem_offset;
+    return DOCA_SUCCESS;
+}
+
+/* Selects a caller-provided UAR for the CQ instead of one allocated at CQ creation time. */
+doca_error_t doca_verbs_cq_attr_set_external_uar(struct doca_verbs_cq_attr *cq_attr,
+                                                 struct doca_verbs_uar *external_uar) {
+    if (!cq_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_uar: parameter cq_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!external_uar) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_uar: parameter external_uar is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    cq_attr->external_uar = external_uar;
+    return DOCA_SUCCESS;
+}
+
+doca_error_t doca_verbs_cq_attr_set_cq_overrun(struct doca_verbs_cq_attr *cq_attr,
+                                               enum doca_verbs_cq_overrun overrun) {
+    if (cq_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set cq_overrun: parameter cq_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    cq_attr->cq_overrun = overrun; /* later converted to bool and programmed into cq_context.oi */
+
+    return DOCA_SUCCESS;
+}
+
+doca_error_t doca_verbs_cq_create(struct ibv_context *context,
+                                  struct doca_verbs_cq_attr *verbs_cq_attr,
+                                  struct doca_verbs_cq **verbs_cq) {
+    const char *null_param = (context == nullptr)         ? "context"
+                             : (verbs_cq_attr == nullptr) ? "verbs_cq_attr"
+                             : (verbs_cq == nullptr)      ? "verbs_cq"
+                                                          : nullptr;
+    if (null_param != nullptr) { /* names the first NULL argument; verbs_cq is written below */
+        DOCA_LOG(LOG_ERR, "Failed to create doca_verbs_cq. param %s=NULL", null_param);
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    try {
+        *verbs_cq = new doca_verbs_cq(context, *verbs_cq_attr);
+        DOCA_LOG(LOG_INFO, "IB Verbs Context %p: verbs_cq=%p was created", context, *verbs_cq);
+        return DOCA_SUCCESS;
+    } catch (doca_error_t err) {
+        return err; /* the constructor reports its failures as doca_error_t */
+    } catch (...) { /* e.g. std::bad_alloc from new: never let it escape this C-style API */
+        return DOCA_ERROR_NO_MEMORY;
+    }
+}
+
+doca_error_t doca_verbs_cq_destroy(struct doca_verbs_cq *verbs_cq) {
+    if (!verbs_cq) {
+        DOCA_LOG(LOG_INFO, "Failed to destroy verbs_cq: parameter verbs_cq is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    /* Tear down explicitly first so a failure can be reported; the object is kept in that case */
+    const doca_error_t destroy_status = verbs_cq->destroy();
+    if (destroy_status == DOCA_SUCCESS) {
+        delete verbs_cq;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_INFO, "Failed to destroy verbs_cq.");
+    return destroy_status;
+}
+
+void doca_verbs_cq_get_wq(struct doca_verbs_cq *verbs_cq, void **cq_buf, uint32_t *cq_num_entries,
+                          uint8_t *cq_entry_size) { /* no argument is NULL-checked */
+    *cq_buf = verbs_cq->get_cq_buf();                 /* start of the CQE buffer */
+    *cq_num_entries = verbs_cq->get_cq_num_entries(); /* number of CQEs */
+    *cq_entry_size = DOCA_VERBS_CQE_SIZE;             /* fixed CQE size in bytes */
+}
+
+void doca_verbs_cq_get_dbr_addr(struct doca_verbs_cq *verbs_cq, uint64_t **uar_db_reg,
+                                uint32_t **ci_dbr, uint32_t **arm_dbr) { /* no argument is NULL-checked */
+    *uar_db_reg = verbs_cq->get_cq_uar_db_reg(); /* UAR register address recorded at creation */
+    *ci_dbr = verbs_cq->get_cq_ci_dbr();         /* doorbell record entry at index MLX5_CQ_SET_CI */
+    *arm_dbr = verbs_cq->get_cq_arm_dbr();       /* doorbell record entry at index MLX5_CQ_ARM_DB */
+}
+
+uint32_t doca_verbs_cq_get_cqn(const struct doca_verbs_cq *verbs_cq) { return verbs_cq->get_cqn(); } /* verbs_cq must not be NULL */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cq.hpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cq.hpp
new file mode 100644
index 00000000000..de863451732
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cq.hpp
@@ -0,0 +1,151 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "host/doca_verbs.h"
+
+struct doca_verbs_cq_attr {
+    uint32_t cq_size{};                          /* number of CQEs; must be a power of two */
+    void *cq_context{};                          /* set by doca_verbs_cq_attr_set_cq_context() */
+    struct doca_verbs_umem *external_umem{};     /* caller-provided CQE buffer umem, or NULL */
+    struct doca_verbs_umem *external_umem_dbr{}; /* caller-provided doorbell umem, or NULL */
+    uint64_t external_umem_offset{};             /* 64-bit, matching the setter's parameter */
+    uint64_t external_umem_dbr_offset{};         /* doorbell offset inside external_umem_dbr */
+    struct doca_verbs_uar *external_uar{};       /* caller-provided UAR, or NULL */
+    enum doca_verbs_cq_overrun cq_overrun{};     /* value-initialized like every other member */
+};
+
+/**
+ *  @brief This struct implements the doca verbs cq
+ */
+struct doca_verbs_cq {
+   public:
+    /**
+     * @brief constructor
+     *
+     * @param [in] ibv_ctx
+     * ibv_context
+     * @param [in] cq_attr
+     * The DOCA IB Verbs CQ attributes
+     * Throws a doca_error_t when the CQ cannot be created.
+     */
+    doca_verbs_cq(struct ibv_context *ibv_ctx, struct doca_verbs_cq_attr &cq_attr);
+
+    /**
+     * @brief destructor
+     */
+    ~doca_verbs_cq();
+
+    /**
+     * @brief destroy the cq
+     *
+     * @return
+     * DOCA_SUCCESS on successful destroy.
+     * Otherwise the error of the first release step that failed; later steps are skipped.
+     *
+     */
+    doca_error_t destroy() noexcept;
+
+    /**
+     * @brief create the cq
+     * Called by the constructor; throws a doca_error_t on failure.
+     */
+    void create(struct doca_verbs_cq_attr &cq_attr);
+    /* Issues the CREATE_CQ DevX command; stores the object handle and the CQ number on success. */
+    doca_error_t create_cq_obj(uint32_t uar_id, uint32_t log_nb_cqes, uint64_t db_umem_offset,
+                               uint32_t db_umem_id, uint32_t wq_umem_id, bool cq_overrun) noexcept;
+
+    /**
+     * @brief Get CQ number
+     *
+     * @return CQ number
+     */
+    uint32_t get_cqn() const noexcept { return m_cqn; }
+
+    /**
+     * @brief Get CQ buff
+     *
+     * @return CQ buff
+     */
+    void *get_cq_buf() const noexcept { return m_cq_buf; }
+
+    /**
+     * @brief Get CQ num entries
+     *
+     * @return CQ num entries
+     */
+    uint32_t get_cq_num_entries() const noexcept { return m_num_cqes; }
+
+    /**
+     * @brief Get CQ UAR reg
+     *
+     * @return CQ UAR reg
+     */
+    uint64_t *get_cq_uar_db_reg() const noexcept { return m_uar_db_reg; }
+
+    /**
+     * @brief Get CQ ci dbr
+     *
+     * @return CQ ci dbr
+     */
+    uint32_t *get_cq_ci_dbr() const noexcept { return m_ci_dbr; }
+
+    /**
+     * @brief Get CQ arm dbr
+     *
+     * @return CQ arm dbr
+     */
+    uint32_t *get_cq_arm_dbr() const noexcept { return m_arm_dbr; }
+
+   private:
+    struct mlx5dv_devx_obj *m_cq_obj{};    /* DevX CQ object; released by destroy() */
+    struct mlx5dv_devx_umem *m_umem_obj{}; /* registration of m_umem_buf (internal umem only) */
+    struct mlx5dv_devx_uar *m_uar_obj{};   /* internally allocated UAR (unset for external UAR) */
+    struct ibv_context *m_ibv_ctx{};
+    uint8_t *m_umem_buf{};                 /* internally allocated CQE + doorbell buffer */
+    uint8_t *m_cq_buf{};                   /* start of the CQE buffer (internal or external) */
+    uint32_t *m_db_buffer{};               /* doorbell record; value-initialized like the rest */
+    uint64_t *m_uar_db_reg{};
+    uint32_t m_num_cqes{};
+    uint32_t m_cqn{};
+    uint32_t *m_ci_dbr{};                  /* &m_db_buffer[MLX5_CQ_SET_CI] */
+    uint32_t *m_arm_dbr{};                 /* &m_db_buffer[MLX5_CQ_ARM_DB] */
+    struct doca_verbs_cq_attr m_cq_attr {};
+    struct doca_verbs_device_attr *m_verbs_device_attr{};
+
+    doca_verbs_cq(doca_verbs_cq const &) = delete;
+    doca_verbs_cq &operator=(doca_verbs_cq const &) = delete;
+};
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cuda_wrapper.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cuda_wrapper.cpp
new file mode 100644
index 00000000000..6b6259a132f
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cuda_wrapper.cpp
@@ -0,0 +1,129 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syslog.h>
+#include <mutex>
+
+#include "doca_verbs_cuda_wrapper.h"
+#include "doca_gpunetio_log.hpp"
+
+/* Function pointer types for the CUDA driver entry points that are resolved at run time with dlsym() */
+typedef CUresult (*cuDeviceGetAttribute_t)(int *pi, CUdevice_attribute attrib, CUdevice dev);
+typedef CUresult (*cuPointerSetAttribute_t)(const void *value, CUpointer_attribute attribute,
+                                            CUdeviceptr ptr);
+typedef CUresult (*cuMemGetHandleForAddressRange_t)(int *pHandle, CUdeviceptr dptr, size_t size,
+                                                    CUmemRangeHandleType handleType,
+                                                    unsigned long long flags);
+typedef CUresult (*cuCtxGetCurrent_t)(CUcontext *pctx);
+
+/* Global function pointers: nullptr until doca_verbs_wrapper_init_once() resolves them (non-static) */
+cuDeviceGetAttribute_t p_cuDeviceGetAttribute = nullptr;
+cuPointerSetAttribute_t p_cuPointerSetAttribute = nullptr;
+cuMemGetHandleForAddressRange_t p_cuMemGetHandleForAddressRange = nullptr;
+cuCtxGetCurrent_t p_cuCtxGetCurrent = nullptr;
+/* Handle returned by dlopen() for libcuda; nullptr while the library is not loaded */
+static void *cuda_handle = nullptr;
+
+/* Look up symbol_name in the opened libcuda handle; logs and yields nullptr when it is missing */
+static void *get_cuda_symbol(const char *symbol_name) {
+    void *resolved = dlsym(cuda_handle, symbol_name);
+    if (resolved == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get symbol %s: %s\n", symbol_name, dlerror());
+    }
+
+    return resolved; /* nullptr tells the caller that the lookup failed */
+}
+
+/* One-time loader: opens libcuda, resolves the needed entry points, writes 0 or -1 to *ret. */
+static void doca_verbs_wrapper_init_once(int *ret) {
+    /* Try the unversioned name first, then the versioned soname */
+    cuda_handle = dlopen("libcuda.so", RTLD_LAZY);
+    if (cuda_handle == nullptr) cuda_handle = dlopen("libcuda.so.1", RTLD_LAZY);
+    if (cuda_handle == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to open libcuda: %s\n", dlerror());
+        *ret = -1;
+        return;
+    }
+
+    /* Resolve every entry point the wrappers forward to */
+    p_cuDeviceGetAttribute = (cuDeviceGetAttribute_t)get_cuda_symbol("cuDeviceGetAttribute");
+    p_cuPointerSetAttribute = (cuPointerSetAttribute_t)get_cuda_symbol("cuPointerSetAttribute");
+    p_cuMemGetHandleForAddressRange =
+        (cuMemGetHandleForAddressRange_t)get_cuda_symbol("cuMemGetHandleForAddressRange");
+    p_cuCtxGetCurrent = (cuCtxGetCurrent_t)get_cuda_symbol("cuCtxGetCurrent");
+
+    const bool all_resolved = p_cuDeviceGetAttribute && p_cuPointerSetAttribute &&
+                              p_cuMemGetHandleForAddressRange && p_cuCtxGetCurrent;
+    if (all_resolved) {
+        *ret = 0;
+        return;
+    }
+
+    /* A partially resolved library is unusable: report it and drop the handle */
+    DOCA_LOG(LOG_ERR, "Failed to get all required CUDA symbols\n");
+    dlclose(cuda_handle);
+    cuda_handle = nullptr;
+    *ret = -1;
+}
+
+static int init_cuda_wrapper(void) {
+    static int ret = 0; /* static: call_once writes it only once, later calls must still see a failure */
+    static std::once_flag once;
+    std::call_once(once, doca_verbs_wrapper_init_once, &ret);
+    return ret; /* 0 when libcuda and all symbols were loaded, -1 otherwise */
+}
+
+/* Wrapper function implementations */
+CUresult doca_verbs_wrapper_cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev) {
+    const bool ready = (init_cuda_wrapper() == 0); /* loads libcuda on first use */
+    return ready ? p_cuDeviceGetAttribute(pi, attrib, dev) : CUDA_ERROR_NOT_INITIALIZED;
+}
+
+CUresult doca_verbs_wrapper_cuPointerSetAttribute(const void *value, CUpointer_attribute attribute,
+                                                  CUdeviceptr ptr) {
+    const bool ready = (init_cuda_wrapper() == 0); /* loads libcuda on first use */
+    return ready ? p_cuPointerSetAttribute(value, attribute, ptr) : CUDA_ERROR_NOT_INITIALIZED;
+}
+
+CUresult doca_verbs_wrapper_cuMemGetHandleForAddressRange(int *pHandle, CUdeviceptr dptr,
+                                                          size_t size,
+                                                          CUmemRangeHandleType handleType,
+                                                          unsigned long long flags) {
+    if (init_cuda_wrapper() == 0) return p_cuMemGetHandleForAddressRange(pHandle, dptr, size, handleType, flags);
+    return CUDA_ERROR_NOT_INITIALIZED; /* libcuda or one of its symbols could not be loaded */
+}
+
+CUresult doca_verbs_wrapper_cuCtxGetCurrent(CUcontext *pctx) {
+    const bool ready = (init_cuda_wrapper() == 0); /* loads libcuda on first use */
+    return ready ? p_cuCtxGetCurrent(pctx) : CUDA_ERROR_NOT_INITIALIZED;
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cuda_wrapper.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cuda_wrapper.h
new file mode 100644
index 00000000000..d748e1b771c
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_cuda_wrapper.h
@@ -0,0 +1,96 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DOCA_VERBS_CUDA_WRAPPER_H
+#define DOCA_VERBS_CUDA_WRAPPER_H
+
+/*
+ * size_t appears in the wrapper prototypes below. In the wrapper build cuda.h is not
+ * included, so pull the definition in explicitly to keep this header self-contained.
+ */
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef DOCA_VERBS_USE_CUDA_WRAPPER
+
+/*
+ * CUDA type declarations for builds without cuda.h.
+ * Only the enumerators used by this library are declared.
+ * NOTE(review): the numeric values are presumably copied from cuda.h - confirm they
+ * still match whenever the supported CUDA driver API version changes.
+ */
+typedef enum cudaError_enum {
+    CUDA_SUCCESS = 0,
+    CUDA_ERROR_NOT_INITIALIZED = 3,
+} CUresult;
+typedef int CUdevice;
+typedef unsigned long long CUdeviceptr;
+typedef enum CUmemRangeHandleType_enum {
+    CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD = 0x1,
+    CU_MEM_RANGE_HANDLE_TYPE_MAX = 0x7FFFFFFF
+} CUmemRangeHandleType;
+
+typedef enum CUpointer_attribute_enum {
+    CU_POINTER_ATTRIBUTE_SYNC_MEMOPS =
+        6, /**< Synchronize every synchronous memory operation initiated on this region */
+} CUpointer_attribute;
+
+typedef enum CUdevice_attribute_enum {
+    CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED =
+        124, /**< Device supports buffer sharing with dma_buf mechanism. */
+} CUdevice_attribute;
+
+typedef void *CUcontext;
+
+/*
+ * Wrapper function declarations.
+ * Each one mirrors the signature of the CUDA driver call named in its suffix.
+ */
+CUresult doca_verbs_wrapper_cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+CUresult doca_verbs_wrapper_cuPointerSetAttribute(const void *value, CUpointer_attribute attribute,
+                                                  CUdeviceptr ptr);
+CUresult doca_verbs_wrapper_cuMemGetHandleForAddressRange(int *pHandle, CUdeviceptr dptr,
+                                                          size_t size,
+                                                          CUmemRangeHandleType handleType,
+                                                          unsigned long long flags);
+CUresult doca_verbs_wrapper_cuCtxGetCurrent(CUcontext *pctx);
+
+/* Initialization function */
+int doca_cuda_wrapper_init(void);
+
+#else
+
+#include <cuda.h>
+
+/* Direct API calls when wrapper is not enabled: each wrapper name aliases the CUDA call */
+#define doca_verbs_wrapper_cuDeviceGetAttribute cuDeviceGetAttribute
+#define doca_verbs_wrapper_cuPointerSetAttribute cuPointerSetAttribute
+#define doca_verbs_wrapper_cuMemGetHandleForAddressRange cuMemGetHandleForAddressRange
+#define doca_verbs_wrapper_cuCtxGetCurrent cuCtxGetCurrent
+
+/* No initialization needed when wrapper is not enabled; the call expands to 0 */
+#define doca_cuda_wrapper_init() 0
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DOCA_VERBS_CUDA_WRAPPER_H */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_device_attr.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_device_attr.cpp
new file mode 100644
index 00000000000..8ed34e91fa2
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_device_attr.cpp
@@ -0,0 +1,266 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <mutex>
+#include <time.h>
+#include <string.h>
+
+#include "host/mlx5_prm.h"
+#include "host/mlx5_ifc.h"
+
+#include "doca_verbs_net_wrapper.h"
+#include "host/doca_verbs.h"
+#include "doca_verbs_device_attr.hpp"
+
+/*
+ * Encoded GID table sizes as read from the gid_table_size capability field.
+ * translate_gid_table_size() maps each encoding to its number of entries.
+ */
+#define PRIV_DOCA_MLX5_GID_TABLE_8_ENTRIES 0x0
+#define PRIV_DOCA_MLX5_GID_TABLE_16_ENTRIES 0x1
+#define PRIV_DOCA_MLX5_GID_TABLE_32_ENTRIES 0x2
+#define PRIV_DOCA_MLX5_GID_TABLE_64_ENTRIES 0x3
+#define PRIV_DOCA_MLX5_GID_TABLE_128_ENTRIES 0x4
+
+/*
+ * Value OR-ed into the QUERY_HCA_CAP op_mod together with the capability group.
+ * NOTE(review): presumably selects maximum vs. current capability values - confirm
+ * against the PRM.
+ */
+#define PRIV_DOCA_MLX5_HCA_CAP_OPMOD_GET_MAX 0
+#define PRIV_DOCA_MLX5_HCA_CAP_OPMOD_GET_CUR 1
+
+/*********************************************************************************************************************
+ * Helper functions
+ *********************************************************************************************************************/
+
+namespace {
+
+/*
+ * Convert an encoded GID table size (one of the PRIV_DOCA_MLX5_GID_TABLE_*_ENTRIES
+ * values) into the number of entries it stands for.
+ * An unrecognized encoding yields 0.
+ */
+uint16_t translate_gid_table_size(uint16_t gid_table_size_prm) {
+    uint16_t num_entries = 0;  // stays 0 when the encoding is not recognized
+
+    if (gid_table_size_prm == PRIV_DOCA_MLX5_GID_TABLE_8_ENTRIES) {
+        num_entries = 8;
+    } else if (gid_table_size_prm == PRIV_DOCA_MLX5_GID_TABLE_16_ENTRIES) {
+        num_entries = 16;
+    } else if (gid_table_size_prm == PRIV_DOCA_MLX5_GID_TABLE_32_ENTRIES) {
+        num_entries = 32;
+    } else if (gid_table_size_prm == PRIV_DOCA_MLX5_GID_TABLE_64_ENTRIES) {
+        num_entries = 64;
+    } else if (gid_table_size_prm == PRIV_DOCA_MLX5_GID_TABLE_128_ENTRIES) {
+        num_entries = 128;
+    }
+
+    return num_entries;
+}
+
+} /* namespace */
+
+/**********************************************************************************************************************
+ * doca_verbs_device_attr Member Functions
+ *********************************************************************************************************************/
+
+/*
+ * Build the attribute object by querying the device behind ibv_ctx.
+ * Any exception raised by query_caps() is logged here and then rethrown unchanged,
+ * so construction fails with the original exception.
+ */
+doca_verbs_device_attr::doca_verbs_device_attr(struct ibv_context *ibv_ctx) {
+    try {
+        query_caps(ibv_ctx);
+    } catch (...) {
+        DOCA_LOG(LOG_ERR, "Failed to create device_attr");
+        // Rethrow so the caller still sees the original error
+        throw;
+    }
+}
+
+/*
+ * Fill every member of this object from the device behind ibv_ctx.
+ *
+ * Three queries are issued in order:
+ *   1. ibv_query_device            - generic verbs limits (QPs, CQs, MRs, ...)
+ *   2. QUERY_HCA_CAP, general caps - port type, RC support, WQE sizes, IB GID table size
+ *   3. QUERY_HCA_CAP, RoCE caps    - UDP source port range, Ethernet GID table size
+ *
+ * Throws the doca_error_t returned by the first failing query.
+ */
+void doca_verbs_device_attr::query_caps(struct ibv_context *ibv_ctx) {
+    struct ibv_device_attr device_attr {};
+    auto ret = doca_verbs_wrapper_ibv_query_device(ibv_ctx, &device_attr);
+    if (ret != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to query device attr");
+        throw ret;
+    }
+
+    m_max_qp = device_attr.max_qp;
+    m_max_qp_wr = device_attr.max_qp_wr;
+    m_max_sge = device_attr.max_sge;
+    m_max_cq = device_attr.max_cq;
+    m_max_cqe = device_attr.max_cqe;
+    m_max_mr = device_attr.max_mr;
+    m_max_pd = device_attr.max_pd;
+    m_max_ah = device_attr.max_ah;
+    m_max_srq = device_attr.max_srq;
+    m_max_srq_wr = device_attr.max_srq_wr;
+    m_max_srq_sge = device_attr.max_srq_sge;
+    m_max_pkeys = device_attr.max_pkeys;
+    m_phys_port_cnt = device_attr.phys_port_cnt;
+
+    uint32_t in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {0};
+    uint32_t out[MLX5_ST_SZ_DW(query_hca_cap_out)] = {0};
+
+    // General device capabilities
+    DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+    DEVX_SET(query_hca_cap_in, in, op_mod,
+             PRIV_DOCA_MLX5_HCA_CAP_OPMOD_GET_CUR | MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE);
+
+    ret = doca_verbs_wrapper_mlx5dv_devx_general_cmd(ibv_ctx, in, sizeof(in), out, sizeof(out));
+    if (ret != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to query device capabilities");
+        throw ret;
+    }
+
+    m_port_type = DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.port_type);
+    // On IB ports the GID table size comes encoded in the general capabilities
+    if (m_port_type == MLX5_CAP_PORT_TYPE_IB)
+        m_gid_table_size = translate_gid_table_size(
+            DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.gid_table_size));
+    m_is_qp_rc_supported = DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.rc);
+    m_is_rts2rts_qp_dscp_supported =
+        DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.rts2rts_qp_dscp);
+    m_max_sq_desc_size = DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.max_wqe_sz_sq);
+    m_max_rq_desc_size = DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.max_wqe_sz_rq);
+    // The field is a log2 value; shift an unsigned 1 so the result is computed in the
+    // destination's unsigned type instead of going through a signed int shift.
+    m_max_send_wqebb = 1U << DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.log_max_qp_sz);
+
+    // Reuse the command buffers for the second query
+    memset(in, 0, sizeof(in));
+    memset(out, 0, sizeof(out));
+
+    // RoCE capabilities
+    DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
+    DEVX_SET(query_hca_cap_in, in, op_mod,
+             PRIV_DOCA_MLX5_HCA_CAP_OPMOD_GET_CUR | MLX5_SET_HCA_CAP_OP_MOD_ROCE);
+
+    ret = doca_verbs_wrapper_mlx5dv_devx_general_cmd(ibv_ctx, in, sizeof(in), out, sizeof(out));
+    if (ret != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to query ROCE capabilities");
+        throw ret;
+    }
+
+    // On Ethernet ports the GID table size is the RoCE address table size (not encoded)
+    if (m_port_type == MLX5_CAP_PORT_TYPE_ETH)
+        m_gid_table_size =
+            DEVX_GET(query_hca_cap_out, out, capability.roce_caps.roce_address_table_size);
+    m_min_udp_sport =
+        DEVX_GET(query_hca_cap_out, out, capability.roce_caps.r_roce_min_src_udp_port);
+    m_max_udp_sport =
+        DEVX_GET(query_hca_cap_out, out, capability.roce_caps.r_roce_max_src_udp_port);
+}
+
+/**********************************************************************************************************************
+ * Public API functions
+ *********************************************************************************************************************/
+
+/*
+ * Allocate a doca_verbs_device_attr describing the device behind `context`.
+ *
+ * @param [in] context            IBV context to query; must not be NULL
+ * @param [out] verbs_device_attr Receives the new object on success; left untouched on
+ *                                failure. Release it with doca_verbs_device_attr_free().
+ *
+ * @return DOCA_SUCCESS on success, DOCA_ERROR_INVALID_VALUE for a NULL argument, the
+ * doca_error_t thrown by the failing device query, or DOCA_ERROR_UNEXPECTED for any
+ * other exception (for example an allocation failure). No exception leaves this
+ * function, since it reports errors through its return code.
+ */
+doca_error_t doca_verbs_query_device(struct ibv_context *context,
+                                     struct doca_verbs_device_attr **verbs_device_attr) {
+    if (context == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to query doca_verbs_device_attr. param context=NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (verbs_device_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to query doca_verbs_device_attr. param verbs_device_attr=NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    try {
+        *verbs_device_attr = new doca_verbs_device_attr(context);
+        return DOCA_SUCCESS;
+    } catch (doca_error_t err) {
+        return err;
+    } catch (...) {
+        // `new` itself may throw; translate anything that is not a doca_error_t
+        DOCA_LOG(LOG_ERR, "Failed to query doca_verbs_device_attr. unexpected exception");
+        return DOCA_ERROR_UNEXPECTED;
+    }
+}
+
+/*
+ * Destroy a doca_verbs_device_attr object.
+ * Returns DOCA_SUCCESS after deleting it, or DOCA_ERROR_INVALID_VALUE (with an error
+ * log) when given NULL.
+ */
+doca_error_t doca_verbs_device_attr_free(struct doca_verbs_device_attr *verbs_device_attr) {
+    if (verbs_device_attr != nullptr) {
+        delete verbs_device_attr;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to free doca_verbs_device_attr. param verbs_device_attr=NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+/*
+ * Attribute getters.
+ * Each one returns the correspondingly named member of verbs_device_attr as filled in
+ * by doca_verbs_device_attr::query_caps(). The pointer is dereferenced without a NULL
+ * check, so callers must pass a valid object.
+ */
+
+/* Value of ibv_device_attr.max_qp captured at query time */
+uint32_t doca_verbs_device_attr_get_max_qp(const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_qp;
+}
+
+/* Value of ibv_device_attr.max_qp_wr captured at query time */
+uint32_t doca_verbs_device_attr_get_max_qp_wr(
+    const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_qp_wr;
+}
+
+/* Value of ibv_device_attr.max_sge captured at query time */
+uint32_t doca_verbs_device_attr_get_max_sge(
+    const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_sge;
+}
+
+/* Value of ibv_device_attr.max_cq captured at query time */
+uint32_t doca_verbs_device_attr_get_max_cq(const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_cq;
+}
+
+/* Value of ibv_device_attr.max_cqe captured at query time */
+uint32_t doca_verbs_device_attr_get_max_cqe(
+    const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_cqe;
+}
+
+/* Value of ibv_device_attr.max_mr captured at query time */
+uint32_t doca_verbs_device_attr_get_max_mr(const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_mr;
+}
+
+/* Value of ibv_device_attr.max_pd captured at query time */
+uint32_t doca_verbs_device_attr_get_max_pd(const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_pd;
+}
+
+/* Value of ibv_device_attr.max_ah captured at query time */
+uint32_t doca_verbs_device_attr_get_max_ah(const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_ah;
+}
+
+/* Value of ibv_device_attr.max_srq captured at query time */
+uint32_t doca_verbs_device_attr_get_max_srq(
+    const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_srq;
+}
+
+/* Value of ibv_device_attr.max_srq_wr captured at query time */
+uint32_t doca_verbs_device_attr_get_max_srq_wr(
+    const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_srq_wr;
+}
+
+/* Value of ibv_device_attr.max_srq_sge captured at query time */
+uint32_t doca_verbs_device_attr_get_max_srq_sge(
+    const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_srq_sge;
+}
+
+/*
+ * Value of ibv_device_attr.max_pkeys captured at query time.
+ * Returned as uint16_t, unlike the other getters.
+ */
+uint16_t doca_verbs_device_attr_get_max_pkeys(
+    const struct doca_verbs_device_attr *verbs_device_attr) {
+    return verbs_device_attr->m_max_pkeys;
+}
+
+/*
+ * Report whether the device supports the given QP type.
+ * Only DOCA_VERBS_QP_TYPE_RC is recognized: it yields DOCA_SUCCESS when the device
+ * advertised RC support and DOCA_ERROR_NOT_SUPPORTED otherwise. Any other type is
+ * logged and rejected with DOCA_ERROR_INVALID_VALUE.
+ */
+doca_error_t doca_verbs_device_attr_get_is_qp_type_supported(
+    const struct doca_verbs_device_attr *verbs_device_attr, uint32_t qp_type) {
+    if (qp_type != DOCA_VERBS_QP_TYPE_RC) {
+        DOCA_LOG(LOG_ERR, "Failed to check if QP type is supported. param QP type is invalid");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (verbs_device_attr->m_is_qp_rc_supported) {
+        return DOCA_SUCCESS;
+    }
+    return DOCA_ERROR_NOT_SUPPORTED;
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_device_attr.hpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_device_attr.hpp
new file mode 100644
index 00000000000..21e52fe3bf1
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_device_attr.hpp
@@ -0,0 +1,96 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <mutex>
+#include <time.h>
+
+#include "host/mlx5_prm.h"
+#include "host/mlx5_ifc.h"
+
+#include "doca_internal.hpp"
+
+/**
+ *  @brief This struct implements the doca rdma_verbs device attributes
+ */
+struct doca_verbs_device_attr {
+   public:
+    /**
+     * @brief constructor
+     *
+     * @param [in] ibv_ctx
+     * IBV context to query device attributes from
+     *
+     */
+    doca_verbs_device_attr(struct ibv_context *ibv_ctx);
+
+    /**
+     * @brief destructor
+     */
+    ~doca_verbs_device_attr() = default;
+
+    /**
+     * @brief Query device capabilities
+     *
+     * @param [in] ibv_ctx
+     * IBV context to query device attributes from
+     *
+     */
+    void query_caps(struct ibv_context *ibv_ctx);
+
+    /* Copied from the same-named fields of struct ibv_device_attr */
+    uint32_t m_max_qp{};
+    uint32_t m_max_qp_wr{};
+    uint32_t m_max_sge{};
+    uint32_t m_max_cq{};
+    uint32_t m_max_cqe{};
+    uint32_t m_max_mr{};
+    uint32_t m_max_pd{};
+    uint32_t m_max_ah{};
+    uint32_t m_max_srq{};
+    uint32_t m_max_srq_wr{};
+    uint32_t m_max_srq_sge{};
+    uint32_t m_max_pkeys{};
+    /* From the general HCA capabilities: max_wqe_sz_sq / max_wqe_sz_rq */
+    uint32_t m_max_sq_desc_size{};
+    uint32_t m_max_rq_desc_size{};
+    /* 2^log_max_qp_sz from the general HCA capabilities */
+    uint32_t m_max_send_wqebb{};
+    /* From the RoCE capabilities: r_roce_min_src_udp_port / r_roce_max_src_udp_port */
+    uint16_t m_min_udp_sport{};
+    uint16_t m_max_udp_sport{};
+    /* IB ports: decoded gid_table_size; Ethernet ports: roce_address_table_size */
+    uint16_t m_gid_table_size{};
+    /* From the general HCA capabilities: rc, port_type, rts2rts_qp_dscp */
+    uint8_t m_is_qp_rc_supported{};
+    uint8_t m_port_type{};
+    uint8_t m_is_rts2rts_qp_dscp_supported{};
+    /* Copied from ibv_device_attr.phys_port_cnt */
+    uint8_t m_phys_port_cnt{};
+
+   private:
+    /* Copy assignment is disabled */
+    doca_verbs_device_attr &operator=(doca_verbs_device_attr const &) = delete;
+};
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_ibv_wrapper.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_ibv_wrapper.cpp
new file mode 100644
index 00000000000..864fbfdae57
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_ibv_wrapper.cpp
@@ -0,0 +1,374 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_verbs_ibv_wrapper.cpp
+ * @brief Implementation of IB Verbs API wrapper using dlopen
+ *
+ * This file implements the IB Verbs API wrapper using dynamic loading.
+ * It is only compiled when DOCA_VERBS_USE_IBV_WRAPPER is defined.
+ */
+
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <mutex>
+
+#include "doca_verbs_net_wrapper.h"
+#include "host/doca_error.h"
+
+/* *********** Function Pointer Types *********** */
+
+/*
+ * One typedef per wrapped libibverbs entry point. Each describes the signature the
+ * symbol of the same name (without the _func_t suffix) is cast to after dlsym().
+ */
+
+/* Device enumeration and open/close */
+typedef struct ibv_device **(*ibv_get_device_list_func_t)(int *num_devices);
+typedef void (*ibv_free_device_list_func_t)(struct ibv_device **list);
+typedef const char *(*ibv_get_device_name_func_t)(struct ibv_device *device);
+typedef struct ibv_context *(*ibv_open_device_func_t)(struct ibv_device *device);
+typedef int (*ibv_close_device_func_t)(struct ibv_context *context);
+/* Protection domains and memory regions */
+typedef struct ibv_pd *(*ibv_alloc_pd_func_t)(struct ibv_context *context);
+typedef int (*ibv_dealloc_pd_func_t)(struct ibv_pd *pd);
+typedef struct ibv_mr *(*ibv_reg_mr_func_t)(struct ibv_pd *pd, void *addr, size_t length,
+                                            int access);
+typedef int (*ibv_dereg_mr_func_t)(struct ibv_mr *mr);
+/* Device, port and GID queries */
+typedef int (*ibv_query_device_func_t)(struct ibv_context *context,
+                                       struct ibv_device_attr *device_attr);
+typedef int (*ibv_query_port_func_t)(struct ibv_context *context, uint8_t port_num,
+                                     struct ibv_port_attr *port_attr);
+typedef int (*ibv_query_gid_func_t)(struct ibv_context *context, uint8_t port_num, int index,
+                                    union ibv_gid *gid);
+/* Address handles, completion queues and shared receive queues */
+typedef struct ibv_ah *(*ibv_create_ah_func_t)(struct ibv_pd *pd, struct ibv_ah_attr *attr);
+typedef int (*ibv_destroy_ah_func_t)(struct ibv_ah *ah);
+typedef struct ibv_cq *(*ibv_create_cq_func_t)(struct ibv_context *context, int cqe,
+                                               void *cq_context, struct ibv_comp_channel *channel,
+                                               int comp_vector);
+typedef int (*ibv_destroy_cq_func_t)(struct ibv_cq *cq);
+typedef struct ibv_srq *(*ibv_create_srq_func_t)(struct ibv_pd *pd,
+                                                 struct ibv_srq_init_attr *srq_init_attr);
+typedef int (*ibv_destroy_srq_func_t)(struct ibv_srq *srq);
+/* Queue pairs */
+typedef struct ibv_qp *(*ibv_create_qp_func_t)(struct ibv_pd *pd,
+                                               struct ibv_qp_init_attr *qp_init_attr);
+typedef int (*ibv_destroy_qp_func_t)(struct ibv_qp *qp);
+typedef int (*ibv_modify_qp_func_t)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask);
+typedef int (*ibv_query_qp_func_t)(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask,
+                                   struct ibv_qp_init_attr *init_attr);
+
+/* *********** Global Function Pointers *********** */
+
+/*
+ * Resolved addresses of the libibverbs entry points. All start out NULL and are
+ * assigned by doca_verbs_wrapper_init_once() through dlsym().
+ */
+static ibv_get_device_list_func_t real_ibv_get_device_list = NULL;
+static ibv_free_device_list_func_t real_ibv_free_device_list = NULL;
+static ibv_get_device_name_func_t real_ibv_get_device_name = NULL;
+static ibv_open_device_func_t real_ibv_open_device = NULL;
+static ibv_close_device_func_t real_ibv_close_device = NULL;
+static ibv_alloc_pd_func_t real_ibv_alloc_pd = NULL;
+static ibv_dealloc_pd_func_t real_ibv_dealloc_pd = NULL;
+static ibv_reg_mr_func_t real_ibv_reg_mr = NULL;
+static ibv_dereg_mr_func_t real_ibv_dereg_mr = NULL;
+static ibv_query_device_func_t real_ibv_query_device = NULL;
+static ibv_query_port_func_t real_ibv_query_port = NULL;
+static ibv_query_gid_func_t real_ibv_query_gid = NULL;
+static ibv_create_ah_func_t real_ibv_create_ah = NULL;
+static ibv_destroy_ah_func_t real_ibv_destroy_ah = NULL;
+static ibv_create_cq_func_t real_ibv_create_cq = NULL;
+static ibv_destroy_cq_func_t real_ibv_destroy_cq = NULL;
+static ibv_create_srq_func_t real_ibv_create_srq = NULL;
+static ibv_destroy_srq_func_t real_ibv_destroy_srq = NULL;
+static ibv_create_qp_func_t real_ibv_create_qp = NULL;
+static ibv_destroy_qp_func_t real_ibv_destroy_qp = NULL;
+static ibv_modify_qp_func_t real_ibv_modify_qp = NULL;
+static ibv_query_qp_func_t real_ibv_query_qp = NULL;
+
+/* *********** Library Handle *********** */
+
+/* dlopen() handle of libibverbs; NULL until loaded and again after a failed load */
+static void *ibverbs_handle = NULL;
+
+/* *********** Helper Functions *********** */
+
+/**
+ * @brief Load libibverbs with dlopen and resolve every wrapped IB Verbs entry point
+ *
+ * Meant to be executed exactly once through std::call_once (see
+ * init_ibverbs_library()). If the library or any symbol cannot be loaded, the library
+ * handle and ALL function pointers are left NULL, so the teardown wrappers that only
+ * test their own pointer can never call into a library that has been unloaded.
+ *
+ * @param [out] ret
+ * Set to 0 on success, -1 on failure
+ */
+static void doca_verbs_wrapper_init_once(int *ret) {
+    /* Try the versioned soname first, then the unversioned development name */
+    ibverbs_handle = dlopen("libibverbs.so.1", RTLD_LAZY);
+    if (!ibverbs_handle) {
+        ibverbs_handle = dlopen("libibverbs.so", RTLD_LAZY);
+    }
+    if (!ibverbs_handle) {
+        /* dlerror() may legitimately return NULL; never hand that to %s */
+        const char *err = dlerror();
+        fprintf(stderr, "Failed to load libibverbs: %s\n", err ? err : "unknown error");
+        *ret = -1;
+        return;
+    }
+
+    /* Load all function pointers */
+    real_ibv_get_device_list =
+        (ibv_get_device_list_func_t)dlsym(ibverbs_handle, "ibv_get_device_list");
+    real_ibv_free_device_list =
+        (ibv_free_device_list_func_t)dlsym(ibverbs_handle, "ibv_free_device_list");
+    real_ibv_get_device_name =
+        (ibv_get_device_name_func_t)dlsym(ibverbs_handle, "ibv_get_device_name");
+    real_ibv_open_device = (ibv_open_device_func_t)dlsym(ibverbs_handle, "ibv_open_device");
+    real_ibv_close_device = (ibv_close_device_func_t)dlsym(ibverbs_handle, "ibv_close_device");
+    real_ibv_alloc_pd = (ibv_alloc_pd_func_t)dlsym(ibverbs_handle, "ibv_alloc_pd");
+    real_ibv_dealloc_pd = (ibv_dealloc_pd_func_t)dlsym(ibverbs_handle, "ibv_dealloc_pd");
+    real_ibv_reg_mr = (ibv_reg_mr_func_t)dlsym(ibverbs_handle, "ibv_reg_mr");
+    real_ibv_dereg_mr = (ibv_dereg_mr_func_t)dlsym(ibverbs_handle, "ibv_dereg_mr");
+    real_ibv_query_device = (ibv_query_device_func_t)dlsym(ibverbs_handle, "ibv_query_device");
+    real_ibv_query_port = (ibv_query_port_func_t)dlsym(ibverbs_handle, "ibv_query_port");
+    real_ibv_query_gid = (ibv_query_gid_func_t)dlsym(ibverbs_handle, "ibv_query_gid");
+    real_ibv_create_ah = (ibv_create_ah_func_t)dlsym(ibverbs_handle, "ibv_create_ah");
+    real_ibv_destroy_ah = (ibv_destroy_ah_func_t)dlsym(ibverbs_handle, "ibv_destroy_ah");
+    real_ibv_create_cq = (ibv_create_cq_func_t)dlsym(ibverbs_handle, "ibv_create_cq");
+    real_ibv_destroy_cq = (ibv_destroy_cq_func_t)dlsym(ibverbs_handle, "ibv_destroy_cq");
+    real_ibv_create_srq = (ibv_create_srq_func_t)dlsym(ibverbs_handle, "ibv_create_srq");
+    real_ibv_destroy_srq = (ibv_destroy_srq_func_t)dlsym(ibverbs_handle, "ibv_destroy_srq");
+    real_ibv_create_qp = (ibv_create_qp_func_t)dlsym(ibverbs_handle, "ibv_create_qp");
+    real_ibv_destroy_qp = (ibv_destroy_qp_func_t)dlsym(ibverbs_handle, "ibv_destroy_qp");
+    real_ibv_modify_qp = (ibv_modify_qp_func_t)dlsym(ibverbs_handle, "ibv_modify_qp");
+    real_ibv_query_qp = (ibv_query_qp_func_t)dlsym(ibverbs_handle, "ibv_query_qp");
+
+    /* Check if all functions were loaded successfully */
+    if (!real_ibv_get_device_list || !real_ibv_free_device_list || !real_ibv_get_device_name ||
+        !real_ibv_open_device || !real_ibv_close_device || !real_ibv_alloc_pd ||
+        !real_ibv_dealloc_pd || !real_ibv_reg_mr || !real_ibv_dereg_mr || !real_ibv_query_device ||
+        !real_ibv_query_port || !real_ibv_query_gid || !real_ibv_create_ah ||
+        !real_ibv_destroy_ah || !real_ibv_create_cq || !real_ibv_destroy_cq ||
+        !real_ibv_create_srq || !real_ibv_destroy_srq || !real_ibv_create_qp ||
+        !real_ibv_destroy_qp || !real_ibv_modify_qp || !real_ibv_query_qp) {
+        const char *err = dlerror();
+        fprintf(stderr, "Failed to load IB Verbs functions: %s\n", err ? err : "unknown error");
+
+        /*
+         * The symbols that did resolve point into the library that is about to be
+         * unloaded; clear them all so nothing can call through a dangling pointer.
+         */
+        real_ibv_get_device_list = NULL;
+        real_ibv_free_device_list = NULL;
+        real_ibv_get_device_name = NULL;
+        real_ibv_open_device = NULL;
+        real_ibv_close_device = NULL;
+        real_ibv_alloc_pd = NULL;
+        real_ibv_dealloc_pd = NULL;
+        real_ibv_reg_mr = NULL;
+        real_ibv_dereg_mr = NULL;
+        real_ibv_query_device = NULL;
+        real_ibv_query_port = NULL;
+        real_ibv_query_gid = NULL;
+        real_ibv_create_ah = NULL;
+        real_ibv_destroy_ah = NULL;
+        real_ibv_create_cq = NULL;
+        real_ibv_destroy_cq = NULL;
+        real_ibv_create_srq = NULL;
+        real_ibv_destroy_srq = NULL;
+        real_ibv_create_qp = NULL;
+        real_ibv_destroy_qp = NULL;
+        real_ibv_modify_qp = NULL;
+        real_ibv_query_qp = NULL;
+
+        dlclose(ibverbs_handle);
+        ibverbs_handle = NULL;
+        *ret = -1;
+        return;
+    }
+
+    *ret = 0;
+}
+
+/*
+ * Run the one-time libibverbs loading and report its outcome.
+ *
+ * The status is kept in a function-local static on purpose: std::call_once only
+ * invokes the initializer on the first call, so a per-call local would read back
+ * as 0 (success) on every later call even when the initial load failed, and the
+ * wrappers below would then call through NULL function pointers.
+ * std::call_once also orders the initializer's write before every caller's read.
+ *
+ * Returns 0 when the library and all symbols were loaded, -1 otherwise.
+ */
+static int init_ibverbs_library(void) {
+    static int ret = 0;
+    static std::once_flag once;
+    std::call_once(once, doca_verbs_wrapper_init_once, &ret);
+    return ret;
+}
+
+/* *********** Wrapper Implementations *********** */
+
+/*
+ * Fetch the IB device list through the dynamically loaded ibv_get_device_list.
+ * DOCA_ERROR_NOT_FOUND: init_ibverbs_library() reported failure.
+ * DOCA_ERROR_DRIVER: the call returned NULL (which is still stored in *device_list).
+ */
+doca_error_t doca_verbs_wrapper_ibv_get_device_list(int *num_devices,
+                                                    struct ibv_device ***device_list) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+
+    struct ibv_device **devices = real_ibv_get_device_list(num_devices);
+    *device_list = devices;
+    if (devices == NULL) return DOCA_ERROR_DRIVER;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Release a device list through ibv_free_device_list.
+ * Does not trigger library loading; DOCA_ERROR_DRIVER when the symbol is not resolved.
+ */
+doca_error_t doca_verbs_wrapper_ibv_free_device_list(struct ibv_device **list) {
+    if (!real_ibv_free_device_list) return DOCA_ERROR_DRIVER;
+
+    real_ibv_free_device_list(list);
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Look up a device's name through ibv_get_device_name.
+ * DOCA_ERROR_NOT_FOUND: init_ibverbs_library() reported failure.
+ * DOCA_ERROR_DRIVER: the call returned NULL (which is still stored in *device_name).
+ */
+doca_error_t doca_verbs_wrapper_ibv_get_device_name(struct ibv_device *device,
+                                                    const char **device_name) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+
+    const char *name = real_ibv_get_device_name(device);
+    *device_name = name;
+    if (name == NULL) return DOCA_ERROR_DRIVER;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Open a device through ibv_open_device.
+ * DOCA_ERROR_NOT_FOUND: init_ibverbs_library() reported failure.
+ * DOCA_ERROR_DRIVER: the call returned NULL (which is still stored in *context).
+ */
+doca_error_t doca_verbs_wrapper_ibv_open_device(struct ibv_device *device,
+                                                struct ibv_context **context) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+
+    struct ibv_context *opened = real_ibv_open_device(device);
+    *context = opened;
+    if (opened == NULL) return DOCA_ERROR_DRIVER;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Close a device context through ibv_close_device.
+ * Does not trigger library loading; DOCA_ERROR_DRIVER when the symbol is not resolved
+ * or the call returns non-zero.
+ */
+doca_error_t doca_verbs_wrapper_ibv_close_device(struct ibv_context *context) {
+    if (!real_ibv_close_device) return DOCA_ERROR_DRIVER;
+
+    const int status = real_ibv_close_device(context);
+    if (status != 0) return DOCA_ERROR_DRIVER;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Allocate a protection domain through ibv_alloc_pd.
+ * DOCA_ERROR_NOT_FOUND: init_ibverbs_library() reported failure.
+ * DOCA_ERROR_DRIVER: the call returned NULL (which is still stored in *pd).
+ */
+doca_error_t doca_verbs_wrapper_ibv_alloc_pd(struct ibv_context *context, struct ibv_pd **pd) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+
+    struct ibv_pd *allocated = real_ibv_alloc_pd(context);
+    *pd = allocated;
+    if (allocated == NULL) return DOCA_ERROR_DRIVER;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Free a protection domain through ibv_dealloc_pd.
+ * Does not trigger library loading; DOCA_ERROR_DRIVER when the symbol is not resolved
+ * or the call returns non-zero.
+ */
+doca_error_t doca_verbs_wrapper_ibv_dealloc_pd(struct ibv_pd *pd) {
+    if (!real_ibv_dealloc_pd) return DOCA_ERROR_DRIVER;
+
+    const int status = real_ibv_dealloc_pd(pd);
+    if (status != 0) return DOCA_ERROR_DRIVER;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Register a memory region through ibv_reg_mr.
+ * DOCA_ERROR_NOT_FOUND: init_ibverbs_library() reported failure.
+ * DOCA_ERROR_DRIVER: the call returned NULL (which is still stored in *mr).
+ */
+doca_error_t doca_verbs_wrapper_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access,
+                                           struct ibv_mr **mr) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+
+    struct ibv_mr *registered = real_ibv_reg_mr(pd, addr, length, access);
+    *mr = registered;
+    if (registered == NULL) return DOCA_ERROR_DRIVER;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Deregister a memory region through ibv_dereg_mr.
+ * Does not trigger library loading; DOCA_ERROR_DRIVER when the symbol is not resolved
+ * or the call returns non-zero.
+ */
+doca_error_t doca_verbs_wrapper_ibv_dereg_mr(struct ibv_mr *mr) {
+    if (!real_ibv_dereg_mr) return DOCA_ERROR_DRIVER;
+
+    const int status = real_ibv_dereg_mr(mr);
+    if (status != 0) return DOCA_ERROR_DRIVER;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Query device attributes through ibv_query_device.
+ * DOCA_ERROR_NOT_FOUND: init_ibverbs_library() reported failure.
+ * DOCA_ERROR_DRIVER: the call returned non-zero.
+ */
+doca_error_t doca_verbs_wrapper_ibv_query_device(struct ibv_context *context,
+                                                 struct ibv_device_attr *device_attr) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+
+    const int status = real_ibv_query_device(context, device_attr);
+    if (status != 0) return DOCA_ERROR_DRIVER;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Query port attributes through the dynamically resolved ibv_query_port symbol.
+ * DOCA_ERROR_NOT_FOUND: init_ibverbs_library() reported failure.
+ * DOCA_ERROR_DRIVER: the call returned non-zero.
+ */
+doca_error_t doca_verbs_wrapper_ibv_query_port(struct ibv_context *context, uint8_t port_num,
+                                               struct ibv_port_attr *port_attr) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+
+    const int status = real_ibv_query_port(context, port_num, port_attr);
+    if (status != 0) return DOCA_ERROR_DRIVER;
+    return DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_query_gid once init_ibverbs_library() has succeeded.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_ibv_query_gid(struct ibv_context *context, uint8_t port_num,
+                                              int index, union ibv_gid *gid) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = real_ibv_query_gid(context, port_num, index, gid);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_create_ah and stores the returned pointer in *ah, even when it is NULL.
+ * Returns DOCA_ERROR_NOT_FOUND if init_ibverbs_library() fails, DOCA_ERROR_DRIVER on NULL. */
+doca_error_t doca_verbs_wrapper_ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr,
+                                              struct ibv_ah **ah) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+    *ah = real_ibv_create_ah(pd, attr);
+    return (*ah == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_destroy_ah; this wrapper itself never calls init_ibverbs_library().
+ * A null real_ibv_destroy_ah pointer or a non-zero status both yield DOCA_ERROR_DRIVER. */
+doca_error_t doca_verbs_wrapper_ibv_destroy_ah(struct ibv_ah *ah) {
+    if (!real_ibv_destroy_ah) return DOCA_ERROR_DRIVER;
+    const int rc = real_ibv_destroy_ah(ah);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_create_cq and stores the returned pointer in *cq, even when it is NULL.
+ * Returns DOCA_ERROR_NOT_FOUND if init_ibverbs_library() fails, DOCA_ERROR_DRIVER on NULL. */
+doca_error_t doca_verbs_wrapper_ibv_create_cq(struct ibv_context *context, int cqe,
+                                              void *cq_context, struct ibv_comp_channel *channel,
+                                              int comp_vector, struct ibv_cq **cq) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+    *cq = real_ibv_create_cq(context, cqe, cq_context, channel, comp_vector);
+    return (*cq == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_destroy_cq; this wrapper itself never calls init_ibverbs_library().
+ * A null real_ibv_destroy_cq pointer or a non-zero status both yield DOCA_ERROR_DRIVER. */
+doca_error_t doca_verbs_wrapper_ibv_destroy_cq(struct ibv_cq *cq) {
+    if (!real_ibv_destroy_cq) return DOCA_ERROR_DRIVER;
+    const int rc = real_ibv_destroy_cq(cq);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_create_srq and stores the returned pointer in *srq, even when it is NULL.
+ * Returns DOCA_ERROR_NOT_FOUND if init_ibverbs_library() fails, DOCA_ERROR_DRIVER on NULL. */
+doca_error_t doca_verbs_wrapper_ibv_create_srq(struct ibv_pd *pd,
+                                               struct ibv_srq_init_attr *srq_init_attr,
+                                               struct ibv_srq **srq) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+    *srq = real_ibv_create_srq(pd, srq_init_attr);
+    return (*srq == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_destroy_srq; this wrapper itself never calls init_ibverbs_library().
+ * A null real_ibv_destroy_srq pointer or a non-zero status both yield DOCA_ERROR_DRIVER. */
+doca_error_t doca_verbs_wrapper_ibv_destroy_srq(struct ibv_srq *srq) {
+    if (!real_ibv_destroy_srq) return DOCA_ERROR_DRIVER;
+    const int rc = real_ibv_destroy_srq(srq);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_create_qp and stores the returned pointer in *qp, even when it is NULL.
+ * Returns DOCA_ERROR_NOT_FOUND if init_ibverbs_library() fails, DOCA_ERROR_DRIVER on NULL. */
+doca_error_t doca_verbs_wrapper_ibv_create_qp(struct ibv_pd *pd,
+                                              struct ibv_qp_init_attr *qp_init_attr,
+                                              struct ibv_qp **qp) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+    *qp = real_ibv_create_qp(pd, qp_init_attr);
+    return (*qp == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_destroy_qp; this wrapper itself never calls init_ibverbs_library().
+ * A null real_ibv_destroy_qp pointer or a non-zero status both yield DOCA_ERROR_DRIVER. */
+doca_error_t doca_verbs_wrapper_ibv_destroy_qp(struct ibv_qp *qp) {
+    if (!real_ibv_destroy_qp) return DOCA_ERROR_DRIVER;
+    const int rc = real_ibv_destroy_qp(qp);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_modify_qp once init_ibverbs_library() has succeeded.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+                                              int attr_mask) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = real_ibv_modify_qp(qp, attr, attr_mask);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Forwards to real_ibv_query_qp once init_ibverbs_library() has succeeded.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+                                             int attr_mask, struct ibv_qp_init_attr *init_attr) {
+    if (init_ibverbs_library() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = real_ibv_query_qp(qp, attr, attr_mask, init_attr);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_ibv_wrapper.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_ibv_wrapper.h
new file mode 100644
index 00000000000..a825aec8250
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_ibv_wrapper.h
@@ -0,0 +1,452 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_verbs_ibv_wrapper.h
+ * @brief Wrapper for IB Verbs API calls and structs
+ *
+ * This wrapper provides an abstraction layer over IB Verbs APIs.
+ * It can be enabled by defining DOCA_VERBS_USE_IBV_WRAPPER.
+ *
+ * When DOCA_VERBS_USE_IBV_WRAPPER is defined:
+ * - All IB Verbs API calls are wrapped using dlopen
+ * - All IB Verbs structs are wrapped
+ * - The wrapper provides a clean abstraction layer
+ *
+ * When DOCA_VERBS_USE_IBV_WRAPPER is not defined:
+ * - Direct IB Verbs APIs are used
+ * - No overhead is introduced
+ *
+ * @{
+ */
+#ifndef DOCA_VERBS_IBV_WRAPPER_H
+#define DOCA_VERBS_IBV_WRAPPER_H
+
+#ifdef DOCA_VERBS_USE_IBV_WRAPPER
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "host/doca_error.h"
+
+#include <dlfcn.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <linux/types.h>
+
+union ibv_gid { /* local declaration; only the #else branch includes <infiniband/verbs.h> */
+    uint8_t raw[16]; /* the whole GID as 16 raw bytes */
+    struct {
+        __be64 subnet_prefix; /* overlays the first 8 bytes of raw */
+        __be64 interface_id;  /* overlays the last 8 bytes of raw */
+    } global;
+};
+
+struct ibv_global_route { /* embedded as the `grh` member of struct ibv_ah_attr */
+    union ibv_gid dgid;
+    uint32_t flow_label;
+    uint8_t sgid_index;
+    uint8_t hop_limit;
+    uint8_t traffic_class;
+};
+
+struct ibv_ah_attr { /* `attr` argument of doca_verbs_wrapper_ibv_create_ah() */
+    struct ibv_global_route grh;
+    uint16_t dlid;
+    uint8_t sl;
+    uint8_t src_path_bits;
+    uint8_t static_rate;
+    uint8_t is_global; /* NOTE(review): presumably marks `grh` as meaningful - confirm in verbs.h */
+    uint8_t port_num;
+};
+
+enum ibv_atomic_cap { IBV_ATOMIC_NONE, IBV_ATOMIC_HCA, IBV_ATOMIC_GLOB }; /* values 0, 1, 2 */
+
+struct ibv_device_attr { /* `device_attr` argument of doca_verbs_wrapper_ibv_query_device() */
+    char fw_ver[64];
+    __be64 node_guid;
+    __be64 sys_image_guid;
+    uint64_t max_mr_size;
+    uint64_t page_size_cap;
+    uint32_t vendor_id;
+    uint32_t vendor_part_id;
+    uint32_t hw_ver;
+    int max_qp;
+    int max_qp_wr;
+    unsigned int device_cap_flags;
+    int max_sge;
+    int max_sge_rd;
+    int max_cq;
+    int max_cqe;
+    int max_mr;
+    int max_pd;
+    int max_qp_rd_atom;
+    int max_ee_rd_atom;
+    int max_res_rd_atom;
+    int max_qp_init_rd_atom;
+    int max_ee_init_rd_atom;
+    enum ibv_atomic_cap atomic_cap; /* one of the ibv_atomic_cap enumerators declared above */
+    int max_ee;
+    int max_rdd;
+    int max_mw;
+    int max_raw_ipv6_qp;
+    int max_raw_ethy_qp;
+    int max_mcast_grp;
+    int max_mcast_qp_attach;
+    int max_total_mcast_qp_attach;
+    int max_ah;
+    int max_fmr;
+    int max_map_per_fmr;
+    int max_srq;
+    int max_srq_wr;
+    int max_srq_sge;
+    uint16_t max_pkeys;
+    uint8_t local_ca_ack_delay;
+    uint8_t phys_port_cnt;
+}; /* NOTE(review): handed by pointer to the loaded library, so keep field order and types as-is */
+
+struct ibv_pd { /* the wrapper only receives these by pointer from the library's ibv_alloc_pd */
+    struct ibv_context *context;
+    uint32_t handle;
+};
+
+enum ibv_access_flags { /* single-bit values, so they can be OR-ed together */
+    IBV_ACCESS_LOCAL_WRITE = 1,
+    IBV_ACCESS_REMOTE_WRITE = (1 << 1),
+    IBV_ACCESS_REMOTE_READ = (1 << 2),
+    IBV_ACCESS_REMOTE_ATOMIC = (1 << 3),
+    IBV_ACCESS_MW_BIND = (1 << 4),
+    IBV_ACCESS_ZERO_BASED = (1 << 5),
+    IBV_ACCESS_ON_DEMAND = (1 << 6),
+    IBV_ACCESS_HUGETLB = (1 << 7),
+    IBV_ACCESS_FLUSH_GLOBAL = (1 << 8),
+    IBV_ACCESS_FLUSH_PERSISTENT = (1 << 9),
+    IBV_ACCESS_RELAXED_ORDERING = (1 << 20), /* not contiguous: bits 10-19 are not declared here */
+};
+
+struct ibv_device;
+struct ibv_context;
+struct ibv_mr;
+struct ibv_ah;
+struct ibv_cq;
+struct ibv_comp_channel;
+struct ibv_srq;
+struct ibv_srq_init_attr;
+struct ibv_qp;
+struct ibv_qp_init_attr;
+struct ibv_qp_attr;
+struct ibv_port_attr;
+
+/* *********** IB Verbs API Wrappers *********** */
+
+/**
+ * @brief Wrapper for ibv_get_device_list
+ */
+doca_error_t doca_verbs_wrapper_ibv_get_device_list(int *num_devices,
+                                                    struct ibv_device ***device_list);
+
+/**
+ * @brief Wrapper for ibv_free_device_list
+ */
+doca_error_t doca_verbs_wrapper_ibv_free_device_list(struct ibv_device **list);
+
+/**
+ * @brief Wrapper for ibv_get_device_name
+ */
+doca_error_t doca_verbs_wrapper_ibv_get_device_name(struct ibv_device *device,
+                                                    const char **device_name);
+
+/**
+ * @brief Wrapper for ibv_open_device
+ */
+doca_error_t doca_verbs_wrapper_ibv_open_device(struct ibv_device *device,
+                                                struct ibv_context **context);
+
+/**
+ * @brief Wrapper for ibv_close_device
+ */
+doca_error_t doca_verbs_wrapper_ibv_close_device(struct ibv_context *context);
+
+/**
+ * @brief Wrapper for ibv_alloc_pd
+ */
+doca_error_t doca_verbs_wrapper_ibv_alloc_pd(struct ibv_context *context, struct ibv_pd **pd);
+
+/**
+ * @brief Wrapper for ibv_dealloc_pd
+ */
+doca_error_t doca_verbs_wrapper_ibv_dealloc_pd(struct ibv_pd *pd);
+
+/**
+ * @brief Wrapper for ibv_reg_mr
+ */
+doca_error_t doca_verbs_wrapper_ibv_reg_mr(struct ibv_pd *pd, void *addr, size_t length, int access,
+                                           struct ibv_mr **mr);
+
+/**
+ * @brief Wrapper for ibv_dereg_mr
+ */
+doca_error_t doca_verbs_wrapper_ibv_dereg_mr(struct ibv_mr *mr);
+
+/**
+ * @brief Wrapper for ibv_query_device
+ */
+doca_error_t doca_verbs_wrapper_ibv_query_device(struct ibv_context *context,
+                                                 struct ibv_device_attr *device_attr);
+
+/**
+ * @brief Wrapper for ibv_query_port
+ */
+doca_error_t doca_verbs_wrapper_ibv_query_port(struct ibv_context *context, uint8_t port_num,
+                                               struct ibv_port_attr *port_attr);
+
+/**
+ * @brief Wrapper for ibv_query_gid
+ */
+doca_error_t doca_verbs_wrapper_ibv_query_gid(struct ibv_context *context, uint8_t port_num,
+                                              int index, union ibv_gid *gid);
+
+/**
+ * @brief Wrapper for ibv_create_ah
+ */
+doca_error_t doca_verbs_wrapper_ibv_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr,
+                                              struct ibv_ah **ah);
+
+/**
+ * @brief Wrapper for ibv_destroy_ah
+ */
+doca_error_t doca_verbs_wrapper_ibv_destroy_ah(struct ibv_ah *ah);
+
+/**
+ * @brief Wrapper for ibv_create_cq
+ */
+doca_error_t doca_verbs_wrapper_ibv_create_cq(struct ibv_context *context, int cqe,
+                                              void *cq_context, struct ibv_comp_channel *channel,
+                                              int comp_vector, struct ibv_cq **cq);
+
+/**
+ * @brief Wrapper for ibv_destroy_cq
+ */
+doca_error_t doca_verbs_wrapper_ibv_destroy_cq(struct ibv_cq *cq);
+
+/**
+ * @brief Wrapper for ibv_create_srq
+ */
+doca_error_t doca_verbs_wrapper_ibv_create_srq(struct ibv_pd *pd,
+                                               struct ibv_srq_init_attr *srq_init_attr,
+                                               struct ibv_srq **srq);
+
+/**
+ * @brief Wrapper for ibv_destroy_srq
+ */
+doca_error_t doca_verbs_wrapper_ibv_destroy_srq(struct ibv_srq *srq);
+
+/**
+ * @brief Wrapper for ibv_create_qp
+ */
+doca_error_t doca_verbs_wrapper_ibv_create_qp(struct ibv_pd *pd,
+                                              struct ibv_qp_init_attr *qp_init_attr,
+                                              struct ibv_qp **qp);
+
+/**
+ * @brief Wrapper for ibv_destroy_qp
+ */
+doca_error_t doca_verbs_wrapper_ibv_destroy_qp(struct ibv_qp *qp);
+
+/**
+ * @brief Wrapper for ibv_modify_qp
+ */
+doca_error_t doca_verbs_wrapper_ibv_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+                                              int attr_mask);
+
+/**
+ * @brief Wrapper for ibv_query_qp
+ */
+doca_error_t doca_verbs_wrapper_ibv_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+                                             int attr_mask, struct ibv_qp_init_attr *init_attr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#else /* !DOCA_VERBS_USE_IBV_WRAPPER */
+
+#include <infiniband/verbs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "host/doca_error.h"
+
+/* *********** Direct Implementation (when wrapper not used) *********** */
+
+static inline doca_error_t doca_verbs_wrapper_ibv_get_device_list(
+    int *num_devices, struct ibv_device ***device_list) {
+    *device_list = ibv_get_device_list(num_devices); /* NULL marks failure */
+    return (*device_list == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_ibv_free_device_list(struct ibv_device **list) {
+    ibv_free_device_list(list); /* no status is taken from this call */
+    return DOCA_SUCCESS;        /* so the wrapper always reports success */
+}
+
+static inline doca_error_t doca_verbs_wrapper_ibv_get_device_name(struct ibv_device *device,
+                                                                  const char **device_name) {
+    *device_name = ibv_get_device_name(device); /* NULL marks failure */
+    return (*device_name == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_ibv_open_device(struct ibv_device *device,
+                                                              struct ibv_context **context) {
+    *context = ibv_open_device(device); /* NULL marks failure */
+    return (*context == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_close_device(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_close_device(struct ibv_context *context) {
+    return (ibv_close_device(context) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_ibv_alloc_pd(struct ibv_context *context,
+                                                           struct ibv_pd **pd) {
+    *pd = ibv_alloc_pd(context); /* NULL marks failure */
+    return (*pd == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_dealloc_pd(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_dealloc_pd(struct ibv_pd *pd) {
+    return (ibv_dealloc_pd(pd) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_ibv_reg_mr(struct ibv_pd *pd, void *addr,
+                                                         size_t length, int access,
+                                                         struct ibv_mr **mr) {
+    *mr = ibv_reg_mr(pd, addr, length, access); /* NULL marks failure */
+    return (*mr == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_dereg_mr(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_dereg_mr(struct ibv_mr *mr) {
+    return (ibv_dereg_mr(mr) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_query_device(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_query_device(
+    struct ibv_context *context, struct ibv_device_attr *device_attr) {
+    return (ibv_query_device(context, device_attr) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_query_port(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_query_port(struct ibv_context *context,
+                                                             uint8_t port_num,
+                                                             struct ibv_port_attr *port_attr) {
+    return (ibv_query_port(context, port_num, port_attr) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_query_gid(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_query_gid(struct ibv_context *context,
+                                                            uint8_t port_num, int index,
+                                                            union ibv_gid *gid) {
+    return (ibv_query_gid(context, port_num, index, gid) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_ibv_create_ah(struct ibv_pd *pd,
+                                                            struct ibv_ah_attr *attr,
+                                                            struct ibv_ah **ah) {
+    *ah = ibv_create_ah(pd, attr); /* NULL marks failure */
+    return (*ah == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_destroy_ah(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_destroy_ah(struct ibv_ah *ah) {
+    return (ibv_destroy_ah(ah) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_ibv_create_cq(struct ibv_context *context, int cqe,
+                                                            void *cq_context,
+                                                            struct ibv_comp_channel *channel,
+                                                            int comp_vector, struct ibv_cq **cq) {
+    *cq = ibv_create_cq(context, cqe, cq_context, channel, comp_vector); /* NULL marks failure */
+    return (*cq == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_destroy_cq(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_destroy_cq(struct ibv_cq *cq) {
+    return (ibv_destroy_cq(cq) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_ibv_create_srq(
+    struct ibv_pd *pd, struct ibv_srq_init_attr *srq_init_attr, struct ibv_srq **srq) {
+    *srq = ibv_create_srq(pd, srq_init_attr); /* NULL marks failure */
+    return (*srq == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_destroy_srq(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_destroy_srq(struct ibv_srq *srq) {
+    return (ibv_destroy_srq(srq) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_ibv_create_qp(struct ibv_pd *pd,
+                                                            struct ibv_qp_init_attr *qp_init_attr,
+                                                            struct ibv_qp **qp) {
+    *qp = ibv_create_qp(pd, qp_init_attr); /* NULL marks failure */
+    return (*qp == NULL) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_destroy_qp(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_destroy_qp(struct ibv_qp *qp) {
+    return (ibv_destroy_qp(qp) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_modify_qp(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_modify_qp(struct ibv_qp *qp,
+                                                            struct ibv_qp_attr *attr,
+                                                            int attr_mask) {
+    return (ibv_modify_qp(qp, attr, attr_mask) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Direct call to ibv_query_qp(); a non-zero status is reported as DOCA_ERROR_DRIVER. */
+static inline doca_error_t doca_verbs_wrapper_ibv_query_qp(struct ibv_qp *qp,
+                                                           struct ibv_qp_attr *attr, int attr_mask,
+                                                           struct ibv_qp_init_attr *init_attr) {
+    return (ibv_query_qp(qp, attr, attr_mask, init_attr) != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DOCA_VERBS_USE_IBV_WRAPPER */
+
+/** @} */
+
+#endif /* DOCA_VERBS_IBV_WRAPPER_H */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_mlx5dv_wrapper.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_mlx5dv_wrapper.cpp
new file mode 100644
index 00000000000..7d690d466fd
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_mlx5dv_wrapper.cpp
@@ -0,0 +1,287 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_verbs_mlx5dv_wrapper.cpp
+ * @brief Implementation of mlx5dv API wrapper using dlopen
+ *
+ * This file contains the implementation of the mlx5dv API wrapper
+ * using dynamic loading with dlopen when DOCA_VERBS_USE_MLX5DV_WRAPPER is defined.
+ */
+
+#include "doca_verbs_net_wrapper.h"
+
+#include <dlfcn.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <endian.h>
+#include <mutex>
+
+#include "host/doca_error.h"
+
+/* *********** dlopen Function Pointers *********** */
+
+static void *mlx5dv_handle = NULL;
+
+/* Function pointer types */
+typedef int (*mlx5dv_init_obj_func_t)(struct mlx5dv_obj *obj, enum mlx5dv_obj_type obj_type);
+typedef struct mlx5dv_devx_obj *(*mlx5dv_devx_obj_create_func_t)(struct ibv_context *context,
+                                                                 const void *in, size_t inlen,
+                                                                 void *out, size_t outlen);
+typedef int (*mlx5dv_devx_obj_destroy_func_t)(struct mlx5dv_devx_obj *obj);
+typedef int (*mlx5dv_devx_obj_query_func_t)(struct mlx5dv_devx_obj *obj, const void *in,
+                                            size_t inlen, void *out, size_t outlen);
+typedef int (*mlx5dv_devx_obj_modify_func_t)(struct mlx5dv_devx_obj *obj, const void *in,
+                                             size_t inlen, void *out, size_t outlen);
+typedef int (*mlx5dv_devx_general_cmd_func_t)(struct ibv_context *context, const void *in,
+                                              size_t inlen, void *out, size_t outlen);
+typedef int (*mlx5dv_devx_query_eqn_func_t)(struct ibv_context *context, uint32_t cpus,
+                                            uint32_t *eqn);
+typedef struct mlx5dv_devx_umem *(*mlx5dv_devx_umem_reg_func_t)(struct ibv_context *context,
+                                                                void *addr, size_t size,
+                                                                uint32_t access);
+typedef struct mlx5dv_devx_umem *(*mlx5dv_devx_umem_reg_ex_func_t)(
+    struct ibv_context *context, struct mlx5dv_devx_umem_in *umem_in);
+typedef int (*mlx5dv_devx_umem_dereg_func_t)(struct mlx5dv_devx_umem *umem);
+typedef struct mlx5dv_devx_uar *(*mlx5dv_devx_alloc_uar_func_t)(struct ibv_context *context,
+                                                                uint32_t uar_type);
+typedef void (*mlx5dv_devx_free_uar_func_t)(struct mlx5dv_devx_uar *uar);
+typedef int (*mlx5dv_query_device_func_t)(struct ibv_context *context,
+                                          struct mlx5dv_context *attrs_out);
+
+/* Function pointers */
+static mlx5dv_init_obj_func_t mlx5dv_init_obj_func = NULL;
+static mlx5dv_devx_obj_create_func_t mlx5dv_devx_obj_create_func = NULL;
+static mlx5dv_devx_obj_destroy_func_t mlx5dv_devx_obj_destroy_func = NULL;
+static mlx5dv_devx_obj_query_func_t mlx5dv_devx_obj_query_func = NULL;
+static mlx5dv_devx_obj_modify_func_t mlx5dv_devx_obj_modify_func = NULL;
+static mlx5dv_devx_general_cmd_func_t mlx5dv_devx_general_cmd_func = NULL;
+static mlx5dv_devx_query_eqn_func_t mlx5dv_devx_query_eqn_func = NULL;
+static mlx5dv_devx_umem_reg_func_t mlx5dv_devx_umem_reg_func = NULL;
+static mlx5dv_devx_umem_reg_ex_func_t mlx5dv_devx_umem_reg_ex_func = NULL;
+static mlx5dv_devx_umem_dereg_func_t mlx5dv_devx_umem_dereg_func = NULL;
+static mlx5dv_devx_alloc_uar_func_t mlx5dv_devx_alloc_uar_func = NULL;
+static mlx5dv_devx_free_uar_func_t mlx5dv_devx_free_uar_func = NULL;
+static mlx5dv_query_device_func_t mlx5dv_query_device_func = NULL;
+
+/* *********** dlopen Initialization *********** */
+
+/* One-time loader: opens libmlx5 and resolves every mlx5dv entry point. Sets *ret to 0 or -1. */
+static void doca_verbs_wrapper_init_once(int *ret) {
+    /* "libmlx5.so" is usually a dev-package symlink, so also try the runtime SONAME. */
+    mlx5dv_handle = dlopen("libmlx5.so", RTLD_LAZY);
+    if (!mlx5dv_handle) mlx5dv_handle = dlopen("libmlx5.so.1", RTLD_LAZY);
+    if (!mlx5dv_handle) {
+        *ret = -1; /* Failed to load library */
+        return;
+    }
+    /* Load function pointers */
+    mlx5dv_init_obj_func = (mlx5dv_init_obj_func_t)dlsym(mlx5dv_handle, "mlx5dv_init_obj");
+    mlx5dv_devx_obj_create_func =
+        (mlx5dv_devx_obj_create_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_obj_create");
+    mlx5dv_devx_obj_destroy_func =
+        (mlx5dv_devx_obj_destroy_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_obj_destroy");
+    mlx5dv_devx_obj_query_func =
+        (mlx5dv_devx_obj_query_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_obj_query");
+    mlx5dv_devx_obj_modify_func =
+        (mlx5dv_devx_obj_modify_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_obj_modify");
+    mlx5dv_devx_general_cmd_func =
+        (mlx5dv_devx_general_cmd_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_general_cmd");
+    mlx5dv_devx_query_eqn_func =
+        (mlx5dv_devx_query_eqn_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_query_eqn");
+    mlx5dv_devx_umem_reg_func =
+        (mlx5dv_devx_umem_reg_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_umem_reg");
+    mlx5dv_devx_umem_reg_ex_func =
+        (mlx5dv_devx_umem_reg_ex_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_umem_reg_ex");
+    mlx5dv_devx_umem_dereg_func =
+        (mlx5dv_devx_umem_dereg_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_umem_dereg");
+    mlx5dv_devx_alloc_uar_func =
+        (mlx5dv_devx_alloc_uar_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_alloc_uar");
+    mlx5dv_devx_free_uar_func =
+        (mlx5dv_devx_free_uar_func_t)dlsym(mlx5dv_handle, "mlx5dv_devx_free_uar");
+    mlx5dv_query_device_func =
+        (mlx5dv_query_device_func_t)dlsym(mlx5dv_handle, "mlx5dv_query_device");
+    /* All-or-nothing: one missing symbol unloads the library and reports failure. */
+    if (!mlx5dv_init_obj_func || !mlx5dv_devx_obj_create_func || !mlx5dv_devx_obj_destroy_func ||
+        !mlx5dv_devx_obj_query_func || !mlx5dv_devx_obj_modify_func ||
+        !mlx5dv_devx_general_cmd_func || !mlx5dv_devx_query_eqn_func ||
+        !mlx5dv_devx_umem_reg_func || !mlx5dv_devx_umem_reg_ex_func ||
+        !mlx5dv_devx_umem_dereg_func || !mlx5dv_devx_alloc_uar_func || !mlx5dv_devx_free_uar_func ||
+        !mlx5dv_query_device_func) {
+        dlclose(mlx5dv_handle);
+        mlx5dv_handle = NULL;
+        *ret = -1; /* Failed to load some functions */
+        return;
+    }
+    *ret = 0;
+}
+
+static int doca_verbs_wrapper_init_dlopen(void) {
+    static std::once_flag once;
+    static int ret = -1; /* static: later calls skip the loader and must still see its result */
+    std::call_once(once, doca_verbs_wrapper_init_once, &ret);
+    return ret; /* 0 when the one-time load succeeded, -1 otherwise, on every call */
+}
+
+/* *********** Wrapper Implementation with dlopen *********** */
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_init_obj_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_mlx5dv_init_obj(struct mlx5dv_obj *obj,
+                                                enum mlx5dv_obj_type obj_type) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = mlx5dv_init_obj_func(obj, obj_type);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_obj_create_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails and DOCA_ERROR_DRIVER if the call yields NULL;
+ * on DOCA_SUCCESS the new pointer is stored in *obj_out. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_create(struct ibv_context *context, const void *in,
+                                                       size_t inlen, void *out, size_t outlen,
+                                                       struct mlx5dv_devx_obj **obj_out) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    struct mlx5dv_devx_obj *created = mlx5dv_devx_obj_create_func(context, in, inlen, out, outlen);
+    if (created == NULL) return DOCA_ERROR_DRIVER;
+    /* *obj_out is written only on success; a failed call leaves it untouched. */
+    *obj_out = created;
+    return DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_obj_destroy_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_destroy(struct mlx5dv_devx_obj *obj) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = mlx5dv_devx_obj_destroy_func(obj);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_obj_query_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_query(struct mlx5dv_devx_obj *obj, const void *in,
+                                                      size_t inlen, void *out, size_t outlen) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = mlx5dv_devx_obj_query_func(obj, in, inlen, out, outlen);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_obj_modify_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_modify(struct mlx5dv_devx_obj *obj, const void *in,
+                                                       size_t inlen, void *out, size_t outlen) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = mlx5dv_devx_obj_modify_func(obj, in, inlen, out, outlen);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_general_cmd_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_general_cmd(struct ibv_context *context, const void *in,
+                                                        size_t inlen, void *out, size_t outlen) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = mlx5dv_devx_general_cmd_func(context, in, inlen, out, outlen);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_query_eqn_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_query_eqn(struct ibv_context *context, uint32_t cpus,
+                                                      uint32_t *eqn) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = mlx5dv_devx_query_eqn_func(context, cpus, eqn);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_umem_reg_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails and DOCA_ERROR_DRIVER if the call yields NULL;
+ * on DOCA_SUCCESS the new pointer is stored in *umem_out. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_umem_reg(struct ibv_context *context, void *addr,
+                                                     size_t size, uint32_t access,
+                                                     struct mlx5dv_devx_umem **umem_out) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    struct mlx5dv_devx_umem *region = mlx5dv_devx_umem_reg_func(context, addr, size, access);
+    if (region == NULL) return DOCA_ERROR_DRIVER;
+    /* *umem_out is written only on success; a failed call leaves it untouched. */
+    *umem_out = region;
+    return DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_umem_reg_ex_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails and DOCA_ERROR_DRIVER if the call yields NULL;
+ * on DOCA_SUCCESS the new pointer is stored in *umem_out. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_umem_reg_ex(struct ibv_context *context,
+                                                        struct mlx5dv_devx_umem_in *umem_in,
+                                                        struct mlx5dv_devx_umem **umem_out) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    struct mlx5dv_devx_umem *region = mlx5dv_devx_umem_reg_ex_func(context, umem_in);
+    if (region == NULL) return DOCA_ERROR_DRIVER;
+    /* *umem_out is written only on success; a failed call leaves it untouched. */
+    *umem_out = region;
+    return DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_umem_dereg_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_umem_dereg(struct mlx5dv_devx_umem *umem) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = mlx5dv_devx_umem_dereg_func(umem);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_alloc_uar_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails and DOCA_ERROR_DRIVER if the call yields NULL;
+ * on DOCA_SUCCESS the new pointer is stored in *uar_out. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_alloc_uar(struct ibv_context *context,
+                                                      uint32_t uar_type,
+                                                      struct mlx5dv_devx_uar **uar_out) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    struct mlx5dv_devx_uar *allocated = mlx5dv_devx_alloc_uar_func(context, uar_type);
+    if (allocated == NULL) return DOCA_ERROR_DRIVER;
+    /* *uar_out is written only on success; a failed call leaves it untouched. */
+    *uar_out = allocated;
+    return DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_devx_free_uar_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails; the call gives no status, so else success. */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_free_uar(struct mlx5dv_devx_uar *uar) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    mlx5dv_devx_free_uar_func(uar);
+    return DOCA_SUCCESS;
+}
+
+/* Runs doca_verbs_wrapper_init_dlopen(), then forwards to mlx5dv_query_device_func.
+ * Returns DOCA_ERROR_NOT_FOUND if that init fails, DOCA_ERROR_DRIVER on a non-zero status. */
+doca_error_t doca_verbs_wrapper_mlx5dv_query_device(struct ibv_context *context,
+                                                    struct mlx5dv_context *attrs_out) {
+    if (doca_verbs_wrapper_init_dlopen() != 0) return DOCA_ERROR_NOT_FOUND;
+    const int rc = mlx5dv_query_device_func(context, attrs_out);
+    return (rc != 0) ? DOCA_ERROR_DRIVER : DOCA_SUCCESS;
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_mlx5dv_wrapper.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_mlx5dv_wrapper.h
new file mode 100644
index 00000000000..2707a0efd3a
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_mlx5dv_wrapper.h
@@ -0,0 +1,431 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_verbs_mlx5dv_wrapper.h
+ * @brief Wrapper for mlx5dv API calls and structs
+ *
+ * This wrapper provides an abstraction layer over mlx5dv APIs.
+ * It can be enabled by defining DOCA_VERBS_USE_MLX5DV_WRAPPER.
+ *
+ * When DOCA_VERBS_USE_MLX5DV_WRAPPER is defined:
+ * - All mlx5dv API calls are wrapped using dlopen
+ * - All mlx5dv structs are wrapped
+ * - The wrapper provides a clean abstraction layer with dynamic loading
+ *
+ * When DOCA_VERBS_USE_MLX5DV_WRAPPER is not defined:
+ * - Direct mlx5dv APIs are used
+ * - No overhead is introduced
+ *
+ * @{
+ */
+#ifndef DOCA_VERBS_MLX5DV_WRAPPER_H
+#define DOCA_VERBS_MLX5DV_WRAPPER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "host/doca_error.h"
+
+#ifdef DOCA_VERBS_USE_MLX5DV_WRAPPER
+
+#include <dlfcn.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <linux/types.h>
+#include <sys/types.h>
+#include <endian.h>
+
+#include "doca_verbs_ibv_wrapper.h"
+
+#define ETHERNET_LL_SIZE 6
+
+enum mlx5_ib_uapi_uar_alloc_type {
+    MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF = 0x0,
+    MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC = 0x1,
+};
+
+#define MLX5DV_UAR_ALLOC_TYPE_BF MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF
+#define MLX5DV_UAR_ALLOC_TYPE_NC MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC
+
+enum mlx5dv_devx_umem_in_mask {
+    MLX5DV_UMEM_MASK_DMABUF = 1 << 0,
+};
+
+struct mlx5dv_devx_umem_in {
+    void *addr;
+    size_t size;
+    uint32_t access;
+    uint64_t pgsz_bitmap;
+    uint64_t comp_mask;
+    int dmabuf_fd;
+};
+
+enum mlx5dv_obj_type {
+    MLX5DV_OBJ_QP = 1 << 0,
+    MLX5DV_OBJ_CQ = 1 << 1,
+    MLX5DV_OBJ_SRQ = 1 << 2,
+    MLX5DV_OBJ_RWQ = 1 << 3,
+    MLX5DV_OBJ_DM = 1 << 4,
+    MLX5DV_OBJ_AH = 1 << 5,
+    MLX5DV_OBJ_PD = 1 << 6,
+    MLX5DV_OBJ_DEVX = 1 << 7,
+};
+
+struct mlx5dv_devx_umem {
+    uint32_t umem_id;
+};
+
+struct mlx5dv_devx_obj {
+    /* Opaque structure - implementation details hidden */
+    void *obj;
+};
+
+struct doca_gpunetio_ib_mlx5_wqe_av {
+    union {
+        struct {
+            __be32 qkey;
+            __be32 reserved;
+        } qkey;
+        __be64 dc_key;
+    } key;
+    __be32 dqp_dct;
+    uint8_t stat_rate_sl;
+    uint8_t fl_mlid;
+    __be16 rlid;
+    uint8_t reserved0[4];
+    uint8_t rmac[ETHERNET_LL_SIZE];
+    uint8_t tclass;
+    uint8_t hop_limit;
+    __be32 grh_gid_fl;
+    uint8_t rgid[16];
+};
+
+struct mlx5dv_ah {
+    struct doca_gpunetio_ib_mlx5_wqe_av *av;
+    uint64_t comp_mask;
+};
+
+struct mlx5dv_pd {
+    uint32_t pdn;
+    uint64_t comp_mask;
+};
+
+struct mlx5dv_obj {
+    struct {
+        struct ibv_qp *in;
+        struct mlx5dv_qp *out;
+    } qp;
+    struct {
+        struct ibv_cq *in;
+        struct mlx5dv_cq *out;
+    } cq;
+    struct {
+        struct ibv_srq *in;
+        struct mlx5dv_srq *out;
+    } srq;
+    struct {
+        struct ibv_wq *in;
+        struct mlx5dv_rwq *out;
+    } rwq;
+    struct {
+        struct ibv_dm *in;
+        struct mlx5dv_dm *out;
+    } dm;
+    struct {
+        struct ibv_ah *in;
+        struct mlx5dv_ah *out;
+    } ah;
+    struct {
+        struct ibv_pd *in;
+        struct mlx5dv_pd *out;
+    } pd;
+    struct {
+        struct mlx5dv_devx_obj *in;
+        struct mlx5dv_devx *out;
+    } devx;
+};
+
+struct mlx5dv_devx_uar {
+    void *reg_addr;
+    void *base_addr;
+    uint32_t page_id;
+    off_t mmap_off;
+    uint64_t comp_mask;
+};
+
+#define __devx_nullp(typ) ((struct mlx5_ifc_##typ##_bits *)NULL)
+#define __devx_st_sz_bits(typ) sizeof(struct mlx5_ifc_##typ##_bits)
+#define __devx_bit_sz(typ, fld) sizeof(__devx_nullp(typ)->fld)
+#define __devx_bit_off(typ, fld) offsetof(struct mlx5_ifc_##typ##_bits, fld)
+#define __devx_dw_off(bit_off) ((bit_off) / 32)
+#define __devx_64_off(bit_off) ((bit_off) / 64)
+#define __devx_dw_bit_off(bit_sz, bit_off) (32 - (bit_sz) - ((bit_off) & 0x1f))
+#define __devx_mask(bit_sz) ((uint32_t)((1ull << (bit_sz)) - 1))
+#define __devx_dw_mask(bit_sz, bit_off) (__devx_mask(bit_sz) << __devx_dw_bit_off(bit_sz, bit_off))
+
+#define DEVX_FLD_SZ_BYTES(typ, fld) (__devx_bit_sz(typ, fld) / 8)
+#define DEVX_ST_SZ_BYTES(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 8)
+#define DEVX_ST_SZ_DW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 32)
+#define DEVX_ST_SZ_QW(typ) (sizeof(struct mlx5_ifc_##typ##_bits) / 64)
+#define DEVX_UN_SZ_BYTES(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 8)
+#define DEVX_UN_SZ_DW(typ) (sizeof(union mlx5_ifc_##typ##_bits) / 32)
+#define DEVX_BYTE_OFF(typ, fld) (__devx_bit_off(typ, fld) / 8)
+#define DEVX_ADDR_OF(typ, p, fld) ((unsigned char *)(p) + DEVX_BYTE_OFF(typ, fld))
+
+static inline void _devx_set(void *p, uint32_t value, size_t bit_off, size_t bit_sz) {
+    /* Read-modify-write of the big-endian dword selected by bit_off; only the field changes. */
+    __be32 *dword = (__be32 *)(p) + __devx_dw_off(bit_off);
+    const size_t shift = __devx_dw_bit_off(bit_sz, bit_off);
+    const uint32_t keep = ~(__devx_mask(bit_sz) << shift);
+    const uint32_t insert = (value & __devx_mask(bit_sz)) << shift;
+    *dword = htobe32((be32toh(*dword) & keep) | insert);
+}
+
+#define DEVX_SET(typ, p, fld, v) _devx_set(p, v, __devx_bit_off(typ, fld), __devx_bit_sz(typ, fld))
+
+static inline uint32_t _devx_get(const void *p, size_t bit_off, size_t bit_sz) {
+    /* Convert the containing big-endian dword to host order, then shift and mask the field. */
+    const uint32_t host_dword = be32toh(*((const __be32 *)(p) + __devx_dw_off(bit_off)));
+    return (host_dword >> __devx_dw_bit_off(bit_sz, bit_off)) & __devx_mask(bit_sz);
+}
+
+#define DEVX_GET(typ, p, fld) _devx_get(p, __devx_bit_off(typ, fld), __devx_bit_sz(typ, fld))
+
+static inline void _devx_set64(void *p, uint64_t v, size_t bit_off) {
+    ((__be64 *)(p))[__devx_64_off(bit_off)] = htobe64(v); /* whole 64-bit slot, big-endian */
+}
+
+#define DEVX_SET64(typ, p, fld, v) _devx_set64(p, v, __devx_bit_off(typ, fld))
+
+static inline uint64_t _devx_get64(const void *p, size_t bit_off) {
+    return be64toh(((const __be64 *)(p))[__devx_64_off(bit_off)]); /* big-endian -> host */
+}
+
+#define DEVX_GET64(typ, p, fld) _devx_get64(p, __devx_bit_off(typ, fld))
+
+struct mlx5dv_context;
+struct mlx5dv_port;
+
+/* *********** mlx5dv API Wrappers *********** */
+
+/**
+ * @brief Wrapper for mlx5dv_init_obj
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_init_obj(struct mlx5dv_obj *obj,
+                                                enum mlx5dv_obj_type obj_type);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_obj_create
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_create(struct ibv_context *context, const void *in,
+                                                       size_t inlen, void *out, size_t outlen,
+                                                       struct mlx5dv_devx_obj **obj_out);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_obj_destroy
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_destroy(struct mlx5dv_devx_obj *obj);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_obj_query
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_query(struct mlx5dv_devx_obj *obj, const void *in,
+                                                      size_t inlen, void *out, size_t outlen);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_obj_modify
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_modify(struct mlx5dv_devx_obj *obj, const void *in,
+                                                       size_t inlen, void *out, size_t outlen);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_general_cmd
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_general_cmd(struct ibv_context *context, const void *in,
+                                                        size_t inlen, void *out, size_t outlen);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_query_eqn
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_query_eqn(struct ibv_context *context, uint32_t cpus,
+                                                      uint32_t *eqn);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_umem_reg
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_umem_reg(struct ibv_context *context, void *addr,
+                                                     size_t size, uint32_t access,
+                                                     struct mlx5dv_devx_umem **umem_out);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_umem_reg_ex
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_umem_reg_ex(struct ibv_context *context,
+                                                        struct mlx5dv_devx_umem_in *umem_in,
+                                                        struct mlx5dv_devx_umem **umem_out);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_umem_dereg
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_umem_dereg(struct mlx5dv_devx_umem *umem);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_alloc_uar
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_alloc_uar(struct ibv_context *context,
+                                                      uint32_t uar_type,
+                                                      struct mlx5dv_devx_uar **uar_out);
+
+/**
+ * @brief Wrapper for mlx5dv_devx_free_uar
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_devx_free_uar(struct mlx5dv_devx_uar *uar);
+
+/**
+ * @brief Wrapper for mlx5dv_query_device
+ */
+doca_error_t doca_verbs_wrapper_mlx5dv_query_device(struct ibv_context *context,
+                                                    struct mlx5dv_context *attrs_out);
+
+#else /* !DOCA_VERBS_USE_MLX5DV_WRAPPER */
+
+#include <infiniband/mlx5dv.h>
+
+/* *********** Direct API Implementation (inline) *********** */
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_init_obj(struct mlx5dv_obj *obj,
+                                                              enum mlx5dv_obj_type obj_type) {
+    const int status = mlx5dv_init_obj(obj, obj_type);
+    return status ? DOCA_ERROR_DRIVER : DOCA_SUCCESS; /* only zero counts as success */
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_create(
+    struct ibv_context *context, const void *in, size_t inlen, void *out, size_t outlen,
+    struct mlx5dv_devx_obj **obj_out) {
+    struct mlx5dv_devx_obj *created = mlx5dv_devx_obj_create(context, in, inlen, out, outlen);
+
+    /* A null handle means the create call failed; *obj_out is not written in that case. */
+    if (!created) return DOCA_ERROR_DRIVER;
+    *obj_out = created;
+    return DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_destroy(struct mlx5dv_devx_obj *obj) {
+    const int status = mlx5dv_devx_obj_destroy(obj);
+    return status ? DOCA_ERROR_DRIVER : DOCA_SUCCESS; /* only zero counts as success */
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_query(struct mlx5dv_devx_obj *obj,
+                                                                    const void *in, size_t inlen,
+                                                                    void *out, size_t outlen) {
+    const int status = mlx5dv_devx_obj_query(obj, in, inlen, out, outlen);
+    return status ? DOCA_ERROR_DRIVER : DOCA_SUCCESS; /* only zero counts as success */
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_obj_modify(struct mlx5dv_devx_obj *obj,
+                                                                     const void *in, size_t inlen,
+                                                                     void *out, size_t outlen) {
+    const int status = mlx5dv_devx_obj_modify(obj, in, inlen, out, outlen);
+    return status ? DOCA_ERROR_DRIVER : DOCA_SUCCESS; /* only zero counts as success */
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_general_cmd(struct ibv_context *context,
+                                                                      const void *in, size_t inlen,
+                                                                      void *out, size_t outlen) {
+    const int status = mlx5dv_devx_general_cmd(context, in, inlen, out, outlen);
+    return status ? DOCA_ERROR_DRIVER : DOCA_SUCCESS; /* only zero counts as success */
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_query_eqn(struct ibv_context *context,
+                                                                    uint32_t cpus, uint32_t *eqn) {
+    const int status = mlx5dv_devx_query_eqn(context, cpus, eqn);
+    return status ? DOCA_ERROR_DRIVER : DOCA_SUCCESS; /* only zero counts as success */
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_umem_reg(
+    struct ibv_context *context, void *addr, size_t size, uint32_t access,
+    struct mlx5dv_devx_umem **umem_out) {
+    struct mlx5dv_devx_umem *registered = mlx5dv_devx_umem_reg(context, addr, size, access);
+
+    /* A null handle means registration failed; *umem_out is not written in that case. */
+    if (!registered) return DOCA_ERROR_DRIVER;
+    *umem_out = registered;
+    return DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_umem_reg_ex(
+    struct ibv_context *context, struct mlx5dv_devx_umem_in *umem_in,
+    struct mlx5dv_devx_umem **umem_out) {
+    struct mlx5dv_devx_umem *registered = mlx5dv_devx_umem_reg_ex(context, umem_in);
+
+    /* A null handle means registration failed; *umem_out is not written in that case. */
+    if (!registered) return DOCA_ERROR_DRIVER;
+    *umem_out = registered;
+    return DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_umem_dereg(
+    struct mlx5dv_devx_umem *umem) {
+    const int status = mlx5dv_devx_umem_dereg(umem);
+    return status ? DOCA_ERROR_DRIVER : DOCA_SUCCESS; /* only zero counts as success */
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_alloc_uar(
+    struct ibv_context *context, uint32_t uar_type, struct mlx5dv_devx_uar **uar_out) {
+    struct mlx5dv_devx_uar *allocated = mlx5dv_devx_alloc_uar(context, uar_type);
+
+    /* A null handle means allocation failed; *uar_out is not written in that case. */
+    if (!allocated) return DOCA_ERROR_DRIVER;
+    *uar_out = allocated;
+    return DOCA_SUCCESS;
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_devx_free_uar(struct mlx5dv_devx_uar *uar) {
+    mlx5dv_devx_free_uar(uar); /* no status is taken from this call */
+    return DOCA_SUCCESS;       /* so the wrapper always reports success */
+}
+
+static inline doca_error_t doca_verbs_wrapper_mlx5dv_query_device(
+    struct ibv_context *context, struct mlx5dv_context *attrs_out) {
+    const int status = mlx5dv_query_device(context, attrs_out);
+    return status ? DOCA_ERROR_DRIVER : DOCA_SUCCESS; /* only zero counts as success */
+}
+
+#endif /* !DOCA_VERBS_USE_MLX5DV_WRAPPER */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DOCA_VERBS_MLX5DV_WRAPPER_H */
+
+/** @} */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_net_wrapper.h b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_net_wrapper.h
new file mode 100644
index 00000000000..2512b74bdf6
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_net_wrapper.h
@@ -0,0 +1,62 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * @file doca_verbs_net_wrapper.h
+ * @brief Main wrapper header for IB Verbs and mlx5dv API calls and structs
+ *
+ * This header includes the separate IB Verbs and mlx5dv wrappers.
+ * It provides backward compatibility with the original unified wrapper.
+ *
+ * For IB Verbs wrapper, define DOCA_VERBS_USE_IBV_WRAPPER
+ * For mlx5dv wrapper, define DOCA_VERBS_USE_MLX5DV_WRAPPER
+ * For backward compatibility, define DOCA_VERBS_USE_NET_WRAPPER (enables both)
+ *
+ * @{
+ */
+#ifndef DOCA_VERBS_NET_WRAPPER_H
+#define DOCA_VERBS_NET_WRAPPER_H
+
+#ifdef DOCA_VERBS_USE_NET_WRAPPER
+#ifndef DOCA_VERBS_USE_IBV_WRAPPER
+#define DOCA_VERBS_USE_IBV_WRAPPER
+#endif
+#ifndef DOCA_VERBS_USE_MLX5DV_WRAPPER
+#define DOCA_VERBS_USE_MLX5DV_WRAPPER
+#endif
+#endif
+
+/* Include the separate wrappers */
+#include "doca_verbs_ibv_wrapper.h"
+#include "doca_verbs_mlx5dv_wrapper.h"
+
+#endif /* DOCA_VERBS_NET_WRAPPER_H */
+
+/** @} */
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_qp.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_qp.cpp
new file mode 100644
index 00000000000..5970d75e3f5
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_qp.cpp
@@ -0,0 +1,2743 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <malloc.h>
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <mutex>
+#include <time.h>
+#include <string.h>
+
+#include "host/mlx5_prm.h"
+#include "host/mlx5_ifc.h"
+
+#include "doca_internal.hpp"
+#include "doca_verbs_device_attr.hpp"
+#include "doca_verbs_srq.hpp"
+#include "doca_verbs_cq.hpp"
+#include "doca_verbs_qp.hpp"
+#include "doca_verbs_net_wrapper.h"
+#include "common/doca_gpunetio_verbs_def.h"
+
+#define USER_INDEX_MSB_8BITS_MASK 0xFF000000
+#define DOCA_VERBS_LOG_OCTOWORD_SIZE 4
+#define DOCA_VERBS_OCTOWORD_SIZE (1U << DOCA_VERBS_LOG_OCTOWORD_SIZE)
+#define DOCA_VERBS_DATA_SEG_SIZE_IN_BYTES sizeof(struct doca_internal_mlx5_wqe_data_seg)
+#define DOCA_VERBS_LOG_WQEBB_SIZE 6
+#define DOCA_VERBS_WQEBB_SIZE (1U << DOCA_VERBS_LOG_WQEBB_SIZE)
+#define MAX(a, b) std::max(a, b)
+#define QP_ATTR(_mask) (DOCA_VERBS_QP_ATTR_##_mask)
+#define PRIV_DOCA_MAC_BYTE_LENGTH 6
+#define PRIV_DOCA_VERBS_PORT_NUM 1
+#define PRIV_DOCA_GID_BYTE_LENGTH 16
+
+enum {
+    PRIV_DOCA_MLX5_QP_OPT_PARAM_RRE = (1 << 1),
+    PRIV_DOCA_MLX5_QP_OPT_PARAM_RWE = (1 << 3),
+    PRIV_DOCA_MLX5_QP_OPT_PARAM_PKEY_INDEX = (1 << 4),
+    PRIV_DOCA_MLX5_QP_OPT_PARAM_MIN_RNR_NAK = (1 << 6),
+    PRIV_DOCA_MLX5_QP_OPT_PARAM_PORT_NUM = (1 << 16),
+    PRIV_DOCA_MLX5_QP_OPT_DSCP = (1 << 17),
+    PRIV_DOCA_MLX5_QP_OPT_SGID_INDEX = (1 << 23),
+};
+
+enum doca_verbs_qp_state_mod {
+    DOCA_VERBS_QP_RST2INIT,
+    DOCA_VERBS_QP_INIT2INIT,
+    DOCA_VERBS_QP_INIT2RTR,
+    DOCA_VERBS_QP_RTR2RTS,
+    DOCA_VERBS_QP_RTS2RTS,
+};
+
+/*********************************************************************************************************************
+ * Helper functions
+ *********************************************************************************************************************/
+
+namespace {
+
+constexpr uint32_t sc_verbs_qp_doorbell_size = 64;
+constexpr uint8_t sc_verbs_qp_log_rq_stride_shift = 4;
+constexpr uint32_t sc_verbs_mac_addr_len = 6;
+constexpr uint32_t sc_verbs_mac_addr_2msbytes_len = 2;
+constexpr uint32_t sc_verbs_log_msg_max = 30;
+
+using create_qp_in = uint32_t[MLX5_ST_SZ_DW(create_qp_in)];
+using create_qp_out = uint32_t[MLX5_ST_SZ_DW(create_qp_out)];
+
+using rst2init_qp_in = uint32_t[MLX5_ST_SZ_DW(rst2init_qp_in)];
+using rst2init_qp_out = uint32_t[MLX5_ST_SZ_DW(rst2init_qp_out)];
+
+using init2init_qp_in = uint32_t[MLX5_ST_SZ_DW(init2init_qp_in)];
+using init2init_qp_out = uint32_t[MLX5_ST_SZ_DW(init2init_qp_out)];
+
+using init2rtr_qp_in = uint32_t[MLX5_ST_SZ_DW(init2rtr_qp_in)];
+using init2rtr_qp_out = uint32_t[MLX5_ST_SZ_DW(init2rtr_qp_out)];
+
+using rtr2rts_qp_in = uint32_t[MLX5_ST_SZ_DW(rtr2rts_qp_in)];
+using rtr2rts_qp_out = uint32_t[MLX5_ST_SZ_DW(rtr2rts_qp_out)];
+
+using rts2rts_qp_in = uint32_t[MLX5_ST_SZ_DW(rts2rts_qp_in)];
+using rts2rts_qp_out = uint32_t[MLX5_ST_SZ_DW(rts2rts_qp_out)];
+
+using qp_2err_in = uint32_t[MLX5_ST_SZ_DW(qp_2err_in)];
+using qp_2err_out = uint32_t[MLX5_ST_SZ_DW(qp_2err_out)];
+
+using qp_2rst_in = uint32_t[MLX5_ST_SZ_DW(qp_2rst_in)];
+using qp_2rst_out = uint32_t[MLX5_ST_SZ_DW(qp_2rst_out)];
+
+using query_qp_in = uint32_t[MLX5_ST_SZ_DW(query_qp_in)];
+using query_qp_out = uint32_t[MLX5_ST_SZ_DW(query_qp_out)];
+
+int rst2init_requested_attr[DOCA_VERBS_QP_TYPE_RC + 1] = {
+    /* [DOCA_VERBS_QP_TYPE_RC] (slot 0; presumably RC == 0, confirm): required by RST2INIT */
+    QP_ATTR(PKEY_INDEX) | QP_ATTR(PORT_NUM) | QP_ATTR(ALLOW_REMOTE_WRITE) |
+        QP_ATTR(ALLOW_REMOTE_READ),
+};
+
+int init2rtr_requested_attr[DOCA_VERBS_QP_TYPE_RC + 1] = {
+    /* [DOCA_VERBS_QP_TYPE_RC] bits is_init2rtr_attrs_valid() treats as required */
+    QP_ATTR(RQ_PSN) | QP_ATTR(DEST_QP_NUM) | QP_ATTR(PATH_MTU) | QP_ATTR(AH_ATTR) |
+        QP_ATTR(MIN_RNR_TIMER),
+};
+
+int rtr2rts_requested_attr[DOCA_VERBS_QP_TYPE_RC + 1] = {
+    /* [DOCA_VERBS_QP_TYPE_RC] bits is_rtr2rts_attrs_valid() treats as required */
+    QP_ATTR(SQ_PSN) | QP_ATTR(ACK_TIMEOUT) | QP_ATTR(RETRY_CNT) | QP_ATTR(RNR_RETRY),
+};
+
+int init2init_optional_attr[DOCA_VERBS_QP_TYPE_RC + 1] = {
+    /* [DOCA_VERBS_QP_TYPE_RC] the only bits is_init2init_attrs_valid() accepts */
+    QP_ATTR(CURRENT_STATE) | QP_ATTR(NEXT_STATE) | QP_ATTR(PKEY_INDEX) | QP_ATTR(PORT_NUM) |
+        QP_ATTR(ALLOW_REMOTE_WRITE) | QP_ATTR(ALLOW_REMOTE_READ),
+};
+
+int init2rtr_optional_attr[DOCA_VERBS_QP_TYPE_RC + 1] = {
+    /* [DOCA_VERBS_QP_TYPE_RC] bits accepted by INIT2RTR on top of the required ones */
+    QP_ATTR(CURRENT_STATE) | QP_ATTR(NEXT_STATE) | QP_ATTR(PKEY_INDEX) |
+        QP_ATTR(ALLOW_REMOTE_WRITE) | QP_ATTR(ALLOW_REMOTE_READ),
+};
+
+int rtr2rts_optional_attr[DOCA_VERBS_QP_TYPE_RC + 1] = {
+    /* [DOCA_VERBS_QP_TYPE_RC] bits accepted by RTR2RTS on top of the required ones */
+    QP_ATTR(CURRENT_STATE) | QP_ATTR(NEXT_STATE) | QP_ATTR(MIN_RNR_TIMER) |
+        QP_ATTR(ALLOW_REMOTE_WRITE),
+};
+
+int rts2rts_optional_attr[DOCA_VERBS_QP_TYPE_RC + 1] = {
+    /* [DOCA_VERBS_QP_TYPE_RC] the only bits is_rts2rts_attrs_valid() accepts */
+    QP_ATTR(CURRENT_STATE) | QP_ATTR(NEXT_STATE) | QP_ATTR(ALLOW_REMOTE_WRITE) |
+        QP_ATTR(ALLOW_REMOTE_READ) | QP_ATTR(MIN_RNR_TIMER) | QP_ATTR(AH_ATTR),
+};
+
+const char *qp_attr_to_string(int attr) {
+    // Translate a single DOCA_VERBS_QP_ATTR_* bit into the label used in log messages.
+    //
+    // attr   : value to translate; it is compared for equality against each known bit.
+    // return : pointer to a string literal, "UNKNOWN" when nothing matches.
+    // The lookup is a linear scan of the table below.
+    struct attr_label {
+        int bit;
+        const char *text;
+    };
+
+    static const attr_label labels[] = {
+        {DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_WRITE, "ALLOW_REMOTE_WRITE"},
+        {DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ, "ALLOW_REMOTE_READ"},
+        {DOCA_VERBS_QP_ATTR_PKEY_INDEX, "PKEY_INDEX"},
+        {DOCA_VERBS_QP_ATTR_MIN_RNR_TIMER, "MIN_RNR_TIMER"},
+        {DOCA_VERBS_QP_ATTR_PORT_NUM, "PORT_NUM"},
+        {DOCA_VERBS_QP_ATTR_NEXT_STATE, "NEXT_STATE"},
+        {DOCA_VERBS_QP_ATTR_CURRENT_STATE, "CURRENT_STATE"},
+        {DOCA_VERBS_QP_ATTR_PATH_MTU, "PATH_MTU"},
+        {DOCA_VERBS_QP_ATTR_RQ_PSN, "RQ_PSN"},
+        {DOCA_VERBS_QP_ATTR_SQ_PSN, "SQ_PSN"},
+        {DOCA_VERBS_QP_ATTR_DEST_QP_NUM, "DEST_QP_NUM"},
+        {DOCA_VERBS_QP_ATTR_ACK_TIMEOUT, "ACK_TIMEOUT"},
+        {DOCA_VERBS_QP_ATTR_RETRY_CNT, "RETRY_CNT"},
+        {DOCA_VERBS_QP_ATTR_RNR_RETRY, "RNR_RETRY"},
+        {DOCA_VERBS_QP_ATTR_AH_ATTR, "AH_ATTR"},
+    };
+
+    for (const attr_label &entry : labels) {
+        if (entry.bit == attr) {
+            return entry.text;
+        }
+    }
+
+    // No table entry equals attr.
+    return "UNKNOWN";
+}
+
+void print_if_missing_attr(int required_attr_mask, int attr_mask, int attr_to_check) {
+    if ((required_attr_mask & attr_to_check) != 0 && (attr_mask & attr_to_check) == 0)
+        DOCA_LOG(LOG_ERR, "%s is required but disabled in attr_mask (%d)",
+                 qp_attr_to_string(attr_to_check), attr_mask);  // required yet not set
+}
+
+void print_missing_attrs(int required_attr_mask, int attr_mask) {
+    // Attribute bits are probed in this fixed order, one potential log line per bit.
+    static const int probe_order[] = {
+        DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_WRITE, DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ,
+        DOCA_VERBS_QP_ATTR_PKEY_INDEX,         DOCA_VERBS_QP_ATTR_MIN_RNR_TIMER,
+        DOCA_VERBS_QP_ATTR_PORT_NUM,           DOCA_VERBS_QP_ATTR_NEXT_STATE,
+        DOCA_VERBS_QP_ATTR_CURRENT_STATE,      DOCA_VERBS_QP_ATTR_PATH_MTU,
+        DOCA_VERBS_QP_ATTR_RQ_PSN,             DOCA_VERBS_QP_ATTR_SQ_PSN,
+        DOCA_VERBS_QP_ATTR_DEST_QP_NUM,        DOCA_VERBS_QP_ATTR_ACK_TIMEOUT,
+        DOCA_VERBS_QP_ATTR_RETRY_CNT,          DOCA_VERBS_QP_ATTR_RNR_RETRY,
+        DOCA_VERBS_QP_ATTR_AH_ATTR,
+    };
+
+    for (int attr_bit : probe_order) {
+        print_if_missing_attr(required_attr_mask, attr_mask, attr_bit);
+    }
+}
+
+bool is_X2rst_attrs_valid(int attr_mask) {
+    // Only the two state-selection bits may be set for this transition.
+    const int allowed = DOCA_VERBS_QP_ATTR_CURRENT_STATE | DOCA_VERBS_QP_ATTR_NEXT_STATE;
+    const int unexpected = attr_mask & ~allowed;
+
+    if (unexpected == 0) return true;
+
+    DOCA_LOG(LOG_ERR, "attr_mask contains invalid bit attr_masks (attr_mask=%d)", attr_mask);
+    return false;
+}
+
+bool is_X2err_attrs_valid(int attr_mask) {
+    // Anything other than CURRENT_STATE / NEXT_STATE is rejected for this transition.
+    const int allowed = DOCA_VERBS_QP_ATTR_CURRENT_STATE | DOCA_VERBS_QP_ATTR_NEXT_STATE;
+    const bool only_allowed_bits = (attr_mask & ~allowed) == 0;
+
+    if (!only_allowed_bits) {
+        DOCA_LOG(LOG_ERR, "attr_mask contains invalid bit attr_masks (attr_mask=%d)", attr_mask);
+    }
+    return only_allowed_bits;
+}
+
+bool is_rst2init_attrs_valid(int attr_mask, uint32_t qp_type) {
+    // Mandatory bits come from the per-QP-type table; the two state bits are also tolerated.
+    const int mandatory = rst2init_requested_attr[qp_type];
+    const int allowed =
+        mandatory | DOCA_VERBS_QP_ATTR_CURRENT_STATE | DOCA_VERBS_QP_ATTR_NEXT_STATE;
+
+    if ((attr_mask & ~allowed) != 0) {
+        DOCA_LOG(LOG_ERR, "attr_mask contains invalid bit attr_masks (attr_mask=%d)", attr_mask);
+        return false;
+    }
+
+    // Every mandatory bit has to be present; the absent ones are logged by name.
+    const bool complete = (mandatory & attr_mask) == mandatory;
+    if (!complete) print_missing_attrs(mandatory, attr_mask);
+
+    return complete;
+}
+
+bool is_init2init_attrs_valid(int attr_mask, uint32_t qp_type) {
+    // No mandatory bits are checked here; the table lists everything that is tolerated.
+    const int allowed = init2init_optional_attr[qp_type];
+    const bool only_allowed_bits = (attr_mask & ~allowed) == 0;
+
+    if (!only_allowed_bits) {
+        DOCA_LOG(LOG_ERR, "attr_mask contains invalid bit attr_masks (attr_mask=%d)", attr_mask);
+    }
+    return only_allowed_bits;
+}
+
+bool is_init2rtr_attrs_valid(int attr_mask, uint32_t qp_type) {
+    // Accepted bits are the mandatory set plus the optional set for this QP type.
+    const int mandatory = init2rtr_requested_attr[qp_type];
+    const int allowed = mandatory | init2rtr_optional_attr[qp_type];
+
+    if ((attr_mask & ~allowed) != 0) {
+        DOCA_LOG(LOG_ERR, "attr_mask contains invalid bit attr_masks (attr_mask=%d)", attr_mask);
+        return false;
+    }
+
+    // All mandatory bits present: nothing more to check.
+    if ((attr_mask & mandatory) == mandatory) return true;
+
+    print_missing_attrs(mandatory, attr_mask);
+    return false;
+}
+
+bool is_rtr2rts_attrs_valid(int attr_mask, uint32_t qp_type) {
+    // Accepted bits are the mandatory set plus the optional set for this QP type.
+    const int mandatory = rtr2rts_requested_attr[qp_type];
+    const int allowed = mandatory | rtr2rts_optional_attr[qp_type];
+
+    if ((attr_mask & ~allowed) != 0) {
+        DOCA_LOG(LOG_ERR, "attr_mask contains invalid bit attr_masks (attr_mask=%d)", attr_mask);
+        return false;
+    }
+
+    // All mandatory bits present: nothing more to check.
+    if ((attr_mask & mandatory) == mandatory) return true;
+
+    print_missing_attrs(mandatory, attr_mask);
+    return false;
+}
+
+bool is_rts2rts_attrs_valid(int attr_mask, uint32_t qp_type) {
+    // No mandatory bits are checked here; the table lists everything that is tolerated.
+    const int allowed = rts2rts_optional_attr[qp_type];
+    const int unexpected = attr_mask & ~allowed;
+
+    if (unexpected == 0) return true;
+
+    DOCA_LOG(LOG_ERR, "attr_mask contains invalid bit attr_masks (attr_mask=%d)", attr_mask);
+    return false;
+}
+
+void convert_doca_verbs_qp_attr_mask_to_legal_mlx5_qp_opt_param_mask(
+    int attr_mask, int &mlx5_opt_mask, doca_verbs_qp_state_mod state_mod) {
+    // Attribute bits that may be forwarded as optional parameters, indexed by state_mod.
+    static const int forwardable[] = {
+        /* DOCA_VERBS_QP_RST2INIT */
+        0,
+        /* DOCA_VERBS_QP_INIT2INIT */
+        DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_WRITE | DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ |
+            DOCA_VERBS_QP_ATTR_PKEY_INDEX | DOCA_VERBS_QP_ATTR_PORT_NUM,
+        /* DOCA_VERBS_QP_INIT2RTR */
+        DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_WRITE | DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ |
+            DOCA_VERBS_QP_ATTR_PKEY_INDEX,
+        /* DOCA_VERBS_QP_RTR2RTS */
+        DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_WRITE | DOCA_VERBS_QP_ATTR_MIN_RNR_TIMER,
+        /* DOCA_VERBS_QP_RTS2RTS */
+        DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_WRITE | DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ |
+            DOCA_VERBS_QP_ATTR_AH_ATTR | DOCA_VERBS_QP_ATTR_MIN_RNR_TIMER,
+    };
+
+    // Each forwarded attribute bit and the mlx5 optional-parameter bits it switches on.
+    static const struct {
+        int doca_bit;
+        int mlx5_bits;
+    } translation[] = {
+        {DOCA_VERBS_QP_ATTR_PKEY_INDEX, PRIV_DOCA_MLX5_QP_OPT_PARAM_PKEY_INDEX},
+        {DOCA_VERBS_QP_ATTR_MIN_RNR_TIMER, PRIV_DOCA_MLX5_QP_OPT_PARAM_MIN_RNR_NAK},
+        {DOCA_VERBS_QP_ATTR_PORT_NUM, PRIV_DOCA_MLX5_QP_OPT_PARAM_PORT_NUM},
+        {DOCA_VERBS_QP_ATTR_AH_ATTR,
+         PRIV_DOCA_MLX5_QP_OPT_SGID_INDEX | PRIV_DOCA_MLX5_QP_OPT_DSCP},
+        {DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_WRITE, PRIV_DOCA_MLX5_QP_OPT_PARAM_RWE},
+        {DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ, PRIV_DOCA_MLX5_QP_OPT_PARAM_RRE},
+    };
+
+    const int forwarded = attr_mask & forwardable[state_mod];
+    int translated = 0;
+
+    for (const auto &entry : translation) {
+        if (forwarded & entry.doca_bit) translated |= entry.mlx5_bits;
+    }
+
+    mlx5_opt_mask = translated;
+}
+
+/**
+ * Read the RoCE version stored for one entry of the device RoCE address table.
+ *
+ * Issues a QUERY_ROCE_ADDRESS general DevX command for the given index and extracts
+ * roce_address[0].roce_version from the reply.
+ *
+ * @param ctx          Device context the command is issued on.
+ * @param sgid_index   Index written to the roce_address_index field of the command.
+ * @param roce_version Output: version read from the reply. Untouched on failure.
+ * @return DOCA_SUCCESS, or DOCA_ERROR_DRIVER when the command fails.
+ */
+doca_error_t query_roce_version(struct ibv_context *ctx, uint8_t sgid_index,
+                                uint8_t &roce_version) noexcept {
+    // The reply holds the command output header followed by one address layout entry.
+    constexpr auto reply_dwords =
+        MLX5_ST_SZ_DW(query_roce_address_out) + MLX5_ST_SZ_DW(roce_addr_layout);
+    uint32_t request[MLX5_ST_SZ_DW(query_roce_address_in)] = {0};
+    uint32_t reply[reply_dwords] = {0};
+
+    DEVX_SET(query_roce_address_in, &request, opcode, MLX5_CMD_OP_QUERY_ROCE_ADDRESS);
+    DEVX_SET(query_roce_address_in, &request, roce_address_index, sgid_index);
+
+    const auto cmd_status = doca_verbs_wrapper_mlx5dv_devx_general_cmd(
+        ctx, request, sizeof(request), reply, sizeof(reply));
+    if (cmd_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to query roce version");
+        return DOCA_ERROR_DRIVER;
+    }
+
+    roce_version = DEVX_GET(query_roce_address_out, reply, roce_address[0].roce_version);
+    DOCA_LOG(LOG_INFO, "roce_version = %d", roce_version);
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Map a DOCA verbs MTU enumerator to the matching MLX5_QPC_MTU_* value.
+ *
+ * @param mtu_size     DOCA MTU size to convert.
+ * @param prm_mtu_size Output: converted value. Left untouched for an unknown input.
+ * @return DOCA_SUCCESS, or DOCA_ERROR_INVALID_VALUE (after logging) for an unknown input.
+ */
+doca_error_t convert_doca_mtu_size_to_prm_mtu_size(doca_verbs_mtu_size mtu_size,
+                                                   uint32_t &prm_mtu_size) noexcept {
+    switch (mtu_size) {
+        case DOCA_VERBS_MTU_SIZE_256_BYTES:
+            prm_mtu_size = MLX5_QPC_MTU_256_BYTES;
+            return DOCA_SUCCESS;
+        case DOCA_VERBS_MTU_SIZE_512_BYTES:
+            prm_mtu_size = MLX5_QPC_MTU_512_BYTES;
+            return DOCA_SUCCESS;
+        case DOCA_VERBS_MTU_SIZE_1K_BYTES:
+            prm_mtu_size = MLX5_QPC_MTU_1K_BYTES;
+            return DOCA_SUCCESS;
+        case DOCA_VERBS_MTU_SIZE_2K_BYTES:
+            prm_mtu_size = MLX5_QPC_MTU_2K_BYTES;
+            return DOCA_SUCCESS;
+        case DOCA_VERBS_MTU_SIZE_4K_BYTES:
+            prm_mtu_size = MLX5_QPC_MTU_4K_BYTES;
+            return DOCA_SUCCESS;
+        case DOCA_VERBS_MTU_SIZE_RAW_ETHERNET:
+            prm_mtu_size = MLX5_QPC_MTU_RAW_ETHERNET_QP;
+            return DOCA_SUCCESS;
+        default:
+            break;
+    }
+
+    // Only an unrecognised enumerator reaches this point.
+    DOCA_LOG(LOG_ERR, "Can't convert invalid DOCA mtu size=%d", mtu_size);
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+/**
+ * Map an MLX5_QPC_MTU_* value to the matching DOCA verbs MTU enumerator.
+ *
+ * @param prm_mtu_size MLX5_QPC_MTU_* value to convert.
+ * @param mtu_size     Output: converted enumerator. Left untouched for an unknown input.
+ * @return DOCA_SUCCESS, or DOCA_ERROR_INVALID_VALUE (after logging) for an unknown input.
+ */
+doca_error_t convert_prm_mtu_size_to_doca_verbs_mtu_size(uint32_t prm_mtu_size,
+                                                         doca_verbs_mtu_size &mtu_size) noexcept {
+    switch (prm_mtu_size) {
+        case MLX5_QPC_MTU_256_BYTES:
+            mtu_size = DOCA_VERBS_MTU_SIZE_256_BYTES;
+            break;
+        case MLX5_QPC_MTU_512_BYTES:
+            mtu_size = DOCA_VERBS_MTU_SIZE_512_BYTES;
+            break;
+        case MLX5_QPC_MTU_1K_BYTES:
+            mtu_size = DOCA_VERBS_MTU_SIZE_1K_BYTES;
+            break;
+        case MLX5_QPC_MTU_2K_BYTES:
+            mtu_size = DOCA_VERBS_MTU_SIZE_2K_BYTES;
+            break;
+        case MLX5_QPC_MTU_4K_BYTES:
+            mtu_size = DOCA_VERBS_MTU_SIZE_4K_BYTES;
+            break;
+        case MLX5_QPC_MTU_RAW_ETHERNET_QP:
+            mtu_size = DOCA_VERBS_MTU_SIZE_RAW_ETHERNET;
+            break;
+        default:
+            // Report the rejected input. The output reference has not been assigned on this
+            // path, so printing it would show an unrelated (possibly uninitialized) value.
+            DOCA_LOG(LOG_ERR, "Can't convert invalid prm mtu size=%u", prm_mtu_size);
+            return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Return a pseudo-random integer in the closed interval spanned by the two bounds.
+ *
+ * Uses rand(), so the sequence depends on the seed installed with srand(). Reversed bounds
+ * are swapped instead of producing a zero or negative modulus, and the interval width is
+ * computed in 64 bits so it cannot overflow int.
+ *
+ * @param min One end of the interval (inclusive).
+ * @param max The other end of the interval (inclusive).
+ * @return A value v with min(min, max) <= v <= max(min, max).
+ */
+int random_in_range(int min, int max) {
+    if (max < min) {
+        const int swapped = min;
+        min = max;
+        max = swapped;
+    }
+
+    const long long span = static_cast<long long>(max) - min + 1;
+    return static_cast<int>(min + rand() % span);
+}
+
+/**
+ * Obtain the remote MAC address associated with a remote GID.
+ *
+ * A temporary ibv address handle is created for the destination, the mlx5dv view of that
+ * handle is initialised, and the rmac bytes of its address vector are copied out. The
+ * temporary handle is destroyed on every path after it has been created.
+ *
+ * @param pd_handle       Protection domain the temporary address handle is created on.
+ * @param local_port_num  Stored in the address handle attribute port_num.
+ * @param local_gid_index Stored in the address handle attribute grh.sgid_index.
+ * @param remote_gid      Destination GID, PRIV_DOCA_GID_BYTE_LENGTH bytes.
+ * @param hop_limit       Stored in the address handle attribute grh.hop_limit.
+ * @param is_global       Stored in the address handle attribute is_global.
+ * @param mac             Output buffer, receives PRIV_DOCA_MAC_BYTE_LENGTH bytes on success.
+ * @return DOCA_SUCCESS; the create/destroy wrapper status when one of them fails; or
+ *         DOCA_ERROR_DRIVER when the mlx5dv object cannot be initialised or has no av.
+ */
+doca_error_t resolve_remote_mac(ibv_pd *pd_handle, uint8_t local_port_num, uint32_t local_gid_index,
+                                uint8_t remote_gid[PRIV_DOCA_GID_BYTE_LENGTH], uint8_t hop_limit,
+                                uint8_t is_global,
+                                uint8_t mac[PRIV_DOCA_MAC_BYTE_LENGTH]) noexcept {
+    struct ibv_ah_attr ah_request = {};
+    ah_request.is_global = is_global;
+    ah_request.port_num = local_port_num;
+    ah_request.grh.hop_limit = hop_limit;
+    ah_request.grh.sgid_index = local_gid_index;
+    memcpy(ah_request.grh.dgid.raw, remote_gid, PRIV_DOCA_GID_BYTE_LENGTH);
+
+    struct ibv_ah *addr_handle;
+    const auto create_status =
+        doca_verbs_wrapper_ibv_create_ah(pd_handle, &ah_request, &addr_handle);
+    if (create_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create ibv_ah. ret=%d", create_status);
+        return create_status;
+    }
+
+    struct mlx5dv_ah dv_addr_handle {};
+    struct mlx5dv_obj dv_wrapper {};
+    dv_wrapper.ah.in = addr_handle;
+    dv_wrapper.ah.out = &dv_addr_handle;
+
+    const auto init_status = doca_verbs_wrapper_mlx5dv_init_obj(&dv_wrapper, MLX5DV_OBJ_AH);
+    const bool init_failed = (init_status != DOCA_SUCCESS);
+
+    // Both failure flavours release the handle first and then report their own message.
+    // The av check is only evaluated after a successful init.
+    if (init_failed || dv_addr_handle.av == nullptr) {
+        const auto cleanup_status = doca_verbs_wrapper_ibv_destroy_ah(addr_handle);
+        if (cleanup_status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy ibv_ah. ret=%d", cleanup_status);
+        }
+
+        if (init_failed) {
+            DOCA_LOG(LOG_ERR, "Failed to initialize mlx5dv_ah from ibv_ah. ret=%d", init_status);
+        } else {
+            DOCA_LOG(LOG_ERR, "Failed to initialize mlx5dv_ah from ibv_ah mlx5dv_ah::av is NULL");
+        }
+        return DOCA_ERROR_DRIVER;
+    }
+
+    memcpy(mac, dv_addr_handle.av->rmac, PRIV_DOCA_MAC_BYTE_LENGTH);
+
+    // A failed destroy is reported to the caller even though mac has already been filled.
+    const auto release_status = doca_verbs_wrapper_ibv_destroy_ah(addr_handle);
+    if (release_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy ibv_ah. ret=%d", release_status);
+        return release_status;
+    }
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Map an MLX5_QPC_STATE_* value to the matching DOCA verbs QP state.
+ *
+ * @param qp_state MLX5_QPC_STATE_* value to convert.
+ * @param state    Output: converted state. Left untouched for an unknown input.
+ * @return DOCA_SUCCESS, or DOCA_ERROR_INVALID_VALUE (after logging) for an unknown input.
+ */
+doca_error_t convert_prm_qp_state_to_doca_verbs_qp_state(uint32_t qp_state,
+                                                         doca_verbs_qp_state &state) {
+    switch (qp_state) {
+        case MLX5_QPC_STATE_RST:
+            state = DOCA_VERBS_QP_STATE_RST;
+            return DOCA_SUCCESS;
+        case MLX5_QPC_STATE_INIT:
+            state = DOCA_VERBS_QP_STATE_INIT;
+            return DOCA_SUCCESS;
+        case MLX5_QPC_STATE_RTR:
+            state = DOCA_VERBS_QP_STATE_RTR;
+            return DOCA_SUCCESS;
+        case MLX5_QPC_STATE_RTS:
+            state = DOCA_VERBS_QP_STATE_RTS;
+            return DOCA_SUCCESS;
+        case MLX5_QPC_STATE_ERR:
+            state = DOCA_VERBS_QP_STATE_ERR;
+            return DOCA_SUCCESS;
+        default:
+            break;
+    }
+
+    // Only an unrecognised state value reaches this point.
+    DOCA_LOG(LOG_ERR, "Can't convert invalid prm qp state=%d", qp_state);
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+} /* namespace */
+
+/**********************************************************************************************************************
+ * doca_verbs Member Functions
+ *********************************************************************************************************************/
+
+/**
+ * Tell whether a QP state value is one of the known DOCA verbs states.
+ *
+ * @param state State value taken from a QP attribute structure.
+ * @return true for RST, INIT, RTR, RTS and ERR; false (after logging) for anything else.
+ */
+bool doca_verbs_qp::is_qp_attr_state_valid(enum doca_verbs_qp_state state) noexcept {
+    bool known_state = false;
+
+    switch (state) {
+        case DOCA_VERBS_QP_STATE_RST:
+        case DOCA_VERBS_QP_STATE_INIT:
+        case DOCA_VERBS_QP_STATE_RTR:
+        case DOCA_VERBS_QP_STATE_RTS:
+        case DOCA_VERBS_QP_STATE_ERR:
+            known_state = true;
+            break;
+        default:
+            DOCA_LOG(LOG_ERR, "state is invalid (value is %u)", state);
+            break;
+    }
+
+    return known_state;
+}
+
+/**
+ * Tell whether a path MTU value is accepted for a QP attribute.
+ *
+ * @param path_mtu MTU value taken from a QP attribute structure.
+ * @return true for the 256, 512, 1K, 2K and 4K byte sizes; false (after logging) for any
+ *         other value.
+ */
+bool doca_verbs_qp::is_qp_attr_path_mtu_valid(enum doca_verbs_mtu_size path_mtu) noexcept {
+    bool accepted_mtu = false;
+
+    switch (path_mtu) {
+        case DOCA_VERBS_MTU_SIZE_256_BYTES:
+        case DOCA_VERBS_MTU_SIZE_512_BYTES:
+        case DOCA_VERBS_MTU_SIZE_1K_BYTES:
+        case DOCA_VERBS_MTU_SIZE_2K_BYTES:
+        case DOCA_VERBS_MTU_SIZE_4K_BYTES:
+            accepted_mtu = true;
+            break;
+        default:
+            DOCA_LOG(LOG_ERR, "path_mtu is invalid (value is %u)", path_mtu);
+            break;
+    }
+
+    return accepted_mtu;
+}
+
+/**
+ * Normalise a packet sequence number to 24 bits.
+ *
+ * No PSN value is treated as a failure: a value that does not fit in 24 bits is logged and
+ * truncated to its low 24 bits.
+ *
+ * @param psn PSN requested by the caller.
+ * @return psn unchanged when it fits in 24 bits, otherwise psn & 0xffffff.
+ */
+uint32_t doca_verbs_qp::is_qp_attr_queue_psn_valid(uint32_t psn) noexcept {
+    const uint32_t overflow_bits = psn & ~0xffffff;
+    if (overflow_bits == 0) {
+        return psn;
+    }
+
+    DOCA_LOG(LOG_ERR, "PSN value overflow (max is %x). Masking to 24 bits", 0xffffff);
+    return psn & 0xffffff;
+}
+
+/**
+ * Tell whether an address-handle address type is one of the known DOCA verbs types.
+ *
+ * @param addr_type Address type taken from an address-handle attribute.
+ * @return true for IPv4, IPv6, IB_GRH and IB_NO_GRH; false (after logging) otherwise.
+ */
+bool doca_verbs_qp::is_qp_attr_ah_add_type_valid(enum doca_verbs_addr_type addr_type) noexcept {
+    bool known_type = false;
+
+    switch (addr_type) {
+        case DOCA_VERBS_ADDR_TYPE_IPv4:
+        case DOCA_VERBS_ADDR_TYPE_IPv6:
+        case DOCA_VERBS_ADDR_TYPE_IB_GRH:
+        case DOCA_VERBS_ADDR_TYPE_IB_NO_GRH:
+            known_type = true;
+            break;
+        default:
+            DOCA_LOG(LOG_ERR, "addr_type is invalid (value is %u)", addr_type);
+            break;
+    }
+
+    return known_type;
+}
+
+/**
+ * Tell whether a source GID index fits in the device GID table.
+ *
+ * @param sgid_index Source GID index taken from an address-handle attribute.
+ * @return true when sgid_index < m_gid_table_size, false (after logging) otherwise.
+ */
+bool doca_verbs_qp::is_qp_attr_ah_sgid_index_valid(uint8_t sgid_index) noexcept {
+    if (sgid_index >= m_verbs_device_attr->m_gid_table_size) {
+        // Report the exclusive bound that is actually enforced. Printing "size - 1" after
+        // "less than" was off by one and wraps around when the table size is 0.
+        DOCA_LOG(LOG_ERR, "sgid_index should be less than %u (value is %u)",
+                 m_verbs_device_attr->m_gid_table_size, sgid_index);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Tell whether a P_Key index fits in the device P_Key table.
+ *
+ * NOTE(review): assumes m_max_pkeys is the number of table entries, so the valid indices
+ * are 0 .. m_max_pkeys - 1 -- confirm against the code that fills the device attributes.
+ *
+ * @param pkey_index P_Key index taken from a QP attribute structure.
+ * @return true when pkey_index < m_max_pkeys, false (after logging) otherwise.
+ */
+bool doca_verbs_qp::is_qp_attr_pkey_index_valid(uint16_t pkey_index) noexcept {
+    // Exclusive bound, matching both the log text and the sgid_index check. The previous
+    // '>' comparison accepted an index equal to m_max_pkeys.
+    if (pkey_index >= m_verbs_device_attr->m_max_pkeys) {
+        DOCA_LOG(LOG_ERR, "pkey_index should be less than %u (value is %u)",
+                 m_verbs_device_attr->m_max_pkeys, pkey_index);
+        return false;
+    }
+
+    return true;
+}
+
+/**
+ * Tell whether a port number is within the physical port range of the device.
+ *
+ * @param port_num Port number taken from a QP attribute structure.
+ * @return true when 1 <= port_num <= m_phys_port_cnt, false (after logging) otherwise.
+ */
+bool doca_verbs_qp::is_qp_attr_port_num_valid(uint16_t port_num) noexcept {
+    const bool within_range =
+        (port_num <= m_verbs_device_attr->m_phys_port_cnt) && (port_num >= 1);
+    if (within_range) {
+        return true;
+    }
+
+    DOCA_LOG(LOG_ERR, "port_num should be from %u to %u (value is %u)", 1,
+             m_verbs_device_attr->m_phys_port_cnt, port_num);
+    return false;
+}
+
+/**
+ * Validate the fields of a QP attribute structure that are selected by attr_mask.
+ *
+ * Only fields whose DOCA_VERBS_QP_ATTR_* bit is set are inspected. The two PSN fields are
+ * never rejected; they are rewritten in place with their 24-bit masked value.
+ *
+ * @param verbs_qp_attr Attributes to validate. rq_psn / sq_psn may be modified.
+ * @param attr_mask     DOCA_VERBS_QP_ATTR_* bits selecting the fields to check.
+ * @return true when every selected field is valid, false otherwise (the failing check logs
+ *         the reason).
+ */
+bool doca_verbs_qp::is_qp_attr_valid(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                     int attr_mask) noexcept {
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_CURRENT_STATE) &&
+        !is_qp_attr_state_valid(verbs_qp_attr->current_state))
+        return false;
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_NEXT_STATE) &&
+        !is_qp_attr_state_valid(verbs_qp_attr->next_state))
+        return false;
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_PATH_MTU) &&
+        !is_qp_attr_path_mtu_valid(verbs_qp_attr->path_mtu))
+        return false;
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_RQ_PSN))
+        verbs_qp_attr->rq_psn = is_qp_attr_queue_psn_valid(verbs_qp_attr->rq_psn);
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_SQ_PSN))
+        verbs_qp_attr->sq_psn = is_qp_attr_queue_psn_valid(verbs_qp_attr->sq_psn);
+
+    if (attr_mask & DOCA_VERBS_QP_ATTR_AH_ATTR) {
+        // The address handle is about to be dereferenced; reject a missing one instead of
+        // crashing on a null pointer.
+        if (!verbs_qp_attr->ah_attr) {
+            DOCA_LOG(LOG_ERR, "AH_ATTR mask is enabled but ah_attr=nullptr");
+            return false;
+        }
+        if (!is_qp_attr_ah_add_type_valid(verbs_qp_attr->ah_attr->addr_type)) return false;
+        if (!is_qp_attr_ah_sgid_index_valid(verbs_qp_attr->ah_attr->sgid_index)) return false;
+    }
+
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_PKEY_INDEX) &&
+        !is_qp_attr_pkey_index_valid(verbs_qp_attr->pkey_index))
+        return false;
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_PORT_NUM) &&
+        !is_qp_attr_port_num_valid(verbs_qp_attr->port_num))
+        return false;
+
+    return true;
+}
+
+/** Return the QP state cached in m_current_state; the device is not queried. */
+doca_verbs_qp_state doca_verbs_qp::get_current_state() const noexcept {
+    return m_current_state;
+}
+
+/**
+ * Create the DevX QP object (RC service type) and record its QP number.
+ *
+ * The send side is configured only when m_sq_size_wqebb > 0 (otherwise no_sq is set). The
+ * receive side uses the SRQ from the init attributes when one is given, a regular RQ when
+ * m_rq_size > 0, and a zero-size RQ otherwise. On success m_qp_obj and m_qp_num are filled
+ * and m_current_state becomes DOCA_VERBS_QP_STATE_RST.
+ *
+ * @param uar_id             Written to the qpc uar_page field.
+ * @param log_rq_size        Written to log_rq_size when a regular RQ is used.
+ * @param log_sq_size_wqebb  Written to log_sq_size when an SQ is used.
+ * @param log_stride         Written to log_rq_stride when a regular RQ is used.
+ * @param dbr_umem_offset    Written to the 64-bit dbr_addr field.
+ * @param dbr_umem_id        Written to dbr_umem_id (dbr_umem_valid is set to 1).
+ * @param wq_umem_id         Written to wq_umem_id (wq_umem_valid is set to 1).
+ * @param verbs_qp_init_attr Source of user_index, send_cq, receive_cq, srq and
+ *                           core_direct_master.
+ * @return DOCA_SUCCESS; DOCA_ERROR_DRIVER when the PD cannot be resolved;
+ *         DOCA_ERROR_INVALID_VALUE when a needed CQ is null; or the wrapper status when the
+ *         object creation fails.
+ */
+doca_error_t doca_verbs_qp::create_qp_obj(
+    uint32_t uar_id, uint32_t log_rq_size, uint32_t log_sq_size_wqebb, uint32_t log_stride,
+    uint64_t dbr_umem_offset, uint32_t dbr_umem_id, uint32_t wq_umem_id,
+    struct doca_verbs_qp_init_attr &verbs_qp_init_attr) noexcept {
+    create_qp_in create_in{0};
+    create_qp_out create_out{0};
+
+    void *qpc = MLX5_ADDR_OF(create_qp_in, create_in, qpc);
+
+    DEVX_SET(create_qp_in, create_in, opcode, MLX5_CMD_OP_CREATE_QP);
+    DEVX_SET(qpc, qpc, st, MLX5_QPC_ST_RC);
+
+    struct mlx5dv_pd dvpd;
+    struct mlx5dv_obj dv_obj;
+    // Query pdn
+    memset(&dv_obj, 0, sizeof(dv_obj));
+    dv_obj.pd.in = m_pd;
+    dv_obj.pd.out = &dvpd;
+
+    auto ret = doca_verbs_wrapper_mlx5dv_init_obj(&dv_obj, MLX5DV_OBJ_PD);
+    if (ret != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Error in mlx5dv PD initialization");
+        return DOCA_ERROR_DRIVER;
+    }
+
+    DEVX_SET(qpc, qpc, pd, dvpd.pdn);
+
+    DEVX_SET(qpc, qpc, user_index, verbs_qp_init_attr.user_index);
+    DEVX_SET(qpc, qpc, uar_page, uar_id);
+
+    if (m_sq_size_wqebb > 0) {
+        if (verbs_qp_init_attr.send_cq == nullptr) {
+            DOCA_LOG(LOG_ERR, "Failed to create QP. Send CQ is null");
+            return DOCA_ERROR_INVALID_VALUE;
+        }
+        DEVX_SET(qpc, qpc, cqn_snd, verbs_qp_init_attr.send_cq->get_cqn());
+        DEVX_SET(qpc, qpc, log_sq_size, log_sq_size_wqebb);
+    } else {
+        DEVX_SET(qpc, qpc, no_sq, 1);
+    }
+
+    if ((m_rq_size > 0) || (verbs_qp_init_attr.srq != nullptr)) {
+        if (verbs_qp_init_attr.receive_cq == nullptr) {
+            DOCA_LOG(LOG_ERR, "Failed to create QP. Receive CQ is null");
+            return DOCA_ERROR_INVALID_VALUE;
+        }
+
+        DEVX_SET(qpc, qpc, cqn_rcv, verbs_qp_init_attr.receive_cq->get_cqn());
+
+        if (verbs_qp_init_attr.srq != nullptr) {
+            /* Case of SRQ */
+            DEVX_SET(qpc, qpc, srqn_rmpn_xrqn, verbs_qp_init_attr.srq->get_srqn());
+            DEVX_SET(qpc, qpc, rq_type, MLX5_QPC_RQ_TYPE_SRQ_RMP_XRC_SRQ_XRQ);
+            m_srq = verbs_qp_init_attr.srq;
+        } else if (m_rq_size > 0) {
+            /* Case of regular RQ */
+            DEVX_SET(qpc, qpc, log_rq_stride, log_stride);
+            DEVX_SET(qpc, qpc, log_rq_size, log_rq_size);
+            DEVX_SET(qpc, qpc, rq_type, MLX5_QPC_RQ_TYPE_REGULAR);
+        }
+    } else {
+        /* Case of no RQ */
+        DEVX_SET(qpc, qpc, rq_type, MLX5_QPC_RQ_TYPE_ZERO_SIZE_RQ);
+    }
+
+    // DEVX_SET(qpc, qpc, cs_req, 0);            // Disable CS Request
+    // DEVX_SET(qpc, qpc, cs_res, 0);            // Disable CS Response
+
+    DEVX_SET(qpc, qpc, dbr_umem_valid, 1);
+    DEVX_SET(qpc, qpc, dbr_umem_id, dbr_umem_id);
+    DEVX_SET64(qpc, qpc, dbr_addr, dbr_umem_offset);
+    // cd_master is a flag, so it is written with the bit-field setter. The 64-bit setter
+    // stores a whole 64-bit word at the field position and would overwrite the neighbouring
+    // fields instead of setting the flag.
+    // NOTE(review): assumes cd_master is a sub-dword field of the qpc layout, as in the
+    // public mlx5 interface definition -- confirm against this project's layout header.
+    DEVX_SET(qpc, qpc, cd_master, verbs_qp_init_attr.core_direct_master);
+    DEVX_SET(create_qp_in, create_in, wq_umem_id, wq_umem_id);
+    DEVX_SET(create_qp_in, create_in, wq_umem_valid, 1);
+
+    /* Since wq_umem_valid == 1, FW deduces page size from umem and this field is reserved */
+    DEVX_SET(qpc, qpc, log_page_size, 0);
+
+    /* Create DevX object */
+    auto status = doca_verbs_wrapper_mlx5dv_devx_obj_create(
+        m_ibv_ctx, create_in, sizeof(create_in), create_out, sizeof(create_out), &m_qp_obj);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create QP. DevX error, syndrome=0x%x",
+                 DEVX_GET(nop_out, create_out, syndrome));
+        return status;
+    }
+
+    m_qp_num = DEVX_GET(create_qp_out, create_out, qpn);
+    m_current_state = DOCA_VERBS_QP_STATE_RST;
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Move the QP from Reset to Init with a RST2INIT_QP DevX command.
+ *
+ * port_num and pkey_index are always programmed. rwe follows allow_remote_write and
+ * rae / atomic_mode follow allow_remote_atomic regardless of attr_mask, while rre is set
+ * only when the ALLOW_REMOTE_READ bit is present and allow_remote_read == 1.
+ *
+ * @param verbs_qp_attr Attribute values to program.
+ * @param attr_mask     DOCA_VERBS_QP_ATTR_* bits; validated with is_rst2init_attrs_valid().
+ * @return DOCA_SUCCESS (m_current_state becomes INIT); DOCA_ERROR_INVALID_VALUE for a bad
+ *         mask; or the wrapper status when the modify command fails.
+ */
+doca_error_t doca_verbs_qp::rst2init(struct doca_verbs_qp_attr &verbs_qp_attr,
+                                     int attr_mask) noexcept {
+    rst2init_qp_in in{0};
+    rst2init_qp_out out{0};
+
+    if (!is_rst2init_attrs_valid(attr_mask, m_qp_type)) {
+        DOCA_LOG(LOG_ERR, "rst2init attrs are invalid");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    void *qpc = MLX5_ADDR_OF(rst2init_qp_in, &in, qpc);
+    DEVX_SET(rst2init_qp_in, &in, opcode, MLX5_CMD_OP_RST2INIT_QP);
+    DEVX_SET(rst2init_qp_in, &in, qpn, m_qp_num);
+    DEVX_SET(qpc, qpc, primary_address_path.vhca_port_num, verbs_qp_attr.port_num);
+    DEVX_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
+    // DEVX_SET(qpc, qpc, counter_set_id, 0x0);  // Not connected to a counter set
+    DEVX_SET(qpc, qpc, primary_address_path.pkey_index, verbs_qp_attr.pkey_index);
+
+    // Programmed once; the original repeated this exact block a second time below.
+    if (verbs_qp_attr.allow_remote_write == 1) {
+        DEVX_SET(qpc, qpc, rwe, 1);
+    }
+
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ) &&
+        verbs_qp_attr.allow_remote_read == 1) {
+        DEVX_SET(qpc, qpc, rre, 1);
+    }
+
+    if (verbs_qp_attr.allow_remote_atomic > DOCA_VERBS_QP_ATOMIC_MODE_NONE) {
+        DEVX_SET(qpc, qpc, rae, 1);
+        DEVX_SET(qpc, qpc, atomic_mode, verbs_qp_attr.allow_remote_atomic);
+    }
+
+    auto ret =
+        doca_verbs_wrapper_mlx5dv_devx_obj_modify(m_qp_obj, in, sizeof(in), out, sizeof(out));
+    if (ret != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to modify QP rst2init");
+        return ret;
+    }
+
+    m_current_state = DOCA_VERBS_QP_STATE_INIT;
+
+    DOCA_LOG(LOG_INFO, "DOCA IB Verbs QP %p: has been successfully moved to Init state", this);
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Re-program a QP that is already in Init with an INIT2INIT_QP DevX command.
+ *
+ * port_num and pkey_index are always written to the command. The optional-parameter mask
+ * derived from attr_mask for the INIT2INIT transition is written alongside them.
+ *
+ * @param verbs_qp_attr Attribute values to program.
+ * @param attr_mask     DOCA_VERBS_QP_ATTR_* bits; validated with is_init2init_attrs_valid().
+ * @return DOCA_SUCCESS (m_current_state stays INIT); DOCA_ERROR_INVALID_VALUE for a bad
+ *         mask; or the wrapper status when the modify command fails.
+ */
+doca_error_t doca_verbs_qp::init2init(struct doca_verbs_qp_attr &verbs_qp_attr,
+                                      int attr_mask) noexcept {
+    if (!is_init2init_attrs_valid(attr_mask, m_qp_type)) {
+        DOCA_LOG(LOG_ERR, "init2init attrs are invalid");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    init2init_qp_in modify_in{0};
+    init2init_qp_out modify_out{0};
+    void *qp_ctx = MLX5_ADDR_OF(init2init_qp_in, &modify_in, qpc);
+
+    // Command header.
+    DEVX_SET(init2init_qp_in, &modify_in, opcode, MLX5_CMD_OP_INIT2INIT_QP);
+    DEVX_SET(init2init_qp_in, &modify_in, qpn, m_qp_num);
+
+    // Port and P_Key selection.
+    DEVX_SET(qpc, qp_ctx, primary_address_path.vhca_port_num, verbs_qp_attr.port_num);
+    DEVX_SET(qpc, qp_ctx, primary_address_path.pkey_index, verbs_qp_attr.pkey_index);
+
+    // Remote access rights.
+    if (verbs_qp_attr.allow_remote_write == 1) {
+        DEVX_SET(qpc, qp_ctx, rwe, 1);
+    }
+
+    const bool read_requested = (attr_mask & DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ) != 0;
+    if (read_requested && verbs_qp_attr.allow_remote_read == 1) {
+        DEVX_SET(qpc, qp_ctx, rre, 1);
+    }
+
+    if (verbs_qp_attr.allow_remote_atomic > DOCA_VERBS_QP_ATOMIC_MODE_NONE) {
+        DEVX_SET(qpc, qp_ctx, rae, 1);
+        DEVX_SET(qpc, qp_ctx, atomic_mode, verbs_qp_attr.allow_remote_atomic);
+    }
+
+    int opt_params{0};
+    convert_doca_verbs_qp_attr_mask_to_legal_mlx5_qp_opt_param_mask(attr_mask, opt_params,
+                                                                    DOCA_VERBS_QP_INIT2INIT);
+    DEVX_SET(init2init_qp_in, &modify_in, opt_param_mask, opt_params);
+
+    const auto modify_status = doca_verbs_wrapper_mlx5dv_devx_obj_modify(
+        m_qp_obj, modify_in, sizeof(modify_in), modify_out, sizeof(modify_out));
+    if (modify_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to modify QP init2init");
+        return modify_status;
+    }
+
+    m_current_state = DOCA_VERBS_QP_STATE_INIT;
+    DOCA_LOG(LOG_INFO, "DOCA IB Verbs QP %p: has been successfully moved to Init state", this);
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Move the QP from Init to Ready-To-Receive with an INIT2RTR_QP DevX command.
+ *
+ * Programs the receive PSN, remote QP number, MTU and the primary address path taken from
+ * verbs_qp_attr.ah_attr. For IPv4/IPv6 address types the remote MAC is resolved through a
+ * temporary address handle. On an Ethernet port whose GID entry reports RoCE v2 or newer, a
+ * random UDP source port from the device range and the DSCP (traffic_class >> 2) are
+ * programmed as well.
+ *
+ * @param verbs_qp_attr Attribute values to program; ah_attr must not be null because the
+ *                      address path is always read from it.
+ * @param attr_mask     DOCA_VERBS_QP_ATTR_* bits; validated with is_init2rtr_attrs_valid().
+ * @return DOCA_SUCCESS (m_current_state becomes RTR and m_addr_type is recorded);
+ *         DOCA_ERROR_INVALID_VALUE for a bad mask, a null ah_attr or an unknown MTU; or
+ *         the status of the failing helper / modify command.
+ */
+doca_error_t doca_verbs_qp::init2rtr(struct doca_verbs_qp_attr &verbs_qp_attr,
+                                     int attr_mask) noexcept {
+    if (!is_init2rtr_attrs_valid(attr_mask, m_qp_type)) {
+        DOCA_LOG(LOG_ERR, "init2rtr attrs are invalid");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_AH_ATTR) && !verbs_qp_attr.ah_attr) {
+        DOCA_LOG(LOG_ERR, "AH_ATTR mask is enabled but ah_attr=nullptr");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    // ah_attr is dereferenced below whether or not the AH_ATTR bit is set, so a null
+    // pointer has to be rejected in that case too.
+    if (!verbs_qp_attr.ah_attr) {
+        DOCA_LOG(LOG_ERR, "init2rtr requires ah_attr but ah_attr=nullptr");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    init2rtr_qp_in in{0};
+    init2rtr_qp_out out{0};
+
+    void *qpc = MLX5_ADDR_OF(init2rtr_qp_in, in, qpc);
+    DEVX_SET(init2rtr_qp_in, in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
+    DEVX_SET(init2rtr_qp_in, in, qpn, m_qp_num);
+    DEVX_SET(qpc, qpc, next_rcv_psn, verbs_qp_attr.rq_psn);
+    DEVX_SET(qpc, qpc, remote_qpn, verbs_qp_attr.dest_qp_num);
+    DEVX_SET(qpc, qpc, log_msg_max, sc_verbs_log_msg_max);
+
+    uint32_t prm_mtu{};
+    auto status = convert_doca_mtu_size_to_prm_mtu_size(verbs_qp_attr.path_mtu, prm_mtu);
+    if (status != DOCA_SUCCESS) return status;
+    DEVX_SET(qpc, qpc, mtu, prm_mtu);
+
+    if (attr_mask & DOCA_VERBS_QP_ATTR_MIN_RNR_TIMER)
+        DEVX_SET(qpc, qpc, min_rnr_nak, verbs_qp_attr.min_rnr_timer);
+    if (verbs_qp_attr.ah_attr->addr_type == DOCA_VERBS_ADDR_TYPE_IB_GRH)
+        DEVX_SET(qpc, qpc, primary_address_path.tclass, verbs_qp_attr.ah_attr->traffic_class);
+    DEVX_SET(qpc, qpc, primary_address_path.stat_rate, verbs_qp_attr.ah_attr->static_rate);
+
+    // Every address type except IB without GRH carries a remote GID.
+    if (verbs_qp_attr.ah_attr->addr_type != DOCA_VERBS_ADDR_TYPE_IB_NO_GRH) {
+        memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
+               verbs_qp_attr.ah_attr->gid.raw, sizeof(struct doca_verbs_gid));
+        DEVX_SET(qpc, qpc, primary_address_path.hop_limit, verbs_qp_attr.ah_attr->hop_limit);
+        DEVX_SET(qpc, qpc, primary_address_path.src_addr_index, verbs_qp_attr.ah_attr->sgid_index);
+    }
+
+    DEVX_SET(qpc, qpc, primary_address_path.rlid, verbs_qp_attr.ah_attr->dlid);
+    DEVX_SET(qpc, qpc, primary_address_path.sl, verbs_qp_attr.ah_attr->sl);
+
+    if ((verbs_qp_attr.ah_attr->addr_type == DOCA_VERBS_ADDR_TYPE_IPv4) ||
+        (verbs_qp_attr.ah_attr->addr_type == DOCA_VERBS_ADDR_TYPE_IPv6)) { /* ROCE */
+        uint8_t dest_mac[PRIV_DOCA_MAC_BYTE_LENGTH];
+        status =
+            resolve_remote_mac(m_pd, PRIV_DOCA_VERBS_PORT_NUM, verbs_qp_attr.ah_attr->sgid_index,
+                               verbs_qp_attr.ah_attr->gid.raw, verbs_qp_attr.ah_attr->hop_limit,
+                               verbs_qp_attr.ah_attr->is_global, dest_mac);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get remote MAC");
+            return status;
+        }
+
+        // The MAC is split across two fields: the two most significant bytes, then the rest.
+        memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32), dest_mac,
+               sc_verbs_mac_addr_2msbytes_len);
+        memcpy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_31_0),
+               dest_mac + sc_verbs_mac_addr_2msbytes_len,
+               sc_verbs_mac_addr_len - sc_verbs_mac_addr_2msbytes_len);
+    }
+
+    if (verbs_qp_attr.ah_attr->addr_type == DOCA_VERBS_ADDR_TYPE_IB_GRH) {
+        DEVX_SET(qpc, qpc, primary_address_path.grh, 1);
+    }
+
+    if (m_verbs_device_attr->m_port_type == MLX5_CAP_PORT_TYPE_ETH) {
+        uint8_t roce_version{};
+        status = query_roce_version(m_ibv_ctx, verbs_qp_attr.ah_attr->sgid_index, roce_version);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to query roce version");
+            return status;
+        }
+
+        if (roce_version >= MLX5_ROCE_ADDR_LAYOUT_ROCE_VERSION_VERSION_2_0) {
+            // Seed the C PRNG a single time. Reseeding with time(NULL) on every call hands
+            // the same source port to every QP connected within the same second.
+            static const bool prng_seeded = [] {
+                srand(time(NULL));
+                return true;
+            }();
+            (void)prng_seeded;
+
+            // generate a random udp_sport
+            uint16_t udp_sport = (uint16_t)random_in_range(m_verbs_device_attr->m_min_udp_sport,
+                                                           m_verbs_device_attr->m_max_udp_sport);
+            DOCA_LOG(LOG_INFO, "Generated udp_sport = %d", udp_sport);
+
+            DEVX_SET(qpc, qpc, primary_address_path.udp_sport, udp_sport);
+            DEVX_SET(qpc, qpc, primary_address_path.dscp,
+                     verbs_qp_attr.ah_attr->traffic_class >> 2);
+        }
+    }
+
+    DEVX_SET(qpc, qpc, primary_address_path.pkey_index, verbs_qp_attr.pkey_index);
+    if (verbs_qp_attr.allow_remote_write == 1) {
+        DEVX_SET(qpc, qpc, rwe, 1);
+    }
+
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ) &&
+        verbs_qp_attr.allow_remote_read == 1) {
+        DEVX_SET(qpc, qpc, rre, 1);
+    }
+
+    if (verbs_qp_attr.allow_remote_atomic > DOCA_VERBS_QP_ATOMIC_MODE_NONE) {
+        DEVX_SET(qpc, qpc, rae, 1);
+        DEVX_SET(qpc, qpc, atomic_mode, verbs_qp_attr.allow_remote_atomic);
+    }
+
+    int mlx5_opt_param_mask{0};
+    convert_doca_verbs_qp_attr_mask_to_legal_mlx5_qp_opt_param_mask(attr_mask, mlx5_opt_param_mask,
+                                                                    DOCA_VERBS_QP_INIT2RTR);
+    DEVX_SET(init2rtr_qp_in, in, opt_param_mask, mlx5_opt_param_mask);
+
+    auto ret =
+        doca_verbs_wrapper_mlx5dv_devx_obj_modify(m_qp_obj, in, sizeof(in), out, sizeof(out));
+    if (ret != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to modify QP init2rtr, syndrome=0x%x",
+                 DEVX_GET(nop_out, out, syndrome));
+        return ret;
+    }
+
+    m_current_state = DOCA_VERBS_QP_STATE_RTR;
+    m_addr_type = verbs_qp_attr.ah_attr->addr_type;
+
+    DOCA_LOG(LOG_INFO, "DOCA IB Verbs QP %p: has been successfully moved to RTR state", this);
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Move the QP from Ready-To-Receive to Ready-To-Send with an RTR2RTS_QP DevX command.
+ *
+ * The send PSN is always programmed. ack_timeout, retry_count, rnr_retry and min_rnr_nak
+ * are programmed only when their attr_mask bit is set; rwe and rae / atomic_mode follow the
+ * attribute values regardless of the mask.
+ *
+ * @param verbs_qp_attr Attribute values to program.
+ * @param attr_mask     DOCA_VERBS_QP_ATTR_* bits; validated with is_rtr2rts_attrs_valid().
+ * @return DOCA_SUCCESS (m_current_state becomes RTS); DOCA_ERROR_INVALID_VALUE for a bad
+ *         mask; or the wrapper status when the modify command fails.
+ */
+doca_error_t doca_verbs_qp::rtr2rts(struct doca_verbs_qp_attr &verbs_qp_attr,
+                                    int attr_mask) noexcept {
+    if (!is_rtr2rts_attrs_valid(attr_mask, m_qp_type)) {
+        DOCA_LOG(LOG_ERR, "rtr2rts attrs are invalid");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    rtr2rts_qp_in modify_in{0};
+    rtr2rts_qp_out modify_out{0};
+    void *qp_ctx = MLX5_ADDR_OF(rtr2rts_qp_in, &modify_in, qpc);
+
+    // Command header and the send PSN.
+    DEVX_SET(rtr2rts_qp_in, &modify_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
+    DEVX_SET(rtr2rts_qp_in, &modify_in, qpn, m_qp_num);
+    DEVX_SET(qpc, qp_ctx, next_send_psn, verbs_qp_attr.sq_psn);
+
+    // Mask-gated retry / timeout settings.
+    if (attr_mask & DOCA_VERBS_QP_ATTR_ACK_TIMEOUT) {
+        DEVX_SET(qpc, qp_ctx, primary_address_path.ack_timeout, verbs_qp_attr.ack_timeout);
+    }
+    if (attr_mask & DOCA_VERBS_QP_ATTR_RETRY_CNT) {
+        DEVX_SET(qpc, qp_ctx, retry_count, verbs_qp_attr.retry_cnt);
+    }
+    if (attr_mask & DOCA_VERBS_QP_ATTR_RNR_RETRY) {
+        DEVX_SET(qpc, qp_ctx, rnr_retry, verbs_qp_attr.rnr_retry);
+    }
+    if (attr_mask & DOCA_VERBS_QP_ATTR_MIN_RNR_TIMER) {
+        DEVX_SET(qpc, qp_ctx, min_rnr_nak, verbs_qp_attr.min_rnr_timer);
+    }
+
+    // Remote access rights.
+    if (verbs_qp_attr.allow_remote_write == 1) {
+        DEVX_SET(qpc, qp_ctx, rwe, 1);
+    }
+    if (verbs_qp_attr.allow_remote_atomic > DOCA_VERBS_QP_ATOMIC_MODE_NONE) {
+        DEVX_SET(qpc, qp_ctx, rae, 1);
+        DEVX_SET(qpc, qp_ctx, atomic_mode, verbs_qp_attr.allow_remote_atomic);
+    }
+
+    DEVX_SET(qpc, qp_ctx, log_ack_req_freq, 0x0);  // 8
+
+    int opt_params{0};
+    convert_doca_verbs_qp_attr_mask_to_legal_mlx5_qp_opt_param_mask(attr_mask, opt_params,
+                                                                    DOCA_VERBS_QP_RTR2RTS);
+    DEVX_SET(rtr2rts_qp_in, &modify_in, opt_param_mask, opt_params);
+
+    const auto modify_status = doca_verbs_wrapper_mlx5dv_devx_obj_modify(
+        m_qp_obj, modify_in, sizeof(modify_in), modify_out, sizeof(modify_out));
+    if (modify_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to modify QP rtr2rts");
+        return modify_status;
+    }
+
+    m_current_state = DOCA_VERBS_QP_STATE_RTS;
+    DOCA_LOG(LOG_INFO, "DOCA IB Verbs QP %p: has been successfully moved to RTS state", this);
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Re-program a QP that is already in Ready-To-Send with an RTS2RTS_QP DevX command.
+ *
+ * When the AH_ATTR bit is set the source GID index is updated. The DSCP is also updated
+ * when the device reports RTS2RTS DSCP support, the port is Ethernet and the GID entry
+ * reports RoCE v2 or newer.
+ *
+ * @param verbs_qp_attr Attribute values to program; ah_attr must not be null when the
+ *                      AH_ATTR bit is set.
+ * @param attr_mask     DOCA_VERBS_QP_ATTR_* bits; validated with is_rts2rts_attrs_valid().
+ * @return DOCA_SUCCESS (m_current_state stays RTS); DOCA_ERROR_INVALID_VALUE for a bad mask
+ *         or a missing ah_attr; or the status of the failing query / modify command.
+ */
+doca_error_t doca_verbs_qp::rts2rts(struct doca_verbs_qp_attr &verbs_qp_attr,
+                                    int attr_mask) noexcept {
+    if (!is_rts2rts_attrs_valid(attr_mask, m_qp_type)) {
+        DOCA_LOG(LOG_ERR, "rts2rts attrs are invalid");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    const bool ah_requested = (attr_mask & DOCA_VERBS_QP_ATTR_AH_ATTR) != 0;
+    if (ah_requested && !verbs_qp_attr.ah_attr) {
+        DOCA_LOG(LOG_ERR, "AH_ATTR mask is enabled but ah_attr=nullptr");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    rts2rts_qp_in modify_in{0};
+    rts2rts_qp_out modify_out{0};
+    void *qp_ctx = MLX5_ADDR_OF(rts2rts_qp_in, modify_in, qpc);
+
+    DEVX_SET(rts2rts_qp_in, modify_in, opcode, MLX5_CMD_OP_RTS2RTS_QP);
+    DEVX_SET(rts2rts_qp_in, modify_in, qpn, m_qp_num);
+
+    if (attr_mask & DOCA_VERBS_QP_ATTR_MIN_RNR_TIMER) {
+        DEVX_SET(qpc, qp_ctx, min_rnr_nak, verbs_qp_attr.min_rnr_timer);
+    }
+
+    // Remote access rights.
+    if (verbs_qp_attr.allow_remote_write == 1) {
+        DEVX_SET(qpc, qp_ctx, rwe, 1);
+    }
+    if ((attr_mask & DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ) &&
+        verbs_qp_attr.allow_remote_read == 1) {
+        DEVX_SET(qpc, qp_ctx, rre, 1);
+    }
+    if (verbs_qp_attr.allow_remote_atomic > DOCA_VERBS_QP_ATOMIC_MODE_NONE) {
+        DEVX_SET(qpc, qp_ctx, rae, 1);
+        DEVX_SET(qpc, qp_ctx, atomic_mode, verbs_qp_attr.allow_remote_atomic);
+    }
+
+    if (ah_requested) {
+        DEVX_SET(qpc, qp_ctx, primary_address_path.src_addr_index,
+                 verbs_qp_attr.ah_attr->sgid_index);
+
+        const bool dscp_updatable = m_verbs_device_attr->m_is_rts2rts_qp_dscp_supported &&
+                                    m_verbs_device_attr->m_port_type == MLX5_CAP_PORT_TYPE_ETH;
+        if (dscp_updatable) {
+            uint8_t roce_version{};
+            const auto query_status =
+                query_roce_version(m_ibv_ctx, verbs_qp_attr.ah_attr->sgid_index, roce_version);
+            if (query_status != DOCA_SUCCESS) {
+                DOCA_LOG(LOG_ERR, "Failed to query roce version");
+                return query_status;
+            }
+
+            if (roce_version >= MLX5_ROCE_ADDR_LAYOUT_ROCE_VERSION_VERSION_2_0) {
+                DEVX_SET(qpc, qp_ctx, primary_address_path.dscp,
+                         verbs_qp_attr.ah_attr->traffic_class >> 2);
+            }
+        }
+    }
+
+    int opt_params{0};
+    convert_doca_verbs_qp_attr_mask_to_legal_mlx5_qp_opt_param_mask(attr_mask, opt_params,
+                                                                    DOCA_VERBS_QP_RTS2RTS);
+    DEVX_SET(rts2rts_qp_in, modify_in, opt_param_mask, opt_params);
+
+    const auto modify_status = doca_verbs_wrapper_mlx5dv_devx_obj_modify(
+        m_qp_obj, modify_in, sizeof(modify_in), modify_out, sizeof(modify_out));
+    if (modify_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to modify QP rts2rts");
+        return modify_status;
+    }
+
+    m_current_state = DOCA_VERBS_QP_STATE_RTS;
+    DOCA_LOG(LOG_INFO, "IB Verbs QP %p: has been successfully moved to RTS state", this);
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Move the QP to the Error state with a QP_2ERR DevX command.
+ *
+ * @param verbs_qp_attr Not read by this transition.
+ * @param attr_mask     DOCA_VERBS_QP_ATTR_* bits; validated with is_X2err_attrs_valid().
+ * @return DOCA_SUCCESS (m_current_state becomes ERR); DOCA_ERROR_INVALID_VALUE for a bad
+ *         mask; or the wrapper status when the modify command fails.
+ */
+doca_error_t doca_verbs_qp::qp2err(struct doca_verbs_qp_attr &verbs_qp_attr,
+                                   int attr_mask) noexcept {
+    if (!is_X2err_attrs_valid(attr_mask)) {
+        DOCA_LOG(LOG_ERR, "X2err attrs are invalid");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    qp_2err_in modify_in{0};
+    qp_2err_out modify_out{0};
+    DEVX_SET(qp_2err_in, modify_in, opcode, MLX5_CMD_OP_QP_2ERR);
+    DEVX_SET(qp_2err_in, modify_in, qpn, m_qp_num);
+
+    const auto modify_status = doca_verbs_wrapper_mlx5dv_devx_obj_modify(
+        m_qp_obj, modify_in, sizeof(modify_in), modify_out, sizeof(modify_out));
+    if (modify_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to modify QP 2err");
+        return modify_status;
+    }
+
+    m_current_state = DOCA_VERBS_QP_STATE_ERR;
+    DOCA_LOG(LOG_INFO, "DOCA IB Verbs QP %p: has been successfully moved to Error state", this);
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Move the QP to the Reset state with a QP_2RST DevX command.
+ *
+ * @param verbs_qp_attr Not read by this transition.
+ * @param attr_mask     DOCA_VERBS_QP_ATTR_* bits; validated with is_X2rst_attrs_valid().
+ * @return DOCA_SUCCESS (m_current_state becomes RST); DOCA_ERROR_INVALID_VALUE for a bad
+ *         mask; or the wrapper status when the modify command fails.
+ */
+doca_error_t doca_verbs_qp::qp2rst(struct doca_verbs_qp_attr &verbs_qp_attr,
+                                   int attr_mask) noexcept {
+    if (!is_X2rst_attrs_valid(attr_mask)) {
+        DOCA_LOG(LOG_ERR, "X2rst attrs are invalid");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    qp_2rst_in modify_in{0};
+    qp_2rst_out modify_out{0};
+    DEVX_SET(qp_2rst_in, modify_in, opcode, MLX5_CMD_OP_QP_2RST);
+    DEVX_SET(qp_2rst_in, modify_in, qpn, m_qp_num);
+
+    const auto modify_status = doca_verbs_wrapper_mlx5dv_devx_obj_modify(
+        m_qp_obj, modify_in, sizeof(modify_in), modify_out, sizeof(modify_out));
+    if (modify_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to modify QP 2rst");
+        return modify_status;
+    }
+
+    m_current_state = DOCA_VERBS_QP_STATE_RST;
+    DOCA_LOG(LOG_INFO, "DOCA IB Verbs QP %p: has been successfully moved to Reset state", this);
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Query the device for the current QP context and report it to the caller.
+ *
+ * verbs_qp_attr      - filled from the queried QP context (state, MTU, PSNs, remote QPN, primary
+ *                      address path fields, retry settings, remote access flags). If
+ *                      verbs_qp_attr.ah_attr is non-NULL, the address handle attributes are filled
+ *                      as well.
+ * verbs_qp_init_attr - filled from the values cached in this object at creation time, except
+ *                      user_index which is read from the queried QP context.
+ *
+ * Returns DOCA_ERROR_DRIVER if the DEVX query fails, DOCA_ERROR_UNEXPECTED if the device reports
+ * a state or MTU value that cannot be converted, and DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp::query_qp(struct doca_verbs_qp_attr &verbs_qp_attr,
+                                     struct doca_verbs_qp_init_attr &verbs_qp_init_attr) noexcept {
+    query_qp_in in{0};
+    query_qp_out out{0};
+
+    DEVX_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP);
+    DEVX_SET(query_qp_in, in, qpn, m_qp_num);
+
+    auto ret = doca_verbs_wrapper_mlx5dv_devx_obj_query(m_qp_obj, in, sizeof(in), out, sizeof(out));
+    if (ret != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to query QP");
+        return DOCA_ERROR_DRIVER;
+    }
+
+    /* Set verbs_qp_attr with the QP information */
+    const void *qpc = MLX5_ADDR_OF(query_qp_out, out, qpc);
+    auto prm_qp_state = DEVX_GET(qpc, qpc, state);
+
+    auto status =
+        convert_prm_qp_state_to_doca_verbs_qp_state(prm_qp_state, verbs_qp_attr.current_state);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to get state, invalid qp state");
+        return DOCA_ERROR_UNEXPECTED;
+    }
+
+    /* A query reports a settled QP, so next_state mirrors current_state */
+    verbs_qp_attr.next_state = verbs_qp_attr.current_state;
+
+    auto prm_mtu_size = DEVX_GET(qpc, qpc, mtu);
+    status = convert_prm_mtu_size_to_doca_verbs_mtu_size(prm_mtu_size, verbs_qp_attr.path_mtu);
+    if (status != DOCA_SUCCESS) {
+        /* Message names the MTU (it previously repeated the "state" wording of the check above) */
+        DOCA_LOG(LOG_ERR, "Failed to get MTU size, invalid MTU size");
+        return DOCA_ERROR_UNEXPECTED;
+    }
+
+    verbs_qp_attr.rq_psn = DEVX_GET(qpc, qpc, next_rcv_psn);
+    verbs_qp_attr.sq_psn = DEVX_GET(qpc, qpc, next_send_psn);
+    verbs_qp_attr.dest_qp_num = DEVX_GET(qpc, qpc, remote_qpn);
+    verbs_qp_attr.pkey_index = DEVX_GET(qpc, qpc, primary_address_path.pkey_index);
+    verbs_qp_attr.port_num = DEVX_GET(qpc, qpc, primary_address_path.vhca_port_num);
+    verbs_qp_attr.ack_timeout = DEVX_GET(qpc, qpc, primary_address_path.ack_timeout);
+    verbs_qp_attr.retry_cnt = DEVX_GET(qpc, qpc, retry_count);
+    verbs_qp_attr.rnr_retry = DEVX_GET(qpc, qpc, rnr_retry);
+    verbs_qp_attr.min_rnr_timer = DEVX_GET(qpc, qpc, min_rnr_nak);
+    verbs_qp_attr.allow_remote_write = DEVX_GET(qpc, qpc, rwe);
+    verbs_qp_attr.allow_remote_read = DEVX_GET(qpc, qpc, rre);
+    /* NOTE(review): allow_remote_atomic is intentionally left unreported here; the line below was
+     * already disabled - confirm whether it should be enabled. */
+    // verbs_qp_attr.allow_remote_atomic = DEVX_GET(qpc, qpc, rae);
+
+    /* The address handle attributes are optional: only filled when the caller attached one */
+    if (verbs_qp_attr.ah_attr != nullptr) {
+        verbs_qp_attr.ah_attr->addr_type = m_addr_type;
+        verbs_qp_attr.ah_attr->dlid = DEVX_GET(qpc, qpc, primary_address_path.rlid);
+        verbs_qp_attr.ah_attr->sl = DEVX_GET(qpc, qpc, primary_address_path.sl);
+        verbs_qp_attr.ah_attr->sgid_index = DEVX_GET(qpc, qpc, primary_address_path.src_addr_index);
+        verbs_qp_attr.ah_attr->static_rate = DEVX_GET(qpc, qpc, primary_address_path.stat_rate);
+        verbs_qp_attr.ah_attr->hop_limit = DEVX_GET(qpc, qpc, primary_address_path.hop_limit);
+        verbs_qp_attr.ah_attr->traffic_class = DEVX_GET(qpc, qpc, primary_address_path.tclass);
+
+        memcpy(verbs_qp_attr.ah_attr->gid.raw,
+               MLX5_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip),
+               sizeof(struct doca_verbs_gid));
+    }
+
+    /* Set verbs_qp_init_attr with the QP information */
+    verbs_qp_init_attr.send_cq = m_init_attr.send_cq;
+    verbs_qp_init_attr.receive_cq = m_init_attr.receive_cq;
+    verbs_qp_init_attr.sq_sig_all = m_init_attr.sq_sig_all;
+    verbs_qp_init_attr.qp_context = m_init_attr.qp_context;
+    verbs_qp_init_attr.pd = m_pd;
+    /* Queue depths are the sizes computed at creation, not the originally requested ones */
+    verbs_qp_init_attr.sq_wr = m_sq_size_wr;
+    verbs_qp_init_attr.rq_wr = m_rq_size;
+    verbs_qp_init_attr.receive_max_sges = m_rcv_max_sges;
+    verbs_qp_init_attr.user_index = DEVX_GET(qpc, qpc, user_index);
+    verbs_qp_init_attr.qp_type = m_qp_type;
+    verbs_qp_init_attr.send_max_sges = m_send_max_sges;
+    verbs_qp_init_attr.max_inline_data = m_init_attr.max_inline_data;
+    verbs_qp_init_attr.external_umem = m_init_attr.external_umem;
+    verbs_qp_init_attr.external_umem_offset = m_init_attr.external_umem_offset;
+    verbs_qp_init_attr.external_uar = m_init_attr.external_uar;
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Build the QP: validate the init attributes, size the receive and send work queues, obtain a UAR
+ * and a UMEM (internally allocated or caller-provided) and create the QP object.
+ *
+ * ibv_ctx - device context used for the device query, UAR allocation and UMEM registration.
+ *
+ * Failures are reported by throwing a doca_error_t value. Resources acquired before the throw are
+ * left in the members; the constructor releases them by calling destroy() from its catch block.
+ */
+void doca_verbs_qp::create(struct ibv_context *ibv_ctx) {
+    auto status{DOCA_SUCCESS};
+    m_ibv_ctx = ibv_ctx;
+    m_pd = m_init_attr.pd;
+
+    /* The WQ UMEM and the doorbell-record UMEM must be both external or both internal */
+    if ((m_init_attr.external_umem != nullptr && m_init_attr.external_umem_dbr == nullptr) ||
+        (m_init_attr.external_umem == nullptr && m_init_attr.external_umem_dbr != nullptr)) {
+        DOCA_LOG(LOG_ERR, "Both UMEM should be either external or internal");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+
+    /* Query device attr */
+    status = doca_verbs_query_device(ibv_ctx, &m_verbs_device_attr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to query device attr");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+
+    /* Only RC QPs are accepted by this implementation */
+    if (m_init_attr.qp_type != DOCA_VERBS_QP_TYPE_RC) {
+        DOCA_LOG(LOG_ERR, "QP type is not valid");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+
+    uint32_t log_rq_size{0};
+    uint32_t log_stride{0};
+    uint32_t log_sq_size_wqebb{0};
+
+    /* Calculate Work Queue sizes */
+    if (m_init_attr.rq_wr > 0 && m_init_attr.srq == nullptr) {
+        if (m_init_attr.rq_wr > m_verbs_device_attr->m_max_qp_wr) {
+            DOCA_LOG(LOG_ERR, "Failed to create IB Verbs QP: rq_wr is too big");
+            throw DOCA_ERROR_INVALID_VALUE;
+        }
+        if (m_init_attr.receive_max_sges == 0) {
+            DOCA_LOG(
+                LOG_ERR,
+                "Failed to create IB Verbs QP: rq_wr is greater than 0 but receive_max_sges is 0");
+            throw DOCA_ERROR_INVALID_VALUE;
+        }
+        m_rcv_max_sges = doca_internal_utils_next_power_of_two(m_init_attr.receive_max_sges);
+        /* Calculate receive_wqe size */
+        m_rcv_wqe_size = m_rcv_max_sges * DOCA_VERBS_DATA_SEG_SIZE_IN_BYTES;
+        if (m_rcv_wqe_size > m_verbs_device_attr->m_max_rq_desc_size) {
+            DOCA_LOG(LOG_ERR, "Failed to create IB Verbs QP: rcv_max_sges is too big");
+            throw DOCA_ERROR_INVALID_VALUE;
+        }
+        m_log_rcv_wqe_size = static_cast<uint8_t>(doca_internal_utils_log2(m_rcv_wqe_size));
+        log_stride = m_log_rcv_wqe_size - sc_verbs_qp_log_rq_stride_shift;
+
+        /* Calculate RQ size in bytes */
+        auto rq_size_bytes = static_cast<uint32_t>(
+            doca_internal_utils_next_power_of_two(m_init_attr.rq_wr * m_rcv_wqe_size));
+        /* Minimum size of RQ is 64 bytes */
+        rq_size_bytes = MAX(rq_size_bytes, DOCA_VERBS_WQEBB_SIZE);
+        /* Calculate RQ size in receive_wqe units */
+        m_rq_size = rq_size_bytes / m_rcv_wqe_size;
+        log_rq_size = doca_internal_utils_log2(m_rq_size);
+    }
+
+    if (m_init_attr.sq_wr > 0) {
+        // This check is done in rdma-core
+        if (m_init_attr.sq_wr > (0x7fffffff / m_verbs_device_attr->m_max_sq_desc_size)) {
+            DOCA_LOG(LOG_ERR, "Failed to create IB Verbs QP: sq_wr is too big");
+            throw DOCA_ERROR_INVALID_VALUE;
+        }
+        if (m_init_attr.send_max_sges == 0) {
+            DOCA_LOG(
+                LOG_ERR,
+                "Failed to create IB Verbs QP: sq_wr is greater than 0 but send_max_sges is 0");
+            throw DOCA_ERROR_INVALID_VALUE;
+        }
+        m_send_max_sges = m_init_attr.send_max_sges;
+        /* Calculate Send WQE size, which is the size of one control segment, size of one rdma
+         * segment and a single data segment multiplied by the maximum number of send SGEs */
+        uint32_t send_wqe_size =
+            sizeof(struct doca_gpunetio_ib_mlx5_wqe_ctrl_seg) +
+            sizeof(struct doca_gpunetio_ib_mlx5_wqe_raddr_seg) +
+            (m_init_attr.send_max_sges * sizeof(struct doca_gpunetio_ib_mlx5_wqe_data_seg));
+        if (send_wqe_size > m_verbs_device_attr->m_max_sq_desc_size) {
+            DOCA_LOG(LOG_ERR, "Failed to create IB Verbs QP: send_max_sges is too big");
+            throw DOCA_ERROR_INVALID_VALUE;
+        }
+
+        uint32_t send_wqe_inline_size{};
+        if (m_init_attr.max_inline_data > 0) {
+            /* Calculate inline data segment size, which is composed of:
+             * - size of mlx5_wqe_inl_data_seg (4 Bytes for byte_count and is_inline
+             * attributes)
+             * - max_inline_data size */
+            uint32_t inline_data_seg_size =
+                sizeof(struct doca_gpunetio_ib_mlx5_wqe_inl_data_seg) + m_init_attr.max_inline_data;
+            /* Align the size to OCTOWORD_SIZE (16 bytes) */
+            inline_data_seg_size =
+                doca_internal_utils_align_up_uint64(inline_data_seg_size, DOCA_VERBS_OCTOWORD_SIZE);
+            /* Calculate Send WQE with inline data size, which is the size of one control
+             * segment, size of one rdma segment and the total inline data segment size */
+            send_wqe_inline_size = sizeof(struct doca_gpunetio_ib_mlx5_wqe_ctrl_seg) +
+                                   sizeof(struct doca_gpunetio_ib_mlx5_wqe_raddr_seg) +
+                                   inline_data_seg_size;
+            if (send_wqe_inline_size > m_verbs_device_attr->m_max_sq_desc_size) {
+                DOCA_LOG(LOG_ERR, "Failed to create IB Verbs QP: max_inline_data is too big");
+                throw DOCA_ERROR_INVALID_VALUE;
+            }
+        }
+
+        /* Set m_send_wqe_size to the maximum value between the sizes of send_wqe_size and
+         * send_wqe_inline_size
+         */
+        m_send_wqe_size = MAX(send_wqe_size, send_wqe_inline_size);
+
+        /* Align size of send_wqe_size to WQEBB size */
+        m_send_wqe_size =
+            doca_internal_utils_align_up_uint32(m_send_wqe_size, DOCA_VERBS_WQEBB_SIZE);
+        /* Calculate sq_size in bytes */
+        auto sq_size_bytes = static_cast<uint32_t>(
+            doca_internal_utils_next_power_of_two(m_send_wqe_size * m_init_attr.sq_wr));
+        /* Calculate sq_size in wqebb units */
+        m_sq_size_wqebb = sq_size_bytes / DOCA_VERBS_WQEBB_SIZE;
+        if (m_sq_size_wqebb > m_verbs_device_attr->m_max_send_wqebb) {
+            DOCA_LOG(LOG_ERR, "Failed to create IB Verbs QP: sq_wr is too big");
+            throw DOCA_ERROR_INVALID_VALUE;
+        }
+        log_sq_size_wqebb = doca_internal_utils_log2(m_sq_size_wqebb);
+        /* Calculate sq_size in Work Request units */
+        m_sq_size_wr = sq_size_bytes / m_send_wqe_size;
+
+        /* Due to alignments we may have more space for inline data */
+        if (m_init_attr.max_inline_data > 0) {
+            m_init_attr.max_inline_data =
+                m_send_wqe_size - (sizeof(struct doca_gpunetio_ib_mlx5_wqe_ctrl_seg) +
+                                   sizeof(struct doca_gpunetio_ib_mlx5_wqe_raddr_seg) +
+                                   sizeof(struct doca_gpunetio_ib_mlx5_wqe_inl_data_seg));
+            m_max_inline_data_length = m_init_attr.max_inline_data;
+        }
+    }
+
+    uint32_t uar_id{};
+    if (m_init_attr.external_uar == nullptr) {
+        /* Case of internal UAR: try a BF UAR first and fall back to an NC UAR */
+        auto uar_status = doca_verbs_wrapper_mlx5dv_devx_alloc_uar(
+            m_ibv_ctx, MLX5DV_UAR_ALLOC_TYPE_BF, &m_uar_obj);
+        if (uar_status != DOCA_SUCCESS) {
+            uar_status = doca_verbs_wrapper_mlx5dv_devx_alloc_uar(
+                m_ibv_ctx, MLX5DV_UAR_ALLOC_TYPE_NC, &m_uar_obj);
+            if (uar_status != DOCA_SUCCESS) {
+                DOCA_LOG(LOG_ERR, "Failed to create UAR");
+                throw DOCA_ERROR_DRIVER;
+            }
+        }
+
+        m_uar_db_reg = reinterpret_cast<uint64_t *>(m_uar_obj->reg_addr);
+        uar_id = m_uar_obj->page_id;
+    } else {
+        /* Case of external UAR */
+        status = doca_verbs_uar_id_get(m_init_attr.external_uar, &uar_id);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external UAR ID");
+            throw status;
+        }
+
+        void *reg_addr{};
+        status = doca_verbs_uar_reg_addr_get(m_init_attr.external_uar, &reg_addr);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external UAR reg_addr");
+            throw status;
+        }
+        m_uar_db_reg = reinterpret_cast<uint64_t *>(reg_addr);
+    }
+
+    uint32_t dbr_umem_id{0};
+    uint64_t dbr_umem_offset{0};
+    uint32_t wq_umem_id{0};
+
+    if (m_init_attr.external_umem == nullptr) {
+        /* Internal layout: [RQ][SQ][padding to cacheline][doorbell record] */
+        auto db_umem_offset =
+            (m_rq_size * m_rcv_wqe_size) + (m_sq_size_wqebb * DOCA_VERBS_WQEBB_SIZE);
+        /* Align the Work Queue size to cacheline size for better performance */
+        db_umem_offset =
+            doca_internal_utils_align_up_uint32(db_umem_offset, DOCA_VERBS_CACHELINE_SIZE);
+
+        /* Case of internal umem */
+        auto total_umem_size = doca_internal_utils_align_up_uint32(
+            db_umem_offset + sc_verbs_qp_doorbell_size, DOCA_VERBS_PAGE_SIZE);
+
+        m_umem_buf = (uint8_t *)memalign(DOCA_VERBS_PAGE_SIZE, total_umem_size);
+        /* memalign() returns NULL on failure; bail out before memset() dereferences it */
+        if (m_umem_buf == nullptr) {
+            DOCA_LOG(LOG_ERR, "Failed to allocate QP UMEM buffer");
+            throw DOCA_ERROR_NO_MEMORY;
+        }
+
+        memset(m_umem_buf, 0, total_umem_size);
+
+        m_wq_buf = m_umem_buf;
+        m_rq_buf = m_wq_buf;
+        /* The SQ starts right after the RQ (m_rq_size entries of 2^m_log_rcv_wqe_size bytes) */
+        m_sq_buf = m_wq_buf + ((uintptr_t)m_rq_size << m_log_rcv_wqe_size);
+
+        auto umem_status = doca_verbs_wrapper_mlx5dv_devx_umem_reg(m_ibv_ctx, m_wq_buf,
+                                                                   total_umem_size, 0, &m_umem_obj);
+        if (umem_status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to create QP UMEM");
+            throw DOCA_ERROR_DRIVER;
+        }
+
+        /* With an internal UMEM the doorbell record lives in the same UMEM as the WQ */
+        wq_umem_id = m_umem_obj->umem_id;
+        dbr_umem_offset = db_umem_offset;
+        dbr_umem_id = wq_umem_id;
+
+        m_db_buffer = reinterpret_cast<uint32_t *>(m_wq_buf + db_umem_offset);
+    } else {
+        uint8_t *tmp_db_buffer;
+
+        /* Case of external umem for wq and dbr */
+        status = doca_verbs_umem_get_address(m_init_attr.external_umem,
+                                             reinterpret_cast<void **>(&m_wq_buf));
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external umem address");
+            throw status;
+        }
+
+        m_wq_buf += m_init_attr.external_umem_offset;
+        m_rq_buf = m_wq_buf;
+        m_sq_buf = m_wq_buf + ((uintptr_t)m_rq_size << m_log_rcv_wqe_size);
+
+        status = doca_verbs_umem_get_id(m_init_attr.external_umem, &wq_umem_id);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external umem id");
+            throw status;
+        }
+
+        /* Case of external umem */
+        status = doca_verbs_umem_get_address(m_init_attr.external_umem_dbr,
+                                             reinterpret_cast<void **>(&tmp_db_buffer));
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external umem address");
+            throw status;
+        }
+
+        status = doca_verbs_umem_get_id(m_init_attr.external_umem_dbr, &dbr_umem_id);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external umem id");
+            throw status;
+        }
+
+        dbr_umem_offset = m_init_attr.external_umem_dbr_offset;
+        m_db_buffer = reinterpret_cast<uint32_t *>(tmp_db_buffer + dbr_umem_offset);
+    }
+
+    /* Create QP object */
+    status = create_qp_obj(uar_id, log_rq_size, log_sq_size_wqebb, log_stride, dbr_umem_offset,
+                           dbr_umem_id, wq_umem_id, m_init_attr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create QP object");
+        throw DOCA_ERROR_DRIVER;
+    }
+
+    DOCA_LOG(LOG_INFO, "DOCA IB Verbs QP %p: has been successfully created", this);
+}
+
+/*
+ * Release every resource owned by this QP: the cached device attributes, the QP object, the
+ * internal UAR, the internal UMEM registration and its backing buffer.
+ *
+ * Each member is reset to nullptr once released, so the function can be called on a partially
+ * constructed object and can be called again after a failure.
+ *
+ * A failure to free the device attributes is recorded but does not stop the teardown, because the
+ * remaining resources do not depend on them (the constructor and destructor ignore the return
+ * value, so stopping there would leak the QP object, UAR, UMEM and buffer). A failure to destroy
+ * the QP object or to deregister the UMEM still stops the teardown, so that resources the device
+ * may still reference are not released underneath it.
+ *
+ * Returns DOCA_SUCCESS when everything was released, otherwise the first error encountered
+ * (DOCA_ERROR_INVALID_VALUE for the device attributes, DOCA_ERROR_DRIVER for device objects).
+ */
+doca_error_t doca_verbs_qp::destroy() noexcept {
+    doca_error_t first_error = DOCA_SUCCESS;
+
+    if (m_verbs_device_attr) {
+        auto status = doca_verbs_device_attr_free(m_verbs_device_attr);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to free device attr");
+            /* Remember the failure but keep tearing down the independent resources below */
+            first_error = DOCA_ERROR_INVALID_VALUE;
+        } else {
+            m_verbs_device_attr = nullptr;
+        }
+    }
+
+    if (m_qp_obj) {
+        auto status = doca_verbs_wrapper_mlx5dv_devx_obj_destroy(m_qp_obj);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy QP object");
+            /* The QP may still reference the UAR and UMEM: leave them in place */
+            return (first_error != DOCA_SUCCESS) ? first_error : DOCA_ERROR_DRIVER;
+        }
+        m_qp_obj = nullptr;
+    }
+
+    if (m_uar_obj) {
+        doca_verbs_wrapper_mlx5dv_devx_free_uar(m_uar_obj);
+        m_uar_obj = nullptr;
+    }
+
+    if (m_umem_obj) {
+        auto status = doca_verbs_wrapper_mlx5dv_devx_umem_dereg(m_umem_obj);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy UMEM object");
+            /* The buffer is still registered: do not free it */
+            return (first_error != DOCA_SUCCESS) ? first_error : DOCA_ERROR_DRIVER;
+        }
+        m_umem_obj = nullptr;
+    }
+
+    if (m_umem_buf) {
+        free(m_umem_buf);
+        m_umem_buf = nullptr;
+    }
+
+    return first_error;
+}
+
+/*
+ * Construct a QP on the given device context.
+ *
+ * The init attributes are copied into m_init_attr, then create() does the actual work. If
+ * create() throws, whatever it managed to allocate is released through destroy() and the
+ * original exception is rethrown to the caller.
+ */
+doca_verbs_qp::doca_verbs_qp(struct ibv_context *ibv_ctx,
+                             struct doca_verbs_qp_init_attr &verbs_qp_init_attr)
+    : m_ibv_ctx(ibv_ctx), m_init_attr(verbs_qp_init_attr) {
+    try {
+        create(ibv_ctx);
+    } catch (...) {
+        /* Best-effort cleanup of a partially built QP; its status is deliberately ignored */
+        static_cast<void>(destroy());
+        DOCA_LOG(LOG_ERR, "Failed to create QP");
+        throw;
+    }
+}
+
+/* Release all QP resources; the status returned by destroy() is ignored. */
+doca_verbs_qp::~doca_verbs_qp() {
+    (void)destroy();
+}
+
+/* Return the QP number stored in m_qp_num. */
+uint32_t doca_verbs_qp::get_qpn() const noexcept {
+    return m_qp_num;
+}
+
+/* Return the address of the doorbell record buffer (m_db_buffer) as an untyped pointer. */
+void *doca_verbs_qp::get_dbr_addr() const noexcept {
+    return (void *)m_db_buffer;
+}
+
+/* Return the UAR doorbell register address (m_uar_db_reg) as an untyped pointer. */
+void *doca_verbs_qp::get_uar_addr() const noexcept {
+    return (void *)m_uar_db_reg;
+}
+
+/*
+ * Return the allocation type reported by the external UAR attached to this QP.
+ *
+ * NOTE(review): this dereferences m_init_attr.external_uar unconditionally, while create() also
+ * supports a QP with an internal UAR (external_uar == nullptr). Calling this on such a QP looks
+ * like a null dereference - confirm callers only use it with an external UAR.
+ */
+enum doca_verbs_uar_allocation_type doca_verbs_qp::get_uar_mtype() const noexcept {
+    auto *ext_uar = m_init_attr.external_uar;
+    return ext_uar->get_uar_mtype();
+}
+
+/* Return the start of the send queue buffer (m_sq_buf). */
+void *doca_verbs_qp::get_sq_buf() const noexcept {
+    return m_sq_buf;
+}
+
+/*
+ * Return the start of the receive queue buffer. The value returned is m_wq_buf: create() places
+ * the receive queue at the very beginning of the work queue buffer.
+ */
+void *doca_verbs_qp::get_rq_buf() const noexcept {
+    return (void *)m_wq_buf;
+}
+
+/* Return the send queue size expressed in WQEBB units (m_sq_size_wqebb). */
+uint32_t doca_verbs_qp::get_sq_size_wqebb() const noexcept {
+    return m_sq_size_wqebb;
+}
+
+/* Return the receive queue size expressed in receive WQE units (m_rq_size). */
+uint32_t doca_verbs_qp::get_rq_size() const noexcept {
+    return m_rq_size;
+}
+
+/* Return the size in bytes of a single receive WQE (m_rcv_wqe_size). */
+uint32_t doca_verbs_qp::get_rcv_wqe_size() const noexcept {
+    return m_rcv_wqe_size;
+}
+
+/**********************************************************************************************************************
+ * Public API functions
+ *********************************************************************************************************************/
+
+/*
+ * Allocate a zero-initialized QP init attributes object.
+ *
+ * verbs_qp_init_attr - output; receives the new object (or NULL if the allocation failed).
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr is NULL, DOCA_ERROR_NO_MEMORY if the
+ * allocation fails, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_create(struct doca_verbs_qp_init_attr **verbs_qp_init_attr) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to create qp_init_attr: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    auto *new_attr = static_cast<struct doca_verbs_qp_init_attr *>(
+        calloc(1, sizeof(struct doca_verbs_qp_init_attr)));
+    /* The output is written even on failure, so the caller never sees a stale pointer */
+    *verbs_qp_init_attr = new_attr;
+    if (new_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create qp_init_attr: failed to allocate memory");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Free a QP init attributes object.
+ *
+ * The pointer is passed by value, so the caller's copy is NOT cleared and must not be used after
+ * this call returns successfully.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_destroy(struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy qp_init_attr: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    /* No trailing "= nullptr": it would only reset the local parameter, not the caller's pointer */
+    free(verbs_qp_init_attr);
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Store the protection domain in the init attributes.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if either argument is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_pd(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                            struct ibv_pd *pd) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set pd: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (!pd) {
+        DOCA_LOG(LOG_ERR, "Failed to set pd: parameter pd is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->pd = pd;
+    return DOCA_SUCCESS;
+}
+
+/* Return the protection domain stored in the init attributes, or NULL if the argument is NULL. */
+struct ibv_pd *doca_verbs_qp_init_attr_get_pd(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->pd;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get pd: parameter verbs_qp_init_attr is NULL");
+    return nullptr;
+}
+
+/*
+ * Store the send completion queue in the init attributes.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if either argument is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_send_cq(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                                 struct doca_verbs_cq *send_cq) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set send_cq: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (!send_cq) {
+        DOCA_LOG(LOG_ERR, "Failed to set send_cq: parameter send_cq is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->send_cq = send_cq;
+    return DOCA_SUCCESS;
+}
+
+/* Return the send completion queue stored in the init attributes, or NULL if the argument is NULL. */
+struct doca_verbs_cq *doca_verbs_qp_init_attr_get_send_cq(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->send_cq;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get send_cq: parameter verbs_qp_init_attr is NULL");
+    return nullptr;
+}
+
+/*
+ * Store the receive completion queue in the init attributes.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if either argument is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_receive_cq(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, struct doca_verbs_cq *receive_cq) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set receive_cq: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (!receive_cq) {
+        DOCA_LOG(LOG_ERR, "Failed to set receive_cq: parameter receive_cq is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->receive_cq = receive_cq;
+    return DOCA_SUCCESS;
+}
+
+/* Return the receive completion queue stored in the init attributes, or NULL if the argument is
+ * NULL. */
+struct doca_verbs_cq *doca_verbs_qp_init_attr_get_receive_cq(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->receive_cq;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get receive_cq: parameter verbs_qp_init_attr is NULL");
+    return nullptr;
+}
+
+/*
+ * Store the sq_sig_all value in the init attributes. The value is stored as given, without
+ * validation.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_sq_sig_all(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, int sq_sig_all) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set sq_sig_all: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->sq_sig_all = sq_sig_all;
+    return DOCA_SUCCESS;
+}
+
+/* Return the stored sq_sig_all value, or -1 if the argument is NULL. */
+int doca_verbs_qp_init_attr_get_sq_sig_all(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->sq_sig_all;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get sq_sig_all: parameter verbs_qp_init_attr is NULL");
+    return -1;
+}
+
+/*
+ * Store the requested number of send work requests in the init attributes. The value is stored
+ * as given, without validation.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_sq_wr(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                               uint32_t sq_wr) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set sq_wr: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->sq_wr = sq_wr;
+    return DOCA_SUCCESS;
+}
+
+/* Return the stored sq_wr value, or 0 if the argument is NULL. */
+uint32_t doca_verbs_qp_init_attr_get_sq_wr(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->sq_wr;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get sq_wr: parameter verbs_qp_init_attr is NULL");
+    return 0;
+}
+
+/*
+ * Store the requested number of receive work requests in the init attributes. The value is
+ * stored as given, without validation.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_rq_wr(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                               uint32_t rq_wr) {
+    if (verbs_qp_init_attr == nullptr) {
+        /* Message names rq_wr (it previously said "receive_cq", copied from another setter) */
+        DOCA_LOG(LOG_ERR, "Failed to set rq_wr: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->rq_wr = rq_wr;
+
+    return DOCA_SUCCESS;
+}
+
+/* Return the stored rq_wr value, or 0 if the argument is NULL. */
+uint32_t doca_verbs_qp_init_attr_get_rq_wr(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->rq_wr;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get rq_wr: parameter verbs_qp_init_attr is NULL");
+    return 0;
+}
+
+/*
+ * Store the maximum number of send scatter/gather entries in the init attributes. The value is
+ * stored as given, without validation.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_send_max_sges(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, uint32_t send_max_sges) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set send_max_sges: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->send_max_sges = send_max_sges;
+    return DOCA_SUCCESS;
+}
+
+/* Return the stored send_max_sges value, or 0 if the argument is NULL. */
+uint32_t doca_verbs_qp_init_attr_get_send_max_sges(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->send_max_sges;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get send_max_sges: parameter verbs_qp_init_attr is NULL");
+    return 0;
+}
+
+/*
+ * Store the maximum number of receive scatter/gather entries in the init attributes. The value
+ * is stored as given, without validation.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_receive_max_sges(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, uint32_t receive_max_sges) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set receive_max_sges: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->receive_max_sges = receive_max_sges;
+    return DOCA_SUCCESS;
+}
+
+/* Return the stored receive_max_sges value, or 0 if the argument is NULL. */
+uint32_t doca_verbs_qp_init_attr_get_receive_max_sges(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->receive_max_sges;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get receive_max_sges: parameter verbs_qp_init_attr is NULL");
+    return 0;
+}
+
+/*
+ * Store the requested maximum inline data size in the init attributes. The value is stored as
+ * given, without validation.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_max_inline_data(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, uint32_t max_inline_data) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set max_inline_data: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->max_inline_data = max_inline_data;
+    return DOCA_SUCCESS;
+}
+
+/* Return the stored max_inline_data value, or 0 if the argument is NULL. */
+uint32_t doca_verbs_qp_init_attr_get_max_inline_data(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->max_inline_data;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get max_inline_data: parameter verbs_qp_init_attr is NULL");
+    return 0;
+}
+
+/*
+ * Store the user index in the init attributes.
+ *
+ * A value with any bit of USER_INDEX_MSB_8BITS_MASK set is rejected; per the log message the
+ * index must fit in 24 bits.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr is NULL or user_index is out of range,
+ * DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_user_index(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, uint32_t user_index) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set user_index: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    const bool has_high_bits = (user_index & USER_INDEX_MSB_8BITS_MASK) != 0;
+    if (has_high_bits) {
+        DOCA_LOG(LOG_ERR, "Failed to set user_index: input parameter user_index=%u exceeds 24 bits",
+                 user_index);
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->user_index = user_index;
+    return DOCA_SUCCESS;
+}
+
+/* Return the stored user_index value, or 0 if the argument is NULL. */
+uint32_t doca_verbs_qp_init_attr_get_user_index(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->user_index;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get user_index: parameter verbs_qp_init_attr is NULL");
+    return 0;
+}
+
+/*
+ * Store the QP type in the init attributes. The value is stored as given; no check against the
+ * supported types is made in this setter.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_qp_type(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                                 uint32_t qp_type) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set qp_type: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->qp_type = qp_type;
+    return DOCA_SUCCESS;
+}
+
+/* Return the stored qp_type value, or 0 if the argument is NULL. */
+uint32_t doca_verbs_qp_init_attr_get_qp_type(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->qp_type;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get qp_type: parameter verbs_qp_init_attr is NULL");
+    return 0;
+}
+
+/*
+ * Attach a caller-provided UMEM, and the offset within it, to be used for the work queue.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr or external_umem is NULL, DOCA_SUCCESS
+ * otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_external_umem(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, struct doca_verbs_umem *external_umem,
+    uint64_t external_umem_offset) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_umem: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (!external_umem) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_umem: parameter external_umem is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    /* The UMEM and its offset are always updated together */
+    verbs_qp_init_attr->external_umem = external_umem;
+    verbs_qp_init_attr->external_umem_offset = external_umem_offset;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Attach a caller-provided UMEM, and the offset within it, to be used for the doorbell record.
+ *
+ * The values are stored in external_umem_dbr / external_umem_dbr_offset; the parameter names
+ * mirror doca_verbs_qp_init_attr_set_external_umem().
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if verbs_qp_init_attr or external_umem is NULL, DOCA_SUCCESS
+ * otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_external_dbr_umem(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, struct doca_verbs_umem *external_umem,
+    uint64_t external_umem_offset) {
+    /* Messages name the DBR setter so failures can be told apart from set_external_umem() */
+    if (verbs_qp_init_attr == nullptr) {
+        DOCA_LOG(LOG_ERR,
+                 "Failed to set external_dbr_umem: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (external_umem == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_dbr_umem: parameter external_umem is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->external_umem_dbr = external_umem;
+    verbs_qp_init_attr->external_umem_dbr_offset = external_umem_offset;
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Report the external work queue UMEM and its offset stored in the init attributes.
+ *
+ * external_umem        - output; receives the stored UMEM pointer.
+ * external_umem_offset - output; receives the stored offset.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if any argument is NULL (outputs are left untouched),
+ * DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_get_external_umem(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+    struct doca_verbs_umem **external_umem, uint64_t *external_umem_offset) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to get external_umem: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (!external_umem) {
+        DOCA_LOG(LOG_ERR, "Failed to get external_umem: parameter external_umem is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (!external_umem_offset) {
+        DOCA_LOG(LOG_ERR, "Failed to get external_umem: parameter external_umem_offset is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    *external_umem = verbs_qp_init_attr->external_umem;
+    *external_umem_offset = verbs_qp_init_attr->external_umem_offset;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Attach a caller-provided UAR to the init attributes.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if either argument is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_external_uar(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, struct doca_verbs_uar *external_uar) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_uar: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (!external_uar) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_uar: parameter external_uar is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->external_uar = external_uar;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Report the external UAR stored in the init attributes.
+ *
+ * external_uar - output; receives the stored UAR pointer.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if either argument is NULL (the output is left untouched),
+ * DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_get_external_uar(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+    struct doca_verbs_uar **external_uar) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to get external_uar: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (!external_uar) {
+        DOCA_LOG(LOG_ERR, "Failed to get external_uar: parameter external_uar is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    *external_uar = verbs_qp_init_attr->external_uar;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Store an opaque context pointer in the init attributes. A NULL context is rejected.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE if either argument is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_qp_init_attr_set_qp_context(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, void *qp_context) {
+    if (!verbs_qp_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set qp_context: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (!qp_context) {
+        DOCA_LOG(LOG_ERR, "Failed to set qp_context: parameter qp_context is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->qp_context = qp_context;
+    return DOCA_SUCCESS;
+}
+
+void *doca_verbs_qp_init_attr_get_qp_context(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    // Return the stored context pointer; a NULL attribute object is logged and yields nullptr.
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->qp_context;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get qp_context: parameter verbs_qp_init_attr is NULL");
+    return nullptr;
+}
+
+doca_error_t doca_verbs_qp_init_attr_set_core_direct_master(
+    struct doca_verbs_qp_init_attr *verbs_qp_init_attr, uint8_t core_direct_master) {
+    // Store the core_direct_master flag; only the values 0 and 1 are accepted.
+    if (verbs_qp_init_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set core_direct_master: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    // The argument is unsigned, so "above 1" rejects exactly the values that are neither 0 nor 1.
+    if (core_direct_master > 0x1) {
+        DOCA_LOG(LOG_ERR, "Failed to set core_direct_master: invalid input value %d",
+                 core_direct_master);
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_qp_init_attr->core_direct_master = core_direct_master;
+    return DOCA_SUCCESS;
+}
+
+uint8_t doca_verbs_qp_init_attr_get_core_direct_master(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    // Return the stored flag. A NULL attribute object is logged and reported as 0.
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->core_direct_master;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get core_direct_master: parameter verbs_qp_init_attr is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_qp_attr_create(struct doca_verbs_qp_attr **verbs_qp_attr) {
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create qp_attr: parameter verbs_qp_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    // calloc() yields zero-filled storage (NOTE(review): member initialisers do not run here).
+    auto *attr = static_cast<doca_verbs_qp_attr *>(calloc(1, sizeof(doca_verbs_qp_attr)));
+    *verbs_qp_attr = attr;  // written even when the allocation failed, i.e. set to NULL
+    if (attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create qp_attr: failed to allocate memory");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+    return DOCA_SUCCESS;
+}
+
+doca_error_t doca_verbs_qp_attr_destroy(struct doca_verbs_qp_attr *verbs_qp_attr) {
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy qp_attr: parameter verbs_qp_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    // Only the attribute object itself is released; a referenced ah_attr is left alone.
+    // The pointer arrives by value, so clearing it here would never reach the caller.
+    free(verbs_qp_attr);
+    return DOCA_SUCCESS;
+}
+
+doca_error_t doca_verbs_qp_attr_set_next_state(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                               enum doca_verbs_qp_state next_state) {
+    // Record the requested target state; the value itself is not validated here.
+    if (verbs_qp_attr != nullptr) {
+        verbs_qp_attr->next_state = next_state;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set next_state: parameter verbs_qp_attr is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+enum doca_verbs_qp_state doca_verbs_qp_attr_get_next_state(
+    const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored target state; a NULL object is logged and yields enumerator value 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->next_state;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get next_state: parameter verbs_qp_attr is NULL");
+    return static_cast<enum doca_verbs_qp_state>(0);
+}
+
+doca_error_t doca_verbs_qp_attr_set_current_state(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                  enum doca_verbs_qp_state current_state) {
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;  // outcome unless the store below happens
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set current_state: parameter verbs_qp_attr is NULL");
+    } else {
+        verbs_qp_attr->current_state = current_state;  // stored as given, no validation here
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+enum doca_verbs_qp_state doca_verbs_qp_attr_get_current_state(
+    const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored current state; a NULL object is logged and yields enumerator value 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->current_state;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get current_state: parameter verbs_qp_attr is NULL");
+    return static_cast<enum doca_verbs_qp_state>(0);
+}
+
+doca_error_t doca_verbs_qp_attr_set_path_mtu(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                             enum doca_verbs_mtu_size path_mtu) {
+    // Record the path MTU enumerator in the attribute object; a NULL object is rejected.
+    if (verbs_qp_attr != nullptr) {
+        verbs_qp_attr->path_mtu = path_mtu;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set path_mtu: parameter verbs_qp_attr is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+enum doca_verbs_mtu_size doca_verbs_qp_attr_get_path_mtu(
+    const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored path MTU; a NULL object is logged and yields enumerator value 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->path_mtu;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get path_mtu: parameter verbs_qp_attr is NULL");
+    return static_cast<enum doca_verbs_mtu_size>(0);
+}
+
+doca_error_t doca_verbs_qp_attr_set_rq_psn(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                           uint32_t rq_psn) {
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;  // outcome unless the store below happens
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set rq_psn: parameter verbs_qp_attr is NULL");
+    } else {
+        verbs_qp_attr->rq_psn = rq_psn;  // stored as given, no range check here
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+uint32_t doca_verbs_qp_attr_get_rq_psn(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored rq_psn; a NULL object is logged and reported as 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->rq_psn;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get rq_psn: parameter verbs_qp_attr is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_qp_attr_set_sq_psn(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                           uint32_t sq_psn) {
+    // Record sq_psn in the attribute object as given; a NULL object is rejected.
+    if (verbs_qp_attr != nullptr) {
+        verbs_qp_attr->sq_psn = sq_psn;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set sq_psn: parameter verbs_qp_attr is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+uint32_t doca_verbs_qp_attr_get_sq_psn(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored sq_psn; a NULL object is logged and reported as 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->sq_psn;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get sq_psn: parameter verbs_qp_attr is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_qp_attr_set_dest_qp_num(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                uint32_t dest_qp_num) {
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;  // outcome unless the store below happens
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set dest_qp_num: parameter verbs_qp_attr is NULL");
+    } else {
+        verbs_qp_attr->dest_qp_num = dest_qp_num;  // stored as given, no range check here
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+uint32_t doca_verbs_qp_attr_get_dest_qp_num(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored dest_qp_num; a NULL object is logged and reported as 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->dest_qp_num;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get dest_qp_num: parameter verbs_qp_attr is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_qp_attr_set_allow_remote_write(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                       int allow_remote_write) {
+    // Record the allow_remote_write value exactly as passed (any int is accepted here).
+    if (verbs_qp_attr != nullptr) {
+        verbs_qp_attr->allow_remote_write = allow_remote_write;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set allow_remote_write: parameter verbs_qp_attr is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+int doca_verbs_qp_attr_get_allow_remote_write(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored allow_remote_write value; a NULL object is logged and reported as -1.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->allow_remote_write;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get allow_remote_write: parameter verbs_qp_attr is NULL");
+    return -1;
+}
+
+doca_error_t doca_verbs_qp_attr_set_allow_remote_read(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                      int allow_remote_read) {
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;  // outcome unless the store below happens
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set allow_remote_read: parameter verbs_qp_attr is NULL");
+    } else {
+        verbs_qp_attr->allow_remote_read = allow_remote_read;  // any int is accepted here
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+int doca_verbs_qp_attr_get_allow_remote_read(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored allow_remote_read value; a NULL object is logged and reported as -1.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->allow_remote_read;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get allow_remote_read: parameter verbs_qp_attr is NULL");
+    return -1;
+}
+
+doca_error_t doca_verbs_qp_attr_set_allow_remote_atomic(
+    struct doca_verbs_qp_attr *verbs_qp_attr, enum doca_verbs_qp_atomic_type atomic_type) {
+    // Record the atomic type in the allow_remote_atomic field; a NULL object is rejected.
+    if (verbs_qp_attr != nullptr) {
+        verbs_qp_attr->allow_remote_atomic = atomic_type;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set allow_remote_atomic: parameter verbs_qp_attr is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+enum doca_verbs_qp_atomic_type doca_verbs_qp_attr_get_allow_remote_atomic(
+    const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored atomic type; a NULL object is logged and reported as MODE_NONE.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->allow_remote_atomic;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get allow_remote_atomic: parameter verbs_qp_attr is NULL");
+    return DOCA_VERBS_QP_ATOMIC_MODE_NONE;
+}
+
+doca_error_t doca_verbs_qp_attr_set_ah_attr(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                            doca_verbs_ah_attr *ah_attr) {
+    // Only the pointer is stored, not a copy of the address-handle attributes, so the
+    // caller presumably has to keep *ah_attr alive while verbs_qp_attr refers to it.
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set ah_attr: parameter verbs_qp_attr is NULL");
+    } else if (ah_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set ah_attr: parameter ah_attr is NULL");
+    } else {
+        verbs_qp_attr->ah_attr = ah_attr;
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+struct doca_verbs_ah_attr *doca_verbs_qp_attr_get_ah_attr(
+    const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get ah_attr: parameter verbs_qp_attr is NULL");
+        return nullptr;
+    }
+    // An address handle that was never set is logged as an error; nullptr is returned then.
+    doca_verbs_ah_attr *stored = verbs_qp_attr->ah_attr;
+    if (stored == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get ah_attr: ah_attr object was not set previously");
+    }
+    return stored;
+}
+
+doca_error_t doca_verbs_qp_attr_set_pkey_index(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                               uint16_t pkey_index) {
+    // Record pkey_index in the attribute object as given; a NULL object is rejected.
+    if (verbs_qp_attr != nullptr) {
+        verbs_qp_attr->pkey_index = pkey_index;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set pkey_index: parameter verbs_qp_attr is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+uint16_t doca_verbs_qp_attr_get_pkey_index(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored pkey_index; a NULL object is logged and reported as 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->pkey_index;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get pkey_index: parameter verbs_qp_attr is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_qp_attr_set_port_num(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                             uint16_t port_num) {
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;  // outcome unless the store below happens
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set port_num: parameter verbs_qp_attr is NULL");
+    } else {
+        verbs_qp_attr->port_num = port_num;  // stored as given, no range check here
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+uint16_t doca_verbs_qp_attr_get_port_num(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored port_num; a NULL object is logged and reported as 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->port_num;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get port_num: parameter verbs_qp_attr is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_qp_attr_set_ack_timeout(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                uint16_t ack_timeout) {
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set ack_timeout: parameter verbs_qp_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    // The field is a uint8_t: refuse wider values instead of silently truncating them.
+    if (ack_timeout > UINT8_MAX) return DOCA_ERROR_INVALID_VALUE;
+    verbs_qp_attr->ack_timeout = static_cast<uint8_t>(ack_timeout);
+    return DOCA_SUCCESS;
+}
+
+uint16_t doca_verbs_qp_attr_get_ack_timeout(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored ack_timeout (a uint8_t field); a NULL object is logged and gives 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->ack_timeout;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get ack_timeout: parameter verbs_qp_attr is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_qp_attr_set_retry_cnt(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                              uint16_t retry_cnt) {
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set retry_cnt: parameter verbs_qp_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    // The field is a uint8_t: refuse wider values instead of silently truncating them.
+    if (retry_cnt > UINT8_MAX) return DOCA_ERROR_INVALID_VALUE;
+    verbs_qp_attr->retry_cnt = static_cast<uint8_t>(retry_cnt);
+    return DOCA_SUCCESS;
+}
+
+uint16_t doca_verbs_qp_attr_get_retry_cnt(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored retry_cnt (a uint8_t field); a NULL object is logged and gives 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->retry_cnt;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get retry_cnt: parameter verbs_qp_attr is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_qp_attr_set_rnr_retry(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                              uint16_t rnr_retry) {
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set rnr_retry: parameter verbs_qp_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    // The field is a uint8_t: refuse wider values instead of silently truncating them.
+    if (rnr_retry > UINT8_MAX) return DOCA_ERROR_INVALID_VALUE;
+    verbs_qp_attr->rnr_retry = static_cast<uint8_t>(rnr_retry);
+    return DOCA_SUCCESS;
+}
+
+uint16_t doca_verbs_qp_attr_get_rnr_retry(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored rnr_retry (a uint8_t field); a NULL object is logged and gives 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->rnr_retry;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get rnr_retry: parameter verbs_qp_attr is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_qp_attr_set_min_rnr_timer(struct doca_verbs_qp_attr *verbs_qp_attr,
+                                                  uint16_t min_rnr_timer) {
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set min_rnr_timer: parameter verbs_qp_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    // The field is a uint8_t: refuse wider values instead of silently truncating them.
+    if (min_rnr_timer > UINT8_MAX) return DOCA_ERROR_INVALID_VALUE;
+    verbs_qp_attr->min_rnr_timer = static_cast<uint8_t>(min_rnr_timer);
+    return DOCA_SUCCESS;
+}
+
+uint16_t doca_verbs_qp_attr_get_min_rnr_timer(const struct doca_verbs_qp_attr *verbs_qp_attr) {
+    // Return the stored min_rnr_timer (a uint8_t field); a NULL object is logged and gives 0.
+    if (verbs_qp_attr != nullptr) {
+        return verbs_qp_attr->min_rnr_timer;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get min_rnr_timer: parameter verbs_qp_attr is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_ah_attr_create(struct ibv_context *context,
+                                       struct doca_verbs_ah_attr **verbs_ah) {
+    // `context` is only required to be non-NULL; nothing else in this function reads it.
+    if (context == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create verbs_ah: parameter context is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (verbs_ah == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create verbs_ah: parameter verbs_ah is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    auto *ah = static_cast<doca_verbs_ah_attr *>(calloc(1, sizeof(doca_verbs_ah_attr)));
+    *verbs_ah = ah;  // written even when the allocation failed, i.e. set to NULL
+    if (ah == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create verbs_ah: failed to allocate memory");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+    // The storage is zero-filled by calloc(); is_global is the only field switched on.
+    ah->is_global = 1;
+    return DOCA_SUCCESS;
+}
+
+doca_error_t doca_verbs_ah_attr_destroy(struct doca_verbs_ah_attr *verbs_ah) {
+    if (verbs_ah == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy verbs_ah: parameter verbs_ah is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    // Releases the storage with free(), matching the calloc() in doca_verbs_ah_attr_create().
+    // The pointer arrives by value, so clearing it here would never reach the caller.
+    free(verbs_ah);
+    return DOCA_SUCCESS;
+}
+
+doca_error_t doca_verbs_ah_attr_set_gid(struct doca_verbs_ah_attr *verbs_ah,
+                                        struct doca_verbs_gid gid) {
+    // Copy the GID, passed by value, into the address-handle attributes.
+    if (verbs_ah != nullptr) {
+        verbs_ah->gid = gid;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set gid: parameter verbs_ah is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+struct doca_verbs_gid doca_verbs_ah_get_gid(const struct doca_verbs_ah_attr *verbs_ah) {
+    // Return a copy of the stored GID; a NULL object is logged and yields an all-zero GID.
+    if (verbs_ah != nullptr) {
+        return verbs_ah->gid;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get gid: parameter verbs_ah is NULL");
+    struct doca_verbs_gid all_zero {};
+    memset(&all_zero, 0, sizeof(all_zero));
+    return all_zero;
+}
+
+doca_error_t doca_verbs_ah_attr_set_addr_type(struct doca_verbs_ah_attr *verbs_ah,
+                                              enum doca_verbs_addr_type addr_type) {
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;  // outcome unless the store below happens
+    if (verbs_ah == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set addr_type: parameter verbs_ah is NULL");
+    } else {
+        verbs_ah->addr_type = addr_type;  // stored as given, no validation here
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+enum doca_verbs_addr_type doca_verbs_ah_get_addr_type(const struct doca_verbs_ah_attr *verbs_ah) {
+    // Return the stored address type; a NULL object is logged and yields enumerator value 0.
+    if (verbs_ah != nullptr) {
+        return verbs_ah->addr_type;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get addr_type: parameter verbs_ah is NULL");
+    return static_cast<enum doca_verbs_addr_type>(0);
+}
+
+doca_error_t doca_verbs_ah_attr_set_dlid(struct doca_verbs_ah_attr *verbs_ah, uint32_t dlid) {
+    // Record dlid in the address-handle attributes as given; a NULL object is rejected.
+    if (verbs_ah != nullptr) {
+        verbs_ah->dlid = dlid;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set dlid: parameter verbs_ah is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+uint32_t doca_verbs_ah_get_dlid(const struct doca_verbs_ah_attr *verbs_ah) {
+    // Return the stored dlid; a NULL object is logged and reported as 0.
+    if (verbs_ah != nullptr) {
+        return verbs_ah->dlid;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get dlid: parameter verbs_ah is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_ah_attr_set_sl(struct doca_verbs_ah_attr *verbs_ah, uint8_t sl) {
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;  // outcome unless the store below happens
+    if (verbs_ah == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set sl: parameter verbs_ah is NULL");
+    } else {
+        verbs_ah->sl = sl;  // stored as given, no range check here
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+uint8_t doca_verbs_ah_get_sl(const struct doca_verbs_ah_attr *verbs_ah) {
+    // Return the stored sl; a NULL object is logged and reported as 0.
+    if (verbs_ah != nullptr) {
+        return verbs_ah->sl;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get sl: parameter verbs_ah is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_ah_attr_set_sgid_index(struct doca_verbs_ah_attr *verbs_ah,
+                                               uint8_t sgid_index) {
+    // Record sgid_index in the address-handle attributes as given; a NULL object is rejected.
+    if (verbs_ah != nullptr) {
+        verbs_ah->sgid_index = sgid_index;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set sgid_index: parameter verbs_ah is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+uint8_t doca_verbs_ah_get_sgid_index(const struct doca_verbs_ah_attr *verbs_ah) {
+    // Return the stored sgid_index; a NULL object is logged and reported as 0.
+    if (verbs_ah != nullptr) {
+        return verbs_ah->sgid_index;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get sgid_index: parameter verbs_ah is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_ah_attr_set_static_rate(struct doca_verbs_ah_attr *verbs_ah,
+                                                uint8_t static_rate) {
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;  // outcome unless the store below happens
+    if (verbs_ah == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set static_rate: parameter verbs_ah is NULL");
+    } else {
+        verbs_ah->static_rate = static_rate;  // stored as given, no range check here
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+uint8_t doca_verbs_ah_get_static_rate(const struct doca_verbs_ah_attr *verbs_ah) {
+    // Return the stored static_rate; a NULL object is logged and reported as 0.
+    if (verbs_ah != nullptr) {
+        return verbs_ah->static_rate;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get static_rate: parameter verbs_ah is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_ah_attr_set_hop_limit(struct doca_verbs_ah_attr *verbs_ah,
+                                              uint8_t hop_limit) {
+    // Record hop_limit in the address-handle attributes as given; a NULL object is rejected.
+    if (verbs_ah != nullptr) {
+        verbs_ah->hop_limit = hop_limit;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to set hop_limit: parameter verbs_ah is NULL");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+uint8_t doca_verbs_ah_get_hop_limit(const struct doca_verbs_ah_attr *verbs_ah) {
+    // Return the stored hop_limit; a NULL object is logged and reported as 0.
+    if (verbs_ah != nullptr) {
+        return verbs_ah->hop_limit;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get hop_limit: parameter verbs_ah is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_ah_attr_set_traffic_class(struct doca_verbs_ah_attr *verbs_ah,
+                                                  uint8_t traffic_class) {
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;  // outcome unless the store below happens
+    if (verbs_ah == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set traffic_class: parameter verbs_ah is NULL");
+    } else {
+        verbs_ah->traffic_class = traffic_class;  // stored as given, no range check here
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+uint8_t doca_verbs_ah_get_traffic_class(const struct doca_verbs_ah_attr *verbs_ah) {
+    // Return the stored traffic_class; a NULL object is logged and reported as 0.
+    if (verbs_ah != nullptr) {
+        return verbs_ah->traffic_class;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get traffic_class: parameter verbs_ah is NULL");
+    return 0;
+}
+
+doca_error_t doca_verbs_qp_create(struct ibv_context *context,
+                                  struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                  struct doca_verbs_qp **verbs_qp) {
+    if (context == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create verbs_qp: parameter context is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (verbs_qp_init_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create verbs_qp: parameter verbs_qp_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (verbs_qp == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create verbs_qp: parameter verbs_qp is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    // *verbs_qp is written only if construction succeeds; no exception may leave this function.
+    try {
+        *verbs_qp = new doca_verbs_qp(context, *verbs_qp_init_attr);
+        DOCA_LOG(LOG_INFO, "IB Verbs Context %p: verbs_qp=%p was created", context, *verbs_qp);
+        return DOCA_SUCCESS;
+    } catch (doca_error_t err) {
+        return err;
+    } catch (...) {  // anything else, e.g. std::bad_alloc thrown by `new`
+        return DOCA_ERROR_NO_MEMORY;
+    }
+}
+
+doca_error_t doca_verbs_qp_destroy(struct doca_verbs_qp *verbs_qp) {
+    if (verbs_qp == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy verbs_qp: parameter verbs_qp is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    // Failures are logged at LOG_ERR; the object is deleted only after destroy() succeeded.
+    auto status = verbs_qp->destroy();
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy verbs_qp.");
+        return status;
+    }
+
+    delete verbs_qp;
+    return DOCA_SUCCESS;
+}
+
+doca_error_t doca_verbs_qp_modify(struct doca_verbs_qp *verbs_qp,
+                                  struct doca_verbs_qp_attr *verbs_qp_attr, int attr_mask) {
+    // Move the QP to the state requested in verbs_qp_attr (fields selected by attr_mask).
+    // Handled paths: any -> RST, RST/INIT -> INIT, INIT -> RTR, RTR/RTS -> RTS, any -> ERR.
+    if (verbs_qp == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to modify verbs_qp: parameter verbs_qp is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to modify verbs_qp: parameter verbs_qp_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!verbs_qp->is_qp_attr_valid(verbs_qp_attr, attr_mask)) {
+        DOCA_LOG(LOG_ERR, "Failed to modify verbs_qp: some QP attributes values are invalid");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    // Resolve the (current, next) state pair. The caller's current_state wins when its mask
+    // bit is set; otherwise the state tracked by the QP object is used. Without a next_state
+    // mask bit the request keeps the QP in its current state.
+    const bool has_current = (attr_mask & DOCA_VERBS_QP_ATTR_CURRENT_STATE) != 0;
+    const bool has_next = (attr_mask & DOCA_VERBS_QP_ATTR_NEXT_STATE) != 0;
+    const doca_verbs_qp_state current_state =
+        has_current ? verbs_qp_attr->current_state : verbs_qp->get_current_state();
+    const doca_verbs_qp_state next_state = has_next ? verbs_qp_attr->next_state : current_state;
+
+    // Dispatch on the target state. A `break` means the target is known but cannot be
+    // reached from current_state; that case is reported after the switch.
+    switch (next_state) {
+        // No precondition on current_state.
+        case DOCA_VERBS_QP_STATE_RST:
+            return verbs_qp->qp2rst(*verbs_qp_attr, attr_mask);
+        case DOCA_VERBS_QP_STATE_INIT:
+            if (current_state == DOCA_VERBS_QP_STATE_RST)
+                return verbs_qp->rst2init(*verbs_qp_attr, attr_mask);
+            if (current_state == DOCA_VERBS_QP_STATE_INIT)
+                return verbs_qp->init2init(*verbs_qp_attr, attr_mask);
+            break;
+        case DOCA_VERBS_QP_STATE_RTR:
+            if (current_state == DOCA_VERBS_QP_STATE_INIT)
+                return verbs_qp->init2rtr(*verbs_qp_attr, attr_mask);
+            break;
+        case DOCA_VERBS_QP_STATE_RTS:
+            if (current_state == DOCA_VERBS_QP_STATE_RTR)
+                return verbs_qp->rtr2rts(*verbs_qp_attr, attr_mask);
+            if (current_state == DOCA_VERBS_QP_STATE_RTS)
+                return verbs_qp->rts2rts(*verbs_qp_attr, attr_mask);
+            break;
+        // No precondition on current_state.
+        case DOCA_VERBS_QP_STATE_ERR:
+            return verbs_qp->qp2err(*verbs_qp_attr, attr_mask);
+        default:
+            DOCA_LOG(LOG_ERR, "Failed to modify verbs_qp: invalid next_state");
+            return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    DOCA_LOG(LOG_ERR,
+             "Failed to modify verbs_qp: invalid combination of current_state and next_state");
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+doca_error_t doca_verbs_qp_query(struct doca_verbs_qp *verbs_qp,
+                                 struct doca_verbs_qp_attr *verbs_qp_attr,
+                                 struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    // Fill the two caller-provided attribute objects by delegating to query_qp().
+    // All three pointers are mandatory; the first NULL one found is the one that is logged.
+    if (verbs_qp == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to query verbs_qp: parameter verbs_qp is NULL");
+    } else if (verbs_qp_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to query verbs_qp: parameter verbs_qp_attr is NULL");
+    } else if (verbs_qp_init_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to query verbs_qp: parameter verbs_qp_init_attr is NULL");
+    } else {
+        return verbs_qp->query_qp(*verbs_qp_attr, *verbs_qp_init_attr);
+    }
+
+    // Reached only after one of the NULL-argument messages above.
+    return DOCA_ERROR_INVALID_VALUE;
+}
+
+uint32_t doca_verbs_qp_get_qpn(const struct doca_verbs_qp *verbs_qp) { return (verbs_qp != nullptr) ? verbs_qp->get_qpn() : 0; }  // NULL handle -> 0
+
+void *doca_verbs_qp_get_dbr_addr(const struct doca_verbs_qp *verbs_qp) {
+    return (verbs_qp != nullptr) ? verbs_qp->get_dbr_addr() : nullptr;  // NULL handle -> nullptr
+}
+
+void *doca_verbs_qp_get_uar_addr(const struct doca_verbs_qp *verbs_qp) {
+    return (verbs_qp != nullptr) ? verbs_qp->get_uar_addr() : nullptr;  // NULL handle -> nullptr
+}
+
+void doca_verbs_qp_get_wq(const struct doca_verbs_qp *verbs_qp, void **sq_buf,
+                          uint32_t *sq_num_entries, void **rq_buf, uint32_t *rq_num_entries,
+                          uint32_t *rwqe_size_bytes) {
+    *sq_buf = verbs_qp->get_sq_buf();  // NOTE(review): no argument is NULL-checked here
+    *sq_num_entries = verbs_qp->get_sq_size_wqebb();  // send side: buffer, then its size
+    *rq_buf = verbs_qp->get_rq_buf();
+    *rq_num_entries = verbs_qp->get_rq_size();  // receive side: buffer, then its size
+    *rwqe_size_bytes = verbs_qp->get_rcv_wqe_size();
+}
+
+doca_error_t doca_verbs_qp_init_attr_set_srq(struct doca_verbs_qp_init_attr *verbs_qp_init_attr,
+                                             struct doca_verbs_srq *srq) {
+    // Remember the SRQ handle in the init attributes. Only the pointer is stored, and a
+    // NULL srq is refused, so this setter cannot clear a previously stored handle.
+    doca_error_t status = DOCA_ERROR_INVALID_VALUE;
+    if (verbs_qp_init_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set srq: parameter verbs_qp_init_attr is NULL");
+    } else if (srq == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to set srq: parameter srq is NULL");
+    } else {
+        verbs_qp_init_attr->srq = srq;
+        status = DOCA_SUCCESS;
+    }
+    return status;
+}
+
+struct doca_verbs_srq *doca_verbs_qp_init_attr_get_srq(
+    const struct doca_verbs_qp_init_attr *verbs_qp_init_attr) {
+    // Return the stored SRQ pointer; a NULL attribute object is logged and yields nullptr.
+    if (verbs_qp_init_attr != nullptr) {
+        return verbs_qp_init_attr->srq;
+    }
+    DOCA_LOG(LOG_ERR, "Failed to get srq: parameter verbs_qp_init_attr is NULL");
+    return nullptr;
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_qp.hpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_qp.hpp
new file mode 100644
index 00000000000..a4ae748ca1b
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_qp.hpp
@@ -0,0 +1,211 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "host/doca_verbs.h"
+#include "doca_verbs_uar.hpp"
+
+struct doca_verbs_ah_attr {  // address-handle attributes; edited through doca_verbs_ah_attr_set_*()
+    struct doca_verbs_gid gid = {};
+    enum doca_verbs_addr_type addr_type = DOCA_VERBS_ADDR_TYPE_IPv4;
+    uint32_t dlid = 0;
+    uint8_t sl = 0;
+    uint8_t sgid_index = 0;
+    uint8_t static_rate = 0;
+    uint8_t hop_limit = 0;
+    uint8_t traffic_class = 0;
+    uint8_t is_global = 0;  // doca_verbs_ah_attr_create() sets this to 1 after calloc()
+};
+
+struct doca_verbs_qp_init_attr {  // creation-time parameters handed to the doca_verbs_qp constructor
+    struct ibv_pd *pd = nullptr;
+    struct doca_verbs_cq *send_cq = nullptr;
+    struct doca_verbs_cq *receive_cq = nullptr;
+    struct doca_verbs_srq *srq = nullptr;  // doca_verbs_qp_init_attr_set_srq()
+    struct doca_verbs_umem *external_umem = nullptr;
+    struct doca_verbs_umem *external_umem_dbr = nullptr;
+    struct doca_verbs_uar *external_uar = nullptr;  // doca_verbs_qp_init_attr_set_external_uar()
+    uint64_t external_umem_offset = 0;
+    uint64_t external_umem_dbr_offset = 0;
+    int sq_sig_all = 0;
+    uint32_t sq_wr = 0;
+    uint32_t rq_wr = 0;
+    uint32_t send_max_sges = 0;
+    uint32_t receive_max_sges = 0;
+    uint32_t max_inline_data = 0;
+    uint32_t user_index = 0;
+    uint32_t qp_type = 0;
+    void *qp_context = nullptr;  // opaque caller pointer, see ..._set_qp_context()
+    uint32_t send_cqn = 0;
+    uint32_t receive_cqn = 0;
+    uint8_t core_direct_master = 0;  // the setter accepts only 0 or 1
+};
+
+struct doca_verbs_qp_attr {  // attributes consumed by doca_verbs_qp_modify() / filled by query
+    enum doca_verbs_qp_state next_state = DOCA_VERBS_QP_STATE_RST;
+    enum doca_verbs_qp_state current_state = DOCA_VERBS_QP_STATE_RST;
+    enum doca_verbs_mtu_size path_mtu = DOCA_VERBS_MTU_SIZE_256_BYTES;
+    uint32_t rq_psn = 0;
+    uint32_t sq_psn = 0;
+    uint32_t dest_qp_num = 0;
+    int allow_remote_write = 0;
+    int allow_remote_read = 0;
+    enum doca_verbs_qp_atomic_type allow_remote_atomic = {};
+    doca_verbs_ah_attr *ah_attr = nullptr;  // pointer only; stored by ..._set_ah_attr()
+    uint16_t pkey_index = 0;
+    uint16_t port_num = 0;
+    uint8_t ack_timeout = 0;    // NOTE: the four fields from here are 8-bit, while their
+    uint8_t retry_cnt = 0;      // C setters/getters exchange uint16_t values
+    uint8_t rnr_retry = 0;
+    uint8_t min_rnr_timer = 0;
+    uint8_t core_direct_master = 0;
+};
+
+/**
+ *  @brief This struct implements the doca_verbs_qp
+ */
+struct doca_verbs_qp {
+   public:
+    /**
+     * @brief constructor
+     *
+     * @param [in] ibv_ctx
+     * The ibv context
+     * @param [in] verbs_qp_init_attr
+     * The DOCA IB Verbs QP attributes
+     *
+     */
+    doca_verbs_qp(struct ibv_context *ibv_ctx, struct doca_verbs_qp_init_attr &verbs_qp_init_attr);
+    // NOTE(review): doca_verbs_qp_create() catches doca_error_t around `new` - presumably thrown here.
+    /**
+     * @brief destructor
+     */
+    ~doca_verbs_qp();
+    // NOTE(review): create() is not noexcept, unlike most members below - confirm it may throw.
+    void create(struct ibv_context *ibv_ctx);
+    // Called by doca_verbs_qp_destroy() before the object is deleted.
+    doca_error_t destroy() noexcept;
+    // Read-only accessors; several are forwarded by the C getters doca_verbs_qp_get_*().
+    uint32_t get_qpn() const noexcept;
+
+    void *get_sq_buf() const noexcept;
+
+    void *get_rq_buf() const noexcept;
+
+    uint32_t get_sq_size_wqebb() const noexcept;
+
+    uint32_t get_rq_size() const noexcept;
+
+    uint32_t get_rcv_wqe_size() const noexcept;
+
+    void *get_dbr_addr() const noexcept;
+
+    void *get_uar_addr() const noexcept;
+
+    enum doca_verbs_uar_allocation_type get_uar_mtype() const noexcept;
+
+    doca_error_t create_qp_obj(uint32_t uar_id, uint32_t log_rq_size, uint32_t log_sq_size,
+                               uint32_t log_stride, uint64_t dbr_umem_offset, uint32_t dbr_umem_id,
+                               uint32_t wq_umem_id,
+                               struct doca_verbs_qp_init_attr &verbs_qp_init_attr) noexcept;
+    // State transitions dispatched by doca_verbs_qp_modify(); param_mask is the caller's attr_mask.
+    doca_error_t rst2init(struct doca_verbs_qp_attr &verbs_qp_attr, int param_mask) noexcept;
+
+    doca_error_t init2init(struct doca_verbs_qp_attr &verbs_qp_attr, int param_mask) noexcept;
+
+    doca_error_t init2rtr(struct doca_verbs_qp_attr &verbs_qp_attr, int param_mask) noexcept;
+
+    doca_error_t rtr2rts(struct doca_verbs_qp_attr &verbs_qp_attr, int param_mask) noexcept;
+
+    doca_error_t rts2rts(struct doca_verbs_qp_attr &verbs_qp_attr, int param_mask) noexcept;
+
+    doca_error_t qp2err(struct doca_verbs_qp_attr &verbs_qp_attr, int param_mask) noexcept;
+
+    doca_error_t qp2rst(struct doca_verbs_qp_attr &verbs_qp_attr, int param_mask) noexcept;
+    // Backs doca_verbs_qp_query().
+    doca_error_t query_qp(struct doca_verbs_qp_attr &verbs_qp_attr,
+                          struct doca_verbs_qp_init_attr &verbs_qp_init_attr) noexcept;
+    // Attribute validators; is_qp_attr_valid() is the one doca_verbs_qp_modify() calls.
+    bool is_qp_attr_state_valid(enum doca_verbs_qp_state state) noexcept;
+
+    bool is_qp_attr_path_mtu_valid(enum doca_verbs_mtu_size path_mtu) noexcept;
+    // NOTE(review): returns uint32_t while the sibling validators return bool - confirm intent.
+    uint32_t is_qp_attr_queue_psn_valid(uint32_t psn) noexcept;
+
+    bool is_qp_attr_ah_add_type_valid(enum doca_verbs_addr_type addr_type) noexcept;
+
+    bool is_qp_attr_ah_sgid_index_valid(uint8_t sgid_index) noexcept;
+
+    bool is_qp_attr_pkey_index_valid(uint16_t pkey_index) noexcept;
+
+    bool is_qp_attr_port_num_valid(uint16_t port_num) noexcept;
+
+    bool is_qp_attr_valid(struct doca_verbs_qp_attr *verbs_qp_attr, int attr_mask) noexcept;
+    // Used by doca_verbs_qp_modify() when attr_mask lacks DOCA_VERBS_QP_ATTR_CURRENT_STATE.
+    doca_verbs_qp_state get_current_state() const noexcept;
+
+   private:
+    struct mlx5dv_devx_obj *m_qp_obj{};
+    struct mlx5dv_devx_umem *m_umem_obj{};
+    struct mlx5dv_devx_uar *m_uar_obj{};
+    uint8_t *m_umem_buf{};
+    uint8_t *m_wq_buf{};
+    struct ibv_context *m_ibv_ctx{};
+    struct ibv_pd *m_pd{};
+    uint32_t m_qp_type{DOCA_VERBS_QP_TYPE_RC};
+    doca_verbs_addr_type m_addr_type{DOCA_VERBS_ADDR_TYPE_IPv4};
+    doca_verbs_qp_state m_current_state{DOCA_VERBS_QP_STATE_RST};
+    uint32_t m_rcv_wqe_size{};
+    uint32_t m_send_wqe_size{};
+    doca_verbs_qp_init_attr m_init_attr{};
+    struct doca_verbs_device_attr *m_verbs_device_attr{};
+    uint32_t m_rcv_max_sges{};
+    uint8_t m_log_rcv_wqe_size{};
+    uint32_t m_rq_size{};
+    uint32_t m_send_max_sges{};
+    uint32_t m_sq_size_wqebb{};
+    uint32_t m_sq_size_wr{};
+    uint32_t m_max_inline_data_length{};
+    uint64_t *m_uar_db_reg{};
+    uint8_t *m_sq_buf{};
+    uint8_t *m_rq_buf{};
+    uint32_t m_qp_num{};
+    uint32_t *m_db_buffer{};
+    struct doca_verbs_srq *m_srq{};
+    // Non-copyable: copy construction and copy assignment are deleted.
+    doca_verbs_qp(doca_verbs_qp const &) = delete;
+    doca_verbs_qp &operator=(doca_verbs_qp const &) = delete;
+};
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_srq.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_srq.cpp
new file mode 100644
index 00000000000..408aabe9f2b
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_srq.cpp
@@ -0,0 +1,580 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <malloc.h>
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <mutex>
+#include <time.h>
+#include <string.h>
+
+#include "host/mlx5_prm.h"
+#include "host/mlx5_ifc.h"
+
+#include "doca_internal.hpp"
+#include "doca_verbs_device_attr.hpp"
+#include "doca_verbs_srq.hpp"
+#include "doca_verbs_net_wrapper.h"
+
+/* Bytes reserved for the SRQ doorbell record that follows the work queue inside the umem */
+#define DOCA_VERBS_SRQ_DB_SIZE 64
+/* NOTE(review): presumably log2 of a 64-byte WQE basic block; not referenced in this file - confirm */
+#define DOCA_VERBS_LOG_WQEBB_SIZE 6
+/* Minimum receive WQE size, in bytes, applied to linked-list SRQs */
+#define DOCA_VERBS_MIN_SRQ_SIZE 32
+/* Size in bytes of one receive data segment (one scatter/gather entry of a receive WQE) */
+#define DOCA_VERBS_DATA_SEG_SIZE_IN_BYTES sizeof(struct doca_internal_mlx5_wqe_data_seg)
+/* Size in bytes of the control segment added to every receive WQE of a linked-list SRQ */
+#define DOCA_VERBS_CONTROL_SEG_SIZE_IN_BYTES sizeof(struct doca_internal_mlx5_wqe_mprq_next_seg)
+/* Thin wrapper over std::max; both arguments must have the same type */
+#define MAX(a, b) std::max(a, b)
+
+/*********************************************************************************************************************
+ * Helper functions
+ *********************************************************************************************************************/
+
+namespace {
+
+/* DevX CREATE_RMP command input/output buffers: uint32_t arrays whose length is MLX5_ST_SZ_DW() */
+typedef uint32_t create_rmp_in[MLX5_ST_SZ_DW(create_rmp_in)];
+typedef uint32_t create_rmp_out[MLX5_ST_SZ_DW(create_rmp_out)];
+
+} /* namespace */
+
+/**********************************************************************************************************************
+ * doca_verbs_srq Member Functions
+ *********************************************************************************************************************/
+
+/*
+ * Build and issue the DevX CREATE_RMP command that backs this SRQ.
+ *
+ * log_srq_size        - value written to the WQ log_wq_sz field
+ * log_stride          - value written to the WQ log_wq_stride field
+ * dbr_umem_offset     - doorbell record offset inside the umem identified by dbr_umem_id
+ * wq_umem_offset      - work queue offset inside the umem identified by wq_umem_id
+ * verbs_srq_init_attr - only its srq_type is read here
+ *
+ * Returns DOCA_SUCCESS after storing the created object in m_srq_obj and its number in
+ * m_srq_num, DOCA_ERROR_DRIVER when the PD number cannot be queried, or the status reported
+ * by the DevX object creation wrapper.
+ */
+doca_error_t doca_verbs_srq::create_srq_obj(
+    uint32_t log_srq_size, uint32_t log_stride, uint64_t dbr_umem_offset, uint32_t dbr_umem_id,
+    uint64_t wq_umem_offset, uint32_t wq_umem_id,
+    struct doca_verbs_srq_init_attr &verbs_srq_init_attr) noexcept {
+    /* Resolve the protection domain number first; the command buffers are only needed after */
+    struct mlx5dv_pd pd_info;
+    struct mlx5dv_obj pd_query;
+    memset(&pd_query, 0, sizeof(pd_query));
+    pd_query.pd.in = m_pd;
+    pd_query.pd.out = &pd_info;
+
+    if (doca_verbs_wrapper_mlx5dv_init_obj(&pd_query, MLX5DV_OBJ_PD)) {
+        DOCA_LOG(LOG_ERR, "Error in mlx5dv PD initialization");
+        return DOCA_ERROR_DRIVER;
+    }
+
+    create_rmp_in cmd_in{0};
+    create_rmp_out cmd_out{0};
+    void *rmp_ctx = MLX5_ADDR_OF(create_rmp_in, cmd_in, ctx);
+    void *wq_ctx = MLX5_ADDR_OF(rmpc, rmp_ctx, wq);
+
+    DEVX_SET(create_rmp_in, cmd_in, opcode, MLX5_CMD_OP_CREATE_RMP);
+    DEVX_SET(rmpc, rmp_ctx, state, MLX5_SQC_STATE_RDY);
+    DEVX_SET(wq, wq_ctx, pd, pd_info.pdn);
+
+    /* Linked-list SRQs program 0 into both fields; every other type is treated as contiguous (1) */
+    const bool is_linked_list =
+        (verbs_srq_init_attr.srq_type == DOCA_VERBS_SRQ_TYPE_LINKED_LIST);
+    const uint32_t cyclic_mode = is_linked_list ? 0 : 1;
+    DEVX_SET(wq, wq_ctx, wq_type, cyclic_mode);
+    DEVX_SET(rmpc, rmp_ctx, basic_cyclic_rcv_wqe, cyclic_mode);
+    m_srq_type =
+        is_linked_list ? DOCA_VERBS_SRQ_TYPE_LINKED_LIST : DOCA_VERBS_SRQ_TYPE_CONTIGUOUS;
+
+    /* Queue geometry */
+    DEVX_SET(wq, wq_ctx, log_wq_sz, log_srq_size);
+    DEVX_SET(wq, wq_ctx, log_wq_stride, log_stride);
+    DEVX_SET(wq, wq_ctx, end_padding_mode, 1);
+
+    /* Work queue memory */
+    DEVX_SET(wq, wq_ctx, wq_umem_id, wq_umem_id);
+    DEVX_SET64(wq, wq_ctx, wq_umem_offset, wq_umem_offset);
+    DEVX_SET(wq, wq_ctx, wq_umem_valid, 1);
+
+    /* Doorbell record memory */
+    DEVX_SET(wq, wq_ctx, dbr_umem_id, dbr_umem_id);
+    DEVX_SET64(wq, wq_ctx, dbr_addr, dbr_umem_offset);
+    DEVX_SET(wq, wq_ctx, dbr_umem_valid, 1);
+
+    /* Create DevX object */
+    const auto create_status = doca_verbs_wrapper_mlx5dv_devx_obj_create(
+        m_ctx, cmd_in, sizeof(cmd_in), cmd_out, sizeof(cmd_out), &m_srq_obj);
+    if (create_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create SRQ. DevX error, syndrome=0x%x",
+                 DEVX_GET(nop_out, cmd_out, syndrome));
+        return create_status;
+    }
+
+    m_srq_num = DEVX_GET(create_rmp_out, cmd_out, rmpn);
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Release everything the SRQ owns: the queried device attributes, the DevX SRQ object,
+ * the umem registration and the internally allocated buffer.
+ *
+ * Each resource is skipped when it is not held, so the function can be called again
+ * (the destructor does so). Teardown stops at the first failing step and returns its
+ * status, leaving the remaining resources in place.
+ *
+ * Returns DOCA_SUCCESS when nothing is left to release.
+ */
+doca_error_t doca_verbs_srq::destroy() noexcept {
+    if (m_verbs_device_attr) {
+        auto status = doca_verbs_device_attr_free(m_verbs_device_attr);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to free device attr");
+            /* Report the real failure, as the other steps do, instead of masking it */
+            return status;
+        }
+        m_verbs_device_attr = nullptr;
+    }
+
+    /* The SRQ object goes before the umem registration it references */
+    if (m_srq_obj) {
+        auto destroy_status = doca_verbs_wrapper_mlx5dv_devx_obj_destroy(m_srq_obj);
+        if (destroy_status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy SRQ object");
+            return destroy_status;
+        }
+        m_srq_obj = nullptr;
+    }
+
+    if (m_umem_obj) {
+        auto dereg_status = doca_verbs_wrapper_mlx5dv_devx_umem_dereg(m_umem_obj);
+        if (dereg_status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to destroy UMEM object");
+            return dereg_status;
+        }
+        m_umem_obj = nullptr;
+    }
+
+    if (m_umem_buf) {
+        free(m_umem_buf);
+        m_umem_buf = nullptr;
+    }
+
+    /* Both pointers alias the queue memory that is gone now; never leave them dangling */
+    m_srq_buf = nullptr;
+    m_db_buffer = nullptr;
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Validate the init attributes, derive the receive WQE and queue sizes, set up the memory
+ * holding the work queue and its doorbell record (internally allocated, or taken from the
+ * external umem when one was supplied) and create the DevX SRQ object.
+ *
+ * ctx - device context stored in m_ctx and used for every device call below.
+ *
+ * Throws a doca_error_t on failure. Resources acquired before the failure stay recorded
+ * in the members; the constructor releases them through destroy().
+ */
+void doca_verbs_srq::create(struct ibv_context *ctx) {
+    auto status{DOCA_SUCCESS};
+    m_ctx = ctx;
+
+    /* Query device attr */
+    status = doca_verbs_query_device(m_ctx, &m_verbs_device_attr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to query device attr");
+        /* Propagate the query status instead of masking it as an invalid-value error */
+        throw status;
+    }
+
+    if ((m_init_attr.srq_type != DOCA_VERBS_SRQ_TYPE_LINKED_LIST) &&
+        (m_init_attr.srq_type != DOCA_VERBS_SRQ_TYPE_CONTIGUOUS)) {
+        DOCA_LOG(LOG_ERR, "SRQ type is invalid");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+    if (m_init_attr.pd == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create DOCA IB Verbs SRQ: pd is NULL");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+
+    m_pd = m_init_attr.pd;
+
+    if (m_init_attr.srq_wr == 0) {
+        DOCA_LOG(LOG_ERR, "Failed to create DOCA IB Verbs SRQ: srq_wr is 0");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+    if (m_init_attr.srq_wr > m_verbs_device_attr->m_max_srq_wr) {
+        DOCA_LOG(LOG_ERR,
+                 "Failed to create DOCA IB Verbs SRQ: The requested srq_wr is larger than the "
+                 "maximum supported srq_wr by the device");
+        throw DOCA_ERROR_NOT_SUPPORTED;
+    }
+    if (m_init_attr.receive_max_sges == 0) {
+        DOCA_LOG(LOG_ERR, "Failed to create DOCA IB Verbs SRQ: receive_max_sges is 0");
+        throw DOCA_ERROR_INVALID_VALUE;
+    }
+
+    if (m_init_attr.receive_max_sges > m_verbs_device_attr->m_max_srq_sge) {
+        DOCA_LOG(LOG_ERR,
+                 "Failed to create DOCA IB Verbs SRQ: The requested sge size is larger than the "
+                 "maximum supported sge size by the device");
+        throw DOCA_ERROR_NOT_SUPPORTED;
+    }
+
+    m_rcv_max_sges = m_init_attr.receive_max_sges;
+
+    /* Calculate receive_wqe size */
+    m_rcv_wqe_size = m_rcv_max_sges * DOCA_VERBS_DATA_SEG_SIZE_IN_BYTES;
+    if (m_init_attr.srq_type == DOCA_VERBS_SRQ_TYPE_LINKED_LIST) {
+        m_rcv_wqe_size += DOCA_VERBS_CONTROL_SEG_SIZE_IN_BYTES;
+        /* For LL SRQ: Minimum receive WQE size for SRQ is 32 bytes */
+        m_rcv_wqe_size = MAX(m_rcv_wqe_size, static_cast<uint32_t>(DOCA_VERBS_MIN_SRQ_SIZE));
+    }
+
+    m_rcv_wqe_size = doca_internal_utils_next_power_of_two(m_rcv_wqe_size);
+
+    /* Calculate the actual max_sges size according to the actual wqe size */
+    if (m_init_attr.srq_type == DOCA_VERBS_SRQ_TYPE_LINKED_LIST) {
+        m_rcv_max_sges = (m_rcv_wqe_size - DOCA_VERBS_CONTROL_SEG_SIZE_IN_BYTES) /
+                         DOCA_VERBS_DATA_SEG_SIZE_IN_BYTES;
+    } else {  // m_init_attr.srq_type = DOCA_VERBS_SRQ_TYPE_CONTIGUOUS
+        m_rcv_max_sges = m_rcv_wqe_size / DOCA_VERBS_DATA_SEG_SIZE_IN_BYTES;
+    }
+
+    m_log_rcv_wqe_size = static_cast<uint8_t>(doca_internal_utils_log2(m_rcv_wqe_size));
+
+    /* Calculate SRQ size in bytes */
+    auto srq_size_bytes = static_cast<uint32_t>(
+        doca_internal_utils_next_power_of_two(m_init_attr.srq_wr * m_rcv_wqe_size));
+
+    /* Calculate SRQ size in receive_wqe units */
+    m_srq_size = srq_size_bytes / m_rcv_wqe_size;
+    auto log_srq_size = doca_internal_utils_log2(m_srq_size);
+
+    uint32_t dbr_umem_id{0};
+    uint64_t dbr_umem_offset{0};
+    uint32_t wq_umem_id{0};
+    uint64_t wq_umem_offset{0};
+
+    /* Calculate DBR offset */
+    auto db_umem_offset = m_srq_size * m_rcv_wqe_size;
+
+    /* Align the Work Queue size to cacheline size for better performance */
+    db_umem_offset = doca_internal_utils_align_up_uint32(db_umem_offset, DOCA_VERBS_CACHELINE_SIZE);
+
+    if (m_init_attr.external_umem == nullptr) {
+        /* Case of internal umem */
+
+        auto total_umem_size = doca_internal_utils_align_up_uint32(
+            db_umem_offset + DOCA_VERBS_SRQ_DB_SIZE, DOCA_VERBS_PAGE_SIZE);
+        m_umem_buf = (uint8_t *)memalign(DOCA_VERBS_PAGE_SIZE, total_umem_size);
+        if (m_umem_buf == nullptr) {
+            /* memalign() returns NULL on failure; zeroing it below would dereference NULL */
+            DOCA_LOG(LOG_ERR, "Failed to allocate SRQ UMEM buffer");
+            throw DOCA_ERROR_NO_MEMORY;
+        }
+        memset(m_umem_buf, 0, total_umem_size);
+
+        m_srq_buf = m_umem_buf;
+
+        /* Create UMEM object */
+        auto umem_status = doca_verbs_wrapper_mlx5dv_devx_umem_reg(m_ctx, m_srq_buf,
+                                                                   total_umem_size, 0, &m_umem_obj);
+        if (umem_status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to create SRQ UMEM");
+            throw umem_status;
+        }
+
+        dbr_umem_id = m_umem_obj->umem_id;
+        dbr_umem_offset = db_umem_offset;
+
+        wq_umem_id = m_umem_obj->umem_id;
+        wq_umem_offset = 0;
+    } else {
+        /* Case of external umem: queue and doorbell record live at the caller-chosen offset */
+        status = doca_verbs_umem_get_address(m_init_attr.external_umem,
+                                             reinterpret_cast<void **>(&m_srq_buf));
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get external umem address");
+            throw status;
+        }
+
+        m_srq_buf += m_init_attr.external_umem_offset;
+
+        status = doca_verbs_umem_get_id(m_init_attr.external_umem, &wq_umem_id);
+        if (status != DOCA_SUCCESS) {
+            DOCA_LOG(LOG_ERR, "Failed to get umem id");
+            throw status;
+        }
+
+        wq_umem_offset = m_init_attr.external_umem_offset;
+
+        dbr_umem_offset = m_init_attr.external_umem_offset + db_umem_offset;
+        dbr_umem_id = wq_umem_id;
+    }
+
+    /* In both cases the doorbell record sits db_umem_offset bytes after the queue start */
+    m_db_buffer = reinterpret_cast<uint32_t *>(m_srq_buf + db_umem_offset);
+
+    /* Create SRQ object */
+    status = create_srq_obj(log_srq_size, m_log_rcv_wqe_size, dbr_umem_offset, dbr_umem_id,
+                            wq_umem_offset, wq_umem_id, m_init_attr);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to create SRQ object");
+        throw status;
+    }
+}
+
+/*
+ * Construct an SRQ on the given device context, keeping a copy of the init attributes.
+ * If creation throws, whatever was already acquired is released and the same exception
+ * is rethrown to the caller.
+ */
+doca_verbs_srq::doca_verbs_srq(struct ibv_context *ctx,
+                               struct doca_verbs_srq_init_attr &verbs_srq_init_attr)
+    : m_ctx(ctx), m_init_attr(verbs_srq_init_attr) {
+    try {
+        this->create(ctx);
+    } catch (...) {
+        /* Best-effort cleanup: the status of destroy() is deliberately ignored */
+        static_cast<void>(this->destroy());
+        DOCA_LOG(LOG_ERR, "Failed to create SRQ");
+        throw;
+    }
+}
+
+/* Destructor: releases the SRQ resources; a failure status from destroy() is ignored */
+doca_verbs_srq::~doca_verbs_srq() {
+    (void)destroy();
+}
+
+/* Address of the SRQ work queue buffer */
+void *doca_verbs_srq::get_srq_buf() const noexcept {
+    return static_cast<void *>(m_srq_buf);
+}
+
+/* Number of receive WQEs in the SRQ */
+uint32_t doca_verbs_srq::get_srq_size() const noexcept {
+    return m_srq_size;
+}
+
+/* Size in bytes of one receive WQE */
+uint32_t doca_verbs_srq::get_rcv_wqe_size() const noexcept {
+    return m_rcv_wqe_size;
+}
+
+/* Address of the SRQ doorbell record */
+void *doca_verbs_srq::get_dbr_addr() const noexcept {
+    return static_cast<void *>(m_db_buffer);
+}
+
+/* SRQ number reported by the device when the object was created */
+uint32_t doca_verbs_srq::get_srqn() const noexcept {
+    return m_srq_num;
+}
+
+/**********************************************************************************************************************
+ * Public API functions
+ *********************************************************************************************************************/
+
+/*
+ * Allocate a zero-initialized SRQ init-attributes object and hand it back through
+ * verbs_srq_init_attr.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when verbs_srq_init_attr is NULL, DOCA_ERROR_NO_MEMORY
+ * when the allocation fails (the output is then set to NULL), DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_srq_init_attr_create(
+    struct doca_verbs_srq_init_attr **verbs_srq_init_attr) {
+    if (!verbs_srq_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to create srq_init_attr: parameter verbs_srq_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    auto *attr = static_cast<struct doca_verbs_srq_init_attr *>(
+        calloc(1, sizeof(struct doca_verbs_srq_init_attr)));
+    /* The output is written even on failure, so the caller then sees NULL */
+    *verbs_srq_init_attr = attr;
+    if (!attr) {
+        DOCA_LOG(LOG_ERR, "Failed to create srq_init_attr: failed to allocate memory");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Free an SRQ init-attributes object.
+ *
+ * The pointer is received by value, so the caller's own copy is not cleared here and must
+ * not be used after this call.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when verbs_srq_init_attr is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_srq_init_attr_destroy(
+    struct doca_verbs_srq_init_attr *verbs_srq_init_attr) {
+    if (verbs_srq_init_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy srq_init_attr: parameter verbs_srq_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    free(verbs_srq_init_attr);
+
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Create an SRQ on the given device context from the given init attributes.
+ *
+ * On success the new object is stored in *verbs_srq; on failure *verbs_srq is left untouched.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when any parameter is NULL, the doca_error_t thrown by
+ * the SRQ constructor when creation fails, DOCA_ERROR_NO_MEMORY when any other exception
+ * is raised (the object allocation is the only other visible source of one), and
+ * DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_srq_create(struct ibv_context *context,
+                                   struct doca_verbs_srq_init_attr *verbs_srq_init_attr,
+                                   struct doca_verbs_srq **verbs_srq) {
+    if (context == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create verbs_srq: parameter context is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (verbs_srq_init_attr == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create verbs_srq: parameter verbs_srq_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (verbs_srq == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create verbs_srq: parameter verbs_srq is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    try {
+        *verbs_srq = new doca_verbs_srq(context, *verbs_srq_init_attr);
+    } catch (doca_error_t err) {
+        return err;
+    } catch (...) {
+        /* No exception may leave this API function; map anything else to an allocation error */
+        DOCA_LOG(LOG_ERR, "Failed to create verbs_srq: failed to allocate memory");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+
+    /* Log the created object itself, not the address of the caller's output variable */
+    DOCA_LOG(LOG_INFO, "doca_verbs_srq=%p was created", static_cast<void *>(*verbs_srq));
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Release the resources of an SRQ and delete the object.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when verbs_srq is NULL. When releasing the resources
+ * fails, that status is returned and the object is not deleted.
+ */
+doca_error_t doca_verbs_srq_destroy(struct doca_verbs_srq *verbs_srq) {
+    if (!verbs_srq) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy verbs_srq: parameter verbs_srq is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    const auto destroy_status = verbs_srq->destroy();
+    if (destroy_status == DOCA_SUCCESS) {
+        delete verbs_srq;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to destroy verbs_srq");
+    return destroy_status;
+}
+
+/*
+ * Store the requested number of SRQ work requests in the init attributes.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when verbs_srq_init_attr is NULL or srq_wr is 0,
+ * DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_srq_init_attr_set_srq_wr(
+    struct doca_verbs_srq_init_attr *verbs_srq_init_attr, uint32_t srq_wr) {
+    if (!verbs_srq_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set srq_wr: parameter verbs_srq_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!srq_wr) {
+        DOCA_LOG(LOG_ERR, "Failed to set srq_wr: parameter srq_wr is 0");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_srq_init_attr->srq_wr = srq_wr;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Read the number of SRQ work requests stored in the init attributes.
+ * Returns 0 (after logging an error) when verbs_srq_init_attr is NULL.
+ */
+uint32_t doca_verbs_srq_init_attr_get_srq_wr(
+    const struct doca_verbs_srq_init_attr *verbs_srq_init_attr) {
+    if (verbs_srq_init_attr != nullptr) {
+        return verbs_srq_init_attr->srq_wr;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get srq_wr: parameter verbs_srq_init_attr is NULL");
+    return 0;
+}
+
+/*
+ * Store the requested maximum number of scatter/gather entries per receive WQE.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when verbs_srq_init_attr is NULL or receive_max_sges
+ * is 0, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_srq_init_attr_set_receive_max_sges(
+    struct doca_verbs_srq_init_attr *verbs_srq_init_attr, uint32_t receive_max_sges) {
+    if (!verbs_srq_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set receive_max_sges: parameter verbs_srq_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!receive_max_sges) {
+        DOCA_LOG(LOG_ERR, "Failed to set receive_max_sges: parameter receive_max_sges is 0");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_srq_init_attr->receive_max_sges = receive_max_sges;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Read the maximum number of scatter/gather entries stored in the init attributes.
+ * Returns 0 (after logging an error) when verbs_srq_init_attr is NULL.
+ */
+uint32_t doca_verbs_srq_init_attr_get_receive_max_sges(
+    const struct doca_verbs_srq_init_attr *verbs_srq_init_attr) {
+    if (verbs_srq_init_attr != nullptr) {
+        return verbs_srq_init_attr->receive_max_sges;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get receive_max_sges: parameter verbs_srq_init_attr is NULL");
+    return 0;
+}
+
+/*
+ * Store the SRQ type in the init attributes.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when verbs_srq_init_attr is NULL or srq_type is neither
+ * the linked-list nor the contiguous type, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_srq_init_attr_set_type(struct doca_verbs_srq_init_attr *verbs_srq_init_attr,
+                                               enum doca_verbs_srq_type srq_type) {
+    if (!verbs_srq_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set srq_type: parameter verbs_srq_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    const bool is_known_type = (srq_type == DOCA_VERBS_SRQ_TYPE_LINKED_LIST) ||
+                               (srq_type == DOCA_VERBS_SRQ_TYPE_CONTIGUOUS);
+    if (!is_known_type) {
+        DOCA_LOG(LOG_ERR, "Failed to set srq_type: parameter srq_type is invalid");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_srq_init_attr->srq_type = srq_type;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Read the SRQ type stored in the init attributes.
+ * Returns DOCA_VERBS_SRQ_TYPE_LINKED_LIST (after logging an error) when
+ * verbs_srq_init_attr is NULL.
+ */
+enum doca_verbs_srq_type doca_verbs_srq_init_attr_get_type(
+    const struct doca_verbs_srq_init_attr *verbs_srq_init_attr) {
+    if (verbs_srq_init_attr != nullptr) {
+        return verbs_srq_init_attr->srq_type;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get srq_type: parameter verbs_srq_init_attr is NULL");
+    return DOCA_VERBS_SRQ_TYPE_LINKED_LIST;
+}
+
+/*
+ * Store the protection domain in the init attributes.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when verbs_srq_init_attr or pd is NULL,
+ * DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_srq_init_attr_set_pd(struct doca_verbs_srq_init_attr *verbs_srq_init_attr,
+                                             struct ibv_pd *pd) {
+    if (!verbs_srq_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set pd: parameter verbs_srq_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!pd) {
+        DOCA_LOG(LOG_ERR, "Failed to set pd: parameter pd is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    verbs_srq_init_attr->pd = pd;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Read the protection domain stored in the init attributes.
+ * Returns NULL (after logging an error) when verbs_srq_init_attr is NULL.
+ */
+struct ibv_pd *doca_verbs_srq_init_attr_get_pd(
+    const struct doca_verbs_srq_init_attr *verbs_srq_init_attr) {
+    if (verbs_srq_init_attr != nullptr) {
+        return verbs_srq_init_attr->pd;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to get pd: parameter verbs_srq_init_attr is NULL");
+    return nullptr;
+}
+
+/*
+ * Store an external umem and the offset inside it in the init attributes.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when verbs_srq_init_attr or external_umem is NULL,
+ * DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_srq_init_attr_set_external_umem(
+    struct doca_verbs_srq_init_attr *verbs_srq_init_attr, struct doca_verbs_umem *external_umem,
+    uint64_t external_umem_offset) {
+    if (!verbs_srq_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_umem: parameter verbs_srq_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!external_umem) {
+        DOCA_LOG(LOG_ERR, "Failed to set external_umem: parameter external_umem is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    /* The umem and its offset are always stored as a pair */
+    verbs_srq_init_attr->external_umem_offset = external_umem_offset;
+    verbs_srq_init_attr->external_umem = external_umem;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Read the external umem and its offset from the init attributes.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when any of the three parameters is NULL (nothing is
+ * written in that case), DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_srq_init_attr_get_external_umem(
+    const struct doca_verbs_srq_init_attr *verbs_srq_init_attr,
+    struct doca_verbs_umem **external_umem, uint64_t *external_umem_offset) {
+    if (!verbs_srq_init_attr) {
+        DOCA_LOG(LOG_ERR, "Failed to get external_umem: parameter verbs_srq_init_attr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!external_umem) {
+        DOCA_LOG(LOG_ERR, "Failed to get external_umem: parameter external_umem is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!external_umem_offset) {
+        DOCA_LOG(LOG_ERR, "Failed to get external_umem: parameter external_umem_offset is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    *external_umem_offset = verbs_srq_init_attr->external_umem_offset;
+    *external_umem = verbs_srq_init_attr->external_umem;
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Report the SRQ work queue buffer address, its number of entries and the size in bytes
+ * of one receive WQE.
+ *
+ * The function cannot return a status, so when any parameter is NULL it logs an error
+ * and returns without writing to the outputs.
+ */
+void doca_verbs_srq_get_wq(const struct doca_verbs_srq *verbs_srq, void **srq_buf,
+                           uint32_t *srq_num_entries, uint32_t *rwqe_size_bytes) {
+    if (verbs_srq == nullptr || srq_buf == nullptr || srq_num_entries == nullptr ||
+        rwqe_size_bytes == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get srq wq: one or more parameters are NULL");
+        return;
+    }
+
+    *srq_buf = verbs_srq->get_srq_buf();
+    *srq_num_entries = verbs_srq->get_srq_size();
+    *rwqe_size_bytes = verbs_srq->get_rcv_wqe_size();
+}
+
+/*
+ * Return the address of the SRQ doorbell record.
+ * Returns NULL (after logging an error) when verbs_srq is NULL.
+ */
+void *doca_verbs_srq_get_dbr_addr(const struct doca_verbs_srq *verbs_srq) {
+    if (verbs_srq == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get dbr addr: parameter verbs_srq is NULL");
+        return nullptr;
+    }
+
+    return verbs_srq->get_dbr_addr();
+}
+
+/*
+ * Return the SRQ number.
+ * Returns 0 (after logging an error) when verbs_srq is NULL.
+ */
+uint32_t doca_verbs_srq_get_srqn(const struct doca_verbs_srq *verbs_srq) {
+    if (verbs_srq == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get srqn: parameter verbs_srq is NULL");
+        return 0;
+    }
+
+    return verbs_srq->get_srqn();
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_srq.hpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_srq.hpp
new file mode 100644
index 00000000000..2e897b8f469
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_srq.hpp
@@ -0,0 +1,109 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "host/doca_verbs.h"
+#include "doca_verbs_uar.hpp"
+
+/**
+ *  @brief Attributes used to create a doca verbs srq
+ */
+struct doca_verbs_srq_init_attr {
+    /* Protection domain the SRQ is created on; creation fails when it is NULL */
+    struct ibv_pd *pd = nullptr;
+    /* Linked-list or contiguous SRQ */
+    enum doca_verbs_srq_type srq_type = DOCA_VERBS_SRQ_TYPE_LINKED_LIST;
+    /* Requested number of work requests; must be non-zero and within the device limit */
+    uint32_t srq_wr = 0;
+    /* Requested scatter/gather entries per receive WQE; non-zero, within the device limit */
+    uint32_t receive_max_sges = 0;
+    /* Optional external umem for the queue and doorbell record; NULL selects an internal buffer */
+    struct doca_verbs_umem *external_umem = nullptr;
+    /* Byte offset of the work queue inside external_umem */
+    uint64_t external_umem_offset = 0;
+};
+
+/**
+ *  @brief This struct implements the doca verbs srq
+ *
+ *  Holds the DevX SRQ object together with the umem registration and host buffer that are
+ *  created for it when no external umem is supplied. The type is non-copyable.
+ */
+struct doca_verbs_srq {
+   public:
+    /**
+     * @brief constructor
+     *
+     * @param [in] ctx
+     * The ibv device context the SRQ is created on
+     * @param [in] verbs_srq_init_attr
+     * The DOCA IB Verbs SRQ attributes (a copy is kept in the object)
+     *
+     * Rethrows whatever create() throws, after releasing what was already acquired.
+     */
+    doca_verbs_srq(struct ibv_context *ctx, struct doca_verbs_srq_init_attr &verbs_srq_init_attr);
+
+    /**
+     * @brief destructor
+     *
+     * Calls destroy() and ignores its status.
+     */
+    ~doca_verbs_srq();
+
+    /**
+     * @brief Issue the DevX command that creates the SRQ object
+     *
+     * @return DOCA_SUCCESS on success, an error status otherwise
+     */
+    doca_error_t create_srq_obj(uint32_t log_srq_size, uint32_t log_stride,
+                                uint64_t dbr_umem_offset, uint32_t dbr_umem_id,
+                                uint64_t wq_umem_offset, uint32_t wq_umem_id,
+                                struct doca_verbs_srq_init_attr &verbs_srq_init_attr) noexcept;
+
+    /**
+     * @brief Validate the stored attributes and create the SRQ resources
+     *
+     * Throws a doca_error_t on failure.
+     */
+    void create(struct ibv_context *ctx);
+
+    /**
+     * @brief Release the SRQ resources; resources that are not held are skipped
+     *
+     * @return DOCA_SUCCESS on success, the status of the failing step otherwise
+     */
+    doca_error_t destroy() noexcept;
+
+    /** @brief Address of the SRQ work queue buffer */
+    void *get_srq_buf() const noexcept;
+
+    /** @brief Number of receive WQEs in the SRQ */
+    uint32_t get_srq_size() const noexcept;
+
+    /** @brief Size in bytes of one receive WQE */
+    uint32_t get_rcv_wqe_size() const noexcept;
+
+    /** @brief Address of the SRQ doorbell record */
+    void *get_dbr_addr() const noexcept;
+
+    /** @brief SRQ number reported by the device */
+    uint32_t get_srqn() const noexcept;
+
+   private:
+    struct mlx5dv_devx_obj *m_srq_obj = nullptr;   /* DevX SRQ object */
+    struct mlx5dv_devx_umem *m_umem_obj = nullptr; /* umem registration of the internal buffer */
+    uint8_t *m_umem_buf = nullptr;                 /* internally allocated buffer, if any */
+    uint8_t *m_srq_buf = nullptr;                  /* start of the work queue */
+    struct ibv_context *m_ctx = nullptr;
+    struct ibv_pd *m_pd = nullptr;
+    uint32_t m_srq_num = 0;
+    uint32_t *m_db_buffer = nullptr; /* doorbell record, located after the work queue */
+    uint32_t m_rcv_wqe_size = 0;
+    uint32_t m_rcv_max_sges = 0;
+    uint32_t m_srq_size = 0;
+    uint8_t m_log_rcv_wqe_size = 0;
+    enum doca_verbs_srq_type m_srq_type = DOCA_VERBS_SRQ_TYPE_LINKED_LIST;
+    doca_verbs_srq_init_attr m_init_attr{};
+    doca_verbs_device_attr *m_verbs_device_attr = nullptr;
+
+    /* Copying is disabled */
+    doca_verbs_srq(doca_verbs_srq const &) = delete;
+    doca_verbs_srq &operator=(doca_verbs_srq const &) = delete;
+};
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_uar.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_uar.cpp
new file mode 100644
index 00000000000..c0986735302
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_uar.cpp
@@ -0,0 +1,197 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <mutex>
+#include <time.h>
+
+#include "host/mlx5_prm.h"
+#include "host/mlx5_ifc.h"
+
+#include "doca_gpunetio_config.h"
+#include "doca_verbs_net_wrapper.h"
+#include "doca_internal.hpp"
+#include "doca_verbs_device_attr.hpp"
+#include "doca_verbs_uar.hpp"
+
+/*********************************************************************************************************************
+ * Helper functions
+ *********************************************************************************************************************/
+
+namespace {
+
+/*
+ * Translate a DOCA UAR allocation type into the matching mlx5dv UAR allocation type.
+ *
+ * uar_type      - DOCA allocation type to translate
+ * mlx5_uar_type - receives the mlx5dv value; written only on success
+ *
+ * Returns DOCA_SUCCESS on success, DOCA_ERROR_NOT_SUPPORTED for the dedicated non-cache
+ * type when DOCA_GPUNETIO_HAVE_DEDICATED_NC_UAR is not 1, and DOCA_ERROR_INVALID_VALUE
+ * for any other value.
+ */
+doca_error_t convert_doca_verbs_uar_type_to_mlx5_uar_type(doca_verbs_uar_allocation_type uar_type,
+                                                          uint32_t &mlx5_uar_type) noexcept {
+    switch (uar_type) {
+        case DOCA_VERBS_UAR_ALLOCATION_TYPE_BLUEFLAME:
+            mlx5_uar_type = MLX5DV_UAR_ALLOC_TYPE_BF;
+            break;
+        case DOCA_VERBS_UAR_ALLOCATION_TYPE_NONCACHE:
+            mlx5_uar_type = MLX5DV_UAR_ALLOC_TYPE_NC;
+            break;
+        case DOCA_VERBS_UAR_ALLOCATION_TYPE_NONCACHE_DEDICATED:
+#if DOCA_GPUNETIO_HAVE_DEDICATED_NC_UAR == 1
+            mlx5_uar_type = MLX5DV_UAR_ALLOC_TYPE_NC_DEDICATED;
+            break;
+#else
+            DOCA_LOG(LOG_ERR, "DOCA_VERBS_UAR_ALLOCATION_TYPE_NONCACHE_DEDICATED is not supported");
+            return DOCA_ERROR_NOT_SUPPORTED;
+#endif
+        default:
+            /* Log the rejected input; the output parameter has not been assigned on this path */
+            DOCA_LOG(LOG_ERR, "Can't convert invalid UAR type=%d", static_cast<int>(uar_type));
+            return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    return DOCA_SUCCESS;
+}
+
+} /* namespace */
+
+/**********************************************************************************************************************
+ * doca_verbs_uar Member Functions
+ *********************************************************************************************************************/
+
+/*
+ * Construct a UAR of the requested allocation type on the given device context.
+ * If creation throws, whatever was already acquired is released and the same exception
+ * is rethrown to the caller.
+ */
+doca_verbs_uar::doca_verbs_uar(struct ibv_context *context,
+                               enum doca_verbs_uar_allocation_type allocation_type)
+    : m_ibv_ctx(context), m_allocation_type(allocation_type) {
+    try {
+        this->create();
+    } catch (...) {
+        /* Best-effort cleanup: the status of destroy() is deliberately ignored */
+        static_cast<void>(this->destroy());
+        DOCA_LOG(LOG_ERR, "Failed to create UAR");
+        throw;
+    }
+}
+
+/* Destructor: frees the UAR; a failure status from destroy() is ignored */
+doca_verbs_uar::~doca_verbs_uar() {
+    (void)destroy();
+}
+
+/*
+ * Allocate the DevX UAR for the stored allocation type and cache its page id and
+ * register address.
+ *
+ * Throws the doca_error_t reported by the failing step: the allocation-type conversion
+ * (so an unsupported or invalid type can be told apart from a driver failure) or the
+ * UAR allocation wrapper.
+ */
+void doca_verbs_uar::create() {
+    uint32_t mlx5_uar_type{};
+    auto status = convert_doca_verbs_uar_type_to_mlx5_uar_type(m_allocation_type, mlx5_uar_type);
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to convert UAR");
+        /* Keep the precise reason instead of reporting every failure as a driver error */
+        throw status;
+    }
+
+    auto uar_status =
+        doca_verbs_wrapper_mlx5dv_devx_alloc_uar(m_ibv_ctx, mlx5_uar_type, &m_uar_obj);
+    if (uar_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to alloc UAR");
+        throw uar_status;
+    }
+
+    m_uar_id = m_uar_obj->page_id;
+    m_reg_addr = m_uar_obj->reg_addr;
+}
+
+/*
+ * Free the DevX UAR if one is held.
+ *
+ * Returns DOCA_SUCCESS when there is nothing to free or the free succeeded, otherwise the
+ * status of the free wrapper (the UAR then stays recorded in the object).
+ */
+doca_error_t doca_verbs_uar::destroy() noexcept {
+    if (m_uar_obj == nullptr) {
+        return DOCA_SUCCESS;
+    }
+
+    const auto free_status = doca_verbs_wrapper_mlx5dv_devx_free_uar(m_uar_obj);
+    if (free_status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to free UAR");
+        return free_status;
+    }
+
+    m_uar_obj = nullptr;
+    return DOCA_SUCCESS;
+}
+
+/**********************************************************************************************************************
+ * Public API functions
+ *********************************************************************************************************************/
+
+/*
+ * Create a UAR of the requested allocation type on the given device context.
+ *
+ * On success the new object is stored in *uar_obj; on failure *uar_obj is left untouched.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when context or uar_obj is NULL, the doca_error_t
+ * thrown by the UAR constructor when creation fails, DOCA_ERROR_NO_MEMORY when any other
+ * exception is raised (the object allocation is the only other visible source of one),
+ * and DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_uar_create(struct ibv_context *context,
+                                   enum doca_verbs_uar_allocation_type allocation_type,
+                                   struct doca_verbs_uar **uar_obj) {
+    if (context == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create uar: parameter context=NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (uar_obj == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to create uar: parameter uar_obj=NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    try {
+        *uar_obj = new doca_verbs_uar(context, allocation_type);
+    } catch (doca_error_t err) {
+        return err;
+    } catch (...) {
+        /* No exception may leave this API function; map anything else to an allocation error */
+        DOCA_LOG(LOG_ERR, "Failed to create uar: failed to allocate memory");
+        return DOCA_ERROR_NO_MEMORY;
+    }
+
+    DOCA_LOG(LOG_INFO, "doca_verbs_uar=%p was created", static_cast<void *>(*uar_obj));
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Free a UAR and delete the object.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when uar_obj is NULL. When freeing the UAR fails, that
+ * status is returned and the object is not deleted.
+ */
+doca_error_t doca_verbs_uar_destroy(struct doca_verbs_uar *uar_obj) {
+    if (!uar_obj) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy uar: parameter uar_obj is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    const auto destroy_status = uar_obj->destroy();
+    if (destroy_status == DOCA_SUCCESS) {
+        delete uar_obj;
+        return DOCA_SUCCESS;
+    }
+
+    DOCA_LOG(LOG_ERR, "Failed to destroy uar.");
+    return destroy_status;
+}
+
+/*
+ * Write the UAR id of uar_obj to *uar_id.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when uar_obj or uar_id is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_uar_id_get(const struct doca_verbs_uar *uar_obj, uint32_t *uar_id) {
+    if (!uar_obj) {
+        DOCA_LOG(LOG_ERR, "Failed to get uar id: parameter uar_obj is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!uar_id) {
+        DOCA_LOG(LOG_ERR, "Failed to get uar id: parameter uar_id is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    *uar_id = uar_obj->get_uar_id();
+    return DOCA_SUCCESS;
+}
+
+/*
+ * Write the register address of uar_obj to *reg_addr.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when uar_obj or reg_addr is NULL, DOCA_SUCCESS otherwise.
+ */
+doca_error_t doca_verbs_uar_reg_addr_get(const struct doca_verbs_uar *uar_obj, void **reg_addr) {
+    if (!uar_obj) {
+        DOCA_LOG(LOG_ERR, "Failed to get uar reg_addr: parameter uar_obj is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!reg_addr) {
+        DOCA_LOG(LOG_ERR, "Failed to get uar reg_addr: parameter reg_addr is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    *reg_addr = uar_obj->get_reg_addr();
+    return DOCA_SUCCESS;
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_uar.hpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_uar.hpp
new file mode 100644
index 00000000000..5aa168b9941
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_uar.hpp
@@ -0,0 +1,109 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "host/doca_verbs.h"
+
+/**
+ *  @brief This struct implements the doca verbs uar
+ *
+ *  State kept per object: a mlx5dv_devx_uar handle, the ibv_context, the
+ *  allocation type, the UAR id and the UAR register address. The id, register
+ *  address and allocation type are exposed through the getters below.
+ *  The type is non-copyable (copy constructor and copy assignment are deleted).
+ */
+struct doca_verbs_uar {
+   public:
+    /**
+     * @brief constructor
+     *
+     * @param [in] context
+     * ibv_context
+     * @param [in] allocation_type
+     * The uar allocation type.
+     *
+     * NOTE(review): the definition is not visible from this header;
+     * doca_verbs_uar_create() wraps construction in a catch (doca_error_t)
+     * handler, so failures are presumably reported by throwing a doca_error_t
+     * - confirm against the .cpp.
+     */
+    doca_verbs_uar(struct ibv_context *context,
+                   enum doca_verbs_uar_allocation_type allocation_type);
+
+    /**
+     * @brief destructor
+     */
+    ~doca_verbs_uar();
+
+    /**
+     * @brief destroy the uar
+     *
+     * Declared noexcept: errors are reported through the return value only.
+     *
+     * @return
+     * DOCA_SUCCESS on successful destroy.
+     * DOCA_ERROR_DRIVER on failure to destroy the uar.
+     *
+     */
+    doca_error_t destroy() noexcept;
+
+    /**
+     * @brief create the uar
+     *
+     * Returns nothing; not declared noexcept.
+     * NOTE(review): presumably signals failure by throwing - confirm.
+     *
+     */
+    void create();
+
+    /**
+     * @brief Get uar ID
+     *
+     * @return uar ID
+     */
+    uint32_t get_uar_id() const noexcept { return m_uar_id; }
+
+    /**
+     * @brief Get UAR reg address
+     *
+     * @return UAR reg address
+     */
+    void *get_reg_addr() const noexcept { return m_reg_addr; }
+
+    /**
+     * @brief Get UAR memory allocation type
+     *
+     * @return UAR memory allocation type
+     */
+    enum doca_verbs_uar_allocation_type get_uar_mtype() const noexcept { return m_allocation_type; }
+
+   private:
+    /* All members carry default initializers, so a partially constructed
+     * object starts with null handles and a zero id. */
+    struct mlx5dv_devx_uar *m_uar_obj{};
+    struct ibv_context *m_ibv_ctx{};
+    /* Defaults to the BLUEFLAME allocation type. */
+    enum doca_verbs_uar_allocation_type m_allocation_type {
+        DOCA_VERBS_UAR_ALLOCATION_TYPE_BLUEFLAME
+    };
+    uint32_t m_uar_id{};
+    void *m_reg_addr{};
+
+    /* Non-copyable. */
+    doca_verbs_uar(doca_verbs_uar const &) = delete;
+    doca_verbs_uar &operator=(doca_verbs_uar const &) = delete;
+};
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_umem.cpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_umem.cpp
new file mode 100644
index 00000000000..a8a444db29e
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_umem.cpp
@@ -0,0 +1,212 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <mutex>
+#include <time.h>
+
+#include "host/mlx5_prm.h"
+#include "host/mlx5_ifc.h"
+
+#include "doca_gpunetio_config.h"
+#include "doca_verbs_net_wrapper.h"
+#include "doca_internal.hpp"
+#include "doca_verbs_device_attr.hpp"
+#include "doca_verbs_umem.hpp"
+
+/*********************************************************************************************************************
+ * Helper functions
+ *********************************************************************************************************************/
+
+namespace {} /* namespace */
+
+/**********************************************************************************************************************
+ * doca_verbs_umem Member Functions
+ *********************************************************************************************************************/
+
+/*
+ * Store the registration parameters and register the UMEM immediately.
+ * If create() throws, any partially created state is released through
+ * destroy(), the failure is logged, and the same exception is rethrown.
+ */
+doca_verbs_umem::doca_verbs_umem(struct ibv_context *ibv_ctx, void *address, size_t size,
+                                 uint32_t access_flags, int dmabuf_fd, size_t dmabuf_offset)
+    : m_ibv_ctx{ibv_ctx},
+      m_address{address},
+      m_size{size},
+      m_access_flags{access_flags},
+      m_dmabuf_fd{dmabuf_fd},
+      m_dmabuf_offset{dmabuf_offset} {
+    try {
+        create();
+    } catch (...) {
+        /* Best-effort cleanup: the destroy status is intentionally ignored. */
+        static_cast<void>(destroy());
+        DOCA_LOG(LOG_ERR, "Failed to create UMEM");
+        throw;
+    }
+}
+
+/* Deregister the UMEM if it is still registered; the status is discarded. */
+doca_verbs_umem::~doca_verbs_umem() {
+    (void)destroy();
+}
+
+/*
+ * Register the memory described by the members as a DEVX UMEM.
+ *
+ * Fills a mlx5dv_devx_umem_in from the stored address, size and access flags,
+ * using the system page size as the page-size bitmap. When dmabuf support is
+ * compiled in and a valid dmabuf fd was supplied, the registration is switched
+ * to dmabuf mode and the address field carries the offset into the dmabuf.
+ *
+ * On success m_umem_obj and m_umem_id are set. On failure the wrapper status
+ * is thrown as a doca_error_t.
+ */
+void doca_verbs_umem::create() {
+    struct mlx5dv_devx_umem_in umem_in {};
+
+    umem_in.addr = m_address;
+    umem_in.size = m_size;
+    umem_in.access = m_access_flags;
+    umem_in.pgsz_bitmap = sysconf(_SC_PAGESIZE);
+    umem_in.comp_mask = 0;
+
+#if DOCA_GPUNETIO_HAVE_MLX5DV_UMEM_DMABUF == 1
+    /* check if dmabuf file descriptor was set to determine mask */
+    if (m_dmabuf_fd != (int)DOCA_VERBS_DMABUF_INVALID_FD) {
+        umem_in.comp_mask = MLX5DV_UMEM_MASK_DMABUF;
+        umem_in.dmabuf_fd = m_dmabuf_fd;
+        /* umem_in.addr is interpreted as the starting offset of the dmabuf */
+        umem_in.addr = reinterpret_cast<void *>(m_dmabuf_offset);
+    }
+#endif
+
+    auto umem_status = doca_verbs_wrapper_mlx5dv_devx_umem_reg_ex(m_ibv_ctx, &umem_in, &m_umem_obj);
+    if (umem_status != DOCA_SUCCESS) {
+        /* m_size and m_dmabuf_offset are size_t, so they are printed with %zu
+         * (%zd is the conversion for the signed counterpart). */
+        DOCA_LOG(LOG_ERR,
+                 "Failed to create UMEM, m_address %p m_size %zu m_access_flags %x m_dmabuf_fd %d "
+                 "m_dmabuf_offset %zu err %d",
+                 m_address, m_size, m_access_flags, m_dmabuf_fd, m_dmabuf_offset, errno);
+        throw umem_status;
+    }
+
+    m_umem_id = m_umem_obj->umem_id;
+}
+
+/*
+ * Deregister the UMEM if one is currently registered.
+ *
+ * Returns DOCA_SUCCESS when there is nothing to do or deregistration
+ * succeeded; otherwise returns the wrapper's status and keeps the handle so
+ * the call can be repeated.
+ */
+doca_error_t doca_verbs_umem::destroy() noexcept {
+    if (m_umem_obj == nullptr) {
+        return DOCA_SUCCESS;
+    }
+
+    const auto dereg_result = doca_verbs_wrapper_mlx5dv_devx_umem_dereg(m_umem_obj);
+    if (dereg_result != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy UMEM object");
+        return dereg_result;
+    }
+
+    m_umem_obj = nullptr;
+    return DOCA_SUCCESS;
+}
+
+/**********************************************************************************************************************
+ * Public API functions
+ *********************************************************************************************************************/
+
+/**
+ * Validate the arguments and construct a doca_verbs_umem.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE when context, address or umem_obj is NULL
+ * or size is zero. A doca_error_t thrown during construction is returned to
+ * the caller; in that case *umem_obj is not written.
+ */
+doca_error_t doca_verbs_umem_create(struct ibv_context *context, void *address, size_t size,
+                                    uint32_t access_flags, int dmabuf_id, size_t dmabuf_offset,
+                                    struct doca_verbs_umem **umem_obj) {
+    if (!context) {
+        DOCA_LOG(LOG_ERR, "Failed to create umem: parameter context=NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!address) {
+        DOCA_LOG(LOG_ERR, "Failed to create umem: parameter address=NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (size == 0) {
+        DOCA_LOG(LOG_ERR, "Failed to create umem: parameter size=0");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (!umem_obj) {
+        DOCA_LOG(LOG_ERR, "Failed to create umem: parameter umem_obj=NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    try {
+        doca_verbs_umem *const created =
+            new doca_verbs_umem(context, address, size, access_flags, dmabuf_id, dmabuf_offset);
+        *umem_obj = created;
+        DOCA_LOG(LOG_INFO, "doca_verbs_umem=%p was created", *umem_obj);
+    } catch (doca_error_t err) {
+        return err;
+    }
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Deregister the UMEM held by @p umem_obj and delete the object.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE for a NULL object, the status of
+ * doca_verbs_umem::destroy() when deregistration fails (the object is then
+ * left allocated), and DOCA_SUCCESS otherwise.
+ *
+ * Failures are logged at LOG_ERR, matching doca_verbs_umem_create() and the
+ * UAR API, so they are not hidden when only errors are being logged.
+ */
+doca_error_t doca_verbs_umem_destroy(struct doca_verbs_umem *umem_obj) {
+    if (umem_obj == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy umem: parameter umem_obj is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    auto status = umem_obj->destroy();
+    if (status != DOCA_SUCCESS) {
+        DOCA_LOG(LOG_ERR, "Failed to destroy umem.");
+        return status;
+    }
+
+    /* The destructor calls destroy() again; that is a no-op because the
+     * handle was cleared by the successful call above. */
+    delete (umem_obj);
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Copy the UMEM id of @p umem_obj into @p umem_id.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE (logged at LOG_ERR, consistent with the
+ * other argument checks in this API) when either pointer is NULL.
+ */
+doca_error_t doca_verbs_umem_get_id(const struct doca_verbs_umem *umem_obj, uint32_t *umem_id) {
+    if (umem_obj == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get umem id: parameter umem_obj is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (umem_id == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get umem id: parameter umem_id is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    *umem_id = umem_obj->get_umem_id();
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Copy the UMEM size of @p umem_obj into @p umem_size.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE (logged at LOG_ERR, consistent with the
+ * other argument checks in this API) when either pointer is NULL.
+ */
+doca_error_t doca_verbs_umem_get_size(const struct doca_verbs_umem *umem_obj, size_t *umem_size) {
+    if (umem_obj == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get umem size: parameter umem_obj is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (umem_size == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get umem size: parameter umem_size is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    *umem_size = umem_obj->get_umem_size();
+
+    return DOCA_SUCCESS;
+}
+
+/**
+ * Copy the UMEM address of @p umem_obj into @p umem_address.
+ *
+ * Returns DOCA_ERROR_INVALID_VALUE (logged at LOG_ERR, consistent with the
+ * other argument checks in this API) when either pointer is NULL.
+ */
+doca_error_t doca_verbs_umem_get_address(const struct doca_verbs_umem *umem_obj,
+                                         void **umem_address) {
+    if (umem_obj == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get umem address: parameter umem_obj is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+    if (umem_address == nullptr) {
+        DOCA_LOG(LOG_ERR, "Failed to get umem address: parameter umem_address is NULL");
+        return DOCA_ERROR_INVALID_VALUE;
+    }
+
+    *umem_address = umem_obj->get_umem_address();
+
+    return DOCA_SUCCESS;
+}
diff --git a/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_umem.hpp b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_umem.hpp
new file mode 100644
index 00000000000..8f4e09cb1cd
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/doca-gpunetio/src/doca_verbs_umem.hpp
@@ -0,0 +1,118 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "host/doca_verbs.h"
+
+/**
+ *  @brief This struct implements the doca verbs umem
+ *
+ *  State kept per object: a mlx5dv_devx_umem handle, the ibv_context, the
+ *  address/size/access flags of the region, the UMEM id, and the dmabuf file
+ *  descriptor and offset. The type is non-copyable (copy constructor and copy
+ *  assignment are deleted).
+ */
+struct doca_verbs_umem {
+   public:
+    /**
+     * @brief constructor
+     *
+     * @param [in] ibv_ctx
+     * ibv_context
+     * @param [in] address
+     * The umem address.
+     * @param [in] size
+     * The umem size.
+     * @param [in] access_flags
+     * The umem access flags.
+     * @param [in] dmabuf_fd
+     * The umem dmabuf file descriptor id.
+     * @param [in] dmabuf_offset
+     * The umem dmabuf offset.
+     */
+    doca_verbs_umem(struct ibv_context *ibv_ctx, void *address, size_t size, uint32_t access_flags,
+                    int dmabuf_fd, size_t dmabuf_offset);
+
+    /**
+     * @brief destructor
+     */
+    ~doca_verbs_umem();
+
+    /**
+     * @brief destroy the umem
+     *
+     * Declared noexcept: errors are reported through the return value only.
+     *
+     * @return
+     * DOCA_SUCCESS on successful destroy.
+     * DOCA_ERROR_DRIVER on failure to destroy the umem.
+     *
+     */
+    doca_error_t destroy() noexcept;
+
+    /**
+     * @brief create the umem
+     *
+     * Returns nothing; the definition in doca_verbs_umem.cpp throws a
+     * doca_error_t when registration fails.
+     *
+     */
+    void create();
+
+    /**
+     * @brief Get umem ID
+     *
+     * @return umem ID
+     */
+    uint32_t get_umem_id() const noexcept { return m_umem_id; }
+
+    /**
+     * @brief Get umem size
+     *
+     * @return umem size
+     */
+    size_t get_umem_size() const noexcept { return m_size; }
+
+    /**
+     * @brief Get umem address
+     *
+     * @return umem address
+     */
+    void *get_umem_address() const noexcept { return m_address; }
+
+   private:
+    struct mlx5dv_devx_umem *m_umem_obj{};
+    struct ibv_context *m_ibv_ctx{};
+    void *m_address{};
+    size_t m_size{};
+    uint32_t m_access_flags{};
+    uint32_t m_umem_id{};
+    /* The two dmabuf members have no default initializer; the only declared
+     * constructor sets both in its initializer list. */
+    int m_dmabuf_fd;
+    size_t m_dmabuf_offset;
+
+    /* Non-copyable. */
+    doca_verbs_umem(doca_verbs_umem const &) = delete;
+    doca_verbs_umem &operator=(doca_verbs_umem const &) = delete;
+};
diff --git a/projects/rccl/src/transport/gdaki/gin_host_gdaki.cc b/projects/rccl/src/transport/gdaki/gin_host_gdaki.cc
new file mode 100644
index 00000000000..a7dd677966f
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/gin_host_gdaki.cc
@@ -0,0 +1,1065 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <alloc.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+
+#include <mutex>
+
+#include "ibvwrap.h"
+#include "mlx5/mlx5dvwrap.h"
+#include "gin/gin_host.h"
+#include "gin_host_gdaki.h"
+#include "plugin/nccl_net.h"
+#include "param.h"
+
+#include "doca_gpunetio_host.h"
+#include "nccl_device/gin/gdaki/gin_gdaki_device_host_common.h"
+#include "../net_ib_gin.h"
+
+// Evaluate a DOCA call once; if it does not return DOCA_SUCCESS, log the
+// file/line and the DOCA status via INFO(NCCL_NET, ...) and return
+// ncclSystemError from the enclosing function. The enclosing function must
+// therefore return ncclResult_t. Declares a local named RES inside its own
+// do/while scope.
+#define DOCACHECK(call)                                       \
+  do {                                                        \
+    doca_error_t RES = call;                                  \
+    if (RES != DOCA_SUCCESS) {                           \
+      /* Print the back trace*/                               \
+      INFO(NCCL_NET, "%s:%d -> %d", __FILE__, __LINE__, RES); \
+      return ncclSystemError;                                 \
+    }                                                         \
+  } while (0)
+
+// Same check as DOCACHECK, but instead of returning it stores the DOCA status
+// in DOCA_RES (a caller-declared variable), sets NCCL_RES to ncclSystemError
+// and jumps to `label` so the caller can run its cleanup code.
+#define DOCACHECKGOTO(call, DOCA_RES, NCCL_RES, label)             \
+  do {                                                             \
+    DOCA_RES = call;                                               \
+    if (DOCA_RES != DOCA_SUCCESS) {                           \
+      /* Print the back trace*/                                    \
+      INFO(NCCL_NET, "%s:%d -> %d", __FILE__, __LINE__, DOCA_RES); \
+      NCCL_RES = ncclSystemError;                                  \
+      goto label;                                                  \
+    }                                                              \
+  } while (0)
+
+// NOTE(review): the meaning and users of these two constants are not visible
+// in this part of the file - confirm before changing them.
+#define VERBS_TEST_DBR_SIZE (8)
+#define MAX_PCI_ADDRESS_LEN 32U
+
+// Parameters declared by this file through NCCL_PARAM.
+// NOTE(review): presumably the arguments are (accessor suffix, environment
+// name suffix, default value) - confirm against param.h.
+NCCL_PARAM(GinGdakiNicHandler, "GIN_GDAKI_NIC_HANDLER", 0);
+NCCL_PARAM(GinGdakiQpDepth, "GIN_GDAKI_QP_DEPTH", 128);
+NCCL_PARAM(GinErrorQuerySec, "GIN_ERROR_QUERY_SEC", 10);
+// Parameter accessors defined in another translation unit.
+extern int64_t ncclParamIbTimeout();
+extern int64_t ncclParamIbRetryCnt();
+extern int64_t ncclParamIbPkey();
+extern int64_t ncclParamIbSl();
+extern int64_t ncclParamIbTc();
+extern int64_t ncclParamIbPciRelaxedOrdering();
+extern int64_t ncclParamIbDataDirect();
+extern int64_t ncclParamDmaBufEnable();
+
+// Default service level and traffic class values (both zero).
+static const int NCCL_IB_SL_DEFAULT = 0;
+static const int NCCL_IB_TC_DEFAULT = 0;
+
+// Report whether relaxed-ordering memory registration should be used.
+// The answer is computed on the first call and cached for the process:
+// it is true only when the relaxed-ordering parameter is 1 or 2 and the
+// probing call to wrap_ibv_reg_mr_iova2 does not return ncclInternalError.
+static inline bool gdakiRelaxedOrderingEnabled() {
+  // Function-local static: initialised exactly once, even with concurrent callers.
+  static const bool cachedAnswer = []() {
+    const int roMode = ncclParamIbPciRelaxedOrdering();
+    if (roMode != 1 && roMode != 2) return false;
+    // Query IBVERBS_1.8 API - needed for IBV_ACCESS_RELAXED_ORDERING support
+    const ncclResult_t probe = wrap_ibv_reg_mr_iova2(NULL, NULL, NULL, 0, 0, 0);
+    return probe != ncclInternalError;
+  }();
+  return cachedAnswer;
+}
+
+// Try to register a memory region for `addr`/`length` through a DMA-BUF file
+// descriptor obtained from the CUDA driver.
+//
+// Returns ncclInvalidUsage when DMA-BUF is disabled by parameter or when
+// CUDA_VERSION < 11070; otherwise returns the status of the last registration
+// attempt, cast to ncclResult_t. Any DMA-BUF fd still open at the end is
+// closed before returning.
+// NOTE(review): CUCHECK presumably returns early from this function on a
+// driver error - confirm against its definition.
+static ncclResult_t gdakiRegMrDmaBuf(struct ibv_mr **mr, struct ibv_pd *pd, void *addr,
+                                     size_t length, int access) {
+  // `status` carries values from several APIs and is cast to ncclResult_t on return.
+  int status = 0;
+  int dmabuf_fd = -1;
+
+  if (ncclParamDmaBufEnable() == 0) return ncclInvalidUsage;
+
+#if CUDA_VERSION >= 11070
+  static size_t host_page_size = sysconf(_SC_PAGESIZE);
+  size_t aligned_size = length;
+  // NOTE(review): presumably rounds aligned_size up to a multiple of the host page size - confirm.
+  ALIGN_SIZE(aligned_size, host_page_size);
+
+#if CUDA_VERSION >= 12080
+  // First attempt (only when the data-direct parameter is set): PCIe-mapped
+  // DMA-BUF handle registered through the mlx5dv data-direct path.
+  if (ncclParamIbDataDirect()) {
+    status = pfn_cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)addr, aligned_size,
+                                               CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+                                               CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE);
+    if (status) {
+      INFO(NCCL_NET,
+           "Failed to get DMA-BUF handle for address range with type PCIE, error=%d. Trying a "
+           "different method.",
+           status);
+      goto try_legacy;
+    }
+    status = wrap_mlx5dv_reg_dmabuf_mr(mr, pd, 0, aligned_size, 0, dmabuf_fd, access,
+                                       MLX5DV_REG_DMABUF_ACCESS_DATA_DIRECT);
+    if (status) {
+      INFO(NCCL_NET,
+           "Failed to register memory with DMA-BUF and data direct, error=%d. Trying a different "
+           "method.",
+           status);
+      // Drop the PCIe-mapped fd before falling through to the legacy attempt below.
+      close(dmabuf_fd);
+      dmabuf_fd = -1;
+    } else
+      goto out;
+  }
+try_legacy:
+
+#endif
+
+  // Legacy attempt: DMA-BUF handle requested with flags 0, registered through ibv_reg_dmabuf_mr.
+  CUCHECK(cuMemGetHandleForAddressRange((void *)&dmabuf_fd, (CUdeviceptr)addr, aligned_size,
+                                        CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0));
+  status = wrap_ibv_reg_dmabuf_mr(mr, pd, 0, aligned_size, 0, dmabuf_fd, access);
+  if (status)
+    INFO(NCCL_NET, "Failed to register memory with DMA-BUF, error=%d. Trying a different method.",
+         status);
+#else
+  status = ncclInvalidUsage;
+#endif
+
+#if CUDA_VERSION >= 12080
+out:
+#endif
+  // The fd is closed whether or not registration succeeded.
+  // NOTE(review): presumably the registered MR keeps its own reference to the dma-buf - confirm.
+  if (dmabuf_fd >= 0) {
+    close(dmabuf_fd);
+  }
+  return (ncclResult_t)status;
+}
+
+// Register a memory region, preferring the DMA-BUF path and falling back to
+// wrap_ibv_reg_mr_iova2 when that path does not succeed. Relaxed ordering is
+// added to `access` unless the caller forces strict ordering or relaxed
+// ordering is unavailable.
+static ncclResult_t gdakiRegMr(struct ibv_mr **mr, struct ibv_pd *pd, void *addr, size_t length,
+                               int access, bool force_strict_ordering = false) {
+  const bool useRelaxedOrdering = !force_strict_ordering && gdakiRelaxedOrderingEnabled();
+  if (useRelaxedOrdering) {
+    access |= IBV_ACCESS_RELAXED_ORDERING;
+  }
+
+  // A DMA-BUF failure is expected on some systems, so warnings are suppressed for this attempt.
+  int dmabufResult = 0;
+  NOWARN(dmabufResult = gdakiRegMrDmaBuf(mr, pd, addr, length, access), NCCL_NET);
+
+  if (dmabufResult != ncclSuccess) {
+    NCCLCHECK(wrap_ibv_reg_mr_iova2(mr, pd, addr, length, 0, access));
+  }
+  return ncclSuccess;
+}
+
+// Pair of equally sized buffers of T: one zero-initialised host buffer
+// (calloc) and one GPU buffer (ncclCuMemAlloc), with helpers to copy the whole
+// array in either direction.
+//
+// The destructor releases both buffers. deallocate() resets the pointers it
+// frees, so calling it explicitly and then destroying the object is safe.
+template <typename T>
+class GdakiHostGPUMemHandle {
+ private:
+  CUmemGenericAllocationHandle cumemhandle;
+  unsigned int num_elements;
+
+ public:
+  T *host_buf;
+  T *gpu_buf;
+
+  // Allocate `num_elements` elements on the host and on the GPU.
+  // Returns an error (via EQCHECK / NCCLCHECK) if either allocation fails; a
+  // host buffer that was already allocated is then released by deallocate()
+  // or by the destructor.
+  ncclResult_t allocate(unsigned int num_elements) {
+    this->host_buf = (T *)calloc(num_elements, sizeof(T));
+    EQCHECK(this->host_buf, nullptr);
+
+    NCCLCHECK(ncclCuMemAlloc((void **)&this->gpu_buf, &this->cumemhandle, CU_MEM_HANDLE_TYPE_NONE,
+                             num_elements * sizeof(T)));
+
+    this->num_elements = num_elements;
+
+    return ncclSuccess;
+  }
+
+  // Release whichever buffers are currently held. Idempotent: the pointers
+  // and the element count are reset so a second call is a no-op.
+  void deallocate() {
+    if (this->host_buf != nullptr) {
+      free(this->host_buf);
+      this->host_buf = nullptr;
+    }
+    if (this->gpu_buf != nullptr) {
+      ncclCuMemFree(this->gpu_buf);
+      this->gpu_buf = nullptr;
+    }
+    this->num_elements = 0;
+  }
+
+  // Copy all elements from the host buffer to the GPU buffer.
+  ncclResult_t copy_h_to_d() {
+    NCCLCHECK(ncclCudaMemcpy<T>(this->gpu_buf, this->host_buf, this->num_elements));
+    return ncclSuccess;
+  }
+
+  // Copy all elements from the GPU buffer to the host buffer.
+  ncclResult_t copy_d_to_h() {
+    NCCLCHECK(ncclCudaMemcpy<T>(this->host_buf, this->gpu_buf, this->num_elements));
+    return ncclSuccess;
+  }
+
+  // Empty handle: no buffers, zero elements.
+  GdakiHostGPUMemHandle() : cumemhandle(0), num_elements(0), host_buf(nullptr), gpu_buf(nullptr) {}
+
+  // Allocating constructor. Delegates to the default constructor first so
+  // every member has a defined value before allocate() runs, then throws the
+  // ncclResult_t status on failure after releasing any partial allocation.
+  GdakiHostGPUMemHandle(unsigned int num_elements) : GdakiHostGPUMemHandle() {
+    ncclResult_t status = this->allocate(num_elements);
+    if (status != ncclSuccess) {
+      this->deallocate();
+      throw status;
+    }
+  }
+
+  ~GdakiHostGPUMemHandle() { this->deallocate(); }
+};
+
+// GPU-resident table of T elements handed out in contiguous ranges, plus a
+// host/GPU array holding one big-endian rkey per rank.
+//
+// Elements are assigned by a bump index and are never reused. The destructor
+// frees the GPU table; memory-region deregistration is a separate explicit
+// step (deregister_mr).
+template <typename T>
+class GdakiGlobalGPUBufferTable {
+ private:
+  CUmemGenericAllocationHandle cumemhandle;
+  unsigned int num_elements;
+  unsigned int next_unused_idx;
+  unsigned int num_ranks;
+  GdakiHostGPUMemHandle<__be32> rkeys_hd_mhandle;
+
+ public:
+  T *gpu_ptr;
+  struct ibv_mr *mr;
+
+  // Allocate and zero the GPU table and allocate the per-rank rkey array.
+  ncclResult_t allocate(unsigned int num_elements, unsigned int num_ranks) {
+    NCCLCHECK(ncclCuMemAlloc((void **)&this->gpu_ptr, &this->cumemhandle, CU_MEM_HANDLE_TYPE_NONE,
+                             num_elements * sizeof(T)));
+    CUDACHECK(cudaMemset(this->gpu_ptr, 0, num_elements * sizeof(T)));
+    NCCLCHECK(this->rkeys_hd_mhandle.allocate(num_ranks));
+
+    this->num_elements = num_elements;
+    this->num_ranks = num_ranks;
+    this->next_unused_idx = 0;
+
+    return ncclSuccess;
+  }
+
+  // Free the GPU table. Idempotent: the pointer is reset after freeing.
+  void deallocate() {
+    if (this->gpu_ptr != nullptr) {
+      ncclCuMemFree(this->gpu_ptr);
+      this->gpu_ptr = nullptr;
+    }
+  }
+
+  // Register the whole table as a memory region with local-write and
+  // remote write/read/atomic access.
+  ncclResult_t register_mr(struct ibv_pd *ib_pd, bool force_strict_ordering = false) {
+    NCCLCHECK(gdakiRegMr(&this->mr, ib_pd, this->gpu_ptr, this->num_elements * sizeof(T),
+                         IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ |
+                           IBV_ACCESS_REMOTE_ATOMIC,
+                         force_strict_ordering));
+    return ncclSuccess;
+  }
+
+  // Deregister the memory region if one is registered.
+  void deregister_mr() {
+    if (this->mr != nullptr) {
+      wrap_ibv_dereg_mr(this->mr);
+      this->mr = nullptr;
+    }
+  }
+
+  // All-gather this table's rkey (big-endian) from every rank into the host
+  // rkey array and copy the result to the GPU. Requires a prior register_mr().
+  ncclResult_t exchange_info(struct ncclGinIbCollComm *cComm) {
+    if (this->mr == nullptr) {
+      WARN("Cannot exchange rkeys: memory region is not registered");
+      return ncclInvalidUsage;
+    }
+    __be32 rkey = htobe32(this->mr->rkey);
+    NCCLCHECK(cComm->allGather(cComm, &rkey, this->rkeys_hd_mhandle.host_buf, sizeof(__be32)));
+    NCCLCHECK(this->rkeys_hd_mhandle.copy_h_to_d());
+    return ncclSuccess;
+  }
+
+  // Reserve `num_elements` consecutive elements and return the index of the
+  // first one in *out_start_idx. Fails with ncclInvalidUsage when fewer than
+  // `num_elements` elements remain.
+  ncclResult_t allocate_elements(unsigned int num_elements, unsigned int *out_start_idx) {
+    // next_unused_idx never exceeds this->num_elements, so the subtraction
+    // cannot wrap; comparing this way also avoids overflow in the addition.
+    if (num_elements > this->num_elements - this->next_unused_idx) {
+      WARN("Not enough space to get elements");
+      return ncclInvalidUsage;
+    }
+
+    *out_start_idx = this->next_unused_idx;
+    this->next_unused_idx += num_elements;
+
+    return ncclSuccess;
+  }
+
+  void free_elements(unsigned int start_idx, unsigned int num_elements) {
+    // No op for now as we don't allow reusing elements.
+  }
+
+  // GPU pointer to the per-rank rkey array.
+  uint32_t *get_rkeys_d() { return this->rkeys_hd_mhandle.gpu_buf; }
+
+  // Empty table. Members are initialised in declaration order; the handle is
+  // value-initialised so this works whether it is an integer or pointer type.
+  GdakiGlobalGPUBufferTable()
+    : cumemhandle(), num_elements(0), next_unused_idx(0), num_ranks(0), gpu_ptr(nullptr),
+      mr(nullptr) {}
+
+  // Allocating constructor. Delegates to the default constructor so gpu_ptr
+  // and mr are never left indeterminate, then allocates. As before, no
+  // exception is raised on failure; the failure is now logged and the object
+  // stays safely destructible.
+  GdakiGlobalGPUBufferTable(unsigned int num_elements, unsigned int num_ranks)
+    : GdakiGlobalGPUBufferTable() {
+    ncclResult_t status = this->allocate(num_elements, num_ranks);
+    if (status != ncclSuccess) {
+      WARN("Failed to allocate GPU buffer table, error=%d", (int)status);
+    }
+  }
+
+  ~GdakiGlobalGPUBufferTable() { this->deallocate(); }
+};
+
+// Bookkeeping for one registered memory region: the ibv_mr plus two host/GPU
+// mirrored buffers (a ncclGinGdakiMemHandle buffer and a uint32_t buffer of rkeys).
+// NOTE(review): the meaning of `type` is not visible in this part of the file - confirm.
+struct gdaki_mem_handle {
+  int type;
+  struct ibv_mr *mr;
+  GdakiHostGPUMemHandle<struct ncclGinGdakiMemHandle> *gdaki_mhandle_hd_mhandle;
+  GdakiHostGPUMemHandle<uint32_t> *rkeys_hd_mhandle;
+};
+
+// Per-QP connection data filled in by gdakiFillExchInfo: port LID, QP number,
+// the GID in both ibverbs (gid) and DOCA verbs (vgid) form, and the GID index.
+struct gdaki_exch_info {
+  int lid;
+  int qpn;
+  union ibv_gid gid;
+  struct doca_verbs_gid vgid;
+  int gid_index;
+};
+
+// Host-side state of one GDAKI context: device and verbs handles, port/GID
+// information, QP sizes and QP arrays, the counter and signal tables, the
+// GPU context mirror, a sink buffer and the collective communicator.
+// NOTE(review): ownership and lifetime of the pointed-to objects are not
+// visible in this part of the file - confirm before relying on them.
+struct gdaki_context {
+  int cuda_id;
+  struct doca_gpu *gdev;
+  struct ibv_device *ib_dev;
+  struct ibv_context *ib_ctx;    /* DOCA Verbs Context */
+  struct ibv_pd *ib_pd;          /* local protection domain */
+  struct doca_verbs_ah_attr *ah; /* DOCA Verbs address handle */
+  struct doca_verbs_gid gid;
+
+  // rgid, port_attr.lid and gid_index are what gdakiFillExchInfo publishes to peers.
+  union ibv_gid rgid;
+  struct ibv_port_attr port_attr;
+  uint8_t port_num;
+  int gid_index;
+
+  uint32_t qp_rq_size;
+  uint32_t qp_sq_size;
+  struct doca_gpu_verbs_qp_group_hl **gqp_groups;
+  struct doca_gpu_verbs_qp_hl **gqps;
+  struct doca_gpu_verbs_qp_hl **companion_gqps;
+
+  GdakiGlobalGPUBufferTable<uint64_t> *counters_table;
+  GdakiGlobalGPUBufferTable<uint64_t> *signals_table;
+  GdakiHostGPUMemHandle<struct ncclGinGdakiGPUContext> *gin_gdaki_gpu_ctx_hd_mhandle;
+  // Buffer with its own memory region and allocation handle.
+  struct {
+    void *addr;
+    struct ibv_mr *mr;
+    CUmemGenericAllocationHandle mhandle;
+  } sink_buffer;
+  struct timespec last_error_query_time;
+
+  struct ncclGinIbCollComm *collComm;
+  ncclNetDeviceHandle_v11_t *devHandle;
+};
+
+// Round x up to the nearest multiple of y (y must be non-zero).
+template <typename T>
+static inline T gdaki_round_up(T x, T y) {
+  const T multiples = (x + y - 1) / y;
+  return multiples * y;
+}
+
+// Look up the IB device whose name equals ibDevName and return it in *outIbDev.
+//
+// Returns ncclSystemError when the device list is empty and
+// ncclInvalidArgument when no device has that name; in both cases the device
+// list is freed. On success the list is intentionally not freed here.
+static ncclResult_t gdakiFindDevice(char *ibDevName, struct ibv_device **outIbDev) {
+  int deviceCount;
+  struct ibv_device **deviceList = nullptr;
+  struct ibv_device *match = nullptr;
+  ncclResult_t result = ncclSuccess;
+
+  assert(ibDevName != nullptr);
+
+  NCCLCHECK(wrap_ibv_get_device_list(&deviceList, &deviceCount));
+
+  if (deviceCount <= 0) {
+    WARN("No network devices that support GDAKI found");
+    result = ncclSystemError;
+  } else {
+    for (int idx = 0; idx < deviceCount; ++idx) {
+      struct ibv_device *candidate = deviceList[idx];
+      if (strcmp(wrap_ibv_get_device_name(candidate), ibDevName) == 0) {
+        match = candidate;
+        break;
+      }
+    }
+    if (match == nullptr) {
+      WARN("IB device %s not found", ibDevName);
+      result = ncclInvalidArgument;
+    }
+  }
+
+  if (result != ncclSuccess) {
+    // Failure: release the list before reporting the error.
+    NCCLCHECK(wrap_ibv_free_device_list(deviceList));
+    return result;
+  }
+
+  *outIbDev = match;
+  return result;
+}
+
+// Populate exch_info with the local port LID, the QP number of gqp, the
+// context's GID (copied into both the ibverbs and DOCA representations) and
+// the GID index.
+static void gdakiFillExchInfo(struct gdaki_exch_info *exch_info, struct gdaki_context *gdaki_ctx,
+                              struct doca_gpu_verbs_qp_hl *gqp) {
+  const void *localGid = gdaki_ctx->rgid.raw;
+  const size_t gidBytes = sizeof(union ibv_gid);
+
+  exch_info->lid = gdaki_ctx->port_attr.lid;
+  exch_info->qpn = doca_verbs_qp_get_qpn(gqp->qp);
+  memcpy(exch_info->gid.raw, localGid, gidBytes);
+  memcpy(exch_info->vgid.raw, localGid, gidBytes);
+  exch_info->gid_index = gdaki_ctx->gid_index;
+}
+
+// Create ctx->ah and configure it with the service level, traffic class,
+// address type, source GID index and a hop limit of 255.
+//
+// Returns ncclSuccess with ctx->ah set, or ncclSystemError. Once the handle
+// has been created, every later failure goes through the cleanup label so the
+// handle is destroyed and ctx->ah is reset to nullptr instead of being leaked
+// or left dangling.
+static ncclResult_t gdakiCreateVerbsAh(struct gdaki_context *ctx, int ib_sl, int ib_tc,
+                                       int ib_gid_index) {
+  ncclResult_t status = ncclSuccess;
+  doca_error_t docaStatus = DOCA_SUCCESS;
+
+  // Nothing to clean up yet if creation itself fails.
+  DOCACHECK(doca_verbs_ah_attr_create(ctx->ib_ctx, &ctx->ah));
+
+  DOCACHECKGOTO(doca_verbs_ah_attr_set_sl(ctx->ah, ib_sl), docaStatus, status, destroy_verbs_ah);
+  DOCACHECKGOTO(doca_verbs_ah_attr_set_traffic_class(ctx->ah, ib_tc), docaStatus, status,
+                destroy_verbs_ah);
+
+  // NOTE(review): link_layer == 1 presumably means InfiniBand - confirm against the ibverbs enum.
+  if (ctx->port_attr.link_layer == 1) {
+    DOCACHECKGOTO(doca_verbs_ah_attr_set_addr_type(ctx->ah, DOCA_VERBS_ADDR_TYPE_IB_NO_GRH),
+                  docaStatus, status, destroy_verbs_ah);
+  } else {
+    DOCACHECKGOTO(doca_verbs_ah_attr_set_addr_type(ctx->ah, DOCA_VERBS_ADDR_TYPE_IPv4),
+                  docaStatus, status, destroy_verbs_ah);
+  }
+
+  // set_port_num?
+  DOCACHECKGOTO(doca_verbs_ah_attr_set_sgid_index(ctx->ah, ib_gid_index), docaStatus, status,
+                destroy_verbs_ah);
+  DOCACHECKGOTO(doca_verbs_ah_attr_set_hop_limit(ctx->ah, 255), docaStatus, status, destroy_verbs_ah);
+
+  return ncclSuccess;
+
+destroy_verbs_ah:
+  // If destroy itself fails, DOCACHECK returns ncclSystemError and leaves ctx->ah untouched.
+  DOCACHECK(doca_verbs_ah_attr_destroy(ctx->ah));
+  ctx->ah = nullptr;
+  return status;
+}
+
+// Connects one QP to the peer described by exch_info by walking it through the
+// INIT -> RTR -> RTS transitions. The context's address handle (ctx->ah) is first
+// pointed at the peer's GID and LID, so it is rewritten on every call.
+// Returns ncclSuccess, or whatever DOCACHECK / DOCACHECKGOTO report for the first
+// DOCA call that fails; the temporary attribute object is destroyed on both paths.
+static ncclResult_t gdakiConnectQp(struct gdaki_context *ctx, struct doca_gpu_verbs_qp_hl *gqp,
+                                   struct gdaki_exch_info *exch_info) {
+  ncclResult_t status = ncclSuccess;
+  doca_error_t docaStatus = DOCA_SUCCESS;
+  struct doca_verbs_qp_attr *attr = nullptr;
+
+  // Attribute subsets consumed by each of the three doca_verbs_qp_modify() calls below.
+  const auto initMask = DOCA_VERBS_QP_ATTR_NEXT_STATE | DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_WRITE |
+                        DOCA_VERBS_QP_ATTR_ALLOW_REMOTE_READ | DOCA_VERBS_QP_ATTR_PKEY_INDEX |
+                        DOCA_VERBS_QP_ATTR_PORT_NUM;
+  const auto rtrMask = DOCA_VERBS_QP_ATTR_NEXT_STATE | DOCA_VERBS_QP_ATTR_RQ_PSN |
+                       DOCA_VERBS_QP_ATTR_DEST_QP_NUM | DOCA_VERBS_QP_ATTR_PATH_MTU |
+                       DOCA_VERBS_QP_ATTR_AH_ATTR | DOCA_VERBS_QP_ATTR_MIN_RNR_TIMER;
+  const auto rtsMask = DOCA_VERBS_QP_ATTR_NEXT_STATE | DOCA_VERBS_QP_ATTR_SQ_PSN |
+                       DOCA_VERBS_QP_ATTR_ACK_TIMEOUT | DOCA_VERBS_QP_ATTR_RETRY_CNT |
+                       DOCA_VERBS_QP_ATTR_RNR_RETRY;
+
+  // Nothing has been allocated yet, so these may return directly on failure.
+  DOCACHECK(doca_verbs_ah_attr_set_gid(ctx->ah, exch_info->vgid));
+  DOCACHECK(doca_verbs_ah_attr_set_dlid(ctx->ah, exch_info->lid));
+  DOCACHECK(doca_verbs_qp_attr_create(&attr));
+
+  // Fill in every attribute up front; each modify call only applies the fields in its mask.
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_path_mtu(attr, DOCA_VERBS_MTU_SIZE_4K_BYTES), docaStatus,
+                status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_rq_psn(attr, 0), docaStatus, status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_sq_psn(attr, 0), docaStatus, status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_port_num(attr, ctx->port_num), docaStatus, status,
+                free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_ack_timeout(attr, ncclParamIbTimeout()), docaStatus,
+                status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_retry_cnt(attr, ncclParamIbRetryCnt()), docaStatus,
+                status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_rnr_retry(attr, 7), docaStatus, status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_min_rnr_timer(attr, 12), docaStatus, status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_next_state(attr, DOCA_VERBS_QP_STATE_INIT), docaStatus,
+                status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_allow_remote_write(attr, 1), docaStatus, status,
+                free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_allow_remote_read(attr, 1), docaStatus, status,
+                free_attr);
+  DOCACHECKGOTO(
+    doca_verbs_qp_attr_set_allow_remote_atomic(attr, DOCA_VERBS_QP_ATOMIC_MODE_IB_SPEC),
+    docaStatus, status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_ah_attr(attr, ctx->ah), docaStatus, status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_dest_qp_num(attr, exch_info->qpn), docaStatus, status,
+                free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_pkey_index(attr, ncclParamIbPkey()), docaStatus, status,
+                free_attr);
+
+  // -> INIT
+  DOCACHECKGOTO(doca_verbs_qp_modify(gqp->qp, attr, initMask), docaStatus, status, free_attr);
+
+  // -> RTR
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_next_state(attr, DOCA_VERBS_QP_STATE_RTR), docaStatus,
+                status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_modify(gqp->qp, attr, rtrMask), docaStatus, status, free_attr);
+
+  // -> RTS
+  DOCACHECKGOTO(doca_verbs_qp_attr_set_next_state(attr, DOCA_VERBS_QP_STATE_RTS), docaStatus,
+                status, free_attr);
+  DOCACHECKGOTO(doca_verbs_qp_modify(gqp->qp, attr, rtsMask), docaStatus, status, free_attr);
+
+  DOCACHECK(doca_verbs_qp_attr_destroy(attr));
+
+  return ncclSuccess;
+
+free_attr:
+  DOCACHECK(doca_verbs_qp_attr_destroy(attr));
+  return status;
+}
+
+// Builds the host-side GDAKI GIN context for one collective communicator.
+//
+// In order: opens the IB device named by the communicator's properties, allocates a PD,
+// registers and exchanges the counters/signals tables, creates one QP group per peer plus
+// the self-loop peer QPs, connects all of them, registers a sink buffer, and finally
+// copies the per-peer device QP descriptors to the GPU.
+//
+//   collComm      struct ncclGinIbCollComm* providing rank/nranks and the exchange callbacks.
+//   nSignals      size passed to the signals GdakiGlobalGPUBufferTable.
+//   nCounters     size passed to the counters GdakiGlobalGPUBufferTable.
+//   outGinCtx     receives the new gdaki_context; written only on success.
+//   outDevHandle  receives the device handle; written only on success.
+//
+// Returns ncclSuccess, or the status of the first step that failed. On failure every
+// resource acquired so far is released (best effort) before returning.
+ncclResult_t ncclGinGdakiCreateContext(void *collComm, int nSignals, int nCounters,
+                                       void **outGinCtx, ncclNetDeviceHandle_v11_t **outDevHandle) {
+  int status = ncclSuccess;
+  doca_error_t docaStatus;
+
+  struct ncclGinIbCollComm *cComm = (struct ncclGinIbCollComm *)collComm;
+
+  char pciBusId[MAX_PCI_ADDRESS_LEN];
+
+  const int rank = cComm->rank;
+  const int nranks = cComm->nranks;
+  const int ncontexts = 1;
+  const int nqps_per_rank = ncontexts;
+  const int nqps_for_comm = nqps_per_rank * nranks;  // Number of QPs for communication
+  const int ncompanion_qps = nqps_for_comm * 2;      // Number of companion QPs for communication
+                                                     // Double because we connect to self.
+  const int nqps =
+    nqps_per_rank * (nranks + 1);  // +1 for the local rank.
+                                   // The last group is the responder of the local rank.
+
+  // TODO: Take these config parameters from the environment variables or users.
+  const int num_counters = nCounters;
+  const int num_signals = nSignals;
+  // Declared up here (with every other initialized local) so no `goto out` jumps over it.
+  const size_t host_page_size = sysconf(_SC_PAGESIZE);
+  ncclNetProperties_t props;
+  ncclNetDeviceHandle_v11_t *devHandle = nullptr;
+  struct gdaki_context *gdaki_ctx = nullptr;
+  struct gdaki_exch_info *local_exch_info = nullptr;
+  struct gdaki_exch_info *remote_exch_info = nullptr;
+
+  struct doca_gpu_verbs_qp_init_attr_hl qp_init_attr;
+
+  uint64_t *sink_buffer = nullptr;
+  struct ibv_mr *sink_buffer_mr = nullptr;
+  CUmemGenericAllocationHandle sink_buffer_mhandle;
+
+  // Host staging copies of the device QP descriptors. Kept at function scope so the
+  // `out` path can release them when a step inside the publish loop fails.
+  struct doca_gpu_dev_verbs_qp *tmp_qp = nullptr;
+  struct doca_gpu_dev_verbs_qp *tmp_qp_companion = nullptr;
+
+  // Number of GPU contexts whose gdqp/companion_gdqp pointers have been reset and may
+  // hold device allocations; bounds the cleanup loop on failure.
+  int ngpu_ctx_touched = 0;
+  // Set once gdakiCreateVerbsAh() succeeded, i.e. gdaki_ctx->ah is ours to destroy.
+  bool ah_created = false;
+
+  bool need_cpu_proxy = false;
+
+  GdakiHostGPUMemHandle<struct ncclGinGdakiGPUContext> *gin_gdaki_gpu_ctx_hd_mhandle =
+    new GdakiHostGPUMemHandle<struct ncclGinGdakiGPUContext>(ncontexts);
+
+  GdakiGlobalGPUBufferTable<uint64_t> *counters_table =
+    new GdakiGlobalGPUBufferTable<uint64_t>(num_counters, nranks);
+  GdakiGlobalGPUBufferTable<uint64_t> *signals_table =
+    new GdakiGlobalGPUBufferTable<uint64_t>(num_signals, nranks);
+
+  const int ib_sl = (ncclParamIbSl() != -1) ? ncclParamIbSl() : NCCL_IB_SL_DEFAULT;
+  const int ib_tc = (ncclParamIbTc() != -1) ? ncclParamIbTc() : NCCL_IB_TC_DEFAULT;
+  int ib_gid_index = 0;
+
+  // From here on every failure goes through `out` so the objects created above are freed.
+  NCCLCHECKGOTO(cComm->getProperties(cComm->dev, &props), status, out);
+
+  gdaki_ctx = (struct gdaki_context *)calloc(1, sizeof(*gdaki_ctx));
+  EQCHECKGOTO(gdaki_ctx, nullptr, status, out);
+
+  devHandle = (ncclNetDeviceHandle_v11_t *)calloc(1, sizeof(*devHandle));
+  EQCHECKGOTO(devHandle, nullptr, status, out);
+
+  gdaki_ctx->gqp_groups = (struct doca_gpu_verbs_qp_group_hl **)calloc(
+    nqps_for_comm, sizeof(*gdaki_ctx->gqp_groups));
+  EQCHECKGOTO(gdaki_ctx->gqp_groups, nullptr, status, out);
+
+  // Main QP
+  gdaki_ctx->gqps = (struct doca_gpu_verbs_qp_hl **)calloc(nqps, sizeof(*gdaki_ctx->gqps));
+  EQCHECKGOTO(gdaki_ctx->gqps, nullptr, status, out);
+
+  // Companion QP
+  gdaki_ctx->companion_gqps =
+    (struct doca_gpu_verbs_qp_hl **)calloc(ncompanion_qps, sizeof(*gdaki_ctx->companion_gqps));
+  EQCHECKGOTO(gdaki_ctx->companion_gqps, nullptr, status, out);
+
+  local_exch_info = (struct gdaki_exch_info *)calloc(nranks, sizeof(*local_exch_info));
+  EQCHECKGOTO(local_exch_info, nullptr, status, out);
+
+  remote_exch_info = (struct gdaki_exch_info *)calloc(nranks, sizeof(*remote_exch_info));
+  EQCHECKGOTO(remote_exch_info, nullptr, status, out);
+
+  CUDACHECKGOTO(cudaGetDevice(&gdaki_ctx->cuda_id), status, out);
+  CUDACHECKGOTO(cudaDeviceGetPCIBusId(pciBusId, MAX_PCI_ADDRESS_LEN, gdaki_ctx->cuda_id), status,
+                out);
+
+  DOCACHECKGOTO(doca_gpu_create(pciBusId, &gdaki_ctx->gdev), docaStatus, status, out);
+
+  // Find the IB/RoCE device by name
+  NCCLCHECKGOTO(gdakiFindDevice(props.name, &gdaki_ctx->ib_dev), status, out);
+
+  // Open the IB context
+  NCCLCHECKGOTO(wrap_ibv_open_device(&gdaki_ctx->ib_ctx, gdaki_ctx->ib_dev), status, out);
+
+  // Allocate the protection domain
+  NCCLCHECKGOTO(wrap_ibv_alloc_pd(&gdaki_ctx->ib_pd, gdaki_ctx->ib_ctx), status, out);
+
+  // Exchange counters and signals with peers
+  NCCLCHECKGOTO(counters_table->register_mr(gdaki_ctx->ib_pd, true), status, out);
+  NCCLCHECKGOTO(signals_table->register_mr(gdaki_ctx->ib_pd, true), status, out);
+
+  NCCLCHECKGOTO(counters_table->exchange_info(cComm), status, out);
+  NCCLCHECKGOTO(signals_table->exchange_info(cComm), status, out);
+
+  gdaki_ctx->port_num = props.port;
+  NCCLCHECKGOTO(wrap_ibv_query_port(gdaki_ctx->ib_ctx, gdaki_ctx->port_num, &gdaki_ctx->port_attr),
+                status, out);
+
+  // Get the GID index
+  NCCLCHECKGOTO(cComm->getGidIndex(gdaki_ctx->ib_ctx, gdaki_ctx->port_num, &gdaki_ctx->port_attr, &ib_gid_index), status, out);
+  gdaki_ctx->gid_index = ib_gid_index;
+
+  // Query the GID on the port we actually use (same port as the two queries above),
+  // not on a fixed port 1.
+  NCCLCHECKGOTO(
+    wrap_ibv_query_gid(gdaki_ctx->ib_ctx, gdaki_ctx->port_num, ib_gid_index, &gdaki_ctx->rgid),
+    status, out);
+
+  NCCLCHECKGOTO(gdakiCreateVerbsAh(gdaki_ctx, ib_sl, ib_tc, ib_gid_index), status, out);
+  ah_created = true;
+
+  gdaki_ctx->qp_rq_size = 0;
+  gdaki_ctx->qp_sq_size = ncclParamGinGdakiQpDepth();
+
+  memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+  qp_init_attr.gpu_dev = gdaki_ctx->gdev;
+  qp_init_attr.ibpd = gdaki_ctx->ib_pd;
+  qp_init_attr.sq_nwqe = gdaki_ctx->qp_sq_size;
+  qp_init_attr.nic_handler =
+    (enum doca_gpu_dev_verbs_nic_handler)ncclParamGinGdakiNicHandler();
+  qp_init_attr.mreg_type = DOCA_GPUNETIO_VERBS_MEM_REG_TYPE_DEFAULT;
+
+  // One QP group (main + companion) per peer slot; gqps/companion_gqps alias into the group.
+  for (int qp_idx = 0; qp_idx < nqps_for_comm; qp_idx++) {
+    DOCACHECKGOTO(
+      doca_gpu_verbs_create_qp_group_hl(&qp_init_attr, &gdaki_ctx->gqp_groups[qp_idx]),
+      docaStatus, status, out);
+
+    gdaki_ctx->gqps[qp_idx] = &gdaki_ctx->gqp_groups[qp_idx]->qp_main;
+    gdaki_ctx->companion_gqps[qp_idx] = &gdaki_ctx->gqp_groups[qp_idx]->qp_companion;
+
+    INFO(NCCL_NET, "[%d] Created a QP group: qp_idx=%d, main_qpn=%#x, companion_qpn=%#x", rank,
+         qp_idx, doca_verbs_qp_get_qpn(gdaki_ctx->gqps[qp_idx]->qp),
+         doca_verbs_qp_get_qpn(gdaki_ctx->companion_gqps[qp_idx]->qp));
+  }
+
+  // Stand-alone QPs stored past the group-backed entries: the self-loop peers.
+  for (int qp_idx = nqps_for_comm; qp_idx < nqps; qp_idx++) {
+    DOCACHECKGOTO(doca_gpu_verbs_create_qp_hl(&qp_init_attr, &gdaki_ctx->gqps[qp_idx]),
+                  docaStatus, status, out);
+    INFO(NCCL_NET, "[%d] Created a self-loop peer QP: qp_idx=%d, qpn=%#x", rank, qp_idx,
+         doca_verbs_qp_get_qpn(gdaki_ctx->gqps[qp_idx]->qp));
+  }
+
+  for (int qp_idx = nqps_for_comm; qp_idx < ncompanion_qps; qp_idx++) {
+    DOCACHECKGOTO(
+      doca_gpu_verbs_create_qp_hl(&qp_init_attr, &gdaki_ctx->companion_gqps[qp_idx]),
+      docaStatus, status, out);
+    INFO(NCCL_NET, "[%d] Created a self-loop peer companion QP: qp_idx=%d, qpn=%#x", rank, qp_idx,
+         doca_verbs_qp_get_qpn(gdaki_ctx->companion_gqps[qp_idx]->qp));
+  }
+
+  for (int ctx_idx = 0; ctx_idx < ncontexts; ctx_idx++) {
+    // Prepare information for exchange with peers
+    for (int rank_idx = 0; rank_idx < nranks; rank_idx++) {
+      int qp_idx = rank_idx + ctx_idx * nranks;
+      gdakiFillExchInfo(&local_exch_info[rank_idx], gdaki_ctx, gdaki_ctx->gqps[qp_idx]);
+    }
+
+    // Exchange information with peers
+    NCCLCHECKGOTO(
+      cComm->allToAll(cComm, local_exch_info, remote_exch_info, sizeof(struct gdaki_exch_info)),
+      status, out);
+
+    for (int rank_idx = 0; rank_idx < nranks; rank_idx++) {
+      int qp_idx = rank_idx + ctx_idx * nranks;
+      // Our own slot is connected to the local self-loop peer QP instead of what came
+      // back from the exchange.
+      if (rank_idx == rank)
+        gdakiFillExchInfo(&remote_exch_info[rank_idx], gdaki_ctx,
+                          gdaki_ctx->gqps[nqps_for_comm + ctx_idx]);
+
+      NCCLCHECKGOTO(gdakiConnectQp(gdaki_ctx, gdaki_ctx->gqps[qp_idx], &remote_exch_info[rank_idx]),
+                    status, out);
+
+      INFO(NCCL_NET,
+           "[%d] Connected main QP: qp_idx=%d, main_qpn=%#x, remote_rank=%d, remote_qpn=%#x", rank,
+           qp_idx, doca_verbs_qp_get_qpn(gdaki_ctx->gqps[qp_idx]->qp), rank_idx,
+           remote_exch_info[rank_idx].qpn);
+    }
+  }
+
+  for (int qp_idx = 0; qp_idx < nqps_per_rank; qp_idx++) {
+    int peer_qp_idx = nqps_for_comm + qp_idx;
+    struct gdaki_exch_info exch_info;
+    gdakiFillExchInfo(&exch_info, gdaki_ctx, gdaki_ctx->gqps[qp_idx * nqps_per_rank + rank]);
+    NCCLCHECKGOTO(gdakiConnectQp(gdaki_ctx, gdaki_ctx->gqps[peer_qp_idx], &exch_info), status, out);
+    INFO(NCCL_NET, "[%d] Connected self-loop peer QP: qp_idx=%d, qpn=%#x, main_qpn=%#x", rank,
+         peer_qp_idx, doca_verbs_qp_get_qpn(gdaki_ctx->gqps[peer_qp_idx]->qp), exch_info.qpn);
+  }
+
+  // Each companion QP is connected to its own local peer companion QP, in both directions.
+  for (int qp_idx = 0; qp_idx < nqps_for_comm; qp_idx++) {
+    int peer_qp_idx = nqps_for_comm + qp_idx;
+    struct gdaki_exch_info exch_info;
+    gdakiFillExchInfo(&exch_info, gdaki_ctx, gdaki_ctx->companion_gqps[peer_qp_idx]);
+    NCCLCHECKGOTO(gdakiConnectQp(gdaki_ctx, gdaki_ctx->companion_gqps[qp_idx], &exch_info), status,
+                  out);
+    INFO(NCCL_NET,
+         "[%d] Connected companion QP: qp_idx=%d, companion_qpn=%#x, peer_companion_qpn=%#x", rank,
+         qp_idx, doca_verbs_qp_get_qpn(gdaki_ctx->companion_gqps[qp_idx]->qp), exch_info.qpn);
+
+    gdakiFillExchInfo(&exch_info, gdaki_ctx, gdaki_ctx->companion_gqps[qp_idx]);
+    NCCLCHECKGOTO(gdakiConnectQp(gdaki_ctx, gdaki_ctx->companion_gqps[peer_qp_idx], &exch_info),
+                  status, out);
+    INFO(NCCL_NET,
+         "[%d] Connected self-loop peer companion QP: qp_idx=%d, peer_companion_qpn=%#x, "
+         "companion_qpn=%#x",
+         rank, peer_qp_idx, doca_verbs_qp_get_qpn(gdaki_ctx->companion_gqps[peer_qp_idx]->qp),
+         exch_info.qpn);
+  }
+
+  NCCLCHECKGOTO(ncclCuMemAlloc((void **)&sink_buffer, &sink_buffer_mhandle, CU_MEM_HANDLE_TYPE_NONE,
+                               sizeof(uint64_t)),
+                status, out);
+
+  NCCLCHECKGOTO(gdakiRegMr(&sink_buffer_mr, gdaki_ctx->ib_pd, sink_buffer, sizeof(uint64_t),
+                           IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
+                             IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC),
+                status, out);
+
+  for (int ctx_idx = 0; ctx_idx < ncontexts; ctx_idx++) {
+    struct ncclGinGdakiGPUContext *gin_gdaki_gpu_ctx =
+      &gin_gdaki_gpu_ctx_hd_mhandle->host_buf[ctx_idx];
+
+    // Start from known-null device pointers so the failure path can tell which of the
+    // two allocations below actually happened.
+    gin_gdaki_gpu_ctx->gdqp = nullptr;
+    gin_gdaki_gpu_ctx->companion_gdqp = nullptr;
+    ngpu_ctx_touched = ctx_idx + 1;
+
+    tmp_qp = (struct doca_gpu_dev_verbs_qp *)calloc(nranks,
+                                                         sizeof(struct doca_gpu_dev_verbs_qp));
+    EQCHECKGOTO(tmp_qp, nullptr, status, out);
+    tmp_qp_companion = (struct doca_gpu_dev_verbs_qp *)calloc(
+      nranks, sizeof(struct doca_gpu_dev_verbs_qp));
+    EQCHECKGOTO(tmp_qp_companion, nullptr, status, out);
+
+    // Gather the CPU-side descriptors of this context's QPs into contiguous arrays.
+    for (int qp_idx = 0; qp_idx < nranks; qp_idx++) {
+      struct doca_gpu_dev_verbs_qp *qp_cpu =
+        gdaki_ctx->gqps[(ctx_idx * nranks) + qp_idx]->qp_gverbs->qp_cpu;
+      memcpy(&tmp_qp[qp_idx], qp_cpu, sizeof(struct doca_gpu_dev_verbs_qp));
+      need_cpu_proxy |= (qp_cpu->nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY);
+
+      qp_cpu = gdaki_ctx->companion_gqps[(ctx_idx * nranks) + qp_idx]->qp_gverbs->qp_cpu;
+      memcpy(&tmp_qp_companion[qp_idx], qp_cpu, sizeof(struct doca_gpu_dev_verbs_qp));
+      need_cpu_proxy |= (qp_cpu->nic_handler == DOCA_GPUNETIO_VERBS_NIC_HANDLER_CPU_PROXY);
+    }
+
+    DOCACHECKGOTO(
+      doca_gpu_mem_alloc(gdaki_ctx->gdev, sizeof(struct doca_gpu_dev_verbs_qp) * nranks,
+                         host_page_size, DOCA_GPU_MEM_TYPE_GPU,
+                         (void **)&gin_gdaki_gpu_ctx->gdqp, nullptr),
+      docaStatus, status, out);
+
+    NCCLCHECKGOTO(
+      ncclCudaMemcpy<struct doca_gpu_dev_verbs_qp>(gin_gdaki_gpu_ctx->gdqp, tmp_qp, nranks),
+      status, out);
+
+    DOCACHECKGOTO(
+      doca_gpu_mem_alloc(gdaki_ctx->gdev, sizeof(struct doca_gpu_dev_verbs_qp) * nranks,
+                         host_page_size, DOCA_GPU_MEM_TYPE_GPU,
+                         (void **)&gin_gdaki_gpu_ctx->companion_gdqp, nullptr),
+      docaStatus, status, out);
+
+    NCCLCHECKGOTO(ncclCudaMemcpy<struct doca_gpu_dev_verbs_qp>(
+                    gin_gdaki_gpu_ctx->companion_gdqp, tmp_qp_companion, nranks),
+                  status, out);
+
+    gin_gdaki_gpu_ctx->counters_table.buffer = counters_table->gpu_ptr;
+    gin_gdaki_gpu_ctx->counters_table.rkeys = counters_table->get_rkeys_d();
+    gin_gdaki_gpu_ctx->counters_table.lkey = htobe32(counters_table->mr->lkey);
+    gin_gdaki_gpu_ctx->signals_table.buffer = signals_table->gpu_ptr;
+    gin_gdaki_gpu_ctx->signals_table.rkeys = signals_table->get_rkeys_d();
+    gin_gdaki_gpu_ctx->signals_table.lkey = htobe32(signals_table->mr->lkey);
+    gin_gdaki_gpu_ctx->sink_buffer_lkey = htobe32(sink_buffer_mr->lkey);
+
+    // Reset after freeing so the unconditional free() at `out` never double-frees.
+    free(tmp_qp);
+    tmp_qp = nullptr;
+    free(tmp_qp_companion);
+    tmp_qp_companion = nullptr;
+  }
+
+  NCCLCHECKGOTO(gin_gdaki_gpu_ctx_hd_mhandle->copy_h_to_d(), status, out);
+
+  devHandle->netDeviceType = NCCL_NET_DEVICE_GIN_GDAKI;
+  devHandle->netDeviceVersion = NCCL_GIN_GDAKI_VERSION;
+  devHandle->handle = (void *)gin_gdaki_gpu_ctx_hd_mhandle->gpu_buf;
+  devHandle->size = 0;
+  devHandle->needsProxyProgress = need_cpu_proxy;
+
+  // Nothing can fail past this point: hand ownership of everything to the context.
+  gdaki_ctx->counters_table = counters_table;
+  gdaki_ctx->signals_table = signals_table;
+  gdaki_ctx->gin_gdaki_gpu_ctx_hd_mhandle = gin_gdaki_gpu_ctx_hd_mhandle;
+  gdaki_ctx->sink_buffer.addr = sink_buffer;
+  gdaki_ctx->sink_buffer.mr = sink_buffer_mr;
+  gdaki_ctx->sink_buffer.mhandle = sink_buffer_mhandle;
+  gdaki_ctx->collComm = cComm;
+  gdaki_ctx->devHandle = devHandle;
+
+  cComm->ginCtx = gdaki_ctx;
+
+  *outDevHandle = devHandle;
+  *outGinCtx = gdaki_ctx;
+
+out:
+  // Non-null only when we bailed out in the middle of the publish loop.
+  free(tmp_qp);
+  free(tmp_qp_companion);
+
+  if (status != ncclSuccess) {
+    // Teardown is best effort: individual failures are ignored so that the remaining
+    // resources are still released and the original error is the one reported.
+    if (gdaki_ctx) {
+      // Device copies of the QP descriptors. The handle is looked up through the local
+      // variable because the context field is only assigned on the success path.
+      if (gdaki_ctx->gdev) {
+        for (int ctx_idx = 0; ctx_idx < ngpu_ctx_touched; ctx_idx++) {
+          struct ncclGinGdakiGPUContext *gin_gdaki_gpu_ctx =
+            &gin_gdaki_gpu_ctx_hd_mhandle->host_buf[ctx_idx];
+          if (gin_gdaki_gpu_ctx->gdqp) {
+            doca_gpu_mem_free(gdaki_ctx->gdev, gin_gdaki_gpu_ctx->gdqp);
+            gin_gdaki_gpu_ctx->gdqp = nullptr;
+          }
+          if (gin_gdaki_gpu_ctx->companion_gdqp) {
+            doca_gpu_mem_free(gdaki_ctx->gdev, gin_gdaki_gpu_ctx->companion_gdqp);
+            gin_gdaki_gpu_ctx->companion_gdqp = nullptr;
+          }
+        }
+      }
+
+      // The arrays were calloc'ed, so an entry that was never created is still null.
+      if (gdaki_ctx->gqp_groups) {
+        for (int qp_idx = 0; qp_idx < nqps_for_comm; qp_idx++) {
+          if (gdaki_ctx->gqp_groups[qp_idx] == nullptr) continue;
+          doca_gpu_verbs_destroy_qp_group_hl(gdaki_ctx->gqp_groups[qp_idx]);
+          gdaki_ctx->gqp_groups[qp_idx] = nullptr;
+        }
+      }
+      if (gdaki_ctx->gqps) {
+        for (int qp_idx = nqps_for_comm; qp_idx < nqps; qp_idx++) {
+          if (gdaki_ctx->gqps[qp_idx] == nullptr) continue;
+          doca_gpu_verbs_destroy_qp_hl(gdaki_ctx->gqps[qp_idx]);
+          gdaki_ctx->gqps[qp_idx] = nullptr;
+        }
+      }
+      if (gdaki_ctx->companion_gqps) {
+        for (int qp_idx = nqps_for_comm; qp_idx < ncompanion_qps; qp_idx++) {
+          if (gdaki_ctx->companion_gqps[qp_idx] == nullptr) continue;
+          doca_gpu_verbs_destroy_qp_hl(gdaki_ctx->companion_gqps[qp_idx]);
+          gdaki_ctx->companion_gqps[qp_idx] = nullptr;
+        }
+      }
+
+      free(gdaki_ctx->gqp_groups);
+      free(gdaki_ctx->gqps);
+      free(gdaki_ctx->companion_gqps);
+
+      if (ah_created) doca_verbs_ah_attr_destroy(gdaki_ctx->ah);
+
+      if (gdaki_ctx->gdev) doca_gpu_destroy(gdaki_ctx->gdev);
+    }
+
+    free(devHandle);
+
+    if (sink_buffer_mr) (void)wrap_ibv_dereg_mr(sink_buffer_mr);
+    if (sink_buffer) ncclCuMemFree(sink_buffer);
+
+    delete gin_gdaki_gpu_ctx_hd_mhandle;
+
+    if (counters_table) {
+      counters_table->deregister_mr();
+      delete counters_table;
+    }
+
+    if (signals_table) {
+      signals_table->deregister_mr();
+      delete signals_table;
+    }
+
+    if (gdaki_ctx) {
+      // The PD goes last among the IB objects, after every MR registered on it.
+      if (gdaki_ctx->ib_pd) (void)wrap_ibv_dealloc_pd(gdaki_ctx->ib_pd);
+      if (gdaki_ctx->ib_ctx) (void)wrap_ibv_close_device(gdaki_ctx->ib_ctx);
+
+      memset(gdaki_ctx, 0, sizeof(*gdaki_ctx));
+      free(gdaki_ctx);
+    }
+  }
+
+  // The exchange buffers are scratch space on both the success and the failure path.
+  free(local_exch_info);
+
+  free(remote_exch_info);
+
+  return (ncclResult_t)status;
+}
+
+// Tears down a context produced by ncclGinGdakiCreateContext() and frees it.
+//
+//   ginCtx  the struct gdaki_context* to destroy; must not be used afterwards.
+//
+// Returns ncclInvalidArgument for a null context, ncclSuccess after a full teardown, or
+// the status reported by NCCLCHECK / DOCACHECK for the first release step that fails
+// (in which case the steps after it are not executed).
+ncclResult_t ncclGinGdakiDestroyContext(void *ginCtx) {
+  if (!ginCtx) return ncclInvalidArgument;
+
+  struct gdaki_context *gdaki_ctx = (struct gdaki_context *)ginCtx;
+  struct ncclGinIbCollComm *cComm = gdaki_ctx->collComm;
+  const int nranks = cComm->nranks;
+  const int ncontexts = 1;
+  const int nqps_per_rank = ncontexts;
+  const int nqps_for_comm = nqps_per_rank * nranks;  // Number of QPs for communication
+  const int ncompanion_qps = nqps_for_comm * 2;      // Number of companion QPs for communication
+                                                     // Double because we connect to self.
+  const int nqps =
+    nqps_per_rank * (nranks + 1);  // +1 for the local rank.
+                                   // The last group is the responder of the local rank.
+
+  // Each array is checked before it is indexed, and empty slots are skipped, so a
+  // partially populated context cannot fault here.
+  if (gdaki_ctx->gqp_groups) {
+    // Destroying a group releases the main and companion QP stored in
+    // gqps[i] / companion_gqps[i] for i < nqps_for_comm.
+    for (int qp_idx = 0; qp_idx < nqps_for_comm; qp_idx++) {
+      if (gdaki_ctx->gqp_groups[qp_idx] == nullptr) continue;
+      doca_gpu_verbs_destroy_qp_group_hl(gdaki_ctx->gqp_groups[qp_idx]);
+      gdaki_ctx->gqp_groups[qp_idx] = nullptr;
+    }
+  }
+  if (gdaki_ctx->gqps) {
+    // Entries past the group-backed ones are the stand-alone self-loop peer QPs.
+    for (int qp_idx = nqps_for_comm; qp_idx < nqps; qp_idx++) {
+      if (gdaki_ctx->gqps[qp_idx] == nullptr) continue;
+      doca_gpu_verbs_destroy_qp_hl(gdaki_ctx->gqps[qp_idx]);
+      gdaki_ctx->gqps[qp_idx] = nullptr;
+    }
+  }
+  if (gdaki_ctx->companion_gqps) {
+    for (int qp_idx = nqps_for_comm; qp_idx < ncompanion_qps; qp_idx++) {
+      if (gdaki_ctx->companion_gqps[qp_idx] == nullptr) continue;
+      doca_gpu_verbs_destroy_qp_hl(gdaki_ctx->companion_gqps[qp_idx]);
+      gdaki_ctx->companion_gqps[qp_idx] = nullptr;
+    }
+  }
+
+  free(gdaki_ctx->gqp_groups);
+  free(gdaki_ctx->gqps);
+  free(gdaki_ctx->companion_gqps);
+
+  if (gdaki_ctx->counters_table) {
+    gdaki_ctx->counters_table->deregister_mr();
+    delete gdaki_ctx->counters_table;
+  }
+  if (gdaki_ctx->signals_table) {
+    gdaki_ctx->signals_table->deregister_mr();
+    delete gdaki_ctx->signals_table;
+  }
+
+  if (gdaki_ctx->sink_buffer.mr) NCCLCHECK(wrap_ibv_dereg_mr(gdaki_ctx->sink_buffer.mr));
+  if (gdaki_ctx->sink_buffer.addr) NCCLCHECK(ncclCuMemFree(gdaki_ctx->sink_buffer.addr));
+
+  if (gdaki_ctx->gin_gdaki_gpu_ctx_hd_mhandle) {
+    // Free the device-side QP descriptor arrays before the handle that records them.
+    for (int ctx_idx = 0; ctx_idx < ncontexts; ctx_idx++) {
+      struct ncclGinGdakiGPUContext *gin_gdaki_gpu_ctx =
+        &gdaki_ctx->gin_gdaki_gpu_ctx_hd_mhandle->host_buf[ctx_idx];
+      if (gin_gdaki_gpu_ctx->gdqp) {
+        DOCACHECK(doca_gpu_mem_free(gdaki_ctx->gdev, gin_gdaki_gpu_ctx->gdqp));
+      }
+      if (gin_gdaki_gpu_ctx->companion_gdqp) {
+        DOCACHECK(doca_gpu_mem_free(gdaki_ctx->gdev, gin_gdaki_gpu_ctx->companion_gdqp));
+      }
+    }
+    delete gdaki_ctx->gin_gdaki_gpu_ctx_hd_mhandle;
+  }
+
+  if (gdaki_ctx->ah) {
+    DOCACHECK(doca_verbs_ah_attr_destroy(gdaki_ctx->ah));
+  }
+
+  if (gdaki_ctx->gdev) {
+    DOCACHECK(doca_gpu_destroy(gdaki_ctx->gdev));
+  }
+  // The PD is released only after every MR registered on it is gone.
+  if (gdaki_ctx->ib_pd) NCCLCHECK(wrap_ibv_dealloc_pd(gdaki_ctx->ib_pd));
+  if (gdaki_ctx->ib_ctx) NCCLCHECK(wrap_ibv_close_device(gdaki_ctx->ib_ctx));
+
+  free(gdaki_ctx->devHandle);
+
+  // Scrub before freeing so stale pointers are not left behind in the released block.
+  memset(gdaki_ctx, 0, sizeof(*gdaki_ctx));
+  free(gdaki_ctx);
+
+  return ncclSuccess;
+}
+
+// Registers [data, data + size) on the context's PD and publishes its keys to the GPU.
+//
+// The local rkey is all-gathered across ranks into a device-visible array, and a small
+// device-visible ncclGinGdakiMemHandle is filled with that array and the local lkey.
+//
+//   collComm   struct ncclGinIbCollComm* whose ginCtx holds the gdaki_context.
+//   data/size  buffer to register.
+//   type       stored verbatim in the host handle.
+//   mhandle    receives the host-side struct gdaki_mem_handle* (success only).
+//   ginHandle  receives the GPU address of the ncclGinGdakiMemHandle (success only).
+//
+// Returns ncclSuccess, or the status of the first failing step; on failure the MR and
+// every handle created here are released again.
+ncclResult_t ncclGinGdakiRegMrSym(void *collComm, void *data, size_t size, int type, void **mhandle,
+                                  void **ginHandle) {
+  ncclResult_t status = ncclSuccess;
+  struct ncclGinIbCollComm *cComm = (struct ncclGinIbCollComm *)collComm;
+
+  struct gdaki_context *gdaki_ctx = (struct gdaki_context *)cComm->ginCtx;
+  struct ibv_mr *mr = nullptr;
+  struct gdaki_mem_handle *gdaki_mhandle = nullptr;
+  __be32 rkey;
+  GdakiHostGPUMemHandle<struct ncclGinGdakiMemHandle> *gdaki_mhandle_hd_mhandle =
+    new GdakiHostGPUMemHandle<struct ncclGinGdakiMemHandle>(1);
+  GdakiHostGPUMemHandle<__be32> *rkeys_hd_mhandle =
+    new GdakiHostGPUMemHandle<__be32>(cComm->nranks);
+
+  gdaki_mhandle = (struct gdaki_mem_handle *)calloc(1, sizeof(*gdaki_mhandle));
+  EQCHECKGOTO(gdaki_mhandle, nullptr, status, fail);
+
+  NCCLCHECKGOTO(gdakiRegMr(&mr, gdaki_ctx->ib_pd, data, size,
+                           IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
+                             IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC),
+                status, fail);
+
+  // Keys are stored big-endian, then every rank's rkey is gathered and pushed to the GPU.
+  rkey = htobe32(mr->rkey);
+  NCCLCHECKGOTO(cComm->allGather(cComm, &rkey, rkeys_hd_mhandle->host_buf, sizeof(__be32)),
+                status, fail);
+  NCCLCHECKGOTO(rkeys_hd_mhandle->copy_h_to_d(), status, fail);
+
+  gdaki_mhandle_hd_mhandle->host_buf->rkeys = rkeys_hd_mhandle->gpu_buf;
+  gdaki_mhandle_hd_mhandle->host_buf->lkey = htobe32(mr->lkey);
+  NCCLCHECKGOTO(gdaki_mhandle_hd_mhandle->copy_h_to_d(), status, fail);
+
+  gdaki_mhandle->type = type;
+  gdaki_mhandle->mr = mr;
+  gdaki_mhandle->gdaki_mhandle_hd_mhandle = gdaki_mhandle_hd_mhandle;
+  gdaki_mhandle->rkeys_hd_mhandle = rkeys_hd_mhandle;
+
+  INFO(NCCL_NET, "[%d] Registered MR: data=%p, size=%zu, lkey(be32)=%#x, rkey(be32)=%#x",
+       cComm->rank, data, size, htobe32(mr->lkey), htobe32(mr->rkey));
+
+  *mhandle = (void *)gdaki_mhandle;
+  *ginHandle = (void *)gdaki_mhandle_hd_mhandle->gpu_buf;
+
+  return ncclSuccess;
+
+fail:
+  // Best-effort unwind; the status of the step that failed is what gets reported.
+  if (mr) (void)wrap_ibv_dereg_mr(mr);
+  free(gdaki_mhandle);
+  delete rkeys_hd_mhandle;
+  delete gdaki_mhandle_hd_mhandle;
+  return status;
+}
+
+// Releases a registration made by ncclGinGdakiRegMrSym(): deregisters the MR, deletes
+// the two host/GPU mirror handles and frees the host handle itself.
+//
+//   collComm  struct ncclGinIbCollComm*; only its rank is read here, for the log line.
+//   mhandle   the struct gdaki_mem_handle* returned through RegMrSym's `mhandle`.
+//
+// Returns ncclInvalidArgument when either argument is null, the deregistration error if
+// wrap_ibv_dereg_mr() fails (the handle is left untouched in that case), else ncclSuccess.
+ncclResult_t ncclGinGdakiDeregMrSym(void *collComm, void *mhandle) {
+  // Same convention as ncclGinGdakiDestroyContext(): reject null instead of faulting.
+  if (!collComm || !mhandle) return ncclInvalidArgument;
+
+  struct ncclGinIbCollComm *cComm = (struct ncclGinIbCollComm *)collComm;
+  struct gdaki_mem_handle *gdaki_mhandle = (struct gdaki_mem_handle *)mhandle;
+  struct ibv_mr *mr = gdaki_mhandle->mr;
+
+  INFO(NCCL_NET, "[%d] Unregistering MR: lkey(be32)=%#x, rkey(be32)=%#x", cComm->rank,
+       htobe32(mr->lkey), htobe32(mr->rkey));
+
+  NCCLCHECK(wrap_ibv_dereg_mr(mr));
+
+  delete gdaki_mhandle->gdaki_mhandle_hd_mhandle;
+  delete gdaki_mhandle->rkeys_hd_mhandle;
+
+  // Scrub before freeing so stale pointers are not left behind in the released block.
+  memset(gdaki_mhandle, 0, sizeof(*gdaki_mhandle));
+
+  free(gdaki_mhandle);
+
+  return ncclSuccess;
+}
+
+// Runs one round of CPU-proxy progress for the communicator's QPs.
+// collComm is a struct ncclGinIbCollComm*; the GDAKI context is taken from its ginCtx.
+// For every communication slot the main QP is visited first, then its companion; a QP
+// is progressed only when its cpu_proxy member is set. The first DOCA failure is
+// returned through DOCACHECK, otherwise ncclSuccess.
+ncclResult_t ncclGinGdakiProgress(void *collComm) {
+  struct gdaki_context *ctx =
+    (struct gdaki_context *)((struct ncclGinIbCollComm *)collComm)->ginCtx;
+  const int ncontexts = 1;
+  const int slotCount = ncontexts * ctx->collComm->nranks;  // Number of QPs for communication
+
+  for (int slot = 0; slot < slotCount; ++slot) {
+    struct doca_gpu_verbs_qp_hl *const pair[2] = {ctx->gqps[slot], ctx->companion_gqps[slot]};
+    for (int k = 0; k < 2; ++k) {
+      struct doca_gpu_verbs_qp *proxied = pair[k]->qp_gverbs;
+      if (!proxied->cpu_proxy) continue;
+      DOCACHECK(doca_gpu_verbs_cpu_proxy_progress(proxied));
+    }
+  }
+
+  return ncclSuccess;
+}
+
+// Reports through *hasError whether any communication QP (main or companion) has a
+// recorded error. ginCtx is the struct gdaki_context*.
+//
+// Queries are rate limited: if the monotonic clock can be read and fewer than
+// ncclParamGinErrorQuerySec() seconds have passed since the last real query, nothing is
+// queried and *hasError is set to false. The scan stops at the first QP with an error.
+// A failing DOCA query returns through DOCACHECK without writing *hasError.
+ncclResult_t ncclGinGdakiQueryLastError(void *ginCtx, bool *hasError) {
+  struct gdaki_context *ctx = (struct gdaki_context *)ginCtx;
+  const int ncontexts = 1;
+  const int slotCount = ncontexts * ctx->collComm->nranks;  // Number of QPs for communication
+  bool found = false;
+  bool throttled = false;
+
+  // We throttle the frequency of these queries since they can easily take 250us.
+  struct timespec now;
+  if (clock_gettime(CLOCK_MONOTONIC, &now) == 0) {
+    throttled = now.tv_sec - ctx->last_error_query_time.tv_sec +
+                  (now.tv_nsec - ctx->last_error_query_time.tv_nsec) / 1e9 <
+                ncclParamGinErrorQuerySec();
+    // The timestamp only moves forward when a real query is about to happen.
+    if (!throttled) ctx->last_error_query_time = now;
+  }
+
+  if (!throttled) {
+    for (int slot = 0; slot < slotCount && !found; ++slot) {
+      struct doca_gpu_verbs_qp_error_info info;
+
+      DOCACHECK(doca_gpu_verbs_query_last_error(ctx->gqps[slot]->qp_gverbs, &info));
+      if (info.has_error) {
+        found = true;
+        continue;  // the loop condition ends the scan; the companion is not queried
+      }
+
+      DOCACHECK(doca_gpu_verbs_query_last_error(ctx->companion_gqps[slot]->qp_gverbs, &info));
+      if (info.has_error) found = true;
+    }
+  }
+
+  *hasError = found;
+  return ncclSuccess;
+}
diff --git a/projects/rccl/src/transport/gdaki/gin_host_gdaki.h b/projects/rccl/src/transport/gdaki/gin_host_gdaki.h
new file mode 100644
index 00000000000..fcd45489152
--- /dev/null
+++ b/projects/rccl/src/transport/gdaki/gin_host_gdaki.h
@@ -0,0 +1,36 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _GIN_HOST_GDAKI_H_
+#define _GIN_HOST_GDAKI_H_
+
+#ifndef DOCA_VERBS_USE_CUDA_WRAPPER
+#define DOCA_VERBS_USE_CUDA_WRAPPER
+#endif
+
+#ifndef DOCA_VERBS_USE_NET_WRAPPER
+#define DOCA_VERBS_USE_NET_WRAPPER
+#endif
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <linux/types.h>
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#include "nccl.h"
+#include "gin/gin_host.h"
+
+ncclResult_t ncclGinGdakiCreateContext(void *collComm, int nSignals, int nCounters,
+                                       void **outGinCtx, ncclNetDeviceHandle_v11_t **outDevHandle);
+ncclResult_t ncclGinGdakiDestroyContext(void *ginCtx);  // ginCtx: value returned via outGinCtx
+ncclResult_t ncclGinGdakiRegMrSym(void *collComm, void *data, size_t size, int type, void **mhandle,
+                                  void **ginHandle);
+ncclResult_t ncclGinGdakiDeregMrSym(void *collComm, void *mhandle);  // mhandle: from RegMrSym
+ncclResult_t ncclGinGdakiProgress(void *collComm);  // takes the collective comm, not the GIN context
+ncclResult_t ncclGinGdakiQueryLastError(void *ginCtx, bool *hasError);
+
+#endif
diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc
index f8ef119324e..f35a54485d8 100644
--- a/projects/rccl/src/transport/net_ib.cc
+++ b/projects/rccl/src/transport/net_ib.cc
@@ -179,6 +179,49 @@ static void ncclIbDevFatalError(struct ncclIbDev* dev) {
   ncclIbStatsFatalError(&dev->stats);
 }
 
+// Helper function to convert IB work completion status to string
+static const char* ibvWcStatusStr(enum ibv_wc_status status) {
+  switch (status) {
+    case IBV_WC_SUCCESS:            return "IBV_WC_SUCCESS";
+    case IBV_WC_LOC_LEN_ERR:        return "IBV_WC_LOC_LEN_ERR";
+    case IBV_WC_LOC_QP_OP_ERR:      return "IBV_WC_LOC_QP_OP_ERR";
+    case IBV_WC_LOC_EEC_OP_ERR:     return "IBV_WC_LOC_EEC_OP_ERR";
+    case IBV_WC_LOC_PROT_ERR:       return "IBV_WC_LOC_PROT_ERR";
+    case IBV_WC_WR_FLUSH_ERR:       return "IBV_WC_WR_FLUSH_ERR";
+    case IBV_WC_MW_BIND_ERR:        return "IBV_WC_MW_BIND_ERR";
+    case IBV_WC_BAD_RESP_ERR:       return "IBV_WC_BAD_RESP_ERR";
+    case IBV_WC_LOC_ACCESS_ERR:     return "IBV_WC_LOC_ACCESS_ERR";
+    case IBV_WC_REM_INV_REQ_ERR:    return "IBV_WC_REM_INV_REQ_ERR";
+    case IBV_WC_REM_ACCESS_ERR:     return "IBV_WC_REM_ACCESS_ERR";
+    case IBV_WC_REM_OP_ERR:         return "IBV_WC_REM_OP_ERR";
+    case IBV_WC_RETRY_EXC_ERR:      return "IBV_WC_RETRY_EXC_ERR";
+    case IBV_WC_RNR_RETRY_EXC_ERR:  return "IBV_WC_RNR_RETRY_EXC_ERR";
+    case IBV_WC_LOC_RDD_VIOL_ERR:   return "IBV_WC_LOC_RDD_VIOL_ERR";
+    case IBV_WC_REM_INV_RD_REQ_ERR: return "IBV_WC_REM_INV_RD_REQ_ERR";
+    case IBV_WC_REM_ABORT_ERR:      return "IBV_WC_REM_ABORT_ERR";
+    case IBV_WC_INV_EECN_ERR:       return "IBV_WC_INV_EECN_ERR";
+    case IBV_WC_INV_EEC_STATE_ERR:  return "IBV_WC_INV_EEC_STATE_ERR";
+    case IBV_WC_FATAL_ERR:          return "IBV_WC_FATAL_ERR";
+    case IBV_WC_RESP_TIMEOUT_ERR:   return "IBV_WC_RESP_TIMEOUT_ERR";
+    case IBV_WC_GENERAL_ERR:        return "IBV_WC_GENERAL_ERR";
+    default:                        return "UNKNOWN_STATUS";
+  }
+}
+
+// Helper function to convert IB work completion opcode to string
+static const char* ibvWcOpcodeStr(enum ibv_wc_opcode opcode) {
+  switch (opcode) {
+    case IBV_WC_SEND:               return "IBV_WC_SEND";
+    case IBV_WC_RDMA_WRITE:         return "IBV_WC_RDMA_WRITE";
+    case IBV_WC_RDMA_READ:          return "IBV_WC_RDMA_READ";
+    case IBV_WC_COMP_SWAP:          return "IBV_WC_COMP_SWAP";
+    case IBV_WC_FETCH_ADD:          return "IBV_WC_FETCH_ADD";
+    case IBV_WC_BIND_MW:            return "IBV_WC_BIND_MW";
+    case IBV_WC_RECV:               return "IBV_WC_RECV";
+    case IBV_WC_RECV_RDMA_WITH_IMM: return "IBV_WC_RECV_RDMA_WITH_IMM";
+    default:                        return "UNKNOWN_OPCODE";
+  }
+}
 pthread_t ncclIbAsyncThread;
 static void* ncclIbAsyncThreadMain(void* args) {
   struct ncclIbDev* dev = (struct ncclIbDev*)args;
@@ -227,7 +270,7 @@ static void* ncclIbAsyncThreadMain(void* args) {
     case IBV_EVENT_CLIENT_REREGISTER:
     case IBV_EVENT_SRQ_LIMIT_REACHED:
       // the above are non-fatal
-      WARN("NET/IB : %s:%d Got async error event: %s", dev->devName, dev->portNum, str);
+      WARN("NET/IB : %s:%d Got async event: %s", dev->devName, dev->portNum, str);
       break;
     case IBV_EVENT_COMM_EST:
       break;
@@ -498,7 +541,7 @@ static int ncclIbMatchVfPath(char* path1, char* path2) {
 /**
  * Assumes PCIe path ends with xxxx:xx:xx.x 
  */
-static void ncclIbNormalizePciPath(const char* in, char* out, size_t out_size) {
+ static void ncclIbNormalizePciPath(const char* in, char* out, size_t out_size) {
   if (!in || !out || out_size == 0) return;
   // Safe copy with truncation
   size_t len = strnlen(in, out_size - 1);
@@ -718,6 +761,7 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) {
   ncclIbMergedDev tmp;
   memset(&tmp,0,sizeof(tmp));
   bool used[MAX_IB_DEVS] = {0};
+
   for (int i = 0; i < props->ndevs; i++) {
     if( props->devs[i]  < 0 || props->devs[i] >= ncclNIbDevs ) {
       WARN("NET/IB : Cannot use physical device %d, max %d", props->devs[i], ncclNIbDevs);
@@ -764,6 +808,7 @@ ncclResult_t ncclIbMakeVDeviceInternal(int* d, ncclNetVDeviceProps_t* props) {
            dev0->devName, numa0, dev->devName, numa_i);
       break;
     }
+
     char root_i[8];
     ncclIbGetPciRootFromPath(dev->pciPath, root_i, sizeof(root_i));
     if (strcmp(root_i, root0) != 0) {
@@ -795,7 +840,12 @@ ncclResult_t ncclIbSetNetAttr(void *ctx, ncclNetAttr_t *netAttr) {
 
 static ncclProfilerCallback_t ncclProfilerFunction;
 
-ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
+static ncclResult_t ncclIbFinalizeDevices(void) {  // counterpart of the netRefCount++ in ncclIbInitDevices()
+  --netRefCount;
+  return ncclSuccess;
+}
+
+static ncclResult_t ncclIbInitDevices(ncclDebugLogger_t /*logFunction*/, ncclProfilerCallback_t profFunction) {
   if (netRefCount++) return ncclSuccess;
   ncclResult_t ret = ncclSuccess;
   ncclProfilerFunction = profFunction;
@@ -934,6 +984,7 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
               PTHREADCHECKGOTO(pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, ncclIbDevs + ncclNIbDevs), "pthread_create", ret, fail);
               ncclSetThreadName(ncclIbAsyncThread, "NCCL IbAsync %2d", ncclNIbDevs);
               PTHREADCHECKGOTO(pthread_detach(ncclIbAsyncThread), "pthread_detach", ret, fail); // will not be pthread_join()'d
+
               ncclNIbDevs++;
               nPorts++;
             }
@@ -968,14 +1019,22 @@ ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config
 
   }
 exit:
-  ibContext.trafficClass = config->trafficClass;
-  *ctx = &ibContext;
   return ret;
 fail:
   if(ncclSuccess != wrap_ibv_free_device_list(devices)){WARN("NET/IB : Unable to free device list");}
   goto exit;
 }
 
+ncclResult_t ncclIbInit(void** ctx, uint64_t commId, ncclNetCommConfig_t* config, ncclDebugLogger_t logFunction, ncclProfilerCallback_t profFunction) {
+  ncclNetCommConfig_t* netCommConfig = nullptr;  // becomes *ctx; released by free() in ncclIbFinalize()
+  NCCLCHECK(ncclIbInitDevices(logFunction, profFunction));  // takes one device reference
+  ncclResult_t ret = ncclCalloc(&netCommConfig, 1);
+  if (ret != ncclSuccess) { (void)ncclIbFinalizeDevices(); return ret; }  // hand the reference back if allocation fails
+  netCommConfig->trafficClass = config->trafficClass;  // per-communicator copy of the caller's traffic class
+  *ctx = (void *)netCommConfig;
+  return ncclSuccess;
+}
+
 ncclResult_t ncclIbDevices(int* ndev) {
   *ndev = ncclNMergedIbDevs;
   return ncclSuccess;
@@ -1245,8 +1304,9 @@ struct ncclIbGidInfo {
 #define NCCL_NET_IB_REQ_SEND 1
 #define NCCL_NET_IB_REQ_RECV 2
 #define NCCL_NET_IB_REQ_FLUSH 3
-#define NCCL_NET_IB_REQ_FAILED 4
-const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush", "Failed" };
+#define NCCL_NET_IB_REQ_GIN_IPUT 4 // request posted by ncclGinIbProxyIPut / ncclGinIbProxyIPutSignal
+#define NCCL_NET_IB_REQ_FAILED 5
+const char* reqTypeStr[] = { "Unused", "Send", "Recv", "Flush", "IPut", "Failed" }; // indexed by the request type value (reqTypeStr[r->type]); keep in step with the defines above
 
 #define MAX_QPS_PER_REQ 8
 struct ncclProfilerInfo {
@@ -1277,6 +1337,9 @@ struct ncclIbRequest {
     struct {
       int* sizes;
     } recv;
+    struct {
+      int rank;
+    } iput;
   };
 };
 
@@ -1332,6 +1395,7 @@ struct ncclIbRemSizesFifo {
 struct alignas(8) ncclIbSendCommDev {
   struct ncclIbNetCommDevBase base;
   struct ibv_mr* fifoMr;
+  struct ibv_mr* putSignalScratchpadMr;
 };
 
 
@@ -1370,6 +1434,7 @@ struct ncclIbSendComm {
   struct ncclIbRemSizesFifo remSizesFifo;
   uint64_t fifoHead;
   int ar; // Use adaptive routing when all merged devices have it enabled
+  uint64_t putSignalScratchpad;
 };
 // The SendFifo needs to be 32-byte aligned and each element needs
 // to be a 32-byte multiple, so that an entry does not get split and
@@ -1667,6 +1732,9 @@ ncclResult_t ncclIbConnect(void* ctx, int dev, void* opaqueHandle, void** sendCo
     devInfo->mtu           = ibDev->portAttr.active_mtu;
     devInfo->lid           = ibDev->portAttr.lid;
     devInfo->ibv_dev_index = commDev->base.ibDevN;
+    // Prepare GIN Put Signal scratchpad (for RDMA Atomic result)
+    NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->putSignalScratchpadMr, commDev->base.pd, &comm->putSignalScratchpad, sizeof(comm->putSignalScratchpad), IBV_ACCESS_LOCAL_WRITE), ret, fail);
+
     // Prepare my fifo
     NCCLCHECKGOTO(wrap_ibv_reg_mr(&commDev->fifoMr, commDev->base.pd, comm->fifo, sizeof(struct ncclIbSendFifo)*MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ), ret, fail);
     devInfo->fifoRkey = commDev->fifoMr->rkey;
@@ -2003,7 +2071,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
     // Local ibDevN
     ibDevN = rComm->devs[devIndex].base.ibDevN;
     ibDev = ncclIbDevs + ibDevN;
-    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE, &rComm->base.stats, qp), ret, fail);
+    NCCLCHECKGOTO(ncclIbCreateQp(ibDev->portNum, &rCommDev->base, IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC, &rComm->base.stats, qp), ret, fail);
     qp->devIndex = devIndex;
     devIndex = (devIndex + 1) % rComm->base.vProps.ndevs;
 
@@ -2076,7 +2144,7 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
       rCommDev->gpuFlush.dmabuf_fd = -1;
 
       if (rcclParamIbGdrFlushGpuMemNoRelaxedOrdering()) {
-#if CUDA_VERSION >= 11070 || HIP_VERSION >= 71260540
+        #if CUDA_VERSION >= 11070 || HIP_VERSION >= 71260540
         if (ncclCuMemEnable()) {
           NCCLCHECKGOTO(ncclMemAlloc((void**)&rCommDev->gpuFlush.gpuFlushGpuMem, sizeof(int)), ret, fail);
           CUCHECKGOTO(cuMemGetHandleForAddressRange((void*)&rCommDev->gpuFlush.dmabuf_fd,
@@ -2118,15 +2186,14 @@ ncclResult_t ncclIbAccept(void* listenComm, void** recvComm, ncclNetDeviceHandle
             rCommDev->gpuFlush.dmabuf_fd = -1;
           }
         }
-#endif
-flush_reg_done:
-        if (!gpuFlushRegistered) {
-          if (rCommDev->gpuFlush.gpuFlushGpuMem) {
-            ncclCudaFree(rCommDev->gpuFlush.gpuFlushGpuMem);
-            rCommDev->gpuFlush.gpuFlushGpuMem = nullptr;
-          }
-          rCommDev->gpuFlush.gpuMr = nullptr;
-          rCommDev->gpuFlush.dmabuf_fd = -1;
+        #endif
+        flush_reg_done:
+                if (!gpuFlushRegistered) {
+                  if (rCommDev->gpuFlush.gpuFlushGpuMem) {
+                    ncclCudaFree(rCommDev->gpuFlush.gpuFlushGpuMem);
+                    rCommDev->gpuFlush.gpuFlushGpuMem = nullptr;
+                  }
+                  rCommDev->gpuFlush.gpuMr = nullptr;
         }
       }
       NCCLCHECKGOTO(wrap_ibv_reg_mr(&rCommDev->gpuFlush.hostMr, rCommDev->base.pd, &rComm->gpuFlushHostMem, sizeof(int), IBV_ACCESS_LOCAL_WRITE), ret, fail);
@@ -2244,7 +2311,9 @@ ncclResult_t ncclIbRegMrDmaBufInternal(ncclIbNetCommDevBase* base, void* data, s
       }
       // Deregister / register
       struct ibv_mr* mr;
-      unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ;
+      // REMOTE_ATOMIC required for GIN proxy atomic fetch-add on signal MR;
+      // without it mlx5 returns WC_REM_ACCESS_ERR (vendor_err 0x88).
+      unsigned int flags = IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ|IBV_ACCESS_REMOTE_ATOMIC;
       if (ncclIbRelaxedOrderingEnabled) flags |= IBV_ACCESS_RELAXED_ORDERING;
       if (fd != -1) {
         /* DMA-BUF support */
@@ -2722,7 +2791,6 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
   struct ibv_recv_wr wr;
   struct ibv_recv_wr* bad_wr = NULL;
   int nqps = 0;
-
   if (comm->base.ready == 0) {
     WARN("NET/IB: ncclIbIrecv() called when comm->base.ready == 0");
     *request = NULL;
@@ -2787,26 +2855,26 @@ ncclResult_t ncclIbIrecv(void* recvComm, int n, void** data, size_t* sizes, int*
   return ncclSuccess;
 
 fail:
-  ncclIbStatsFatalError(&comm->base.stats);
-  if (req) {
-    // If events were added (IBV ops posted), we can't free immediately -
-    // completions may still arrive. Mark as failed and return it so
-    // caller can call Test to drain completions.
-    if (ncclIbRequestHasEvents(req)) {
-      req->type = NCCL_NET_IB_REQ_FAILED;
-      *request = req;
-      // Return success so caller proceeds to call Test() which will drain
-      // completions. The fatal error is recorded in stats and will be
-      // caught on the next operation.
-      return ncclSuccess;
-    } else {
-      ncclIbFreeRequest(req);
-      *request = NULL;
-    }
+ncclIbStatsFatalError(&comm->base.stats);
+if (req) {
+  // If events were added (IBV ops posted), we can't free immediately -
+  // completions may still arrive. Mark as failed and return it so
+  // caller can call Test to drain completions.
+  if (ncclIbRequestHasEvents(req)) {
+    req->type = NCCL_NET_IB_REQ_FAILED;
+    *request = req;
+    // Return success so caller proceeds to call Test() which will drain
+    // completions. The fatal error is recorded in stats and will be
+    // caught on the next operation.
+    return ncclSuccess;
   } else {
+    ncclIbFreeRequest(req);
     *request = NULL;
   }
-  return ret;
+} else {
+  *request = NULL;
+}
+return ret;
 }
 
 ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request) {
@@ -2815,7 +2883,6 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
   struct ncclIbRequest* req = NULL;
   struct ncclIbMrHandle* mhandle = NULL;
   int last = -1;
-
   for (int i=0; i<n; i++) if (sizes[i]) last = i;
   if (comm->flushEnabled == 0 || last == -1) return ncclSuccess;
 
@@ -2871,26 +2938,26 @@ ncclResult_t ncclIbIflush(void* recvComm, int n, void** data, int* sizes, void**
   return ncclSuccess;
 
 fail:
-  ncclIbStatsFatalError(&comm->base.stats);
-  if (req) {
-    // If events were added (IBV ops posted), we can't free immediately -
-    // completions may still arrive. Mark as failed and return it so
-    // caller can call Test to drain completions.
-    if (ncclIbRequestHasEvents(req)) {
-      req->type = NCCL_NET_IB_REQ_FAILED;
-      *request = req;
-      // Return success so caller proceeds to call Test() which will drain
-      // completions. The fatal error is recorded in stats and will be
-      // caught on the next operation.
-      return ncclSuccess;
-    } else {
-      ncclIbFreeRequest(req);
-      *request = NULL;
-    }
+ncclIbStatsFatalError(&comm->base.stats);
+if (req) {
+  // If events were added (IBV ops posted), we can't free immediately -
+  // completions may still arrive. Mark as failed and return it so
+  // caller can call Test to drain completions.
+  if (ncclIbRequestHasEvents(req)) {
+    req->type = NCCL_NET_IB_REQ_FAILED;
+    *request = req;
+    // Return success so caller proceeds to call Test() which will drain
+    // completions. The fatal error is recorded in stats and will be
+    // caught on the next operation.
+    return ncclSuccess;
   } else {
+    ncclIbFreeRequest(req);
     *request = NULL;
   }
-  return ret;
+} else {
+  *request = NULL;
+}
+return ret;
 }
 
 #define HCA_NAME(req, index) ((req)->devBases[(index)]->pd->context->device->name)
@@ -2907,15 +2974,15 @@ static int getReqQpIndex(struct ncclIbRequest* req, int request, int qpNumber) {
 
 ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
   struct ncclIbRequest *r = (struct ncclIbRequest*)request;
+  *done = 0;
   ncclResult_t ret = ncclSuccess;
   int failDevIdx = -1;
-  *done = 0;
   while (1) {
     NCCLCHECKGOTO(ncclIbStatsCheckFatalCount(&r->base->stats,__func__), ret, fail);
     if (r->events[0] == 0 && r->events[1] == 0 && r->events[2] == 0 && r->events[3] == 0) {
       TRACE(NCCL_NET, "r=%p done", r);
       *done = 1;
-      // If this was a failed request, we were just draining completions.
+            // If this was a failed request, we were just draining completions.
       // Now that all events are done, free and return success.
       if (r->type == NCCL_NET_IB_REQ_FAILED) {
         NCCLCHECK(ncclIbFreeRequest(r));
@@ -2977,12 +3044,17 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
 
             char line[SOCKET_NAME_MAXLEN+1];
             char *hcaName = r->devBases[i]->pd->context->device->name;
-            WARN("NET/IB: Got completion from peer %s with status=%d opcode=%d len=%u vendor err %u (%s)%s%s%s%s hca %s",
-                ncclSocketToString(&addr, line), wc->status, wc->opcode, wc->byte_len, wc->vendor_err, reqTypeStr[r->type],
+            int reqSize = wc->byte_len;
+            struct ncclIbRequest* req = r->base->reqs+(wc->wr_id & 0xff);
+            if (req && req->type == NCCL_NET_IB_REQ_SEND) {
+              // For Send use the request size as WC byte_len is not reliable
+              reqSize = req->send.size;
+            }
+            WARN("NET/IB: Got completion from peer %s with status=%s(%d) opcode=%s(%d) reqSize=%d vendor_err=%u req_type=%s%s%s%s%s hca %s",
+                ncclSocketToString(&addr, line), ibvWcStatusStr(wc->status), wc->status,
+                ibvWcOpcodeStr(wc->opcode), wc->opcode, reqSize, wc->vendor_err, reqTypeStr[r->type],
                 localGidStr ?  " localGid ":"", localGidString, remoteGidStr ? " remoteGids":"", remoteGidString, hcaName);
-            ret = ncclRemoteError;
-            failDevIdx = i;
-            goto fail;
+            return ncclRemoteError;
           }
 
           union ncclSocketAddress addr;
@@ -2999,9 +3071,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
               struct ncclIbRequest* sendReq = r->base->reqs+((wc->wr_id >> (j*8)) & 0xff);
               if ((sendReq->events[i] <= 0)) {
                 WARN("NET/IB: sendReq(%p)->events={%d,%d,%d,%d}, i=%d, j=%d <= 0", sendReq, sendReq->events[0], sendReq->events[1], sendReq->events[2], sendReq->events[3], i, j);
-                ret = ncclInternalError;
-                failDevIdx = i;
-                goto fail;
+                return ncclInternalError;
               }
               sendReq->events[i]--;
 #ifdef NCCL_ENABLE_NET_PROFILING
@@ -3014,9 +3084,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
             if (req && wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) {
               if (req->type != NCCL_NET_IB_REQ_RECV) {
                 WARN("NET/IB: wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM and req->type=%d", req->type);
-                ret = ncclInternalError;
-                failDevIdx = i;
-                goto fail;
+                return ncclInternalError;
               }
               if (req->nreqs == 1) {
                 req->recv.sizes[0] = wc->imm_data;
@@ -3045,8 +3113,7 @@ ncclResult_t ncclIbTest(void* request, int* done, int* sizes) {
     // If no CQEs found on any device, return and come back later
     if (totalWrDone == 0) return ncclSuccess;
   }
-
-fail:
+  fail:
   // Mark connection and device as fatal
   ncclIbStatsFatalError(&r->base->stats);
   if (failDevIdx >= 0 && r->devBases[failDevIdx] != NULL) {
@@ -3070,8 +3137,11 @@ ncclResult_t ncclIbCloseSend(void* sendComm) {
       struct ncclIbSendCommDev* commDev = comm->devs + i;
       if (commDev->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(commDev->fifoMr));
       if (comm->remSizesFifo.mrs[i] != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remSizesFifo.mrs[i]));
+      if (commDev->putSignalScratchpadMr != NULL)
+        NCCLCHECK(wrap_ibv_dereg_mr(commDev->putSignalScratchpadMr));
       NCCLCHECK(ncclIbDestroyBase(&commDev->base));
     }
+
     free(comm);
   }
   TIME_PRINT("IB");
@@ -3118,8 +3188,8 @@ ncclResult_t ncclIbCloseListen(void* listenComm) {
 }
 
 ncclResult_t ncclIbFinalize(void* ctx) {
-  netRefCount--;
-  return ncclSuccess;
+  free(ctx);  // ctx is the ncclNetCommConfig_t that ncclIbInit allocated and returned through *ctx
+  return ncclIbFinalizeDevices();  // then drop this communicator's device reference
 }
 
 ncclResult_t rcclNetP2pPolicy(void* handle, int isP2p) {
@@ -3155,7 +3225,542 @@ ncclNet_t ncclNetIb = {
   ncclIbSetNetAttr,
 };
 
-/*
-  ncclIbSetProperties,
-  ncclIbRefreshDevices
-*/
+
+
+/// GIN IB Plugin
+
+#include "gin/gin_host.h"
+#include "net_ib_gin.h"
+
+const int NCCL_GIN_IB_ALLGATHER_TAG = 0xa0;  // tag passed to isend/irecv by ncclGinIbAllGather
+const int NCCL_GIN_IB_ALLTOALL_TAG = 0xa1;  // NOTE(review): no use visible in this part of the file - confirm it is still needed
+
+ncclResult_t ncclGinIbInit(void** ctx, uint64_t commId, ncclDebugLogger_t logFunction) {
+  ncclNetCommConfig_t* netCommConfig = nullptr;  // handed back as *ctx; freed by ncclGinIbFinalize()
+  NCCLCHECK(ncclIbInitDevices(logFunction, nullptr));  // takes one device reference; GIN passes no profiler callback
+  ncclResult_t ret = ncclCalloc(&netCommConfig, 1);
+  if (ret == ncclSuccess) *ctx = netCommConfig; else (void)ncclIbFinalizeDevices();  // allocation failed: give the reference back
+  return ret;
+}
+
+ncclResult_t ncclGinIbFinalize(void *ctx) {
+  free(ctx);  // free(NULL) is a no-op, so no explicit null test is needed
+  return ncclIbFinalizeDevices();
+}
+
+static ncclResult_t ncclGinIbAllGather(struct ncclGinIbCollComm *cComm, void *srcBuf, void *recvBuf, size_t len) {
+  ncclResult_t status = ncclSuccess;
+  void *rMhandle = NULL, *sMhandle = NULL;
+  void *srequest = NULL, *rrequest = NULL;
+  int speer;  // slot of recvBuf that is sent in the current step
+  int rpeer;  // slot of recvBuf that is received in the current step
+  void *rbuf;
+  int tag;
+  int done;
+  // All-gather of len bytes per rank into recvBuf (nranks * len bytes) over cComm->sendComm / cComm->recvComm.
+  NCCLCHECKGOTO(ncclNetIb.regMr(cComm->recvComm, recvBuf,
+                                cComm->nranks * len, NCCL_PTR_HOST,
+                                &rMhandle),
+                status, out);
+  NCCLCHECKGOTO(ncclNetIb.regMr(cComm->sendComm, recvBuf,
+                                cComm->nranks * len, NCCL_PTR_HOST,
+                                &sMhandle),
+                status, out);
+  // Seed our own slot; each of the nranks-1 steps then sends slot speer and receives slot speer-1 (mod nranks).
+  speer = cComm->rank;
+  memcpy((void *)((uintptr_t)recvBuf + speer * len), srcBuf, len);
+  for (int i = 0; i < cComm->nranks - 1; i++) {
+    rpeer = (speer - 1 + cComm->nranks) % cComm->nranks;
+    while (srequest == NULL || rrequest == NULL) {  // loop until both calls have produced a request
+      rbuf = (void *)((uintptr_t)recvBuf + rpeer * len);
+      tag = NCCL_GIN_IB_ALLGATHER_TAG;
+      if (srequest == NULL)
+        NCCLCHECKGOTO(ncclNetIb.isend(cComm->sendComm,
+                                      (void *)((uintptr_t)recvBuf + speer * len),
+                                      len, tag, sMhandle, NULL, &srequest),
+                      status, out);
+      if (rrequest == NULL)
+        NCCLCHECKGOTO(ncclNetIb.irecv(cComm->recvComm, 1, &rbuf, &len,
+                                      &tag, &rMhandle, NULL, &rrequest),
+                      status, out);
+    }
+    while (srequest || rrequest) {  // poll until both requests report done
+      if (rrequest)
+        NCCLCHECKGOTO(ncclNetIb.test(rrequest, &done, NULL),
+                      status, out);
+      if (done)
+        rrequest = NULL;
+      if (srequest)
+        NCCLCHECKGOTO(ncclNetIb.test(srequest, &done, NULL),
+                      status, out);
+      if (done)
+        srequest = NULL;
+    }
+    speer = rpeer;  // the slot just received is the one forwarded in the next step
+  }
+  // NOTE(review): on an error exit, requests that are still in flight are not drained before deregistration.
+out:
+  if (rMhandle)
+    ncclNetIb.deregMr(cComm->recvComm, rMhandle);
+
+  if (sMhandle)
+    ncclNetIb.deregMr(cComm->sendComm, sMhandle);
+
+  return status;
+}
+
+static ncclResult_t ncclGinIbAllToAll(struct ncclGinIbCollComm *cComm, void *src_buf, void *recv_buf, size_t len) {
+  ncclResult_t status = ncclSuccess;
+  // All-to-all built on allGather: gather every rank's nranks*len send buffer, then keep the slice meant for us.
+  // Rank count/index are widened once so every size and offset below is computed in size_t rather than int.
+  const size_t nranks = (size_t)cComm->nranks, rank = (size_t)cComm->rank;
+  void *tmp_buf = nullptr;
+  NCCLCHECK(ncclIbMalloc((void **)&tmp_buf, nranks * nranks * len));
+  NCCLCHECKGOTO(cComm->allGather(cComm, src_buf, tmp_buf, nranks * len), status, out);
+
+  for (size_t i = 0; i < nranks; i++) {
+    memcpy((char *)recv_buf + i * len, (char *)tmp_buf + i * nranks * len + rank * len, len);
+  }
+
+out:
+  free(tmp_buf);
+  return status;
+}
+
+ncclResult_t ncclGinIbP2PBarrier(struct ncclGinIbCollComm *cComm) {
+  // Barrier = all-gather of one int per rank. TODO: move allocation to init or use zero-byte allgather
+  int token = 0;  // dedicated source word: the gather's first memcpy stays in bounds and never overlaps dummy
+  int *dummy = nullptr;
+  NCCLCHECK(ncclIbMalloc((void **)&dummy, cComm->nranks * sizeof(int)));
+  ncclResult_t ret = ncclGinIbAllGather(cComm, &token, dummy, sizeof(int));
+  free(dummy);  // released on failure as well
+  return ret;
+}
+
+ncclResult_t ncclGinIbConnect(void* ctx, void* handles[], int nranks, int rank, void* listenComm, void** collComm) {
+  struct ncclIbListenComm *lComm = (struct ncclIbListenComm *)listenComm;
+  struct ncclGinIbCollComm *cComm = nullptr;
+  int next;
+  // NOTE(review): any NCCLCHECK failure below returns without releasing cComm or its arrays - confirm callers tolerate this.
+  NCCLCHECK(ncclIbMalloc((void **)&cComm, sizeof(*cComm)));
+  NCCLCHECK(ncclIbMalloc((void**)&cComm->fullSendComm, sizeof(void *) * nranks));
+  NCCLCHECK(ncclIbMalloc((void**)&cComm->fullRecvComm, sizeof(void *) * nranks));
+  // NOTE(review): the == NULL tests below read freshly allocated memory; presumably ncclIbMalloc zero-fills - confirm.
+  cComm->nranks = nranks;
+  cComm->rank = rank;
+  // Step 1: one send comm to rank+1 and one accepted recv comm; ncclGinIbAllGather runs over this pair.
+  next = (cComm->rank + 1) % nranks;
+  do
+  {
+    if (cComm->sendComm == NULL) {
+      NCCLCHECK(ncclNetIb.connect(ctx, lComm->dev, handles[next], &cComm->sendComm, NULL));
+    }
+    if (cComm->recvComm == NULL)
+      NCCLCHECK(ncclNetIb.accept(lComm, &cComm->recvComm, NULL));
+  } while (cComm->sendComm == NULL || cComm->recvComm == NULL);
+  // Callbacks stored on the comm so code holding only a ncclGinIbCollComm can reach these IB helpers.
+  cComm->getProperties = (ncclResult_t(*)(int dev, void *props))ncclIbGetProperties;
+  cComm->allGather = ncclGinIbAllGather;
+  cComm->allToAll = ncclGinIbAllToAll;
+  cComm->getGidIndex = ncclIbGetGidIndex;
+  cComm->dev = lComm->dev;
+  // Step 2: full mesh. At distance i connect to rank+i and accept for slot rank-i (i == 0 is this rank itself).
+  for (int i = 0; i < nranks; i++)
+  {
+    int connectPeer = (cComm->rank + i) % nranks;
+    int acceptPeer = (cComm->rank - i + nranks) % nranks;
+    do
+    {
+      if (cComm->fullSendComm[connectPeer] == NULL)
+        NCCLCHECK(ncclNetIb.connect(ctx, lComm->dev, handles[connectPeer], &cComm->fullSendComm[connectPeer], NULL));
+      if (cComm->fullRecvComm[acceptPeer] == NULL)
+        NCCLCHECK(ncclNetIb.accept(lComm, &cComm->fullRecvComm[acceptPeer], NULL));
+    } while ((cComm->fullSendComm[connectPeer] == NULL) || (cComm->fullRecvComm[acceptPeer] == NULL));
+    NCCLCHECK(ncclGinIbP2PBarrier(cComm));  // every rank finishes distance i before any starts distance i+1
+  }
+
+  *collComm = cComm;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinIbCloseColl(void* collComm) {
+  struct ncclGinIbCollComm* coll = static_cast<struct ncclGinIbCollComm*>(collComm);
+  if (coll == nullptr) return ncclSuccess;
+  // Per-peer receive comms first, then the array that held them.
+  if (coll->fullRecvComm != nullptr) {
+    for (int peer = 0; peer < coll->nranks; ++peer) {
+      NCCLCHECK(ncclNetIb.closeRecv(coll->fullRecvComm[peer]));
+    }
+    free(coll->fullRecvComm);
+    coll->fullRecvComm = nullptr;
+  }
+  // Same for the per-peer send comms.
+  if (coll->fullSendComm != nullptr) {
+    for (int peer = 0; peer < coll->nranks; ++peer) {
+      NCCLCHECK(ncclNetIb.closeSend(coll->fullSendComm[peer]));
+    }
+    free(coll->fullSendComm);
+    coll->fullSendComm = nullptr;
+  }
+  // Finally the single recv/send pair that ncclGinIbAllGather runs over.
+  if (coll->recvComm != nullptr) {
+    NCCLCHECK(ncclNetIb.closeRecv(coll->recvComm));
+    coll->recvComm = nullptr;
+  }
+
+  if (coll->sendComm != nullptr) {
+    NCCLCHECK(ncclNetIb.closeSend(coll->sendComm));
+    coll->sendComm = nullptr;
+  }
+  // Zero the whole struct (callbacks included) before handing the memory back.
+  memset(coll, 0, sizeof(*coll));
+
+  free(coll);
+  return ncclSuccess;
+}
+
+#include "gdaki/gin_host_gdaki.h"
+
+static std::mutex ncclGinIbGdakiLockMutex;  // taken by every ncclGinIbGdaki* function that touches the two variables below
+static int ncclGinIbGdakiNDevs = -1;  // number of MLX5 devices found; -1 until ncclGinIbGdakiInit has built the table
+int ncclGinIbGdakiDevIndexes[MAX_IB_DEVS];  // GDAKI device number -> index into ncclIbDevs
+
+ncclResult_t ncclGinIbGdakiInit(void** ctx, uint64_t commId, ncclDebugLogger_t logFunction) {
+  // Shared GIN/IB init first; the MLX5 device table is then built by the first caller only.
+  NCCLCHECK(ncclGinIbInit(ctx, commId, logFunction));
+  std::lock_guard<std::mutex> guard(ncclGinIbGdakiLockMutex);
+  if (ncclGinIbGdakiNDevs != -1) return ncclSuccess;  // table already populated by an earlier call
+
+  int nMlx5 = 0;
+  for (int ibDev = 0; ibDev < ncclNIbDevs; ++ibDev) {
+    if (ncclIbDevs[ibDev].ibProvider != IB_PROVIDER_MLX5) continue;
+    // Keep the ncclIbDevs index of every MLX5 device, in ncclIbDevs order.
+    ncclGinIbGdakiDevIndexes[nMlx5++] = ibDev;
+  }
+  ncclGinIbGdakiNDevs = nMlx5;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinIbGdakiDevices(int* ndev) {
+  std::lock_guard<std::mutex> lock(ncclGinIbGdakiLockMutex);
+  *ndev = ncclGinIbGdakiNDevs;  // count set by ncclGinIbGdakiInit; still -1 if that has not run
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinIbGdakiGetProperties(int dev, ncclNetProperties_t* props) {
+  std::lock_guard<std::mutex> lock(ncclGinIbGdakiLockMutex);
+  NCCLCHECK(ncclNetIb.getProperties(ncclGinIbGdakiDevIndexes[dev], props));  // NOTE(review): dev is not range-checked against ncclGinIbGdakiNDevs
+  props->netDeviceType = NCCL_NET_DEVICE_GIN_GDAKI;  // overrides the device type reported by the IB plugin
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinIbGdakiListen(void* ctx, int dev, void* opaqueHandle, void** listenComm) {
+  std::lock_guard<std::mutex> lock(ncclGinIbGdakiLockMutex);
+  return ncclNetIb.listen(ctx, ncclGinIbGdakiDevIndexes[dev], opaqueHandle, listenComm);  // GDAKI device number translated through the MLX5 table; dev is not range-checked
+}
+
+ncclResult_t ncclGinIbGdakiCreateContext(void* collComm, int nSignals, int nCounters, void **ginCtx, ncclNetDeviceHandle_v11_t** devHandle) {
+  // Thin adapter: recover the typed collective comm and delegate to ncclGinGdakiCreateContext.
+  struct ncclGinIbCollComm* typedComm = static_cast<struct ncclGinIbCollComm*>(collComm);
+  NCCLCHECK(ncclGinGdakiCreateContext(typedComm, nSignals, nCounters, ginCtx, devHandle));
+
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinIbGdakiRegMrSym(void* collComm, void* data, size_t size, int type, uint64_t mr_flags, void** mhandle, void **ginHandle) {
+  return ncclGinGdakiRegMrSym((struct ncclGinIbCollComm *)collComm, data, size, type, mhandle, ginHandle);  // pass-through; mr_flags is not forwarded
+}
+
+ncclResult_t ncclGinIbGdakiDeregMrSym(void* collComm, void* mhandle) {
+  return ncclGinGdakiDeregMrSym((struct ncclGinIbCollComm *)collComm, mhandle);  // pass-through with the comm cast to its concrete type
+}
+
+ncclResult_t ncclGinIbGdakiDestroyContext(void* ginCtx) {
+  return ncclGinGdakiDestroyContext(ginCtx);  // pass-through to the GDAKI host implementation
+}
+
+ncclResult_t ncclGinIbGdakiProgress(void *collComm)
+{
+  return ncclGinGdakiProgress(collComm);  // pass-through to the GDAKI host implementation
+}
+
+ncclResult_t ncclGinIbGdakiQueryLastError(void *ginCtx, bool *hasError) {
+  return ncclGinGdakiQueryLastError(ginCtx, hasError);  // pass-through to the GDAKI host implementation
+}
+
+ncclGin_t ncclGinIbGdaki = {  // positional initializer: order must match the ncclGin_t field order
+  "GIN_IB_GDAKI",  // plugin name
+  ncclGinIbGdakiInit,
+  ncclGinIbGdakiDevices,
+  ncclGinIbGdakiGetProperties,
+  ncclGinIbGdakiListen,
+  ncclGinIbConnect,  // shared with ncclGinIbProxy
+  ncclGinIbGdakiCreateContext,
+  ncclGinIbGdakiRegMrSym,
+  NULL, // regMrSymDmaBuf
+  ncclGinIbGdakiDeregMrSym,
+  ncclGinIbGdakiDestroyContext,
+  ncclGinIbCloseColl,  // shared with ncclGinIbProxy
+  ncclIbCloseListen,
+  NULL,  // slot that ncclGinIbProxy fills with ncclGinIbProxyIPut
+  NULL,  // slot that ncclGinIbProxy fills with ncclGinIbProxyIPutSignal
+  NULL,  // slot that ncclGinIbProxy fills with ncclGinIbProxyTest
+  ncclGinIbGdakiProgress,
+  ncclGinIbGdakiQueryLastError,
+  ncclGinIbFinalize
+};
+
+
+struct ncclIbGinProxyMrHandle {  // handle returned by ncclGinIbProxyRegMrSymDmaBuf as both mhandle and ginHandle
+  struct ncclIbMrHandle *mrHandle;  // local IB registration (from ncclIbRegMrDmaBuf)
+  uintptr_t *base_vas;  // nranks entries: each rank's registered base address, all-gathered at registration
+  uint32_t *rkeys;  // nranks entries: each rank's mrs[0] rkey, all-gathered at registration
+};
+
+ncclResult_t ncclGinIbProxyGetProperties(int dev, ncclNetProperties_t* props) {
+  NCCLCHECK(ncclNetIb.getProperties(dev, props));  // start from the IB plugin's properties for the same device number
+  props->netDeviceType = NCCL_NET_DEVICE_GIN_PROXY;  // then override the reported device type
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinIbProxyRegMrSymDmaBuf(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, uint64_t mr_flags, void** mhandle, void **ginHandle) {
+  struct ncclGinIbCollComm *cComm = (struct ncclGinIbCollComm *)collComm;
+  struct ncclIbGinProxyMrHandle *ginMrHandle = nullptr; struct ncclIbMrHandle *mrHandle = nullptr;
+  ncclResult_t ret = ncclSuccess;
+  NCCLCHECKNOWARN(ncclIbRegMrDmaBuf(cComm->recvComm, data, size, type, offset, fd, (void**)&mrHandle), NCCL_NET);  // registered first: nothing is owned yet, so this early return cannot leak
+  NCCLCHECKGOTO(ncclCalloc(&ginMrHandle, 1), ret, fail);  // from here on every failure unwinds through fail:
+  ginMrHandle->mrHandle = mrHandle;
+  NCCLCHECKGOTO(ncclCalloc(&ginMrHandle->base_vas, cComm->nranks), ret, fail);
+  NCCLCHECKGOTO(ncclCalloc(&ginMrHandle->rkeys, cComm->nranks), ret, fail);
+  NCCLCHECKGOTO(cComm->allGather(cComm, &data, ginMrHandle->base_vas, sizeof(uintptr_t)), ret, fail);  // every rank's base address
+  NCCLCHECKGOTO(cComm->allGather(cComm, &mrHandle->mrs[0]->rkey, ginMrHandle->rkeys, sizeof(uint32_t)), ret, fail);  // every rank's rkey
+  *mhandle = *ginHandle = ginMrHandle;  // the same object serves as both handles
+  return ncclSuccess;
+fail:
+  (void)ncclNetIb.deregMr(cComm->recvComm, mrHandle);  // same call ncclGinIbProxyDeregMrSym uses
+  if (ginMrHandle) { free(ginMrHandle->base_vas); free(ginMrHandle->rkeys); free(ginMrHandle); }
+  return ret;
+}
+
+ncclResult_t ncclGinIbProxyRegMrSym(void* collComm, void* data, size_t size, int type, uint64_t mr_flags, void** mhandle, void **ginHandle) {
+  return ncclGinIbProxyRegMrSymDmaBuf(collComm, data, size, type, 0, -1, mr_flags, mhandle, ginHandle);  // same path with offset 0 and fd -1
+}
+
+ncclResult_t ncclGinIbProxyDeregMrSym(void* collComm, void* mhandle) {
+  // Undo ncclGinIbProxyRegMrSymDmaBuf: drop the IB registration, then the gathered per-rank tables.
+  auto* coll = static_cast<struct ncclGinIbCollComm*>(collComm);
+  auto* handle = static_cast<struct ncclIbGinProxyMrHandle*>(mhandle);
+  NCCLCHECK(ncclNetIb.deregMr(coll->recvComm, handle->mrHandle));
+  free(handle->rkeys);
+  free(handle->base_vas);
+  free(handle);
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinIbProxyCloseColl(void* collComm) {  // NOTE(review): not referenced by the ncclGinIbProxy table, which installs ncclGinIbCloseColl
+  free(collComm);  // releases only the struct itself; the comms it points to are not closed here
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinIbProxyIPut(void *collComm, uint64_t srcOff, void *srcMhandle, size_t size,
+                                uint64_t dstOff, void *dstMhandle, uint32_t rank, void **request)
+{
+  struct ncclGinIbCollComm* cComm = (struct ncclGinIbCollComm*)collComm;
+  // Posts one signaled RDMA write of `size` bytes to `rank`; *request is later polled with ncclGinIbProxyTest.
+  struct ncclIbGinProxyMrHandle *srcMrHandle = (struct ncclIbGinProxyMrHandle *)srcMhandle;
+  struct ncclIbGinProxyMrHandle *dstMrHandle = (struct ncclIbGinProxyMrHandle *)dstMhandle;
+  // Addresses are base + offset, using the per-rank bases and rkeys gathered at registration time.
+  void *srcPtr = (void *)(srcMrHandle->base_vas[cComm->rank] + srcOff);
+  void *dstPtr = (void *)(dstMrHandle->base_vas[rank] + dstOff);
+  uint32_t lkey = srcMrHandle->mrHandle->mrs[0]->lkey;
+  uint32_t rkey = dstMrHandle->rkeys[rank];
+
+  struct ncclIbSendComm* comm = (struct ncclIbSendComm*)cComm->fullSendComm[rank];
+  struct ncclIbQp *qp = &comm->base.qps[0];
+
+  struct ncclIbRequest* req;
+  NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
+  req->type = NCCL_NET_IB_REQ_GIN_IPUT;
+  req->sock = &comm->base.sock;
+  req->iput.rank = rank;
+  for (int i = 0; i < comm->base.vProps.ndevs; i++) {
+    req->devBases[i] = &comm->devs[i].base;
+  }
+
+  struct ibv_send_wr wr;
+  memset(&wr, 0, sizeof(wr));
+  struct ibv_sge sge;
+  memset(&sge, 0, sizeof(sge));
+
+  wr.opcode                  = IBV_WR_RDMA_WRITE;
+  wr.send_flags              = IBV_SEND_SIGNALED;
+  wr.wr_id                   = req - comm->base.reqs;  // request index; ncclGinIbProxyTest maps the CQE back with it
+  wr.next                    = NULL;
+  wr.wr.rdma.remote_addr     = (uint64_t)dstPtr;
+  wr.wr.rdma.rkey            = rkey;
+  wr.sg_list = &sge;
+  wr.num_sge = 1;
+
+  sge.addr = (uintptr_t)srcPtr;  // Local buffer address
+  sge.length = size;  // Size of the transfer
+  sge.lkey = lkey;  // Local key
+
+  struct ibv_send_wr* bad_wr = NULL;
+  ncclResult_t ret = wrap_ibv_post_send(qp->qp, &wr, &bad_wr);
+  if (ret != ncclSuccess) { (void)ncclIbFreeRequest(req); return ret; }  // nothing was posted: return the request slot to the pool
+  ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base);
+  *request = req;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinIbProxyIPutSignal(void *collComm, uint64_t srcOff, void *srcMhandle,
+                                      size_t size, uint64_t dstOff, void *dstMhandle,
+                                      uint32_t rank, uint64_t signalOff, void *signalMhandle,
+                                      uint64_t signalValue, uint32_t signalOp, void **request)
+{
+  if (signalOp != NCCL_NET_SIGNAL_OP_INC && signalOp != NCCL_NET_SIGNAL_OP_ADD) {
+    WARN("ncclGinIbProxyIPutSignal: Unsupported signalOp %u", signalOp);
+    return ncclInvalidArgument;
+  }
+  // Optional RDMA write followed by an atomic fetch-and-add on the peer's signal word, posted as one chain.
+  struct ncclGinIbCollComm* cComm = (struct ncclGinIbCollComm*)collComm;
+
+  struct ncclIbGinProxyMrHandle *srcMrHandle = (struct ncclIbGinProxyMrHandle *)srcMhandle;
+  struct ncclIbGinProxyMrHandle *dstMrHandle = (struct ncclIbGinProxyMrHandle *)dstMhandle;
+  struct ncclIbGinProxyMrHandle *signalMrHandle = (struct ncclIbGinProxyMrHandle *)signalMhandle;
+  const bool doPut = (size > 0 && dstMrHandle != NULL);  // one predicate for both building the chain and choosing its head
+  struct ncclIbSendComm* comm = (struct ncclIbSendComm*)cComm->fullSendComm[rank];
+  struct ncclIbQp *qp = &comm->base.qps[0];
+  int devIndex = qp->devIndex;
+
+  struct ncclIbRequest* req;
+  NCCLCHECK(ncclIbGetRequest(&comm->base, &req));
+  req->type = NCCL_NET_IB_REQ_GIN_IPUT;
+  req->sock = &comm->base.sock;
+  req->iput.rank = rank;
+  for (int i = 0; i < comm->base.vProps.ndevs; i++) {
+    req->devBases[i] = &comm->devs[i].base;
+  }
+
+  struct ibv_send_wr wr[2];
+  memset(&wr, 0, sizeof(wr));
+  struct ibv_sge sge[2];
+  memset(&sge, 0, sizeof(sge));
+
+  // The PUT is chained in only when there is payload and a destination handle; otherwise only the signal is posted.
+  if (doPut) {
+    void *srcPtr = (void *)(srcMrHandle->base_vas[cComm->rank] + srcOff);
+    void *dstPtr = (void *)(dstMrHandle->base_vas[rank] + dstOff);
+    uint32_t lkey = srcMrHandle->mrHandle->mrs[0]->lkey;
+    uint32_t rkey = dstMrHandle->rkeys[rank];
+
+    // PUT
+    wr[0].opcode                  = IBV_WR_RDMA_WRITE;
+    wr[0].send_flags              = 0; // We only need the CQE from the signal
+    wr[0].wr_id                   = req - comm->base.reqs;
+    wr[0].next                    = &wr[1];
+    wr[0].wr.rdma.remote_addr     = (uint64_t)dstPtr;
+    wr[0].wr.rdma.rkey            = rkey;
+    wr[0].sg_list = &sge[0];
+    wr[0].num_sge = 1;
+
+    sge[0].addr = (uintptr_t)srcPtr;  // Local buffer address
+    sge[0].length = size;  // Size of the transfer
+    sge[0].lkey = lkey;  // Local key
+  }
+
+  void *signalPtr = (void *)(signalMrHandle->base_vas[rank] + signalOff);
+  uint32_t signalRkey = signalMrHandle->rkeys[rank];
+
+  // SIGNAL
+  wr[1].opcode                  = IBV_WR_ATOMIC_FETCH_AND_ADD;
+  wr[1].send_flags              = IBV_SEND_SIGNALED;
+  wr[1].wr_id                   = req - comm->base.reqs;  // used for matching completions with request
+  wr[1].next                    = NULL;
+  wr[1].wr.atomic.remote_addr   = (uint64_t)signalPtr;
+  wr[1].wr.atomic.compare_add   = signalOp == NCCL_NET_SIGNAL_OP_INC ? 1 : signalValue;
+  wr[1].wr.atomic.rkey          = signalRkey;
+  wr[1].sg_list = &sge[1];
+  wr[1].num_sge = 1;
+
+  sge[1].addr = (uintptr_t)&comm->putSignalScratchpad;  // the fetched old value lands here and is not read back
+  sge[1].length = sizeof(comm->putSignalScratchpad);
+  sge[1].lkey = comm->devs[devIndex].putSignalScratchpadMr->lkey;
+  // Send the put and the signal in one go; the chain starts at the PUT only if one was built above
+  struct ibv_send_wr* bad_wr = NULL;
+  ncclResult_t ret = wrap_ibv_post_send(qp->qp, doPut ? &wr[0] : &wr[1], &bad_wr);
+  if (ret != ncclSuccess) { (void)ncclIbFreeRequest(req); return ret; }  // nothing was posted: return the request slot to the pool
+  ncclIbAddEvent(req, qp->devIndex, &comm->devs[qp->devIndex].base);
+  *request = req;
+  return ncclSuccess;
+}
+
+ncclResult_t ncclGinIbProxyTest(void *collComm, void *request, int *done) {
+  struct ncclGinIbCollComm* cComm = (struct ncclGinIbCollComm*)collComm;
+  struct ncclIbRequest* req = (struct ncclIbRequest*)request;
+  int rank = req->iput.rank;
+  *done = 0;
+  // Already complete (its CQE was consumed while another request was being tested): just release it.
+  if (req->events[0] == 0) {
+    *done = 1;
+    NCCLCHECK(ncclIbFreeRequest(req));
+    return ncclSuccess;
+  }
+  int wrDone = 0;
+  struct ibv_wc wc[4];
+  // Only device 0's completion queue is polled, so all per-device lookups below use index 0.
+  struct ncclIbSendComm* comm = (struct ncclIbSendComm*)cComm->fullSendComm[rank];
+  NCCLCHECK(wrap_ibv_poll_cq(comm->devs[0].base.cq, 4, wc, &wrDone));
+  for (int i = 0; i < wrDone; i++) {
+    if (wc[i].status != IBV_WC_SUCCESS) {
+      union ncclSocketAddress addr;
+      ncclSocketGetAddr(req->sock, &addr);
+      char localGidString[INET6_ADDRSTRLEN] = "";
+      char remoteGidString[INET6_ADDRSTRLEN] = "";
+      const char* localGidStr = NULL, *remoteGidStr = NULL;
+      if (req->devBases[0]->gidInfo.link_layer == IBV_LINK_LAYER_ETHERNET) {  // i indexes wc[], not devices
+        localGidStr = ibvGetGidStr(&req->devBases[0]->gidInfo.localGid, localGidString, sizeof(localGidString));
+        remoteGidStr = ibvGetGidStr(&req->base->remDevs[0].remoteGid, remoteGidString, sizeof(remoteGidString));
+      }
+
+      char line[SOCKET_NAME_MAXLEN+1];
+      char *hcaName = req->devBases[0]->pd->context->device->name;
+      WARN("NET/IB/GIN: Got completion from peer %s with status=%d opcode=%d len=%u vendor err %u (%s)%s%s%s%s hca %s",
+          ncclSocketToString(&addr, line), wc[i].status, wc[i].opcode, wc[i].byte_len, wc[i].vendor_err, reqTypeStr[req->type],
+          localGidStr ?  " localGid ":"", localGidString, remoteGidStr ? " remoteGids":"", remoteGidString, hcaName);
+      return ncclRemoteError;
+    }
+    // wr_id carries the request index written by ncclGinIbProxyIPut / ncclGinIbProxyIPutSignal.
+    struct ncclIbRequest* wcReq = comm->base.reqs + wc[i].wr_id;
+    // A completion may belong to a different request; that one is freed when it is tested itself.
+    wcReq->events[0]--;
+    if (wcReq == req && wcReq->events[0] == 0) {
+      *done = 1;
+      NCCLCHECK(ncclIbFreeRequest(wcReq));
+    }
+  }
+  return ncclSuccess;
+}
+
+// No support for NCCL_IB_SPLIT_DATA_ON_QPS or NCCL_IB_MERGE_NICS
+ncclGin_t ncclGinIbProxy = {  // positional initializer: order must match the ncclGin_t field order
+  "GIN_IB_PROXY",  // plugin name
+  ncclGinIbInit,
+  ncclIbDevices,
+  ncclGinIbProxyGetProperties,
+  ncclIbListen,
+  ncclGinIbConnect,  // shared with ncclGinIbGdaki
+  NULL,  // slot that ncclGinIbGdaki fills with ncclGinIbGdakiCreateContext
+  ncclGinIbProxyRegMrSym,
+  ncclGinIbProxyRegMrSymDmaBuf,
+  ncclGinIbProxyDeregMrSym,
+  NULL,  // slot that ncclGinIbGdaki fills with ncclGinIbGdakiDestroyContext
+  ncclGinIbCloseColl,  // shared with ncclGinIbGdaki
+  ncclIbCloseListen,
+  ncclGinIbProxyIPut,
+  ncclGinIbProxyIPutSignal,
+  ncclGinIbProxyTest,
+  NULL,  // slot that ncclGinIbGdaki fills with ncclGinIbGdakiProgress
+  NULL,  // slot that ncclGinIbGdaki fills with ncclGinIbGdakiQueryLastError
+  ncclGinIbFinalize
+};
diff --git a/projects/rccl/src/transport/net_ib_gin.h b/projects/rccl/src/transport/net_ib_gin.h
new file mode 100644
index 00000000000..0bc75c5992d
--- /dev/null
+++ b/projects/rccl/src/transport/net_ib_gin.h
@@ -0,0 +1,29 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_NET_IB_GIN_H_
+#define _NCCL_NET_IB_GIN_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include "nccl.h"
+
+struct ncclGinIbCollComm {
+  int           rank;
+  int           nranks;
+  void*         recvComm;
+  void*         sendComm;
+  void**        fullRecvComm;
+  void**        fullSendComm;
+  int           dev;
+  void*         ginCtx;
+  ncclResult_t (*getProperties)(int dev, void *props);
+  ncclResult_t (*allGather)(struct ncclGinIbCollComm *cComm, void *srcBuf, void *recvBuf, size_t len);
+  ncclResult_t (*allToAll)(struct ncclGinIbCollComm *cComm, void *srcBuf, void *recvBuf, size_t len);
+  ncclResult_t (*getGidIndex)(struct ibv_context *context, uint8_t portNum, struct ibv_port_attr* portAttr, int *gidIndex);
+};
+
+#endif
diff --git a/projects/rccl/src/transport/nvls.cc b/projects/rccl/src/transport/nvls.cc
index fcd7b14d0ec..0be7063b261 100644
--- a/projects/rccl/src/transport/nvls.cc
+++ b/projects/rccl/src/transport/nvls.cc
@@ -25,6 +25,7 @@ struct graphRegData {
 struct localRegData {
   struct ncclReg reg;
   intptr_t offset;
+  int handleTypes;
 };
 
 ncclResult_t nvlsCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
@@ -157,7 +158,7 @@ ncclResult_t ncclNvlsInit(struct ncclComm* comm) {
 
   int gpuCount;
   NCCLCHECK(ncclTopoGetGpuCount(comm->topo, &gpuCount));
-  if (!ncclParamNvlsEnable() || gpuCount <= 2) return ncclSuccess;
+  if (!ncclParamNvlsEnable() || gpuCount < 2) return ncclSuccess;
 
   CUdevice dev;
   int driverVersion;
@@ -542,11 +543,12 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   CUmulticastObjectProp mcprop;
   CUmemAllocationProp ucprop;
   char shareableHandle[NVLS_HANDLE_SIZE];
-  CUmemGenericAllocationHandle mcHandle;
+  CUmemGenericAllocationHandle mcHandle = 0;
   size_t minSize = SIZE_MAX;
   struct localRegData* regData = NULL;
   cudaPointerAttributes attr;
-  size_t ucgran, mcgran, ucsize, mcsize;
+  size_t ucgran, mcgran, ucsize = 0, mcsize = 0;
+  bool bindComplete = false, mapComplete = false;
 
   NCCLCHECKGOTO(ncclCalloc(&regData, comm->localRanks), ret, fail);
 
@@ -594,6 +596,10 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
     if ((regData[i].reg.state & NVLS_REG_POSSIBLE) == 0) {
       goto fail;
     }
+    // We need to check whether the offsets are the same among ranks.
+    if (i > 0 && regData[i].offset != regData[i - 1].offset) {
+      goto fail;
+    }
     /* get minimal reg size of nvls buffers */
     if (minSize > regData[i].reg.regUCSize)
       minSize = regData[i].reg.regUCSize;
@@ -615,33 +621,42 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   }
 
   CUCHECKGOTO(cuMulticastAddDevice(mcHandle, comm->nvlsResources->dev), ret, fail);
+  // intra-node barrier to mitigate the possible hang in cuMulticastBindAddr during abort
+  // It also ensures that if cuMulticastBindAddr fails, the cleanup code won't race with the UDS proxy
+  NCCLCHECKGOTO(bootstrapIntraNodeBarrier(comm->bootstrap, comm->localRankToRank, comm->localRank, comm->localRanks, comm->localRankToRank[0]), ret, fail);
   // Coverity complains that regRecord could be NULL.  That won't in practice be the case because we've already checked
   // (regData[i].reg.state & NVLS_REG_POSSIBLE) of all local ranks, which would catch it and bail out.
   // coverity[var_deref_op]
-  CUCHECKGOTO(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->begAddr, ucsize, 0), ret, fail);
+  CUresult err;
+  err = CUPFN(cuMulticastBindAddr(mcHandle, 0, (CUdeviceptr)regRecord->begAddr, ucsize, 0));
+  if (err != CUDA_SUCCESS) {
+    // Don't print an error in case of buffers that are incompatible with MC.
+    if (err != CUDA_ERROR_INVALID_VALUE) {
+      const char *errStr;
+      CUCALL(cuGetErrorString(err, &errStr));
+      INFO(NCCL_REG, "Failed to multicast-bind user buffer: CUDA error %d '%s'", err, errStr);
+    }
+    goto fail;
+  }
+  bindComplete = true;
 
   // Create a VA for the NVLS
   CUCHECKGOTO(cuMemAddressReserve(&regPtr, mcsize, mcgran, 0U, 0), ret, fail);
   // Map the VA locally
   CUCHECKGOTO(cuMemMap(regPtr, mcsize, 0, mcHandle, 0), ret, fail);
+  mapComplete = true;
   CUCHECKGOTO(cuMemSetAccess(regPtr, mcsize, &comm->nvlsResources->accessDesc, 1), ret, fail);
 
+  /* get all buffer addresses */
+  regRecord->caddrs[comm->localRank] = regRecord->begAddr;
+  NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->caddrs + comm->localRank, regRecord->caddrs, sizeof(uintptr_t)), ret, fail);
+
   regRecord->regAddr = regPtr;
   regRecord->regUCSize = ucsize;
   regRecord->regMCSize = mcsize;
   regRecord->dev = comm->nvlsResources->dev;
   regRecord->mcHandle = mcHandle;
   regRecord->state |= NVLS_REG_COMPLETE;
-  /* get all buffer addresses */
-  regRecord->caddrs[comm->localRank] = regRecord->begAddr;
-  NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regRecord->caddrs + comm->localRank, regRecord->caddrs, sizeof(uintptr_t)), ret, fail);
-
-  /* Although registration is done, we still need to check whether the offsets are same among ranks. */
-  for (int i = 0; i < comm->localRanks - 1; ++i) {
-    if (regData[i].offset != regData[i + 1].offset) {
-      goto fail;
-    }
-  }
 
   *regAddr = (uintptr_t)regPtr + regData[comm->localRank].offset;
   *regUsed = 1;
@@ -649,6 +664,14 @@ ncclResult_t tryRegisterBuffer(struct ncclComm *comm, uintptr_t userBuff, size_t
   free(regData);
   return ret;
 fail:
+  if (regPtr) {
+    if (mapComplete) CUCALL(cuMemUnmap(regPtr, mcsize));
+    CUCALL(cuMemAddressFree(regPtr, mcsize));
+  }
+  if (mcHandle) {
+    if (bindComplete) CUCALL(cuMulticastUnbind(mcHandle, comm->nvlsResources->dev, 0/*mcOffset*/, ucsize));
+    CUCALL(cuMemRelease(mcHandle));
+  }
   *regUsed = 0;
   goto exit;
 }
@@ -667,11 +690,19 @@ static ncclResult_t nvlsRegisterBuffer(struct ncclComm *comm, const void *sendbu
     memcpy(&regData[comm->localRank * 2].reg, sendRegRecord, sizeof(struct ncclReg));
     regData[comm->localRank * 2].offset = (uintptr_t)sendbuff - sendRegRecord->begAddr;
   }
+  if (sendbuff) {
+    CUCHECKGOTO(cuPointerGetAttribute((void*)&regData[comm->localRank * 2].handleTypes,
+                                      CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES, (CUdeviceptr)sendbuff), ret, fail);
+  }
 
   if (recvRegRecord) {
     memcpy(&regData[comm->localRank * 2 + 1].reg, recvRegRecord, sizeof(struct ncclReg));
     regData[comm->localRank * 2 + 1].offset = (uintptr_t)recvbuff - recvRegRecord->begAddr;
   }
+  if (recvbuff) {
+    CUCHECKGOTO(cuPointerGetAttribute((void*)&regData[comm->localRank * 2 + 1].handleTypes,
+                                      CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES, (CUdeviceptr)recvbuff), ret, fail);
+  }
 
   NCCLCHECKGOTO(ncclShmemAllgather(comm, &comm->nvlsResources->nvlsShmem, regData + comm->localRank * 2, regData, sizeof(struct localRegData) * 2), ret, fail);
 
@@ -688,6 +719,11 @@ static ncclResult_t nvlsRegisterBuffer(struct ncclComm *comm, const void *sendbu
     if ((regData[i * 2].reg.state & NVLS_REG_NO_SUPPORT) || (regData[i * 2 + 1].reg.state & NVLS_REG_NO_SUPPORT)) {
       goto fail;
     }
+
+    if ((sendbuff && (regData[i * 2].handleTypes & ncclCuMemHandleType) == 0) ||
+        (recvbuff && (regData[i * 2 + 1].handleTypes & ncclCuMemHandleType) == 0)) {
+      goto fail;
+    }
   }
 
   if (sendNeedReg == false) {
diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc
index 119e5f4977a..36544c95289 100644
--- a/projects/rccl/src/transport/p2p.cc
+++ b/projects/rccl/src/transport/p2p.cc
@@ -138,7 +138,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph
 
   // Check topology / p2p level.
   int intermediateRank;
-  NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank));
+  NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank, NULL));
   if (*ret == 0) return ncclSuccess;
   if (intermediateRank != -1) {
     if (useMemcpy) *ret = 0;
@@ -339,7 +339,7 @@ static ncclResult_t p2pGetInfo(struct ncclComm* comm, struct ncclPeerInfo* info1
   int p2p;
   // Queries the topology to see if the GPUs are Ampere and
   // connected via NVLink, if so we enable P2P Read by default
-  NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, &p2p, read, intermediateRank));
+  NCCLCHECK(ncclTopoCheckP2p(comm, comm->topo, info1->rank, info2->rank, &p2p, read, intermediateRank, NULL));
 
   int readEnable = ncclParamP2pReadEnable();
   if (readEnable != -2) *read = readEnable;
@@ -1200,3 +1200,8 @@ static void initCeOperation() {
     init = 1;
   }
 }
+
+bool ncclP2pUsesMemcpy() {
+  initCeOperation();
+  return useMemcpy != 0;
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 00000000000..b48ed18803c
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,206 @@
+# Source files
+set(LIBSRCFILES
+    bootstrap.cc
+    channel.cc
+    ce_coll.cc
+    collectives.cc
+    debug.cc
+    enqueue.cc
+    group.cc
+    init.cc
+    init_nvtx.cc
+    proxy.cc
+    transport.cc
+    mnnvl.cc
+    allocator.cc
+    sym_kernels.cc
+    dev_runtime.cc
+)
+
+# Add compatibility shim if using static cudart
+if(CUDARTLIB STREQUAL "cudart_static")
+    list(APPEND LIBSRCFILES enhcompat.cc)
+endif()
+
+# Configure pkg-config file
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/nccl.pc.in
+    ${CMAKE_BINARY_DIR}/lib/pkgconfig/nccl.pc
+    @ONLY
+)
+
+# Add files from subdirectories
+add_subdirectory(transport)
+add_subdirectory(misc)
+add_subdirectory(register)
+add_subdirectory(graph)
+add_subdirectory(plugin)
+add_subdirectory(device)
+add_subdirectory(nccl_device)
+add_subdirectory(ras)
+add_subdirectory(scheduler)
+add_subdirectory(gin)
+
+add_compile_options(-fmacro-prefix-map=${CMAKE_CURRENT_SOURCE_DIR}/=)
+
+# Add all source files
+list(APPEND LIBSRCFILES
+    ${TRANSPORT_SOURCES}
+    ${MISC_SOURCES}
+    ${REGISTER_SOURCES}
+    ${GRAPH_SOURCES}
+    ${PLUGIN_SOURCES}
+    ${RAS_SOURCES}
+    ${SYM_SOURCES}
+    ${SCHEDULER_SOURCES}
+    ${GIN_SOURCES}
+    ${DOCA_SOURCES}
+)
+
+###################### Create a shared NCCL library ############################
+add_library(nccl SHARED)
+
+target_sources(nccl PRIVATE ${LIBSRCFILES})
+
+# Include directories
+target_include_directories(nccl PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/device
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
+    ${CUDAToolkit_INCLUDE_DIRS}
+    ${DOCA_HOME}/include
+    ${CUDAToolkit_INCLUDE_DIRS}/cccl
+)
+
+add_custom_command(  # Generate nccl.h by substituting the version placeholders in nccl.h.in
+    OUTPUT ${CMAKE_BINARY_DIR}/include/nccl.h
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_BINARY_DIR}/include
+    COMMAND sed -e "s/\\\$$\\{nccl:Major\\}/${NCCL_MAJOR}/g"
+                -e "s/\\\$$\\{nccl:Minor\\}/${NCCL_MINOR}/g"
+                -e "s/\\\$$\\{nccl:Patch\\}/${NCCL_PATCH}/g"
+                -e "s/\\\$$\\{nccl:Suffix\\}/${NCCL_SUFFIX}/g"
+                -e "s/\\\$$\\{nccl:Version\\}/${NCCL_VERSION_CODE}/g"
+                ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in > ${CMAKE_BINARY_DIR}/include/nccl.h
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/nccl.h.in  # rerun sed whenever the template is edited
+)
+
+file(GLOB_RECURSE SRC_DEVICE_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/include/nccl_device/*.h)
+
+# Copy all device header files to the destination
+foreach(HEADER_FILE ${SRC_DEVICE_HEADERS})
+    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/${HEADER_FILE} ${CMAKE_BINARY_DIR}/${HEADER_FILE} COPYONLY)
+    list(APPEND DEVICE_HEADERS ${CMAKE_BINARY_DIR}/${HEADER_FILE})
+endforeach()
+
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/include/nccl_device.h ${CMAKE_BINARY_DIR}/include/nccl_device.h COPYONLY)
+
+add_custom_target(nccl_header DEPENDS
+    ${CMAKE_BINARY_DIR}/include/nccl.h
+    ${CMAKE_BINARY_DIR}/include/nccl_device.h
+    ${DEVICE_HEADERS}
+    ${DEVICE_DOCA_HEADERS}
+)
+
+add_dependencies(nccl nccl_header)
+add_dependencies(nccl_device nccl_header)
+
+# Set version and output name
+set_target_properties(nccl PROPERTIES
+    VERSION ${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}
+    SOVERSION ${NCCL_MAJOR}
+    OUTPUT_NAME "nccl"
+    PREFIX "lib"
+)
+
+# Set CUDA specific flags
+set_target_properties(nccl PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+    CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
+    POSITION_INDEPENDENT_CODE ON
+)
+
+# Link libraries
+target_link_libraries(nccl
+    PRIVATE
+    nccl_device
+    pthread
+    rt
+    dl
+    ${CUDAToolkit_LIBRARIES}
+    ${EXTRA_LIBS}
+)
+
+# Add version script for symbol visibility control
+target_link_options(nccl PRIVATE
+    "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/libnccl.map"
+)
+
+# Set output directories for nccl shared library
+set_target_properties(nccl PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+)
+
+###################### Create a ras binary executable ############################
+set(RAS_BINSRCFILES ras/client.cc)
+
+add_executable(ncclras ${RAS_BINSRCFILES})
+
+target_include_directories(ncclras PUBLIC
+    ${CMAKE_BINARY_DIR}/include
+    ${CUDAToolkit_INCLUDE_DIRS}
+)
+
+add_dependencies(ncclras nccl_header)
+
+target_link_libraries(ncclras
+    PRIVATE
+    pthread
+    rt
+    dl
+)
+
+# Set output directory for ncclras executable
+set_target_properties(ncclras PROPERTIES
+    RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
+)
+
+###################### Create a static NCCL library ############################
+add_library(nccl_static STATIC ${LIBSRCFILES})
+
+# Include directories
+target_include_directories(nccl_static PUBLIC
+    ${CMAKE_CURRENT_SOURCE_DIR}/device
+    ${CMAKE_CURRENT_SOURCE_DIR}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/plugin
+    ${CUDAToolkit_INCLUDE_DIRS}
+    transport/gdaki/doca-gpunetio/include
+    ${CUDAToolkit_INCLUDE_DIRS}/cccl
+)
+
+# Add dependency on nccl_header
+add_dependencies(nccl_static nccl_header)
+
+# Link libraries
+target_link_libraries(nccl_static
+    PRIVATE
+    nccl_device
+    pthread
+    rt
+    dl
+    ${CUDAToolkit_LIBRARIES}
+    ${EXTRA_LIBS}
+)
+
+# Set CUDA specific flags
+set_target_properties(nccl_static PROPERTIES
+    CUDA_SEPARABLE_COMPILATION ON
+    CUDA_RESOLVE_DEVICE_SYMBOLS ON
+    CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
+    POSITION_INDEPENDENT_CODE ON
+)
+
+# Set output directory for nccl_static library
+set_target_properties(nccl_static PROPERTIES
+    ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+)
diff --git a/src/device/generate.py b/src/device/generate.py
new file mode 100755
index 00000000000..4b081924e7a
--- /dev/null
+++ b/src/device/generate.py
@@ -0,0 +1,445 @@
+#!/usr/bin/env python3
+import os
+import sys
+import shutil
+
+# Order of redops, tys, protos, algos must match src/include/device.h
+all_colls =  ["Broadcast","Reduce","AllGather","ReduceScatter","AllReduce","SendRecv"]
+all_redops = ["Sum","Prod","MinMax","PreMulSum","SumPostDiv"]
+all_tys =    ["i8","u8","i32","u32","i64","u64","f16","f32","f64","bf16","f8e4m3","f8e5m2"]
+all_protos = ["LL","LL128","SIMPLE"]
+all_algos =  ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE","PAT"]
+
+################################################################################
+# The first command line argument is the path to the directory to generate and
+# populate.
+
+gensrc = sys.argv[1]
+
+if os.path.exists(gensrc):
+  for name in os.listdir(gensrc):
+    path = os.path.join(gensrc, name)
+    if os.path.isfile(path):
+      os.remove(path)
+    elif os.path.isdir(path):
+      shutil.rmtree(path)
+else:
+  os.mkdir(gensrc)
+
+################################################################################
+# The second command line argument is used as a regex to filter the functions
+# which make it into libnccl. This is helpful for reducing the binary when
+# developing device code. The regex supports non-space containing globs '*',
+# parentheses '(x)', and union 'a|b'. The string representing the function has
+# one of the forms:
+#
+# SendRecv
+# (AllGather|Broadcast) <algo> <proto>
+# (AllReduce|Reduce|ReduceScatter) <redop> <type> <algo> <proto>
+#
+# The possible values for redop, type, algo, proto can be found in the all_<foo>
+# lists at the top of this file.
+#
+# Since the Makefile forwards this from the ONLY_FUNCS variable, useful command
+# line examples are given:
+"""
+# Only send/recv:
+make ONLY_FUNCS="SendRecv"
+
+# Only non-reductions:
+make ONLY_FUNCS="AllGather * *|Broadcast * *|SendRecv"
+
+# Only AllReduce sum f32 (but all algos, protos)
+make ONLY_FUNCS="AllReduce Sum f32 * *"
+
+# Only AllReduce minmax i32 NVLS (but all protos)
+make ONLY_FUNCS="AllReduce MinMax i32 NVLS *"
+
+# AllReduce sum f32 RING LL128 (a single function)
+make ONLY_FUNCS="AllReduce Sum f32 RING LL128"
+"""
+
+# Paste all non-None arguments together with `sep`.
+def paste(sep, *args):
+  return sep.join(x for x in args if x is not None)
+
+func_pattern = sys.argv[2:3]
+if func_pattern and func_pattern[0]:
+  import re
+  # '*' globs one space-free field. Group before anchoring so that '$' closes
+  # every 'a|b' alternative instead of only the last one.
+  func_pattern = "(?:" + func_pattern[0].replace("*", "[^ ]*") + ")$"
+  def func_filter(*fn):
+    return None is not re.match(func_pattern, paste(" ", *fn), flags=re.IGNORECASE)
+else:
+  def func_filter(coll, redop, ty, algo, proto):
+    return True
+
+################################################################################
+
+algos_of_coll = {
+  "AllGather":     ["RING","COLLNET_DIRECT","NVLS","PAT"],
+  "AllReduce":     ["TREE","RING","COLLNET_DIRECT","COLLNET_CHAIN","NVLS","NVLS_TREE"],
+  "Broadcast":     ["RING"],
+  "Reduce":        ["RING"],
+  "ReduceScatter": ["RING","COLLNET_DIRECT","NVLS","PAT"],
+  "SendRecv":      [None]
+}
+
+coll_camel_to_lower = {
+  "AllGather":     "all_gather",
+  "AllReduce":     "all_reduce",
+  "Broadcast":     "broadcast",
+  "Reduce":        "reduce",
+  "ReduceScatter": "reduce_scatter",
+  "SendRecv":      "sendrecv"
+}
+coll_lower_to_camel = {coll_camel_to_lower[x]: x for x in coll_camel_to_lower}
+
+################################################################################
+
+# Returns pair of minimum required values for (CUDART_VERSION, __CUDA_ARCH__)
+# or None if function is never supported. Note that (0, 0) encodes universal
+# support.
+def required_cuda(coll, redop, ty, algo, proto):
+  cudart, arch = 0, 0
+  # kernels mapped to by coll="Nop" functions have coll="Generic"
+  if coll in ("SendRecv", "Generic", "Nop"): return (cudart, arch)
+
+  if proto!="SIMPLE" and algo not in ("RING","TREE"): return None
+
+  if coll in ("AllReduce","Reduce","ReduceScatter"):
+    if redop=="SumPostDiv" and ty[0] not in ("i","u"): return None
+    if ty=="bf16": cudart = max(cudart, 11000)
+    if ty.startswith("f8"):
+      cudart = max(cudart, 11080)
+      arch = max(arch, 900)
+
+  if "NVLS" in algo:
+    if coll in ("AllReduce","Reduce","ReduceScatter"):
+      # Must match ncclNvlsSupported() in src/include/device.h
+      nvls_ok = ((ty in ("i32","u32","i64","u64") and redop in ("Sum","MinMax")) or
+                 (ty in ("f32","f64") and redop=="Sum") or
+                 (ty in ("f16","bf16") and redop in ("Sum","MinMax")))
+      if not nvls_ok: return None
+    cudart = max(cudart, 12010)
+    arch = max(arch, 900)
+
+  return (cudart, arch)
+
+# Maps functions to the chosen representative for the equivalence class it
+# belongs to. For instance (sum, signed int) maps to (sum, unsigned int).
+def equivalent_primary(coll, redop, ty, algo, proto):
+  if coll in ("AllReduce", "Reduce", "ReduceScatter"):
+    # map signed integer sum/prod to unsigned
+    if redop in ("Sum","Prod","PreMulSum","SumPostDiv") and ty[0]=="i":
+      return (coll, redop, "u"+ty[1:], algo, proto)
+    # map signed integer min/max to unsigned for non-NVLS
+    if redop=="MinMax" and ty[0]=="i" and ("NVLS" not in algo):
+      return (coll, redop, "u"+ty[1:], algo, proto)
+  return (coll, redop, ty, algo, proto)
+
+# Map a func to the func representing the best kernel to run it with. Every
+# distinct value returned will instantiate a ncclDevKernel specialized to run
+# that func without function call overhead.
+def best_kernel(coll, redop, ty, algo, proto):
+  def best(coll, redop, ty, algo, proto):
+    # Modify this logic to control how many kernels are specialized.
+    if coll=="Nop": return ("Generic", None, None, None, None)
+    if coll=="SendRecv": return ("SendRecv", None, None, None, None)
+    if coll in ("AllGather","Broadcast"): return (coll, None, None, "RING", "LL")
+    return (coll, "Sum", ty, ("TREE" if algo=="TREE" else "RING"), "LL")
+  # The kernel must be specialized for a primary function, so canonicalize it.
+  kfn = equivalent_primary(*best(coll, redop, ty, algo, proto))
+  # If func_filter rejects that kernel, fall back to the "Generic" one.
+  if not func_filter(*kfn): return ("Generic", None, None, None, None)
+  return kfn
+
+# Order rows are enumerated must match formula of `ncclDevFuncId()`:
+def enumerate_func_rows():
+  yield ("SendRecv", None, None, None, None)
+  for coll in ("AllGather", "Broadcast"):
+    algos = algos_of_coll[coll]
+    for algo in algos:
+      for proto in all_protos:
+        yield (coll, None, None, algo, proto)
+  for coll in ("AllReduce", "Reduce", "ReduceScatter"):
+    algos = algos_of_coll[coll]
+    for redop in all_redops:
+      for ty in all_tys:
+        for algo in algos:
+          for proto in all_protos:
+            yield (coll, redop, ty, algo, proto)
+
+################################################################################
+
+def is_built(coll, redop, ty, algo, proto):
+  built = required_cuda(coll, redop, ty, algo, proto)
+  built = built and func_filter(coll, redop, ty, algo, proto)
+  return built
+
+# Returns None if required_cuda(...) is None.
+# Returns the coll="Nop" function if developer has filtered it out.
+# Otherwise just returns func it was given.
+def validate(coll, redop, ty, algo, proto):
+  valid = required_cuda(coll, redop, ty, algo, proto)
+  built = valid and func_filter(coll, redop, ty, algo, proto)
+  if built: return (coll, redop, ty, algo, proto)
+  if valid: return ("Nop", None, None, None, None)
+  return None
+
+# Corresponds to ncclDevFuncRowToId[]
+func_rows = [validate(*fn) for fn in enumerate_func_rows()]
+
+# Corresponds to ncclDevFuncTable[]
+primary_funcs = sorted(set(equivalent_primary(*fn) for fn in func_rows if fn is not None))
+
+# primary_to_index[primary_funcs[i]] == i
+primary_to_index = {fn: i for (i,fn) in zip(range(len(primary_funcs)), primary_funcs)}
+
+kernel_funcs = sorted(set(best_kernel(*fn) for fn in primary_funcs))
+
+################################################################################
+
+# Generate <gensrc>/device_table.cu
+with open(os.path.join(gensrc, "device_table.cu"), "w") as f:
+  out = f.write
+  out('#include "common.h"\n')
+  out("\n")
+
+  for fn in primary_funcs:
+    sym = paste("_", "ncclDevFunc", *fn)
+    cudart, arch = required_cuda(*fn)
+    if (cudart, arch) != (0, 0):
+      out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+    out("__device__ void %s();\n" % sym)
+    if (cudart, arch) != (0, 0):
+      out("#endif\n")
+  out("\n")
+
+  out("__device__ ncclDevFuncPtr_t const ncclDevFuncTable[] = {\n");
+  index = 0
+  for fn in primary_funcs:
+    sym = paste("_", "ncclDevFunc", *fn)
+    cudart, arch = required_cuda(*fn)
+    if (cudart, arch) != (0, 0):
+      out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart ,arch))
+    out("/*%4d*/ %s,\n" % (index, sym))
+    if (cudart, arch) != (0, 0):
+      out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  out("// Workaround for https://reviews.llvm.org/D55580\n"
+      "__device__ void ncclWorkaroundClangD55580() {}\n")
+
+# Generate <gensrc>/host_table.cc
+with open(os.path.join(gensrc, "host_table.cc"), "w") as f:
+  out = f.write
+  out('#include "device.h"\n')
+  out("\n")
+
+  out("extern int const ncclDevFuncIdCount = %d;\n" % len(primary_funcs))
+
+  # The mapping from function rows to valid primary function ids.
+  out("extern int const ncclDevFuncRowToId[] = {\n")
+  index = 0
+  for fn in func_rows:
+    fn_id, comment = -1, ""
+    if fn is not None:
+      fn_id = primary_to_index[equivalent_primary(*fn)]
+      comment = " // " + paste(" ", *fn)
+    out("/*%4d*/ %d,%s\n" % (index, fn_id, comment))
+    index += 1
+  out("-1};\n")
+  out("\n")
+
+  # Forward declarations of kernels.
+  for kfn in kernel_funcs:
+    cudart, _ = required_cuda(*kfn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    # __global__ below gets removed by the host compiler, which results in
+    # Coverity diagnosing a specifiers inconsistency.
+    out("// coverity[declaration]\n")
+    out("__global__ void %s(ncclDevKernelArgs4K const);\n" % sym)
+    if cudart != 0: out("#endif\n")
+  out("\n")
+
+  # List of all kernel function pointers.
+  out("extern int const ncclDevKernelCount = %d;\n" % len(kernel_funcs))
+  out("void* ncclDevKernelList[] = {\n")
+  index = 0
+  for kfn in kernel_funcs:
+    cudart, _ = required_cuda(*kfn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("/*%4d*/ (void*)%s,\n" % (index, sym));
+    if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  out("int ncclDevKernelRequirements[] = {\n")
+  for index,kfn in enumerate(kernel_funcs):
+    cudart,_ = required_cuda(*kfn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    out("  %7d, /*%4d %s*/\n" % (cudart or 0, index, sym));
+  out("};\n")
+  out("\n")
+
+  # Maps primary id to kernel function pointer.
+  out("extern void* const ncclDevKernelForFunc[] = {\n")
+  index = 0
+  for fn in primary_funcs:
+    kfn = best_kernel(*fn)
+    sym = paste("_", "ncclDevKernel", *kfn)
+    cudart, _ = required_cuda(*kfn)
+    if cudart != 0: out("#if CUDART_VERSION >= %d\n" % cudart)
+    out("/*%4d*/ (void*)%s,\n" % (index, sym))
+    if cudart != 0: out("#else\n" "/*%4d*/ nullptr,\n" "#endif\n" % index)
+    index += 1
+  out("nullptr};\n")
+  out("\n")
+
+  # Does the prior map use an explicitly specialized kernel.
+  out("extern bool const ncclDevKernelForFuncIsSpecialized[] = {\n")
+  index = 0
+  for fn in primary_funcs:
+    kfn = best_kernel(*fn)
+    specialized = "1" if fn == kfn else "0"
+    out("/*%4d*/ %s,\n" % (index, specialized))
+    index += 1
+  out("0};\n")
+
+# Maps to .cu filename which implements this func. The only constraint is that
+# "coll" is reflected in the name: formally that no two funcs having different
+# coll's map to the same filename.
+def impl_filename(coll, redop, ty, algo, proto):
+  return "%s.cu" % paste("_", coll_camel_to_lower[coll], redop and redop.lower(), ty)
+
+# Partition the functions and kernels to the .cu filenames. The partition is
+# a dictionary mapping filename to (coll, func-tuple list)
+def partition_by_name(fns):
+  ans = {}
+  for fn in fns:
+    name = impl_filename(*fn)
+    coll = fn[0]
+    if name not in ans:
+      ans[name] = (coll, [])
+    ans[name][1].append(fn)
+  return ans
+
+name_to_funcs = partition_by_name(fn for fn in primary_funcs if fn[0]!="Nop")
+name_to_kernels = partition_by_name(kfn for kfn in kernel_funcs if kfn[0]!="Generic")
+
+files = ""
+for name in sorted(name_to_funcs.keys()):
+    files += name + ";"
+files += "device_table.cu;"
+files += "host_table.cc"
+
+# Do not print files when running make
+if os.environ.get("NCCL_USE_CMAKE", "0") == "1":
+    print(files)
+
+# Generate <gensrc>/rules.mk
+with open(os.path.join(gensrc, "rules.mk"), "w") as f:
+  out = f.write
+  impl_names = sorted(name_to_funcs.keys())
+  names = impl_names + ["host_table.cc", "device_table.cu"]
+  out("LIB_OBJS_GEN = $(patsubst %,$(OBJDIR)/genobj/%.o,{names})\n"
+      .format(names=" ".join(names)))
+  out("\n")
+
+  # For each <coll>_<op>_<ty>.cu compile to a .cu.o file. Notice the dependencies
+  # come from the suffix-erased file (e.g. 'gensrc/all_reduce.cu')
+  for name in impl_names:
+    coll = name_to_funcs[name][0]
+    out(
+      "$(OBJDIR)/genobj/{name}.o: $(OBJDIR)/gensrc $(OBJDIR)/genobj/{lower_coll}.cu.d\n"
+      "\t" "$(call COMPILE,$@,$(OBJDIR)/gensrc/{name})\n"
+      "\n"
+      .format(name=name, lower_coll=coll_camel_to_lower[coll])
+    )
+
+# Add the suffix-erased .cu's which are used only for dependency scraping.
+for coll in set(coll for (coll,_,_,_,_) in primary_funcs if coll!="Nop"):
+  name = impl_filename(coll, None, None, None, None)
+  if name not in name_to_funcs:
+    name_to_funcs[name] = (coll, [])
+
+redop_to_cxx = {
+  None: "FuncCopy",
+  "Sum": "FuncSum",
+  "Prod": "FuncProd",
+  "MinMax": "FuncMinMax",
+  "PreMulSum": "FuncPreMulSum",
+  "SumPostDiv": "FuncSumPostDiv"
+}
+
+ty_to_cxx = {
+  None: "int8_t",
+  "i8": "int8_t",
+  "u8": "uint8_t",
+  "i32": "int32_t",
+  "u32": "uint32_t",
+  "i64": "int64_t",
+  "u64": "uint64_t",
+  "f16": "half",
+  "f32": "float",
+  "f64": "double",
+  "bf16": "__nv_bfloat16",
+  "f8e4m3": "__nv_fp8_e4m3",
+  "f8e5m2": "__nv_fp8_e5m2"
+}
+
+# Generate each <gensrc>/<impl>.cu:
+for name in name_to_funcs.keys():
+  (coll, fns) = name_to_funcs[name]
+  with open(os.path.join(gensrc, name), "w") as f:
+    out = f.write
+    out(
+      '#include "common.h"\n'
+      '#include "{lower_coll}.h"\n'
+      .format(lower_coll=coll_camel_to_lower[coll])
+    )
+
+    (_, kfns) = name_to_kernels.get(name) or (None, [])
+    for kfn in kfns:
+      (coll, redop, ty, algo, proto) = kfn
+      sym = paste("_", coll, redop, ty, algo, proto)
+      fn_id = primary_to_index[kfn]
+      cudart, arch = required_cuda(*kfn)
+      s = "DEFINE_ncclDevKernel({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n"
+      if (cudart, arch) != (0, 0):
+        # Add conditional compilation logic around s. If CUDART_VERSION is satisfactory
+        # we must compile a kernel regardless of __CUDA_ARCH__ since the host code has
+        # to link against some stub.
+        s = "#if CUDART_VERSION >= {cudart}\n" \
+            "  #if __CUDA_ARCH__ < {arch}\n" \
+            "    DEFINE_ncclDevKernel_nop({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto}, {fn_id})\n" \
+            "  #else\n" \
+            "    " + s + \
+            "  #endif\n" \
+            "#endif\n"
+      out(s.format(
+        cudart=cudart, arch=arch, sym=sym, coll=coll,
+        redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
+        algo=(algo or "RING"), proto=(proto or "SIMPLE"), fn_id=fn_id
+      ))
+
+    for fn in fns:
+      (coll, redop, ty, algo, proto) = fn
+      sym = paste("_", coll, redop, ty, algo, proto)
+      cudart, arch = required_cuda(*fn)
+      if (cudart, arch) != (0, 0):
+        out("#if CUDART_VERSION >= %d && __CUDA_ARCH__ >= %d\n" % (cudart, arch))
+      out(
+        "DEFINE_ncclDevFunc({sym}, ncclFunc{coll}, {redop_cxx}, {ty_cxx}, NCCL_ALGO_{algo}, NCCL_PROTO_{proto})\n"
+        .format(sym=sym, coll=coll, redop_cxx=redop_to_cxx[redop], ty_cxx=ty_to_cxx[ty],
+                algo=(algo or "RING"), proto=(proto or "SIMPLE"))
+      )
+      if (cudart, arch) != (0, 0):
+        out("#endif\n")
diff --git a/src/graph/rings.cc b/src/graph/rings.cc
new file mode 100644
index 00000000000..70fac75b196
--- /dev/null
+++ b/src/graph/rings.cc
@@ -0,0 +1,70 @@
+/*************************************************************************
+ * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+
+// Log (NCCL_INIT) a single line made of `prefix` followed by the first `nranks`
+// entries of `values`. Each entry is right-aligned to the width of the decimal
+// representation of nranks-1. Output that does not fit in the fixed-size
+// buffer is cut and its end replaced with "...".
+void dumpLine(int* values, int nranks, const char* prefix) {
+  constexpr int kBufSize = 128;
+  char buf[kBufSize];
+  // Measuring with a null buffer of size 0 is explicitly allowed by snprintf.
+  const int fieldWidth = snprintf(nullptr, 0, "%d", nranks-1);
+  int used = snprintf(buf, kBufSize, "%s", prefix);
+  int idx = 0;
+  while (idx < nranks && used < kBufSize-1) {
+    // `used` accumulates what snprintf *wanted* to write, so it may overshoot
+    // the buffer; the loop guard keeps it from being used as an offset again.
+    used += snprintf(buf + used, kBufSize - used, " %*d", fieldWidth, values[idx]);
+    idx++;
+  }
+  const bool truncated = (used >= kBufSize);
+  if (truncated) {
+    // The buffer is at least 4 bytes long: overwrite its tail with an
+    // ellipsis (three dots plus the terminating NUL).
+    snprintf(buf + kBufSize - 4, 4, "...");
+  }
+  INFO(NCCL_INIT, "%s", buf);
+}
+
+// Build the rank ordering of every ring by following the `next` links.
+//
+//   nrings : number of rings (channels) to build.
+//   rings  : output, nrings*nranks entries; rings[r*nranks+i] receives the
+//            i-th rank of ring r, starting from `rank`.
+//   rank   : rank of the caller, used as the starting point of each ring.
+//   nranks : number of ranks per ring.
+//   prev   : unused by this function.
+//   next   : input, nrings*nranks entries; next[r*nranks+x] is the successor
+//            of rank x in ring r.
+//
+// Each ring is validated: it must loop back to `rank` after nranks steps and
+// must visit every rank. Returns ncclInternalError otherwise.
+ncclResult_t ncclBuildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) {
+  ncclResult_t ret = ncclSuccess;
+  // Bitmap with one bit per rank, reset after each ring.
+  uint64_t* rankFound;
+  int rankFoundSize = DIVUP(nranks, 64);
+  NCCLCHECK(ncclCalloc(&rankFound, rankFoundSize));
+
+  for (int r=0; r<nrings; r++) {
+    char prefix[40];
+
+    int current = rank;
+    for (int i=0; i<nranks; i++) {
+      // Reject out-of-range ranks before they are used to index rankFound/next.
+      if (current < 0 || current >= nranks) {
+        WARN("Error : ring %d contains invalid rank %d", r, current);
+        ret = ncclInternalError;
+        goto end;
+      }
+      // The shift must be done on a 64-bit value: shifting a plain int by 32
+      // or more is undefined, and shifting by 31 sign-extends into the upper
+      // half of the word.
+      rankFound[current/64] |= (uint64_t(1)<<(current%64));
+      rings[r*nranks+i] = current;
+      current = next[r*nranks+current];
+    }
+    snprintf(prefix, sizeof(prefix), "Channel %02d/%02d :", r, nrings);
+    if (rank == 0) dumpLine(rings+r*nranks, nranks, prefix);
+    if (current != rank) {
+      WARN("Error : ring %d does not loop back to start (%d != %d)", r, current, rank);
+      ret = ncclInternalError;
+      goto end;
+    }
+    // Check that all ranks are there
+    for (int i=0; i<nranks; i++) {
+      uint64_t bits = rankFound[i/64], mask = uint64_t(1)<<(i%64);
+      // Fast check 64 ranks at a time
+      if (mask == 1 && bits == 0xffffffffffffffff) { i += 63; continue; }
+      if ((bits & mask) == 0) {
+        WARN("Error : ring %d does not contain rank %d", r, i);
+        ret = ncclInternalError;
+        goto end;
+      }
+    }
+    memset(rankFound, 0, rankFoundSize*sizeof(uint64_t));
+  }
+end:
+  free(rankFound);
+  return ret;
+}
diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc
new file mode 100644
index 00000000000..9e4b9c0ade2
--- /dev/null
+++ b/src/graph/tuning.cc
@@ -0,0 +1,601 @@
+/*************************************************************************
+ * Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#include "core.h"
+#include "device.h"
+#include "comm.h"
+#include "topo.h"
+#include "nccl_tuner.h"
+
+// Parameters named "NTHREADS" and "LL128_NTHREADS", both defaulting to -2.
+// Their values are fed to getNthreads(), which treats any non-positive value
+// as "not set" and falls back to its built-in default.
+NCCL_PARAM(Nthreads, "NTHREADS", -2);
+NCCL_PARAM(Ll128Nthreads, "LL128_NTHREADS", -2);
+
+// Validate a user-requested thread count.
+//
+//   name : name of the setting, used only in log messages.
+//   env  : requested value; non-positive means "not set".
+//   min  : smallest accepted value.
+//   max  : largest accepted value.
+//   def  : value returned when `env` is not set.
+//
+// Returns `def` when unset, `max` when the request is not a multiple of
+// WARP_SIZE or exceeds `max`, `min` when it is below `min`, and the request
+// itself otherwise. Every correction is logged.
+static int getNthreads(const char* name, int env, int min, int max, int def) {
+  if (env <= 0) return def;
+
+  if (env % WARP_SIZE != 0) {
+    INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (must be a multiple of %d)", name, env, WARP_SIZE);
+    return max;
+  }
+  if (env > max) {
+    INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (maximum %d).", name, env, max);
+    return max;
+  }
+  if (env < min) {
+    INFO(NCCL_GRAPH|NCCL_ENV, "Invalid %s %d (minimum %d).", name, env, min);
+    return min;
+  }
+  return env;
+}
+
+// Parse a map of prefixes to a list of elements. The first prefix is
+// optional and, if not present, the list of elements will be applied
+// to all prefixes. Only the first list of elements can lack a
+// prefix. Prefixes (if present) are followed by a colon. Lists of
+// elements are comma delimited. Mappings of prefix to the lists of
+// elements are semi-colon delimited.
+//
+// For example:
+//
+//     NCCL_ALGO="ring,collnetdirect;allreduce:tree,collnetdirect;broadcast:ring"
+// Enable ring and collnetdirect for all functions, then select tree
+// and collnetdirect for allreduce and ring for broadcast.
+//
+//     NCCL_PROTO="LL,Simple;allreduce:^LL"
+// Enable LL and Simple for all functions, but everything except LL
+// for allreduce.
+//
+//     NCCL_PROTO="^LL128;allreduce:LL128"
+// Enable everything but LL128, but only LL128 for allreduce.
+//
+// Arguments:
+//   str         : string to parse.
+//   prefixElems : the nprefixes recognized prefixes (compared case-insensitively).
+//   elems       : the nelems recognized elements (compared case-insensitively).
+//   list        : nprefixes*nelems flags; list[p*nelems+e] is set to 1 when
+//                 element e is enabled for prefix p and to 0 when disabled.
+//                 Rows whose prefix is not mentioned in `str` are left untouched.
+//
+// Returns ncclInvalidUsage on a malformed string or an unknown prefix/element,
+// and ncclSystemError when a working copy of the string cannot be allocated.
+// `list` may have been partially updated when an error is returned.
+ncclResult_t parseList(const char* str, const char* prefixElems[], int nprefixes, const char* elems[], int nelems, int* list) {
+  ncclResult_t ret = ncclSuccess;
+  char* fullStr = strdup(str);
+  char* tmpFullStr = nullptr;
+  char* fullToken = nullptr;
+  char* subToken = nullptr;
+  char* tokStr = nullptr;
+  if (fullStr == nullptr) {
+    WARN("Failed to allocate memory when parsing \"%s\"", str);
+    return ncclSystemError;
+  }
+  fullToken = strtok_r(fullStr, ";", &tmpFullStr);
+  while (fullToken) {
+    // Work on a copy so that the inner tokenizer does not disturb the outer one.
+    subToken = strdup(fullToken);
+    if (subToken == nullptr) {
+      WARN("Failed to allocate memory when parsing \"%s\"", str);
+      ret = ncclSystemError;
+      goto fail;
+    }
+    char* tmpSubStr;
+    char* prefix = strtok_r(subToken, ":", &tmpSubStr);
+    char* elemList = prefix ? strtok_r(NULL, ":", &tmpSubStr) : NULL;
+    if (prefix == NULL) {
+      // The entry consists solely of ':' characters; there is neither a
+      // prefix nor an element list to work with.
+      WARN("Empty entry \"%s\" when parsing \"%s\"", fullToken, str);
+      ret = ncclInvalidUsage;
+      goto fail;
+    }
+    if (elemList == NULL) {
+      if (fullToken != fullStr) {
+        // It makes no sense for any entry other than the first to not have a prefix,
+        // because then all the prefixes before the prefix-less entry would be
+        // overwritten.
+        WARN("All entries except the first must have a prefix: \"%s\"", str);
+        ret = ncclInvalidUsage;
+        goto fail;
+      }
+      elemList = prefix;
+      prefix = NULL;
+    }
+
+    // A leading '^' inverts the list: everything is enabled except the
+    // listed elements.
+    int unset, set;
+    if (elemList[0] == '^') {
+      unset = 1; set = 0; elemList++;
+    } else {
+      unset = 0; set = 1;
+    }
+
+    bool foundPrefix = false;
+    for (int p=0; p<nprefixes; p++) {
+      // A NULL prefix matches every row.
+      if (prefix && strcasecmp(prefix, prefixElems[p]) != 0) continue;
+      foundPrefix = true;
+      for (int e=0; e<nelems; e++) list[p*nelems+e] = unset;
+
+      tokStr = strdup(elemList);
+      if (tokStr == nullptr) {
+        WARN("Failed to allocate memory when parsing \"%s\"", str);
+        ret = ncclSystemError;
+        goto fail;
+      }
+      char* tmpStr;
+      char* elem = strtok_r(tokStr, ",", &tmpStr);
+      while (elem) {
+        int e;
+        for (e=0; e<nelems; e++) {
+          if (strcasecmp(elem, elems[e]) == 0) {
+            list[p*nelems+e] = set;
+            break;
+          }
+        }
+        if (e==nelems) {
+          WARN("Unrecognized element token \"%s\" when parsing \"%s\"", elem, str);
+          ret = ncclInvalidUsage;
+          goto fail;
+        }
+        elem = strtok_r(NULL, ",", &tmpStr);
+      }
+      free(tokStr);
+      tokStr = nullptr;
+    }
+    if (!foundPrefix) {
+      // prefix can only be NULL here when nprefixes is 0; never hand NULL to "%s".
+      WARN("Unrecognized prefix token \"%s\" when parsing \"%s\"", prefix ? prefix : "", str);
+      ret = ncclInvalidUsage;
+      goto fail;
+    }
+    free(subToken);
+    subToken = nullptr;
+
+    fullToken = strtok_r(NULL, ";", &tmpFullStr);
+  }
+
+exit:
+  // free(NULL) is a no-op, so this is safe whichever path led here.
+  free(tokStr);
+  free(subToken);
+  free(fullStr);
+  return ret;
+fail:
+  goto exit;
+}
+
+// NVLS efficiency factor, one entry per compute-capability index.
+// ncclTopoTuneModel multiplies the intra-node bandwidth of the NVLS and
+// NVLS Tree graphs by the entry selected by its compCapIndex.
+static const float nvlsEfficiency[NCCL_NUM_COMPCAPS] = {
+  0.0f, // Volta
+  0.0f, // Ampere
+  0.85f, // Hopper
+  0.74f, // Blackwell
+};
+
+// Default tuner constants, copied into comm->tunerConstants by
+// ncclTopoInitTunerConstants and then read by ncclTopoTuneModel.
+static const ncclTunerConstants_t ncclTunerConstantsDefaults = {
+  // Base latency, indexed [algorithm][protocol].
+  .baseLatencies = {
+    {  6.8, 14.0,  8.4 }, {  6.6, 14.0,  8.4 },  // Tree, Ring
+    {    0,    0,    0 }, {    0,    0,    0 },  // Collnet Direct, Chain
+    {    0,    0,    0 }, {    0,    0,    0 },  // NVLS, NVLS Tree
+    {  8.0,  8.0,  8.0 }                         // PAT
+    },
+  // Hardware latency, indexed [hardware link][algorithm][protocol].
+  .hwLatencies = {
+  /* NVLINK */
+  { { .6, 1.25, 4.0 }, { .6, 1.9, 3.4 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple) */
+    {  0,    0, 3.7 }, {  0,   0,  2.8 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple) */
+    {  0,    0,  25 }, {  0,   0,  25 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple) */
+    {  0,    0, 4.0 } /* PAT (LL/LL128/Simple) */
+    },
+  /* PCI */
+  { { 1.0, 1.9, 4.0 }, { 1.0, 2.5, 5.7 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple) */
+    {  0,    0, 3.7 }, {  0,   0,  2.8 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple) */
+    {  0,    0,   0 }, {  0,   0,    0 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple) */
+    {  0,    0, 4.0 } /* PAT (LL/LL128/Simple) */
+    },
+  /* NET */
+  { { 5.0, 8.5, 14 }, { 2.7, 4.0, 14.0 }, /* Tree (LL/LL128/Simple), Ring (LL/LL128/Simple) */
+    {   0,   0, 31 }, {   0,   0,   30 }, /* CollNetDirect (LL/LL128/Simple), CollNetChain (LL/LL128/Simple) */
+    {   0,   0, 18 }, {   0,   0,   20.9 }, /* NVLS (LL/LL128/Simple), NVLSTree (LL/LL128/Simple) */
+    {   0,   0, 14 } /* PAT (LL/LL128/Simple) */
+    },
+  },
+  // Cap on LL bandwidth. The column is selected by node count (1 node,
+  // 2 nodes, more than 2 nodes). The row is the compute-capability index on a
+  // single node; on multiple nodes it is 1 for AMD or mixed-vendor CPUs and 0
+  // otherwise.
+  .llMaxBws = {
+     {39.0, 39.0, 20.4}, /* Volta-N1 / Intel-N2 / Intel-N4 */
+     {87.7, 22.5 /*avg of ring & tree*/, 19.0}, /* Ampere-N1 / AMD-N2 / AMD-N4 */
+     {141.0, 45.0 /*avg of ring & tree*/, 35.0}, /* Hopper-N1 / AMD-N2 / AMD-N4 */
+     {2*141.0, 2*45.0 /*avg of ring & tree*/, 2*35.0}, /* Blackwell-N1 / AMD-N2 / AMD-N4 */
+  },
+  // The remaining tables are per-channel bandwidth caps, indexed
+  // [compute-capability index][node-count column as above].
+  .perChMaxRingLL128Bws = {
+    {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */
+    {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */
+    {36.7, 36.7, 36.7}, /* Hopper (N1/N2/N4) */
+    {2*36.7, 34.6, 2*36.7}, /* Blackwell (N1/N2/N4) */
+  },
+  .perChMaxTreeLL128Bws = {
+    {20.0, 20.0, 20.0}, /* Volta (N1/N2/N4) */
+    {20.0, 20.0, 20.0}, /* Ampere (N1/N2/N4) */
+    {36.7, 36.7, 29.0}, /* Hopper (N1/N2/N4) */
+    {55.6, 31.67, 20.0}, /* Blackwell (N1/N2/N4) */
+  },
+  .perChMaxTreeBws = {
+    {26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */
+    {24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */
+    {38.7, 41.4, 36.0}, /* Hopper (N1/N2/N4) */
+    {70.0, 42.8, 24.0}, /* Blackwell (N1/N2/N4) */
+  },
+  .perChMaxNVLSTreeBws = {
+    {26.5, 18.5, 10.0}, /* Volta (N1/N2/N4) */
+    {24.0, 23.6, 17.8}, /* Ampere (N1/N2/N4) */
+    {0.0, 57.7, 45.5}, /* Hopper (N1/N2/N4) */
+    {0.0, 96.0, 43.8} /* Blackwell (N1/N2/N4) */
+  }
+};
+
+// PAT_ENABLE parameter: the default of 2 means "decide automatically"; any
+// other value is returned as-is (subject to the SM60 requirement below).
+NCCL_PARAM(PatEnable, "PAT_ENABLE", 2);
+
+// Tell whether the PAT algorithm may be used on `comm` (non-zero = enabled).
+static int ncclPatEnable(struct ncclComm* comm) {
+  const int requested = ncclParamPatEnable();
+  // CUDA atomics need SM60 or higher; this overrides any user request.
+  if (comm->minCompCap < 60) return 0;
+  // An explicit user setting wins over the automatic decision.
+  if (requested != 2) return requested;
+  // Automatic mode: PAT only supports 1 GPU per node and doesn't support
+  // net device offload.
+  const bool oneGpuPerNode = (comm->nNodes == comm->nRanks);
+  const bool hostNetDevice = (comm->netDeviceType == NCCL_NET_DEVICE_HOST);
+  return (oneGpuPerNode && hostNetDevice) ? 1 : 0;
+}
+
+// Network post overhead, expressed in ns by the user (1000 = 1 us).
+// The default of -2 means "not set".
+NCCL_PARAM(NetOverhead, "NET_OVERHEAD", -2);
+
+// Return the network post overhead in us: the user-provided value when set,
+// otherwise 2.0 on x86 AMD CPUs and 1.0 on everything else (x86 Intel included).
+static float getNetOverhead(struct ncclComm* comm) {
+  const auto userOverheadNs = ncclParamNetOverhead();
+  if (userOverheadNs != -2) return userOverheadNs * .001;
+
+  const bool isX86 = (comm->cpuArch == NCCL_TOPO_CPU_ARCH_X86);
+  const bool isAmd = (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD);
+  return (isX86 && isAmd) ? 2.0 : 1.0;
+}
+
+// LL128_C2C parameter (default 1), consulted by ncclTopoTuneModel when
+// deciding whether LL128 is enabled by default.
+NCCL_PARAM(Ll128C2c, "LL128_C2C", 1);
+
+// Reset the tuner constants of `comm` to the built-in defaults.
+// Always returns ncclSuccess.
+ncclResult_t ncclTopoInitTunerConstants(struct ncclComm* comm) {
+  comm->tunerConstants = ncclTunerConstantsDefaults;
+  return ncclSuccess;
+}
+
+// Fill in the tuning model of `comm`:
+//   - comm->maxThreads[algo][proto], honoring NCCL_NTHREADS / NCCL_LL128_NTHREADS;
+//   - comm->bandwidths[coll][algo][proto] and comm->latencies[coll][algo][proto],
+//     computed from the topology graphs and comm->tunerConstants;
+//   - comm->threadThresholds[algo][proto], honoring NCCL_THREAD_THRESHOLDS.
+// Combinations disabled by NCCL_PROTO / NCCL_ALGO or by the platform get a
+// bandwidth of 0.
+//
+//   minCompCap, maxCompCap : lowest / highest compute capability to consider.
+//   graphs                 : one topology graph per algorithm (indexed by NCCL_ALGO_*).
+//
+// Returns right after setting maxThreads when the communicator has at most
+// one rank. Errors from parseList and ncclTopoGetNvsCount are propagated.
+ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
+  int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
+  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
+  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
+  comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
+    comm->maxThreads[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] =
+    comm->maxThreads[NCCL_ALGO_NVLS][NCCL_PROTO_SIMPLE] =
+    comm->maxThreads[NCCL_ALGO_NVLS_TREE][NCCL_PROTO_SIMPLE] = NCCL_MAX_NTHREADS;
+  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] =
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
+  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] =
+    getNthreads("NCCL_LL128_NTHREADS", ncclParamLl128Nthreads(), NCCL_LL128_MAX_NTHREADS/4, NCCL_LL128_MAX_NTHREADS, NCCL_LL128_MAX_NTHREADS);
+
+  int nNodes = comm->nNodes;
+  int nRanks = comm->nRanks;
+  if (nRanks <= 1) return ncclSuccess;
+
+  int compCapIndex = minCompCap >= 100 ? NCCL_BLACKWELL_COMPCAP_IDX : (minCompCap >= 90 ? NCCL_HOPPER_COMPCAP_IDX : minCompCap >= 80 ? NCCL_AMPERE_COMPCAP_IDX : NCCL_VOLTA_COMPCAP_IDX);
+  // Column of the per-node-count tables: 0 for one node, 1 for two nodes, 2 beyond.
+  int index2 = nNodes <= 2 ? nNodes-1 : 2;
+  // LL: for single node, we look at GPU type; for multi-node, we look at CPU type
+  int index1 = nNodes == 1 ? compCapIndex :
+               (comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_AMD || comm->cpuVendor == NCCL_TOPO_CPU_VENDOR_MIXED) ? 1 : 0;
+  double llMaxBw = comm->tunerConstants.llMaxBws[index1][index2];
+  double perChMaxTreeBw = comm->tunerConstants.perChMaxTreeBws[compCapIndex][index2];
+  double perChMaxRingLL128Bw = comm->tunerConstants.perChMaxRingLL128Bws[compCapIndex][index2];
+  double perChMaxTreeLL128Bw = comm->tunerConstants.perChMaxTreeLL128Bws[compCapIndex][index2];
+  double perChMaxNVLSTreeBw = comm->tunerConstants.perChMaxNVLSTreeBws[compCapIndex][index2];
+  // De-penalize Tree/Simple latency on Power systems to favor Tree than Ring
+  if (comm->cpuArch == NCCL_TOPO_CPU_ARCH_POWER) comm->tunerConstants.hwLatencies[NCCL_HW_PCI][NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->tunerConstants.hwLatencies[NCCL_HW_PCI][NCCL_ALGO_RING][NCCL_PROTO_SIMPLE];
+  // Average number of ranks (processes) per node.
+  float ppn = (float)nRanks / nNodes;
+
+  // Link type used to look up hardware latencies: intra-node only, and overall.
+  int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS];
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) intraHw[a] = graphs[a]->typeIntra == LINK_NVL ? NCCL_HW_NVLINK : NCCL_HW_PCI;
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) hw[a] = nNodes == 1 ? intraHw[a] : NCCL_HW_NET;
+
+  for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
+    int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
+      coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
+      nRanks;
+
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      // Skip the collective/algorithm combinations that are not modeled.
+      if ((coll == ncclFuncBroadcast || coll == ncclFuncReduce) && a != NCCL_ALGO_RING) continue;
+      if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
+          && a != NCCL_ALGO_PAT && a != NCCL_ALGO_RING
+          && a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
+      if (coll == ncclFuncAllReduce && a == NCCL_ALGO_PAT) continue;
+
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
+        if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
+            && a == NCCL_ALGO_PAT && (p != NCCL_PROTO_SIMPLE || ncclPatEnable(comm) == 0)) continue;
+        int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0;
+        float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
+        if (a == NCCL_ALGO_NVLS_TREE || a == NCCL_ALGO_NVLS)
+        {
+          // NVLS/NVLStree needs at least 2 channels
+          if (graphs[a]->nChannels < 2 ) continue;
+          // Convert to NVLS busBW/channel
+          float intraBw = graphs[a]->bwIntra * nvlsEfficiency[compCapIndex] * (graphs[a]->nChannels - 1) / graphs[a]->nChannels;
+          // AllReduce pipelines two operations.
+          if (coll == ncclFuncAllReduce) {
+            intraBw *= 2.0f;
+          } else {
+            intraBw *= (ppn - 1) / ppn;
+          }
+          // Handle 2 node case of NVLSTree
+          float interBw = graphs[a]->bwInter * ((nNodes <= 2 && a == NCCL_ALGO_NVLS_TREE) ? 2 : 1);
+          bw = std::min( {intraBw, interBw, a == NCCL_ALGO_NVLS_TREE ? (float)perChMaxNVLSTreeBw : std::numeric_limits<float>::max()} );
+        }
+        float busBw = graphs[a]->nChannels * bw;
+
+        // Various model refinements
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * .5); }
+        if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (0.92 /*120.0/128.0*/), graphs[a]->nChannels*perChMaxRingLL128Bw);
+        if (a == NCCL_ALGO_TREE && coll == ncclFuncAllReduce) busBw = std::min(busBw*.92, graphs[a]->nChannels*perChMaxTreeBw);
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
+        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 120.0/128.0), graphs[a]->nChannels*perChMaxTreeLL128Bw);
+        if (a == NCCL_ALGO_TREE && comm->maxTreePattern == NCCL_TOPO_PATTERN_TREE) busBw *= .85;
+        if (a == NCCL_ALGO_PAT) busBw *= .75;
+        if (a == NCCL_ALGO_COLLNET_DIRECT && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
+        if (a == NCCL_ALGO_COLLNET_CHAIN && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Not used
+        if (a == NCCL_ALGO_COLLNET_DIRECT && p == NCCL_PROTO_SIMPLE) {
+          if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
+            busBw = ppn * std::min(graphs[a]->bwIntra, graphs[a]->bwInter * 0.9f);
+          } else {
+            // Collnet+Direct requires all GPUs to have a local NIC to work at full speed
+            float factor = ppn / (1.0*graphs[a]->nChannels); // GPU/NIC ratio
+            factor -= (factor-1)/2;
+            busBw /= factor;
+            if (minCompCap >= 90) busBw *= .85;
+          }
+        }
+        // disable collnet for allgather/reducescatter if #localranks > #heads
+        // AllGather/ReduceScatter requires 1:1 GPU:NIC
+        if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_COLLNET_DIRECT) && p == NCCL_PROTO_SIMPLE && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) && comm->nNodes > 1) {
+          int nHeads = 0;
+          if (coll == ncclFuncAllGather && comm->nNodes > 1 && (!comm->ncclCollNet || !comm->ncclCollNet->iallgather)) busBw = 0.0f;
+          if (coll == ncclFuncReduceScatter && comm->nNodes > 1 && (!comm->ncclCollNet || !comm->ncclCollNet->ireducescatter)) busBw = 0.0f;
+          if (comm->config.collnetEnable)
+            nHeads = comm->collNetHeadsNum;
+          else
+            busBw = 0.0f;
+          if (busBw > 0.0f) {
+            for (int r = 0; r < comm->nRanks; r++) {
+              int node = comm->rankToNode[r];
+              if (comm->nodeRanks[node].localRanks > nHeads) {
+                busBw = 0.0f;
+                break;
+              }
+            }
+          }
+        }
+
+        // Convert bus BW to algorithm BW
+        if (!(a != NCCL_ALGO_RING && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) {
+          float ratio = 1.0f;
+          if (a == NCCL_ALGO_RING || a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= (1.0 * nRanks) / nsteps;
+          else ratio *= .5;
+          busBw *= ratio;
+        }
+        comm->bandwidths[coll][a][p] = busBw;
+        comm->latencies[coll][a][p] = comm->tunerConstants.baseLatencies[a][p];
+        float intraLat = comm->tunerConstants.hwLatencies[intraHw[a]][a][p];
+        // With ppn=1 latencies are fully exposed, use the Tree network latency
+        float interLat = ppn == 1 ? comm->tunerConstants.hwLatencies[NCCL_HW_NET][NCCL_ALGO_TREE][p] : comm->tunerConstants.hwLatencies[NCCL_HW_NET][a][p];
+        interLat += graphs[a]->latencyInter;
+        // Also add the flush extra latency
+        if (p == NCCL_PROTO_SIMPLE) interLat += graphs[a]->latencyInter;
+
+        if (a == NCCL_ALGO_RING) {
+          float lat = comm->tunerConstants.hwLatencies[hw[a]][a][p];
+          if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) {
+            if (graphs[a]->sameChannels) {
+              comm->latencies[coll][a][p] += lat;
+            } else {
+              if (p == NCCL_PROTO_SIMPLE) lat = comm->tunerConstants.hwLatencies[hw[a]][NCCL_ALGO_TREE][p]; // Add some chunk latency, waiting for proper chunk modeling
+              comm->latencies[coll][a][p] += nsteps*lat;
+            }
+          } else {
+            // Inter-node rings still have to launch nsteps * net overhead.
+            float netOverhead = 0.0;
+            if (nNodes > 1) {
+              netOverhead = getNetOverhead(comm);
+              if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3;
+            }
+            intraLat = std::max(intraLat, netOverhead);
+            int nInterSteps = nNodes == 1 ? 0 : coll == ncclFuncAllReduce ? 2*(nNodes-1) : nNodes-1;
+            comm->latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat;
+          }
+        } else if (a == NCCL_ALGO_TREE) {
+          if (coll == ncclFuncAllReduce) {
+            comm->latencies[coll][a][p] +=
+              2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat);
+          }
+        } else if (a == NCCL_ALGO_COLLNET_DIRECT) {
+          comm->latencies[coll][a][p] +=
+            2 * (std::min(1, (nRanks/nNodes-1)) * intraLat + (nRanks/nNodes-1) * 0.4) + interLat;  // Add 0.4 us arity serialization latency
+        } else if (a == NCCL_ALGO_COLLNET_CHAIN) {
+          comm->latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat;
+        } else if (a == NCCL_ALGO_NVLS) {
+          comm->latencies[coll][a][p] = intraLat;
+          if (nNodes > 1) comm->latencies[coll][a][p] += interLat;
+        } else if (a == NCCL_ALGO_NVLS_TREE) {
+          comm->latencies[coll][a][p] += intraLat + 2 * log2i(nNodes) * interLat;
+        } else if (a == NCCL_ALGO_PAT) {
+          if (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter) {
+            comm->latencies[coll][a][p] += log2i(nNodes) * (interLat/3.5) // Log latency
+              + nRanks * 2.8; // Still a linear part; hopefully we'll manage to remove it at some point.
+          }
+        }
+      }
+    }
+  }
+
+  // Protocols/Algorithms enable/disable, and user overrides.
+  // All are enabled except ll128 which is enabled by default only in certain cases.
+  // A value of 2 for LL128 means "automatic": it is resolved further down.
+  int protoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_PROTOCOLS];
+  int algoEnable[NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS];
+  for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      protoEnable[f*NCCL_NUM_PROTOCOLS+p] = p == NCCL_PROTO_LL128 ? 2 : 1;
+    }
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      algoEnable[f*NCCL_NUM_ALGORITHMS+a] = 1;
+    }
+  }
+
+  const char *protoStr = ncclGetEnv("NCCL_PROTO");
+  if (protoStr) {
+    INFO(NCCL_ENV, "NCCL_PROTO set by environment to %s", protoStr);
+    NCCLCHECK(parseList(protoStr, ncclFuncStr, NCCL_NUM_FUNCTIONS, ncclProtoStr, NCCL_NUM_PROTOCOLS, protoEnable));
+  }
+  const char *algoStr = ncclGetEnv("NCCL_ALGO");
+  if (algoStr) {
+    INFO(NCCL_ENV, "NCCL_ALGO set by environment to %s", algoStr);
+    NCCLCHECK(parseList(algoStr, ncclFuncStr, NCCL_NUM_FUNCTIONS, ncclAlgoStr, NCCL_NUM_ALGORITHMS, algoEnable));
+  }
+
+  // Rank 0 logs the resulting enable matrix when the user overrode something.
+  if (comm->rank == 0 && (algoStr||protoStr)) {
+    constexpr int strLength = 1024;
+    char funcAlgoProtoTuningStr[strLength];
+    int offset = 0;
+    // snprintf returns the length it wanted to write; clamping the remaining
+    // size to 0 turns later calls into no-ops once the buffer is full.
+    offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n     Function | ");
+    for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%8s  ", ncclProtoStr[p]);
+    }
+    offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), " | ");
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13s  ", ncclAlgoStr[a]);
+    }
+    offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n");
+
+    for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13s | ", ncclFuncStr[f]);
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%8d  ", protoEnable[f*NCCL_NUM_PROTOCOLS+p]);
+      }
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), " | ");
+      for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+        offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "%13d  ", algoEnable[f*NCCL_NUM_ALGORITHMS+a]);
+      }
+      offset += snprintf(funcAlgoProtoTuningStr+offset, std::max(0, strLength-offset), "\n");
+    }
+
+    INFO(NCCL_ENV, "Enabled NCCL Func/Proto/Algo Matrix:%s", funcAlgoProtoTuningStr);
+  }
+
+  int nvsCount = 0;
+  NCCLCHECK(ncclTopoGetNvsCount(comm->topo, &nvsCount));
+
+  for (int f=0; f<NCCL_NUM_FUNCTIONS; f++) {
+    for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+      int disable = 0;
+      // Disable NVLS Tree on a single node
+      if (comm->nNodes == 1 && a == NCCL_ALGO_NVLS_TREE) disable = 1;
+      // Disable Collnet+Direct, Collnet+Chain or Collnet+NVLS if collnet is not supported.
+      if (comm->config.collnetEnable == 0 &&
+          (a == NCCL_ALGO_COLLNET_DIRECT ||
+           a == NCCL_ALGO_COLLNET_CHAIN ||
+           (a == NCCL_ALGO_NVLS && comm->nNodes > 1))) disable = 1;
+      // Disable CollNet+Direct if not on an NVSwitch system
+      if (nvsCount == 0 && a == NCCL_ALGO_COLLNET_DIRECT) disable = 1;
+      if (disable) algoEnable[f*NCCL_NUM_ALGORITHMS+a] = 0;
+    }
+  }
+
+  // Zero the bandwidth of every disabled combination, resolving the
+  // "automatic" LL128 setting along the way.
+  for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+    int pEnable = protoEnable[c*NCCL_NUM_PROTOCOLS+p];
+    if (pEnable == 2 && p == NCCL_PROTO_LL128) {
+      pEnable = 1;
+      if (ncclParamLl128C2c() && minCompCap >= 90) {
+        // Enable LL128 by default only on Hopper/Blackwell for all connections up to P2C and PXN.
+        pEnable &= (graphs[a]->typeInter <= PATH_PXN);
+      } else {
+        // Enable LL128 only up to PXB. Don't enable LL128 over PxN because PxN can encapsulate PxB or P2C links.
+        pEnable &= (graphs[a]->typeInter <= PATH_PXB);
+        if (!ncclParamLl128C2c() && minCompCap >= 90)
+          INFO(NCCL_GRAPH, "Disabling LL128 over all PxN connections (PXB and C2C). This ensures that no C2C link will be used by LL128.");
+      }
+      pEnable &= (graphs[a]->typeIntra <= PATH_NVB);
+      pEnable &= (minCompCap == maxCompCap);
+      pEnable &= !(minCompCap < 70 || (minCompCap == 90 && CUDART_VERSION == 11080 && c == ncclFuncAllReduce && a == NCCL_ALGO_RING && comm->nRanks == 2));
+    }
+    if (pEnable == 0) comm->bandwidths[c][a][p] = 0;
+    if (algoEnable[c*NCCL_NUM_ALGORITHMS+a] == 0) comm->bandwidths[c][a][p] = 0;
+  }
+
+  // Rank 0 logs the latency/bandwidth table, three algorithms per block.
+  if (comm->rank == 0) {
+    constexpr int lineLen = 1024;
+    char line[lineLen];
+    int offset = 0;
+    for (int block=0; block<DIVUP(NCCL_NUM_ALGORITHMS, 3); block++) {
+      offset = snprintf(line, lineLen, "  Algorithm   |");
+      for (int ba=0; ba<3; ba++) {
+        int a = block*3+ba;
+        if (a >= NCCL_NUM_ALGORITHMS) continue;
+        offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14s   %14s   %14s |", "", ncclAlgoStr[a], "");
+      }
+      INFO(NCCL_TUNING, "%s", line);
+      offset = snprintf(line, lineLen, "  Protocol    |");
+      for (int ba=0; ba<3; ba++) {
+        // Skip the slots past the last algorithm, like every other row does,
+        // so that the last block's header lines up with its data.
+        int a = block*3+ba;
+        if (a >= NCCL_NUM_ALGORITHMS) continue;
+        for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+          offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14s |", ncclProtoStr[p]);
+        }
+      }
+      INFO(NCCL_TUNING, "%s", line);
+      offset = snprintf(line, lineLen, " Max NThreads |");
+      for (int ba=0; ba<3; ba++) {
+        int a = block*3+ba;
+        if (a >= NCCL_NUM_ALGORITHMS) continue;
+        for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+          offset += snprintf(line+offset, std::max(0, lineLen-offset), " %14d |", comm->maxThreads[a][p]);
+        }
+      }
+      INFO(NCCL_TUNING, "%s", line);
+      for (int c=0; c<NCCL_NUM_FUNCTIONS; c++) {
+        offset = snprintf(line, lineLen, "%13s |", ncclFuncStr[c]);
+        for (int ba=0; ba<3; ba++) {
+          int a = block*3+ba;
+          if (a >= NCCL_NUM_ALGORITHMS) continue;
+          for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+            offset += snprintf(line+offset, std::max(0, lineLen-offset), "%8.1f/%6.1f |", comm->latencies[c][a][p], comm->bandwidths[c][a][p]);
+          }
+        }
+        INFO(NCCL_TUNING, "%s", line);
+      }
+    }
+  }
+
+  // Set per-thread amount of work before we increase nThreads and nChannels
+  for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
+    comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
+    comm->threadThresholds[a][NCCL_PROTO_LL128] = NCCL_LL128_THREAD_THRESHOLD;
+    comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
+  }
+  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
+  comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = 512;
+  comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE] = 512;
+
+  // Override defaults with user env
+  const char* str = ncclGetEnv("NCCL_THREAD_THRESHOLDS");
+  if (str) {
+    INFO(NCCL_ENV, "NCCL_THREAD_THRESHOLDS set by environment to %s", str);
+    // Up to six values, for the first two algorithm indices times the three
+    // protocols. Entries left at -2 (not provided) or negative are ignored.
+    ssize_t t[2][NCCL_NUM_PROTOCOLS] = {{ -2, -2, -2 }, { -2, -2, -2 }};
+    sscanf(str, "%ld %ld %ld %ld %ld %ld", t[0], t[0]+1, t[0]+2, t[1], t[1]+1, t[1]+2);
+    for (int a=0; a<2; a++) {
+      for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
+        if (t[a][p] >= 0) comm->threadThresholds[a][p] = t[a][p];
+      }
+    }
+  }
+
+  INFO(NCCL_INIT, "threadThresholds %ld/%ld/%ld | %ld/%ld/%ld | %ld | %ld",
+      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL],
+      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_LL128],
+      comm->threadThresholds[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE],
+      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL],
+      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL128],
+      comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE],
+      comm->threadThresholds[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE],
+      comm->threadThresholds[NCCL_ALGO_COLLNET_CHAIN][NCCL_PROTO_SIMPLE]);
+  return ncclSuccess;
+}
+
+// Trees do not follow the model exactly for medium sizes. Applying a static correction
+// factor is not ideal but works quite well. One row per protocol, one column per
+// power-of-two size from 64 B to 256 MB (23 columns, selected by log2i(nBytes>>6)
+// in ncclTopoGetAlgoTime).
+static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][23] = {
+  { 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .7,  .7,  .7,  .7,  .6,  .5,  .4,  .4,  .5,  .6,  .7,  .8,  .9, 1.0, 1.0, 1.0, 1.0 },
+  { 1.0, 1.0, 1.0, 1.0, 1.0,  .9,  .8,  .8,  .8,  .7,  .6,  .6,  .6,  .6,  .6,  .6,  .8,  .9,  .9,  .9,  .9, 1.0, 1.0 },
+  {  .9,  .9,  .9,  .9,  .9,  .9,  .9,  .8,  .7,  .6,  .6,  .5,  .5,  .5,  .5,  .6,  .7,  .8,  .7,  .7,  .8,  .9,  .9 }
+};
+
+// Estimate the time taken by collective `coll` on `nBytes` bytes with the given
+// algorithm and protocol, from the model stored in comm->bandwidths and
+// comm->latencies.
+//
+//   numPipeOps : number of operations pipelined together.
+//   time       : output; set to -1.0 when the combination has zero bandwidth
+//                in the model.
+//
+// Always returns ncclSuccess.
+ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, int protocol, size_t nBytes, int numPipeOps, float* time) {
+  float bandwidth = comm->bandwidths[coll][algorithm][protocol];
+  float latency = comm->latencies[coll][algorithm][protocol];
+
+  if (bandwidth == 0) {
+    *time = -1.0;
+    return ncclSuccess;
+  }
+
+  const bool isAllReduce = (coll == ncclFuncAllReduce);
+
+  // Column of the correction table: log2 of the size expressed in 64-byte units.
+  int sizeBucket = log2i(nBytes>>6);
+  const bool bucketInTable = (sizeBucket >= 0 && sizeBucket < 23);
+  if (algorithm == NCCL_ALGO_TREE && isAllReduce && bucketInTable) {
+    bandwidth *= treeCorrectionFactor[protocol][sizeBucket];
+  }
+
+  // Plateau effect of ring. The per-channel, per-rank size is only computed
+  // once all the cheaper conditions hold.
+  const bool multiNodeSimpleRingAllReduce =
+    algorithm == NCCL_ALGO_RING && protocol == NCCL_PROTO_SIMPLE && comm->nNodes > 1 && isAllReduce;
+  if (multiNodeSimpleRingAllReduce && nBytes/(comm->nChannels*comm->nRanks) >= 64) {
+    latency *= comm->minCompCap < 80 ? 1.9 : 1.4;
+  }
+
+  // Tree pipelining saves latency in aggregation cases
+  int latCount;
+  if (algorithm == NCCL_ALGO_RING) {
+    latCount = numPipeOps;
+  } else {
+    latCount = DIVUP(numPipeOps, NCCL_MAX_DEV_WORK_BATCH_COLLS);
+  }
+  *time = latency * latCount + nBytes / (1000 * bandwidth);
+  return ncclSuccess;
+}
diff --git a/src/include/nccl_device/coop.h b/src/include/nccl_device/coop.h
new file mode 100644
index 00000000000..4af229dfbe5
--- /dev/null
+++ b/src/include/nccl_device/coop.h
@@ -0,0 +1,211 @@
+/*************************************************************************
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef _NCCL_DEVICE_COOP_H_
+#define _NCCL_DEVICE_COOP_H_
+#include "utility.h"
+
+// ncclCoop[Foo]: NCCL's versions of CUDA's Cooperative Groups. They conform
+// to just this subset of the CUDA API:
+//   int Coop::thread_rank();
+//   int Coop::size();
+//   int Coop::num_threads(); // same as size()
+//   void Coop::sync();
+
+#if __CUDACC__
+template<int nThreadsPow2>
+struct ncclCoopTile { // An aligned pow2 set of threads within the warp.
+  static_assert(nccl::utility::isPow2(nThreadsPow2) && nThreadsPow2 <= 32, "Condition required");
+  // Rank of the caller inside its tile: nccl::utility::lane() (presumably the warp lane index) modulo the tile width.
+  NCCL_DEVICE_INLINE int thread_rank() const {
+    return nccl::utility::lane() % nThreadsPow2;
+  }
+  NCCL_DEVICE_INLINE constexpr int size() const { return nThreadsPow2; }
+  NCCL_DEVICE_INLINE constexpr int num_threads() const { return nThreadsPow2; }
+  // 32-bit mask with nThreadsPow2 consecutive ones, shifted up to the lane value rounded down to a multiple of nThreadsPow2.
+  NCCL_DEVICE_INLINE uint32_t laneMask() const {
+    return (-1u>>(32-nThreadsPow2))<<(nccl::utility::lane() & -nThreadsPow2);
+  }
+  NCCL_DEVICE_INLINE void sync() {
+    if (nThreadsPow2 > 1) __syncwarp(laneMask()); // A one-thread tile skips the warp sync entirely.
+  }
+};
+#endif
+
+#if __CUDACC__
+typedef ncclCoopTile<1> ncclCoopThread; // Tile of width 1: a single thread.
+typedef ncclCoopTile<32> ncclCoopWarp; // Tile of width 32, the largest width ncclCoopTile accepts.
+#endif
+
+#if __CUDACC__
+struct ncclCoopLanes { // Some lanes of this warp.
+  uint32_t lmask; // One bit per member lane; the constructor defaults it to all ones.
+
+  NCCL_DEVICE_INLINE constexpr ncclCoopLanes(uint32_t lmask=-1u): lmask(lmask) {}
+  // Rank = number of member bits selected by lanemask_lt() (presumably the lanes below the caller - confirm in utility.h).
+  NCCL_DEVICE_INLINE int thread_rank() const {
+    return __popc(lmask & nccl::utility::lanemask_lt());
+  }
+  NCCL_DEVICE_INLINE int size() const {
+    return __popc(lmask); // Member count is the population count of the mask.
+  }
+  NCCL_DEVICE_INLINE int num_threads() const {
+    return __popc(lmask); // Same value as size().
+  }
+  NCCL_DEVICE_INLINE void sync() {
+    __syncwarp(lmask); // Synchronizes exactly the lanes named in lmask.
+  }
+};
+#endif
+
+#if __CUDACC__
+// A set of consecutive warps that the user has also supplied with a unique
+// id from [0..14] (sync() uses barrier 1+id and the bcast stash has 15 slots). It is an error
+// for two different warp spans with the same id to be in a collective concurrently.
+struct ncclCoopWarpSpan {
+  uint32_t warp0:8, nWarps:8, id:8; // First warp index, number of warps, user-supplied id (8 bits each).
+
+  NCCL_DEVICE_INLINE constexpr ncclCoopWarpSpan(int warp0, int nWarps, int id):
+    warp0(warp0), nWarps(nWarps), id(id) {
+  }
+
+  NCCL_DEVICE_INLINE int thread_rank() const {
+    return threadIdx.x - 32*warp0; // Offset from the span's first thread; only threadIdx.x is consulted.
+  }
+  NCCL_DEVICE_INLINE int size() const {
+    return 32*nWarps; // 32 threads per warp.
+  }
+  NCCL_DEVICE_INLINE int num_threads() const {
+    return 32*nWarps; // Same value as size().
+  }
+
+  NCCL_DEVICE_INLINE void sync() {
+    //asm volatile("barrier.sync %0, %1;" :: "r"(1+id), "r"(32*nWarps) : "memory");
+    __barrier_sync_count(1+id, 32*nWarps); // Barrier number 1+id, expecting all 32*nWarps threads of the span.
+  }
+};
+#endif
+
+#if __CUDACC__
+struct ncclCoopCta { // The thread block, addressed through its x dimension only.
+  NCCL_DEVICE_INLINE int thread_rank() const { return threadIdx.x; } // threadIdx.y/z are not consulted.
+  NCCL_DEVICE_INLINE int size() const { return blockDim.x; } // blockDim.y/z are not consulted.
+  NCCL_DEVICE_INLINE int num_threads() const { return blockDim.x; } // Same value as size().
+  NCCL_DEVICE_INLINE void sync() { __syncthreads(); } // Block-wide barrier.
+};
+#endif
+
+#if __CUDACC__
+template<int nThreadsPow2>
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopTile<nThreadsPow2> coop) {
+  return coop.laneMask(); // Lanes of the caller's tile.
+}
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopLanes coop) {
+  return coop.lmask; // The stored member mask, unchanged.
+}
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopWarpSpan coop) {
+  return -1u; // All 32 lanes; the argument is not inspected.
+}
+NCCL_DEVICE_INLINE uint32_t ncclCoopGetLaneMask(ncclCoopCta coop) {
+  return -1u; // All 32 lanes; the argument is not inspected.
+}
+#endif
+
+#if __CUDACC__
+// ncclCoopIsThread:
+// True only when the coop's type tells us at compile time that it is a single thread.
+template<int nThreads>
+NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopTile<nThreads>) {
+  return nThreads == 1;
+}
+NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopLanes) { return false; } // lmask is a run-time value.
+NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopWarpSpan) { return false; }
+NCCL_DEVICE_INLINE constexpr bool ncclCoopIsThread(ncclCoopCta) { return false; }
+#endif
+
+#if __CUDACC__
+template<int nThreads>
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopTile<nThreads>) { return true; } // Tile width is at most 32.
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopLanes) { return true; } // A 32-bit mask of lanes.
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopWarpSpan) { return false; } // Sized in whole warps (32*nWarps).
+NCCL_DEVICE_INLINE constexpr bool ncclCoopWithinWarp(ncclCoopCta) { return false; } // Sized by blockDim.x.
+#endif
+
+#if __CUDACC__
+// Pick threads of our warp that are safe to use collectively.
+NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced() {
+  return ncclCoopLanes{__activemask()}; // Group made of the lanes __activemask() reports at this call.
+}
+#endif
+
+#if __CUDACC__
+// Pick threads of our warp that are safe to use collectively given that this
+// is a collective on the provided cooperative group.
+template<typename Coop>
+NCCL_DEVICE_INLINE ncclCoopTile<32> ncclCoopCoalesced(Coop) {
+  return ncclCoopTile<32>(); // Generic case (any coop without a more specific overload below): the full warp.
+}
+NCCL_DEVICE_INLINE ncclCoopLanes ncclCoopCoalesced(ncclCoopLanes coop) {
+  return coop; // Already a sub-warp group: returned unchanged.
+}
+template<int nThreads>
+NCCL_DEVICE_INLINE ncclCoopTile<nThreads> ncclCoopCoalesced(ncclCoopTile<nThreads> coop) {
+  return coop; // Already a sub-warp group: returned unchanged.
+}
+#endif
+
+#if __CUDACC__
+template<int nThreads, typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopTile<nThreads>, T value, int root, bool entrySync=true) { // entrySync is unused here.
+  constexpr int n = (sizeof(T)+4-1)/4; // Number of 32-bit words needed to cover T.
+  union { uint32_t u[n]; T v; }; // Lets the value be shuffled one 32-bit word at a time.
+  v = value; // NOTE(review): the shuffle mask below is -1u, not laneMask() - confirm the whole warp calls this together.
+  #pragma unroll
+  for (int i=0; i < n; i++) u[i] = __shfl_sync(-1u, u[i], root, nThreads); // width=nThreads: root is a rank in the tile.
+  return v;
+}
+template<typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopLanes coop, T value, int root, bool entrySync=true) { // entrySync is unused here.
+  uint32_t m = coop.lmask;
+  uint32_t r = root == 0 ? __ffs(m)-1 : __fns(m, 0, 1+root); // Warp lane of rank `root`: the (1+root)-th set bit of m.
+  constexpr int n = (sizeof(T)+4-1)/4; // Number of 32-bit words needed to cover T.
+  union { uint32_t u[n]; T v; }; // Lets the value be shuffled one 32-bit word at a time.
+  v = value;
+  #pragma unroll
+  for (int i=0; i < n; i++) u[i] = __shfl_sync(m, u[i], r); // Only the member lanes take part in the shuffle.
+  return v;
+}
+
+NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_WarpSpan_stash() {
+  __shared__ ulong2 stash[15]; // One ulong2 slot per warp-span id; indexed by coop.id below.
+  return stash;
+}
+
+template<typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopWarpSpan coop, T value, int root, bool entrySync=true) {
+  static_assert(sizeof(T) <= sizeof(ncclCoopBcast_WarpSpan_stash()[0]), "Required"); // T must fit in one slot.
+  if (entrySync) coop.sync(); // Presumably lets readers of an earlier broadcast on this slot finish before the overwrite.
+  if (coop.thread_rank() == root) *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id] = value; // Only the root writes the slot.
+  coop.sync(); // Orders the root's write before the reads below.
+  return *(T*)&ncclCoopBcast_WarpSpan_stash()[coop.id]; // No trailing sync: see entrySync for back-to-back calls.
+}
+
+NCCL_DEVICE_INLINE ulong2* ncclCoopBcast_Cta_stash() {
+  __shared__ ulong2 stash; // Single shared-memory slot used by the CTA-wide broadcast below.
+  return &stash;
+}
+
+template<typename T>
+NCCL_DEVICE_INLINE T ncclCoopBcast(ncclCoopCta coop, T value, int root, bool entrySync=true) {
+  static_assert(sizeof(T) <= sizeof(*ncclCoopBcast_Cta_stash()), "Required"); // T must fit in the slot.
+  if (entrySync) coop.sync(); // Presumably lets readers of an earlier broadcast finish before the slot is overwritten.
+  if (coop.thread_rank() == root) *(T*)ncclCoopBcast_Cta_stash() = value; // Only the root writes the slot.
+  coop.sync(); // Orders the root's write before the reads below.
+  return *(T*)ncclCoopBcast_Cta_stash(); // No trailing sync: see entrySync for back-to-back calls.
+}
+#endif
+
+#endif
diff --git a/src/nccl.h.in b/src/nccl.h.in
new file mode 100644
index 00000000000..61de6b800e1
--- /dev/null
+++ b/src/nccl.h.in
@@ -0,0 +1,583 @@
+/*************************************************************************
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_H_
+#define NCCL_H_
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#if CUDART_VERSION >= 11000
+#include <cuda_bf16.h>
+#endif
+#if __cplusplus && CUDART_VERSION >= 11080
+#include <cuda_fp8.h>
+#endif
+
+#define NCCL_MAJOR ${nccl:Major}
+#define NCCL_MINOR ${nccl:Minor}
+#define NCCL_PATCH ${nccl:Patch}
+#define NCCL_SUFFIX "${nccl:Suffix}"
+
+#define NCCL_VERSION_CODE ${nccl:Version}
+#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <limits.h>
+
+/* Opaque handle to communicator */
+typedef struct ncclComm* ncclComm_t;
+typedef struct ncclWindow_vidmem* ncclWindow_t;
+#define NCCL_COMM_NULL NULL
+
+#define NCCL_UNIQUE_ID_BYTES 128
+typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+
+/* Error type */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6,
+               ncclInProgress              =  7,
+               ncclNumResults              =  8 } ncclResult_t;
+
+#define NCCL_CONFIG_UNDEF_INT INT_MIN
+#define NCCL_CONFIG_UNDEF_PTR NULL
+#define NCCL_SPLIT_NOCOLOR -1
+#define NCCL_UNDEF_FLOAT -1.0f
+
+/* Window Registration flags */
+#define NCCL_WIN_DEFAULT 0x00
+#define NCCL_WIN_COLL_SYMMETRIC 0x01
+
+#define NCCL_WIN_REQUIRED_ALIGNMENT 4096
+
+/* NCCL performance policy */
+#define NCCL_CTA_POLICY_DEFAULT 0x00
+#define NCCL_CTA_POLICY_EFFICIENCY 0x01
+#define NCCL_CTA_POLICY_ZERO 0x02
+
+/* ncclCommShrink flags*/
+#define NCCL_SHRINK_DEFAULT 0x00 /* shrink the parent communicator */
+#define NCCL_SHRINK_ABORT 0x01   /* First, terminate ongoing parent operations, and then shrink the parent communicator */
+
+/* ncclCommRevoke flags */
+#define NCCL_REVOKE_DEFAULT 0x00 /* reserved for future use; must be 0 */
+
+/* Communicator configuration. Users can assign value to attributes to specify the
+ * behavior of a communicator. */
+typedef struct ncclConfig_v22800 {
+  /* attributes that users should never touch. */
+  size_t size;
+  unsigned int magic;
+  unsigned int version;
+  /* attributes that users are able to customize. */
+  int blocking;
+  int cgaClusterSize;
+  int minCTAs;
+  int maxCTAs;
+  const char *netName;
+  int splitShare;
+  int trafficClass;
+  const char *commName;
+  int collnetEnable;
+  int CTAPolicy;
+  int shrinkShare;
+  int nvlsCTAs;
+  int nChannelsPerNetPeer;
+  int nvlinkCentricSched;
+} ncclConfig_t;
+
+/* Config initializer must be assigned to initialize config structure when it is created.
+ * Not initialized config will result in NCCL error. */
+#define NCCL_CONFIG_INITIALIZER {                                       \
+  sizeof(ncclConfig_t), /* size */                                      \
+  0xcafebeef,           /* magic */                                     \
+  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */       \
+  NCCL_CONFIG_UNDEF_INT,                    /* blocking */              \
+  NCCL_CONFIG_UNDEF_INT,                    /* cgaClusterSize */        \
+  NCCL_CONFIG_UNDEF_INT,                    /* minCTAs */               \
+  NCCL_CONFIG_UNDEF_INT,                    /* maxCTAs */               \
+  NCCL_CONFIG_UNDEF_PTR,                    /* netName */               \
+  NCCL_CONFIG_UNDEF_INT,                    /* splitShare */            \
+  NCCL_CONFIG_UNDEF_INT,                    /* trafficClass */          \
+  NCCL_CONFIG_UNDEF_PTR,                    /* commName */              \
+  NCCL_CONFIG_UNDEF_INT,                    /* collnetEnable */         \
+  NCCL_CONFIG_UNDEF_INT,                    /* CTAPolicy */             \
+  NCCL_CONFIG_UNDEF_INT,                    /* shrinkShare */           \
+  NCCL_CONFIG_UNDEF_INT,                    /* nvlsCTAs */              \
+  NCCL_CONFIG_UNDEF_INT,                    /* nChannelsPerNetPeer */   \
+  NCCL_CONFIG_UNDEF_INT,                    /* nvlinkCentricSched */    \
+}
+
+/* This struct will be used by ncclGroupSimulateEnd() API to query information about simulation. */
+typedef struct ncclSimInfo_v22200 {
+    size_t size;
+    unsigned int magic;
+    unsigned int version;
+    float estimatedTime;
+} ncclSimInfo_t;
+
+/* NCCL_SIM_INFO_INITIALIZER must be assigned to initialize simInfo structure when it is created.
+ * Not initialized simInfo will result in NCCL error. */
+#define NCCL_SIM_INFO_INITIALIZER {                                         \
+  sizeof(ncclSimInfo_t),                            /* size */              \
+  0x74685283,                                       /* magic */             \
+  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */           \
+  NCCL_UNDEF_FLOAT                                  /* estimated time */    \
+}
+
+/* NCCL malloc and free function for all types of NCCL optimizations
+ * (e.g. user buffer registration). The actual allocated size might
+ * be larger than requested due to granularity requirement. */
+ncclResult_t  ncclMemAlloc(void** ptr, size_t size);
+ncclResult_t pncclMemAlloc(void** ptr, size_t size);
+
+ncclResult_t  ncclMemFree(void *ptr);
+ncclResult_t pncclMemFree(void *ptr);
+
+/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
+ * NCCL library
+ */
+ncclResult_t  ncclGetVersion(int *version);
+ncclResult_t pncclGetVersion(int *version);
+
+/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
+ * called once and the Id should be distributed to all ranks in the
+ * communicator before calling ncclCommInitRank. */
+ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
+ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
+
+/* Create a new communicator (multi thread/process version) with a configuration
+ * set by users. */
+ncclResult_t  ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
+ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
+
+/* Creates a new communicator (multi thread/process version).
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
+ * Each rank is associated to a CUDA device, which has to be set before calling
+ * ncclCommInitRank.
+ * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
+ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+
+/* Creates a clique of communicators (single process version).
+ * This is a convenience function to create a single-process communicator clique.
+ * Returns an array of ndev newly initialized communicators in comm.
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
+ * If devlist is NULL, the first ndev CUDA devices are used.
+ * Order of devlist defines user-order of processors within the communicator. */
+ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+
+/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
+ * when the communicator is globally quiescent and related resources are freed; then,
+ * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
+ * itself) without blocking. */
+ncclResult_t  ncclCommFinalize(ncclComm_t comm);
+ncclResult_t pncclCommFinalize(ncclComm_t comm);
+
+/* Frees local resources associated with communicator object. */
+ncclResult_t  ncclCommDestroy(ncclComm_t comm);
+ncclResult_t pncclCommDestroy(ncclComm_t comm);
+
+/* Frees resources associated with communicator object and aborts any operations
+ * that might still be running on the device. */
+ncclResult_t  ncclCommAbort(ncclComm_t comm);
+ncclResult_t pncclCommAbort(ncclComm_t comm);
+
+/* Revoke a communicator. ncclCommRevoke stops all in-flight operations
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
+ * when the communicator is quiescent; then, management operations (destroy, split,
+ * shrink) can proceed safely. Calling ncclCommFinalize after revoke is invalid.
+ * Additionally, resource sharing via splitShare/shrinkShare is disabled while revoked.
+ * revokeFlags must be NCCL_REVOKE_DEFAULT (0). */
+ncclResult_t  ncclCommRevoke(ncclComm_t comm, int revokeFlags);
+ncclResult_t pncclCommRevoke(ncclComm_t comm, int revokeFlags);
+
+/* Creates one or more communicators from an existing one.
+ * Ranks with the same color will end up in the same communicator.
+ * Within the new communicator, key will be used to order ranks.
+ * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
+ * and will therefore return a NULL communicator.
+ * If config is NULL, the new communicator will inherit the original communicator's
+ * configuration*/
+ncclResult_t  ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+
+/* Shrink existing communicator.
+ * Ranks in excludeRanksList will be removed from the existing communicator.
+ * Within the new communicator, ranks will be re-ordered to fill the gap of removed ones.
+ * If config is NULL, the new communicator will inherit the original communicator's configuration
+ * The flag enables NCCL to adapt to various states of the parent communicator, see NCCL_SHRINK flags.*/
+ncclResult_t  ncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
+ncclResult_t pncclCommShrink(ncclComm_t comm, int* excludeRanksList, int excludeRanksCount, ncclComm_t* newcomm, ncclConfig_t* config, int shrinkFlags);
+
+/* Creates a new communicator (multi thread/process version), similar to ncclCommInitRankConfig.
+ * Allows to use more than one ncclUniqueId (up to one per rank), indicated by nId, to accelerate the init operation.
+ * The number of ncclUniqueIds and their order must be the same for every rank.
+ */
+ncclResult_t ncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
+ncclResult_t pncclCommInitRankScalable(ncclComm_t* newcomm, int nranks, int myrank, int nId, ncclUniqueId* commIds, ncclConfig_t* config);
+
+/* Returns a string for each error code. */
+const char*  ncclGetErrorString(ncclResult_t result);
+const char* pncclGetErrorString(ncclResult_t result);
+
+/* Returns a human-readable message of the last error that occurred. */
+const char*  ncclGetLastError(ncclComm_t comm);
+const char* pncclGetLastError(ncclComm_t comm);
+
+/* Reload environment variables that determine logging. */
+__attribute__ ((deprecated("ncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future")))
+void  ncclResetDebugInit();
+__attribute__ ((deprecated("pncclResetDebugInit is not supported as part of the NCCL API and will be removed in the future")))
+void pncclResetDebugInit();
+
+/* Checks whether the comm has encountered any asynchronous errors */
+ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
+/* Gets the number of ranks in the communicator clique. */
+ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
+ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
+
+/* Returns the cuda device number associated with the communicator. */
+ncclResult_t  ncclCommCuDevice(const ncclComm_t comm, int* device);
+ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
+
+/* Returns the user-ordered "rank" associated with the communicator. */
+ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
+ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
+
+/* Register CUDA buffer for zero-copy operation */
+ncclResult_t  ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+
+/* Deregister CUDA buffer */
+ncclResult_t  ncclCommDeregister(const ncclComm_t comm, void* handle);
+ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
+
+/* Register memory window  */
+ncclResult_t  ncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+ncclResult_t pncclCommWindowRegister(ncclComm_t comm, void* buff, size_t size, ncclWindow_t* win, int winFlags);
+
+/* Deregister symmetric memory */
+ncclResult_t  ncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
+ncclResult_t pncclCommWindowDeregister(ncclComm_t comm, ncclWindow_t win);
+
+/* Reduction operation selector */
+typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
+typedef enum { ncclSum        = 0,
+               ncclProd       = 1,
+               ncclMax        = 2,
+               ncclMin        = 3,
+               ncclAvg        = 4,
+               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
+                * serves as the least possible value for dynamic ncclRedOp_t's
+                * as constructed by ncclRedOpCreate*** functions. */
+               ncclNumOps     = 5,
+               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
+                * It is defined to be the largest signed value (since compilers
+                * are permitted to use signed enums) that won't grow
+                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
+                * maintain ABI compatibility. */
+               ncclMaxRedOp   = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
+             } ncclRedOp_t;
+
+/* Data types */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclBfloat16   = 9,
+               ncclFloat8e4m3 = 10,
+               ncclFloat8e5m2 = 11,
+               ncclNumTypes   = 12
+} ncclDataType_t;
+
+/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
+typedef enum {
+  /* ncclScalarDevice: The scalar is in device-visible memory and will be
+   * dereferenced while the collective is running. */
+  ncclScalarDevice = 0,
+
+  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
+   * dereferenced before the ncclRedOpCreate***() function returns. */
+  ncclScalarHostImmediate = 1
+} ncclScalarResidence_t;
+
+/*
+ * ncclRedOpCreatePreMulSum
+ *
+ * Creates a new reduction operator which pre-multiplies input values by a given
+ * scalar locally before reducing them with peer values via summation. For use
+ * only with collectives launched against *comm* and *datatype*. The
+ * *residence* argument indicates how/when the memory pointed to by *scalar*
+ * will be dereferenced. Upon return, the newly created operator's handle
+ * is stored in *op*.
+ */
+ncclResult_t  ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
+ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
+
+/*
+ * ncclRedOpDestroy
+ *
+ * Destroys the reduction operator *op*. The operator must have been created by
+ * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
+ * destroyed as soon as the last NCCL function which is given that operator returns.
+ */
+ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
+ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
+
+/*
+ * Collective communication operations
+ *
+ * Collective communication operations must be called separately for each
+ * communicator in a communicator clique.
+ *
+ * They return when operations have been enqueued on the CUDA stream.
+ *
+ * Since they may perform inter-CPU synchronization, each call has to be done
+ * from a different thread or process, or need to use Group Semantics (see
+ * below).
+ */
+
+/*
+ * Reduce
+ *
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
+ * operation.
+ * recvbuff may be NULL on all calls except for root device.
+ * root is the rank (not the CUDA device) where data will reside after the
+ * operation is complete.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * (deprecated) Broadcast (in-place)
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * This operation is implicitly in place.
+ */
+ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Broadcast
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * All-Reduce
+ *
+ * Reduces data arrays of length count in sendbuff using op operation, and
+ * leaves identical copies of result on each recvbuff.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Reduce-Scatter
+ *
+ * Reduces data in sendbuff using op operation and leaves reduced result
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
+ * block of the result.
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
+ * should have a size of at least nranks*recvcount elements.
+ *
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
+ */
+ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+
+/*
+ * All-Gather
+ *
+ * Each device gathers sendcount values from other GPUs into recvbuff,
+ * receiving data from rank i at offset i*sendcount.
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
+ * should have a size of at least nranks*sendcount elements.
+ *
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
+ */
+ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * All-to-All
+ *
+ * Each device sends count values to all other devices and receives count values
+ * from all other devices. Data to send to destination rank j is taken from
+ * sendbuff+j*count and data received from source rank i is placed at
+ * recvbuff+i*count.
+ */
+ncclResult_t  ncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAlltoAll(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Gather
+ *
+ * Each rank sends count elements from sendbuff to the root rank.
+ * On the root rank, data from rank i is placed at recvbuff + i*count.
+ * On non-root ranks, recvbuff is not used.
+ * root is the rank where data will be gathered.
+ *
+ * In-place operations will happen if sendbuff == recvbuff + root * count.
+ */
+ncclResult_t  ncclGather(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Scatter
+ *
+ * On the root rank, count elements from sendbuff+i*count are sent to rank i.
+ * On non-root ranks, sendbuff is not used.
+ * Each rank receives count elements into recvbuff.
+ * root is the rank that will distribute the data.
+ *
+ * In-place operations will happen if recvbuff == sendbuff + root * count.
+ */
+ncclResult_t  ncclScatter(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Send
+ *
+ * Send data from sendbuff to rank peer.
+ *
+ * Rank peer needs to call ncclRecv with the same datatype and the same count from this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ncclResult_t  ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Receive
+ *
+ * Receive data from rank peer into recvbuff.
+ *
+ * Rank peer needs to call ncclSend with the same datatype and the same count to this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t  ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Group semantics
+ *
+ * When managing multiple GPUs from a single thread, and since NCCL collective
+ * calls may perform inter-CPU synchronization, we need to "group" calls for
+ * different ranks/devices into a single call.
+ *
+ * Grouping NCCL calls as being part of the same collective operation is done
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
+ * to be complete. Note that for collective communication, ncclGroupEnd only
+ * guarantees that the operations are enqueued on the streams, not that
+ * the operation is effectively done.
+ *
+ * Both collective communication and ncclCommInitRank can be used in conjunction
+ * with ncclGroupStart/ncclGroupEnd, but not together.
+ *
+ * Group semantics also allow to fuse multiple operations on the same device
+ * to improve performance (for aggregated collective calls), or to permit
+ * concurrent progress of multiple send/receive operations.
+ */
+
+/*
+ * Group Start
+ *
+ * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
+ * a single NCCL operation. Nothing will be started on the CUDA stream until
+ * ncclGroupEnd.
+ */
+ncclResult_t  ncclGroupStart();
+ncclResult_t pncclGroupStart();
+
+/*
+ * Group End
+ *
+ * End a group call. Start a fused NCCL operation consisting of all calls since
+ * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
+ * need to be called after ncclGroupEnd.
+ */
+ncclResult_t  ncclGroupEnd();
+ncclResult_t pncclGroupEnd();
+
+/*
+ * Group Simulate End
+ *
+ * Simulate a ncclGroupEnd() call and return NCCL's simulation info in a struct.
+ */
+ncclResult_t  ncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
+ncclResult_t pncclGroupSimulateEnd(ncclSimInfo_t* simInfo);
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
+
+#endif // end include guard