Skip to content

Commit 46cd912

Browse files
i-kosarevclaude
andcommitted
net-ib: refactor fault injection tests
Fix fault injection API issues from review: - ncclIbCastFaultSetQpDelay/SetQpError: use comm->base.nqps for bounds check instead of NCCL_IB_MAX_QPS - ncclIbCastFaultClear: atomically reset fatalErrorCount to 0 - move net_ib_fault_inject.h into transport/net_ib_cast/; drop local #define NCCL_IB_MAX_QPS 128, include net_ib_cast_inspect.h and add static_assert so a size mismatch becomes a compile error - fault hook in IbCastMultiSend: use IbCastStatsFatalError (renamed from ncclIbStatsFatalError in asanniko's split) - FaultInjCastQpErrorClearRecovers: ASSERT_EQ on SetQpError return value; drain recvReq before DeregisterMemory - FaultInjCastSingleQpErrorIsFatal: EXPECT_EQ on ncclIbCastSetTokens and ncclIbCastFaultClear return values Consolidate NCCL_IB_MAX_QPS definition and reorganize headers: - add src/transport/net_ib_limits.h as single source of truth for NCCL_IB_MAX_QPS 128, removing duplicate defines from net_ib.cc, common_cast.h, and net_ib_cast_inspect.h - move net_ib_cast_inspect.h from src/include/ to src/transport/net_ib_cast/ alongside net_ib_fault_inject.h; update CMakeLists include paths - group net_ib_limits.h, net_ib_cast_inspect.h, net_ib_fault_inject.h together next to transport/net_ib.cc in src/CMakeLists.txt Reduce test code duplication: - add helpers to NetIbMPITestBase: DrainRecvRequest, TeardownConnection, ExpectEqualWeightInitTokens, ExpectActiveTokenSumInvariant - use them in FaultInjectTests.cpp and CastTests.cpp to eliminate ~300 lines of repeated teardown and sched-state assertion code Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
1 parent 23a6d58 commit 46cd912

13 files changed

Lines changed: 148 additions & 244 deletions

File tree

projects/rccl/src/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ set(SRC_FILES
115115
include/nccl_device.h
116116
include/net_device.h
117117
include/net.h
118-
include/net_ib_cast_inspect.h
119118
include/nvmlwrap.h
120119
include/nvtx.h
121120
include/nvtx_payload_schemas.h
@@ -219,7 +218,6 @@ set(SRC_FILES
219218
include/plugin/net/net_v11.h
220219
include/plugin/profiler/net_ib_v1.h
221220
include/plugin/profiler/net_ib.h
222-
include/net_ib_fault_inject.h
223221
include/plugin/profiler/net_socket_v1.h
224222
include/plugin/profiler/net_socket.h
225223
include/plugin/profiler/profiler_v1.h
@@ -296,6 +294,9 @@ set(SRC_FILES
296294
transport/generic.cc
297295
transport/net.cc
298296
transport/net_ib.cc
297+
transport/net_ib_limits.h
298+
transport/net_ib_cast/net_ib_cast_inspect.h
299+
transport/net_ib_cast/net_ib_fault_inject.h
299300
# net_ib_rocm.cc is generated by rocmIb.cmake directly into the hipify staging area
300301
# so it is not listed here - it's added to HIP_SOURCES separately below
301302
transport/net_socket.cc

projects/rccl/src/transport/net_ib.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1164,7 +1164,7 @@ ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
11641164
#define MAX_REQUESTS (NCCL_NET_MAX_REQUESTS*NCCL_NET_IB_MAX_RECVS)
11651165
static_assert(MAX_REQUESTS <= 256, "request id are encoded in wr_id and we need up to 8 requests ids per completion");
11661166

1167-
#define NCCL_IB_MAX_QPS 128
1167+
#include "net_ib_limits.h"
11681168

11691169
// Per-QP connection metatdata
11701170
struct ncclIbQpInfo {

projects/rccl/src/transport/net_ib_cast/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# Transport sources (paths relative to PROJECT_SOURCE_DIR to match SRC_FILES convention)
22
set(NET_IB_CAST_SOURCES
3+
src/transport/net_ib_limits.h
4+
src/transport/net_ib_cast/net_ib_cast_inspect.h
35
src/transport/net_ib_cast/common_cast.h
46
src/transport/net_ib_cast/connect_cast.h
57
src/transport/net_ib_cast/gin_cast.h
@@ -26,6 +28,7 @@ set(NET_IB_CAST_SOURCES
2628
# is forwarded to target_include_directories(rccl ...).
2729
set(NET_IB_CAST_INCLUDE_DIRS
2830
"${CMAKE_BINARY_DIR}/hipify/src/transport/net_ib_cast"
31+
"${CMAKE_BINARY_DIR}/hipify/src/transport"
2932
PARENT_SCOPE)
3033

3134
# Propagate sources to parent scope so src/CMakeLists.txt can use ${NET_IB_CAST_SOURCES}

projects/rccl/src/transport/net_ib_cast/common_cast.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -748,7 +748,6 @@ ncclResult_t IbCastSetNetAttr(void *ctx, ncclNetAttr_t *netAttr);
748748
#define NCCL_CTS_QP_SLOT_INVALID 0xFF
749749

750750
// IB-CAST specific infrastructure
751-
#define NCCL_IB_MAX_QPS 128
752751

753752
#define NSEC_PER_USEC 1000ULL
754753
#define NSEC_PER_MSEC (NSEC_PER_USEC * 1000)

projects/rccl/src/include/net_ib_cast_inspect.h renamed to projects/rccl/src/transport/net_ib_cast/net_ib_cast_inspect.h

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#define RCCL_NET_IB_CAST_INSPECT_H_
99

1010
#include <stdint.h>
11+
#include "net_ib_limits.h"
1112

1213
#ifdef __cplusplus
1314
#include "nccl.h" /* ncclResult_t */
@@ -19,14 +20,10 @@ extern "C" {
1920
/*
2021
* Test-only introspection API for the net-ib-cast WRR scheduler.
2122
*
22-
* Implementation lives in src/transport/net_ib_cast.cc and is compiled into
23-
* librccl.so. Both the library and the unit tests include this one header so
24-
* the struct layout and function signatures can never diverge.
25-
*
26-
* NCCL_IB_MAX_QPS is the maximum number of QPs per connection.
27-
* Defined here so both the library and the tests share the same value.
23+
* Implementation lives in src/transport/net_ib_cast/scheduler.cc and is
24+
* compiled into librccl.so. Both the library and the unit tests include this
25+
* one header so the struct layout and function signatures can never diverge.
2826
*/
29-
#define NCCL_IB_MAX_QPS 128
3027

3128
struct ncclIbCastSchedState {
3229
int nqps;

projects/rccl/src/include/net_ib_fault_inject.h renamed to projects/rccl/src/transport/net_ib_cast/net_ib_fault_inject.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,11 @@ extern "C" {
2222
/*
2323
* Test-only per-QP fault injection API for the net-ib CAST transport.
2424
*
25-
* Implemented in src/transport/net_ib_cast.cc (CAST multi-QP path).
26-
*
27-
* NCCL_IB_MAX_QPS is defined in net_ib_cast_inspect.h; guard against
28-
* double-definition when both headers are included together.
25+
* Implemented in src/transport/net_ib_cast/p2p.cc (CAST multi-QP path).
2926
*/
30-
#ifndef NCCL_IB_MAX_QPS
31-
#define NCCL_IB_MAX_QPS 128
27+
#include "net_ib_cast_inspect.h"
28+
#ifdef __cplusplus
29+
static_assert(NCCL_IB_MAX_QPS == 128, "fault injection arrays sized for 128 QPs; update if NCCL_IB_MAX_QPS changes");
3230
#endif
3331

3432
/* ── CAST path (net_ib_cast.cc) ───────────────────────────────────────── */
@@ -39,7 +37,7 @@ extern "C" {
3937
ncclResult_t ncclIbCastFaultSetQpDelay(void* sendComm, int qpIdx, uint32_t delayUs);
4038

4139
/* Arm error injection on a specific QP index.
42-
* When armed, the hook calls ncclIbStatsFatalError and then returns
40+
* When armed, the hook calls IbCastStatsFatalError and then returns
4341
* ncclSystemError instead of calling wrap_ibv_post_send.
4442
* Set inject=false to disarm. */
4543
ncclResult_t ncclIbCastFaultSetQpError(void* sendComm, int qpIdx, bool inject);

projects/rccl/src/transport/net_ib_cast/p2p.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ ncclResult_t IbCastMultiSend(struct ncclIbSendComm* comm, int slot, int nqps, in
298298
const uint32_t faultDelay = comm->base.faultQpDelayUs[qpIndex];
299299
if (faultDelay) usleep(faultDelay);
300300
if (comm->base.faultQpError[qpIndex]) {
301-
ncclIbStatsFatalError(&comm->base.stats);
301+
IbCastStatsFatalError(&comm->base.stats);
302302
return ncclSystemError;
303303
}
304304
}
@@ -1105,15 +1105,15 @@ ncclResult_t IbCastTest(void* request, int* done, int* sizes) {
11051105
ncclResult_t ncclIbCastFaultSetQpDelay(void* sendComm, int qpIdx, uint32_t delayUs) {
11061106
if (!sendComm) return ncclInvalidArgument;
11071107
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
1108-
if (qpIdx < 0 || qpIdx >= NCCL_IB_MAX_QPS) return ncclInvalidArgument;
1108+
if (qpIdx < 0 || qpIdx >= comm->base.nqps) return ncclInvalidArgument;
11091109
comm->base.faultQpDelayUs[qpIdx] = delayUs;
11101110
return ncclSuccess;
11111111
}
11121112

11131113
ncclResult_t ncclIbCastFaultSetQpError(void* sendComm, int qpIdx, bool inject) {
11141114
if (!sendComm) return ncclInvalidArgument;
11151115
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
1116-
if (qpIdx < 0 || qpIdx >= NCCL_IB_MAX_QPS) return ncclInvalidArgument;
1116+
if (qpIdx < 0 || qpIdx >= comm->base.nqps) return ncclInvalidArgument;
11171117
comm->base.faultQpError[qpIdx] = inject;
11181118
return ncclSuccess;
11191119
}
@@ -1123,6 +1123,7 @@ ncclResult_t ncclIbCastFaultClear(void* sendComm) {
11231123
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
11241124
memset(comm->base.faultQpDelayUs, 0, sizeof(comm->base.faultQpDelayUs));
11251125
memset(comm->base.faultQpError, 0, sizeof(comm->base.faultQpError));
1126+
__atomic_store_n(&comm->base.stats.fatalErrorCount, 0, __ATOMIC_RELEASE);
11261127
return ncclSuccess;
11271128
}
11281129

projects/rccl/src/transport/net_ib_cast/scheduler.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,7 @@ ncclResult_t IbCastQpSchedFreeRemap(struct ncclIbRemapWrId* r) {
425425
// Test introspection API — exposes internal WRR scheduler state from a
426426
// sendComm handle. Only intended for unit tests; not part of the public net
427427
// plugin ABI.
428-
// Struct definition and function prototypes live in src/include/net_ib_cast_inspect.h.
428+
// Struct definition and function prototypes live in src/transport/net_ib_cast/net_ib_cast_inspect.h.
429429
// =============================================================================
430430

431431
// ncclIbCastGetSchedState — copy WRR scheduler state out of a sendComm.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
/*************************************************************************
2+
* Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#ifndef RCCL_NET_IB_LIMITS_H_
8+
#define RCCL_NET_IB_LIMITS_H_
9+
10+
/* Maximum number of QPs per IB connection (CAST and regular paths). */
11+
#define NCCL_IB_MAX_QPS 128
12+
13+
#endif /* RCCL_NET_IB_LIMITS_H_ */

projects/rccl/test/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ if(BUILD_TESTS)
9494
${PROJECT_BINARY_DIR}/hipify/gensrc # for rccl_bfloat16.h
9595
${PROJECT_BINARY_DIR}/hipify/src # for graph/topo.h
9696
${PROJECT_BINARY_DIR}/hipify/src/include/plugin # for recorder tests, nccl_tuner.h
97+
${PROJECT_BINARY_DIR}/hipify/src/transport/net_ib_cast # for net_ib_fault_inject.h, net_ib_cast_inspect.h
98+
${PROJECT_BINARY_DIR}/hipify/src/transport # for net_ib_limits.h
9799
${ROCM_PATH}/include
98100
${ROCM_PATH}
99101
)

0 commit comments

Comments
 (0)