Skip to content

Commit 5f29b91

Browse files
i-kosarevclaude
andcommitted
net-ib: fix fault injection API issues from review
- Validate qpIndex against comm->base.nqps (actual QP count) instead of NCCL_IB_MAX_QPS (128) in FaultSetQpDelay and FaultSetQpError, so arming a non-existent QP returns ncclInvalidArgument immediately. - Reset fatalErrorCount in ncclIbCastFaultClear so the connection is truly recoverable after clearing fault state. - Add phase 1 verification in FaultInjCastQpErrorClearRecovers: rank 1 forwards sendRet + fatalCount to rank 0 via MPI, rank 0 asserts that the fault was actually observed before testing recovery in phase 2. Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
1 parent 7da9b83 commit 5f29b91

2 files changed

Lines changed: 25 additions & 5 deletions

File tree

projects/rccl/src/transport/net_ib_cast.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2942,15 +2942,15 @@ static int IbCastQpSchedGetEffectiveTxNqps(struct ncclIbRequest* req, int *start
29422942
ncclResult_t ncclIbCastFaultSetQpDelay(void* sendComm, int qpIdx, uint32_t delayUs) {
29432943
if (!sendComm) return ncclInvalidArgument;
29442944
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
2945-
if (qpIdx < 0 || qpIdx >= NCCL_IB_MAX_QPS) return ncclInvalidArgument;
2945+
if (qpIdx < 0 || qpIdx >= comm->base.nqps) return ncclInvalidArgument;
29462946
comm->base.faultQpDelayUs[qpIdx] = delayUs;
29472947
return ncclSuccess;
29482948
}
29492949

29502950
ncclResult_t ncclIbCastFaultSetQpError(void* sendComm, int qpIdx, bool inject) {
29512951
if (!sendComm) return ncclInvalidArgument;
29522952
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
2953-
if (qpIdx < 0 || qpIdx >= NCCL_IB_MAX_QPS) return ncclInvalidArgument;
2953+
if (qpIdx < 0 || qpIdx >= comm->base.nqps) return ncclInvalidArgument;
29542954
comm->base.faultQpError[qpIdx] = inject;
29552955
return ncclSuccess;
29562956
}
@@ -2960,6 +2960,7 @@ ncclResult_t ncclIbCastFaultClear(void* sendComm) {
29602960
struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm;
29612961
memset(comm->base.faultQpDelayUs, 0, sizeof(comm->base.faultQpDelayUs));
29622962
memset(comm->base.faultQpError, 0, sizeof(comm->base.faultQpError));
2963+
__atomic_store_n(&comm->base.stats.fatalErrorCount, 0, __ATOMIC_RELEASE);
29632964
return ncclSuccess;
29642965
}
29652966

projects/rccl/test/transport/NetIbMPI/FaultInjectTests.cpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,10 @@ TEST_F(NetIbMPITest, FaultInjCastQpErrorClearRecovers) {
607607
MPI_Barrier(MPI_COMM_WORLD);
608608

609609
// Trigger the fault (rank 0 posts recv, rank 1 posts send that will fail).
610+
// rank 1 forwards its fault result to rank 0 so we can assert the fault
611+
// was actually observed before testing recovery in phase 2.
612+
static constexpr int kPhase1MpiTag = 9882;
613+
FaultInjectResult p1 = {};
610614
if (rank == 0) {
611615
void* bufs[1] = {buf1};
612616
size_t sizes[1] = {kMsgSize};
@@ -628,17 +632,32 @@ TEST_F(NetIbMPITest, FaultInjCastQpErrorClearRecovers) {
628632
if (sendRet != ncclSuccess || sendReq != nullptr) break;
629633
usleep(kPollIntervalUs);
630634
}
635+
int fatalCount = 0;
631636
if (sendRet == ncclSuccess && sendReq != nullptr) {
632637
for (int poll = 0; poll < 200; poll++) {
633638
int done = 0, sz = 0;
634-
int fc = 0;
635639
TestRequest(sendReq, &done, &sz);
636-
ncclIbCastFaultGetFatalCount(sendComm1, &fc);
637-
if (done || fc > 0) break;
640+
ncclIbCastFaultGetFatalCount(sendComm1, &fatalCount);
641+
if (done || fatalCount > 0) break;
638642
usleep(kPollIntervalUs);
639643
}
644+
} else {
645+
ncclIbCastFaultGetFatalCount(sendComm1, &fatalCount);
640646
}
647+
p1.sendRet = static_cast<int>(sendRet);
648+
p1.fatalCount = fatalCount;
641649
ncclIbCastFaultClear(sendComm1);
650+
MPI_Send(&p1, sizeof(p1), MPI_BYTE, 0, kPhase1MpiTag, MPI_COMM_WORLD);
651+
}
652+
653+
if (rank == 0) {
654+
MPI_Recv(&p1, sizeof(p1), MPI_BYTE, 1, kPhase1MpiTag, MPI_COMM_WORLD,
655+
MPI_STATUS_IGNORE);
656+
bool isendFailed = (p1.sendRet != static_cast<int>(ncclSuccess));
657+
EXPECT_TRUE(isendFailed || p1.fatalCount > 0)
658+
<< "Phase 1: fault injection did not trigger — isend returned "
659+
<< p1.sendRet << ", fatalCount=" << p1.fatalCount
660+
<< "; recovery test is meaningless without a confirmed fault";
642661
}
643662

644663
MPI_Barrier(MPI_COMM_WORLD);

0 commit comments

Comments
 (0)