Skip to content

Commit 5cede29

Browse files
committed
net-ib: add fault injection test configs to runner
1 parent 8cd559d commit 5cede29

2 files changed

Lines changed: 94 additions & 0 deletions

File tree

projects/rccl/tools/scripts/test_runner/configs/mi300x_mellanox_ib.json

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,47 @@
854854
]
855855
},
856856

857+
"fault_inject_cast": {
858+
"extends": "cast_base",
859+
"timeout": 300,
860+
"env_variables": {
861+
"NCCL_IB_SPLIT_DATA_ON_QPS": "1",
862+
"NCCL_IB_QPS_PER_CONNECTION": "4",
863+
"RCCL_IB_QP_SCHED_WRR_ENABLE": "1",
864+
"RCCL_IB_QP_SCHED_UPDATE_INTERVAL": "100",
865+
"RCCL_IB_QP_SCHED_RESET_INTERVAL": "1000",
866+
"RCCL_IB_QP_SCHED_SPLIT_DATA_MIN": "2000",
867+
"NCCL_IB_RETURN_ASYNC_EVENTS": "1"
868+
},
869+
"tests": [
870+
{
871+
"name": "FaultInj_Cast_QpErrorIsFatal",
872+
"description": "CAST path: error injected on QP 0 increments fatalErrorCount",
873+
"test_filter": "NetIbMPITest.FaultInjCastQpErrorIsFatal"
874+
},
875+
{
876+
"name": "FaultInj_Cast_SlowQpRebalances",
877+
"description": "CAST WRR: 10 ms delay on QP 0 causes RTT timer to reduce QP 0 active tokens below equal share after 500 sends",
878+
"test_filter": "NetIbMPITest.FaultInjCastSlowQpRebalances"
879+
},
880+
{
881+
"name": "FaultInj_Cast_DelayDataIntegrity",
882+
"description": "CAST path: 2 ms per-QP delay preserves full data integrity across 50 sends",
883+
"test_filter": "NetIbMPITest.FaultInjCastDelayDataIntegrity"
884+
},
885+
{
886+
"name": "FaultInj_Cast_SingleQpErrorIsFatal",
887+
"description": "CAST WRR: error on one QP (tokens steered to QP 0 via SetTokens) triggers fatalErrorCount > 0 or isend failure",
888+
"test_filter": "NetIbMPITest.FaultInjCastSingleQpErrorIsFatal"
889+
},
890+
{
891+
"name": "FaultInj_Cast_QpErrorClearRecovers",
892+
"description": "CAST path: after FaultClear on a faulted connection, a fresh connection completes 20 sends with no corruption and fatalErrorCount == 0",
893+
"test_filter": "NetIbMPITest.FaultInjCastQpErrorClearRecovers"
894+
}
895+
]
896+
},
897+
857898
"graph_capture_multi_node": {
858899
"extends": "default",
859900
"is_gtest": true,
@@ -1022,6 +1063,12 @@
10221063
"description": "500 sends on 2 concurrent connections: RTT timer fires mid-run, consistency invariants verified",
10231064
"config": "cast_stress",
10241065
"enabled": true
1066+
},
1067+
{
1068+
"name": "NET IB - Fault Injection (CAST path, QPS=4, splitData=1)",
1069+
"description": "Per-QP fault injection on CAST multi-QP path: error propagation, RTT rebalancing, data integrity under delay",
1070+
"config": "fault_inject_cast",
1071+
"enabled": true
10251072
}
10261073
]
10271074
}

projects/rccl/tools/scripts/test_runner/configs/net_ib_transport.json

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,47 @@
498498
"test_filter": "NetIbMPITest.CastStressMultiRoundTwoConns"
499499
}
500500
]
501+
},
502+
503+
"fault_inject_cast": {
504+
"extends": "cast_base",
505+
"timeout": 300,
506+
"env_variables": {
507+
"NCCL_IB_SPLIT_DATA_ON_QPS": "1",
508+
"NCCL_IB_QPS_PER_CONNECTION": "4",
509+
"RCCL_IB_QP_SCHED_WRR_ENABLE": "1",
510+
"RCCL_IB_QP_SCHED_UPDATE_INTERVAL": "100",
511+
"RCCL_IB_QP_SCHED_RESET_INTERVAL": "1000",
512+
"RCCL_IB_QP_SCHED_SPLIT_DATA_MIN": "2000",
513+
"NCCL_IB_RETURN_ASYNC_EVENTS": "1"
514+
},
515+
"tests": [
516+
{
517+
"name": "FaultInj_Cast_QpErrorIsFatal",
518+
"description": "CAST path: error injected on QP 0 increments fatalErrorCount",
519+
"test_filter": "NetIbMPITest.FaultInjCastQpErrorIsFatal"
520+
},
521+
{
522+
"name": "FaultInj_Cast_SlowQpRebalances",
523+
"description": "CAST WRR: 10 ms delay on QP 0 causes RTT timer to reduce QP 0 active tokens below equal share after 500 sends",
524+
"test_filter": "NetIbMPITest.FaultInjCastSlowQpRebalances"
525+
},
526+
{
527+
"name": "FaultInj_Cast_DelayDataIntegrity",
528+
"description": "CAST path: 2 ms per-QP delay preserves full data integrity across 50 sends",
529+
"test_filter": "NetIbMPITest.FaultInjCastDelayDataIntegrity"
530+
},
531+
{
532+
"name": "FaultInj_Cast_SingleQpErrorIsFatal",
533+
"description": "CAST WRR: error on one QP (tokens steered to QP 0 via SetTokens) triggers fatalErrorCount > 0 or isend failure",
534+
"test_filter": "NetIbMPITest.FaultInjCastSingleQpErrorIsFatal"
535+
},
536+
{
537+
"name": "FaultInj_Cast_QpErrorClearRecovers",
538+
"description": "CAST path: after FaultClear on a faulted connection, a fresh connection completes 20 sends with no corruption and fatalErrorCount == 0",
539+
"test_filter": "NetIbMPITest.FaultInjCastQpErrorClearRecovers"
540+
}
541+
]
501542
}
502543
},
503544
"test_suites": [
@@ -596,6 +637,12 @@
596637
"description": "500 sends on 2 concurrent connections: RTT timer fires mid-run, consistency invariants verified",
597638
"config": "cast_stress",
598639
"enabled": true
640+
},
641+
{
642+
"name": "NET IB - Fault Injection (CAST path)",
643+
"description": "Per-QP fault injection: error detection, WRR rebalancing under artificial delay, data integrity on CAST multi-QP path",
644+
"config": "fault_inject_cast",
645+
"enabled": true
599646
}
600647
]
601648
}

0 commit comments

Comments
 (0)