|
854 | 854 | ] |
855 | 855 | }, |
856 | 856 |
|
| 857 | + "fault_inject_cast": { |
| 858 | + "extends": "cast_base", |
| 859 | + "timeout": 300, |
| 860 | + "env_variables": { |
| 861 | + "NCCL_IB_SPLIT_DATA_ON_QPS": "1", |
| 862 | + "NCCL_IB_QPS_PER_CONNECTION": "4", |
| 863 | + "RCCL_IB_QP_SCHED_WRR_ENABLE": "1", |
| 864 | + "RCCL_IB_QP_SCHED_UPDATE_INTERVAL": "100", |
| 865 | + "RCCL_IB_QP_SCHED_RESET_INTERVAL": "1000", |
| 866 | + "RCCL_IB_QP_SCHED_SPLIT_DATA_MIN": "2000", |
| 867 | + "NCCL_IB_RETURN_ASYNC_EVENTS": "1" |
| 868 | + }, |
| 869 | + "tests": [ |
| 870 | + { |
| 871 | + "name": "FaultInj_Cast_QpErrorIsFatal", |
| 872 | + "description": "CAST path: error injected on QP 0 increments fatalErrorCount", |
| 873 | + "test_filter": "NetIbMPITest.FaultInjCastQpErrorIsFatal" |
| 874 | + }, |
| 875 | + { |
| 876 | + "name": "FaultInj_Cast_SlowQpRebalances", |
| 877 | + "description": "CAST WRR: 10 ms delay on QP 0 causes RTT timer to reduce QP 0 active tokens below equal share after 500 sends", |
| 878 | + "test_filter": "NetIbMPITest.FaultInjCastSlowQpRebalances" |
| 879 | + }, |
| 880 | + { |
| 881 | + "name": "FaultInj_Cast_DelayDataIntegrity", |
| 882 | + "description": "CAST path: 2 ms per-QP delay preserves full data integrity across 50 sends", |
| 883 | + "test_filter": "NetIbMPITest.FaultInjCastDelayDataIntegrity" |
| 884 | + }, |
| 885 | + { |
| 886 | + "name": "FaultInj_Cast_SingleQpErrorIsFatal", |
| 887 | + "description": "CAST WRR: error on one QP (tokens steered to QP 0 via SetTokens) triggers fatalErrorCount > 0 or isend failure", |
| 888 | + "test_filter": "NetIbMPITest.FaultInjCastSingleQpErrorIsFatal" |
| 889 | + }, |
| 890 | + { |
| 891 | + "name": "FaultInj_Cast_QpErrorClearRecovers", |
| 892 | + "description": "CAST path: after FaultClear on a faulted connection, a fresh connection completes 20 sends with no corruption and fatalErrorCount == 0", |
| 893 | + "test_filter": "NetIbMPITest.FaultInjCastQpErrorClearRecovers" |
| 894 | + } |
| 895 | + ] |
| 896 | + }, |
| 897 | + |
857 | 898 | "graph_capture_multi_node": { |
858 | 899 | "extends": "default", |
859 | 900 | "is_gtest": true, |
|
1022 | 1063 | "description": "500 sends on 2 concurrent connections: RTT timer fires mid-run, consistency invariants verified", |
1023 | 1064 | "config": "cast_stress", |
1024 | 1065 | "enabled": true |
| 1066 | + }, |
| 1067 | + { |
| 1068 | + "name": "NET IB - Fault Injection (CAST path, QPS=4, splitData=1)", |
| 1069 | + "description": "Per-QP fault injection on CAST multi-QP path: error propagation, RTT rebalancing, data integrity under delay, recovery after fault clear", |
| 1070 | + "config": "fault_inject_cast", |
| 1071 | + "enabled": true |
1025 | 1072 | } |
1026 | 1073 | ] |
1027 | 1074 | } |
0 commit comments