Skip to content

Commit 2da0953

Browse files
committed
feat: run CUDA tests in parallel via CTest GPU resource allocation
Register CUDA tests at binary granularity (one CTest test per executable, with any TEST_FILTER passed as --gtest_filter) instead of discovering individual gtest cases. Add a run_ctest helper that takes the GPU list from CTEST_CUDA_GPUS or nvidia-smi (falling back to GPU 0), runs the CPU suite, and then shards the CUDA-labelled tests round-robin across one worker per GPU, pinning each worker to its GPU via CUDA_VISIBLE_DEVICES. This replaces the configurable CTEST_CMD and its hardcoded -j1 CUDA ctest command.
1 parent 7d81dbc commit 2da0953

3 files changed

Lines changed: 91 additions & 10 deletions

File tree

cmake/test_macros.cmake

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ include(GoogleTest)
2828
# Features:
2929
# 1. Create executable target
3030
# 2. Configure compile options, link libraries, and include paths
31-
# 3. Use gtest_discover_tests to auto-discover test cases
32-
# 4. Set test labels
31+
# 3. Use gtest_discover_tests to auto-discover CPU test cases
32+
# 4. Register CUDA-labelled tests at binary granularity (one CTest test per executable)
33+
# 5. Set test labels
3334
#
3435
# Arguments:
3536
# SOURCES: Source file list (required)
@@ -73,7 +74,7 @@ macro(infini_train_add_test)
7374
# 5. Link project library (reuses framework linking strategy)
7475
link_infini_train_exe(${ARG_TEST_NAME})
7576

76-
# 6. Auto-discover gtest cases and register as ctest tests
77+
# 6. Register tests
7778
set(labels "cpu")
7879
if(ARG_LABELS)
7980
set(labels "${ARG_LABELS}")
@@ -84,16 +85,30 @@ macro(infini_train_add_test)
8485
set(test_timeout ${ARG_TEST_TIMEOUT})
8586
endif()
8687

87-
if(ARG_TEST_FILTER)
88+
list(FIND labels cuda _has_cuda_label)
89+
if(NOT _has_cuda_label EQUAL -1)
90+
set(_cuda_test_args)
91+
if(ARG_TEST_FILTER)
92+
list(APPEND _cuda_test_args --gtest_filter=${ARG_TEST_FILTER})
93+
endif()
94+
95+
add_test(
96+
NAME ${ARG_TEST_NAME}
97+
COMMAND $<TARGET_FILE:${ARG_TEST_NAME}> ${_cuda_test_args}
98+
)
99+
set_tests_properties(${ARG_TEST_NAME}
100+
PROPERTIES
101+
LABELS "${labels}"
102+
TIMEOUT ${test_timeout}
103+
)
104+
elseif(ARG_TEST_FILTER)
88105
gtest_discover_tests(${ARG_TEST_NAME}
89-
EXTRA_ARGS --gtest_output=xml:%T.xml
90106
TEST_FILTER "${ARG_TEST_FILTER}"
91107
DISCOVERY_TIMEOUT 10
92108
PROPERTIES LABELS "${labels}" TIMEOUT ${test_timeout}
93109
)
94110
else()
95111
gtest_discover_tests(${ARG_TEST_NAME}
96-
EXTRA_ARGS --gtest_output=xml:%T.xml
97112
PROPERTIES LABELS "${labels}" TIMEOUT ${test_timeout}
98113
)
99114
endif()

scripts/run_models_and_profile.bash

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ LOG_DIR="$(read_var LOG_DIR)"; : "${LOG_DIR:=logs}"
7171
PROFILE_LOG_DIR="$(read_var PROFILE_LOG_DIR)"; : "${PROFILE_LOG_DIR:=./profile_logs}"
7272
COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)"; : "${COMPARE_LOG_DIR:=}"
7373
RUN_CTEST="$(read_var RUN_CTEST)"; : "${RUN_CTEST:=true}"
74-
CTEST_CMD="$(read_var CTEST_CMD)"; : "${CTEST_CMD:=ctest --output-on-failure -LE cuda -j$(nproc) && ctest --output-on-failure -L cuda -j1}"
7574

7675
mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR"
7776

@@ -114,6 +113,74 @@ clean_build_dir() {
114113
rm -rf "${BUILD_DIR:?}/"*
115114
}
116115

116+
# Run the CTest suites from the current working directory.
#
# The CPU suite (every test NOT labelled "cuda") runs first with one job per
# core. Tests labelled "cuda" are then distributed round-robin over one
# background worker per GPU; each worker runs its tests one at a time with
# CUDA_VISIBLE_DEVICES set to its GPU.
#
# Environment:
#   CTEST_CUDA_GPUS  Optional comma-separated GPU indices (e.g. "0,2,3").
#                    When unset or empty, indices are read from nvidia-smi;
#                    if none can be determined, GPU 0 is used.
#
# Returns:
#   0 if the CPU suite and every CUDA test passed, 1 otherwise. The CUDA
#   tests still run after a CPU failure so one invocation reports everything.
run_ctest() {
    local gpu_list=()
    local cuda_tests=()

    if [[ -n "${CTEST_CUDA_GPUS:-}" ]]; then
        IFS=',' read -r -a gpu_list <<< "$CTEST_CUDA_GPUS"
    elif command -v nvidia-smi >/dev/null 2>&1; then
        # "|| true" keeps a failing nvidia-smi from aborting under errexit.
        mapfile -t gpu_list < <(nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null || true)
    fi

    # Guarantee a non-empty array before expanding it: expanding an empty
    # array is an error under `set -u` on older bash releases.
    if [[ ${#gpu_list[@]} -eq 0 ]]; then
        gpu_list=(0)
    fi

    # Strip whitespace from every entry and drop entries that end up empty
    # (e.g. "0, ,1" or a trailing comma).
    local filtered_gpu_list=()
    local gpu
    for gpu in "${gpu_list[@]}"; do
        gpu="${gpu//[[:space:]]/}"
        [[ -z "$gpu" ]] && continue
        filtered_gpu_list+=("$gpu")
    done

    if [[ ${#filtered_gpu_list[@]} -eq 0 ]]; then
        filtered_gpu_list=(0)
    fi

    # Record the CPU suite result instead of discarding it, so a CPU failure
    # is reflected in the return value even when there are no CUDA tests.
    local failed=0
    if ! ctest --output-on-failure -LE cuda -j"$(nproc)"; then
        failed=1
    fi

    # "ctest -N" lists tests without running them; sed keeps only the names.
    mapfile -t cuda_tests < <(ctest -N -L cuda | sed -n 's/^ *Test *#[0-9][0-9]*: //p')
    if [[ ${#cuda_tests[@]} -eq 0 ]]; then
        return "$failed"
    fi

    # NOTE(review): the workers below run ctest concurrently in the same
    # directory; CTest's own log files under Testing/Temporary are presumably
    # shared between them -- confirm this is acceptable.
    local worker_count="${#filtered_gpu_list[@]}"
    local pids=()
    local worker_idx
    for ((worker_idx = 0; worker_idx < worker_count; worker_idx++)); do
        (
            local worker_failed=0
            local test_idx="$worker_idx"
            local test_name
            local assigned_gpu="${filtered_gpu_list[$worker_idx]}"

            # Worker i takes tests i, i + worker_count, i + 2 * worker_count, ...
            while ((test_idx < ${#cuda_tests[@]})); do
                test_name="${cuda_tests[$test_idx]}"
                echo "[CUDA GPU ${assigned_gpu}] ${test_name}"
                # -L cuda limits the -R match to CUDA tests. The name is used
                # as a regex, so regex metacharacters in it are not escaped.
                if ! CUDA_VISIBLE_DEVICES="$assigned_gpu" ctest --output-on-failure -L cuda -R "^${test_name}$" -j1; then
                    worker_failed=1
                fi
                test_idx=$((test_idx + worker_count))
            done

            exit "$worker_failed"
        ) &
        pids+=("$!")
    done

    # Wait for every worker so all results are reported.
    local pid
    for pid in "${pids[@]}"; do
        if ! wait "$pid"; then
            failed=1
        fi
    done

    return "$failed"
}
183+
117184
# Run a command and log output
118185
run_and_log() {
119186
local cmd="$1"
@@ -247,7 +314,7 @@ for ((id=0; id<num_builds; ++id)); do
247314
clean_build_dir
248315
run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no" "build"
249316
if [[ "$RUN_CTEST" == "true" && "$build_profile" != "true" ]]; then
250-
run_and_log "$CTEST_CMD" "ctest_${build_id}" "no" "ctest"
317+
run_and_log "run_ctest" "ctest_${build_id}" "no" "ctest"
251318
fi
252319

253320
# profile flag for runs

scripts/test_config.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@
88
"PROFILE_LOG_DIR": "./profile_logs",
99
"LOG_DIR": "./logs",
1010
"COMPARE_LOG_DIR": "",
11-
"RUN_CTEST": "true",
12-
"CTEST_CMD": "ctest --output-on-failure -LE cuda -j$(nproc) && ctest --output-on-failure -L cuda -j1"
11+
"RUN_CTEST": "true"
1312
},
1413
"builds": [
1514
{

0 commit comments

Comments
 (0)