Skip to content

Commit ea447b4

Browse files
committed
feat: run CUDA tests in parallel, one worker per GPU
Register CUDA tests at binary granularity (one add_test per test executable instead of gtest_discover_tests). Add a run_ctest helper that builds the GPU list (from CTEST_CUDA_GPUS or nvidia-smi, falling back to GPU 0), runs the CPU suite with -j$(nproc), then shards the CUDA tests round-robin across one background worker per GPU, pinning each worker to its GPU via CUDA_VISIBLE_DEVICES. This replaces the configurable CTEST_CMD and its hardcoded -j1 CUDA ctest command.
1 parent 8828a0d commit ea447b4

4 files changed

Lines changed: 92 additions & 11 deletions

File tree

cmake/test_macros.cmake

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,9 @@ include(GoogleTest)
2828
# Features:
2929
# 1. Create executable target
3030
# 2. Configure compile options, link libraries, and include paths
31-
# 3. Use gtest_discover_tests to auto-discover test cases
32-
# 4. Set test labels
31+
# 3. Use gtest_discover_tests to auto-discover CPU test cases
32+
# 4. Register CUDA tests at binary granularity (one CTest test per executable)
33+
# 5. Set test labels
3334
#
3435
# Arguments:
3536
# SOURCES: Source file list (required)
@@ -73,7 +74,7 @@ macro(infini_train_add_test)
7374
# 5. Link project library (reuses framework linking strategy)
7475
link_infini_train_exe(${ARG_TEST_NAME})
7576

76-
# 6. Auto-discover gtest cases and register as ctest tests
77+
# 6. Register tests
7778
set(labels "cpu")
7879
if(ARG_LABELS)
7980
set(labels "${ARG_LABELS}")
@@ -84,16 +85,30 @@ macro(infini_train_add_test)
8485
set(test_timeout ${ARG_TEST_TIMEOUT})
8586
endif()
8687

87-
if(ARG_TEST_FILTER)
88+
list(FIND labels cuda _has_cuda_label)
89+
if(NOT _has_cuda_label EQUAL -1)
90+
set(_cuda_test_args)
91+
if(ARG_TEST_FILTER)
92+
list(APPEND _cuda_test_args --gtest_filter=${ARG_TEST_FILTER})
93+
endif()
94+
95+
add_test(
96+
NAME ${ARG_TEST_NAME}
97+
COMMAND $<TARGET_FILE:${ARG_TEST_NAME}> ${_cuda_test_args}
98+
)
99+
set_tests_properties(${ARG_TEST_NAME}
100+
PROPERTIES
101+
LABELS "${labels}"
102+
TIMEOUT ${test_timeout}
103+
)
104+
elseif(ARG_TEST_FILTER)
88105
gtest_discover_tests(${ARG_TEST_NAME}
89-
EXTRA_ARGS --gtest_output=xml:%T.xml
90106
TEST_FILTER "${ARG_TEST_FILTER}"
91107
DISCOVERY_TIMEOUT 10
92108
PROPERTIES LABELS "${labels}" TIMEOUT ${test_timeout}
93109
)
94110
else()
95111
gtest_discover_tests(${ARG_TEST_NAME}
96-
EXTRA_ARGS --gtest_output=xml:%T.xml
97112
PROPERTIES LABELS "${labels}" TIMEOUT ${test_timeout}
98113
)
99114
endif()

scripts/compare_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def collect_log_files(base_dir: Path):
88
duplicates = {}
99

1010
for path in base_dir.rglob("*.log"):
11-
if path.name.startswith("build") or path.name.endswith("_profile.log"):
11+
if path.name.startswith(("build", "ctest_")) or path.name.endswith("_profile.log"):
1212
continue
1313

1414
key = path.name

scripts/run_models_and_profile.bash

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ LOG_DIR="$(read_var LOG_DIR)"; : "${LOG_DIR:=logs}"
7171
PROFILE_LOG_DIR="$(read_var PROFILE_LOG_DIR)"; : "${PROFILE_LOG_DIR:=./profile_logs}"
7272
COMPARE_LOG_DIR="$(read_var COMPARE_LOG_DIR)"; : "${COMPARE_LOG_DIR:=}"
7373
RUN_CTEST="$(read_var RUN_CTEST)"; : "${RUN_CTEST:=true}"
74-
CTEST_CMD="$(read_var CTEST_CMD)"; : "${CTEST_CMD:=ctest --output-on-failure -LE cuda -j$(nproc) && ctest --output-on-failure -L cuda -j1}"
7574
CKPT_ROOT_DIR="$(read_var CKPT_ROOT_DIR)"; : "${CKPT_ROOT_DIR:=/data1/ckpt}"
7675

7776
mkdir -p "$BUILD_DIR" "$LOG_DIR" "$PROFILE_LOG_DIR"
@@ -124,6 +123,74 @@ clean_checkpoints() {
124123
fi
125124
}
126125

126+
# Run the CTest suites from the current directory: the CPU suite first, then
# the CUDA suite sharded across the available GPUs.
#
# GPU selection, in priority order:
#   1. CTEST_CUDA_GPUS - comma-separated GPU ids (e.g. "0,1,3")
#   2. nvidia-smi      - every GPU index it reports
#   3. fallback        - GPU 0
#
# One background worker is started per GPU. Worker i runs CUDA tests
# i, i+N, i+2N, ... (N = number of GPUs), one at a time, with
# CUDA_VISIBLE_DEVICES set to its GPU.
#
# Returns:
#   - the CPU ctest exit status if the CPU suite fails (the CUDA suite is then
#     skipped, matching the old "cpu && cuda" command chain);
#   - 0 if there are no CUDA tests or every CUDA test passes;
#   - 1 if any CUDA test fails.
#
# NOTE(review): the workers invoke ctest concurrently in the same build tree,
# so they presumably share ctest's Testing/Temporary log files - confirm that
# interleaved log output there is acceptable.
run_ctest() {
    local gpu_list=()
    local cuda_tests=()

    # An explicit override wins; otherwise ask the driver which GPUs exist.
    if [[ -n "${CTEST_CUDA_GPUS:-}" ]]; then
        IFS=',' read -r -a gpu_list <<< "$CTEST_CUDA_GPUS"
    elif command -v nvidia-smi >/dev/null 2>&1; then
        mapfile -t gpu_list < <(nvidia-smi --query-gpu=index --format=csv,noheader 2>/dev/null || true)
    fi

    # Keep the array non-empty before iterating: expanding an empty array
    # under `set -u` is an error on older bash versions.
    if [[ ${#gpu_list[@]} -eq 0 ]]; then
        gpu_list=(0)
    fi

    # Strip whitespace and drop empty entries (e.g. "0, 1," or blank lines).
    local filtered_gpu_list=()
    local gpu
    for gpu in "${gpu_list[@]}"; do
        gpu="${gpu//[[:space:]]/}"
        [[ -z "$gpu" ]] && continue
        filtered_gpu_list+=("$gpu")
    done

    if [[ ${#filtered_gpu_list[@]} -eq 0 ]]; then
        filtered_gpu_list=(0)
    fi

    # CPU suite. Propagate its failure explicitly instead of relying on
    # errexit, which is not guaranteed to be active where this is called.
    ctest --output-on-failure -LE cuda -j"$(nproc)" || return $?

    # Collect the names of all tests labelled "cuda" from ctest's dry-run list.
    mapfile -t cuda_tests < <(ctest -N -L cuda | sed -n 's/^ *Test *#[0-9][0-9]*: //p')
    if [[ ${#cuda_tests[@]} -eq 0 ]]; then
        return 0
    fi

    local worker_count="${#filtered_gpu_list[@]}"
    local pids=()
    local worker_idx
    for ((worker_idx = 0; worker_idx < worker_count; worker_idx++)); do
        (
            local worker_failed=0
            local test_idx
            local test_name
            local test_regex
            local assigned_gpu="${filtered_gpu_list[$worker_idx]}"

            # Round-robin shard: this worker takes every worker_count-th test.
            for ((test_idx = worker_idx; test_idx < ${#cuda_tests[@]}; test_idx += worker_count)); do
                test_name="${cuda_tests[$test_idx]}"
                # -R takes a regex, so escape metacharacters to make the
                # anchored pattern match exactly this test name.
                test_regex="$(printf '%s' "$test_name" | sed 's/[][\.*^$+?(){}|]/\\&/g')"
                echo "[CUDA GPU ${assigned_gpu}] ${test_name}"
                if ! CUDA_VISIBLE_DEVICES="$assigned_gpu" ctest --output-on-failure -R "^${test_regex}\$" -j1; then
                    # Keep going so one failure does not hide later ones.
                    worker_failed=1
                fi
            done

            exit "$worker_failed"
        ) &
        pids+=("$!")
    done

    # Wait for every worker; any non-zero worker status fails the suite.
    local failed=0
    local pid
    for pid in "${pids[@]}"; do
        if ! wait "$pid"; then
            failed=1
        fi
    done

    return "$failed"
}
193+
127194
# Run a command and log output
128195
run_and_log() {
129196
local cmd="$1"
@@ -276,7 +343,7 @@ for ((id=0; id<num_builds; ++id)); do
276343
clean_build_dir
277344
run_and_log "$LAST_CMAKE_CMD" "${build_id}" "no" "build"
278345
if [[ "$RUN_CTEST" == "true" && "$build_profile" != "true" ]]; then
279-
run_and_log "$CTEST_CMD" "ctest_${build_id}" "no" "ctest"
346+
run_and_log "run_ctest" "ctest_${build_id}" "no" "ctest"
280347
fi
281348

282349
# profile flag for runs

scripts/test_config.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@
99
"LOG_DIR": "./logs",
1010
"CKPT_ROOT_DIR": "/data1/ckpt",
1111
"COMPARE_LOG_DIR": "",
12-
"RUN_CTEST": "true",
13-
"CTEST_CMD": "ctest --output-on-failure -LE cuda -j$(nproc) && ctest --output-on-failure -L cuda -j1"
12+
"RUN_CTEST": "true"
1413
},
1514
"builds": [
1615
{

0 commit comments

Comments
 (0)