Skip to content

Commit a5f49ab

Browse files
authored
Merge pull request #644 from abouteiller/bugfix/no-gpu-found
Consolidated error handling when GPU-only tests execute on CPU systems
2 parents 1fdfded + 5ff246a commit a5f49ab

10 files changed

Lines changed: 42 additions & 41 deletions

tests/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ function(parsec_addtest_cmd target)
7070
# restrict memory use for oversubscribed runners
7171
set_tests_properties(${target} PROPERTIES ENVIRONMENT
7272
"PARSEC_MCA_device_cuda_enabled=0;PARSEC_MCA_device_hip_enabled=0;PARSEC_MCA_device_level_zero_enabled=0;PARSEC_MCA_device_cuda_memory_use=10;PARSEC_MCA_device_hip_memory_use=10;PARSEC_MCA_device_level_zero_memory_use=10")
73+
# skip tests that fail because the device is not available
74+
set_tests_properties(${target} PROPERTIES SKIP_RETURN_CODE 10) # 10 is -PARSEC_ERR_DEVICE, positive 7bit return codes are more portable
7375
endfunction(parsec_addtest_cmd)
7476

7577
check_function_exists(erand48 PARSEC_HAVE_ERAND48)

tests/dsl/dtd/dtd_test_task_insertion.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,12 @@ int main(int argc, char ** argv)
9494
#endif
9595

9696
int m, n;
97-
int no_of_tasks = 500000;
97+
int no_of_tasks = 50000;
9898
int amount_of_work[3] = {100, 1000, 10000};
9999
parsec_taskpool_t *dtd_tp;
100100

101101
parsec = parsec_init( cores, &argc, &argv );
102+
cores = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_CORES);
102103

103104
dtd_tp = parsec_dtd_taskpool_new();
104105

tests/runtime/cuda/nvlink_main.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright (c) 2019-2020 The University of Tennessee and The University
2+
* Copyright (c) 2019-2024 The University of Tennessee and The University
33
* of Tennessee Research Foundation. All rights
44
* reserved.
55
*/
@@ -34,6 +34,14 @@ int main(int argc, char *argv[])
3434

3535
parsec = parsec_init(-1, &argc, &argv);
3636

37+
/* can the test run? */
38+
int nb_gpus = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
39+
assert(nb_gpus >= 0);
40+
if(nb_gpus == 0) {
41+
parsec_warning("This test can only run if at least one GPU device is present");
42+
exit(-PARSEC_ERR_DEVICE);
43+
}
44+
3745
tp = testing_nvlink_New(parsec, 10, 512);
3846
if( NULL != tp ) {
3947
parsec_context_add_taskpool(parsec, tp);

tests/runtime/cuda/nvlink_wrapper.c

Lines changed: 6 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
21
/**
3-
* Copyright (c) 2019-2021 The University of Tennessee and The University
2+
* Copyright (c) 2019-2024 The University of Tennessee and The University
43
* of Tennessee Research Foundation. All rights
54
* reserved.
65
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
@@ -103,20 +102,8 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
103102
parsec_matrix_block_cyclic_t *userM;
104103

105104
/** Find all CUDA devices */
106-
nb = 0;
107-
for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
108-
parsec_device_module_t *device = parsec_mca_device_get(dev);
109-
if( PARSEC_DEV_CUDA == device->type ) {
110-
nb++;
111-
}
112-
}
113-
if(nb == 0) {
114-
char hostname[256];
115-
gethostname(hostname, 256);
116-
fprintf(stderr, "This test requires at least one CUDA device per node -- no CUDA device found on rank %d on %s\n",
117-
ctx->my_rank, hostname);
118-
return NULL;
119-
}
105+
nb = parsec_context_query(ctx, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
106+
assert(nb >= 0);
120107
dev_index = (int*)malloc(nb * sizeof(int));
121108
nb = 0;
122109
for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
@@ -156,7 +143,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
156143

157144
/* GEMM1 tasks will create one data copy per GPU, and work on those.
158145
* see nvlink.jdf:MAKE_C tasks */
159-
146+
160147
/* userM is a user-managed matrix: the user creates the data copies
161148
* only on the GPU they want the GEMM2 to run. To simplify the code,
162149
* we use parsec_matrix_block_cyclic that requires to also have a CPU data
@@ -208,14 +195,14 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
208195
g++;
209196
}
210197
}
211-
198+
212199
testing_handle = parsec_nvlink_new(dcA, userM, ctx->nb_nodes, CuHI, nb, dev_index);
213200

214201
parsec_add2arena( &testing_handle->arenas_datatypes[PARSEC_nvlink_DEFAULT_ADT_IDX],
215202
parsec_datatype_double_complex_t,
216203
PARSEC_MATRIX_FULL, 1, mb, mb, mb,
217204
PARSEC_ARENA_ALIGNMENT_SSE, -1 );
218-
205+
219206
return &testing_handle->super;
220207
}
221208

tests/runtime/cuda/stage_custom.jdf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
extern "C" %{
22
/*
3-
* Copyright (c) 2019-2023 The University of Tennessee and The University
3+
* Copyright (c) 2019-2024 The University of Tennessee and The University
44
* of Tennessee Research Foundation. All rights
55
* reserved.
66
* Copyright (c) 2024 NVIDIA Corporation. All rights reserved.
@@ -162,7 +162,7 @@ BODY [type=CUDA
162162
lbeta, (double*)A, ldam );
163163
status = cublasGetError();
164164
PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status,
165-
{return -1;} );
165+
{return PARSEC_HOOK_RETURN_ERROR;} );
166166
}
167167
END
168168

@@ -203,7 +203,7 @@ BODY [type=CUDA
203203
lbeta, (double*)B, ldbm );
204204
status = cublasGetError();
205205
PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status,
206-
{return -1;} );
206+
{return PARSEC_HOOK_RETURN_ERROR;} );
207207

208208
}
209209
END

tests/runtime/cuda/stage_main.c

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ int main(int argc, char *argv[])
1414
{
1515
parsec_context_t *parsec = NULL;
1616
parsec_taskpool_t *tp;
17-
int i;
1817
int size = 1;
1918
int rank = 0;
2019
int M;
@@ -41,7 +40,15 @@ int main(int argc, char *argv[])
4140
exit(-1);
4241
}
4342

43+
/* can the test run? */
4444
assert(size == 1);
45+
int nb_gpus = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
46+
assert(nb_gpus >= 0);
47+
if(nb_gpus == 0) {
48+
parsec_warning("This test can only run if at least one GPU device is present");
49+
printf("TEST SKIPPED\n");
50+
exit(-PARSEC_ERR_DEVICE);
51+
}
4552

4653
/* Test: comparing results when:
4754
- tile matrix transfered to GPU with default stage_in/stage_out
@@ -87,9 +94,9 @@ int main(int argc, char *argv[])
8794
parsec_taskpool_free(tp);
8895
}
8996

90-
if(ret!= 0){
91-
printf("TEST FAILED\n");
92-
}else{
97+
if( ret != 0) {
98+
printf("TEST FAILED (%d errors)\n", ret);
99+
} else {
93100
printf("TEST PASSED\n");
94101
}
95102

@@ -98,5 +105,5 @@ int main(int argc, char *argv[])
98105
MPI_Finalize();
99106
#endif /* DISTRIBUTED */
100107

101-
return ret;
108+
return (0 == ret)? EXIT_SUCCESS: EXIT_FAILURE;
102109
}

tests/runtime/cuda/stress.jdf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ r = 0 .. NP-1
125125
// Parameters
126126
READ A <- (g == 0) ? A READ_A(m, r) : A GEMM(m, g-1, r)
127127
-> ((g + 1) < NGPUs) ? A GEMM(m, g+1, r)
128-
READ B <- A READ_A( (m+g) % descA->super.mt, r)
128+
READ B <- A READ_A(m, r)
129129
RW C <- (m == 0) ? C MAKE_C(g, r) : C GEMM(m-1, g, r)
130130
-> ((m + 1) < (descA->super.mt)) ? C GEMM(m+1, g, r)
131131
: C DISCARD_C(g, r)

tests/runtime/cuda/stress_main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ int main(int argc, char *argv[])
2828

2929
parsec = parsec_init(-1, &argc, &argv);
3030

31-
tp = testing_stress_New(parsec, 4000, 1024);
31+
tp = testing_stress_New(parsec, 80, 1024);
3232
if( NULL != tp ) {
3333
parsec_context_add_taskpool(parsec, tp);
3434
parsec_context_start(parsec);

tests/runtime/cuda/stress_wrapper.c

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ static void __parsec_stress_destructor( parsec_taskpool_t *tp )
1515
dcA = stress_taskpool->_g_descA;
1616
parsec_tiled_matrix_destroy( (parsec_tiled_matrix_t*)stress_taskpool->_g_descA );
1717
free(dcA);
18+
free(stress_taskpool->_g_cuda_device_index);
1819
}
1920

2021
PARSEC_OBJ_CLASS_INSTANCE(parsec_stress_taskpool_t, parsec_taskpool_t,
@@ -27,19 +28,14 @@ parsec_taskpool_t* testing_stress_New( parsec_context_t *ctx, int depth, int mb
2728
parsec_matrix_block_cyclic_t *dcA;
2829

2930
/** Find all CUDA devices */
30-
nb = 0;
31-
for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
32-
parsec_device_module_t *device = parsec_mca_device_get(dev);
33-
if( PARSEC_DEV_CUDA == device->type ) {
34-
nb++;
35-
}
36-
}
31+
nb = parsec_context_query(ctx, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
32+
assert(nb >= 0);
3733
if(nb == 0) {
3834
/* We just simulate a run on CPUs, with an arbitrary number of pseudo-GPUs */
3935
nb = 8;
4036
dev_index = (int*)malloc(nb * sizeof(int));
4137
memset(dev_index, -1, nb*sizeof(int));
42-
fprintf(stderr, "Simulating %d GPUs for sanity checking in stress test\n", nb);
38+
parsec_warning("Simulating %d GPUs for sanity checking in stress test\n", nb);
4339
} else {
4440
dev_index = (int*)malloc(nb * sizeof(int));
4541
nb = 0;

tests/runtime/cuda/testing_get_best_device.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ int main(int argc, char *argv[])
159159

160160
/* Check result */
161161
if( 0 == rank && info != 0 ) {
162-
fprintf(stderr, "Result is Wrong !!!\n");
162+
fprintf(stderr, "Result is Wrong (info %d) !!!\n", info);
163163
}
164164

165165
parsec_data_free(dcA.mat);
@@ -172,5 +172,5 @@ int main(int argc, char *argv[])
172172
MPI_Finalize();
173173
#endif
174174

175-
return info;
175+
return (0 == info)? EXIT_SUCCESS: EXIT_FAILURE;
176176
}

0 commit comments

Comments
 (0)