tests:cuda: have ctest output SKIPPED when no device is available at runtime

abouteiller · abouteiller · commit 5ff246a7ad88 · 2024-05-22T17:34:03.000-04:00
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -70,6 +70,8 @@ function(parsec_addtest_cmd target)
   # restrict memory use for oversubscribed runners
   set_tests_properties(${target} PROPERTIES ENVIRONMENT
     "PARSEC_MCA_device_cuda_enabled=0;PARSEC_MCA_device_hip_enabled=0;PARSEC_MCA_device_level_zero_enabled=0;PARSEC_MCA_device_cuda_memory_use=10;PARSEC_MCA_device_hip_memory_use=10;PARSEC_MCA_device_level_zero_memory_use=10")
+  # skip tests that fail because the device is not available
+  set_tests_properties(${target} PROPERTIES SKIP_RETURN_CODE 10) # 10 is -PARSEC_ERR_DEVICE, positive 7bit return codes are more portable
 endfunction(parsec_addtest_cmd)
 
 check_function_exists(erand48 PARSEC_HAVE_ERAND48)
diff --git a/tests/runtime/cuda/nvlink_main.c b/tests/runtime/cuda/nvlink_main.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2019-2020 The University of Tennessee and The University
+ * Copyright (c) 2019-2024 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  */
@@ -34,6 +34,14 @@ int main(int argc, char *argv[])
 
     parsec = parsec_init(-1, &argc, &argv);
 
+    /* can the test run? */
+    int nb_gpus = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
+    assert(nb_gpus >= 0);
+    if(nb_gpus == 0) {
+        parsec_warning("This test can only run if at least one GPU device is present");
+        exit(-PARSEC_ERR_DEVICE);
+    }
+
     tp = testing_nvlink_New(parsec, 10, 512);
     if( NULL != tp ) {
         parsec_context_add_taskpool(parsec, tp);
diff --git a/tests/runtime/cuda/nvlink_wrapper.c b/tests/runtime/cuda/nvlink_wrapper.c
@@ -1,6 +1,5 @@
-
 /**
- * Copyright (c) 2019-2021 The University of Tennessee and The University
+ * Copyright (c) 2019-2024 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
@@ -104,10 +103,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
 
     /** Find all CUDA devices */
     nb = parsec_context_query(ctx, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
-    if(nb <= 0) {
-        parsec_warning("ABORTED: This test requires at least one CUDA device per node (query returned %d)\n", nb);
-        return NULL;
-    }
+    assert(nb >= 0);
     dev_index = (int*)malloc(nb * sizeof(int));
     nb = 0;
     for(dev = 0; dev < (int)parsec_nb_devices; dev++) {
@@ -147,7 +143,7 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
 
     /* GEMM1 tasks will create one data copy per GPU, and work on those.
      * see nvlink.jdf:MAKE_C tasks */
-    
+
     /* userM is a user-managed matrix: the user creates the data copies
      * only on the GPU they want the GEMM2 to run. To simplify the code,
      * we use parsec_matrix_block_cyclic that requires to also have a CPU data
@@ -199,14 +195,14 @@ parsec_taskpool_t* testing_nvlink_New( parsec_context_t *ctx, int depth, int mb
             g++;
         }
     }
-    
+
     testing_handle = parsec_nvlink_new(dcA, userM, ctx->nb_nodes, CuHI, nb, dev_index);
 
     parsec_add2arena( &testing_handle->arenas_datatypes[PARSEC_nvlink_DEFAULT_ADT_IDX],
                              parsec_datatype_double_complex_t,
                              PARSEC_MATRIX_FULL, 1, mb, mb, mb,
                              PARSEC_ARENA_ALIGNMENT_SSE, -1 );
-    
+
     return &testing_handle->super;
 }
 
diff --git a/tests/runtime/cuda/stage_custom.jdf b/tests/runtime/cuda/stage_custom.jdf
@@ -1,6 +1,6 @@
 extern "C" %{
 /*
- * Copyright (c) 2019-2023 The University of Tennessee and The University
+ * Copyright (c) 2019-2024 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  * Copyright (c) 2024      NVIDIA Corporation.  All rights reserved.
@@ -162,7 +162,7 @@ BODY [type=CUDA
                          lbeta,  (double*)A, ldam );
     status = cublasGetError();
     PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 }
 END
 
@@ -203,7 +203,7 @@ BODY [type=CUDA
                          lbeta,  (double*)B, ldbm );
     status = cublasGetError();
     PARSEC_CUDA_CHECK_ERROR( "cublasDgemm", status,
-                            {return -1;} );
+                            {return PARSEC_HOOK_RETURN_ERROR;} );
 
 }
 END
@@ -256,12 +256,6 @@ parsec_taskpool_t* testing_stage_custom_New( parsec_context_t *ctx, int M, int N
     int KP = 1;
     int KQ = 1;
 
-    int nb_gpus = parsec_context_query(ctx, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
-    if(nb_gpus <= 0) {
-        parsec_warning("ABORTED: This test can only run if at least one GPU device is present (query returned %d)\n", nb_gpus);
-        return NULL;
-    }
-
     parsec_matrix_block_cyclic_t *descA;
     descA = (parsec_matrix_block_cyclic_t*)calloc(1, sizeof(parsec_matrix_block_cyclic_t));
     parsec_matrix_block_cyclic_init(descA, PARSEC_MATRIX_DOUBLE, PARSEC_MATRIX_TILE,
diff --git a/tests/runtime/cuda/stage_main.c b/tests/runtime/cuda/stage_main.c
@@ -21,7 +21,7 @@ int main(int argc, char *argv[])
     int MB;
     int NB;
     int P = 1;
-    int ret = EXIT_SUCCESS;
+    int ret = 0;
 
 #if defined(DISTRIBUTED)
     {
@@ -40,7 +40,15 @@ int main(int argc, char *argv[])
         exit(-1);
     }
 
+    /* can the test run? */
     assert(size == 1);
+    int nb_gpus = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
+    assert(nb_gpus >= 0);
+    if(nb_gpus == 0) {
+        parsec_warning("This test can only run if at least one GPU device is present");
+        printf("TEST SKIPPED\n");
+        exit(-PARSEC_ERR_DEVICE);
+    }
 
     /* Test: comparing results when:
         - tile matrix transferred to GPU with default stage_in/stage_out
@@ -86,9 +94,9 @@ int main(int argc, char *argv[])
         parsec_taskpool_free(tp);
     }
 
-    if(ret!= 0){
-        printf("TEST FAILED\n");
-    }else{
+    if( ret != 0) {
+        printf("TEST FAILED (%d errors)\n", ret);
+    } else {
         printf("TEST PASSED\n");
     }
 
@@ -97,5 +105,5 @@ int main(int argc, char *argv[])
     MPI_Finalize();
 #endif /* DISTRIBUTED */
 
-    return ret;
+    return (0 == ret)? EXIT_SUCCESS: EXIT_FAILURE;
 }
diff --git a/tests/runtime/cuda/stress_wrapper.c b/tests/runtime/cuda/stress_wrapper.c
@@ -29,10 +29,8 @@ parsec_taskpool_t* testing_stress_New( parsec_context_t *ctx, int depth, int mb
 
     /** Find all CUDA devices */
     nb = parsec_context_query(ctx, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA);
-    if(nb < 0) {
-        parsec_fatal("parsec_context_query returned %d", nb);
-    }
-    else if(nb == 0) {
+    assert(nb >= 0);
+    if(nb == 0) {
         /* We just simulate a run on CPUs, with an arbitrary number of pseudo-GPUs */
         nb = 8;
         dev_index = (int*)malloc(nb * sizeof(int));
diff --git a/tests/runtime/cuda/testing_get_best_device.c b/tests/runtime/cuda/testing_get_best_device.c
@@ -159,7 +159,7 @@ int main(int argc, char *argv[])
 
     /* Check result */
     if( 0 == rank && info != 0 ) {
-        fprintf(stderr, "Result is Wrong !!!\n");
+        fprintf(stderr, "Result is Wrong (info %d) !!!\n", info);
     }
 
     parsec_data_free(dcA.mat);
@@ -172,5 +172,5 @@ int main(int argc, char *argv[])
     MPI_Finalize();
 #endif
 
-    return info;
+    return (0 == info)? EXIT_SUCCESS: EXIT_FAILURE;
 }

Original file line number	Diff line number	Diff line change
`@@ -159,7 +159,7 @@ int main(int argc, char *argv[])`
`159`	`159`
`160`	`160`	`/* Check result */`
`161`	`161`	`if( 0 == rank && info != 0 ) {`
`162`		`- fprintf(stderr, "Result is Wrong !!!\n");`
	`162`	`+ fprintf(stderr, "Result is Wrong (info %d) !!!\n", info);`
`163`	`163`	`}`
`164`	`164`
`165`	`165`	`parsec_data_free(dcA.mat);`
`@@ -172,5 +172,5 @@ int main(int argc, char *argv[])`
`172`	`172`	`MPI_Finalize();`
`173`	`173`	`#endif`
`174`	`174`
`175`		`- return info;`
	`175`	`+ return (0 == info)? EXIT_SUCCESS: EXIT_FAILURE;`
`176`	`176`	`}`