diff --git a/src/omp/OMPStream.cpp b/src/omp/OMPStream.cpp index 09b749fd..12582823 100644 --- a/src/omp/OMPStream.cpp +++ b/src/omp/OMPStream.cpp @@ -8,6 +8,10 @@ #include // For aligned_alloc #include "OMPStream.h" +#if defined(PAGEFAULT) +#pragma omp requires unified_shared_memory +#endif + #ifndef ALIGNMENT #define ALIGNMENT (2*1024*1024) // 2MB #endif @@ -24,12 +28,14 @@ OMPStream::OMPStream(const intptr_t ARRAY_SIZE, int device) #ifdef OMP_TARGET_GPU omp_set_default_device(device); - T *a = this->a; - T *b = this->b; - T *c = this->c; - // Set up data region on device - #pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size]) - {} + #if !defined(PAGEFAULT) + T *a = this->a; + T *b = this->b; + T *c = this->c; + // Set up data region on device + #pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size]) + {} + #endif #endif } @@ -37,7 +43,7 @@ OMPStream::OMPStream(const intptr_t ARRAY_SIZE, int device) template OMPStream::~OMPStream() { -#ifdef OMP_TARGET_GPU +#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT) // End data region on device intptr_t array_size = this->array_size; T *a = this->a; @@ -55,7 +61,7 @@ template void OMPStream::init_arrays(T initA, T initB, T initC) { intptr_t array_size = this->array_size; -#ifdef OMP_TARGET_GPU +#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT) T *a = this->a; T *b = this->b; T *c = this->c; @@ -69,7 +75,7 @@ void OMPStream::init_arrays(T initA, T initB, T initC) b[i] = initB; c[i] = initC; } - #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT) // If using the Cray compiler, the kernels do not block, so this update forces // a small copy to ensure blocking so that timing is correct #pragma omp target update from(a[0:0]) @@ -80,7 +86,7 @@ template void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { -#ifdef OMP_TARGET_GPU +#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT) T *a = this->a; T *b = this->b; T *c = this->c; @@ -101,7 +107,7 @@ void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve template void OMPStream::copy() { -#ifdef OMP_TARGET_GPU +#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT) intptr_t array_size = this->array_size; T *a = this->a; T *c = this->c; @@ -113,7 +119,7 @@ void OMPStream::copy() { c[i] = a[i]; } - #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT) // If using the Cray compiler, the kernels do not block, so this update forces // a small copy to ensure blocking so that timing is correct #pragma omp target update from(a[0:0]) @@ -126,9 +132,11 @@ void OMPStream::mul() const T scalar = startScalar; #ifdef OMP_TARGET_GPU - intptr_t array_size = this->array_size; - T *b = this->b; - T *c = this->c; + #if !defined(PAGEFAULT) + intptr_t array_size = this->array_size; + T *b = this->b; + T *c = this->c; + #endif #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for @@ -137,7 +145,7 @@ void OMPStream::mul() { b[i] = scalar * c[i]; } - #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT) // If using the Cray compiler, the kernels do not block, so this update forces // a small copy to ensure blocking so that timing is correct #pragma omp target update from(c[0:0]) @@ -148,10 +156,12 @@ template void OMPStream::add() { #ifdef OMP_TARGET_GPU - intptr_t array_size = this->array_size; - T *a = this->a; - T *b = this->b; - T *c = this->c; + #if !defined(PAGEFAULT) + intptr_t array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #endif #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for @@ -160,7 +170,7 @@ void OMPStream::add() { c[i] = a[i] + b[i]; } - #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT) // If using the Cray compiler, the kernels do not block, so this update forces // a small copy to ensure blocking so that timing is correct #pragma omp target update from(a[0:0]) @@ -173,10 +183,12 @@ void OMPStream::triad() const T scalar = startScalar; #ifdef OMP_TARGET_GPU - intptr_t array_size = this->array_size; - T *a = this->a; - T *b = this->b; - T *c = this->c; + #if !defined(PAGEFAULT) + intptr_t array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #endif #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for @@ -185,7 +197,7 @@ void OMPStream::triad() { a[i] = b[i] + scalar * c[i]; } - #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT) // If using the Cray compiler, the kernels do not block, so this update forces // a small copy to ensure blocking so that timing is correct #pragma omp target update from(a[0:0]) @@ -198,10 +210,12 @@ void OMPStream::nstream() const T scalar = startScalar; #ifdef OMP_TARGET_GPU - intptr_t array_size = this->array_size; - T *a = this->a; - T *b = this->b; - T *c = this->c; + #if !defined(PAGEFAULT) + intptr_t array_size = this->array_size; + T *a = this->a; + T *b = this->b; + T *c = this->c; + #endif #pragma omp target teams distribute parallel for simd #else #pragma omp parallel for @@ -210,7 +224,7 @@ void OMPStream::nstream() { a[i] += b[i] + scalar * c[i]; } - #if defined(OMP_TARGET_GPU) && defined(_CRAYC) + #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT) // If using the Cray compiler, the kernels do not block, so this update forces // a small copy to ensure blocking so that timing is correct #pragma omp target update from(a[0:0]) @@ -223,9 +237,11 @@ T OMPStream::dot() T sum{}; #ifdef OMP_TARGET_GPU - intptr_t array_size = this->array_size; - T *a = this->a; - T *b = this->b; + #if !defined(PAGEFAULT) + intptr_t array_size = this->array_size; + T *a = this->a; + T *b = this->b; + #endif #pragma omp target teams distribute parallel for simd map(tofrom: sum) reduction(+:sum) #else #pragma omp parallel for reduction(+:sum) diff --git a/src/omp/model.cmake b/src/omp/model.cmake index 56f37cf5..41f2e6cd 100644 --- a/src/omp/model.cmake +++ b/src/omp/model.cmake @@ -111,6 +111,10 @@ register_flag_optional(OFFLOAD_APPEND_LINK_FLAG This is required for most offload implementations so that offload libraries can linked correctly." ON) +register_flag_optional(MEM "Device memory mode: + DEFAULT - allocate host and device memory pointers. + PAGEFAULT - shared memory, only host pointers allocated." + "DEFAULT") macro(setup) find_package(OpenMP REQUIRED) @@ -190,5 +194,8 @@ macro(setup) if (OFFLOAD_APPEND_LINK_FLAG) register_append_link_flags(${OMP_FLAGS}) endif () + + register_definitions(${MEM}) + endmacro() diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020-usm/SYCLStream2020.cpp index e4c6ec27..b28a0303 100644 --- a/src/sycl2020-usm/SYCLStream2020.cpp +++ b/src/sycl2020-usm/SYCLStream2020.cpp @@ -9,6 +9,8 @@ #include +#define ALIGNMENT (1024 * 1024 * 2) + // Cache list of devices bool cached = false; std::vector devices; @@ -59,11 +61,17 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) throw std::runtime_error("SYCL errors detected"); } }}); - +#if defined(PAGEFAULT) + a = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T)); + b = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T)); + c = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T)); + sum = (T*)aligned_alloc(ALIGNMENT, ALIGNMENT); +#else a = sycl::malloc_shared(array_size, *queue); b = sycl::malloc_shared(array_size, *queue); c = sycl::malloc_shared(array_size, *queue); sum = sycl::malloc_shared(1, *queue); +#endif // No longer need list of devices devices.clear(); @@ -74,10 +82,17 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) template SYCLStream::~SYCLStream() { +#if defined(PAGEFAULT) + free(a); + free(b); + free(c); + free(sum); +#else sycl::free(a, *queue); sycl::free(b, *queue); sycl::free(c, *queue); sycl::free(sum, *queue); +#endif } template @@ -175,6 +190,14 @@ T SYCLStream::dot() template void SYCLStream::init_arrays(T initA, T initB, T initC) { +#if defined(PAGEFAULT) + for (int i = 0; i < array_size; i++) + { + a[i] = initA; + b[i] = initB; + c[i] = initC; + } +#else queue->submit([&](sycl::handler &cgh) { cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx) @@ -186,6 +209,7 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) }); queue->wait(); +#endif } template diff --git a/src/sycl2020-usm/model.cmake b/src/sycl2020-usm/model.cmake index 950daefd..0da5b05a 100644 --- a/src/sycl2020-usm/model.cmake +++ b/src/sycl2020-usm/model.cmake @@ -18,6 +18,10 @@ register_flag_optional(SYCL_COMPILER_DIR HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") +register_flag_optional(MEM "Device memory mode: + DEFAULT - allocate host and device memory pointers. + PAGEFAULT - shared memory, only host pointers allocated." + "DEFAULT") macro(setup) set(CMAKE_CXX_STANDARD 17) @@ -59,6 +63,8 @@ macro(setup) message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") endif () + register_definitions(${MEM}) + endmacro()