Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 50 additions & 34 deletions src/omp/OMPStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
#include <cstdlib> // For aligned_alloc
#include "OMPStream.h"

#if defined(PAGEFAULT)
#pragma omp requires unified_shared_memory
#endif

#ifndef ALIGNMENT
#define ALIGNMENT (2*1024*1024) // 2MB
#endif
Expand All @@ -24,20 +28,22 @@ OMPStream<T>::OMPStream(const intptr_t ARRAY_SIZE, int device)

#ifdef OMP_TARGET_GPU
omp_set_default_device(device);
T *a = this->a;
T *b = this->b;
T *c = this->c;
// Set up data region on device
#pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size])
{}
#if !defined(PAGEFAULT)
T *a = this->a;
T *b = this->b;
T *c = this->c;
// Set up data region on device
#pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size])
{}
#endif
#endif

}

template <class T>
OMPStream<T>::~OMPStream()
{
#ifdef OMP_TARGET_GPU
#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
// End data region on device
intptr_t array_size = this->array_size;
T *a = this->a;
Expand All @@ -55,7 +61,7 @@ template <class T>
void OMPStream<T>::init_arrays(T initA, T initB, T initC)
{
intptr_t array_size = this->array_size;
#ifdef OMP_TARGET_GPU
#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
T *a = this->a;
T *b = this->b;
T *c = this->c;
Expand All @@ -69,7 +75,7 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
b[i] = initB;
c[i] = initC;
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(a[0:0])
Expand All @@ -80,7 +86,7 @@ template <class T>
void OMPStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
{

#ifdef OMP_TARGET_GPU
#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
T *a = this->a;
T *b = this->b;
T *c = this->c;
Expand All @@ -101,7 +107,7 @@ void OMPStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
template <class T>
void OMPStream<T>::copy()
{
#ifdef OMP_TARGET_GPU
#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
intptr_t array_size = this->array_size;
T *a = this->a;
T *c = this->c;
Expand All @@ -113,7 +119,7 @@ void OMPStream<T>::copy()
{
c[i] = a[i];
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(a[0:0])
Expand All @@ -126,9 +132,11 @@ void OMPStream<T>::mul()
const T scalar = startScalar;

#ifdef OMP_TARGET_GPU
intptr_t array_size = this->array_size;
T *b = this->b;
T *c = this->c;
#if !defined(PAGEFAULT)
intptr_t array_size = this->array_size;
T *b = this->b;
T *c = this->c;
#endif
#pragma omp target teams distribute parallel for simd
#else
#pragma omp parallel for
Expand All @@ -137,7 +145,7 @@ void OMPStream<T>::mul()
{
b[i] = scalar * c[i];
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(c[0:0])
Expand All @@ -148,10 +156,12 @@ template <class T>
void OMPStream<T>::add()
{
#ifdef OMP_TARGET_GPU
intptr_t array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#if !defined(PAGEFAULT)
intptr_t array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#endif
#pragma omp target teams distribute parallel for simd
#else
#pragma omp parallel for
Expand All @@ -160,7 +170,7 @@ void OMPStream<T>::add()
{
c[i] = a[i] + b[i];
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(a[0:0])
Expand All @@ -173,10 +183,12 @@ void OMPStream<T>::triad()
const T scalar = startScalar;

#ifdef OMP_TARGET_GPU
intptr_t array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#if !defined(PAGEFAULT)
intptr_t array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#endif
#pragma omp target teams distribute parallel for simd
#else
#pragma omp parallel for
Expand All @@ -185,7 +197,7 @@ void OMPStream<T>::triad()
{
a[i] = b[i] + scalar * c[i];
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(a[0:0])
Expand All @@ -198,10 +210,12 @@ void OMPStream<T>::nstream()
const T scalar = startScalar;

#ifdef OMP_TARGET_GPU
intptr_t array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#if !defined(PAGEFAULT)
intptr_t array_size = this->array_size;
T *a = this->a;
T *b = this->b;
T *c = this->c;
#endif
#pragma omp target teams distribute parallel for simd
#else
#pragma omp parallel for
Expand All @@ -210,7 +224,7 @@ void OMPStream<T>::nstream()
{
a[i] += b[i] + scalar * c[i];
}
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
// If using the Cray compiler, the kernels do not block, so this update forces
// a small copy to ensure blocking so that timing is correct
#pragma omp target update from(a[0:0])
Expand All @@ -223,9 +237,11 @@ T OMPStream<T>::dot()
T sum{};

#ifdef OMP_TARGET_GPU
intptr_t array_size = this->array_size;
T *a = this->a;
T *b = this->b;
#if !defined(PAGEFAULT)
intptr_t array_size = this->array_size;
T *a = this->a;
T *b = this->b;
#endif
#pragma omp target teams distribute parallel for simd map(tofrom: sum) reduction(+:sum)
#else
#pragma omp parallel for reduction(+:sum)
Expand Down
7 changes: 7 additions & 0 deletions src/omp/model.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,10 @@ register_flag_optional(OFFLOAD_APPEND_LINK_FLAG
This is required for most offload implementations so that offload libraries can linked correctly."
ON)

register_flag_optional(MEM "Device memory mode:
DEFAULT - allocate host and device memory pointers.
PAGEFAULT - shared memory, only host pointers allocated."
"DEFAULT")

macro(setup)
find_package(OpenMP REQUIRED)
Expand Down Expand Up @@ -190,5 +194,8 @@ macro(setup)
if (OFFLOAD_APPEND_LINK_FLAG)
register_append_link_flags(${OMP_FLAGS})
endif ()

register_definitions(${MEM})

endmacro()

26 changes: 25 additions & 1 deletion src/sycl2020-usm/SYCLStream2020.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

#include <iostream>

#define ALIGNMENT (1024 * 1024 * 2)

// Cache list of devices
bool cached = false;
std::vector<sycl::device> devices;
Expand Down Expand Up @@ -59,11 +61,17 @@ SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
throw std::runtime_error("SYCL errors detected");
}
}});

#if defined(PAGEFAULT)
a = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T));
b = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T));
c = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T));
sum = (T*)aligned_alloc(ALIGNMENT, ALIGNMENT);
#else
a = sycl::malloc_shared<T>(array_size, *queue);
b = sycl::malloc_shared<T>(array_size, *queue);
c = sycl::malloc_shared<T>(array_size, *queue);
sum = sycl::malloc_shared<T>(1, *queue);
#endif

// No longer need list of devices
devices.clear();
Expand All @@ -74,10 +82,17 @@ SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)

template<class T>
SYCLStream<T>::~SYCLStream() {
#if defined(PAGEFAULT)
free(a);
free(b);
free(c);
free(sum);
#else
sycl::free(a, *queue);
sycl::free(b, *queue);
sycl::free(c, *queue);
sycl::free(sum, *queue);
#endif
}

template <class T>
Expand Down Expand Up @@ -175,6 +190,14 @@ T SYCLStream<T>::dot()
template <class T>
void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
{
#if defined(PAGEFAULT)
for (int i = 0; i < array_size; i++)
{
a[i] = initA;
b[i] = initB;
c[i] = initC;
}
#else
queue->submit([&](sycl::handler &cgh)
{
cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx)
Expand All @@ -186,6 +209,7 @@ void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
});

queue->wait();
#endif
}

template <class T>
Expand Down
6 changes: 6 additions & 0 deletions src/sycl2020-usm/model.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ register_flag_optional(SYCL_COMPILER_DIR
HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
"")

register_flag_optional(MEM "Device memory mode:
DEFAULT - allocate host and device memory pointers.
PAGEFAULT - shared memory, only host pointers allocated."
"DEFAULT")

macro(setup)
set(CMAKE_CXX_STANDARD 17)
Expand Down Expand Up @@ -59,6 +63,8 @@ macro(setup)
message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported")
endif ()

register_definitions(${MEM})

endmacro()


Expand Down
Loading