Skip to content

Commit 1f3e8e1

Browse files
authored
Merge pull request #217 from ifdu/svm
Add optional support for GPU page faults in SYCL2020 and OMP backends
2 parents 8ab5088 + 5ab8dea commit 1f3e8e1

4 files changed

Lines changed: 88 additions & 35 deletions

File tree

src/omp/OMPStream.cpp

Lines changed: 50 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,10 @@
88
#include <cstdlib> // For aligned_alloc
99
#include "OMPStream.h"
1010

11+
#if defined(PAGEFAULT)
12+
#pragma omp requires unified_shared_memory
13+
#endif
14+
1115
#ifndef ALIGNMENT
1216
#define ALIGNMENT (2*1024*1024) // 2MB
1317
#endif
@@ -24,20 +28,22 @@ OMPStream<T>::OMPStream(const intptr_t ARRAY_SIZE, int device)
2428

2529
#ifdef OMP_TARGET_GPU
2630
omp_set_default_device(device);
27-
T *a = this->a;
28-
T *b = this->b;
29-
T *c = this->c;
30-
// Set up data region on device
31-
#pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size])
32-
{}
31+
#if !defined(PAGEFAULT)
32+
T *a = this->a;
33+
T *b = this->b;
34+
T *c = this->c;
35+
// Set up data region on device
36+
#pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size])
37+
{}
38+
#endif
3339
#endif
3440

3541
}
3642

3743
template <class T>
3844
OMPStream<T>::~OMPStream()
3945
{
40-
#ifdef OMP_TARGET_GPU
46+
#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
4147
// End data region on device
4248
intptr_t array_size = this->array_size;
4349
T *a = this->a;
@@ -55,7 +61,7 @@ template <class T>
5561
void OMPStream<T>::init_arrays(T initA, T initB, T initC)
5662
{
5763
intptr_t array_size = this->array_size;
58-
#ifdef OMP_TARGET_GPU
64+
#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
5965
T *a = this->a;
6066
T *b = this->b;
6167
T *c = this->c;
@@ -69,7 +75,7 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
6975
b[i] = initB;
7076
c[i] = initC;
7177
}
72-
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
78+
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
7379
// If using the Cray compiler, the kernels do not block, so this update forces
7480
// a small copy to ensure blocking so that timing is correct
7581
#pragma omp target update from(a[0:0])
@@ -80,7 +86,7 @@ template <class T>
8086
void OMPStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
8187
{
8288

83-
#ifdef OMP_TARGET_GPU
89+
#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
8490
T *a = this->a;
8591
T *b = this->b;
8692
T *c = this->c;
@@ -101,7 +107,7 @@ void OMPStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
101107
template <class T>
102108
void OMPStream<T>::copy()
103109
{
104-
#ifdef OMP_TARGET_GPU
110+
#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
105111
intptr_t array_size = this->array_size;
106112
T *a = this->a;
107113
T *c = this->c;
@@ -113,7 +119,7 @@ void OMPStream<T>::copy()
113119
{
114120
c[i] = a[i];
115121
}
116-
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
122+
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
117123
// If using the Cray compiler, the kernels do not block, so this update forces
118124
// a small copy to ensure blocking so that timing is correct
119125
#pragma omp target update from(a[0:0])
@@ -126,9 +132,11 @@ void OMPStream<T>::mul()
126132
const T scalar = startScalar;
127133

128134
#ifdef OMP_TARGET_GPU
129-
intptr_t array_size = this->array_size;
130-
T *b = this->b;
131-
T *c = this->c;
135+
#if !defined(PAGEFAULT)
136+
intptr_t array_size = this->array_size;
137+
T *b = this->b;
138+
T *c = this->c;
139+
#endif
132140
#pragma omp target teams distribute parallel for simd
133141
#else
134142
#pragma omp parallel for
@@ -137,7 +145,7 @@ void OMPStream<T>::mul()
137145
{
138146
b[i] = scalar * c[i];
139147
}
140-
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
148+
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
141149
// If using the Cray compiler, the kernels do not block, so this update forces
142150
// a small copy to ensure blocking so that timing is correct
143151
#pragma omp target update from(c[0:0])
@@ -148,10 +156,12 @@ template <class T>
148156
void OMPStream<T>::add()
149157
{
150158
#ifdef OMP_TARGET_GPU
151-
intptr_t array_size = this->array_size;
152-
T *a = this->a;
153-
T *b = this->b;
154-
T *c = this->c;
159+
#if !defined(PAGEFAULT)
160+
intptr_t array_size = this->array_size;
161+
T *a = this->a;
162+
T *b = this->b;
163+
T *c = this->c;
164+
#endif
155165
#pragma omp target teams distribute parallel for simd
156166
#else
157167
#pragma omp parallel for
@@ -160,7 +170,7 @@ void OMPStream<T>::add()
160170
{
161171
c[i] = a[i] + b[i];
162172
}
163-
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
173+
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
164174
// If using the Cray compiler, the kernels do not block, so this update forces
165175
// a small copy to ensure blocking so that timing is correct
166176
#pragma omp target update from(a[0:0])
@@ -173,10 +183,12 @@ void OMPStream<T>::triad()
173183
const T scalar = startScalar;
174184

175185
#ifdef OMP_TARGET_GPU
176-
intptr_t array_size = this->array_size;
177-
T *a = this->a;
178-
T *b = this->b;
179-
T *c = this->c;
186+
#if !defined(PAGEFAULT)
187+
intptr_t array_size = this->array_size;
188+
T *a = this->a;
189+
T *b = this->b;
190+
T *c = this->c;
191+
#endif
180192
#pragma omp target teams distribute parallel for simd
181193
#else
182194
#pragma omp parallel for
@@ -185,7 +197,7 @@ void OMPStream<T>::triad()
185197
{
186198
a[i] = b[i] + scalar * c[i];
187199
}
188-
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
200+
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
189201
// If using the Cray compiler, the kernels do not block, so this update forces
190202
// a small copy to ensure blocking so that timing is correct
191203
#pragma omp target update from(a[0:0])
@@ -198,10 +210,12 @@ void OMPStream<T>::nstream()
198210
const T scalar = startScalar;
199211

200212
#ifdef OMP_TARGET_GPU
201-
intptr_t array_size = this->array_size;
202-
T *a = this->a;
203-
T *b = this->b;
204-
T *c = this->c;
213+
#if !defined(PAGEFAULT)
214+
intptr_t array_size = this->array_size;
215+
T *a = this->a;
216+
T *b = this->b;
217+
T *c = this->c;
218+
#endif
205219
#pragma omp target teams distribute parallel for simd
206220
#else
207221
#pragma omp parallel for
@@ -210,7 +224,7 @@ void OMPStream<T>::nstream()
210224
{
211225
a[i] += b[i] + scalar * c[i];
212226
}
213-
#if defined(OMP_TARGET_GPU) && defined(_CRAYC)
227+
#if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
214228
// If using the Cray compiler, the kernels do not block, so this update forces
215229
// a small copy to ensure blocking so that timing is correct
216230
#pragma omp target update from(a[0:0])
@@ -223,9 +237,11 @@ T OMPStream<T>::dot()
223237
T sum{};
224238

225239
#ifdef OMP_TARGET_GPU
226-
intptr_t array_size = this->array_size;
227-
T *a = this->a;
228-
T *b = this->b;
240+
#if !defined(PAGEFAULT)
241+
intptr_t array_size = this->array_size;
242+
T *a = this->a;
243+
T *b = this->b;
244+
#endif
229245
#pragma omp target teams distribute parallel for simd map(tofrom: sum) reduction(+:sum)
230246
#else
231247
#pragma omp parallel for reduction(+:sum)

src/omp/model.cmake

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@ register_flag_optional(OFFLOAD_APPEND_LINK_FLAG
111111
This is required for most offload implementations so that offload libraries can linked correctly."
112112
ON)
113113

114+
register_flag_optional(MEM "Device memory mode:
115+
DEFAULT - allocate host and device memory pointers.
116+
PAGEFAULT - shared memory, only host pointers allocated."
117+
"DEFAULT")
114118

115119
macro(setup)
116120
find_package(OpenMP REQUIRED)
@@ -190,5 +194,8 @@ macro(setup)
190194
if (OFFLOAD_APPEND_LINK_FLAG)
191195
register_append_link_flags(${OMP_FLAGS})
192196
endif ()
197+
198+
register_definitions(${MEM})
199+
193200
endmacro()
194201

src/sycl2020-usm/SYCLStream2020.cpp

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
#include <iostream>
1111

12+
#define ALIGNMENT (1024 * 1024 * 2)
13+
1214
// Cache list of devices
1315
bool cached = false;
1416
std::vector<sycl::device> devices;
@@ -59,11 +61,17 @@ SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
5961
throw std::runtime_error("SYCL errors detected");
6062
}
6163
}});
62-
64+
#if defined(PAGEFAULT)
65+
a = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T));
66+
b = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T));
67+
c = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T));
68+
sum = (T*)aligned_alloc(ALIGNMENT, ALIGNMENT);
69+
#else
6370
a = sycl::malloc_shared<T>(array_size, *queue);
6471
b = sycl::malloc_shared<T>(array_size, *queue);
6572
c = sycl::malloc_shared<T>(array_size, *queue);
6673
sum = sycl::malloc_shared<T>(1, *queue);
74+
#endif
6775

6876
// No longer need list of devices
6977
devices.clear();
@@ -74,10 +82,17 @@ SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
7482

7583
template<class T>
7684
SYCLStream<T>::~SYCLStream() {
85+
#if defined(PAGEFAULT)
86+
free(a);
87+
free(b);
88+
free(c);
89+
free(sum);
90+
#else
7791
sycl::free(a, *queue);
7892
sycl::free(b, *queue);
7993
sycl::free(c, *queue);
8094
sycl::free(sum, *queue);
95+
#endif
8196
}
8297

8398
template <class T>
@@ -175,6 +190,14 @@ T SYCLStream<T>::dot()
175190
template <class T>
176191
void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
177192
{
193+
#if defined(PAGEFAULT)
194+
for (int i = 0; i < array_size; i++)
195+
{
196+
a[i] = initA;
197+
b[i] = initB;
198+
c[i] = initC;
199+
}
200+
#else
178201
queue->submit([&](sycl::handler &cgh)
179202
{
180203
cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx)
@@ -186,6 +209,7 @@ void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
186209
});
187210

188211
queue->wait();
212+
#endif
189213
}
190214

191215
template <class T>

src/sycl2020-usm/model.cmake

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@ register_flag_optional(SYCL_COMPILER_DIR
1818
HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
1919
"")
2020

21+
register_flag_optional(MEM "Device memory mode:
22+
DEFAULT - allocate host and device memory pointers.
23+
PAGEFAULT - shared memory, only host pointers allocated."
24+
"DEFAULT")
2125

2226
macro(setup)
2327
set(CMAKE_CXX_STANDARD 17)
@@ -59,6 +63,8 @@ macro(setup)
5963
message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported")
6064
endif ()
6165

66+
register_definitions(${MEM})
67+
6268
endmacro()
6369

6470

0 commit comments

Comments
 (0)