Merge pull request #217 from ifdu/svm

tomdeakin · web-flow · commit 1f3e8e1f6757 · 2026-02-20T08:46:49.000Z
Add optional support for GPU page faults in SYCL2020 and OMP backends
diff --git a/src/omp/OMPStream.cpp b/src/omp/OMPStream.cpp
@@ -8,6 +8,10 @@
 #include <cstdlib>  // For aligned_alloc
 #include "OMPStream.h"
 
+#if defined(PAGEFAULT)
+#pragma omp requires unified_shared_memory
+#endif
+
 #ifndef ALIGNMENT
 #define ALIGNMENT (2*1024*1024) // 2MB
 #endif
@@ -24,20 +28,22 @@ OMPStream<T>::OMPStream(const intptr_t ARRAY_SIZE, int device)
 
 #ifdef OMP_TARGET_GPU
   omp_set_default_device(device);
-  T *a = this->a;
-  T *b = this->b;
-  T *c = this->c;
-  // Set up data region on device
-  #pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size])
-  {}
+  #if !defined(PAGEFAULT)
+    T *a = this->a;
+    T *b = this->b;
+    T *c = this->c;
+    // Set up data region on device
+    #pragma omp target enter data map(alloc: a[0:array_size], b[0:array_size], c[0:array_size])
+    {}
+  #endif
 #endif
 
 }
 
 template <class T>
 OMPStream<T>::~OMPStream()
 {
-#ifdef OMP_TARGET_GPU
+#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
   // End data region on device
   intptr_t array_size = this->array_size;
   T *a = this->a;
@@ -55,7 +61,7 @@ template <class T>
 void OMPStream<T>::init_arrays(T initA, T initB, T initC)
 {
   intptr_t array_size = this->array_size;
-#ifdef OMP_TARGET_GPU
+#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
@@ -69,7 +75,7 @@ void OMPStream<T>::init_arrays(T initA, T initB, T initC)
     b[i] = initB;
     c[i] = initC;
   }
-  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
   // If using the Cray compiler, the kernels do not block, so this update forces
   // a small copy to ensure blocking so that timing is correct
   #pragma omp target update from(a[0:0])
@@ -80,7 +86,7 @@ template <class T>
 void OMPStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
 {
 
-#ifdef OMP_TARGET_GPU
+#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
   T *a = this->a;
   T *b = this->b;
   T *c = this->c;
@@ -101,7 +107,7 @@ void OMPStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::ve
 template <class T>
 void OMPStream<T>::copy()
 {
-#ifdef OMP_TARGET_GPU
+#if defined(OMP_TARGET_GPU) && !defined(PAGEFAULT)
   intptr_t array_size = this->array_size;
   T *a = this->a;
   T *c = this->c;
@@ -113,7 +119,7 @@ void OMPStream<T>::copy()
   {
     c[i] = a[i];
   }
-  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
   // If using the Cray compiler, the kernels do not block, so this update forces
   // a small copy to ensure blocking so that timing is correct
   #pragma omp target update from(a[0:0])
@@ -126,9 +132,11 @@ void OMPStream<T>::mul()
   const T scalar = startScalar;
 
 #ifdef OMP_TARGET_GPU
-  intptr_t array_size = this->array_size;
-  T *b = this->b;
-  T *c = this->c;
+  #if !defined(PAGEFAULT)
+    intptr_t array_size = this->array_size;
+    T *b = this->b;
+    T *c = this->c;
+  #endif
   #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
@@ -137,7 +145,7 @@ void OMPStream<T>::mul()
   {
     b[i] = scalar * c[i];
   }
-  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
   // If using the Cray compiler, the kernels do not block, so this update forces
   // a small copy to ensure blocking so that timing is correct
   #pragma omp target update from(c[0:0])
@@ -148,10 +156,12 @@ template <class T>
 void OMPStream<T>::add()
 {
 #ifdef OMP_TARGET_GPU
-  intptr_t array_size = this->array_size;
-  T *a = this->a;
-  T *b = this->b;
-  T *c = this->c;
+  #if !defined(PAGEFAULT)
+    intptr_t array_size = this->array_size;
+    T *a = this->a;
+    T *b = this->b;
+    T *c = this->c;
+  #endif
   #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
@@ -160,7 +170,7 @@ void OMPStream<T>::add()
   {
     c[i] = a[i] + b[i];
   }
-  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
   // If using the Cray compiler, the kernels do not block, so this update forces
   // a small copy to ensure blocking so that timing is correct
   #pragma omp target update from(a[0:0])
@@ -173,10 +183,12 @@ void OMPStream<T>::triad()
   const T scalar = startScalar;
 
 #ifdef OMP_TARGET_GPU
-  intptr_t array_size = this->array_size;
-  T *a = this->a;
-  T *b = this->b;
-  T *c = this->c;
+  #if !defined(PAGEFAULT)
+    intptr_t array_size = this->array_size;
+    T *a = this->a;
+    T *b = this->b;
+    T *c = this->c;
+  #endif
   #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
@@ -185,7 +197,7 @@ void OMPStream<T>::triad()
   {
     a[i] = b[i] + scalar * c[i];
   }
-  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
   // If using the Cray compiler, the kernels do not block, so this update forces
   // a small copy to ensure blocking so that timing is correct
   #pragma omp target update from(a[0:0])
@@ -198,10 +210,12 @@ void OMPStream<T>::nstream()
   const T scalar = startScalar;
 
 #ifdef OMP_TARGET_GPU
-  intptr_t array_size = this->array_size;
-  T *a = this->a;
-  T *b = this->b;
-  T *c = this->c;
+  #if !defined(PAGEFAULT)
+    intptr_t array_size = this->array_size;
+    T *a = this->a;
+    T *b = this->b;
+    T *c = this->c;
+  #endif
   #pragma omp target teams distribute parallel for simd
 #else
   #pragma omp parallel for
@@ -210,7 +224,7 @@ void OMPStream<T>::nstream()
   {
     a[i] += b[i] + scalar * c[i];
   }
-  #if defined(OMP_TARGET_GPU) && defined(_CRAYC)
+  #if defined(OMP_TARGET_GPU) && defined(_CRAYC) && !defined(PAGEFAULT)
   // If using the Cray compiler, the kernels do not block, so this update forces
   // a small copy to ensure blocking so that timing is correct
   #pragma omp target update from(a[0:0])
@@ -223,9 +237,11 @@ T OMPStream<T>::dot()
   T sum{};
 
 #ifdef OMP_TARGET_GPU
-  intptr_t array_size = this->array_size;
-  T *a = this->a;
-  T *b = this->b;
+  #if !defined(PAGEFAULT)
+    intptr_t array_size = this->array_size;
+    T *a = this->a;
+    T *b = this->b;
+  #endif
   #pragma omp target teams distribute parallel for simd map(tofrom: sum) reduction(+:sum)
 #else
   #pragma omp parallel for reduction(+:sum)
diff --git a/src/omp/model.cmake b/src/omp/model.cmake
@@ -111,6 +111,10 @@ register_flag_optional(OFFLOAD_APPEND_LINK_FLAG
         This is required for most offload implementations so that offload libraries can linked correctly."
         ON)
 
+register_flag_optional(MEM "Device memory mode:
+        DEFAULT   - allocate host and device memory pointers.
+        PAGEFAULT - shared memory, only host pointers allocated."
+        "DEFAULT")
 
 macro(setup)
     find_package(OpenMP REQUIRED)
@@ -190,5 +194,8 @@ macro(setup)
     if (OFFLOAD_APPEND_LINK_FLAG)
         register_append_link_flags(${OMP_FLAGS})
     endif ()
+
+    register_definitions(${MEM})
+
 endmacro()
 
diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020-usm/SYCLStream2020.cpp
@@ -9,6 +9,8 @@
 
 #include <iostream>
 
+#define ALIGNMENT (1024 * 1024 * 2)
+
 // Cache list of devices
 bool cached = false;
 std::vector<sycl::device> devices;
@@ -59,11 +61,17 @@ SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
       throw std::runtime_error("SYCL errors detected");
     }
   }});
-
+#if defined(PAGEFAULT)
+  a = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T));
+  b = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T));
+  c = (T*)aligned_alloc(ALIGNMENT, array_size * sizeof(T));
+  sum = (T*)aligned_alloc(ALIGNMENT, ALIGNMENT);
+#else
   a = sycl::malloc_shared<T>(array_size, *queue);
   b = sycl::malloc_shared<T>(array_size, *queue);
   c = sycl::malloc_shared<T>(array_size, *queue);
   sum = sycl::malloc_shared<T>(1, *queue);
+#endif
 
   // No longer need list of devices
   devices.clear();
@@ -74,10 +82,17 @@ SYCLStream<T>::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index)
 
 template<class T>
 SYCLStream<T>::~SYCLStream() {
+#if defined(PAGEFAULT)
+ free(a);
+ free(b);
+ free(c);
+ free(sum);
+#else
  sycl::free(a, *queue);
  sycl::free(b, *queue);
  sycl::free(c, *queue);
  sycl::free(sum, *queue);
+#endif
 }
 
 template <class T>
@@ -175,6 +190,14 @@ T SYCLStream<T>::dot()
 template <class T>
 void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
 {
+#if defined(PAGEFAULT)
+  for (int i = 0; i < array_size; i++)
+  {
+    a[i] = initA;
+    b[i] = initB;
+    c[i] = initC;
+  }
+#else
   queue->submit([&](sycl::handler &cgh)
   {
     cgh.parallel_for(sycl::range<1>{array_size}, [=, a = this->a, b = this->b, c = this->c](sycl::id<1> idx)
@@ -186,6 +209,7 @@ void SYCLStream<T>::init_arrays(T initA, T initB, T initC)
   });
 
   queue->wait();
+#endif
 }
 
 template <class T>
diff --git a/src/sycl2020-usm/model.cmake b/src/sycl2020-usm/model.cmake
@@ -18,6 +18,10 @@ register_flag_optional(SYCL_COMPILER_DIR
            HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`"
         "")
 
+register_flag_optional(MEM "Device memory mode:
+        DEFAULT   - allocate host and device memory pointers.
+        PAGEFAULT - shared memory, only host pointers allocated."
+        "DEFAULT")
 
 macro(setup)
     set(CMAKE_CXX_STANDARD 17)
@@ -59,6 +63,8 @@ macro(setup)
         message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported")
     endif ()
 
+    register_definitions(${MEM})
+
 endmacro()