Skip to content

Commit 094a007

Browse files
committed
Merge branch 'gonzalobg-reuse_memory' into develop
2 parents 1f3e8e1 + 36c00b0 commit 094a007

56 files changed

Lines changed: 1420 additions & 1774 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CMakeLists.txt

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,18 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
4444
message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
4545
endif ()
4646

47+
option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON)
48+
4749
# setup some defaults flags for everything
4850
set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
49-
set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
51+
set(DEFAULT_RELEASE_FLAGS -O3)
52+
if (BUILD_NATIVE)
53+
if(CMAKE_SYSTEM_PROCESSOR STREQUAL aarch64)
54+
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -mcpu=native)
55+
else()
56+
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
57+
endif()
58+
endif()
5059

5160
macro(hint_flag FLAG DESCRIPTION)
5261
if (NOT DEFINED ${FLAG})
@@ -149,17 +158,15 @@ endif ()
149158
include(cmake/register_models.cmake)
150159

151160
# register out models <model_name> <preprocessor_def_name> <source files...>
161+
register_model(serial SERIAL SerialStream.cpp)
152162
register_model(omp OMP OMPStream.cpp)
153163
register_model(ocl OCL OCLStream.cpp)
154-
register_model(std-data STD_DATA STDDataStream.cpp)
155-
register_model(std-indices STD_INDICES STDIndicesStream.cpp)
156-
register_model(std-ranges STD_RANGES STDRangesStream.cpp)
164+
register_model(std STD STDStream.cpp)
157165
register_model(hip HIP HIPStream.cpp)
158166
register_model(cuda CUDA CUDAStream.cu)
159167
register_model(kokkos KOKKOS KokkosStream.cpp)
160168
register_model(sycl SYCL SYCLStream.cpp)
161-
register_model(sycl2020-acc SYCL2020 SYCLStream2020.cpp)
162-
register_model(sycl2020-usm SYCL2020 SYCLStream2020.cpp)
169+
register_model(sycl2020 SYCL2020 SYCLStream2020.cpp)
163170
register_model(acc ACC ACCStream.cpp)
164171
# defining RAJA collides with the RAJA namespace so USE_RAJA
165172
register_model(raja USE_RAJA RAJAStream.cpp)

src/Stream.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,19 @@
77

88
#pragma once
99

10+
#include <cstdint>
11+
#include <array>
1012
#include <vector>
1113
#include <string>
14+
#include "benchmark.h"
1215

1316
#ifdef ENABLE_CALIPER
1417
#include <caliper/cali.h>
1518
#include <adiak.h>
1619
#endif
1720

21+
using std::intptr_t;
22+
1823
// Array values
1924
#define startA (0.1)
2025
#define startB (0.2)
@@ -36,9 +41,8 @@ class Stream
3641
virtual void nstream() = 0;
3742
virtual T dot() = 0;
3843

39-
// Copy memory between host and device
40-
virtual void init_arrays(T initA, T initB, T initC) = 0;
41-
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
44+
// Set pointers to read from arrays
45+
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
4246
};
4347

4448
// Implementation specific device functions

src/StreamModels.h

Lines changed: 24 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,8 @@
33

44
#if defined(CUDA)
55
#include "CUDAStream.h"
6-
#elif defined(STD_DATA)
7-
#include "STDDataStream.h"
8-
#elif defined(STD_INDICES)
9-
#include "STDIndicesStream.h"
10-
#elif defined(STD_RANGES)
11-
#include "STDRangesStream.hpp"
6+
#elif defined(STD)
7+
#include "STDStream.h"
128
#elif defined(TBB)
139
#include "TBBStream.hpp"
1410
#elif defined(THRUST)
@@ -31,71 +27,69 @@
3127
#include "SYCLStream2020.h"
3228
#elif defined(OMP)
3329
#include "OMPStream.h"
30+
#elif defined(SERIAL)
31+
#include "SerialStream.h"
3432
#elif defined(FUTHARK)
3533
#include "FutharkStream.h"
3634
#endif
3735

38-
template <typename T>
39-
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
36+
template <typename T, typename...Args>
37+
std::unique_ptr<Stream<T>> make_stream(Args... args) {
4038
#if defined(CUDA)
4139
// Use the CUDA implementation
42-
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
40+
return std::make_unique<CUDAStream<T>>(args...);
4341

4442
#elif defined(HIP)
4543
// Use the HIP implementation
46-
return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
44+
return std::make_unique<HIPStream<T>>(args...);
4745

4846
#elif defined(HC)
4947
// Use the HC implementation
50-
return std::make_unique<HCStream<T>>(array_size, deviceIndex);
48+
return std::make_unique<HCStream<T>>(args...);
5149

5250
#elif defined(OCL)
5351
// Use the OpenCL implementation
54-
return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
52+
return std::make_unique<OCLStream<T>>(args...);
5553

5654
#elif defined(USE_RAJA)
5755
// Use the RAJA implementation
58-
return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
56+
return std::make_unique<RAJAStream<T>>(args...);
5957

6058
#elif defined(KOKKOS)
6159
// Use the Kokkos implementation
62-
return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
60+
return std::make_unique<KokkosStream<T>>(args...);
6361

64-
#elif defined(STD_DATA)
62+
#elif defined(STD)
6563
// Use the C++ STD data-oriented implementation
66-
return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
67-
68-
#elif defined(STD_INDICES)
69-
// Use the C++ STD index-oriented implementation
70-
return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
71-
72-
#elif defined(STD_RANGES)
73-
// Use the C++ STD ranges implementation
74-
return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
64+
return std::make_unique<STDStream<T>>(args...);
7565

7666
#elif defined(TBB)
7767
// Use the C++20 implementation
78-
return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
68+
return std::make_unique<TBBStream<T>>(args...);
7969

8070
#elif defined(THRUST)
8171
// Use the Thrust implementation
82-
return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
72+
return std::make_unique<ThrustStream<T>>(args...);
8373

8474
#elif defined(ACC)
8575
// Use the OpenACC implementation
86-
return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
76+
return std::make_unique<ACCStream<T>>(args...);
8777

8878
#elif defined(SYCL) || defined(SYCL2020)
8979
// Use the SYCL implementation
90-
return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
80+
return std::make_unique<SYCLStream<T>>(args...);
9181

9282
#elif defined(OMP)
9383
// Use the OpenMP implementation
94-
return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
84+
return std::make_unique<OMPStream<T>>(args...);
85+
86+
#elif defined(SERIAL)
87+
// Use the Serial implementation
88+
return std::make_unique<SerialStream<T>>(args...);
9589

9690
#elif defined(FUTHARK)
9791
// Use the Futhark implementation
98-
return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
92+
return std::make_unique<FutharkStream<T>>(args...);
9993

10094
#else
10195

src/acc/ACCStream.cpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,12 @@
88
#include "ACCStream.h"
99

1010
template <class T>
11-
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
12-
: array_size{ARRAY_SIZE}
11+
ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
12+
T initA, T initB, T initC)
13+
: array_size{array_size}
1314
{
1415
acc_device_t device_type = acc_get_device_type();
15-
acc_set_device_num(device, device_type);
16+
acc_set_device_num(device_id, device_type);
1617

1718
// Set up data region on device
1819
this->a = new T[array_size];
@@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
2526

2627
#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
2728
{}
29+
30+
init_arrays(initA, initB, initC);
2831
}
2932

3033
template <class T>
@@ -62,20 +65,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
6265
}
6366

6467
template <class T>
65-
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
68+
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
6669
{
6770
T *a = this->a;
6871
T *b = this->b;
6972
T *c = this->c;
7073
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
7174
{}
7275

73-
for (intptr_t i = 0; i < array_size; i++)
74-
{
75-
h_a[i] = a[i];
76-
h_b[i] = b[i];
77-
h_c[i] = c[i];
78-
}
76+
h_a = a;
77+
h_b = b;
78+
h_c = c;
7979
}
8080

8181
template <class T>

src/acc/ACCStream.h

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,32 +19,25 @@
1919
template <class T>
2020
class ACCStream : public Stream<T>
2121
{
22-
struct A{
23-
T *a;
24-
T *b;
25-
T *c;
26-
};
27-
28-
protected:
2922
// Size of arrays
3023
intptr_t array_size;
31-
A aa;
3224
// Device side pointers
33-
T *a;
34-
T *b;
35-
T *c;
25+
T* restrict a;
26+
T* restrict b;
27+
T* restrict c;
3628

3729
public:
38-
ACCStream(const intptr_t, int);
30+
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
31+
T initA, T initB, T initC);
3932
~ACCStream();
4033

41-
virtual void copy() override;
42-
virtual void add() override;
43-
virtual void mul() override;
44-
virtual void triad() override;
45-
virtual void nstream() override;
46-
virtual T dot() override;
34+
void copy() override;
35+
void add() override;
36+
void mul() override;
37+
void triad() override;
38+
void nstream() override;
39+
T dot() override;
4740

48-
virtual void init_arrays(T initA, T initB, T initC) override;
49-
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
41+
void get_arrays(T const*& a, T const*& b, T const*& c) override;
42+
void init_arrays(T initA, T initB, T initC);
5043
};

src/benchmark.h

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#pragma once
2+
3+
#include <algorithm>
4+
#include <array>
5+
#include <initializer_list>
6+
#include <iostream>
7+
8+
// Array values
9+
#define startA (0.1)
10+
#define startB (0.2)
11+
#define startC (0.0)
12+
#define startScalar (0.4)
13+
14+
// Benchmark Identifier: identifies individual & groups of benchmarks:
15+
// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
16+
// - All: all kernels.
17+
// - Individual kernels only.
18+
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
19+
20+
struct Benchmark {
21+
BenchId id;
22+
char const* label;
23+
// Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
24+
// bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
25+
size_t weight;
26+
// Is it one of: Copy, Mul, Add, Triad, Dot?
27+
bool classic = false;
28+
};
29+
30+
// Benchmarks in the order in which - if present - should be run for validation purposes:
31+
constexpr size_t num_benchmarks = 6;
32+
constexpr std::array<Benchmark, num_benchmarks> bench = {
33+
Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true },
34+
Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true },
35+
Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true },
36+
Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true },
37+
Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true },
38+
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
39+
};
40+
41+
// Which buffers are needed by each benchmark
42+
inline bool needs_buffer(BenchId id, char n) {
43+
auto in = [n](std::initializer_list<char> values) {
44+
return std::find(values.begin(), values.end(), n) != values.end();
45+
};
46+
switch(id) {
47+
case BenchId::All: return in({'a','b','c'});
48+
case BenchId::Classic: return in({'a','b','c'});
49+
case BenchId::Copy: return in({'a','c'});
50+
case BenchId::Mul: return in({'b','c'});
51+
case BenchId::Add: return in({'a','b','c'});
52+
case BenchId::Triad: return in({'a','b','c'});
53+
case BenchId::Dot: return in({'a','b'});
54+
case BenchId::Nstream: return in({'a','b','c'});
55+
default:
56+
std::cerr << "Unknown benchmark" << std::endl;
57+
abort();
58+
}
59+
}
60+
61+
// Returns true if the benchmark needs to be run:
62+
inline bool run_benchmark(BenchId selection, Benchmark const& b) {
63+
if (selection == BenchId::All) return true;
64+
if (selection == BenchId::Classic && b.classic) return true;
65+
return selection == b.id;
66+
}

0 commit comments

Comments
 (0)