Skip to content

Commit bb66aec

Browse files
committed
Add parallel_copy algorithm
1 parent e712093 commit bb66aec

5 files changed

Lines changed: 264 additions & 0 deletions

File tree

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ target_sources(
173173
src/ddc/for_each_block.hpp
174174
src/ddc/kokkos_allocator.hpp
175175
src/ddc/non_uniform_point_sampling.hpp
176+
src/ddc/parallel_copy.hpp
176177
src/ddc/parallel_deepcopy.hpp
177178
src/ddc/parallel_fill.hpp
178179
src/ddc/parallel_for_each.hpp

src/ddc/ddc.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ namespace ddc {
7272
#include "create_mirror.hpp"
7373
#include "for_each.hpp"
7474
#include "for_each_block.hpp"
75+
#include "parallel_copy.hpp"
7576
#include "parallel_deepcopy.hpp"
7677
#include "parallel_fill.hpp"
7778
#include "parallel_for_each.hpp"

src/ddc/parallel_copy.hpp

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
// Copyright (C) The DDC development team, see COPYRIGHT.md file
2+
//
3+
// SPDX-License-Identifier: MIT
4+
5+
#pragma once
6+
7+
#include <cassert>
8+
#include <type_traits>
9+
10+
#include <Kokkos_Core.hpp>
11+
12+
#include "chunk_span.hpp"
13+
#include "chunk_traits.hpp"
14+
#include "parallel_for_each.hpp"
15+
16+
namespace ddc {
17+
18+
namespace detail {
19+
20+
template <
21+
typename Tsrc,
22+
typename Tdst,
23+
typename DDomSrc,
24+
typename DDomDst,
25+
typename MemorySpace,
26+
typename LayoutSrc,
27+
typename LayoutDst>
28+
class CopyKokkosLambdaAdapter
29+
{
30+
ddc::ChunkSpan<Tdst, DDomDst, LayoutDst, MemorySpace> m_dst;
31+
32+
ddc::ChunkSpan<Tsrc const, DDomSrc, LayoutSrc, MemorySpace> m_src;
33+
34+
public:
35+
explicit CopyKokkosLambdaAdapter(
36+
ddc::ChunkSpan<Tdst, DDomDst, LayoutDst, MemorySpace> const& dst,
37+
ddc::ChunkSpan<Tsrc const, DDomSrc, LayoutSrc, MemorySpace> const& src)
38+
: m_dst(dst)
39+
, m_src(src)
40+
{
41+
}
42+
43+
KOKKOS_FUNCTION void operator()(DDomDst::discrete_element_type idst) const
44+
{
45+
m_dst(idst) = m_src(typename DDomSrc::discrete_element_type(idst));
46+
}
47+
};
48+
49+
} // namespace detail
50+
51+
/** Copy the content of a borrowed chunk into another. It supports transposition and broadcasting at the same time.
52+
* The two arrays must be accessible from execution_space.
53+
* @param[in] execution_space a Kokkos execution space where the loop will be executed on
54+
* @param[out] dst the borrowed chunk in which to copy
55+
* @param[in] src the borrowed chunk from which to copy
56+
* @return dst as a ChunkSpan
57+
*/
58+
template <class ExecSpace, class ChunkDst, class ChunkSrc>
59+
auto parallel_copy(ExecSpace const& execution_space, ChunkDst&& dst, ChunkSrc&& src)
60+
{
61+
static_assert(is_borrowed_chunk_v<ChunkDst>);
62+
static_assert(is_borrowed_chunk_v<ChunkSrc>);
63+
static_assert(Kokkos::SpaceAccessibility<
64+
ExecSpace,
65+
typename std::remove_cvref_t<ChunkDst>::memory_space>::accessible);
66+
static_assert(Kokkos::SpaceAccessibility<
67+
ExecSpace,
68+
typename std::remove_cvref_t<ChunkSrc>::memory_space>::accessible);
69+
static_assert(
70+
std::is_assignable_v<chunk_reference_t<ChunkDst>, chunk_reference_t<ChunkSrc>>,
71+
"Not assignable");
72+
using DDomDst = decltype(dst.domain());
73+
using DDomSrc = decltype(src.domain());
74+
assert(DDomSrc(dst.domain()) == src.domain());
75+
if constexpr (std::is_same_v<DDomDst, DDomSrc>) {
76+
Kokkos::deep_copy(
77+
execution_space,
78+
dst.allocation_kokkos_view(),
79+
src.allocation_kokkos_view());
80+
} else {
81+
// The current implementation uses a loop over dst dimensions.
82+
// Alternative implementations:
83+
// - outer loop over src dimensions and inner loop over batch dimensions
84+
// - outer loop over batch dimensions and inner loop over src dimensions
85+
ddc::parallel_for_each(
86+
execution_space,
87+
dst.domain(),
88+
detail::CopyKokkosLambdaAdapter(dst.span_view(), src.span_cview()));
89+
}
90+
return dst.span_view();
91+
}
92+
93+
} // namespace ddc

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ add_executable(
3232
for_each_block.cpp
3333
multiple_discrete_dimensions.cpp
3434
non_uniform_point_sampling.cpp
35+
parallel_copy.cpp
3536
parallel_deepcopy.cpp
3637
parallel_fill.cpp
3738
parallel_for_each.cpp

tests/parallel_copy.cpp

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
// Copyright (C) The DDC development team, see COPYRIGHT.md file
2+
//
3+
// SPDX-License-Identifier: MIT
4+
5+
#include <ddc/ddc.hpp>
6+
7+
#include <gtest/gtest.h>
8+
9+
#include <Kokkos_Core.hpp>
10+
#include <Kokkos_StdAlgorithms.hpp>
11+
12+
#include "ddc/kokkos_allocator.hpp"
13+
14+
inline namespace anonymous_namespace_workaround_parallel_copy_cpp {
15+
16+
using DElem0D = ddc::DiscreteElement<>;
17+
using DVect0D = ddc::DiscreteVector<>;
18+
using DDom0D = ddc::DiscreteDomain<>;
19+
20+
template <class Datatype>
21+
using Chunk0D = ddc::Chunk<Datatype, DDom0D>;
22+
template <class Datatype>
23+
using ChunkSpan0D = ddc::ChunkSpan<Datatype, DDom0D>;
24+
25+
26+
struct DDimX
27+
{
28+
};
29+
using DElemX = ddc::DiscreteElement<DDimX>;
30+
using DVectX = ddc::DiscreteVector<DDimX>;
31+
using DDomX = ddc::DiscreteDomain<DDimX>;
32+
33+
template <class Datatype>
34+
using ChunkX = ddc::Chunk<Datatype, DDomX>;
35+
36+
37+
struct DDimY
38+
{
39+
};
40+
using DElemY = ddc::DiscreteElement<DDimY>;
41+
using DVectY = ddc::DiscreteVector<DDimY>;
42+
using DDomY = ddc::DiscreteDomain<DDimY>;
43+
44+
template <class Datatype>
45+
using ChunkY = ddc::Chunk<Datatype, DDomY>;
46+
47+
48+
struct DDimZ
49+
{
50+
};
51+
using DElemZ = ddc::DiscreteElement<DDimZ>;
52+
using DVectZ = ddc::DiscreteVector<DDimZ>;
53+
using DDomZ = ddc::DiscreteDomain<DDimZ>;
54+
55+
56+
using DElemXY = ddc::DiscreteElement<DDimX, DDimY>;
57+
using DVectXY = ddc::DiscreteVector<DDimX, DDimY>;
58+
using DDomXY = ddc::DiscreteDomain<DDimX, DDimY>;
59+
60+
template <class Datatype>
61+
using ChunkXY = ddc::Chunk<Datatype, DDomXY>;
62+
63+
64+
using DElemYX = ddc::DiscreteElement<DDimY, DDimX>;
65+
using DVectYX = ddc::DiscreteVector<DDimY, DDimX>;
66+
using DDomYX = ddc::DiscreteDomain<DDimY, DDimX>;
67+
68+
template <class Datatype>
69+
using ChunkYX = ddc::Chunk<Datatype, DDomYX>;
70+
71+
72+
using DElemXYZ = ddc::DiscreteElement<DDimX, DDimY, DDimZ>;
73+
using DVectXYZ = ddc::DiscreteVector<DDimX, DDimY, DDimZ>;
74+
using DDomXYZ = ddc::DiscreteDomain<DDimX, DDimY, DDimZ>;
75+
76+
DElem0D constexpr lbound_0d {};
77+
DVect0D constexpr nelems_0d {};
78+
DDom0D constexpr dom_0d(lbound_0d, nelems_0d);
79+
80+
DElemX constexpr lbound_x = ddc::init_trivial_half_bounded_space<DDimX>();
81+
DVectX constexpr nelems_x(3);
82+
DDomX constexpr dom_x(lbound_x, nelems_x);
83+
84+
DElemY constexpr lbound_y = ddc::init_trivial_half_bounded_space<DDimY>();
85+
DVectY constexpr nelems_y(12);
86+
87+
DElemZ constexpr lbound_z = ddc::init_trivial_half_bounded_space<DDimZ>();
88+
DVectZ constexpr nelems_z(5);
89+
90+
DElemXY constexpr lbound_x_y(lbound_x, lbound_y);
91+
DVectXY constexpr nelems_x_y(nelems_x, nelems_y);
92+
DDomXY constexpr dom_x_y(lbound_x_y, nelems_x_y);
93+
94+
DElemXYZ constexpr lbound_x_y_z(lbound_x, lbound_y, lbound_z);
95+
DVectXYZ constexpr nelems_x_y_z(nelems_x, nelems_y, nelems_z);
96+
DDomXYZ constexpr dom_x_y_z(lbound_x_y_z, nelems_x_y_z);
97+
98+
} // namespace anonymous_namespace_workaround_parallel_copy_cpp
99+
100+
// Copies a chunk defined on the 0-dimensional domain into a chunk span over (X, Y).
TEST(ParallelCopy, BroadcastScalar2XY)
{
    Kokkos::DefaultExecutionSpace const exec_space;

    // Destination: a 2D span laid over a flat 1D allocation, initialized to 0.
    Kokkos::View<int*> const storage(Kokkos::view_alloc("storage", exec_space), dom_x_y.size());
    ddc::ChunkSpan const dst(Kokkos::View<int**>(storage.data(), nelems_x, nelems_y), dom_x_y);
    ddc::parallel_fill(exec_space, dst, 0);

    // Source: a chunk on the 0-dimensional domain, filled with 1.
    ddc::Chunk src(dom_0d, ddc::DeviceAllocator<int>());
    ddc::parallel_fill(exec_space, src, 1);

    ddc::parallel_copy(exec_space, dst, src);

    // Counting the 1s in the flat allocation must give the number of destination elements.
    EXPECT_EQ(Kokkos::Experimental::count(exec_space, storage, 1), dom_x_y.size());
}
116+
117+
// Copies a chunk defined over X into a chunk span over (X, Y).
TEST(ParallelCopy, BroadcastX2XY)
{
    Kokkos::DefaultExecutionSpace const exec_space;

    // Destination: a 2D span laid over a flat 1D allocation, initialized to 0.
    Kokkos::View<int*> const storage(Kokkos::view_alloc("storage", exec_space), dom_x_y.size());
    ddc::ChunkSpan const dst(Kokkos::View<int**>(storage.data(), nelems_x, nelems_y), dom_x_y);
    ddc::parallel_fill(exec_space, dst, 0);

    // Source: a chunk over X only, filled with 1.
    ddc::Chunk src(dom_x, ddc::DeviceAllocator<int>());
    ddc::parallel_fill(exec_space, src, 1);

    ddc::parallel_copy(exec_space, dst, src);

    // Counting the 1s in the flat allocation must give the number of destination elements.
    EXPECT_EQ(Kokkos::Experimental::count(exec_space, storage, 1), dom_x_y.size());
}
133+
134+
// Copies a chunk over (X, Y) into a chunk span over (X, Y).
// NOTE(review): source and destination are both built on dom_x_y, so no dimension is reordered
// here despite the test name; presumably this exercises the identical-domain branch of
// parallel_copy - confirm whether an (Y, X) source was intended.
TEST(ParallelCopy, TransposeXY2XY)
{
    Kokkos::DefaultExecutionSpace const exec_space;

    // Destination: a 2D span laid over a flat 1D allocation, initialized to 0.
    Kokkos::View<int*> const storage(Kokkos::view_alloc("storage", exec_space), dom_x_y.size());
    ddc::ChunkSpan const dst(Kokkos::View<int**>(storage.data(), nelems_x, nelems_y), dom_x_y);
    ddc::parallel_fill(exec_space, dst, 0);

    // Source: a chunk over the same (X, Y) domain, filled with 1.
    ddc::Chunk src(dom_x_y, ddc::DeviceAllocator<int>());
    ddc::parallel_fill(exec_space, src, 1);

    ddc::parallel_copy(exec_space, dst, src);

    // Counting the 1s in the flat allocation must give the number of destination elements.
    EXPECT_EQ(Kokkos::Experimental::count(exec_space, storage, 1), dom_x_y.size());
}
150+
151+
// Copies a chunk defined over (Y, X) into a chunk span over (X, Y, Z).
TEST(ParallelCopy, BroadcastAndTransposeYX2XYZ)
{
    Kokkos::DefaultExecutionSpace const exec_space;

    // Destination: a 3D span laid over a flat 1D allocation, initialized to 0.
    Kokkos::View<int*> const storage(Kokkos::view_alloc("storage", exec_space), dom_x_y_z.size());
    ddc::ChunkSpan const
            dst(Kokkos::View<int***>(storage.data(), nelems_x, nelems_y, nelems_z), dom_x_y_z);
    ddc::parallel_fill(exec_space, dst, 0);

    // Source: a chunk over the (Y, X) domain built from dom_x_y, filled with 1.
    DDomYX const dom_y_x(dom_x_y);
    ddc::Chunk src(dom_y_x, ddc::DeviceAllocator<int>());
    ddc::parallel_fill(exec_space, src, 1);

    ddc::parallel_copy(exec_space, dst, src);

    // Counting the 1s in the flat allocation must give the number of destination elements.
    EXPECT_EQ(Kokkos::Experimental::count(exec_space, storage, 1), dom_x_y_z.size());
}

0 commit comments

Comments
 (0)