Skip to content

Commit ac9d25c

Browse files
committed
Faster implementation
1 parent bb66aec commit ac9d25c

1 file changed

Lines changed: 38 additions & 20 deletions

File tree

src/ddc/parallel_copy.hpp

Lines changed: 38 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -6,46 +6,62 @@
66

77
#include <cassert>
88
#include <type_traits>
9+
#include <utility>
910

1011
#include <Kokkos_Core.hpp>
1112

12-
#include "chunk_span.hpp"
1313
#include "chunk_traits.hpp"
14-
#include "parallel_for_each.hpp"
14+
#include "ddc_to_kokkos_execution_policy.hpp"
1515

1616
namespace ddc {
1717

1818
namespace detail {
1919

20-
template <
21-
typename Tsrc,
22-
typename Tdst,
23-
typename DDomSrc,
24-
typename DDomDst,
25-
typename MemorySpace,
26-
typename LayoutSrc,
27-
typename LayoutDst>
20+
template <typename ChunkSpanDst, typename ChunkSpanSrc, typename IndexSequence>
2821
class CopyKokkosLambdaAdapter
2922
{
30-
ddc::ChunkSpan<Tdst, DDomDst, LayoutDst, MemorySpace> m_dst;
23+
};
24+
25+
template <typename ChunkSpanDst, typename ChunkSpanSrc, std::size_t... Idx>
26+
class CopyKokkosLambdaAdapter<ChunkSpanDst, ChunkSpanSrc, std::index_sequence<Idx...>>
27+
{
28+
template <std::size_t I>
29+
using index_type = DiscreteVectorElement;
3130

32-
ddc::ChunkSpan<Tsrc const, DDomSrc, LayoutSrc, MemorySpace> m_src;
31+
ChunkSpanDst m_dst;
32+
33+
ChunkSpanSrc m_src;
3334

3435
public:
35-
explicit CopyKokkosLambdaAdapter(
36-
ddc::ChunkSpan<Tdst, DDomDst, LayoutDst, MemorySpace> const& dst,
37-
ddc::ChunkSpan<Tsrc const, DDomSrc, LayoutSrc, MemorySpace> const& src)
36+
explicit CopyKokkosLambdaAdapter(ChunkSpanDst const& dst, ChunkSpanSrc const& src)
3837
: m_dst(dst)
3938
, m_src(src)
4039
{
4140
}
4241

43-
KOKKOS_FUNCTION void operator()(DDomDst::discrete_element_type idst) const
42+
KOKKOS_FUNCTION void operator()(index_type<0> /*id*/) const
43+
requires(sizeof...(Idx) == 0)
4444
{
45-
m_dst(idst) = m_src(typename DDomSrc::discrete_element_type(idst));
45+
m_dst() = m_src();
46+
}
47+
48+
KOKKOS_FUNCTION void operator()(index_type<Idx>... ids) const
49+
requires(sizeof...(Idx) > 0)
50+
{
51+
using DVectDst = typename ChunkSpanDst::discrete_vector_type;
52+
using DVectSrc = typename ChunkSpanSrc::discrete_vector_type;
53+
DVectDst const ddst(ids...);
54+
m_dst(ddst) = m_src(DVectSrc(ddst));
4655
}
4756
};
4857

58+
template <typename ChunkSpanDst, typename ChunkSpanSrc>
59+
CopyKokkosLambdaAdapter(ChunkSpanDst const& dst, ChunkSpanSrc const& src)
60+
-> CopyKokkosLambdaAdapter<
61+
ChunkSpanDst,
62+
ChunkSpanSrc,
63+
std::make_index_sequence<ChunkSpanDst::rank()>>;
64+
4965
} // namespace detail
5066

5167
/** Copy the content of a borrowed chunk into another. It supports transposition and broadcasting at the same time.
@@ -82,9 +98,11 @@ auto parallel_copy(ExecSpace const& execution_space, ChunkDst&& dst, ChunkSrc&&
8298
// Alternative implementations:
8399
// - outer loop over src dimensions and inner loop over batch dimensions
84100
// - outer loop over batch dimensions and inner loop over src dimensions
85-
ddc::parallel_for_each(
86-
execution_space,
87-
dst.domain(),
101+
Kokkos::parallel_for(
102+
"ddc_copy_default",
103+
detail::ddc_to_kokkos_execution_policy(
104+
execution_space,
105+
detail::array(dst.domain().extents())),
88106
detail::CopyKokkosLambdaAdapter(dst.span_view(), src.span_cview()));
89107
}
90108
return dst.span_view();

0 commit comments

Comments
 (0)