Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ target_sources(
src/ddc/for_each_block.hpp
src/ddc/kokkos_allocator.hpp
src/ddc/non_uniform_point_sampling.hpp
src/ddc/parallel_copy.hpp
src/ddc/parallel_deepcopy.hpp
src/ddc/parallel_fill.hpp
src/ddc/parallel_for_each.hpp
Expand Down
1 change: 1 addition & 0 deletions src/ddc/ddc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ namespace ddc {
#include "create_mirror.hpp"
#include "for_each.hpp"
#include "for_each_block.hpp"
#include "parallel_copy.hpp"
#include "parallel_deepcopy.hpp"
#include "parallel_fill.hpp"
#include "parallel_for_each.hpp"
Expand Down
111 changes: 111 additions & 0 deletions src/ddc/parallel_copy.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// Copyright (C) The DDC development team, see COPYRIGHT.md file
//
// SPDX-License-Identifier: MIT

#pragma once

#include <cassert>
#include <type_traits>
#include <utility>

#include <Kokkos_Core.hpp>

#include "chunk_traits.hpp"
#include "ddc_to_kokkos_execution_policy.hpp"

namespace ddc {

namespace detail {

/**
 * Primary template, left empty on purpose: only the partial specialization on
 * `std::index_sequence` that follows provides a call operator.
 */
template <typename ChunkSpanDst, typename ChunkSpanSrc, typename IndexSequence>
class CopyKokkosLambdaAdapter
{
};

/**
 * Functor copying one element of a source span into a destination span per call.
 *
 * It stores a copy of both spans and exposes a call operator taking one index per element of
 * the pack `Idx` (or a single ignored index when the pack is empty).
 *
 * @tparam ChunkSpanDst type of the destination span, must provide `discrete_vector_type`
 * @tparam ChunkSpanSrc type of the source span, must provide `discrete_vector_type`
 * @tparam Idx pack whose size fixes the number of indices received by the call operator
 */
template <typename ChunkSpanDst, typename ChunkSpanSrc, std::size_t... Idx>
class CopyKokkosLambdaAdapter<ChunkSpanDst, ChunkSpanSrc, std::index_sequence<Idx...>>
{
    // All indices share the type DiscreteVectorElement; the alias is a template only so that
    // it can be expanded once per element of the pack `Idx`.
    template <std::size_t I>
    using index_type = DiscreteVectorElement;

    ChunkSpanDst m_dst;

    ChunkSpanSrc m_src;

public:
    /**
     * @param[in] dst the span written by the call operator
     * @param[in] src the span read by the call operator
     */
    explicit CopyKokkosLambdaAdapter(ChunkSpanDst const& dst, ChunkSpanSrc const& src)
        : m_dst(dst)
        , m_src(src)
    {
    }

    /// Empty-pack case: the received index is ignored and the spans are accessed without index.
    KOKKOS_FUNCTION void operator()(index_type<0> /*id*/) const
        requires(sizeof...(Idx) == 0)
    {
        m_dst() = m_src();
    }

    /// General case: the indices are gathered in a destination vector used to address both spans.
    KOKKOS_FUNCTION void operator()(index_type<Idx>... ids) const
        requires(sizeof...(Idx) > 0)
    {
        // `typename` is spelled out for consistency with the other dependent type names of this
        // file and for compilers that do not implement the C++20 relaxation (P0634).
        using DVectDst = typename ChunkSpanDst::discrete_vector_type;
        using DVectSrc = typename ChunkSpanSrc::discrete_vector_type;
        DVectDst const ddst(ids...);
        // NOTE(review): the conversion DVectSrc(ddst) presumably selects and reorders the
        // components matching the source dimensions — confirm against DiscreteVector.
        m_dst(ddst) = m_src(DVectSrc(ddst));
    }
};

// Deduction guide: the index sequence is built from the rank of the destination span, so the
// call operator of the deduced functor takes `Dst::rank()` indices.
template <typename Dst, typename Src>
CopyKokkosLambdaAdapter(Dst const&, Src const&)
        -> CopyKokkosLambdaAdapter<Dst, Src, std::make_index_sequence<Dst::rank()>>;

} // namespace detail

/** Copy the content of a borrowed chunk into another. It supports transposition and broadcasting at the same time.
* The two arrays must be accessible from execution_space.
* @param[in] execution_space a Kokkos execution space where the loop will be executed on
* @param[out] dst the borrowed chunk in which to copy
* @param[in] src the borrowed chunk from which to copy
* @return dst as a ChunkSpan
*/
template <class ExecSpace, class ChunkDst, class ChunkSrc>
auto parallel_copy(ExecSpace const& execution_space, ChunkDst&& dst, ChunkSrc&& src)
{
static_assert(is_borrowed_chunk_v<ChunkDst>);
static_assert(is_borrowed_chunk_v<ChunkSrc>);
static_assert(Kokkos::SpaceAccessibility<
ExecSpace,
typename std::remove_cvref_t<ChunkDst>::memory_space>::accessible);
static_assert(Kokkos::SpaceAccessibility<
ExecSpace,
typename std::remove_cvref_t<ChunkSrc>::memory_space>::accessible);
static_assert(
std::is_assignable_v<chunk_reference_t<ChunkDst>, chunk_reference_t<ChunkSrc>>,
"Not assignable");
using DDomDst = decltype(dst.domain());
using DDomSrc = decltype(src.domain());
assert(DDomSrc(dst.domain()) == src.domain());
if constexpr (std::is_same_v<DDomDst, DDomSrc>) {
Kokkos::deep_copy(
execution_space,
dst.allocation_kokkos_view(),
src.allocation_kokkos_view());
} else {
// The current implementation uses a loop over dst dimensions.
// Alternative implementations:
// - outer loop over src dimensions and inner loop over batch dimensions
// - outer loop over batch dimensions and inner loop over src dimensions
Kokkos::parallel_for(
"ddc_copy_default",
detail::ddc_to_kokkos_execution_policy(
execution_space,
detail::array(dst.domain().extents())),
detail::CopyKokkosLambdaAdapter(dst.span_view(), src.span_cview()));
}
return dst.span_view();
}

} // namespace ddc
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ add_executable(
for_each_block.cpp
multiple_discrete_dimensions.cpp
non_uniform_point_sampling.cpp
parallel_copy.cpp
parallel_deepcopy.cpp
parallel_fill.cpp
parallel_for_each.cpp
Expand Down
168 changes: 168 additions & 0 deletions tests/parallel_copy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
// Copyright (C) The DDC development team, see COPYRIGHT.md file
//
// SPDX-License-Identifier: MIT

#include <ddc/ddc.hpp>

#include <gtest/gtest.h>

#include <Kokkos_Core.hpp>
#include <Kokkos_StdAlgorithms.hpp>

#include "ddc/kokkos_allocator.hpp"

inline namespace anonymous_namespace_workaround_parallel_copy_cpp {

// ---- Index types without any dimension ----
using DElem0D = ddc::DiscreteElement<>;
using DVect0D = ddc::DiscreteVector<>;
using DDom0D = ddc::DiscreteDomain<>;

template <class Datatype>
using Chunk0D = ddc::Chunk<Datatype, DDom0D>;
template <class Datatype>
using ChunkSpan0D = ddc::ChunkSpan<Datatype, DDom0D>;

// ---- Dimension X ----
struct DDimX {};
using DElemX = ddc::DiscreteElement<DDimX>;
using DVectX = ddc::DiscreteVector<DDimX>;
using DDomX = ddc::DiscreteDomain<DDimX>;

template <class Datatype>
using ChunkX = ddc::Chunk<Datatype, DDomX>;

// ---- Dimension Y ----
struct DDimY {};
using DElemY = ddc::DiscreteElement<DDimY>;
using DVectY = ddc::DiscreteVector<DDimY>;
using DDomY = ddc::DiscreteDomain<DDimY>;

template <class Datatype>
using ChunkY = ddc::Chunk<Datatype, DDomY>;

// ---- Dimension Z ----
struct DDimZ {};
using DElemZ = ddc::DiscreteElement<DDimZ>;
using DVectZ = ddc::DiscreteVector<DDimZ>;
using DDomZ = ddc::DiscreteDomain<DDimZ>;

// ---- (X, Y) ordering ----
using DElemXY = ddc::DiscreteElement<DDimX, DDimY>;
using DVectXY = ddc::DiscreteVector<DDimX, DDimY>;
using DDomXY = ddc::DiscreteDomain<DDimX, DDimY>;

template <class Datatype>
using ChunkXY = ddc::Chunk<Datatype, DDomXY>;

// ---- (Y, X) ordering ----
using DElemYX = ddc::DiscreteElement<DDimY, DDimX>;
using DVectYX = ddc::DiscreteVector<DDimY, DDimX>;
using DDomYX = ddc::DiscreteDomain<DDimY, DDimX>;

template <class Datatype>
using ChunkYX = ddc::Chunk<Datatype, DDomYX>;

// ---- (X, Y, Z) ordering ----
using DElemXYZ = ddc::DiscreteElement<DDimX, DDimY, DDimZ>;
using DVectXYZ = ddc::DiscreteVector<DDimX, DDimY, DDimZ>;
using DDomXYZ = ddc::DiscreteDomain<DDimX, DDimY, DDimZ>;

// ---- Bounds, sizes and domains shared by the tests ----
constexpr DElem0D lbound_0d {};
constexpr DVect0D nelems_0d {};
constexpr DDom0D dom_0d(lbound_0d, nelems_0d);

constexpr DElemX lbound_x = ddc::init_trivial_half_bounded_space<DDimX>();
constexpr DVectX nelems_x(3);
constexpr DDomX dom_x(lbound_x, nelems_x);

constexpr DElemY lbound_y = ddc::init_trivial_half_bounded_space<DDimY>();
constexpr DVectY nelems_y(12);

constexpr DElemZ lbound_z = ddc::init_trivial_half_bounded_space<DDimZ>();
constexpr DVectZ nelems_z(5);

constexpr DElemXY lbound_x_y(lbound_x, lbound_y);
constexpr DVectXY nelems_x_y(nelems_x, nelems_y);
constexpr DDomXY dom_x_y(lbound_x_y, nelems_x_y);

constexpr DElemXYZ lbound_x_y_z(lbound_x, lbound_y, lbound_z);
constexpr DVectXYZ nelems_x_y_z(nelems_x, nelems_y, nelems_z);
constexpr DDomXYZ dom_x_y_z(lbound_x_y_z, nelems_x_y_z);

} // namespace anonymous_namespace_workaround_parallel_copy_cpp

// A source defined on the dimensionless domain is copied into an (X, Y) destination;
// every destination element is expected to hold the source value afterwards.
TEST(ParallelCopy, BroadcastScalar2XY)
{
    Kokkos::DefaultExecutionSpace const exec_space;

    // Destination: flat storage seen through a 2D view, initialised to 0.
    Kokkos::View<int*> const
            dst_storage(Kokkos::view_alloc("storage", exec_space), dom_x_y.size());
    Kokkos::View<int**> const dst_view(dst_storage.data(), nelems_x, nelems_y);
    ddc::ChunkSpan const dst_x_y(dst_view, dom_x_y);
    ddc::parallel_fill(exec_space, dst_x_y, 0);

    // Source: chunk on the dimensionless domain, initialised to 1.
    ddc::Chunk src_0d(dom_0d, ddc::DeviceAllocator<int>());
    ddc::parallel_fill(exec_space, src_0d, 1);

    ddc::parallel_copy(exec_space, dst_x_y, src_0d);

    EXPECT_EQ(Kokkos::Experimental::count(exec_space, dst_storage, 1), dom_x_y.size());
}

// A source defined on X only is copied into an (X, Y) destination;
// every destination element is expected to hold the source value afterwards.
TEST(ParallelCopy, BroadcastX2XY)
{
    Kokkos::DefaultExecutionSpace const exec_space;

    // Destination: flat storage seen through a 2D view, initialised to 0.
    Kokkos::View<int*> const
            dst_storage(Kokkos::view_alloc("storage", exec_space), dom_x_y.size());
    Kokkos::View<int**> const dst_view(dst_storage.data(), nelems_x, nelems_y);
    ddc::ChunkSpan const dst_x_y(dst_view, dom_x_y);
    ddc::parallel_fill(exec_space, dst_x_y, 0);

    // Source: chunk on the X domain, initialised to 1.
    ddc::Chunk src_x(dom_x, ddc::DeviceAllocator<int>());
    ddc::parallel_fill(exec_space, src_x, 1);

    ddc::parallel_copy(exec_space, dst_x_y, src_x);

    EXPECT_EQ(Kokkos::Experimental::count(exec_space, dst_storage, 1), dom_x_y.size());
}

// Source and destination share the (X, Y) domain; the destination wraps a Kokkos view while
// the source is a ddc::Chunk. Every destination element is expected to equal 1 afterwards.
TEST(ParallelCopy, TransposeXY2XY)
{
    Kokkos::DefaultExecutionSpace const exec_space;

    // Destination: flat storage seen through a 2D view, initialised to 0.
    Kokkos::View<int*> const
            dst_storage(Kokkos::view_alloc("storage", exec_space), dom_x_y.size());
    Kokkos::View<int**> const dst_view(dst_storage.data(), nelems_x, nelems_y);
    ddc::ChunkSpan const dst_x_y(dst_view, dom_x_y);
    ddc::parallel_fill(exec_space, dst_x_y, 0);

    // Source: chunk on the same (X, Y) domain, initialised to 1.
    ddc::Chunk src_x_y(dom_x_y, ddc::DeviceAllocator<int>());
    ddc::parallel_fill(exec_space, src_x_y, 1);

    ddc::parallel_copy(exec_space, dst_x_y, src_x_y);

    EXPECT_EQ(Kokkos::Experimental::count(exec_space, dst_storage, 1), dom_x_y.size());
}

// A source defined on (Y, X) is copied into an (X, Y, Z) destination;
// every destination element is expected to hold the source value afterwards.
TEST(ParallelCopy, BroadcastAndTransposeYX2XYZ)
{
    Kokkos::DefaultExecutionSpace const exec_space;

    // Destination: flat storage seen through a 3D view, initialised to 0.
    Kokkos::View<int*> const
            dst_storage(Kokkos::view_alloc("storage", exec_space), dom_x_y_z.size());
    Kokkos::View<int***> const dst_view(dst_storage.data(), nelems_x, nelems_y, nelems_z);
    ddc::ChunkSpan const dst_x_y_z(dst_view, dom_x_y_z);
    ddc::parallel_fill(exec_space, dst_x_y_z, 0);

    // Source: chunk on the (Y, X) reordering of the (X, Y) domain, initialised to 1.
    DDomYX const dom_y_x(dom_x_y);
    ddc::Chunk src_y_x(dom_y_x, ddc::DeviceAllocator<int>());
    ddc::parallel_fill(exec_space, src_y_x, 1);

    ddc::parallel_copy(exec_space, dst_x_y_z, src_y_x);

    EXPECT_EQ(Kokkos::Experimental::count(exec_space, dst_storage, 1), dom_x_y_z.size());
}
Loading