Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/command_graph_generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ class command_graph_generator {

struct assigned_chunk {
node_id executed_on = -1;
chunk<3> chnk;
box<3> chnk;
};

struct buffer_requirements {
Expand All @@ -239,7 +239,7 @@ class command_graph_generator {

std::vector<assigned_chunk> split_task_and_assign_chunks(const task& tsk) const;

buffer_requirements_list get_buffer_requirements_for_mapped_access(const task& tsk, const subrange<3>& sr) const;
buffer_requirements_list get_buffer_requirements_for_mapped_access(const task& tsk, const box<3>& box) const;

assigned_chunks_with_requirements compute_per_chunk_requirements(const task& tsk, const std::vector<assigned_chunk>& chunks) const;

Expand Down
5 changes: 3 additions & 2 deletions include/split.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include "grid.h"
#include "ranges.h"

#include <cstddef>
Expand All @@ -8,7 +9,7 @@

namespace celerity::detail {

std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
std::vector<box<3>> split_1d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs);
std::vector<box<3>> split_2d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs);

} // namespace celerity::detail
30 changes: 14 additions & 16 deletions src/command_graph_generator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,9 @@ std::vector<const command*> command_graph_generator::build_task(const task& tsk)
}

void command_graph_generator::report_overlapping_writes(const task& tsk, const box_vector<3>& local_chunks) const {
const chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()};

// Since this check is run distributed on every node, we avoid quadratic behavior by only checking for conflicts between all local chunks and the
// region-union of remote chunks. This way, every conflict will be reported by at least one node.
const box<3> global_chunk(subrange(full_chunk.offset, full_chunk.range));
const box<3> global_chunk(subrange(tsk.get_global_offset(), tsk.get_global_size()));
auto remote_chunks = region_difference(global_chunk, region(box_vector<3>(local_chunks))).into_boxes();

// detect_overlapping_writes takes a single box_vector, so we concatenate local and global chunks (the order does not matter)
Expand All @@ -140,13 +138,15 @@ void command_graph_generator::report_overlapping_writes(const task& tsk, const b
}

std::vector<command_graph_generator::assigned_chunk> command_graph_generator::split_task_and_assign_chunks(const task& tsk) const {
const chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()};
const box<3> full_chunk{subrange<3>(tsk.get_global_offset(), tsk.get_global_size())};
const size_t num_chunks = m_num_nodes * m_test_chunk_multiplier;
const auto chunks = ([&] {
if(tsk.get_type() == task_type::collective || tsk.get_type() == task_type::fence) {
std::vector<chunk<3>> chunks;
std::vector<box<3>> chunks;
for(size_t nid = 0; nid < m_num_nodes; ++nid) {
chunks.push_back(chunk_cast<3>(chunk<1>{id<1>{tsk.get_type() == task_type::collective ? nid : 0}, ones, {m_num_nodes}}));
const id<1> min = tsk.get_type() == task_type::collective ? nid : 0;
const id<1> max = min + 1;
chunks.push_back(box_cast<3>(box<1>{min, max}));
}
return chunks;
}
Expand All @@ -157,7 +157,7 @@ std::vector<command_graph_generator::assigned_chunk> command_graph_generator::sp
if(tsk.get_hint<experimental::hints::split_2d>() != nullptr) { return split_2d(full_chunk, tsk.get_granularity(), num_chunks); }
return split_1d(full_chunk, tsk.get_granularity(), num_chunks);
}
return std::vector<chunk<3>>{full_chunk};
return std::vector<box<3>>{full_chunk};
})();
assert(chunks.size() <= num_chunks); // We may have created less than requested
assert(!chunks.empty());
Expand All @@ -176,12 +176,11 @@ std::vector<command_graph_generator::assigned_chunk> command_graph_generator::sp
return assigned_chunks;
}

command_graph_generator::buffer_requirements_list command_graph_generator::get_buffer_requirements_for_mapped_access(
const task& tsk, const subrange<3>& sr) const {
command_graph_generator::buffer_requirements_list command_graph_generator::get_buffer_requirements_for_mapped_access(const task& tsk, const box<3>& box) const {
buffer_requirements_list result;
const auto& access_map = tsk.get_buffer_access_map();
for(const buffer_id bid : access_map.get_accessed_buffers()) {
result.push_back(buffer_requirements{bid, access_map.compute_consumed_region(bid, box<3>(sr)), access_map.compute_produced_region(bid, box<3>(sr))});
result.push_back(buffer_requirements{bid, access_map.compute_consumed_region(bid, box), access_map.compute_produced_region(bid, box)});
}
return result;
}
Expand Down Expand Up @@ -283,7 +282,7 @@ void command_graph_generator::resolve_pending_reductions(
// as oversubscription is handled by the instruction graph).
// NOTE: The participating_nodes.count() check below relies on this being true
assert(chunks_with_requirements.local_chunks.size() <= 1);
for(const auto& [a_chunk, requirements] : chunks_with_requirements.local_chunks) {
for(const auto& [_, requirements] : chunks_with_requirements.local_chunks) {
if(std::none_of(requirements.begin(), requirements.end(), [&](const buffer_requirements& br) { return br.bid == bid && !br.consumed.empty(); })) {
// This chunk doesn't read from the buffer
continue;
Expand Down Expand Up @@ -390,7 +389,7 @@ void command_graph_generator::generate_pushes(batch& current_batch, const task&

// TODO: We currently generate an await push command for each local chunk, whereas we only generate a single push command for all remote chunks
void command_graph_generator::generate_await_pushes(batch& current_batch, const task& tsk, const assigned_chunks_with_requirements& chunks_with_requirements) {
for(auto& [a_chunk, requirements] : chunks_with_requirements.local_chunks) {
for(auto& [_, requirements] : chunks_with_requirements.local_chunks) {
for(auto& [bid, consumed, _] : requirements) {
if(consumed.empty()) continue;
auto& buffer = m_buffers.at(bid);
Expand Down Expand Up @@ -462,7 +461,7 @@ void command_graph_generator::generate_distributed_commands(batch& current_batch
if(m_policy.overlapping_write_error != error_policy::ignore) {
box_vector<3> local_chunks;
for(const auto& [a_chunk, _] : chunks_with_requirements.local_chunks) {
local_chunks.push_back(box<3>{a_chunk.chnk});
local_chunks.push_back(a_chunk.chnk);
}
report_overlapping_writes(tsk, local_chunks);
}
Expand All @@ -488,7 +487,7 @@ void command_graph_generator::generate_distributed_commands(batch& current_batch
// we have to include it in exactly one of the per-node intermediate reductions.
const bool is_reduction_initializer = std::any_of(tsk.get_reductions().begin(), tsk.get_reductions().end(),
[&](const auto& reduction) { return m_local_nid == reduction_initializer_nid && reduction.init_from_buffer; });
cmd = create_command<execution_command>(current_batch, &tsk, subrange{a_chunk.chnk}, is_reduction_initializer,
cmd = create_command<execution_command>(current_batch, &tsk, a_chunk.chnk.get_subrange(), is_reduction_initializer,
[&](const auto& record_debug_info) { record_debug_info(tsk, [this](const buffer_id bid) { return m_buffers.at(bid).debug_name; }); });
}

Expand Down Expand Up @@ -539,8 +538,7 @@ void command_graph_generator::generate_distributed_commands(batch& current_batch
if(const auto uninitialized_reads = region_difference(consumed, buffer.initialized_region); !uninitialized_reads.empty()) {
utils::report_error(m_policy.uninitialized_read_error,
"Command C{} on N{}, which executes {} of {}, reads {} {}, which has not been written by any node.", cmd->get_id(), m_local_nid,
box(subrange(a_chunk.chnk.offset, a_chunk.chnk.range)), print_task_debug_label(tsk), print_buffer_debug_label(bid),
uninitialized_reads);
a_chunk.chnk, print_task_debug_label(tsk), print_buffer_debug_label(bid), uninitialized_reads);
}
}
}
Expand Down
7 changes: 3 additions & 4 deletions src/instruction_graph_generator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1502,12 +1502,11 @@ std::vector<localized_chunk> generator_impl::split_task_execution_range(const ex
tsk.has_variable_split() && tsk.get_side_effect_map().empty() && tsk.get_collective_group_id() == non_collective_group_id;
const auto split = tsk.get_hint<experimental::hints::split_2d>() != nullptr ? split_2d : split_1d;

const auto command_sr = ecmd.get_execution_range();
const auto command_chunk = chunk<3>(command_sr.offset, command_sr.range, tsk.get_global_size());
const auto command_chunk = box<3>(ecmd.get_execution_range());

// As a heuristic to keep inter-device communication to a minimum, we split the execution range twice when oversubscription is active: Once to obtain
// contiguous chunks per device, and one more (below) to subdivide the ranges on each device (which can help with computation-communication overlap).
std::vector<chunk<3>> coarse_chunks;
std::vector<box<3>> coarse_chunks;
if(is_splittable_locally && tsk.get_execution_target() == execution_target::device) {
coarse_chunks = split(command_chunk, tsk.get_granularity(), m_system.devices.size());
} else {
Expand Down Expand Up @@ -1537,7 +1536,7 @@ std::vector<localized_chunk> generator_impl::split_task_execution_range(const ex
for(size_t coarse_idx = 0; coarse_idx < coarse_chunks.size(); ++coarse_idx) {
for(const auto& fine_chunk : split(coarse_chunks[coarse_idx], tsk.get_granularity(), oversubscribe_factor)) {
auto& localized_chunk = concurrent_chunks.emplace_back();
localized_chunk.execution_range = box(subrange(fine_chunk.offset, fine_chunk.range));
localized_chunk.execution_range = fine_chunk;
if(tsk.get_execution_target() == execution_target::device) {
assert(coarse_idx < m_system.devices.size());
localized_chunk.memory_id = m_system.devices[coarse_idx].native_memory;
Expand Down
99 changes: 54 additions & 45 deletions src/split.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,26 @@ namespace {
using namespace celerity;
using namespace celerity::detail;

[[maybe_unused]] void sanity_check_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split) {
[[maybe_unused]] void sanity_check_split(const box<3>& full_chunk, const std::vector<box<3>>& split) {
region<3> reconstructed_chunk;
for(auto& chnk : split) {
assert(region_intersection(reconstructed_chunk, box<3>(chnk)).empty());
reconstructed_chunk = region_union(box<3>(chnk), reconstructed_chunk);
assert(region_intersection(reconstructed_chunk, chnk).empty());
reconstructed_chunk = region_union(chnk, reconstructed_chunk);
}
assert(region_difference(reconstructed_chunk, box<3>(full_chunk)).empty());
assert(region_difference(reconstructed_chunk, full_chunk).empty());
}

template <int Dims>
std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks(
const chunk<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
const box<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
range<Dims> small_chunk_size{zeros};
range<Dims> large_chunk_size{zeros};
range<Dims> num_large_chunks{zeros};
for(int d = 0; d < Dims; ++d) {
const size_t ideal_chunk_size = full_chunk.range[d] / actual_num_chunks[d];
const size_t ideal_chunk_size = full_chunk.get_range()[d] / actual_num_chunks[d];
small_chunk_size[d] = (ideal_chunk_size / granularity[d]) * granularity[d];
large_chunk_size[d] = small_chunk_size[d] + granularity[d];
num_large_chunks[d] = (full_chunk.range[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
num_large_chunks[d] = (full_chunk.get_range()[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
}
return {small_chunk_size, large_chunk_size, num_large_chunks};
}
Expand All @@ -51,9 +51,9 @@ std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks
* @returns The number of chunks that can be created in dimension 0 and dimension 1, respectively. These are at most
* (f0, f1) or (f1, f0), however may be less if constrained by the split granularity.
*/
std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
std::array<size_t, 2> assign_split_factors_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
assert(num_chunks % factor == 0);
const size_t max_chunks[2] = {full_chunk.range[0] / granularity[0], full_chunk.range[1] / granularity[1]};
const size_t max_chunks[2] = {full_chunk.get_range()[0] / granularity[0], full_chunk.get_range()[1] / granularity[1]};
const size_t f0 = factor;
const size_t f1 = num_chunks / factor;

Expand All @@ -71,12 +71,12 @@ std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const

// If domain is square(-ish), prefer splitting along slower dimension.
// (These bounds have been chosen arbitrarily!)
const double squareishness = std::sqrt(full_chunk.range.size()) / static_cast<double>(full_chunk.range[0]);
const double squareishness = std::sqrt(full_chunk.get_area()) / static_cast<double>(full_chunk.get_range()[0]);
if(squareishness > 0.95 && squareishness < 1.05) { return (f0 >= f1) ? split_0_1 : split_1_0; }

// For non-square domains, prefer split that produces shorter edges (compare sum of circumferences)
const auto circ0 = full_chunk.range[0] / split_0_1[0] + full_chunk.range[1] / split_0_1[1];
const auto circ1 = full_chunk.range[0] / split_1_0[0] + full_chunk.range[1] / split_1_0[1];
const auto circ0 = full_chunk.get_range()[0] / split_0_1[0] + full_chunk.get_range()[1] / split_0_1[1];
const auto circ1 = full_chunk.get_range()[0] / split_1_0[0] + full_chunk.get_range()[1] / split_1_0[1];
return circ0 < circ1 ? split_0_1 : split_1_0;

// TODO: Yet another heuristic we may want to consider is how even chunk sizes are,
Expand All @@ -87,85 +87,94 @@ std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const

namespace celerity::detail {

std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
std::vector<box<3>> split_1d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs) {
#ifndef NDEBUG
assert(num_chunks > 0);
assert(num_boxs > 0);
for(int d = 0; d < 3; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
assert(full_box.get_range()[d] % granularity[d] == 0);
}
#endif

// Due to split granularity requirements or if num_workers > global_size[0],
// we may not be able to create the requested number of chunks.
const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.range[0] / granularity[0])};
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_chunk, granularity, actual_num_chunks);
// we may not be able to create the requested number of boxes.
const std::array<size_t, 1> actual_num_boxs = {std::min(num_boxs, full_box.get_range()[0] / granularity[0])};
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_box, granularity, actual_num_boxs);

std::vector<chunk<3>> result(actual_num_chunks[0], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
std::vector<box<3>> result;
result.reserve(actual_num_boxs[0]);
for(auto i = 0u; i < num_large_chunks[0]; ++i) {
result[i].range[0] = large_chunk_size[0];
result[i].offset[0] += i * large_chunk_size[0];
id<3> min = full_box.get_min();
id<3> max = full_box.get_max();
min[0] += i * large_chunk_size[0];
max[0] = min[0] + large_chunk_size[0];
result.emplace_back(min, max);
}
for(auto i = num_large_chunks[0]; i < actual_num_chunks[0]; ++i) {
result[i].range[0] = small_chunk_size[0];
result[i].offset[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
for(auto i = num_large_chunks[0]; i < actual_num_boxs[0]; ++i) {
id<3> min = full_box.get_min();
id<3> max = full_box.get_max();
min[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
max[0] = min[0] + small_chunk_size[0];
result.emplace_back(min, max);
}

#ifndef NDEBUG
sanity_check_split(full_chunk, result);
sanity_check_split(full_box, result);
#endif

return result;
}

// TODO: Make the split dimensions configurable for 3D chunks?
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
std::vector<box<3>> split_2d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs) {
#ifndef NDEBUG
assert(num_chunks > 0);
assert(num_boxs > 0);
for(int d = 0; d < 3; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
assert(full_box.get_range()[d] % granularity[d] == 0);
}
#endif

// Factorize num_chunks
// We start out with an initial guess of `factor = floor(sqrt(num_chunks))` (the other one is implicitly given by `num_chunks / factor`),
// Factorize num_boxs
// We start out with an initial guess of `factor = floor(sqrt(num_boxs))` (the other one is implicitly given by `num_boxs / factor`),
// and work our way down, keeping track of the best factorization we've found so far, until we find a factorization that produces
// the requested number of chunks, or until we reach (1, num_chunks), i.e., a 1D split.
size_t factor = std::floor(std::sqrt(num_chunks));
// the requested number of chunks, or until we reach (1, num_boxs), i.e., a 1D split.
size_t factor = std::floor(std::sqrt(num_boxs));
std::array<size_t, 2> best_chunk_counts = {0, 0};
while(factor >= 1) {
while(factor > 1 && num_chunks % factor != 0) {
while(factor > 1 && num_boxs % factor != 0) {
factor--;
}
// The returned counts are at most (factor, num_chunks / factor), however may be less if constrained by the split granularity.
const auto chunk_counts = assign_split_factors_2d(full_chunk, granularity, factor, num_chunks);
// The returned counts are at most (factor, num_boxs / factor), however may be less if constrained by the split granularity.
const auto chunk_counts = assign_split_factors_2d(full_box, granularity, factor, num_boxs);
if(chunk_counts[0] * chunk_counts[1] > best_chunk_counts[0] * best_chunk_counts[1]) { best_chunk_counts = chunk_counts; }
if(chunk_counts[0] * chunk_counts[1] == num_chunks) { break; }
if(chunk_counts[0] * chunk_counts[1] == num_boxs) { break; }
factor--;
}
const auto actual_num_chunks = best_chunk_counts;
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_chunk, granularity, actual_num_chunks);
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_box, granularity, actual_num_chunks);

std::vector<chunk<3>> result(actual_num_chunks[0] * actual_num_chunks[1], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
id<3> offset = full_chunk.offset;
std::vector<box<3>> result;
result.reserve(actual_num_chunks[0] * actual_num_chunks[1]);
id<3> offset = full_box.get_min();

for(size_t j = 0; j < actual_num_chunks[0]; ++j) {
range<2> chunk_size = {(j < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0], 0};
for(size_t i = 0; i < actual_num_chunks[1]; ++i) {
chunk_size[1] = (i < num_large_chunks[1]) ? large_chunk_size[1] : small_chunk_size[1];
auto& chnk = result[j * actual_num_chunks[1] + i];
chnk.offset = offset;
chnk.range[0] = chunk_size[0];
chnk.range[1] = chunk_size[1];
const id<3> min = offset;
id<3> max = full_box.get_max();
max[0] = min[0] + chunk_size[0];
max[1] = min[1] + chunk_size[1];
result.emplace_back(min, max);
offset[1] += chunk_size[1];
}
offset[0] += chunk_size[0];
offset[1] = full_chunk.offset[1];
offset[1] = full_box.get_min()[1];
}

#ifndef NDEBUG
sanity_check_split(full_chunk, result);
sanity_check_split(full_box, result);
#endif

return result;
Expand Down
Loading