Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/command_graph_generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ class command_graph_generator {

struct assigned_chunk {
node_id executed_on = -1;
chunk<3> chnk;
box<3> chnk;
};

struct buffer_requirements {
Expand All @@ -239,7 +239,7 @@ class command_graph_generator {

std::vector<assigned_chunk> split_task_and_assign_chunks(const task& tsk) const;

buffer_requirements_list get_buffer_requirements_for_mapped_access(const task& tsk, const subrange<3>& sr) const;
buffer_requirements_list get_buffer_requirements_for_mapped_access(const task& tsk, const box<3>& box) const;

assigned_chunks_with_requirements compute_per_chunk_requirements(const task& tsk, const std::vector<assigned_chunk>& chunks) const;

Expand Down
5 changes: 3 additions & 2 deletions include/split.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include "grid.h"
#include "ranges.h"

#include <cstddef>
Expand All @@ -8,7 +9,7 @@

namespace celerity::detail {

std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
std::vector<box<3>> split_1d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs);
std::vector<box<3>> split_2d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs);

} // namespace celerity::detail
30 changes: 14 additions & 16 deletions src/command_graph_generator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,9 @@ std::vector<const command*> command_graph_generator::build_task(const task& tsk)
}

void command_graph_generator::report_overlapping_writes(const task& tsk, const box_vector<3>& local_chunks) const {
const chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()};

// Since this check is run distributed on every node, we avoid quadratic behavior by only checking for conflicts between all local chunks and the
// region-union of remote chunks. This way, every conflict will be reported by at least one node.
const box<3> global_chunk(subrange(full_chunk.offset, full_chunk.range));
const box<3> global_chunk(subrange(tsk.get_global_offset(), tsk.get_global_size()));
auto remote_chunks = region_difference(global_chunk, region(box_vector<3>(local_chunks))).into_boxes();

// detect_overlapping_writes takes a single box_vector, so we concatenate local and global chunks (the order does not matter)
Expand All @@ -140,13 +138,15 @@ void command_graph_generator::report_overlapping_writes(const task& tsk, const b
}

std::vector<command_graph_generator::assigned_chunk> command_graph_generator::split_task_and_assign_chunks(const task& tsk) const {
const chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()};
const box<3> full_chunk{subrange<3>(tsk.get_global_offset(), tsk.get_global_size())};
const size_t num_chunks = m_num_nodes * m_test_chunk_multiplier;
const auto chunks = ([&] {
if(tsk.get_type() == task_type::collective || tsk.get_type() == task_type::fence) {
std::vector<chunk<3>> chunks;
std::vector<box<3>> chunks;
for(size_t nid = 0; nid < m_num_nodes; ++nid) {
chunks.push_back(chunk_cast<3>(chunk<1>{id<1>{tsk.get_type() == task_type::collective ? nid : 0}, ones, {m_num_nodes}}));
const id<1> min = tsk.get_type() == task_type::collective ? nid : 0;
const id<1> max = min + 1;
chunks.push_back(box_cast<3>(box<1>{min, max}));
}
return chunks;
}
Expand All @@ -157,7 +157,7 @@ std::vector<command_graph_generator::assigned_chunk> command_graph_generator::sp
if(tsk.get_hint<experimental::hints::split_2d>() != nullptr) { return split_2d(full_chunk, tsk.get_granularity(), num_chunks); }
return split_1d(full_chunk, tsk.get_granularity(), num_chunks);
}
return std::vector<chunk<3>>{full_chunk};
return std::vector<box<3>>{full_chunk};
})();
assert(chunks.size() <= num_chunks); // We may have created less than requested
assert(!chunks.empty());
Expand All @@ -176,12 +176,11 @@ std::vector<command_graph_generator::assigned_chunk> command_graph_generator::sp
return assigned_chunks;
}

command_graph_generator::buffer_requirements_list command_graph_generator::get_buffer_requirements_for_mapped_access(
const task& tsk, const subrange<3>& sr) const {
command_graph_generator::buffer_requirements_list command_graph_generator::get_buffer_requirements_for_mapped_access(const task& tsk, const box<3>& box) const {
buffer_requirements_list result;
const auto& access_map = tsk.get_buffer_access_map();
for(const buffer_id bid : access_map.get_accessed_buffers()) {
result.push_back(buffer_requirements{bid, access_map.compute_consumed_region(bid, box<3>(sr)), access_map.compute_produced_region(bid, box<3>(sr))});
result.push_back(buffer_requirements{bid, access_map.compute_consumed_region(bid, box), access_map.compute_produced_region(bid, box)});
}
return result;
}
Expand Down Expand Up @@ -283,7 +282,7 @@ void command_graph_generator::resolve_pending_reductions(
// as oversubscription is handled by the instruction graph).
// NOTE: The participating_nodes.count() check below relies on this being true
assert(chunks_with_requirements.local_chunks.size() <= 1);
for(const auto& [a_chunk, requirements] : chunks_with_requirements.local_chunks) {
for(const auto& [_, requirements] : chunks_with_requirements.local_chunks) {
if(std::none_of(requirements.begin(), requirements.end(), [&](const buffer_requirements& br) { return br.bid == bid && !br.consumed.empty(); })) {
// This chunk doesn't read from the buffer
continue;
Expand Down Expand Up @@ -390,7 +389,7 @@ void command_graph_generator::generate_pushes(batch& current_batch, const task&

// TODO: We currently generate an await push command for each local chunk, whereas we only generate a single push command for all remote chunks
void command_graph_generator::generate_await_pushes(batch& current_batch, const task& tsk, const assigned_chunks_with_requirements& chunks_with_requirements) {
for(auto& [a_chunk, requirements] : chunks_with_requirements.local_chunks) {
for(auto& [_, requirements] : chunks_with_requirements.local_chunks) {
for(auto& [bid, consumed, _] : requirements) {
if(consumed.empty()) continue;
auto& buffer = m_buffers.at(bid);
Expand Down Expand Up @@ -462,7 +461,7 @@ void command_graph_generator::generate_distributed_commands(batch& current_batch
if(m_policy.overlapping_write_error != error_policy::ignore) {
box_vector<3> local_chunks;
for(const auto& [a_chunk, _] : chunks_with_requirements.local_chunks) {
local_chunks.push_back(box<3>{a_chunk.chnk});
local_chunks.push_back(a_chunk.chnk);
}
report_overlapping_writes(tsk, local_chunks);
}
Expand All @@ -488,7 +487,7 @@ void command_graph_generator::generate_distributed_commands(batch& current_batch
// we have to include it in exactly one of the per-node intermediate reductions.
const bool is_reduction_initializer = std::any_of(tsk.get_reductions().begin(), tsk.get_reductions().end(),
[&](const auto& reduction) { return m_local_nid == reduction_initializer_nid && reduction.init_from_buffer; });
cmd = create_command<execution_command>(current_batch, &tsk, subrange{a_chunk.chnk}, is_reduction_initializer,
cmd = create_command<execution_command>(current_batch, &tsk, a_chunk.chnk.get_subrange(), is_reduction_initializer,
[&](const auto& record_debug_info) { record_debug_info(tsk, [this](const buffer_id bid) { return m_buffers.at(bid).debug_name; }); });
}

Expand Down Expand Up @@ -539,8 +538,7 @@ void command_graph_generator::generate_distributed_commands(batch& current_batch
if(const auto uninitialized_reads = region_difference(consumed, buffer.initialized_region); !uninitialized_reads.empty()) {
utils::report_error(m_policy.uninitialized_read_error,
"Command C{} on N{}, which executes {} of {}, reads {} {}, which has not been written by any node.", cmd->get_id(), m_local_nid,
box(subrange(a_chunk.chnk.offset, a_chunk.chnk.range)), print_task_debug_label(tsk), print_buffer_debug_label(bid),
uninitialized_reads);
a_chunk.chnk, print_task_debug_label(tsk), print_buffer_debug_label(bid), uninitialized_reads);
}
}
}
Expand Down
7 changes: 3 additions & 4 deletions src/instruction_graph_generator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1502,12 +1502,11 @@ std::vector<localized_chunk> generator_impl::split_task_execution_range(const ex
tsk.has_variable_split() && tsk.get_side_effect_map().empty() && tsk.get_collective_group_id() == non_collective_group_id;
const auto split = tsk.get_hint<experimental::hints::split_2d>() != nullptr ? split_2d : split_1d;

const auto command_sr = ecmd.get_execution_range();
const auto command_chunk = chunk<3>(command_sr.offset, command_sr.range, tsk.get_global_size());
const auto command_chunk = box<3>(ecmd.get_execution_range());

// As a heuristic to keep inter-device communication to a minimum, we split the execution range twice when oversubscription is active: Once to obtain
// contiguous chunks per device, and one more (below) to subdivide the ranges on each device (which can help with computation-communication overlap).
std::vector<chunk<3>> coarse_chunks;
std::vector<box<3>> coarse_chunks;
if(is_splittable_locally && tsk.get_execution_target() == execution_target::device) {
coarse_chunks = split(command_chunk, tsk.get_granularity(), m_system.devices.size());
} else {
Expand Down Expand Up @@ -1537,7 +1536,7 @@ std::vector<localized_chunk> generator_impl::split_task_execution_range(const ex
for(size_t coarse_idx = 0; coarse_idx < coarse_chunks.size(); ++coarse_idx) {
for(const auto& fine_chunk : split(coarse_chunks[coarse_idx], tsk.get_granularity(), oversubscribe_factor)) {
auto& localized_chunk = concurrent_chunks.emplace_back();
localized_chunk.execution_range = box(subrange(fine_chunk.offset, fine_chunk.range));
localized_chunk.execution_range = fine_chunk;
if(tsk.get_execution_target() == execution_target::device) {
assert(coarse_idx < m_system.devices.size());
localized_chunk.memory_id = m_system.devices[coarse_idx].native_memory;
Expand Down
99 changes: 54 additions & 45 deletions src/split.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,26 +17,26 @@ namespace {
using namespace celerity;
using namespace celerity::detail;

[[maybe_unused]] void sanity_check_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split) {
[[maybe_unused]] void sanity_check_split(const box<3>& full_chunk, const std::vector<box<3>>& split) {
region<3> reconstructed_chunk;
for(auto& chnk : split) {
assert(region_intersection(reconstructed_chunk, box<3>(chnk)).empty());
reconstructed_chunk = region_union(box<3>(chnk), reconstructed_chunk);
assert(region_intersection(reconstructed_chunk, chnk).empty());
reconstructed_chunk = region_union(chnk, reconstructed_chunk);
}
assert(region_difference(reconstructed_chunk, box<3>(full_chunk)).empty());
assert(region_difference(reconstructed_chunk, full_chunk).empty());
}

template <int Dims>
std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks(
const chunk<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
const box<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
range<Dims> small_chunk_size{zeros};
range<Dims> large_chunk_size{zeros};
range<Dims> num_large_chunks{zeros};
for(int d = 0; d < Dims; ++d) {
const size_t ideal_chunk_size = full_chunk.range[d] / actual_num_chunks[d];
const size_t ideal_chunk_size = full_chunk.get_range()[d] / actual_num_chunks[d];
small_chunk_size[d] = (ideal_chunk_size / granularity[d]) * granularity[d];
large_chunk_size[d] = small_chunk_size[d] + granularity[d];
num_large_chunks[d] = (full_chunk.range[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
num_large_chunks[d] = (full_chunk.get_range()[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
}
return {small_chunk_size, large_chunk_size, num_large_chunks};
}
Expand All @@ -51,9 +51,9 @@ std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks
* @returns The number of chunks that can be created in dimension 0 and dimension 1, respectively. These are at most
* (f0, f1) or (f1, f0), however may be less if constrained by the split granularity.
*/
std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
std::array<size_t, 2> assign_split_factors_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
assert(num_chunks % factor == 0);
const size_t max_chunks[2] = {full_chunk.range[0] / granularity[0], full_chunk.range[1] / granularity[1]};
const size_t max_chunks[2] = {full_chunk.get_range()[0] / granularity[0], full_chunk.get_range()[1] / granularity[1]};
const size_t f0 = factor;
const size_t f1 = num_chunks / factor;

Expand All @@ -71,12 +71,12 @@ std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const

// If domain is square(-ish), prefer splitting along slower dimension.
// (These bounds have been chosen arbitrarily!)
const double squareishness = std::sqrt(full_chunk.range.size()) / static_cast<double>(full_chunk.range[0]);
const double squareishness = std::sqrt(full_chunk.get_area()) / static_cast<double>(full_chunk.get_range()[0]);
if(squareishness > 0.95 && squareishness < 1.05) { return (f0 >= f1) ? split_0_1 : split_1_0; }

// For non-square domains, prefer split that produces shorter edges (compare sum of circumferences)
const auto circ0 = full_chunk.range[0] / split_0_1[0] + full_chunk.range[1] / split_0_1[1];
const auto circ1 = full_chunk.range[0] / split_1_0[0] + full_chunk.range[1] / split_1_0[1];
const auto circ0 = full_chunk.get_range()[0] / split_0_1[0] + full_chunk.get_range()[1] / split_0_1[1];
const auto circ1 = full_chunk.get_range()[0] / split_1_0[0] + full_chunk.get_range()[1] / split_1_0[1];
return circ0 < circ1 ? split_0_1 : split_1_0;

// TODO: Yet another heuristic we may want to consider is how even chunk sizes are,
Expand All @@ -87,85 +87,94 @@ std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const

namespace celerity::detail {

std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
std::vector<box<3>> split_1d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs) {
#ifndef NDEBUG
assert(num_chunks > 0);
assert(num_boxs > 0);
for(int d = 0; d < 3; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
assert(full_box.get_range()[d] % granularity[d] == 0);
}
#endif

// Due to split granularity requirements or if num_workers > global_size[0],
// we may not be able to create the requested number of chunks.
const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.range[0] / granularity[0])};
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_chunk, granularity, actual_num_chunks);
// we may not be able to create the requested number of boxes.
const std::array<size_t, 1> actual_num_boxs = {std::min(num_boxs, full_box.get_range()[0] / granularity[0])};
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_box, granularity, actual_num_boxs);

std::vector<chunk<3>> result(actual_num_chunks[0], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
std::vector<box<3>> result;
result.reserve(actual_num_boxs[0]);
for(auto i = 0u; i < num_large_chunks[0]; ++i) {
result[i].range[0] = large_chunk_size[0];
result[i].offset[0] += i * large_chunk_size[0];
id<3> min = full_box.get_min();
id<3> max = full_box.get_max();
min[0] += i * large_chunk_size[0];
max[0] = min[0] + large_chunk_size[0];
result.emplace_back(min, max);
}
for(auto i = num_large_chunks[0]; i < actual_num_chunks[0]; ++i) {
result[i].range[0] = small_chunk_size[0];
result[i].offset[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
for(auto i = num_large_chunks[0]; i < actual_num_boxs[0]; ++i) {
id<3> min = full_box.get_min();
id<3> max = full_box.get_max();
min[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
max[0] = min[0] + small_chunk_size[0];
result.emplace_back(min, max);
}

#ifndef NDEBUG
sanity_check_split(full_chunk, result);
sanity_check_split(full_box, result);
#endif

return result;
}

// TODO: Make the split dimensions configurable for 3D chunks?
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
std::vector<box<3>> split_2d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs) {
#ifndef NDEBUG
assert(num_chunks > 0);
assert(num_boxs > 0);
for(int d = 0; d < 3; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
assert(full_box.get_range()[d] % granularity[d] == 0);
}
#endif

// Factorize num_chunks
// We start out with an initial guess of `factor = floor(sqrt(num_chunks))` (the other one is implicitly given by `num_chunks / factor`),
// Factorize num_boxs
// We start out with an initial guess of `factor = floor(sqrt(num_boxs))` (the other one is implicitly given by `num_boxs / factor`),
// and work our way down, keeping track of the best factorization we've found so far, until we find a factorization that produces
// the requested number of chunks, or until we reach (1, num_chunks), i.e., a 1D split.
size_t factor = std::floor(std::sqrt(num_chunks));
// the requested number of chunks, or until we reach (1, num_boxs), i.e., a 1D split.
size_t factor = std::floor(std::sqrt(num_boxs));
std::array<size_t, 2> best_chunk_counts = {0, 0};
while(factor >= 1) {
while(factor > 1 && num_chunks % factor != 0) {
while(factor > 1 && num_boxs % factor != 0) {
factor--;
}
// The returned counts are at most (factor, num_chunks / factor), however may be less if constrained by the split granularity.
const auto chunk_counts = assign_split_factors_2d(full_chunk, granularity, factor, num_chunks);
// The returned counts are at most (factor, num_boxs / factor), however may be less if constrained by the split granularity.
const auto chunk_counts = assign_split_factors_2d(full_box, granularity, factor, num_boxs);
if(chunk_counts[0] * chunk_counts[1] > best_chunk_counts[0] * best_chunk_counts[1]) { best_chunk_counts = chunk_counts; }
if(chunk_counts[0] * chunk_counts[1] == num_chunks) { break; }
if(chunk_counts[0] * chunk_counts[1] == num_boxs) { break; }
factor--;
}
const auto actual_num_chunks = best_chunk_counts;
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_chunk, granularity, actual_num_chunks);
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_box, granularity, actual_num_chunks);

std::vector<chunk<3>> result(actual_num_chunks[0] * actual_num_chunks[1], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
id<3> offset = full_chunk.offset;
std::vector<box<3>> result;
result.reserve(actual_num_chunks[0] * actual_num_chunks[1]);
id<3> offset = full_box.get_min();

for(size_t j = 0; j < actual_num_chunks[0]; ++j) {
range<2> chunk_size = {(j < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0], 0};
for(size_t i = 0; i < actual_num_chunks[1]; ++i) {
chunk_size[1] = (i < num_large_chunks[1]) ? large_chunk_size[1] : small_chunk_size[1];
auto& chnk = result[j * actual_num_chunks[1] + i];
chnk.offset = offset;
chnk.range[0] = chunk_size[0];
chnk.range[1] = chunk_size[1];
const id<3> min = offset;
id<3> max = full_box.get_max();
max[0] = min[0] + chunk_size[0];
max[1] = min[1] + chunk_size[1];
result.emplace_back(min, max);
offset[1] += chunk_size[1];
}
offset[0] += chunk_size[0];
offset[1] = full_chunk.offset[1];
offset[1] = full_box.get_min()[1];
}

#ifndef NDEBUG
sanity_check_split(full_chunk, result);
sanity_check_split(full_box, result);
#endif

return result;
Expand Down
Loading