Skip to content

Commit 95b117a

Browse files
committed
Change split functions to work on box instead of chunk
1 parent 6a2b416 commit 95b117a

5 files changed

Lines changed: 76 additions & 69 deletions

File tree

include/split.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22

3+
#include "grid.h"
34
#include "ranges.h"
45

56
#include <cstddef>
@@ -8,7 +9,7 @@
89

910
namespace celerity::detail {
1011

11-
std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
12-
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
12+
std::vector<box<3>> split_1d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxes);
13+
std::vector<box<3>> split_2d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxes);
1314

1415
} // namespace celerity::detail

src/command_graph_generator.cc

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -140,13 +140,15 @@ void command_graph_generator::report_overlapping_writes(const task& tsk, const b
140140
}
141141

142142
std::vector<command_graph_generator::assigned_chunk> command_graph_generator::split_task_and_assign_chunks(const task& tsk) const {
143-
const chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()};
143+
const box<3> full_chunk{subrange<3>(tsk.get_global_offset(), tsk.get_global_size())};
144144
const size_t num_chunks = m_num_nodes * m_test_chunk_multiplier;
145145
const auto chunks = ([&] {
146146
if(tsk.get_type() == task_type::collective || tsk.get_type() == task_type::fence) {
147-
std::vector<chunk<3>> chunks;
147+
std::vector<box<3>> chunks;
148148
for(size_t nid = 0; nid < m_num_nodes; ++nid) {
149-
chunks.push_back(chunk_cast<3>(chunk<1>{id<1>{tsk.get_type() == task_type::collective ? nid : 0}, ones, {m_num_nodes}}));
149+
const id<1> min = tsk.get_type() == task_type::collective ? nid : 0;
150+
const id<1> max = min + 1;
151+
chunks.push_back(box_cast<3>(box<1>{min, max}));
150152
}
151153
return chunks;
152154
}
@@ -157,7 +159,7 @@ std::vector<command_graph_generator::assigned_chunk> command_graph_generator::sp
157159
if(tsk.get_hint<experimental::hints::split_2d>() != nullptr) { return split_2d(full_chunk, tsk.get_granularity(), num_chunks); }
158160
return split_1d(full_chunk, tsk.get_granularity(), num_chunks);
159161
}
160-
return std::vector<chunk<3>>{full_chunk};
162+
return std::vector<box<3>>{full_chunk};
161163
})();
162164
assert(chunks.size() <= num_chunks); // We may have created less than requested
163165
assert(!chunks.empty());
@@ -171,7 +173,7 @@ std::vector<command_graph_generator::assigned_chunk> command_graph_generator::sp
171173
std::vector<assigned_chunk> assigned_chunks;
172174
for(size_t i = 0; i < chunks.size(); ++i) {
173175
const node_id nid = (i / chunks_per_node) % m_num_nodes;
174-
assigned_chunks.push_back({nid, chunks[i]});
176+
assigned_chunks.push_back({nid, chunk<3>(chunks[i].get_min(), chunks[i].get_range(), tsk.get_global_size())});
175177
}
176178
return assigned_chunks;
177179
}

src/instruction_graph_generator.cc

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,12 +1502,11 @@ std::vector<localized_chunk> generator_impl::split_task_execution_range(const ex
15021502
tsk.has_variable_split() && tsk.get_side_effect_map().empty() && tsk.get_collective_group_id() == non_collective_group_id;
15031503
const auto split = tsk.get_hint<experimental::hints::split_2d>() != nullptr ? split_2d : split_1d;
15041504

1505-
const auto command_sr = ecmd.get_execution_range();
1506-
const auto command_chunk = chunk<3>(command_sr.offset, command_sr.range, tsk.get_global_size());
1505+
const auto command_chunk = box<3>(ecmd.get_execution_range());
15071506

15081507
// As a heuristic to keep inter-device communication to a minimum, we split the execution range twice when oversubscription is active: Once to obtain
15091508
// contiguous chunks per device, and one more (below) to subdivide the ranges on each device (which can help with computation-communication overlap).
1510-
std::vector<chunk<3>> coarse_chunks;
1509+
std::vector<box<3>> coarse_chunks;
15111510
if(is_splittable_locally && tsk.get_execution_target() == execution_target::device) {
15121511
coarse_chunks = split(command_chunk, tsk.get_granularity(), m_system.devices.size());
15131512
} else {
@@ -1537,7 +1536,7 @@ std::vector<localized_chunk> generator_impl::split_task_execution_range(const ex
15371536
for(size_t coarse_idx = 0; coarse_idx < coarse_chunks.size(); ++coarse_idx) {
15381537
for(const auto& fine_chunk : split(coarse_chunks[coarse_idx], tsk.get_granularity(), oversubscribe_factor)) {
15391538
auto& localized_chunk = concurrent_chunks.emplace_back();
1540-
localized_chunk.execution_range = box(subrange(fine_chunk.offset, fine_chunk.range));
1539+
localized_chunk.execution_range = fine_chunk;
15411540
if(tsk.get_execution_target() == execution_target::device) {
15421541
assert(coarse_idx < m_system.devices.size());
15431542
localized_chunk.memory_id = m_system.devices[coarse_idx].native_memory;

src/split.cc

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -17,26 +17,26 @@ namespace {
1717
using namespace celerity;
1818
using namespace celerity::detail;
1919

20-
[[maybe_unused]] void sanity_check_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split) {
20+
[[maybe_unused]] void sanity_check_split(const box<3>& full_chunk, const std::vector<box<3>>& split) {
2121
region<3> reconstructed_chunk;
2222
for(auto& chnk : split) {
23-
assert(region_intersection(reconstructed_chunk, box<3>(chnk)).empty());
24-
reconstructed_chunk = region_union(box<3>(chnk), reconstructed_chunk);
23+
assert(region_intersection(reconstructed_chunk, chnk).empty());
24+
reconstructed_chunk = region_union(chnk, reconstructed_chunk);
2525
}
26-
assert(region_difference(reconstructed_chunk, box<3>(full_chunk)).empty());
26+
assert(region_difference(reconstructed_chunk, full_chunk).empty());
2727
}
2828

2929
template <int Dims>
3030
std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks(
31-
const chunk<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
31+
const box<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
3232
range<Dims> small_chunk_size{zeros};
3333
range<Dims> large_chunk_size{zeros};
3434
range<Dims> num_large_chunks{zeros};
3535
for(int d = 0; d < Dims; ++d) {
36-
const size_t ideal_chunk_size = full_chunk.range[d] / actual_num_chunks[d];
36+
const size_t ideal_chunk_size = full_chunk.get_range()[d] / actual_num_chunks[d];
3737
small_chunk_size[d] = (ideal_chunk_size / granularity[d]) * granularity[d];
3838
large_chunk_size[d] = small_chunk_size[d] + granularity[d];
39-
num_large_chunks[d] = (full_chunk.range[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
39+
num_large_chunks[d] = (full_chunk.get_range()[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
4040
}
4141
return {small_chunk_size, large_chunk_size, num_large_chunks};
4242
}
@@ -51,9 +51,9 @@ std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks
5151
* @returns The number of chunks that can be created in dimension 0 and dimension 1, respectively. These are at most
5252
* (f0, f1) or (f1, f0), however may be less if constrained by the split granularity.
5353
*/
54-
std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
54+
std::array<size_t, 2> assign_split_factors_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
5555
assert(num_chunks % factor == 0);
56-
const size_t max_chunks[2] = {full_chunk.range[0] / granularity[0], full_chunk.range[1] / granularity[1]};
56+
const size_t max_chunks[2] = {full_chunk.get_range()[0] / granularity[0], full_chunk.get_range()[1] / granularity[1]};
5757
const size_t f0 = factor;
5858
const size_t f1 = num_chunks / factor;
5959

@@ -71,12 +71,12 @@ std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const
7171

7272
// If domain is square(-ish), prefer splitting along slower dimension.
7373
// (These bounds have been chosen arbitrarily!)
74-
const double squareishness = std::sqrt(full_chunk.range.size()) / static_cast<double>(full_chunk.range[0]);
74+
const double squareishness = std::sqrt(full_chunk.get_area()) / static_cast<double>(full_chunk.get_range()[0]);
7575
if(squareishness > 0.95 && squareishness < 1.05) { return (f0 >= f1) ? split_0_1 : split_1_0; }
7676

7777
// For non-square domains, prefer split that produces shorter edges (compare sum of circumferences)
78-
const auto circ0 = full_chunk.range[0] / split_0_1[0] + full_chunk.range[1] / split_0_1[1];
79-
const auto circ1 = full_chunk.range[0] / split_1_0[0] + full_chunk.range[1] / split_1_0[1];
78+
const auto circ0 = full_chunk.get_range()[0] / split_0_1[0] + full_chunk.get_range()[1] / split_0_1[1];
79+
const auto circ1 = full_chunk.get_range()[0] / split_1_0[0] + full_chunk.get_range()[1] / split_1_0[1];
8080
return circ0 < circ1 ? split_0_1 : split_1_0;
8181

8282
// TODO: Yet another heuristic we may want to consider is how even chunk sizes are,
@@ -87,28 +87,35 @@ std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const
8787

8888
namespace celerity::detail {
8989

90-
std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
90+
std::vector<box<3>> split_1d(const box<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
9191
#ifndef NDEBUG
9292
assert(num_chunks > 0);
9393
for(int d = 0; d < 3; ++d) {
9494
assert(granularity[d] > 0);
95-
assert(full_chunk.range[d] % granularity[d] == 0);
95+
assert(full_chunk.get_range()[d] % granularity[d] == 0);
9696
}
9797
#endif
9898

9999
// Due to split granularity requirements or if num_workers > global_size[0],
100100
// we may not be able to create the requested number of chunks.
101-
const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.range[0] / granularity[0])};
101+
const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.get_range()[0] / granularity[0])};
102102
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_chunk, granularity, actual_num_chunks);
103103

104-
std::vector<chunk<3>> result(actual_num_chunks[0], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
104+
std::vector<box<3>> result;
105+
result.reserve(actual_num_chunks[0]);
105106
for(auto i = 0u; i < num_large_chunks[0]; ++i) {
106-
result[i].range[0] = large_chunk_size[0];
107-
result[i].offset[0] += i * large_chunk_size[0];
107+
id<3> min = full_chunk.get_min();
108+
id<3> max = full_chunk.get_max();
109+
min[0] += i * large_chunk_size[0];
110+
max[0] = min[0] + large_chunk_size[0];
111+
result.emplace_back(min, max);
108112
}
109113
for(auto i = num_large_chunks[0]; i < actual_num_chunks[0]; ++i) {
110-
result[i].range[0] = small_chunk_size[0];
111-
result[i].offset[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
114+
id<3> min = full_chunk.get_min();
115+
id<3> max = full_chunk.get_max();
116+
min[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
117+
max[0] = min[0] + small_chunk_size[0];
118+
result.emplace_back(min, max);
112119
}
113120

114121
#ifndef NDEBUG
@@ -119,12 +126,12 @@ std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granu
119126
}
120127

121128
// TODO: Make the split dimensions configurable for 3D chunks?
122-
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
129+
std::vector<box<3>> split_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
123130
#ifndef NDEBUG
124131
assert(num_chunks > 0);
125132
for(int d = 0; d < 3; ++d) {
126133
assert(granularity[d] > 0);
127-
assert(full_chunk.range[d] % granularity[d] == 0);
134+
assert(full_chunk.get_range()[d] % granularity[d] == 0);
128135
}
129136
#endif
130137

@@ -147,21 +154,23 @@ std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granu
147154
const auto actual_num_chunks = best_chunk_counts;
148155
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_chunk, granularity, actual_num_chunks);
149156

150-
std::vector<chunk<3>> result(actual_num_chunks[0] * actual_num_chunks[1], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
151-
id<3> offset = full_chunk.offset;
157+
std::vector<box<3>> result;
158+
result.reserve(actual_num_chunks[0] * actual_num_chunks[1]);
159+
id<3> offset = full_chunk.get_min();
152160

153161
for(size_t j = 0; j < actual_num_chunks[0]; ++j) {
154162
range<2> chunk_size = {(j < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0], 0};
155163
for(size_t i = 0; i < actual_num_chunks[1]; ++i) {
156164
chunk_size[1] = (i < num_large_chunks[1]) ? large_chunk_size[1] : small_chunk_size[1];
157-
auto& chnk = result[j * actual_num_chunks[1] + i];
158-
chnk.offset = offset;
159-
chnk.range[0] = chunk_size[0];
160-
chnk.range[1] = chunk_size[1];
165+
const id<3> min = offset;
166+
id<3> max = full_chunk.get_max();
167+
max[0] = min[0] + chunk_size[0];
168+
max[1] = min[1] + chunk_size[1];
169+
result.emplace_back(min, max);
161170
offset[1] += chunk_size[1];
162171
}
163172
offset[0] += chunk_size[0];
164-
offset[1] = full_chunk.offset[1];
173+
offset[1] = full_chunk.get_min()[1];
165174
}
166175

167176
#ifndef NDEBUG

test/split_tests.cc

Lines changed: 25 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
#include <unordered_set>
2-
31
#include <catch2/catch_template_test_macros.hpp>
42
#include <catch2/catch_test_macros.hpp>
53
#include <catch2/generators/catch_generators_range.hpp>
@@ -14,19 +12,18 @@ using namespace celerity::detail;
1412
namespace {
1513

1614
template <int Dims>
17-
chunk<3> make_full_chunk(range<Dims> range) {
18-
return {id<3>{}, range_cast<3>(range), range_cast<3>(range)};
15+
box<3> make_full_chunk(range<Dims> range) {
16+
return {id<3>{}, range_cast<3>(range)};
1917
}
2018

21-
void check_1d_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split_chunks, const std::vector<size_t>& chunk_ranges) {
19+
void check_1d_split(const box<3>& full_chunk, const std::vector<box<3>>& split_chunks, const std::vector<size_t>& chunk_ranges) {
2220
REQUIRE(split_chunks.size() == chunk_ranges.size());
23-
id<3> offset = full_chunk.offset;
21+
id<3> offset = full_chunk.get_min();
2422
for(size_t i = 0; i < split_chunks.size(); ++i) {
2523
const auto& chnk = split_chunks[i];
26-
REQUIRE_LOOP(chnk.offset == offset);
27-
REQUIRE_LOOP(chnk.range[0] == chunk_ranges[i]);
28-
REQUIRE_LOOP(chnk.global_size == full_chunk.global_size);
29-
offset[0] += split_chunks[i].range[0];
24+
REQUIRE_LOOP(chnk.get_min() == offset);
25+
REQUIRE_LOOP(chnk.get_range()[0] == chunk_ranges[i]);
26+
offset[0] += chnk.get_range()[0];
3027
}
3128
}
3229

@@ -48,21 +45,20 @@ void check_1d_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& spl
4845
* to the width of an individual chunk.
4946
*/
5047
void check_2d_split(
51-
const chunk<3>& full_chunk, const std::vector<chunk<3>>& split_chunks, const std::vector<std::pair<size_t, std::vector<size_t>>>& chunk_ranges) {
48+
const box<3>& full_chunk, const std::vector<box<3>>& split_chunks, const std::vector<std::pair<size_t, std::vector<size_t>>>& chunk_ranges) {
5249
REQUIRE(split_chunks.size() == std::accumulate(chunk_ranges.begin(), chunk_ranges.end(), size_t(0), [](size_t c, auto& p) { return c + p.second.size(); }));
5350
REQUIRE(std::all_of(chunk_ranges.begin(), chunk_ranges.end(), [&](auto& p) { return p.second.size() == chunk_ranges[0].second.size(); }));
54-
id<3> offset = full_chunk.offset;
51+
id<3> offset = full_chunk.get_min();
5552
for(size_t j = 0; j < chunk_ranges.size(); ++j) {
5653
const auto& [height, widths] = chunk_ranges[j];
5754
for(size_t i = 0; i < widths.size(); ++i) {
5855
const auto& chnk = split_chunks[j * chunk_ranges[0].second.size() + i];
59-
REQUIRE_LOOP(chnk.offset == offset);
60-
REQUIRE_LOOP(chnk.range[0] == height);
61-
REQUIRE_LOOP(chnk.range[1] == widths[i]);
62-
REQUIRE_LOOP(chnk.global_size == full_chunk.global_size);
56+
REQUIRE_LOOP(chnk.get_min() == offset);
57+
REQUIRE_LOOP(chnk.get_range()[0] == height);
58+
REQUIRE_LOOP(chnk.get_range()[1] == widths[i]);
6359
offset[1] += widths[i];
6460
}
65-
offset[1] = full_chunk.offset[1];
61+
offset[1] = full_chunk.get_min()[1];
6662
offset[0] += height;
6763
}
6864
}
@@ -94,13 +90,13 @@ TEST_CASE("split_1d creates fewer chunks than requested if mandated by granulari
9490
}
9591

9692
TEST_CASE("split_1d preserves offset of original chunk", "[split]") {
97-
const auto full_chunk = chunk<3>{{37, 42, 7}, {128, 1, 1}, {128, 1, 1}};
93+
const auto full_chunk = box<3>{subrange<3>({37, 42, 7}, {128, 1, 1})};
9894
const auto chunks = split_1d(full_chunk, ones, 4);
9995

100-
CHECK(chunks[0].offset == id<3>{37 + 0, 42, 7});
101-
CHECK(chunks[1].offset == id<3>{37 + 32, 42, 7});
102-
CHECK(chunks[2].offset == id<3>{37 + 64, 42, 7});
103-
CHECK(chunks[3].offset == id<3>{37 + 96, 42, 7});
96+
CHECK(chunks[0].get_min() == id<3>{37 + 0, 42, 7});
97+
CHECK(chunks[1].get_min() == id<3>{37 + 32, 42, 7});
98+
CHECK(chunks[2].get_min() == id<3>{37 + 64, 42, 7});
99+
CHECK(chunks[3].get_min() == id<3>{37 + 96, 42, 7});
104100

105101
check_1d_split(full_chunk, chunks, {32, 32, 32, 32});
106102
}
@@ -109,7 +105,7 @@ TEST_CASE("split_1d preserves ranges of original chunk in other dimensions", "[s
109105
const auto full_chunk = make_full_chunk<3>({128, 42, 341});
110106
const auto chunks = split_1d(full_chunk, ones, 4);
111107
for(size_t i = 0; i < 4; ++i) {
112-
REQUIRE_LOOP(chunks[0].range == range<3>{32, 42, 341});
108+
REQUIRE_LOOP(chunks[0].get_range() == range<3>{32, 42, 341});
113109
}
114110
}
115111

@@ -251,19 +247,19 @@ TEST_CASE("split_2d minimizes edge lengths for non-square domains") {
251247
}
252248

253249
TEST_CASE("split_2d preserves offset of original chunk", "[split]") {
254-
const auto full_chunk = chunk<3>{{37, 42, 7}, {64, 64, 1}, {128, 128, 1}};
250+
const auto full_chunk = box<3>{subrange<3>({37, 42, 7}, {64, 64, 1})};
255251
const auto chunks = split_2d(full_chunk, ones, 4);
256-
CHECK(chunks[0].offset == id<3>{37, 42, 7});
257-
CHECK(chunks[1].offset == id<3>{37, 42 + 32, 7});
258-
CHECK(chunks[2].offset == id<3>{37 + 32, 42 + 0, 7});
259-
CHECK(chunks[3].offset == id<3>{37 + 32, 42 + 32, 7});
252+
CHECK(chunks[0].get_min() == id<3>{37, 42, 7});
253+
CHECK(chunks[1].get_min() == id<3>{37, 42 + 32, 7});
254+
CHECK(chunks[2].get_min() == id<3>{37 + 32, 42 + 0, 7});
255+
CHECK(chunks[3].get_min() == id<3>{37 + 32, 42 + 32, 7});
260256
}
261257

262258
TEST_CASE("split_2d preserves ranges of original chunk in other dimensions", "[split]") {
263259
const auto full_chunk = make_full_chunk<3>({128, 128, 341});
264260
const auto chunks = split_2d(full_chunk, ones, 4);
265261
for(size_t i = 0; i < 4; ++i) {
266-
REQUIRE_LOOP(chunks[i].range == range<3>{64, 64, 341});
262+
REQUIRE_LOOP(chunks[i].get_range() == range<3>{64, 64, 341});
267263
}
268264
}
269265

0 commit comments

Comments (0)