Skip to content

Commit 1e831cc

Browse files
authored
Skip extent-one dimensions in a device copy (#8957)
* Skip extent-one dimensions in a device copy Their strides are meaningless anyway Fixes #8956
1 parent d5a30fb commit 1e831cc

3 files changed

Lines changed: 93 additions & 1 deletion

File tree

src/runtime/device_buffer_utils.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,10 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
129129
// are added to the copy by inserting it such that the stride is
130130
// in ascending order in the dst.
131131
for (int i = 0; i < dst->dimensions; i++) {
132+
// Skip extent-one dimensions
133+
if (dst->dim[i].extent == 1) {
134+
continue;
135+
}
132136
// TODO: deal with negative strides.
133137
uint64_t dst_stride_bytes = (uint64_t)dst->dim[i].stride * dst->type.bytes();
134138
uint64_t src_stride_bytes = (uint64_t)src->dim[i].stride * src->type.bytes();
@@ -147,7 +151,6 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
147151
c.src_stride_bytes[j] = c.src_stride_bytes[j - 1];
148152
}
149153
c.extent[insert] = min(src->dim[i].extent, dst->dim[i].extent);
150-
// debug(nullptr) << "c.extent[" << insert << "] = " << (int)(c.extent[insert]) << "\n";
151154
c.dst_stride_bytes[insert] = dst_stride_bytes;
152155
c.src_stride_bytes[insert] = src_stride_bytes;
153156
};
@@ -174,6 +177,7 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
174177
c.src_stride_bytes[MAX_COPY_DIMS - 1] = 0;
175178
c.dst_stride_bytes[MAX_COPY_DIMS - 1] = 0;
176179
}
180+
177181
return c;
178182
}
179183

test/performance/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ tests(GROUPS performance
1212
boundary_conditions.cpp
1313
clamped_vector_load.cpp
1414
const_division.cpp
15+
device_copy.cpp
1516
fast_inverse.cpp
1617
fast_pow.cpp
1718
fast_sine_cosine.cpp

test/performance/device_copy.cpp

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#include <vector>
2+
3+
#include "halide_benchmark.h"
4+
#include <Halide.h>
5+
6+
using namespace Halide;
7+
using namespace Halide::Tools;
8+
9+
double test(const Target &target, std::vector<int> order, std::vector<int> shape) {
10+
Buffer<float> buf(shape, order);
11+
for (int t = 0; t < shape[3]; t++) {
12+
for (int c = 0; c < shape[2]; c++) {
13+
for (int y = 0; y < shape[1]; y++) {
14+
for (int x = 0; x < shape[0]; x++) {
15+
buf(x, y, c, t) = x * .5f + y * 2.f + c * 4.f + t * .8f;
16+
}
17+
}
18+
}
19+
}
20+
21+
buf.set_host_dirty();
22+
23+
buf.device_malloc();
24+
25+
double time = benchmark([&]() {
26+
buf.set_host_dirty();
27+
buf.copy_to_device(target);
28+
buf.device_sync();
29+
});
30+
31+
// Nuke the host side data so we can check the data transferred back and forth OK.
32+
buf.set_device_dirty(false);
33+
buf.fill(0.f);
34+
buf.set_host_dirty(false);
35+
buf.set_device_dirty(true);
36+
buf.copy_to_host();
37+
38+
for (int t = 0; t < shape[3]; t++) {
39+
for (int c = 0; c < shape[2]; c++) {
40+
for (int y = 0; y < shape[1]; y++) {
41+
for (int x = 0; x < shape[0]; x++) {
42+
float correct = x * .5f + y * 2.f + c * 4.f + t * .8f;
43+
if (buf(x, y, c, t) != correct) {
44+
printf("buf(%d, %d, %d, %d) = %f instead of %f\n",
45+
x, y, c, t, buf(x, y, c, t), correct);
46+
exit(-1);
47+
}
48+
}
49+
}
50+
}
51+
}
52+
53+
return time;
54+
}
55+
56+
int main() {
57+
auto target = get_jit_target_from_environment();
58+
if (!target.has_gpu_feature()) {
59+
printf("[SKIP] This test requires a GPU target.\n");
60+
return 0;
61+
}
62+
63+
// These copies are all the same size and dense, but are in different memory
64+
// orderings and some of them have some extent=1 dimensions. (See
65+
// https://github.com/halide/Halide/issues/8956)
66+
67+
double t1 = test(target, {3, 2, 0, 1}, {1024, 1024, 3, 2});
68+
double t2 = test(target, {3, 2, 0, 1}, {1024, 1024, 6, 1});
69+
double t3 = test(target, {0, 1, 2, 3}, {1024, 1024, 3, 2});
70+
double t4 = test(target, {0, 1, 2, 3}, {1024, 1024, 6, 1});
71+
72+
double slowest = std::max({t1, t2, t3, t4});
73+
double fastest = std::min({t1, t2, t3, t4});
74+
75+
printf("Timings: %f %f %f %f\n", t1, t2, t3, t4);
76+
77+
// If one of these gets broken into a large number of separate copies, it
78+
// will be a lot more than 10x slower.
79+
if (slowest > 10 * fastest) {
80+
printf("Suspiciously large variation in timings for what should "
81+
"be a dense host->device copy.\n");
82+
return -1;
83+
}
84+
85+
printf("Success!\n");
86+
return 0;
87+
}

0 commit comments

Comments
 (0)