Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/runtime/device_buffer_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
// are added to the copy by inserting it such that the stride is
// in ascending order in the dst.
for (int i = 0; i < dst->dimensions; i++) {
// Skip extent-one dimensions
if (dst->dim[i].extent == 1) {
continue;
}
// TODO: deal with negative strides.
uint64_t dst_stride_bytes = (uint64_t)dst->dim[i].stride * dst->type.bytes();
uint64_t src_stride_bytes = (uint64_t)src->dim[i].stride * src->type.bytes();
Expand All @@ -147,7 +151,6 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
c.src_stride_bytes[j] = c.src_stride_bytes[j - 1];
}
c.extent[insert] = min(src->dim[i].extent, dst->dim[i].extent);
// debug(nullptr) << "c.extent[" << insert << "] = " << (int)(c.extent[insert]) << "\n";
c.dst_stride_bytes[insert] = dst_stride_bytes;
c.src_stride_bytes[insert] = src_stride_bytes;
};
Expand All @@ -174,6 +177,7 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host,
c.src_stride_bytes[MAX_COPY_DIMS - 1] = 0;
c.dst_stride_bytes[MAX_COPY_DIMS - 1] = 0;
}

return c;
}

Expand Down
1 change: 1 addition & 0 deletions test/performance/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ tests(GROUPS performance
boundary_conditions.cpp
clamped_vector_load.cpp
const_division.cpp
device_copy.cpp
fast_inverse.cpp
fast_pow.cpp
fast_sine_cosine.cpp
Expand Down
87 changes: 87 additions & 0 deletions test/performance/device_copy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#include <vector>

#include "halide_benchmark.h"
#include <Halide.h>

using namespace Halide;
using namespace Halide::Tools;

double test(const Target &target, std::vector<int> order, std::vector<int> shape) {
Buffer<float> buf(shape, order);
for (int t = 0; t < shape[3]; t++) {
for (int c = 0; c < shape[2]; c++) {
for (int y = 0; y < shape[1]; y++) {
for (int x = 0; x < shape[0]; x++) {
buf(x, y, c, t) = x * .5f + y * 2.f + c * 4.f + t * .8f;
}
}
}
}

buf.set_host_dirty();

buf.device_malloc();

double time = benchmark([&]() {
buf.set_host_dirty();
buf.copy_to_device(target);
buf.device_sync();
});

// Nuke the host side data so we can check the data transferred back and forth OK.
buf.set_device_dirty(false);
buf.fill(0.f);
buf.set_host_dirty(false);
buf.set_device_dirty(true);
buf.copy_to_host();

for (int t = 0; t < shape[3]; t++) {
for (int c = 0; c < shape[2]; c++) {
for (int y = 0; y < shape[1]; y++) {
for (int x = 0; x < shape[0]; x++) {
float correct = x * .5f + y * 2.f + c * 4.f + t * .8f;
if (buf(x, y, c, t) != correct) {
printf("buf(%d, %d, %d, %d) = %f instead of %f\n",
x, y, c, t, buf(x, y, c, t), correct);
exit(-1);
}
}
}
}
}

return time;
}

int main() {
auto target = get_jit_target_from_environment();
if (!target.has_gpu_feature()) {
printf("[SKIP] This test requires a GPU target.\n");
return 0;
}

// These copies are all the same size and dense, but are in different memory
// orderings and some of them have some extent=1 dimensions. (See
// https://github.com/halide/Halide/issues/8956)

double t1 = test(target, {3, 2, 0, 1}, {1024, 1024, 3, 2});
double t2 = test(target, {3, 2, 0, 1}, {1024, 1024, 6, 1});
double t3 = test(target, {0, 1, 2, 3}, {1024, 1024, 3, 2});
double t4 = test(target, {0, 1, 2, 3}, {1024, 1024, 6, 1});

double slowest = std::max({t1, t2, t3, t4});
double fastest = std::min({t1, t2, t3, t4});

printf("Timings: %f %f %f %f\n", t1, t2, t3, t4);

// If one of these gets broken into a large number of separate copies, it
// will be a lot more than 10x slower.
if (slowest > 10 * fastest) {
printf("Suspiciously large variation in timings for what should "
"be a dense host->device copy.\n");
return -1;
}

printf("Success!\n");
return 0;
}
Loading