From 0c734bb51ecbfe3f4d1b7639f7e35ea221641e73 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 20 Feb 2026 10:58:39 -0800 Subject: [PATCH 1/3] Skip extent-one dimensions in a device copy Their strides are meaningless anyway Fixes #8956 --- src/runtime/device_buffer_utils.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/runtime/device_buffer_utils.h b/src/runtime/device_buffer_utils.h index 36d1bcf98058..d2a8105dc148 100644 --- a/src/runtime/device_buffer_utils.h +++ b/src/runtime/device_buffer_utils.h @@ -129,6 +129,10 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host, // are added to the copy by inserting it such that the stride is // in ascending order in the dst. for (int i = 0; i < dst->dimensions; i++) { + // Skip extent-one dimensions + if (dst->dim[i].extent == 1) { + continue; + } // TODO: deal with negative strides. uint64_t dst_stride_bytes = (uint64_t)dst->dim[i].stride * dst->type.bytes(); uint64_t src_stride_bytes = (uint64_t)src->dim[i].stride * src->type.bytes(); @@ -147,7 +151,6 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host, c.src_stride_bytes[j] = c.src_stride_bytes[j - 1]; } c.extent[insert] = min(src->dim[i].extent, dst->dim[i].extent); - // debug(nullptr) << "c.extent[" << insert << "] = " << (int)(c.extent[insert]) << "\n"; c.dst_stride_bytes[insert] = dst_stride_bytes; c.src_stride_bytes[insert] = src_stride_bytes; }; @@ -174,6 +177,7 @@ WEAK device_copy make_buffer_copy(const halide_buffer_t *src, bool src_host, c.src_stride_bytes[MAX_COPY_DIMS - 1] = 0; c.dst_stride_bytes[MAX_COPY_DIMS - 1] = 0; } + return c; } From 7c92aec084d36653de34bfff7020939971a481f9 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 20 Feb 2026 12:32:15 -0800 Subject: [PATCH 2/3] Add test --- test/performance/CMakeLists.txt | 1 + test/performance/device_copy.cpp | 87 ++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 test/performance/device_copy.cpp diff --git a/test/performance/CMakeLists.txt b/test/performance/CMakeLists.txt index 851e7e3ae506..96ab02ff811d 100644 --- a/test/performance/CMakeLists.txt +++ b/test/performance/CMakeLists.txt @@ -12,6 +12,7 @@ tests(GROUPS performance boundary_conditions.cpp clamped_vector_load.cpp const_division.cpp + device_copy.cpp fast_inverse.cpp fast_pow.cpp fast_sine_cosine.cpp diff --git a/test/performance/device_copy.cpp b/test/performance/device_copy.cpp new file mode 100644 index 000000000000..62e561686e96 --- /dev/null +++ b/test/performance/device_copy.cpp @@ -0,0 +1,87 @@ +#include + +#include "halide_benchmark.h" +#include + +using namespace Halide; +using namespace Halide::Tools; + +double test(const Target &target, std::vector order, std::vector shape) { + Buffer buf(shape, order); + for (int t = 0; t < shape[3]; t++) { + for (int c = 0; c < shape[2]; c++) { + for (int y = 0; y < shape[1]; y++) { + for (int x = 0; x < shape[0]; x++) { + buf(x, y, c, t) = x * .5f + y * 2.f + c * 4.f + t * .8f; + } + } + } + } + + buf.set_host_dirty(); + + buf.device_malloc(); + + double time = benchmark([&]() { + buf.set_host_dirty(); + buf.copy_to_device(target); + buf.device_sync(); + }); + + // Nuke the host side data so we can check the data transferred back and forth OK. + buf.set_device_dirty(false); + buf.fill(0.f); + buf.set_host_dirty(false); + buf.set_device_dirty(true); + buf.copy_to_host(); + + for (int t = 0; t < shape[3]; t++) { + for (int c = 0; c < shape[2]; c++) { + for (int y = 0; y < shape[1]; y++) { + for (int x = 0; x < shape[0]; x++) { + float correct = x * .5f + y * 2.f + c * 4.f + t * .8f; + if (buf(x, y, c, t) != correct) { + printf("buf(%d, %d, %d, %d) = %f instead of %f\n", + x, y, c, t, buf(x, y, c, t), correct); + exit(-1); + } + } + } + } + } + + return time; +} + +int main() { + auto target = get_jit_target_from_environment(); + if (!target.has_gpu_feature()) { + printf("This test requires a GPU target.\n"); + return 0; + } + + // These copies are all the same size and dense, but are in different memory + // orderings and some of them have some extent=1 dimensions. (See + // https://github.com/halide/Halide/issues/8956) + + double t1 = test(target, {3, 2, 0, 1}, {1024, 1024, 3, 2}); + double t2 = test(target, {3, 2, 0, 1}, {1024, 1024, 6, 1}); + double t3 = test(target, {0, 1, 2, 3}, {1024, 1024, 3, 2}); + double t4 = test(target, {0, 1, 2, 3}, {1024, 1024, 6, 1}); + + double slowest = std::max({t1, t2, t3, t4}); + double fastest = std::min({t1, t2, t3, t4}); + + printf("Timings: %f %f %f %f\n", t1, t2, t3, t4); + + // If one of these gets broken into a large number of separate copies, it + // will be a lot more than 10x slower. + if (slowest > 10 * fastest) { + printf("Suspiciously large variation in timings for what should " + "be a dense host->device copy.\n"); + return -1; + } + + printf("Success!\n"); + return 0; +} From 1d796f01efd7811162e4398cf5ca52c2678dc008 Mon Sep 17 00:00:00 2001 From: Andrew Adams Date: Fri, 20 Feb 2026 12:57:51 -0800 Subject: [PATCH 3/3] Add SKIP annotation --- test/performance/device_copy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/performance/device_copy.cpp b/test/performance/device_copy.cpp index 62e561686e96..eaea48d74ff5 100644 --- a/test/performance/device_copy.cpp +++ b/test/performance/device_copy.cpp @@ -56,7 +56,7 @@ double test(const Target &target, std::vector order, std::vector shape int main() { auto target = get_jit_target_from_environment(); if (!target.has_gpu_feature()) { - printf("This test requires a GPU target.\n"); + printf("[SKIP] This test requires a GPU target.\n"); return 0; }