|
| 1 | +#include <vector> |
| 2 | + |
| 3 | +#include "halide_benchmark.h" |
| 4 | +#include <Halide.h> |
| 5 | + |
| 6 | +using namespace Halide; |
| 7 | +using namespace Halide::Tools; |
| 8 | + |
| 9 | +double test(const Target &target, std::vector<int> order, std::vector<int> shape) { |
| 10 | + Buffer<float> buf(shape, order); |
| 11 | + for (int t = 0; t < shape[3]; t++) { |
| 12 | + for (int c = 0; c < shape[2]; c++) { |
| 13 | + for (int y = 0; y < shape[1]; y++) { |
| 14 | + for (int x = 0; x < shape[0]; x++) { |
| 15 | + buf(x, y, c, t) = x * .5f + y * 2.f + c * 4.f + t * .8f; |
| 16 | + } |
| 17 | + } |
| 18 | + } |
| 19 | + } |
| 20 | + |
| 21 | + buf.set_host_dirty(); |
| 22 | + |
| 23 | + buf.device_malloc(); |
| 24 | + |
| 25 | + double time = benchmark([&]() { |
| 26 | + buf.set_host_dirty(); |
| 27 | + buf.copy_to_device(target); |
| 28 | + buf.device_sync(); |
| 29 | + }); |
| 30 | + |
| 31 | + // Nuke the host side data so we can check the data transferred back and forth OK. |
| 32 | + buf.set_device_dirty(false); |
| 33 | + buf.fill(0.f); |
| 34 | + buf.set_host_dirty(false); |
| 35 | + buf.set_device_dirty(true); |
| 36 | + buf.copy_to_host(); |
| 37 | + |
| 38 | + for (int t = 0; t < shape[3]; t++) { |
| 39 | + for (int c = 0; c < shape[2]; c++) { |
| 40 | + for (int y = 0; y < shape[1]; y++) { |
| 41 | + for (int x = 0; x < shape[0]; x++) { |
| 42 | + float correct = x * .5f + y * 2.f + c * 4.f + t * .8f; |
| 43 | + if (buf(x, y, c, t) != correct) { |
| 44 | + printf("buf(%d, %d, %d, %d) = %f instead of %f\n", |
| 45 | + x, y, c, t, buf(x, y, c, t), correct); |
| 46 | + exit(-1); |
| 47 | + } |
| 48 | + } |
| 49 | + } |
| 50 | + } |
| 51 | + } |
| 52 | + |
| 53 | + return time; |
| 54 | +} |
| 55 | + |
| 56 | +int main() { |
| 57 | + auto target = get_jit_target_from_environment(); |
| 58 | + if (!target.has_gpu_feature()) { |
| 59 | + printf("[SKIP] This test requires a GPU target.\n"); |
| 60 | + return 0; |
| 61 | + } |
| 62 | + |
| 63 | + // These copies are all the same size and dense, but are in different memory |
| 64 | + // orderings and some of them have some extent=1 dimensions. (See |
| 65 | + // https://github.com/halide/Halide/issues/8956) |
| 66 | + |
| 67 | + double t1 = test(target, {3, 2, 0, 1}, {1024, 1024, 3, 2}); |
| 68 | + double t2 = test(target, {3, 2, 0, 1}, {1024, 1024, 6, 1}); |
| 69 | + double t3 = test(target, {0, 1, 2, 3}, {1024, 1024, 3, 2}); |
| 70 | + double t4 = test(target, {0, 1, 2, 3}, {1024, 1024, 6, 1}); |
| 71 | + |
| 72 | + double slowest = std::max({t1, t2, t3, t4}); |
| 73 | + double fastest = std::min({t1, t2, t3, t4}); |
| 74 | + |
| 75 | + printf("Timings: %f %f %f %f\n", t1, t2, t3, t4); |
| 76 | + |
| 77 | + // If one of these gets broken into a large number of separate copies, it |
| 78 | + // will be a lot more than 10x slower. |
| 79 | + if (slowest > 10 * fastest) { |
| 80 | + printf("Suspiciously large variation in timings for what should " |
| 81 | + "be a dense host->device copy.\n"); |
| 82 | + return -1; |
| 83 | + } |
| 84 | + |
| 85 | + printf("Success!\n"); |
| 86 | + return 0; |
| 87 | +} |
0 commit comments