|
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

#include "gtest/gtest.h"
#include "layers/BinaryOpLayer.hpp"
| 9 | + |
// Compile-time switch for the per-backend timing output. Enabled by default;
// define it from the build (e.g. -DENABLE_TIMING_OUTPUT=0) to silence it
// without editing this file.
#ifndef ENABLE_TIMING_OUTPUT
#define ENABLE_TIMING_OUTPUT 1
#endif

// PRINT_TIMING(msg) writes one line to std::cout. `msg` is a stream chain
// ("text" << value << ...), so it cannot be wrapped in parentheses. Both
// expansions are void expressions, so code that compiles with timing enabled
// also compiles with it disabled (and vice versa).
#if ENABLE_TIMING_OUTPUT
// std::endl flushes, so each timing line is emitted as soon as it is written.
#define PRINT_TIMING(msg) (static_cast<void>(std::cout << msg << std::endl))
#else
#define PRINT_TIMING(msg) (static_cast<void>(0))
#endif
| 17 | + |
| 18 | +using namespace it_lab_ai; |
| 19 | + |
| 20 | +static void ExpectTensorsNear(const Tensor& a, const Tensor& b, |
| 21 | + float tolerance = 1e-5f) { |
| 22 | + ASSERT_EQ(a.get_shape(), b.get_shape()); |
| 23 | + ASSERT_EQ(a.get_type(), b.get_type()); |
| 24 | + |
| 25 | + if (a.get_type() == Type::kFloat) { |
| 26 | + auto data_a = *a.as<float>(); |
| 27 | + auto data_b = *b.as<float>(); |
| 28 | + ASSERT_EQ(data_a.size(), data_b.size()); |
| 29 | + for (size_t i = 0; i < data_a.size(); ++i) { |
| 30 | + EXPECT_NEAR(data_a[i], data_b[i], tolerance) << "Mismatch at index " << i; |
| 31 | + } |
| 32 | + } else if (a.get_type() == Type::kInt) { |
| 33 | + auto data_a = *a.as<int>(); |
| 34 | + auto data_b = *b.as<int>(); |
| 35 | + ASSERT_EQ(data_a.size(), data_b.size()); |
| 36 | + for (size_t i = 0; i < data_a.size(); ++i) { |
| 37 | + EXPECT_EQ(data_a[i], data_b[i]) << "Mismatch at index " << i; |
| 38 | + } |
| 39 | + } |
| 40 | +} |
| 41 | + |
| 42 | +static Tensor RunBinary(BinaryOpLayer& layer, const Tensor& a, const Tensor& b, |
| 43 | + ParBackend backend, long long* duration_ms = nullptr) { |
| 44 | + RuntimeOptions options; |
| 45 | + options.par_backend = backend; |
| 46 | + |
| 47 | + Tensor output; |
| 48 | + std::vector<Tensor> in{a, b}; |
| 49 | + std::vector<Tensor> out{output}; |
| 50 | + |
| 51 | + auto start = std::chrono::high_resolution_clock::now(); |
| 52 | + layer.run(in, out, options); |
| 53 | + auto end = std::chrono::high_resolution_clock::now(); |
| 54 | + |
| 55 | + if (duration_ms != nullptr) { |
| 56 | + *duration_ms = |
| 57 | + std::chrono::duration_cast<std::chrono::milliseconds>(end - start) |
| 58 | + .count(); |
| 59 | + } |
| 60 | + |
| 61 | + return out[0]; |
| 62 | +} |
| 63 | + |
| 64 | +static void RunAllBackendsAndCompare(BinaryOpLayer& layer, const Tensor& a, |
| 65 | + const Tensor& b, const std::string& label, |
| 66 | + float tolerance = 1e-5f) { |
| 67 | + std::vector<ParBackend> backends = {ParBackend::kSeq, ParBackend::kThreads, |
| 68 | + ParBackend::kTbb, ParBackend::kOmp, |
| 69 | + ParBackend::kKokkos}; |
| 70 | + |
| 71 | + Tensor baseline = RunBinary(layer, a, b, ParBackend::kSeq); |
| 72 | + |
| 73 | + for (auto backend : backends) { |
| 74 | + long long ms = 0; |
| 75 | + Tensor result = RunBinary(layer, a, b, backend, &ms); |
| 76 | + |
| 77 | + PRINT_TIMING("BinaryOp " << label << " Backend " |
| 78 | + << static_cast<int>(backend) << " time: " << ms |
| 79 | + << " ms"); |
| 80 | + |
| 81 | + ExpectTensorsNear(baseline, result, tolerance); |
| 82 | + } |
| 83 | +} |
| 84 | + |
| 85 | +TEST(binaryoplayer_parall, parallel_add_basic_float) { |
| 86 | + Tensor a = make_tensor<float>({1.f, 2.f, 3.f, 4.f}, {2, 2}); |
| 87 | + Tensor b = make_tensor<float>({5.f, 6.f, 7.f, 8.f}, {2, 2}); |
| 88 | + |
| 89 | + BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd); |
| 90 | + RunAllBackendsAndCompare(layer, a, b, "add_basic_float"); |
| 91 | +} |
| 92 | + |
| 93 | +TEST(binaryoplayer_parall, parallel_mul_basic_int) { |
| 94 | + Tensor a = make_tensor<int>({1, 2, 3, 4}, {2, 2}); |
| 95 | + Tensor b = make_tensor<int>({2, 3, 4, 5}, {2, 2}); |
| 96 | + |
| 97 | + BinaryOpLayer layer(BinaryOpLayer::Operation::kMul); |
| 98 | + RunAllBackendsAndCompare(layer, a, b, "mul_basic_int", 0.0f); |
| 99 | +} |
| 100 | + |
| 101 | +TEST(binaryoplayer_parall, parallel_sub_scalar_float) { |
| 102 | + Shape shape({1024, 1024}); |
| 103 | + Tensor a = make_tensor<float>(std::vector<float>(shape.count(), 5.0f), shape); |
| 104 | + Tensor scalar = make_tensor<float>({2.0f}); |
| 105 | + |
| 106 | + BinaryOpLayer layer(BinaryOpLayer::Operation::kSub); |
| 107 | + RunAllBackendsAndCompare(layer, a, scalar, "sub_scalar_float"); |
| 108 | +} |
| 109 | + |
| 110 | +TEST(binaryoplayer_parall, parallel_div_scalar_float) { |
| 111 | + Shape shape({1024, 1024}); |
| 112 | + Tensor a = make_tensor<float>(std::vector<float>(shape.count(), 8.0f), shape); |
| 113 | + Tensor scalar = make_tensor<float>({2.0f}); |
| 114 | + |
| 115 | + BinaryOpLayer layer(BinaryOpLayer::Operation::kDiv); |
| 116 | + RunAllBackendsAndCompare(layer, a, scalar, "div_scalar_float"); |
| 117 | +} |
| 118 | + |
| 119 | +TEST(binaryoplayer_parall, parallel_broadcast_2d_add) { |
| 120 | + Tensor a = make_tensor<float>(std::vector<float>(1024 * 1, 3.0f), {1024, 1}); |
| 121 | + Tensor b = make_tensor<float>(std::vector<float>(1 * 1024, 4.0f), {1, 1024}); |
| 122 | + |
| 123 | + BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd); |
| 124 | + RunAllBackendsAndCompare(layer, a, b, "broadcast_2d_add"); |
| 125 | +} |
| 126 | + |
| 127 | +TEST(binaryoplayer_parall, parallel_broadcast_3d_mul) { |
| 128 | + Tensor a = |
| 129 | + make_tensor<float>(std::vector<float>(64 * 1 * 512, 1.5f), {64, 1, 512}); |
| 130 | + Tensor b = |
| 131 | + make_tensor<float>(std::vector<float>(64 * 512 * 1, 2.0f), {64, 512, 1}); |
| 132 | + |
| 133 | + BinaryOpLayer layer(BinaryOpLayer::Operation::kMul); |
| 134 | + RunAllBackendsAndCompare(layer, a, b, "broadcast_3d_mul"); |
| 135 | +} |
| 136 | + |
| 137 | +TEST(binaryoplayer_parall, parallel_large_add_same_shape) { |
| 138 | + Shape shape({2048, 2048}); |
| 139 | + Tensor a = make_tensor<float>(std::vector<float>(shape.count(), 1.0f), shape); |
| 140 | + Tensor b = make_tensor<float>(std::vector<float>(shape.count(), 2.0f), shape); |
| 141 | + |
| 142 | + BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd); |
| 143 | + RunAllBackendsAndCompare(layer, a, b, "large_add_same_shape"); |
| 144 | +} |
| 145 | + |
| 146 | +TEST(binaryoplayer_parall, parallel_large_mul_same_shape) { |
| 147 | + Shape shape({1024, 1024, 4}); |
| 148 | + Tensor a = |
| 149 | + make_tensor<float>(std::vector<float>(shape.count(), 1.25f), shape); |
| 150 | + Tensor b = make_tensor<float>(std::vector<float>(shape.count(), 2.0f), shape); |
| 151 | + |
| 152 | + BinaryOpLayer layer(BinaryOpLayer::Operation::kMul); |
| 153 | + RunAllBackendsAndCompare(layer, a, b, "large_mul_same_shape"); |
| 154 | +} |
| 155 | + |
| 156 | +TEST(binaryoplayer_parall, parallel_large_broadcast_4d_add) { |
| 157 | + Shape a_shape({16, 32, 128, 1}); |
| 158 | + Shape b_shape({1, 32, 1, 128}); |
| 159 | + |
| 160 | + Tensor a = |
| 161 | + make_tensor<float>(std::vector<float>(a_shape.count(), 1.0f), a_shape); |
| 162 | + Tensor b = |
| 163 | + make_tensor<float>(std::vector<float>(b_shape.count(), 2.0f), b_shape); |
| 164 | + |
| 165 | + BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd); |
| 166 | + RunAllBackendsAndCompare(layer, a, b, "large_broadcast_4d_add"); |
| 167 | +} |
| 168 | + |
// NOTE(review): the test below is disabled; the reason is not recorded here.
// TEST(binaryoplayer_parall, parallel_huge_timing_add) {
//   Shape shape({128, 512, 512});
//   Tensor a =
//       make_tensor<float>(std::vector<float>(shape.count(), 1.0f), shape);
//   Tensor b =
//       make_tensor<float>(std::vector<float>(shape.count(), 2.0f), shape);
//
//   BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd);
//   RunAllBackendsAndCompare(layer, a, b, "huge_timing_add");
// }
0 commit comments