Skip to content

Commit c751af4

Browse files
BinaryOp Layer && parallelfor (#274)
1 parent 66b2b24 commit c751af4

3 files changed

Lines changed: 210 additions & 12 deletions

File tree

include/layers/BinaryOpLayer.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ class BinaryOpLayer : public Layer {
1919

2020
void run(const std::vector<Tensor>& input,
2121
std::vector<Tensor>& output) override;
22+
void run(const std::vector<Tensor>& input, std::vector<Tensor>& output,
23+
const RuntimeOptions& options) override;
2224
static bool is_scalar_tensor(const Tensor& t);
2325

2426
#ifdef ENABLE_STATISTIC_WEIGHTS
@@ -30,6 +32,7 @@ class BinaryOpLayer : public Layer {
3032

3133
private:
3234
Operation op_;
35+
ParBackend parallel_backend_ = ParBackend::kSeq;
3336

3437
template <typename ValueType>
3538
void run_with_scalar_impl(const Tensor& input, ValueType scalar,

src/layers/BinaryOpLayer.cpp

Lines changed: 30 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,13 @@ void BinaryOpLayer::run(const std::vector<Tensor>& input,
8080
}
8181
}
8282

83+
void BinaryOpLayer::run(const std::vector<Tensor>& input,
84+
std::vector<Tensor>& output,
85+
const RuntimeOptions& options) {
86+
parallel_backend_ = options.par_backend;
87+
run(input, output);
88+
}
89+
8390
void BinaryOpLayer::run_with_scalar(const Tensor& input, float scalar,
8491
Tensor& output) const {
8592
switch (input.get_type()) {
@@ -101,12 +108,17 @@ template <typename ValueType>
101108
// Applies op_ between every element of `input` (viewed as ValueType) and
// `scalar`, then stores a tensor with the input's shape in `output`.
//
// This span is a rendered diff hunk: a line following an "N-" marker belongs
// to the removed sequential push_back loop, a line following an "N+" marker
// belongs to the parallel_for version that replaces it.
void BinaryOpLayer::run_with_scalar_impl(const Tensor& input, ValueType scalar,
102109
Tensor& output) const {
103110
const auto& input_data = *input.as<ValueType>();
104-
std::vector<ValueType> result;
105-
result.reserve(input_data.size());
111+
// Sized up front so the loop body can assign result[i] by index.
std::vector<ValueType> result(input_data.size());
106112

107-
for (const auto& val : input_data) {
108-
result.push_back(apply_binary_op(val, scalar, op_));
109-
}
113+
parallel::Options options;
114+
// Backend is whatever the options-taking run() overload last stored.
options.backend = parallel_backend_;
115+
116+
// One call of the lambda per element index in [0, input_data.size()).
parallel::parallel_for(
117+
input_data.size(),
118+
[&](size_t i) {
119+
result[i] = apply_binary_op(input_data[i], scalar, op_);
120+
},
121+
options);
110122

111123
output = make_tensor(result, input.get_shape());
112124
}
@@ -122,13 +134,19 @@ void BinaryOpLayer::run_broadcast_impl(const Tensor& A, const Tensor& B,
122134
const auto strides_b = get_strides(B.get_shape());
123135
const auto strides_output = get_strides(output_shape);
124136

125-
for (size_t i = 0; i < result.size(); ++i) {
126-
size_t a_idx = get_broadcasted_index(i, A.get_shape(), output_shape,
127-
strides_a, strides_output);
128-
size_t b_idx = get_broadcasted_index(i, B.get_shape(), output_shape,
129-
strides_b, strides_output);
130-
result[i] = apply_binary_op(a_data[a_idx], b_data[b_idx], op_);
131-
}
137+
parallel::Options options;
138+
options.backend = parallel_backend_;
139+
140+
parallel::parallel_for(
141+
result.size(),
142+
[&](size_t i) {
143+
size_t a_idx = get_broadcasted_index(i, A.get_shape(), output_shape,
144+
strides_a, strides_output);
145+
size_t b_idx = get_broadcasted_index(i, B.get_shape(), output_shape,
146+
strides_b, strides_output);
147+
result[i] = apply_binary_op(a_data[a_idx], b_data[b_idx], op_);
148+
},
149+
options);
132150

133151
output = make_tensor(result, output_shape);
134152
}
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iostream>
#include <string>
#include <vector>

#include "gtest/gtest.h"
#include "layers/BinaryOpLayer.hpp"
9+
10+
// Timing output switch. Defaults to enabled; define it to 0 from the build
// (e.g. -DENABLE_TIMING_OUTPUT=0) to silence the per-backend timing lines.
#ifndef ENABLE_TIMING_OUTPUT
#define ENABLE_TIMING_OUTPUT 1
#endif

// PRINT_TIMING(msg): when timing output is enabled, streams `msg` (which may
// itself be a chain of `<<` operands) to std::cout followed by std::endl;
// otherwise expands to a no-op. Both expansions are void expressions, so code
// using the macro compiles identically in either configuration.
#if ENABLE_TIMING_OUTPUT
#define PRINT_TIMING(msg) ((void)(std::cout << msg << std::endl))
#else
#define PRINT_TIMING(msg) ((void)0)
#endif
17+
18+
using namespace it_lab_ai;
19+
20+
static void ExpectTensorsNear(const Tensor& a, const Tensor& b,
21+
float tolerance = 1e-5f) {
22+
ASSERT_EQ(a.get_shape(), b.get_shape());
23+
ASSERT_EQ(a.get_type(), b.get_type());
24+
25+
if (a.get_type() == Type::kFloat) {
26+
auto data_a = *a.as<float>();
27+
auto data_b = *b.as<float>();
28+
ASSERT_EQ(data_a.size(), data_b.size());
29+
for (size_t i = 0; i < data_a.size(); ++i) {
30+
EXPECT_NEAR(data_a[i], data_b[i], tolerance) << "Mismatch at index " << i;
31+
}
32+
} else if (a.get_type() == Type::kInt) {
33+
auto data_a = *a.as<int>();
34+
auto data_b = *b.as<int>();
35+
ASSERT_EQ(data_a.size(), data_b.size());
36+
for (size_t i = 0; i < data_a.size(); ++i) {
37+
EXPECT_EQ(data_a[i], data_b[i]) << "Mismatch at index " << i;
38+
}
39+
}
40+
}
41+
42+
// Runs `layer` on the inputs {a, b} with the requested parallel backend and
// returns the first tensor of the output vector.
//
// If `duration_ms` is non-null it receives the elapsed time of the
// layer.run() call, truncated to whole milliseconds.
static Tensor RunBinary(BinaryOpLayer& layer, const Tensor& a, const Tensor& b,
                        ParBackend backend, long long* duration_ms = nullptr) {
  RuntimeOptions options;
  options.par_backend = backend;

  Tensor output;
  std::vector<Tensor> in{a, b};
  std::vector<Tensor> out{output};

  // steady_clock is monotonic. high_resolution_clock may be an alias of
  // system_clock, which can jump when the wall clock is adjusted and is
  // therefore unsuitable for measuring an interval.
  const auto start = std::chrono::steady_clock::now();
  layer.run(in, out, options);
  const auto end = std::chrono::steady_clock::now();

  if (duration_ms != nullptr) {
    *duration_ms =
        std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
            .count();
  }

  return out[0];
}
63+
64+
// Runs `layer` on (a, b) once with ParBackend::kSeq to obtain a baseline, then
// once per listed backend (kSeq included), printing each run's time via
// PRINT_TIMING and comparing each result to the baseline with
// ExpectTensorsNear.
//
// `label` identifies the scenario in the timing line and in failure traces;
// `tolerance` is forwarded to ExpectTensorsNear.
static void RunAllBackendsAndCompare(BinaryOpLayer& layer, const Tensor& a,
                                     const Tensor& b, const std::string& label,
                                     float tolerance = 1e-5f) {
  const std::vector<ParBackend> backends = {
      ParBackend::kSeq, ParBackend::kThreads, ParBackend::kTbb,
      ParBackend::kOmp, ParBackend::kKokkos};

  const Tensor baseline = RunBinary(layer, a, b, ParBackend::kSeq);

  for (const auto backend : backends) {
    long long ms = 0;
    const Tensor result = RunBinary(layer, a, b, backend, &ms);

    PRINT_TIMING("BinaryOp " << label << " Backend "
                             << static_cast<int>(backend) << " time: " << ms
                             << " ms");

    // Without this trace a mismatch only reports an element index, with no
    // hint of which backend or scenario produced it.
    SCOPED_TRACE(label + ", backend " +
                 std::to_string(static_cast<int>(backend)));
    ExpectTensorsNear(baseline, result, tolerance);
  }
}
84+
85+
// Adds two 2x2 float tensors and checks every backend against the
// sequential result.
TEST(binaryoplayer_parall, parallel_add_basic_float) {
  BinaryOpLayer add_layer(BinaryOpLayer::Operation::kAdd);

  const Tensor lhs = make_tensor<float>({1.f, 2.f, 3.f, 4.f}, {2, 2});
  const Tensor rhs = make_tensor<float>({5.f, 6.f, 7.f, 8.f}, {2, 2});

  RunAllBackendsAndCompare(add_layer, lhs, rhs, "add_basic_float");
}
92+
93+
// Multiplies two 2x2 int tensors and checks every backend against the
// sequential result (tolerance passed as 0).
TEST(binaryoplayer_parall, parallel_mul_basic_int) {
  BinaryOpLayer mul_layer(BinaryOpLayer::Operation::kMul);

  const Tensor lhs = make_tensor<int>({1, 2, 3, 4}, {2, 2});
  const Tensor rhs = make_tensor<int>({2, 3, 4, 5}, {2, 2});

  RunAllBackendsAndCompare(mul_layer, lhs, rhs, "mul_basic_int", 0.0f);
}
100+
101+
// Subtracts a one-element tensor (2.0f) from a 1024x1024 tensor filled with
// 5.0f and checks every backend against the sequential result.
TEST(binaryoplayer_parall, parallel_sub_scalar_float) {
  Shape dims({1024, 1024});
  const Tensor minuend =
      make_tensor<float>(std::vector<float>(dims.count(), 5.0f), dims);
  const Tensor subtrahend = make_tensor<float>({2.0f});

  BinaryOpLayer sub_layer(BinaryOpLayer::Operation::kSub);
  RunAllBackendsAndCompare(sub_layer, minuend, subtrahend, "sub_scalar_float");
}
109+
110+
// Divides a 1024x1024 tensor filled with 8.0f by a one-element tensor (2.0f)
// and checks every backend against the sequential result.
TEST(binaryoplayer_parall, parallel_div_scalar_float) {
  Shape dims({1024, 1024});
  const Tensor dividend =
      make_tensor<float>(std::vector<float>(dims.count(), 8.0f), dims);
  const Tensor divisor = make_tensor<float>({2.0f});

  BinaryOpLayer div_layer(BinaryOpLayer::Operation::kDiv);
  RunAllBackendsAndCompare(div_layer, dividend, divisor, "div_scalar_float");
}
118+
119+
// Adds a {1024, 1} tensor to a {1, 1024} tensor and checks every backend
// against the sequential result.
TEST(binaryoplayer_parall, parallel_broadcast_2d_add) {
  BinaryOpLayer add_layer(BinaryOpLayer::Operation::kAdd);

  const Tensor column =
      make_tensor<float>(std::vector<float>(1024 * 1, 3.0f), {1024, 1});
  const Tensor row =
      make_tensor<float>(std::vector<float>(1 * 1024, 4.0f), {1, 1024});

  RunAllBackendsAndCompare(add_layer, column, row, "broadcast_2d_add");
}
126+
127+
// Multiplies a {64, 1, 512} tensor by a {64, 512, 1} tensor and checks every
// backend against the sequential result.
TEST(binaryoplayer_parall, parallel_broadcast_3d_mul) {
  BinaryOpLayer mul_layer(BinaryOpLayer::Operation::kMul);

  const Tensor lhs =
      make_tensor<float>(std::vector<float>(64 * 1 * 512, 1.5f), {64, 1, 512});
  const Tensor rhs =
      make_tensor<float>(std::vector<float>(64 * 512 * 1, 2.0f), {64, 512, 1});

  RunAllBackendsAndCompare(mul_layer, lhs, rhs, "broadcast_3d_mul");
}
136+
137+
// Adds two 2048x2048 float tensors of identical shape and checks every
// backend against the sequential result.
TEST(binaryoplayer_parall, parallel_large_add_same_shape) {
  Shape dims({2048, 2048});
  const Tensor lhs =
      make_tensor<float>(std::vector<float>(dims.count(), 1.0f), dims);
  const Tensor rhs =
      make_tensor<float>(std::vector<float>(dims.count(), 2.0f), dims);

  BinaryOpLayer add_layer(BinaryOpLayer::Operation::kAdd);
  RunAllBackendsAndCompare(add_layer, lhs, rhs, "large_add_same_shape");
}
145+
146+
// Multiplies two {1024, 1024, 4} float tensors of identical shape and checks
// every backend against the sequential result.
TEST(binaryoplayer_parall, parallel_large_mul_same_shape) {
  Shape dims({1024, 1024, 4});
  const Tensor lhs =
      make_tensor<float>(std::vector<float>(dims.count(), 1.25f), dims);
  const Tensor rhs =
      make_tensor<float>(std::vector<float>(dims.count(), 2.0f), dims);

  BinaryOpLayer mul_layer(BinaryOpLayer::Operation::kMul);
  RunAllBackendsAndCompare(mul_layer, lhs, rhs, "large_mul_same_shape");
}
155+
156+
// Adds a {16, 32, 128, 1} tensor to a {1, 32, 1, 128} tensor and checks every
// backend against the sequential result.
TEST(binaryoplayer_parall, parallel_large_broadcast_4d_add) {
  Shape lhs_dims({16, 32, 128, 1});
  Shape rhs_dims({1, 32, 1, 128});

  const Tensor lhs =
      make_tensor<float>(std::vector<float>(lhs_dims.count(), 1.0f), lhs_dims);
  const Tensor rhs =
      make_tensor<float>(std::vector<float>(rhs_dims.count(), 2.0f), rhs_dims);

  BinaryOpLayer add_layer(BinaryOpLayer::Operation::kAdd);
  RunAllBackendsAndCompare(add_layer, lhs, rhs, "large_broadcast_4d_add");
}
168+
169+
// Disabled test (kept commented out):
// TEST(binaryoplayer_parall, parallel_huge_timing_add) {
//   Shape shape({128, 512, 512});
//   Tensor a =
//       make_tensor<float>(std::vector<float>(shape.count(), 1.0f), shape);
//   Tensor b =
//       make_tensor<float>(std::vector<float>(shape.count(), 2.0f), shape);
//
//   BinaryOpLayer layer(BinaryOpLayer::Operation::kAdd);
//   RunAllBackendsAndCompare(layer, a, b, "huge_timing_add");
// }

0 commit comments

Comments
 (0)